Message ID: 20200630031852.45383-1-richard.weiyang@linux.alibaba.com
State: New, archived
Series: mm: define pte_add_end for consistency
On 30.06.20 05:18, Wei Yang wrote:
> When walking page tables, we define several helpers to get the address of
> the next boundary. But we don't have one for pte level.
>
> Let's define it and consolidate the code in several places.
>
> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
> ---
>  arch/x86/mm/init_64.c   | 6 ++----
>  include/linux/pgtable.h | 7 +++++++
>  mm/kasan/init.c         | 4 +---
>  3 files changed, 10 insertions(+), 7 deletions(-)
>
> [diff snipped; the full patch is reproduced at the end of this page]

I'm not really a fan of this, I have to say. We're simply iterating
over single pages, not much magic ....

What would definitely make sense is replacing (addr + PAGE_SIZE) &
PAGE_MASK; by PAGE_ALIGN() ...
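For readers skimming the thread: the proposed pte_addr_end() rounds addr up to the next page boundary and clamps the result to end. The "__boundary - 1 < (end) - 1" comparison follows the existing p?d_addr_end() helpers, which use it so that the clamp still works when the rounded-up boundary wraps to 0 in the last page of the address space. Below is a minimal user-space sketch of that arithmetic; it is illustration only (4 KiB pages assumed, the macro copied out of the patch and built with GCC statement expressions), not kernel code.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Stand-alone copy of the proposed macro, for illustration only. */
#define pte_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
})

int main(void)
{
	/* Unaligned start: step to the next page boundary. */
	printf("%#lx\n", pte_addr_end(0x1200UL, 0x3000UL));	/* 0x2000 */

	/* Less than a page left: clamp to end. */
	printf("%#lx\n", pte_addr_end(0x2000UL, 0x2800UL));	/* 0x2800 */

	/*
	 * In the very last page of the address space the rounded-up
	 * boundary wraps to 0; the "- 1" comparison still clamps to end,
	 * where a plain "if (next > end) next = end" would hand back 0.
	 */
	printf("%#lx\n", pte_addr_end(0xfffffffffffff800UL,
				      0xfffffffffffffc00UL));
	return 0;
}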
On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>On 30.06.20 05:18, Wei Yang wrote:
>> When walking page tables, we define several helpers to get the address of
>> the next boundary. But we don't have one for pte level.
>>
>> Let's define it and consolidate the code in several places.
>>
>> [patch snipped]
>
>I'm not really a fan of this, I have to say. We're simply iterating
>over single pages, not much magic ....

Hmm... yes, we are iterating on page boundaries, but we may have the case
where addr or end is not PAGE_ALIGNed.

>
>What would definitely make sense is replacing (addr + PAGE_SIZE) &
>PAGE_MASK; by PAGE_ALIGN() ...
>

No, PAGE_ALIGN() expands to

	(addr + PAGE_SIZE - 1) & PAGE_MASK;

If we changed the code to PAGE_ALIGN(), we would end up with an infinite loop.

>-- 
>Thanks,
>
>David / dhildenb
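To make the infinite-loop point concrete: PAGE_ALIGN() of an already page-aligned address returns the address unchanged, so a loop advancing with next = PAGE_ALIGN(addr) would never move forward, while PAGE_ALIGN(addr + 1) is arithmetically identical to the (addr + PAGE_SIZE) & PAGE_MASK expression used today. A throwaway user-space check of the arithmetic (4 KiB pages assumed; the macros are local copies for illustration):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long addr = 0x2000UL;	/* already page aligned */

	/* Current code: always advances to the next page boundary. */
	printf("%#lx\n", (addr + PAGE_SIZE) & PAGE_MASK);	/* 0x3000 */

	/*
	 * PAGE_ALIGN() of an aligned address is a no-op, so a loop doing
	 * next = PAGE_ALIGN(addr) would never make progress.
	 */
	printf("%#lx\n", PAGE_ALIGN(addr));			/* 0x2000 */

	/* PAGE_ALIGN(addr + 1) is equivalent to the current expression. */
	printf("%#lx\n", PAGE_ALIGN(addr + 1));			/* 0x3000 */
	return 0;
}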
On 01.07.20 04:11, Wei Yang wrote:
> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>> On 30.06.20 05:18, Wei Yang wrote:
>>> [patch snipped]
>>
>> I'm not really a fan of this, I have to say. We're simply iterating
>> over single pages, not much magic ....
>
> Hmm... yes, we are iterating on page boundaries, but we may have the case
> where addr or end is not PAGE_ALIGNed.

I really do wonder if not having page aligned addresses actually happens
in real life. Page tables operate on page granularity, and
adding/removing unaligned parts feels wrong ... and that's also why I
dislike such a helper.

1. kasan_add_zero_shadow()/kasan_remove_zero_shadow()

If I understand the logic (WARN_ON()) correctly, we bail out in case we
would ever end up in such a scenario, where we would want to add/remove
things not aligned to PAGE_SIZE.

2. remove_pagetable()...->remove_pte_table()

vmemmap_free() should never try to de-populate sub-pages. Even with
sub-section hot-add/remove (2MB / 512 pages), with valid struct page
sizes (56, 64, 72, 80), we always end up with full pages.

kernel_physical_mapping_remove() is only called via
arch_remove_memory(). That will never remove unaligned parts.

3. register_page_bootmem_memmap()

It operates on full pages only.

This needs in-depth analysis, but my gut feeling is that this alignment
is unnecessary.

>
>> What would definitely make sense is replacing (addr + PAGE_SIZE) &
>> PAGE_MASK; by PAGE_ALIGN() ...
>
> No, PAGE_ALIGN() expands to
>
> 	(addr + PAGE_SIZE - 1) & PAGE_MASK;
>
> If we changed the code to PAGE_ALIGN(), we would end up with an infinite loop.

Very right, it would have to be PAGE_ALIGN(addr + 1).
On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>On 01.07.20 04:11, Wei Yang wrote:
>> [snip]
>>
>> Hmm... yes, we are iterating on page boundaries, but we may have the case
>> where addr or end is not PAGE_ALIGNed.
>
>I really do wonder if not having page aligned addresses actually happens
>in real life. Page tables operate on page granularity, and
>adding/removing unaligned parts feels wrong ... and that's also why I
>dislike such a helper.
>
>1. kasan_add_zero_shadow()/kasan_remove_zero_shadow()
>
>If I understand the logic (WARN_ON()) correctly, we bail out in case we
>would ever end up in such a scenario, where we would want to add/remove
>things not aligned to PAGE_SIZE.
>
>2. remove_pagetable()...->remove_pte_table()
>
>vmemmap_free() should never try to de-populate sub-pages. Even with
>sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>sizes (56, 64, 72, 80), we always end up with full pages.
>
>kernel_physical_mapping_remove() is only called via
>arch_remove_memory(). That will never remove unaligned parts.
>

I don't have a very clear mind right now, but when you look into
remove_pte_table(), it has two cases based on the alignment of addr and
next.

If we always remove whole pages, the second case won't happen?

>3. register_page_bootmem_memmap()
>
>It operates on full pages only.
>
>This needs in-depth analysis, but my gut feeling is that this alignment
>is unnecessary.
>
>[remainder snipped]
>
>-- 
>Thanks,
>
>David / dhildenb
On 01.07.20 13:54, Wei Yang wrote:
> On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>> [snip]
>>
>> 2. remove_pagetable()...->remove_pte_table()
>>
>> vmemmap_free() should never try to de-populate sub-pages. Even with
>> sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>> sizes (56, 64, 72, 80), we always end up with full pages.
>>
>> kernel_physical_mapping_remove() is only called via
>> arch_remove_memory(). That will never remove unaligned parts.
>>
>
> I don't have a very clear mind right now, but when you look into
> remove_pte_table(), it has two cases based on the alignment of addr and
> next.
>
> If we always remove whole pages, the second case won't happen?

So, the code comments say that the second case can only happen for
vmemmap, never for direct mappings.

I don't see a way how this could ever happen with current page sizes,
even with sub-section hotadd (2MB). Maybe that is a legacy leftover or
was never relevant? Or am I missing something important, where we could
have sub-4k-page vmemmap data?
On Thu, Jul 02, 2020 at 06:28:19PM +0200, David Hildenbrand wrote:
>On 01.07.20 13:54, Wei Yang wrote:
>> [snip]
>>
>> I don't have a very clear mind right now, but when you look into
>> remove_pte_table(), it has two cases based on the alignment of addr and
>> next.
>>
>> If we always remove whole pages, the second case won't happen?
>
>So, the code comments say that the second case can only happen for
>vmemmap, never for direct mappings.
>
>I don't see a way how this could ever happen with current page sizes,
>even with sub-section hotadd (2MB). Maybe that is a legacy leftover or
>was never relevant? Or am I missing something important, where we could
>have sub-4k-page vmemmap data?
>

I did the calculation for the sub-section page struct size: it is page
size (4K) aligned. This means you are right, we won't depopulate a
sub-page.

And yes, I am not sure all those variants would fit this case, so I would
like to leave it as it is now. What is your opinion?

>-- 
>Thanks,
>
>David / dhildenb
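The calculation mentioned here is easy to spot-check: a sub-section covers 2 MiB, i.e. 512 base pages, and for each struct page size listed earlier in the thread (56, 64, 72, 80 bytes) the resulting memmap is an exact multiple of 4 KiB. A quick user-space sketch of that arithmetic, for illustration only:

#include <stdio.h>

int main(void)
{
	/* struct page sizes mentioned earlier in the thread, in bytes */
	const unsigned long sizes[] = { 56, 64, 72, 80 };
	const unsigned long pages_per_subsection = 512;	/* 2 MiB / 4 KiB */

	for (int i = 0; i < 4; i++) {
		unsigned long memmap = sizes[i] * pages_per_subsection;

		printf("struct page = %2lu B -> memmap = %5lu B = %2lu full 4K pages (%s)\n",
		       sizes[i], memmap, memmap / 4096,
		       memmap % 4096 ? "NOT page aligned" : "page aligned");
	}
	return 0;
}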
On 03.07.20 03:34, Wei Yang wrote:
> On Thu, Jul 02, 2020 at 06:28:19PM +0200, David Hildenbrand wrote:
>> [snip]
>>
>> I don't see a way how this could ever happen with current page sizes,
>> even with sub-section hotadd (2MB). Maybe that is a legacy leftover or
>> was never relevant? Or am I missing something important, where we could
>> have sub-4k-page vmemmap data?
>
> I did the calculation for the sub-section page struct size: it is page
> size (4K) aligned. This means you are right, we won't depopulate a
> sub-page.
>
> And yes, I am not sure all those variants would fit this case, so I would
> like to leave it as it is now. What is your opinion?

I'd say we clean this up and protect it by WARN_ON_ONCE(). Then, it
won't need another round of investigation to find out that handling
sub-pages is irrelevant.

If you don't want to tackle this, I can have a look. Just let me know.
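To illustrate the direction being suggested, here is a rough sketch of the idea only: it is not the patch that eventually followed, and the direct-mapping page accounting and TLB flushing done by the real remove_pte_table() are omitted. The assumption, taken from the thread, is that unaligned ranges can never reach this function, so the sub-page branch could be dropped in favour of a warning and a plain whole-page walk.

/*
 * Rough sketch only, assuming addr/end are always page aligned here;
 * accounting and flushing from the real function are left out.
 */
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end, bool direct)
{
	pte_t *pte = pte_start + pte_index(addr);

	/* Sub-page ranges are not expected to reach this point. */
	if (WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE) ||
			 !IS_ALIGNED(end, PAGE_SIZE)))
		return;

	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!pte_present(*pte))
			continue;

		free_pagetable(pte_page(*pte), 0);

		spin_lock(&init_mm.page_table_lock);
		pte_clear(&init_mm, addr, pte);
		spin_unlock(&init_mm.page_table_lock);
	}
}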
On Fri, Jul 03, 2020 at 09:23:30AM +0200, David Hildenbrand wrote:
>On 03.07.20 03:34, Wei Yang wrote:
>> [snip]
>>
>> I did the calculation for the sub-section page struct size: it is page
>> size (4K) aligned. This means you are right, we won't depopulate a
>> sub-page.
>>
>> And yes, I am not sure all those variants would fit this case, so I would
>> like to leave it as it is now. What is your opinion?
>
>I'd say we clean this up and protect it by WARN_ON_ONCE(). Then, it
>won't need another round of investigation to find out that handling
>sub-pages is irrelevant.
>
>If you don't want to tackle this, I can have a look. Just let me know.
>

Actually, I don't quite get what you have in mind, so please go ahead.
Maybe I can review your change.

>-- 
>Thanks,
>
>David / dhildenb
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index dbae185511cd..f902fbd17f27 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 
 	pte = pte_start + pte_index(addr);
 	for (; addr < end; addr = next, pte++) {
-		next = (addr + PAGE_SIZE) & PAGE_MASK;
-		if (next > end)
-			next = end;
+		next = pte_addr_end(addr, end);
 
 		if (!pte_present(*pte))
 			continue;
@@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
 
 		if (!boot_cpu_has(X86_FEATURE_PSE)) {
-			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			next = pte_addr_end(addr, end);
 			pmd = pmd_offset(pud, addr);
 			if (pmd_none(*pmd))
 				continue;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 32b6c52d41b9..0de09c6c89d2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 })
 #endif
 
+#ifndef pte_addr_end
+#define pte_addr_end(addr, end)						\
+({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
+	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
+})
+#endif
+
 /*
  * When walking page tables, we usually want to skip any p?d_none entries;
  * and any p?d_bad entries - reporting the error before resetting to none.
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index fe6be0be1f76..89f748601f74 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
 	unsigned long next;
 
 	for (; addr < end; addr = next, pte++) {
-		next = (addr + PAGE_SIZE) & PAGE_MASK;
-		if (next > end)
-			next = end;
+		next = pte_addr_end(addr, end);
 
 		if (!pte_present(*pte))
 			continue;
When walking page tables, we define several helpers to get the address of
the next boundary. But we don't have one for pte level.

Let's define it and consolidate the code in several places.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
---
 arch/x86/mm/init_64.c   | 6 ++----
 include/linux/pgtable.h | 7 +++++++
 mm/kasan/init.c         | 4 +---
 3 files changed, 10 insertions(+), 7 deletions(-)
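For reference, the pte_addr_end() proposed above follows the pattern of the existing boundary helpers in include/linux/pgtable.h; the pmd-level one in kernels of that era looks roughly like this (reproduced here for comparison, treat it as illustrative rather than verbatim):

#ifndef pmd_addr_end
#define pmd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;	\
	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
})
#endif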