Message ID | 20230918230202.254631-7-mike.kravetz@oracle.com (mailing list archive) |
---|---|
State | New |
Series | Batch hugetlb vmemmap modification operations |
On 2023/9/19 07:01, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> In an effort to minimize amount of TLB flushes, batch all PMD splits
> belonging to a range of pages in order to perform only 1 (global) TLB
> flush.
>
> Add a flags field to the walker and pass whether it's a bulk allocation
> or just a single page to decide to remap. First value
> (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
> flush when we split the PMD.
>
> Rebased and updated by Mike Kravetz
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  mm/hugetlb_vmemmap.c | 79 +++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 75 insertions(+), 4 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 147ed15bcae4..e8bc2f7567db 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
[...]
> @@ -35,9 +36,11 @@ struct vmemmap_remap_walk {
>      struct page        *reuse_page;
>      unsigned long      reuse_addr;
>      struct list_head   *vmemmap_pages;
> +#define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)

Please add a brief comment following this macro to explain what's the
behavior.

> +    unsigned long      flags;
>  };
>
> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
[...]
> @@ -127,11 +131,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>      do {
>          int ret;
>
> -        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
> +        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
> +                walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);

!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)?

Thanks.

[... rest of the quoted patch unchanged; see the full diff at the bottom of
this page ...]
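As an aside, the effect described in the commit message -- one global TLB
flush for the whole batch instead of one flush per split PMD -- can be
sketched with a toy flush counter. This is a hypothetical userspace C model,
not kernel code; split_one_pmd() and optimize_folios() are made-up stand-ins
for split_vmemmap_huge_pmd() and hugetlb_vmemmap_optimize_folios().

#include <stdio.h>

static int tlb_flushes;

/* Stand-in for split_vmemmap_huge_pmd(): flush per PMD unless deferred. */
static void split_one_pmd(int defer_flush)
{
	if (!defer_flush)
		tlb_flushes++;		/* flush_tlb_kernel_range() per split */
}

/* Stand-in for the batch loop over @nr folios. */
static void optimize_folios(int nr, int batched)
{
	for (int i = 0; i < nr; i++)
		split_one_pmd(batched);
	if (batched)
		tlb_flushes++;		/* one flush_tlb_all() for the batch */
}

int main(void)
{
	tlb_flushes = 0;
	optimize_folios(512, 0);
	printf("per-PMD flushing: %d TLB flushes\n", tlb_flushes);	/* 512 */

	tlb_flushes = 0;
	optimize_folios(512, 1);
	printf("batched flushing: %d TLB flush\n", tlb_flushes);	/* 1 */
	return 0;
}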
On 2023/9/19 07:01, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> In an effort to minimize amount of TLB flushes, batch all PMD splits
> belonging to a range of pages in order to perform only 1 (global) TLB
> flush.
[...]
> @@ -27,6 +27,7 @@
>   * @reuse_addr:     the virtual address of the @reuse_page page.
>   * @vmemmap_pages:  the list head of the vmemmap pages that can be freed
>   *                  or is mapped from.
> + * @flags:          used to modify behavior in bulk operations

Better to describe it as "used to modify behavior in vmemmap page table
walking operations"

[...]
>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>  {
>      struct folio *folio;
>      LIST_HEAD(vmemmap_pages);
>
> +    list_for_each_entry(folio, folio_list, lru)
> +        hugetlb_vmemmap_split(h, &folio->page);
> +
> +    flush_tlb_all();
> +
>      list_for_each_entry(folio, folio_list, lru) {
>          int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>                          &vmemmap_pages);

This is unlikely to be failed since the page table allocation
is moved to the above (Note that the head vmemmap page allocation
is not mandatory). So we should handle the error case in the above
splitting operation.

Thanks.
On 19/09/2023 07:27, Muchun Song wrote:
> On 2023/9/19 07:01, Mike Kravetz wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
>>
>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>> belonging to a range of pages in order to perform only 1 (global) TLB
>> flush.
[...]
>>      struct list_head   *vmemmap_pages;
>> +#define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)
>
> Please add a brief comment following this macro to explain what's the
> behavior.
>

/* Skip the TLB flush when we split the PMD */

And will also do it in the next patch with:

/* Skip the TLB flush when we remap the PTE */

[...]
>> -        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>> +        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
>> +                walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
>
> !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)?
>

Yeah -- Gah, I must be very distracted. Thanks
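The inversion acknowledged above can be illustrated with a minimal userspace
sketch (hypothetical C, not the kernel code itself): passing the raw flag
test as the flush argument asks for a flush exactly when the caller requested
that it be skipped, so the test has to be negated.

#include <stdbool.h>
#include <stdio.h>

#define BIT(n)				(1UL << (n))
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)

/* Stand-in for split_vmemmap_huge_pmd(..., bool flush). */
static void split(bool flush)
{
	printf("TLB flush after split: %s\n", flush ? "yes" : "no");
}

int main(void)
{
	unsigned long flags = VMEMMAP_SPLIT_NO_TLB_FLUSH;

	/* Buggy call: a set NO_TLB_FLUSH bit turns into flush == true. */
	split(flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);

	/* Fixed call: negating the test makes the flag skip the flush. */
	split(!(flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));

	return 0;
}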
On 19/09/2023 07:42, Muchun Song wrote:
> On 2023/9/19 07:01, Mike Kravetz wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
[...]
>> + * @flags:          used to modify behavior in bulk operations
>
> Better to describe it as "used to modify behavior in vmemmap page table walking
> operations"
>

OK

[...]
>>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>>  {
>>      struct folio *folio;
>>      LIST_HEAD(vmemmap_pages);
>> +    list_for_each_entry(folio, folio_list, lru)
>> +        hugetlb_vmemmap_split(h, &folio->page);
>> +
>> +    flush_tlb_all();
>> +
>>      list_for_each_entry(folio, folio_list, lru) {
>>          int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>                          &vmemmap_pages);
>
> This is unlikely to be failed since the page table allocation
> is moved to the above
> (Note that the head vmemmap page allocation
> is not mandatory).

Good point that I almost forgot

> So we should handle the error case in the above
> splitting operation.

But back to the previous discussion in v2... the thinking was that /some/ PMDs
got split, and say could allow some PTE remapping to occur and free some pages
back (each page allows 6 more splits worst case). Then the next
__hugetlb_vmemmap_optimize() will have to split PMD pages again for those
hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
flush in this stage).

Unless this isn't something worth handling

	Joao
> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 19/09/2023 07:42, Muchun Song wrote:
>> On 2023/9/19 07:01, Mike Kravetz wrote:
[...]
>>>      list_for_each_entry(folio, folio_list, lru) {
>>>          int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
>>>                          &vmemmap_pages);
>>
>> This is unlikely to be failed since the page table allocation
>> is moved to the above
>> (Note that the head vmemmap page allocation
>> is not mandatory).
>
> Good point that I almost forgot
>
>> So we should handle the error case in the above
>> splitting operation.
>
> But back to the previous discussion in v2... the thinking was that /some/ PMDs
> got split, and say could allow some PTE remapping to occur and free some pages
> back (each page allows 6 more splits worst case). Then the next
> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
> flush in this stage).

Oh, yes. Maybe we could break the above traversal as early as possible
once we enter an ENOMEM?

>
> Unless this isn't something worth handling
>
> Joao
On 19/09/2023 09:41, Muchun Song wrote:
>> On Sep 19, 2023, at 16:26, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 19/09/2023 07:42, Muchun Song wrote:
>>> On 2023/9/19 07:01, Mike Kravetz wrote:
[...]
>> But back to the previous discussion in v2... the thinking was that /some/ PMDs
>> got split, and say could allow some PTE remapping to occur and free some pages
>> back (each page allows 6 more splits worst case). Then the next
>> __hugetlb_vmemmap_optimize() will have to split PMD pages again for those
>> hugepages that failed the batch PMD split (as we only defer the PTE remap tlb
>> flush in this stage).
>
> Oh, yes. Maybe we could break the above traversal as early as possible
> once we enter an ENOMEM?
>

Sounds good -- no point in keep trying to split if we are failing with OOM.

Perhaps a comment in both of these clauses (the early break on split and the OOM
handling in batch optimize) could help make this clear.
> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 19/09/2023 09:41, Muchun Song wrote:
[...]
>> Oh, yes. Maybe we could break the above traversal as early as possible
>> once we enter an ENOMEM?
>
> Sounds good -- no point in keep trying to split if we are failing with OOM.
>
> Perhaps a comment in both of these clauses (the early break on split and the OOM
> handling in batch optimize) could help make this clear.

Make sense. Thanks.
On 19/09/2023 09:57, Muchun Song wrote:
>> On Sep 19, 2023, at 16:55, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 19/09/2023 09:41, Muchun Song wrote:
[...]
>>> Oh, yes. Maybe we could break the above traversal as early as possible
>>> once we enter an ENOMEM?
>>
>> Sounds good -- no point in keep trying to split if we are failing with OOM.
>>
>> Perhaps a comment in both of these clauses (the early break on split and the OOM
>> handling in batch optimize) could help make this clear.
>
> Make sense.

These are the changes I have so far for this patch based on the discussion so
far. For next one it's at the end:

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index e8bc2f7567db..d9c6f2cf698c 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -27,7 +27,8 @@
  * @reuse_addr:     the virtual address of the @reuse_page page.
  * @vmemmap_pages:  the list head of the vmemmap pages that can be freed
  *                  or is mapped from.
- * @flags:          used to modify behavior in bulk operations
+ * @flags:          used to modify behavior in vmemmap page table walking
+ *                  operations.
  */
 struct vmemmap_remap_walk {
     void            (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -36,6 +37,8 @@ struct vmemmap_remap_walk {
     struct page        *reuse_page;
     unsigned long      reuse_addr;
     struct list_head   *vmemmap_pages;
+
+/* Skip the TLB flush when we split the PMD */
 #define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)
     unsigned long      flags;
 };
@@ -132,7 +135,7 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
         int ret;

         ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
-                walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
+                !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
         if (ret)
             return ret;

@@ -677,13 +680,13 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
     free_vmemmap_page_list(&vmemmap_pages);
 }

-static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
 {
     unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
     unsigned long vmemmap_reuse;

     if (!vmemmap_should_optimize(h, head))
-        return;
+        return 0;

     vmemmap_end   = vmemmap_start + hugetlb_vmemmap_size(h);
     vmemmap_reuse = vmemmap_start;
@@ -693,7 +696,7 @@ static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
      * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
      * @vmemmap_end]
      */
-    vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+    return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
 }

 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
@@ -701,8 +704,18 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
     struct folio *folio;
     LIST_HEAD(vmemmap_pages);

-    list_for_each_entry(folio, folio_list, lru)
-        hugetlb_vmemmap_split(h, &folio->page);
+    list_for_each_entry(folio, folio_list, lru) {
+        int ret = hugetlb_vmemmap_split(h, &folio->page);
+
+        /*
+         * Spliting the PMD requires allocating a page, thus lets fail
+         * early once we encounter the first OOM. No point in retrying
+         * as it can be dynamically done on remap with the memory
+         * we get back from the vmemmap deduplication.
+         */
+        if (ret == -ENOMEM)
+            break;
+    }

     flush_tlb_all();

For patch 7, I only have commentary added derived from this earlier discussion
above:

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d9c6f2cf698c..f6a1020a4b6a 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -40,6 +40,8 @@ struct vmemmap_remap_walk {

 /* Skip the TLB flush when we split the PMD */
 #define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)
+/* Skip the TLB flush when we remap the PTE */
 #define VMEMMAP_REMAP_NO_TLB_FLUSH    BIT(1)
     unsigned long      flags;
 };

@@ -721,19 +739,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l

     list_for_each_entry(folio, folio_list, lru) {
         int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
                         &vmemmap_pages,
                         VMEMMAP_REMAP_NO_TLB_FLUSH);

         /*
          * Pages to be freed may have been accumulated. If we
          * encounter an ENOMEM,  free what we have and try again.
+         * This can occur in the case that both spliting fails
+         * halfway and head page allocation also failed. In this
+         * case __hugetlb_vmemmap_optimize() would free memory
+         * allowing more vmemmap remaps to occur.
          */
         if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
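The retry pattern referenced in the patch 7 comment above can be sketched in
plain C. This is a hypothetical userspace model, not kernel code; the names
and numbers (optimize_one(), 7 pages per folio) are made up purely for
illustration: if an optimize step fails with -ENOMEM while freed-page
candidates have accumulated, releasing them first gives the allocator room
for one more attempt.

#include <errno.h>
#include <stdio.h>

static int accumulated;	/* vmemmap pages gathered but not yet freed */

/* Stand-in for __hugetlb_vmemmap_optimize(): fails when memory is tight. */
static int optimize_one(int have_memory)
{
	if (!have_memory)
		return -ENOMEM;
	accumulated += 7;	/* e.g. 7 vmemmap pages freed per 2MB folio */
	return 0;
}

static void free_accumulated(void)
{
	printf("freeing %d accumulated pages\n", accumulated);
	accumulated = 0;
}

int main(void)
{
	accumulated = 7;		/* earlier folios already optimized */
	int ret = optimize_one(0);	/* this folio hits -ENOMEM */

	if (ret == -ENOMEM && accumulated > 0) {
		free_accumulated();	/* give memory back ... */
		ret = optimize_one(1);	/* ... and retry this folio once */
	}
	printf("final ret = %d\n", ret);
	return 0;
}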
> On Sep 19, 2023, at 23:09, Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 19/09/2023 09:57, Muchun Song wrote:
[...]
> These are the changes I have so far for this patch based on the discussion so
> far. For next one it's at the end:

Code looks good to me. One nit below.

[...]
> -    list_for_each_entry(folio, folio_list, lru)
> -        hugetlb_vmemmap_split(h, &folio->page);
> +    list_for_each_entry(folio, folio_list, lru) {
> +        int ret = hugetlb_vmemmap_split(h, &folio->page);
> +
> +        /*
> +         * Spliting the PMD requires allocating a page, thus lets fail
              ^^^^               ^^^
              Splitting          page table page

I'd like to specify the functionality of the allocated page.

> +         * early once we encounter the first OOM. No point in retrying
> +         * as it can be dynamically done on remap with the memory
> +         * we get back from the vmemmap deduplication.
> +         */
> +        if (ret == -ENOMEM)
> +            break;
> +    }
>
>      flush_tlb_all();
>
> For patch 7, I only have commentary added derived from this earlier discussion
> above:
[...]
>          /*
>           * Pages to be freed may have been accumulated. If we
>           * encounter an ENOMEM,  free what we have and try again.
> +         * This can occur in the case that both spliting fails
                                                   ^^^
                                                   splitting

> +         * halfway and head page allocation also failed. In this
                          ^^^^^^^
                          head vmemmap page

Otherwise:

Reviewed-by: Muchun Song <songmuchun@bytedance.com>

Thanks.

> +         * case __hugetlb_vmemmap_optimize() would free memory
> +         * allowing more vmemmap remaps to occur.
>           */
>          if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
>
On 20/09/2023 03:47, Muchun Song wrote:
>> On Sep 19, 2023, at 23:09, Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 19/09/2023 09:57, Muchun Song wrote:
[...]
>> These are the changes I have so far for this patch based on the discussion so
>> far. For next one it's at the end:
>
> Code looks good to me. One nit below.
>
Thanks

[...]
>> +        /*
>> +         * Spliting the PMD requires allocating a page, thus lets fail
> ^^^^ ^^^
> Splitting page table page
>
> I'd like to specify the functionality of the allocated page.
>
OK

[...]
>> +         * This can occur in the case that both spliting fails
> ^^^
> splitting
>
ok

>> +         * halfway and head page allocation also failed. In this
> ^^^^^^^
> head vmemmap page
>
ok

> Otherwise:
>
> Reviewed-by: Muchun Song <songmuchun@bytedance.com>
>

Thanks, I assume that's for both patches?

> Thanks.
>
>> +         * case __hugetlb_vmemmap_optimize() would free memory
>> +         * allowing more vmemmap remaps to occur.
>>           */
>>          if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
>>
> On Sep 20, 2023, at 18:39, Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 20/09/2023 03:47, Muchun Song wrote:
>>> On Sep 19, 2023, at 23:09, Joao Martins <joao.m.martins@oracle.com> wrote:
[...]
>> Otherwise:
>>
>> Reviewed-by: Muchun Song <songmuchun@bytedance.com>
>>
>
> Thanks, I assume that's for both patches?

Yes. Thanks.

>
>> Thanks.
>>
>>> +         * case __hugetlb_vmemmap_optimize() would free memory
>>> +         * allowing more vmemmap remaps to occur.
>>>           */
>>>          if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 147ed15bcae4..e8bc2f7567db 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -27,6 +27,7 @@
  * @reuse_addr:     the virtual address of the @reuse_page page.
  * @vmemmap_pages:  the list head of the vmemmap pages that can be freed
  *                  or is mapped from.
+ * @flags:          used to modify behavior in bulk operations
  */
 struct vmemmap_remap_walk {
     void            (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -35,9 +36,11 @@ struct vmemmap_remap_walk {
     struct page        *reuse_page;
     unsigned long      reuse_addr;
     struct list_head   *vmemmap_pages;
+#define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)
+    unsigned long      flags;
 };

-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
 {
     pmd_t __pmd;
     int i;
@@ -80,7 +83,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
         /* Make pte visible before pmd. See comment in pmd_install(). */
         smp_wmb();
         pmd_populate_kernel(&init_mm, pmd, pgtable);
-        flush_tlb_kernel_range(start, start + PMD_SIZE);
+        if (flush)
+            flush_tlb_kernel_range(start, start + PMD_SIZE);
     } else {
         pte_free_kernel(&init_mm, pgtable);
     }
@@ -127,11 +131,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
     do {
         int ret;

-        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
+                walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH);
         if (ret)
             return ret;

         next = pmd_addr_end(addr, end);
+
+        /*
+         * We are only splitting, not remapping the hugetlb vmemmap
+         * pages.
+         */
+        if (!walk->remap_pte)
+            continue;
+
         vmemmap_pte_range(pmd, addr, next, walk);
     } while (pmd++, addr = next, addr != end);

@@ -198,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
             return ret;
     } while (pgd++, addr = next, addr != end);

-    flush_tlb_kernel_range(start, end);
+    if (walk->remap_pte)
+        flush_tlb_kernel_range(start, end);

     return 0;
 }
@@ -300,6 +314,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
     set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }

+/**
+ * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
+ *                       backing PMDs of the directmap into PTEs
+ * @start:     start address of the vmemmap virtual address range that we want
+ *             to remap.
+ * @end:       end address of the vmemmap virtual address range that we want to
+ *             remap.
+ * @reuse:     reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_split(unsigned long start, unsigned long end,
+                unsigned long reuse)
+{
+    int ret;
+    struct vmemmap_remap_walk walk = {
+        .remap_pte  = NULL,
+        .flags      = VMEMMAP_SPLIT_NO_TLB_FLUSH,
+    };
+
+    /* See the comment in the vmemmap_remap_free(). */
+    BUG_ON(start - reuse != PAGE_SIZE);
+
+    mmap_read_lock(&init_mm);
+    ret = vmemmap_remap_range(reuse, end, &walk);
+    mmap_read_unlock(&init_mm);
+
+    return ret;
+}
+
 /**
  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  *                      to the page which @reuse is mapped to, then free vmemmap
@@ -323,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
         .remap_pte      = vmemmap_remap_pte,
         .reuse_addr     = reuse,
         .vmemmap_pages  = vmemmap_pages,
+        .flags          = 0,
     };
     int nid = page_to_nid((struct page *)reuse);
     gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
@@ -371,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
             .remap_pte      = vmemmap_restore_pte,
             .reuse_addr     = reuse,
             .vmemmap_pages  = vmemmap_pages,
+            .flags          = 0,
         };

         vmemmap_remap_range(reuse, end, &walk);
@@ -422,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
         .remap_pte      = vmemmap_restore_pte,
         .reuse_addr     = reuse,
         .vmemmap_pages  = &vmemmap_pages,
+        .flags          = 0,
     };

     /* See the comment in the vmemmap_remap_free(). */
@@ -630,11 +677,35 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
     free_vmemmap_page_list(&vmemmap_pages);
 }

+static void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+{
+    unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+    unsigned long vmemmap_reuse;
+
+    if (!vmemmap_should_optimize(h, head))
+        return;
+
+    vmemmap_end   = vmemmap_start + hugetlb_vmemmap_size(h);
+    vmemmap_reuse = vmemmap_start;
+    vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+    /*
+     * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
+     * @vmemmap_end]
+     */
+    vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
+}
+
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 {
     struct folio *folio;
     LIST_HEAD(vmemmap_pages);

+    list_for_each_entry(folio, folio_list, lru)
+        hugetlb_vmemmap_split(h, &folio->page);
+
+    flush_tlb_all();
+
     list_for_each_entry(folio, folio_list, lru) {
         int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
                         &vmemmap_pages);