| Message ID | 20230905214412.89152-11-mike.kravetz@oracle.com (mailing list archive) |
|---|---|
| State | New |
| Series | Batch hugetlb vmemmap modification operations |
On 2023/9/6 05:44, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> Now that a list of pages is deduplicated at once, the TLB
> flush can be batched for all vmemmap pages that got remapped.
>
> Add a flags field and pass whether it's a bulk allocation or
> just a single page to decide to remap.
>
> The TLB flush is global as we don't have guarantees from caller
> that the set of folios is contiguous, or to add complexity in
> composing a list of kVAs to flush.
>
> Modified by Mike Kravetz to perform TLB flush on single folio if an
> error is encountered.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++++++++++++--------
>  1 file changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index d956551699bc..8c85e2c38538 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -27,6 +27,7 @@
>   * @reuse_addr: the virtual address of the @reuse_page page.
>   * @vmemmap_pages: the list head of the vmemmap pages that can be freed
>   *                 or is mapped from.
> + * @flags: used to modify behavior in bulk operations
>   */
>  struct vmemmap_remap_walk {
>          void (*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -35,6 +36,8 @@ struct vmemmap_remap_walk {
>          struct page *reuse_page;
>          unsigned long reuse_addr;
>          struct list_head *vmemmap_pages;
> +#define VMEMMAP_NO_TLB_FLUSH BIT(0)
> +        unsigned long flags;
>  };
>
>  static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
> @@ -208,7 +211,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>                          return ret;
>          } while (pgd++, addr = next, addr != end);
>
> -        if (walk->remap_pte)
> +        if (walk->remap_pte && !(walk->flags & VMEMMAP_NO_TLB_FLUSH))
>                  flush_tlb_kernel_range(start, end);
>
>          return 0;
> @@ -348,12 +351,14 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>   * @reuse: reuse address.
>   * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
>   *                 responsibility to free pages.
> + * @flags: modifications to vmemmap_remap_walk flags
>   *
>   * Return: %0 on success, negative error code otherwise.
>   */
>  static int vmemmap_remap_free(unsigned long start, unsigned long end,
>                                unsigned long reuse,
> -                              struct list_head *vmemmap_pages)
> +                              struct list_head *vmemmap_pages,
> +                              unsigned long flags)
>  {
>          int ret;
>          LIST_HEAD(freed_pages);
> @@ -361,6 +366,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>                  .remap_pte = vmemmap_remap_pte,
>                  .reuse_addr = reuse,
>                  .vmemmap_pages = &freed_pages,
> +                .flags = flags,
>          };
>          int nid = page_to_nid((struct page *)start);
>          gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
> @@ -410,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>                  .remap_pte = vmemmap_restore_pte,
>                  .reuse_addr = reuse,
>                  .vmemmap_pages = &freed_pages,
> +                .flags = 0,
>          };
>
>          vmemmap_remap_range(reuse, end, &walk);
> @@ -597,7 +604,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
>
>  static void __hugetlb_vmemmap_optimize(const struct hstate *h,
>                                         struct page *head,
> -                                       struct list_head *vmemmap_pages)
> +                                       struct list_head *vmemmap_pages,
> +                                       unsigned long flags)
>  {
>          unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>          unsigned long vmemmap_reuse;
> @@ -607,6 +615,18 @@ static void __hugetlb_vmemmap_optimize(const struct hstate *h,
>                  return;
>
>          static_branch_inc(&hugetlb_optimize_vmemmap_key);
> +        /*
> +         * Very Subtle
> +         * If VMEMMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
> +         * immediately after remapping. As a result, subsequent accesses
> +         * and modifications to struct pages associated with the hugetlb
> +         * page could bet to the OLD struct pages. Set the vmemmap optimized
> +         * flag here so that it is copied to the new head page. This keeps
> +         * the old and new struct pages in sync.
> +         * If there is an error during optimization, we will immediately FLUSH
> +         * the TLB and clear the flag below.
> +         */
> +        SetHPageVmemmapOptimized(head);
>
>          vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
>          vmemmap_reuse = vmemmap_start;
> @@ -618,10 +638,10 @@ static void __hugetlb_vmemmap_optimize(const struct hstate *h,
>           * mapping the range to vmemmap_pages list so that they can be freed by
>           * the caller.
>           */
> -        if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages))
> +        if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags)) {
>                  static_branch_dec(&hugetlb_optimize_vmemmap_key);
> -        else
> -                SetHPageVmemmapOptimized(head);
> +                ClearHPageVmemmapOptimized(head);
> +        }
>  }
>
>  /**
> @@ -638,7 +658,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>  {
>          LIST_HEAD(vmemmap_pages);
>
> -        __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
> +        __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0UL);

UL suffix could be dropped. Right?

>          free_vmemmap_page_list(&vmemmap_pages);
>  }
>
> @@ -672,7 +692,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
>          flush_tlb_all();
>
>          list_for_each_entry(folio, folio_list, lru)
> -                __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
> +                __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages, VMEMMAP_NO_TLB_FLUSH);
> +
> +        flush_tlb_all();
>
>          free_vmemmap_page_list(&vmemmap_pages);
>  }
On 09/07/23 14:55, Muchun Song wrote:
>
>
> On 2023/9/6 05:44, Mike Kravetz wrote:
> > From: Joao Martins <joao.m.martins@oracle.com>
> >
> > Now that a list of pages is deduplicated at once, the TLB
> > flush can be batched for all vmemmap pages that got remapped.
> >
> > Add a flags field and pass whether it's a bulk allocation or
> > just a single page to decide to remap.
> >
> > The TLB flush is global as we don't have guarantees from caller
> > that the set of folios is contiguous, or to add complexity in
> > composing a list of kVAs to flush.
> >
> > Modified by Mike Kravetz to perform TLB flush on single folio if an
> > error is encountered.
> >
> > Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >  mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++++++++++++--------
> >  1 file changed, 30 insertions(+), 8 deletions(-)
> >
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > @@ -638,7 +658,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
> >  {
> >          LIST_HEAD(vmemmap_pages);
> > -        __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
> > +        __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0UL);
>
> UL suffix could be dropped. Right?

Yes, it can be dropped.

>
> >          free_vmemmap_page_list(&vmemmap_pages);
> >  }
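For readers following the nit: with the callee's prototype in scope, C implicitly converts an integer literal argument to the parameter type, here `unsigned long`, so `0` and `0UL` pass the same value and the suffix is redundant. A minimal standalone sketch of that conversion, using a hypothetical helper `takes_flags()` that is not part of the patch:

```c
#include <stdio.h>

/* Hypothetical stand-in for a function taking an unsigned long flags word. */
static void takes_flags(unsigned long flags)
{
	printf("flags = %lu\n", flags);
}

int main(void)
{
	takes_flags(0);   /* plain int 0 is converted to unsigned long */
	takes_flags(0UL); /* same value; the UL suffix adds nothing here */
	return 0;
}
```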
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d956551699bc..8c85e2c38538 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -27,6 +27,7 @@
  * @reuse_addr: the virtual address of the @reuse_page page.
  * @vmemmap_pages: the list head of the vmemmap pages that can be freed
  *                 or is mapped from.
+ * @flags: used to modify behavior in bulk operations
  */
 struct vmemmap_remap_walk {
         void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -35,6 +36,8 @@ struct vmemmap_remap_walk {
         struct page *reuse_page;
         unsigned long reuse_addr;
         struct list_head *vmemmap_pages;
+#define VMEMMAP_NO_TLB_FLUSH BIT(0)
+        unsigned long flags;
 };

 static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
@@ -208,7 +211,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
                         return ret;
         } while (pgd++, addr = next, addr != end);

-        if (walk->remap_pte)
+        if (walk->remap_pte && !(walk->flags & VMEMMAP_NO_TLB_FLUSH))
                 flush_tlb_kernel_range(start, end);

         return 0;
@@ -348,12 +351,14 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
  * @reuse: reuse address.
  * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
  *                 responsibility to free pages.
+ * @flags: modifications to vmemmap_remap_walk flags
  *
  * Return: %0 on success, negative error code otherwise.
  */
 static int vmemmap_remap_free(unsigned long start, unsigned long end,
                               unsigned long reuse,
-                              struct list_head *vmemmap_pages)
+                              struct list_head *vmemmap_pages,
+                              unsigned long flags)
 {
         int ret;
         LIST_HEAD(freed_pages);
@@ -361,6 +366,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
                 .remap_pte = vmemmap_remap_pte,
                 .reuse_addr = reuse,
                 .vmemmap_pages = &freed_pages,
+                .flags = flags,
         };
         int nid = page_to_nid((struct page *)start);
         gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
@@ -410,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
                 .remap_pte = vmemmap_restore_pte,
                 .reuse_addr = reuse,
                 .vmemmap_pages = &freed_pages,
+                .flags = 0,
         };

         vmemmap_remap_range(reuse, end, &walk);
@@ -597,7 +604,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h

 static void __hugetlb_vmemmap_optimize(const struct hstate *h,
                                        struct page *head,
-                                       struct list_head *vmemmap_pages)
+                                       struct list_head *vmemmap_pages,
+                                       unsigned long flags)
 {
         unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
         unsigned long vmemmap_reuse;
@@ -607,6 +615,18 @@ static void __hugetlb_vmemmap_optimize(const struct hstate *h,
                 return;

         static_branch_inc(&hugetlb_optimize_vmemmap_key);
+        /*
+         * Very Subtle
+         * If VMEMMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
+         * immediately after remapping. As a result, subsequent accesses
+         * and modifications to struct pages associated with the hugetlb
+         * page could bet to the OLD struct pages. Set the vmemmap optimized
+         * flag here so that it is copied to the new head page. This keeps
+         * the old and new struct pages in sync.
+         * If there is an error during optimization, we will immediately FLUSH
+         * the TLB and clear the flag below.
+         */
+        SetHPageVmemmapOptimized(head);

         vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
         vmemmap_reuse = vmemmap_start;
@@ -618,10 +638,10 @@ static void __hugetlb_vmemmap_optimize(const struct hstate *h,
          * mapping the range to vmemmap_pages list so that they can be freed by
          * the caller.
          */
-        if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages))
+        if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags)) {
                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
-        else
-                SetHPageVmemmapOptimized(head);
+                ClearHPageVmemmapOptimized(head);
+        }
 }

 /**
@@ -638,7 +658,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
         LIST_HEAD(vmemmap_pages);

-        __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
+        __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0UL);
         free_vmemmap_page_list(&vmemmap_pages);
 }

@@ -672,7 +692,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
         flush_tlb_all();

         list_for_each_entry(folio, folio_list, lru)
-                __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
+                __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages, VMEMMAP_NO_TLB_FLUSH);
+
+        flush_tlb_all();

         free_vmemmap_page_list(&vmemmap_pages);
 }
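To summarize the calling pattern this patch produces: each per-folio remap skips its own TLB flush when VMEMMAP_NO_TLB_FLUSH is passed, and the caller issues one global flush after the whole folio list has been processed, while the single-folio path keeps the ranged flush. Below is a userspace sketch of that shape with hypothetical stub functions; it illustrates the batching idea only and is not the kernel API:

```c
#include <stdio.h>

#define NO_TLB_FLUSH	(1UL << 0)	/* stand-in for VMEMMAP_NO_TLB_FLUSH */

static void flush_tlb_all_stub(void)
{
	printf("one global TLB flush for the whole batch\n");
}

static void flush_tlb_range_stub(int folio)
{
	printf("ranged TLB flush for folio %d\n", folio);
}

/* Stand-in for the per-folio vmemmap remap step. */
static void remap_one(int folio, unsigned long flags)
{
	printf("remap vmemmap of folio %d\n", folio);
	if (!(flags & NO_TLB_FLUSH))
		flush_tlb_range_stub(folio);	/* single-folio path flushes here */
}

int main(void)
{
	/* Batched path: defer per-folio flushes, flush once at the end. */
	for (int folio = 0; folio < 3; folio++)
		remap_one(folio, NO_TLB_FLUSH);
	flush_tlb_all_stub();

	/* Single-folio path keeps its own flush. */
	remap_one(42, 0);
	return 0;
}
```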