
[v2,10/11] hugetlb: batch TLB flushes when freeing vmemmap

Message ID 20230905214412.89152-11-mike.kravetz@oracle.com (mailing list archive)
State New
Series Batch hugetlb vmemmap modification operations

Commit Message

Mike Kravetz Sept. 5, 2023, 9:44 p.m. UTC
From: Joao Martins <joao.m.martins@oracle.com>

Now that a list of pages is deduplicated at once, the TLB
flush can be batched for all vmemmap pages that got remapped.

Add a flags field to the walker and use it to indicate whether the
caller is performing a bulk operation or remapping just a single
page, so the walker knows whether to defer the TLB flush.

The TLB flush is global because there is no guarantee from the
caller that the set of folios is contiguous, and composing a list
of kernel VAs to flush would add complexity.

Modified by Mike Kravetz to perform a TLB flush on a single folio
if an error is encountered.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)
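
In outline, the batched flow reads as in the sketch below. This is a condensed,
non-compilable sketch of the hunks that follow (all identifiers are taken from
the patch; the PMD-splitting pass and error handling are elided):

/* Skip the per-range flush when the caller has deferred it. */
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	/* ... walk the page tables, invoking walk->remap_pte() ... */

	if (walk->remap_pte && !(walk->flags & VMEMMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	/* ... split vmemmap PMDs for the whole list, then flush_tlb_all() ... */

	/* Remap every folio in the list without flushing per range. */
	list_for_each_entry(folio, folio_list, lru)
		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages,
					   VMEMMAP_NO_TLB_FLUSH);

	/* One global flush covers all of the remapped vmemmap ranges. */
	flush_tlb_all();

	free_vmemmap_page_list(&vmemmap_pages);
}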

Comments

Muchun Song Sept. 7, 2023, 6:55 a.m. UTC | #1
On 2023/9/6 05:44, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> Now that a list of pages is deduplicated at once, the TLB
> flush can be batched for all vmemmap pages that got remapped.
>
> Add a flags field to the walker and use it to indicate whether the
> caller is performing a bulk operation or remapping just a single
> page, so the walker knows whether to defer the TLB flush.
>
> The TLB flush is global because there is no guarantee from the
> caller that the set of folios is contiguous, and composing a list
> of kernel VAs to flush would add complexity.
>
> Modified by Mike Kravetz to perform a TLB flush on a single folio
> if an error is encountered.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++++++++++++--------
>   1 file changed, 30 insertions(+), 8 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index d956551699bc..8c85e2c38538 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -27,6 +27,7 @@
>    * @reuse_addr:		the virtual address of the @reuse_page page.
>    * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
>    *			or is mapped from.
> + * @flags:		used to modify behavior in bulk operations
>    */
>   struct vmemmap_remap_walk {
>   	void			(*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -35,6 +36,8 @@ struct vmemmap_remap_walk {
>   	struct page		*reuse_page;
>   	unsigned long		reuse_addr;
>   	struct list_head	*vmemmap_pages;
> +#define VMEMMAP_NO_TLB_FLUSH		BIT(0)
> +	unsigned long		flags;
>   };
>   
>   static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
> @@ -208,7 +211,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>   			return ret;
>   	} while (pgd++, addr = next, addr != end);
>   
> -	if (walk->remap_pte)
> +	if (walk->remap_pte && !(walk->flags & VMEMMAP_NO_TLB_FLUSH))
>   		flush_tlb_kernel_range(start, end);
>   
>   	return 0;
> @@ -348,12 +351,14 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>    * @reuse:	reuse address.
>    * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
>    *		responsibility to free pages.
> + * @flags:	modifications to vmemmap_remap_walk flags
>    *
>    * Return: %0 on success, negative error code otherwise.
>    */
>   static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   			      unsigned long reuse,
> -			      struct list_head *vmemmap_pages)
> +			      struct list_head *vmemmap_pages,
> +			      unsigned long flags)
>   {
>   	int ret;
>   	LIST_HEAD(freed_pages);
> @@ -361,6 +366,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   		.remap_pte	= vmemmap_remap_pte,
>   		.reuse_addr	= reuse,
>   		.vmemmap_pages	= &freed_pages,
> +		.flags		= flags,
>   	};
>   	int nid = page_to_nid((struct page *)start);
>   	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
> @@ -410,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   			.remap_pte	= vmemmap_restore_pte,
>   			.reuse_addr	= reuse,
>   			.vmemmap_pages	= &freed_pages,
> +			.flags		= 0,
>   		};
>   
>   		vmemmap_remap_range(reuse, end, &walk);
> @@ -597,7 +604,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
>   
>   static void __hugetlb_vmemmap_optimize(const struct hstate *h,
>   					struct page *head,
> -					struct list_head *vmemmap_pages)
> +					struct list_head *vmemmap_pages,
> +					unsigned long flags)
>   {
>   	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>   	unsigned long vmemmap_reuse;
> @@ -607,6 +615,18 @@ static void __hugetlb_vmemmap_optimize(const struct hstate *h,
>   		return;
>   
>   	static_branch_inc(&hugetlb_optimize_vmemmap_key);
> +	/*
> +	 * Very Subtle
> +	 * If VMEMMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
> +	 * immediately after remapping.  As a result, subsequent accesses
> +	 * and modifications to struct pages associated with the hugetlb
> +	 * page could be to the OLD struct pages.  Set the vmemmap optimized
> +	 * flag here so that it is copied to the new head page.  This keeps
> +	 * the old and new struct pages in sync.
> +	 * If there is an error during optimization, we will immediately FLUSH
> +	 * the TLB and clear the flag below.
> +	 */
> +	SetHPageVmemmapOptimized(head);
>   
>   	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
>   	vmemmap_reuse	= vmemmap_start;
> @@ -618,10 +638,10 @@ static void __hugetlb_vmemmap_optimize(const struct hstate *h,
>   	 * mapping the range to vmemmap_pages list so that they can be freed by
>   	 * the caller.
>   	 */
> -	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages))
> +	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags)) {
>   		static_branch_dec(&hugetlb_optimize_vmemmap_key);
> -	else
> -		SetHPageVmemmapOptimized(head);
> +		ClearHPageVmemmapOptimized(head);
> +	}
>   }
>   
>   /**
> @@ -638,7 +658,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   {
>   	LIST_HEAD(vmemmap_pages);
>   
> -	__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
> +	__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0UL);

UL suffix could be dropped. Right?

>   	free_vmemmap_page_list(&vmemmap_pages);
>   }
>   
> @@ -672,7 +692,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
>   	flush_tlb_all();
>   
>   	list_for_each_entry(folio, folio_list, lru)
> -		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
> +		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages, VMEMMAP_NO_TLB_FLUSH);
> +
> +	flush_tlb_all();
>   
>   	free_vmemmap_page_list(&vmemmap_pages);
>   }
Mike Kravetz Sept. 7, 2023, 6:57 p.m. UTC | #2
On 09/07/23 14:55, Muchun Song wrote:
> 
> 
> On 2023/9/6 05:44, Mike Kravetz wrote:
> > From: Joao Martins <joao.m.martins@oracle.com>
> > 
> > Now that a list of pages is deduplicated at once, the TLB
> > flush can be batched for all vmemmap pages that got remapped.
> > 
> > Add a flags field to the walker and use it to indicate whether the
> > caller is performing a bulk operation or remapping just a single
> > page, so the walker knows whether to defer the TLB flush.
> > 
> > The TLB flush is global because there is no guarantee from the
> > caller that the set of folios is contiguous, and composing a list
> > of kernel VAs to flush would add complexity.
> > 
> > Modified by Mike Kravetz to perform a TLB flush on a single folio
> > if an error is encountered.
> > 
> > Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >   mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++++++++++++--------
> >   1 file changed, 30 insertions(+), 8 deletions(-)
> > 
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c

> > @@ -638,7 +658,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
> >   {
> >   	LIST_HEAD(vmemmap_pages);
> > -	__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
> > +	__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0UL);
> 
> UL suffix could be dropped. Right?

Yes, it can be dropped.

> 
> >   	free_vmemmap_page_list(&vmemmap_pages);
> >   }
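
For context on the question above: the flags parameter of
__hugetlb_vmemmap_optimize() is declared unsigned long, so the plain constant 0
is implicitly converted to unsigned long at the call site and the UL suffix
changes nothing. A minimal standalone illustration (take_flags is a
hypothetical stand-in, not a function from the patch):

#include <stdio.h>

static void take_flags(unsigned long flags)
{
	/* Prints the value received as an unsigned long. */
	printf("flags = %lu\n", flags);
}

int main(void)
{
	take_flags(0);		/* int constant 0 is converted to unsigned long */
	take_flags(0UL);	/* identical call; the UL suffix is redundant */
	return 0;
}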

Patch

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d956551699bc..8c85e2c38538 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -27,6 +27,7 @@ 
  * @reuse_addr:		the virtual address of the @reuse_page page.
  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
  *			or is mapped from.
+ * @flags:		used to modify behavior in bulk operations
  */
 struct vmemmap_remap_walk {
 	void			(*remap_pte)(pte_t *pte, unsigned long addr,
@@ -35,6 +36,8 @@  struct vmemmap_remap_walk {
 	struct page		*reuse_page;
 	unsigned long		reuse_addr;
 	struct list_head	*vmemmap_pages;
+#define VMEMMAP_NO_TLB_FLUSH		BIT(0)
+	unsigned long		flags;
 };
 
 static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
@@ -208,7 +211,7 @@  static int vmemmap_remap_range(unsigned long start, unsigned long end,
 			return ret;
 	} while (pgd++, addr = next, addr != end);
 
-	if (walk->remap_pte)
+	if (walk->remap_pte && !(walk->flags & VMEMMAP_NO_TLB_FLUSH))
 		flush_tlb_kernel_range(start, end);
 
 	return 0;
@@ -348,12 +351,14 @@  static int vmemmap_remap_split(unsigned long start, unsigned long end,
  * @reuse:	reuse address.
  * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
  *		responsibility to free pages.
+ * @flags:	modifications to vmemmap_remap_walk flags
  *
  * Return: %0 on success, negative error code otherwise.
  */
 static int vmemmap_remap_free(unsigned long start, unsigned long end,
 			      unsigned long reuse,
-			      struct list_head *vmemmap_pages)
+			      struct list_head *vmemmap_pages,
+			      unsigned long flags)
 {
 	int ret;
 	LIST_HEAD(freed_pages);
@@ -361,6 +366,7 @@  static int vmemmap_remap_free(unsigned long start, unsigned long end,
 		.remap_pte	= vmemmap_remap_pte,
 		.reuse_addr	= reuse,
 		.vmemmap_pages	= &freed_pages,
+		.flags		= flags,
 	};
 	int nid = page_to_nid((struct page *)start);
 	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
@@ -410,6 +416,7 @@  static int vmemmap_remap_free(unsigned long start, unsigned long end,
 			.remap_pte	= vmemmap_restore_pte,
 			.reuse_addr	= reuse,
 			.vmemmap_pages	= &freed_pages,
+			.flags		= 0,
 		};
 
 		vmemmap_remap_range(reuse, end, &walk);
@@ -597,7 +604,8 @@  static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
 
 static void __hugetlb_vmemmap_optimize(const struct hstate *h,
 					struct page *head,
-					struct list_head *vmemmap_pages)
+					struct list_head *vmemmap_pages,
+					unsigned long flags)
 {
 	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
 	unsigned long vmemmap_reuse;
@@ -607,6 +615,18 @@  static void __hugetlb_vmemmap_optimize(const struct hstate *h,
 		return;
 
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
+	/*
+	 * Very Subtle
+	 * If VMEMMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
+	 * immediately after remapping.  As a result, subsequent accesses
+	 * and modifications to struct pages associated with the hugetlb
+	 * page could be to the OLD struct pages.  Set the vmemmap optimized
+	 * flag here so that it is copied to the new head page.  This keeps
+	 * the old and new struct pages in sync.
+	 * If there is an error during optimization, we will immediately FLUSH
+	 * the TLB and clear the flag below.
+	 */
+	SetHPageVmemmapOptimized(head);
 
 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
 	vmemmap_reuse	= vmemmap_start;
@@ -618,10 +638,10 @@  static void __hugetlb_vmemmap_optimize(const struct hstate *h,
 	 * mapping the range to vmemmap_pages list so that they can be freed by
 	 * the caller.
 	 */
-	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages))
+	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags)) {
 		static_branch_dec(&hugetlb_optimize_vmemmap_key);
-	else
-		SetHPageVmemmapOptimized(head);
+		ClearHPageVmemmapOptimized(head);
+	}
 }
 
 /**
@@ -638,7 +658,7 @@  void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
 	LIST_HEAD(vmemmap_pages);
 
-	__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages);
+	__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0UL);
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
@@ -672,7 +692,9 @@  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
 	flush_tlb_all();
 
 	list_for_each_entry(folio, folio_list, lru)
-		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages);
+		__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages, VMEMMAP_NO_TLB_FLUSH);
+
+	flush_tlb_all();
 
 	free_vmemmap_page_list(&vmemmap_pages);
 }
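
Tying the error path together, a condensed, non-compilable sketch of the hunks
above (all identifiers come from the patch): the vmemmap-optimized flag is set
before remapping so that, with the flush deferred, the old and new head pages
agree, and a failed remap is undone by a restore walk whose flags are 0, which
flushes the TLB for that single folio right away before the flag is cleared.

static void __hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head,
				       struct list_head *vmemmap_pages,
				       unsigned long flags)
{
	/* ... early return if the folio should not be optimized ... */

	/* Set up front so the old and new head pages agree without a flush. */
	SetHPageVmemmapOptimized(head);

	/* ... compute vmemmap_start, vmemmap_end, vmemmap_reuse ... */

	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
			       vmemmap_pages, flags)) {
		/*
		 * The error path inside vmemmap_remap_free() restores the
		 * range with a walk whose flags are 0, i.e. it flushed the
		 * TLB for this folio immediately, so the flag can be cleared.
		 */
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		ClearHPageVmemmapOptimized(head);
	}
}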