diff mbox series

[6/9] mm/memory: split non-tlb flushing part from zap_page_range_single()

Message ID 20250310172318.653630-7-sj@kernel.org (mailing list archive)
State New
Headers show
Series mm/madvise: batch tlb flushes for MADV_DONTNEED and MADV_FREE | expand

Commit Message

SeongJae Park March 10, 2025, 5:23 p.m. UTC
Some of zap_page_range_single() callers such as [process_]madvise() with
MADV_DONEED[_LOCKED] cannot batch tlb flushes because
zap_page_range_single() does tlb flushing for each invocation.  Split
out the body of zap_page_range_single() except mmu_gather object
initialization and gathered tlb entries flushing parts for such batched
tlb flushing usage.

Signed-off-by: SeongJae Park <sj@kernel.org>
---
 mm/memory.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

Comments

Lorenzo Stoakes March 11, 2025, 12:45 p.m. UTC | #1
On Mon, Mar 10, 2025 at 10:23:15AM -0700, SeongJae Park wrote:
> Some of zap_page_range_single() callers such as [process_]madvise() with
> MADV_DONEED[_LOCKED] cannot batch tlb flushes because
> zap_page_range_single() does tlb flushing for each invocation.  Split
> out the body of zap_page_range_single() except mmu_gather object
> initialization and gathered tlb entries flushing parts for such batched
> tlb flushing usage.
>
> Signed-off-by: SeongJae Park <sj@kernel.org>
> ---
>  mm/memory.c | 36 ++++++++++++++++++++++--------------
>  1 file changed, 22 insertions(+), 14 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 78c7ee62795e..88c478e2ed1a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1995,38 +1995,46 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
>  	mmu_notifier_invalidate_range_end(&range);
>  }
>
> -/**
> - * zap_page_range_single - remove user pages in a given range
> - * @vma: vm_area_struct holding the applicable pages
> - * @address: starting address of pages to zap
> - * @size: number of bytes to zap
> - * @details: details of shared cache invalidation
> - *
> - * The range must fit into one VMA.
> - */
> -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
> +static void unmap_vma_single(struct mmu_gather *tlb,
> +		struct vm_area_struct *vma, unsigned long address,
>  		unsigned long size, struct zap_details *details)
>  {
>  	const unsigned long end = address + size;
>  	struct mmu_notifier_range range;
> -	struct mmu_gather tlb;
>
>  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
>  				address, end);
>  	hugetlb_zap_begin(vma, &range.start, &range.end);
> -	tlb_gather_mmu(&tlb, vma->vm_mm);
>  	update_hiwater_rss(vma->vm_mm);
>  	mmu_notifier_invalidate_range_start(&range);
>  	/*
>  	 * unmap 'address-end' not 'range.start-range.end' as range
>  	 * could have been expanded for hugetlb pmd sharing.
>  	 */
> -	unmap_single_vma(&tlb, vma, address, end, details, false);
> +	unmap_single_vma(tlb, vma, address, end, details, false);
>  	mmu_notifier_invalidate_range_end(&range);
> -	tlb_finish_mmu(&tlb);
>  	hugetlb_zap_end(vma, details);

Previously hugetlb_zap_end() would happen after tlb_finish_mmu(), now it happens
before?

This seems like a major problem with this change. If not you need to explain why
not in the commit message.

>  }
>
> +/**
> + * zap_page_range_single - remove user pages in a given range
> + * @vma: vm_area_struct holding the applicable pages
> + * @address: starting address of pages to zap
> + * @size: number of bytes to zap
> + * @details: details of shared cache invalidation
> + *
> + * The range must fit into one VMA.
> + */
> +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
> +		unsigned long size, struct zap_details *details)
> +{
> +	struct mmu_gather tlb;
> +
> +	tlb_gather_mmu(&tlb, vma->vm_mm);
> +	unmap_vma_single(&tlb, vma, address, size, details);
> +	tlb_finish_mmu(&tlb);
> +}
> +
>  /**
>   * zap_vma_ptes - remove ptes mapping the vma
>   * @vma: vm_area_struct holding ptes to be zapped
> --
> 2.39.5
SeongJae Park March 11, 2025, 8:58 p.m. UTC | #2
On Tue, 11 Mar 2025 12:45:44 +0000 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:

> On Mon, Mar 10, 2025 at 10:23:15AM -0700, SeongJae Park wrote:
> > Some of zap_page_range_single() callers such as [process_]madvise() with
> > MADV_DONEED[_LOCKED] cannot batch tlb flushes because
> > zap_page_range_single() does tlb flushing for each invocation.  Split
> > out the body of zap_page_range_single() except mmu_gather object
> > initialization and gathered tlb entries flushing parts for such batched
> > tlb flushing usage.
> >
> > Signed-off-by: SeongJae Park <sj@kernel.org>
> > ---
> >  mm/memory.c | 36 ++++++++++++++++++++++--------------
> >  1 file changed, 22 insertions(+), 14 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 78c7ee62795e..88c478e2ed1a 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1995,38 +1995,46 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> >  	mmu_notifier_invalidate_range_end(&range);
> >  }
> >
> > -/**
> > - * zap_page_range_single - remove user pages in a given range
> > - * @vma: vm_area_struct holding the applicable pages
> > - * @address: starting address of pages to zap
> > - * @size: number of bytes to zap
> > - * @details: details of shared cache invalidation
> > - *
> > - * The range must fit into one VMA.
> > - */
> > -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
> > +static void unmap_vma_single(struct mmu_gather *tlb,
> > +		struct vm_area_struct *vma, unsigned long address,
> >  		unsigned long size, struct zap_details *details)
> >  {
> >  	const unsigned long end = address + size;
> >  	struct mmu_notifier_range range;
> > -	struct mmu_gather tlb;
> >
> >  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
> >  				address, end);
> >  	hugetlb_zap_begin(vma, &range.start, &range.end);
> > -	tlb_gather_mmu(&tlb, vma->vm_mm);
> >  	update_hiwater_rss(vma->vm_mm);
> >  	mmu_notifier_invalidate_range_start(&range);
> >  	/*
> >  	 * unmap 'address-end' not 'range.start-range.end' as range
> >  	 * could have been expanded for hugetlb pmd sharing.
> >  	 */
> > -	unmap_single_vma(&tlb, vma, address, end, details, false);
> > +	unmap_single_vma(tlb, vma, address, end, details, false);
> >  	mmu_notifier_invalidate_range_end(&range);
> > -	tlb_finish_mmu(&tlb);
> >  	hugetlb_zap_end(vma, details);
> 
> Previously hugetlb_zap_end() would happen after tlb_finish_mmu(), now it happens
> before?
> 
> This seems like a major problem with this change.

Oh, you're right.  This could re-introduce the racy hugetlb allocation failure
problem that fixed by commit 2820b0f09be9 ("hugetlbfs: close race between
MADV_DONTNEED and page fault").  That is, this patch can make hugetlb
allocation failures increase while MADV_DONTNEED is going on.

Maybe a straightforward fix of the problem is doing hugetlb_zap_end() for all
vmas in a batched manner, similar to that for tlb flush.  For example, add a
list or an array for the vmas in 'struct madvise_behavior', let
'unmap_vma_single()' adds each vma in there, and call hugetlb_zap_end() for
gathered vmas at vector_madvise() or do_madvise().  Does that make sense?

Also Cc-ing Rik, who is the author of the commit 2820b0f09be9 ("hugetlbfs:
close race between MADV_DONTNEED and page fault") for a case that I'm missing
something important.

> If not you need to explain why
> not in the commit message.

I now think it is a problem.  If it turns out I'm wrong, I will of course add
the reason on the commit message.


Thanks,
SJ

[...]
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 78c7ee62795e..88c478e2ed1a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1995,38 +1995,46 @@  void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 	mmu_notifier_invalidate_range_end(&range);
 }
 
-/**
- * zap_page_range_single - remove user pages in a given range
- * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
- * @details: details of shared cache invalidation
- *
- * The range must fit into one VMA.
- */
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+static void unmap_vma_single(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *details)
 {
 	const unsigned long end = address + size;
 	struct mmu_notifier_range range;
-	struct mmu_gather tlb;
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
 				address, end);
 	hugetlb_zap_begin(vma, &range.start, &range.end);
-	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
 	/*
 	 * unmap 'address-end' not 'range.start-range.end' as range
 	 * could have been expanded for hugetlb pmd sharing.
 	 */
-	unmap_single_vma(&tlb, vma, address, end, details, false);
+	unmap_single_vma(tlb, vma, address, end, details, false);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_finish_mmu(&tlb);
 	hugetlb_zap_end(vma, details);
 }
 
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
+{
+	struct mmu_gather tlb;
+
+	tlb_gather_mmu(&tlb, vma->vm_mm);
+	unmap_vma_single(&tlb, vma, address, size, details);
+	tlb_finish_mmu(&tlb);
+}
+
 /**
  * zap_vma_ptes - remove ptes mapping the vma
  * @vma: vm_area_struct holding ptes to be zapped