[v3,3/4] mm: Support batched unmap for lazyfree large folios during reclamation

Message ID 20250115033808.40641-4-21cnbao@gmail.com (mailing list archive)
State New, archived
Series mm: batched unmap lazyfree large folios during reclamation

Commit Message

Barry Song Jan. 15, 2025, 3:38 a.m. UTC
From: Barry Song <v-songbaohua@oppo.com>

Currently, the PTEs and rmap entries of a large folio are removed one at
a time. This is not only slow, but because the folio becomes partially
mapped while its rmap entries are dropped one by one, it is also
unnecessarily added to the deferred_split queue, which can lead to races
between the deferred_split shrinker callback and memory reclamation.
This patch releases all PTEs and rmap entries of such a folio in one
batch. For now, only lazyfree large folios are handled.
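
For readers less familiar with the term, "lazyfree" refers to anonymous
memory marked with MADV_FREE: at reclaim time such pages are simply
discarded instead of being written to swap, provided they were not
redirtied afterwards, which is what makes it safe to drop the whole folio
in one batch. The minimal userspace sketch below (an illustration only,
not part of this patch, assuming MADV_PAGEOUT is available to force
reclaim of the range) demonstrates that behaviour:

 #include <stdio.h>
 #include <string.h>
 #include <sys/mman.h>

 int main(void)
 {
 	size_t len = 64 * 1024;	/* a single 64KiB range */
 	unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 	if (p == MAP_FAILED) {
 		perror("mmap");
 		return 1;
 	}

 	memset(p, 0xaa, len);		/* dirty the anonymous pages */
 	madvise(p, len, MADV_FREE);	/* mark them lazyfree */
 	madvise(p, len, MADV_PAGEOUT);	/* ask the kernel to reclaim the range */

 	/*
 	 * Clean lazyfree pages are discarded rather than swapped out;
 	 * touching them again yields fresh zero pages, so this prints 0x0
 	 * when the reclaim above actually succeeded.
 	 */
 	printf("first byte after reclaim: 0x%x\n", p[0]);
 	munmap(p, len);
 	return 0;
 }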

The microbenchmark below repeatedly reclaims 128MB of lazyfree large
folios whose size is 64KiB:

 #include <stdio.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <time.h>

 #define SIZE 128*1024*1024  // 128 MB

 unsigned long read_split_deferred()
 {
 	FILE *file = fopen("/sys/kernel/mm/transparent_hugepage"
			"/hugepages-64kB/stats/split_deferred", "r");
 	if (!file) {
 		perror("Error opening file");
 		return 0;
 	}

 	unsigned long value;
 	if (fscanf(file, "%lu", &value) != 1) {
 		perror("Error reading value");
 		fclose(file);
 		return 0;
 	}

 	fclose(file);
 	return value;
 }

 int main(int argc, char *argv[])
 {
 	while(1) {
 		volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

 		memset((void *)p, 1, SIZE);

 		madvise((void *)p, SIZE, MADV_FREE);

 		clock_t start_time = clock();
 		unsigned long start_split = read_split_deferred();
 		madvise((void *)p, SIZE, MADV_PAGEOUT);
 		clock_t end_time = clock();
 		unsigned long end_split = read_split_deferred();

 		double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
 		printf("Time taken by reclamation: %f seconds, split_deferred: %ld\n",
 			elapsed_time, end_split - start_split);

 		munmap((void *)p, SIZE);
 	}
 	return 0;
 }
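
To reproduce the numbers, the 64KiB anon mTHP size has to be enabled so
that memset() faults the region in as 64KiB large folios. One possible
invocation (the source file name here is arbitrary; the per-size sysfs
knob is the standard mTHP control):

 ~ # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
 ~ # gcc -O2 -o a.out lazyfree-bench.c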

w/o patch:
~ # ./a.out
Time taken by reclamation: 0.177418 seconds, split_deferred: 2048
Time taken by reclamation: 0.178348 seconds, split_deferred: 2048
Time taken by reclamation: 0.174525 seconds, split_deferred: 2048
Time taken by reclamation: 0.171620 seconds, split_deferred: 2048
Time taken by reclamation: 0.172241 seconds, split_deferred: 2048
Time taken by reclamation: 0.174003 seconds, split_deferred: 2048
Time taken by reclamation: 0.171058 seconds, split_deferred: 2048
Time taken by reclamation: 0.171993 seconds, split_deferred: 2048
Time taken by reclamation: 0.169829 seconds, split_deferred: 2048
Time taken by reclamation: 0.172895 seconds, split_deferred: 2048
Time taken by reclamation: 0.176063 seconds, split_deferred: 2048
Time taken by reclamation: 0.172568 seconds, split_deferred: 2048
Time taken by reclamation: 0.171185 seconds, split_deferred: 2048
Time taken by reclamation: 0.170632 seconds, split_deferred: 2048
Time taken by reclamation: 0.170208 seconds, split_deferred: 2048
Time taken by reclamation: 0.174192 seconds, split_deferred: 2048
...

w/ patch:
~ # ./a.out
Time taken by reclamation: 0.074231 seconds, split_deferred: 0
Time taken by reclamation: 0.071026 seconds, split_deferred: 0
Time taken by reclamation: 0.072029 seconds, split_deferred: 0
Time taken by reclamation: 0.071873 seconds, split_deferred: 0
Time taken by reclamation: 0.073573 seconds, split_deferred: 0
Time taken by reclamation: 0.071906 seconds, split_deferred: 0
Time taken by reclamation: 0.073604 seconds, split_deferred: 0
Time taken by reclamation: 0.075903 seconds, split_deferred: 0
Time taken by reclamation: 0.073191 seconds, split_deferred: 0
Time taken by reclamation: 0.071228 seconds, split_deferred: 0
Time taken by reclamation: 0.071391 seconds, split_deferred: 0
Time taken by reclamation: 0.071468 seconds, split_deferred: 0
Time taken by reclamation: 0.071896 seconds, split_deferred: 0
Time taken by reclamation: 0.072508 seconds, split_deferred: 0
Time taken by reclamation: 0.071884 seconds, split_deferred: 0
Time taken by reclamation: 0.072433 seconds, split_deferred: 0
Time taken by reclamation: 0.071939 seconds, split_deferred: 0
...

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/rmap.c | 47 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

Comments

David Hildenbrand Feb. 4, 2025, 11:38 a.m. UTC | #1
Hi,

>   	unsigned long hsz = 0;
>   
> @@ -1780,6 +1800,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>   				hugetlb_vma_unlock_write(vma);
>   			}
>   			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> +		} else if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> +				can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) {
> +			nr_pages = folio_nr_pages(folio);
> +			flush_cache_range(vma, range.start, range.end);
> +			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
> +			if (should_defer_flush(mm, flags))
> +				set_tlb_ubc_flush_pending(mm, pteval, address,
> +					address + folio_size(folio));
> +			else
> +				flush_tlb_range(vma, range.start, range.end);
>   		} else {

I have some fixes [1] that will collide with this series. I'm currently 
preparing a v2, and am not 100% sure when the fixes will get queued+merged.

I'll base them against mm-stable for now, and send them out based on 
that, to avoid the conflicts here (should all be fairly easy to resolve 
from a quick glimpse).

So we might have to refresh this series here if the fixes go in first.

[1] https://lkml.kernel.org/r/20250129115411.2077152-1-david@redhat.com
Andrew Morton Feb. 5, 2025, 2:55 a.m. UTC | #2
On Tue, 4 Feb 2025 12:38:31 +0100 David Hildenbrand <david@redhat.com> wrote:

> Hi,
> 
> >   	unsigned long hsz = 0;
> >   
> > @@ -1780,6 +1800,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >   				hugetlb_vma_unlock_write(vma);
> >   			}
> >   			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> > +		} else if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> > +				can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) {
> > +			nr_pages = folio_nr_pages(folio);
> > +			flush_cache_range(vma, range.start, range.end);
> > +			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
> > +			if (should_defer_flush(mm, flags))
> > +				set_tlb_ubc_flush_pending(mm, pteval, address,
> > +					address + folio_size(folio));
> > +			else
> > +				flush_tlb_range(vma, range.start, range.end);
> >   		} else {
> 
> I have some fixes [1] that will collide with this series. I'm currently 
> preparing a v2, and am not 100% sure when the fixes will get queued+merged.
> 
> I'll base them against mm-stable for now, and send them out based on 
> that, to avoid the conflicts here (should all be fairly easy to resolve 
> from a quick glimpse).
> 
> So we might have to refresh this series here if the fixes go in first.
> 
> [1] https://lkml.kernel.org/r/20250129115411.2077152-1-david@redhat.com

It doesn't look like "mm: fixes for device-exclusive entries (hmm)"
will be backportable(?) but yes, we should aim to stage your fixes
against mainline and ahead of other changes to at least make life
easier for anyone who chooses to backport your fixes into an earlier
kernel.
Barry Song Feb. 5, 2025, 3:35 a.m. UTC | #3
On Wed, Feb 5, 2025 at 12:38 AM David Hildenbrand <david@redhat.com> wrote:
>
> Hi,
>
> >       unsigned long hsz = 0;
> >
> > @@ -1780,6 +1800,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> >                               hugetlb_vma_unlock_write(vma);
> >                       }
> >                       pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> > +             } else if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
> > +                             can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) {
> > +                     nr_pages = folio_nr_pages(folio);
> > +                     flush_cache_range(vma, range.start, range.end);
> > +                     pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
> > +                     if (should_defer_flush(mm, flags))
> > +                             set_tlb_ubc_flush_pending(mm, pteval, address,
> > +                                     address + folio_size(folio));
> > +                     else
> > +                             flush_tlb_range(vma, range.start, range.end);
> >               } else {
>
> I have some fixes [1] that will collide with this series. I'm currently
> preparing a v2, and am not 100% sure when the fixes will get queued+merged.
>
> I'll base them against mm-stable for now, and send them out based on
> that, to avoid the conflicts here (should all be fairly easy to resolve
> from a quick glimpse).
>
> So we might have to refresh this series here if the fixes go in first.

I assume you're referring to "[PATCH v1 08/12] mm/rmap: handle
device-exclusive entries correctly in try_to_unmap_one()". It looks
straightforward to resolve the conflict. If your patch is applied first,
I'll send a rebase.

>
> [1] https://lkml.kernel.org/r/20250129115411.2077152-1-david@redhat.com
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry

Patch

diff --git a/mm/rmap.c b/mm/rmap.c
index abeb9fcec384..be1978d2712d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1642,6 +1642,25 @@  void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+			struct folio *folio, pte_t *ptep)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	int max_nr = folio_nr_pages(folio);
+	pte_t pte = ptep_get(ptep);
+
+	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+		return false;
+	if (pte_none(pte) || pte_unused(pte) || !pte_present(pte))
+		return false;
+	if (pte_pfn(pte) != folio_pfn(folio))
+		return false;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+			       NULL, NULL) == max_nr;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1655,6 +1674,7 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	bool anon_exclusive, ret = true;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	int nr_pages = 1;
 	unsigned long pfn;
 	unsigned long hsz = 0;
 
@@ -1780,6 +1800,16 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				hugetlb_vma_unlock_write(vma);
 			}
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+		} else if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+				can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) {
+			nr_pages = folio_nr_pages(folio);
+			flush_cache_range(vma, range.start, range.end);
+			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			if (should_defer_flush(mm, flags))
+				set_tlb_ubc_flush_pending(mm, pteval, address,
+					address + folio_size(folio));
+			else
+				flush_tlb_range(vma, range.start, range.end);
 		} else {
 			flush_cache_page(vma, address, pfn);
 			/* Nuke the page table entry. */
@@ -1875,7 +1905,7 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					 * redirtied either using the page table or a previously
 					 * obtained GUP reference.
 					 */
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 					folio_set_swapbacked(folio);
 					goto walk_abort;
 				} else if (ref_count != 1 + map_count) {
@@ -1888,10 +1918,10 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					 * We'll come back here later and detect if the folio was
 					 * dirtied when the additional reference is gone.
 					 */
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 					goto walk_abort;
 				}
-				dec_mm_counter(mm, MM_ANONPAGES);
+				add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
 				goto discard;
 			}
 
@@ -1943,13 +1973,18 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			dec_mm_counter(mm, mm_counter_file(folio));
 		}
 discard:
-		if (unlikely(folio_test_hugetlb(folio)))
+		if (unlikely(folio_test_hugetlb(folio))) {
 			hugetlb_remove_rmap(folio);
-		else
-			folio_remove_rmap_pte(folio, subpage, vma);
+		} else {
+			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+			folio_ref_sub(folio, nr_pages - 1);
+		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
+		/* We have already batched the entire folio */
+		if (nr_pages > 1)
+			goto walk_done;
 		continue;
 walk_abort:
 		ret = false;