diff mbox series

[v1,07/11] mm/huge_memory: convert split_huge_pages_pid() from follow_page() to folio_walk

Message ID 20240802155524.517137-8-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series mm: replace follow_page() by folio_walk | expand

Commit Message

David Hildenbrand Aug. 2, 2024, 3:55 p.m. UTC
Let's remove yet another follow_page() user. Note that we have to do the
split without holding the PTL, after folio_walk_end(). We don't care
about losing the secretmem check in follow_page().

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/huge_memory.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

Comments

Ryan Roberts Aug. 6, 2024, 9:46 a.m. UTC | #1
On 02/08/2024 16:55, David Hildenbrand wrote:
> Let's remove yet another follow_page() user. Note that we have to do the
> split without holding the PTL, after folio_walk_end(). We don't care
> about losing the secretmem check in follow_page().

Hi David,

Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).

Newly failing test:

# # ------------------------------
# # running ./split_huge_page_test
# # ------------------------------
# # TAP version 13
# # 1..12
# # Bail out! Still AnonHugePages not split
# # # Planned tests != run tests (12 != 0)
# # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
# # [FAIL]
# not ok 52 split_huge_page_test # exit=1

It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?

bisect log:

# bad: [1e391b34f6aa043c7afa40a2103163a0ef06d179] Add linux-next specific files for 20240806
git bisect bad 1e391b34f6aa043c7afa40a2103163a0ef06d179
# good: [de9c2c66ad8e787abec7c9d7eff4f8c3cdd28aed] Linux 6.11-rc2
git bisect good de9c2c66ad8e787abec7c9d7eff4f8c3cdd28aed
# bad: [01c2d56f2c52e8af01dfd91af1fe9affc76c4c9e] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
git bisect bad 01c2d56f2c52e8af01dfd91af1fe9affc76c4c9e
# bad: [01c2d56f2c52e8af01dfd91af1fe9affc76c4c9e] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
git bisect bad 01c2d56f2c52e8af01dfd91af1fe9affc76c4c9e
# bad: [3610638e967f32f02c56c7cc8f7d6a815972f8c2] Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git
git bisect bad 3610638e967f32f02c56c7cc8f7d6a815972f8c2
# bad: [3610638e967f32f02c56c7cc8f7d6a815972f8c2] Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git
git bisect bad 3610638e967f32f02c56c7cc8f7d6a815972f8c2
# bad: [d35ef6c9d106eedff36908c21699e1b7f3e55584] Merge branch 'clang-format' of https://github.com/ojeda/linux.git
git bisect bad d35ef6c9d106eedff36908c21699e1b7f3e55584
# good: [e1a15959d75c9ba4b45e07e37bcf843c85750010] Merge branch 'for-linux-next-fixes' of https://gitlab.freedesktop.org/drm/misc/kernel.git
git bisect good e1a15959d75c9ba4b45e07e37bcf843c85750010
# good: [6d66cb9bdeceb769ce62591f56580ebe80f6267a] mm: swap: add a adaptive full cluster cache reclaim
git bisect good 6d66cb9bdeceb769ce62591f56580ebe80f6267a
# bad: [2b820b576dfc4aa9b65f18b68f468cb5b38ece84] mm: optimization on page allocation when CMA enabled
git bisect bad 2b820b576dfc4aa9b65f18b68f468cb5b38ece84
# bad: [ab70279848c8623027791799492a3f6e7c38a9b2] MIPS: sgi-ip27: drop HAVE_ARCH_NODEDATA_EXTENSION
git bisect bad ab70279848c8623027791799492a3f6e7c38a9b2
# bad: [539bc09ff00b29eb60f3dc8ed2d82ad2050a582d] mm/huge_memory: convert split_huge_pages_pid() from follow_page() to folio_walk
git bisect bad 539bc09ff00b29eb60f3dc8ed2d82ad2050a582d
# good: [1a37544d0e35340ce740d377d7d6c746a84e2aae] include/linux/mmzone.h: clean up watermark accessors
git bisect good 1a37544d0e35340ce740d377d7d6c746a84e2aae
# good: [22adafb60d6e1a607a3d99da90927ddd7df928ad] mm/migrate: convert do_pages_stat_array() from follow_page() to folio_walk
git bisect good 22adafb60d6e1a607a3d99da90927ddd7df928ad
# good: [57e1ccf54dba4dda6d6f0264b76e2b86eec3d401] mm/ksm: convert get_mergeable_page() from follow_page() to folio_walk
git bisect good 57e1ccf54dba4dda6d6f0264b76e2b86eec3d401
# good: [285aa1a963f310530351b0e4a2e64bc4b806e518] mm/ksm: convert scan_get_next_rmap_item() from follow_page() to folio_walk
git bisect good 285aa1a963f310530351b0e4a2e64bc4b806e518
# first bad commit: [539bc09ff00b29eb60f3dc8ed2d82ad2050a582d] mm/huge_memory: convert split_huge_pages_pid() from follow_page() to folio_walk

Thanks,
Ryan


> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  mm/huge_memory.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0167dc27e365..697fcf89f975 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -40,6 +40,7 @@
>  #include <linux/memory-tiers.h>
>  #include <linux/compat.h>
>  #include <linux/pgalloc_tag.h>
> +#include <linux/pagewalk.h>
>  
>  #include <asm/tlb.h>
>  #include <asm/pgalloc.h>
> @@ -3507,7 +3508,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  	 */
>  	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>  		struct vm_area_struct *vma = vma_lookup(mm, addr);
> -		struct page *page;
> +		struct folio_walk fw;
>  		struct folio *folio;
>  
>  		if (!vma)
> @@ -3519,13 +3520,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  			continue;
>  		}
>  
> -		/* FOLL_DUMP to ignore special (like zero) pages */
> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> -
> -		if (IS_ERR_OR_NULL(page))
> +		folio = folio_walk_start(&fw, vma, addr, 0);
> +		if (!folio)
>  			continue;
>  
> -		folio = page_folio(page);
>  		if (!is_transparent_hugepage(folio))
>  			goto next;
>  
> @@ -3544,13 +3542,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  
>  		if (!folio_trylock(folio))
>  			goto next;
> +		folio_get(folio);
> +		folio_walk_end(&fw, vma);
>  
>  		if (!split_folio_to_order(folio, new_order))
>  			split++;
>  
>  		folio_unlock(folio);
> -next:
>  		folio_put(folio);
> +
> +		cond_resched();
> +		continue;
> +next:
> +		folio_walk_end(&fw, vma);
>  		cond_resched();
>  	}
>  	mmap_read_unlock(mm);
David Hildenbrand Aug. 6, 2024, 9:56 a.m. UTC | #2
On 06.08.24 11:46, Ryan Roberts wrote:
> On 02/08/2024 16:55, David Hildenbrand wrote:
>> Let's remove yet another follow_page() user. Note that we have to do the
>> split without holding the PTL, after folio_walk_end(). We don't care
>> about losing the secretmem check in follow_page().
> 
> Hi David,
> 
> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
> 
> Newly failing test:
> 
> # # ------------------------------
> # # running ./split_huge_page_test
> # # ------------------------------
> # # TAP version 13
> # # 1..12
> # # Bail out! Still AnonHugePages not split
> # # # Planned tests != run tests (12 != 0)
> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
> # # [FAIL]
> # not ok 52 split_huge_page_test # exit=1
> 
> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?

Nothing jumps at me as well. Let me fire up the debugger :)
David Hildenbrand Aug. 6, 2024, 10:03 a.m. UTC | #3
On 06.08.24 11:56, David Hildenbrand wrote:
> On 06.08.24 11:46, Ryan Roberts wrote:
>> On 02/08/2024 16:55, David Hildenbrand wrote:
>>> Let's remove yet another follow_page() user. Note that we have to do the
>>> split without holding the PTL, after folio_walk_end(). We don't care
>>> about losing the secretmem check in follow_page().
>>
>> Hi David,
>>
>> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
>>
>> Newly failing test:
>>
>> # # ------------------------------
>> # # running ./split_huge_page_test
>> # # ------------------------------
>> # # TAP version 13
>> # # 1..12
>> # # Bail out! Still AnonHugePages not split
>> # # # Planned tests != run tests (12 != 0)
>> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
>> # # [FAIL]
>> # not ok 52 split_huge_page_test # exit=1
>>
>> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?
> 
> Nothing jumps at me as well. Let me fire up the debugger :)

Ah, very likely the can_split_folio() check expects a raised refcount 
already.
David Hildenbrand Aug. 6, 2024, 10:24 a.m. UTC | #4
On 06.08.24 12:03, David Hildenbrand wrote:
> On 06.08.24 11:56, David Hildenbrand wrote:
>> On 06.08.24 11:46, Ryan Roberts wrote:
>>> On 02/08/2024 16:55, David Hildenbrand wrote:
>>>> Let's remove yet another follow_page() user. Note that we have to do the
>>>> split without holding the PTL, after folio_walk_end(). We don't care
>>>> about losing the secretmem check in follow_page().
>>>
>>> Hi David,
>>>
>>> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
>>>
>>> Newly failing test:
>>>
>>> # # ------------------------------
>>> # # running ./split_huge_page_test
>>> # # ------------------------------
>>> # # TAP version 13
>>> # # 1..12
>>> # # Bail out! Still AnonHugePages not split
>>> # # # Planned tests != run tests (12 != 0)
>>> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
>>> # # [FAIL]
>>> # not ok 52 split_huge_page_test # exit=1
>>>
>>> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?
>>
>> Nothing jumps at me as well. Let me fire up the debugger :)
> 
> Ah, very likely the can_split_folio() check expects a raised refcount
> already.

Indeed, the following does the trick! Thanks Ryan, I could have sworn
I ran that selftest as well.

TAP version 13
1..12
ok 1 Split huge pages successful
ok 2 Split PTE-mapped huge pages successful
# Please enable pr_debug in split_huge_pages_in_file() for more info.
# Please check dmesg for more information
ok 3 File-backed THP split test done

...


@Andrew, can you squash the following?


 From e5ea585de3e089ea89bf43d8447ff9fc9b371286 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 6 Aug 2024 12:08:17 +0200
Subject: [PATCH] fixup: mm/huge_memory: convert split_huge_pages_pid() from
  follow_page() to folio_walk

We have to teach can_split_folio() that we are not holding an additional
reference.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
  include/linux/huge_mm.h | 4 ++--
  mm/huge_memory.c        | 8 ++++----
  mm/vmscan.c             | 2 +-
  3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..ce44caa40eed 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
  		unsigned long len, unsigned long pgoff, unsigned long flags,
  		vm_flags_t vm_flags);
  
-bool can_split_folio(struct folio *folio, int *pextra_pins);
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
  		unsigned int new_order);
  static inline int split_huge_page(struct page *page)
@@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
  }
  
  static inline bool
-can_split_folio(struct folio *folio, int *pextra_pins)
+can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
  {
  	return false;
  }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 697fcf89f975..c40b0dcc205b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3021,7 +3021,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
  }
  
  /* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int *pextra_pins)
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
  {
  	int extra_pins;
  
@@ -3033,7 +3033,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
  		extra_pins = folio_nr_pages(folio);
  	if (pextra_pins)
  		*pextra_pins = extra_pins;
-	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
+	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins;
  }
  
  /*
@@ -3201,7 +3201,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
  	 * Racy check if we can split the page, before unmap_folio() will
  	 * split PMDs
  	 */
-	if (!can_split_folio(folio, &extra_pins)) {
+	if (!can_split_folio(folio, 1, &extra_pins)) {
  		ret = -EAGAIN;
  		goto out_unlock;
  	}
@@ -3537,7 +3537,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
  		 * can be split or not. So skip the check here.
  		 */
  		if (!folio_test_private(folio) &&
-		    !can_split_folio(folio, NULL))
+		    !can_split_folio(folio, 0, NULL))
  			goto next;
  
  		if (!folio_trylock(folio))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 31d13462571e..a332cb80e928 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1227,7 +1227,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
  					goto keep_locked;
  				if (folio_test_large(folio)) {
  					/* cannot split folio, skip it */
-					if (!can_split_folio(folio, NULL))
+					if (!can_split_folio(folio, 1, NULL))
  						goto activate_locked;
  					/*
  					 * Split partially mapped folios right away.
Ryan Roberts Aug. 6, 2024, 11:17 a.m. UTC | #5
>>>> It's trying to split some pmd-mapped THPs then checking and finding that
>>>> they are not split. The split is requested via
>>>> /sys/kernel/debug/split_huge_pages, which I believe ends up in this function
>>>> you are modifying here. Although I'll admit that looking at the change,
>>>> there is nothing obviously wrong! Any ideas?
>>>
>>> Nothing jumps at me as well. Let me fire up the debugger :)
>>
>> Ah, very likely the can_split_folio() check expects a raised refcount
>> already.
> 
> Indeed, the following does the trick! Thanks Ryan, I could have sworn
> I ran that selftest as well.

Ahha! Thanks for sorting so quickly!
Zi Yan Aug. 6, 2024, 3:36 p.m. UTC | #6
On 6 Aug 2024, at 6:24, David Hildenbrand wrote:

> On 06.08.24 12:03, David Hildenbrand wrote:
>> On 06.08.24 11:56, David Hildenbrand wrote:
>>> On 06.08.24 11:46, Ryan Roberts wrote:
>>>> On 02/08/2024 16:55, David Hildenbrand wrote:
>>>>> Let's remove yet another follow_page() user. Note that we have to do the
>>>>> split without holding the PTL, after folio_walk_end(). We don't care
>>>>> about losing the secretmem check in follow_page().
>>>>
>>>> Hi David,
>>>>
>>>> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
>>>>
>>>> Newly failing test:
>>>>
>>>> # # ------------------------------
>>>> # # running ./split_huge_page_test
>>>> # # ------------------------------
>>>> # # TAP version 13
>>>> # # 1..12
>>>> # # Bail out! Still AnonHugePages not split
>>>> # # # Planned tests != run tests (12 != 0)
>>>> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
>>>> # # [FAIL]
>>>> # not ok 52 split_huge_page_test # exit=1
>>>>
>>>> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?
>>>
>>> Nothing jumps at me as well. Let me fire up the debugger :)
>>
>> Ah, very likely the can_split_folio() check expects a raised refcount
>> already.
>
> Indeed, the following does the trick! Thanks Ryan, I could have sworn
> I ran that selftest as well.
>
> TAP version 13
> 1..12
> ok 1 Split huge pages successful
> ok 2 Split PTE-mapped huge pages successful
> # Please enable pr_debug in split_huge_pages_in_file() for more info.
> # Please check dmesg for more information
> ok 3 File-backed THP split test done
>
> ...
>
>
> @Andrew, can you squash the following?
>
>
> From e5ea585de3e089ea89bf43d8447ff9fc9b371286 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Tue, 6 Aug 2024 12:08:17 +0200
> Subject: [PATCH] fixup: mm/huge_memory: convert split_huge_pages_pid() from
>  follow_page() to folio_walk
>
> We have to teach can_split_folio() that we are not holding an additional
> reference.
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/huge_mm.h | 4 ++--
>  mm/huge_memory.c        | 8 ++++----
>  mm/vmscan.c             | 2 +-
>  3 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..ce44caa40eed 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
>  		unsigned long len, unsigned long pgoff, unsigned long flags,
>  		vm_flags_t vm_flags);
>  -bool can_split_folio(struct folio *folio, int *pextra_pins);
> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>  		unsigned int new_order);
>  static inline int split_huge_page(struct page *page)
> @@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
>  }
>   static inline bool
> -can_split_folio(struct folio *folio, int *pextra_pins)
> +can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>  {
>  	return false;
>  }
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 697fcf89f975..c40b0dcc205b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3021,7 +3021,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  }
>   /* Racy check whether the huge page can be split */
> -bool can_split_folio(struct folio *folio, int *pextra_pins)
> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>  {
>  	int extra_pins;
>  @@ -3033,7 +3033,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
>  		extra_pins = folio_nr_pages(folio);
>  	if (pextra_pins)
>  		*pextra_pins = extra_pins;
> -	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
> +	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins;
>  }
>   /*
> @@ -3201,7 +3201,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>  	 * Racy check if we can split the page, before unmap_folio() will
>  	 * split PMDs
>  	 */
> -	if (!can_split_folio(folio, &extra_pins)) {
> +	if (!can_split_folio(folio, 1, &extra_pins)) {
>  		ret = -EAGAIN;
>  		goto out_unlock;
>  	}
> @@ -3537,7 +3537,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  		 * can be split or not. So skip the check here.
>  		 */
>  		if (!folio_test_private(folio) &&
> -		    !can_split_folio(folio, NULL))
> +		    !can_split_folio(folio, 0, NULL))
>  			goto next;
>   		if (!folio_trylock(folio))

The diff below can skip a folio with private and extra pin(s) early instead
of trying to lock and split it then failing at can_split_folio() inside
split_huge_page_to_list_to_order().

Maybe worth applying on top of yours?


diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a218320a9233..ce992d54f1da 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3532,13 +3532,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                        goto next;

                total++;
-               /*
-                * For folios with private, split_huge_page_to_list_to_order()
-                * will try to drop it before split and then check if the folio
-                * can be split or not. So skip the check here.
-                */
-               if (!folio_test_private(folio) &&
-                   !can_split_folio(folio, 0, NULL))
+
+               if (!can_split_folio(folio,
+                                    folio_test_private(folio) ? 1 : 0,
+                                    NULL))
                        goto next;

                if (!folio_trylock(folio))

Best Regards,
Yan, Zi
David Hildenbrand Aug. 7, 2024, 9:57 a.m. UTC | #7
On 06.08.24 17:36, Zi Yan wrote:
> On 6 Aug 2024, at 6:24, David Hildenbrand wrote:
> 
>> On 06.08.24 12:03, David Hildenbrand wrote:
>>> On 06.08.24 11:56, David Hildenbrand wrote:
>>>> On 06.08.24 11:46, Ryan Roberts wrote:
>>>>> On 02/08/2024 16:55, David Hildenbrand wrote:
>>>>>> Let's remove yet another follow_page() user. Note that we have to do the
>>>>>> split without holding the PTL, after folio_walk_end(). We don't care
>>>>>> about losing the secretmem check in follow_page().
>>>>>
>>>>> Hi David,
>>>>>
>>>>> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
>>>>>
>>>>> Newly failing test:
>>>>>
>>>>> # # ------------------------------
>>>>> # # running ./split_huge_page_test
>>>>> # # ------------------------------
>>>>> # # TAP version 13
>>>>> # # 1..12
>>>>> # # Bail out! Still AnonHugePages not split
>>>>> # # # Planned tests != run tests (12 != 0)
>>>>> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
>>>>> # # [FAIL]
>>>>> # not ok 52 split_huge_page_test # exit=1
>>>>>
>>>>> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?
>>>>
>>>> Nothing jumps at me as well. Let me fire up the debugger :)
>>>
>>> Ah, very likely the can_split_folio() check expects a raised refcount
>>> already.
>>
>> Indeed, the following does the trick! Thanks Ryan, I could have sworn
>> I ran that selftest as well.
>>
>> TAP version 13
>> 1..12
>> ok 1 Split huge pages successful
>> ok 2 Split PTE-mapped huge pages successful
>> # Please enable pr_debug in split_huge_pages_in_file() for more info.
>> # Please check dmesg for more information
>> ok 3 File-backed THP split test done
>>
>> ...
>>
>>
>> @Andrew, can you squash the following?
>>
>>
>>  From e5ea585de3e089ea89bf43d8447ff9fc9b371286 Mon Sep 17 00:00:00 2001
>> From: David Hildenbrand <david@redhat.com>
>> Date: Tue, 6 Aug 2024 12:08:17 +0200
>> Subject: [PATCH] fixup: mm/huge_memory: convert split_huge_pages_pid() from
>>   follow_page() to folio_walk
>>
>> We have to teach can_split_folio() that we are not holding an additional
>> reference.
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>   include/linux/huge_mm.h | 4 ++--
>>   mm/huge_memory.c        | 8 ++++----
>>   mm/vmscan.c             | 2 +-
>>   3 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..ce44caa40eed 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
>>   		unsigned long len, unsigned long pgoff, unsigned long flags,
>>   		vm_flags_t vm_flags);
>>   -bool can_split_folio(struct folio *folio, int *pextra_pins);
>> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>   int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>   		unsigned int new_order);
>>   static inline int split_huge_page(struct page *page)
>> @@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
>>   }
>>    static inline bool
>> -can_split_folio(struct folio *folio, int *pextra_pins)
>> +can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>   {
>>   	return false;
>>   }
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 697fcf89f975..c40b0dcc205b 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3021,7 +3021,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>   }
>>    /* Racy check whether the huge page can be split */
>> -bool can_split_folio(struct folio *folio, int *pextra_pins)
>> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>   {
>>   	int extra_pins;
>>   @@ -3033,7 +3033,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
>>   		extra_pins = folio_nr_pages(folio);
>>   	if (pextra_pins)
>>   		*pextra_pins = extra_pins;
>> -	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
>> +	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins;
>>   }
>>    /*
>> @@ -3201,7 +3201,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>   	 * Racy check if we can split the page, before unmap_folio() will
>>   	 * split PMDs
>>   	 */
>> -	if (!can_split_folio(folio, &extra_pins)) {
>> +	if (!can_split_folio(folio, 1, &extra_pins)) {
>>   		ret = -EAGAIN;
>>   		goto out_unlock;
>>   	}
>> @@ -3537,7 +3537,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>   		 * can be split or not. So skip the check here.
>>   		 */
>>   		if (!folio_test_private(folio) &&
>> -		    !can_split_folio(folio, NULL))
>> +		    !can_split_folio(folio, 0, NULL))
>>   			goto next;
>>    		if (!folio_trylock(folio))
> 
> The diff below can skip a folio with private and extra pin(s) early instead
> of trying to lock and split it then failing at can_split_folio() inside
> split_huge_page_to_list_to_order().
> 
> Maybe worth applying on top of yours?
> 
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index a218320a9233..ce992d54f1da 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3532,13 +3532,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>                          goto next;
> 
>                  total++;
> -               /*
> -                * For folios with private, split_huge_page_to_list_to_order()
> -                * will try to drop it before split and then check if the folio
> -                * can be split or not. So skip the check here.
> -                */
> -               if (!folio_test_private(folio) &&
> -                   !can_split_folio(folio, 0, NULL))
> +
> +               if (!can_split_folio(folio,
> +                                    folio_test_private(folio) ? 1 : 0,
> +                                    NULL))

Hmm, it does look a bit odd. It's not something from the caller (caller_pins), but a
folio property. Likely should be handled differently.

In vmscan code, we only call can_split_folio() on anon folios where
folio_test_private() does not apply.

But indeed, in split_huge_page_to_list_to_order() we'd have to fail if
folio_test_private() still applies after

Not sure if that is really better:


diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c40b0dcc205b..7cb743047566 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3026,11 +3026,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
         int extra_pins;
  
         /* Additional pins from page cache */
-       if (folio_test_anon(folio))
+       if (folio_test_anon(folio)) {
                 extra_pins = folio_test_swapcache(folio) ?
                                 folio_nr_pages(folio) : 0;
-       else
+       } else {
                 extra_pins = folio_nr_pages(folio);
+               if (unlikely(folio_test_private(folio)))
+                       extra_pins++;
+       }
         if (pextra_pins)
                 *pextra_pins = extra_pins;
         return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins;
@@ -3199,9 +3202,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
  
         /*
          * Racy check if we can split the page, before unmap_folio() will
-        * split PMDs
+        * split PMDs. filemap_release_folio() will try to free buffer; if that
+        * fails, filemap_release_folio() fails.
          */
-       if (!can_split_folio(folio, 1, &extra_pins)) {
+       if (WARN_ON_ONCE(folio_test_private(folio)) ||
+           !can_split_folio(folio, 1, &extra_pins)) {
                 ret = -EAGAIN;
                 goto out_unlock;
         }
@@ -3531,13 +3536,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                         goto next;
  
                 total++;
-               /*
-                * For folios with private, split_huge_page_to_list_to_order()
-                * will try to drop it before split and then check if the folio
-                * can be split or not. So skip the check here.
-                */
-               if (!folio_test_private(folio) &&
-                   !can_split_folio(folio, 0, NULL))
+               if (!can_split_folio(folio, 0, NULL))
                         goto next;
  
                 if (!folio_trylock(folio))


It assumes that folio_set_private() is impossible after filemap_release_folio() succeeded
and we're still holding the folio lock. Then we could even get rid of the WARN_ON_ONCE().
Zi Yan Aug. 7, 2024, 2:45 p.m. UTC | #8
On 7 Aug 2024, at 5:57, David Hildenbrand wrote:

> On 06.08.24 17:36, Zi Yan wrote:
>> On 6 Aug 2024, at 6:24, David Hildenbrand wrote:
>>
>>> On 06.08.24 12:03, David Hildenbrand wrote:
>>>> On 06.08.24 11:56, David Hildenbrand wrote:
>>>>> On 06.08.24 11:46, Ryan Roberts wrote:
>>>>>> On 02/08/2024 16:55, David Hildenbrand wrote:
>>>>>>> Let's remove yet another follow_page() user. Note that we have to do the
>>>>>>> split without holding the PTL, after folio_walk_end(). We don't care
>>>>>>> about losing the secretmem check in follow_page().
>>>>>>
>>>>>> Hi David,
>>>>>>
>>>>>> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
>>>>>>
>>>>>> Newly failing test:
>>>>>>
>>>>>> # # ------------------------------
>>>>>> # # running ./split_huge_page_test
>>>>>> # # ------------------------------
>>>>>> # # TAP version 13
>>>>>> # # 1..12
>>>>>> # # Bail out! Still AnonHugePages not split
>>>>>> # # # Planned tests != run tests (12 != 0)
>>>>>> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
>>>>>> # # [FAIL]
>>>>>> # not ok 52 split_huge_page_test # exit=1
>>>>>>
>>>>>> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?
>>>>>
>>>>> Nothing jumps at me as well. Let me fire up the debugger :)
>>>>
>>>> Ah, very likely the can_split_folio() check expects a raised refcount
>>>> already.
>>>
>>> Indeed, the following does the trick! Thanks Ryan, I could have sworn
>>> I ran that selftest as well.
>>>
>>> TAP version 13
>>> 1..12
>>> ok 1 Split huge pages successful
>>> ok 2 Split PTE-mapped huge pages successful
>>> # Please enable pr_debug in split_huge_pages_in_file() for more info.
>>> # Please check dmesg for more information
>>> ok 3 File-backed THP split test done
>>>
>>> ...
>>>
>>>
>>> @Andrew, can you squash the following?
>>>
>>>
>>>  From e5ea585de3e089ea89bf43d8447ff9fc9b371286 Mon Sep 17 00:00:00 2001
>>> From: David Hildenbrand <david@redhat.com>
>>> Date: Tue, 6 Aug 2024 12:08:17 +0200
>>> Subject: [PATCH] fixup: mm/huge_memory: convert split_huge_pages_pid() from
>>>   follow_page() to folio_walk
>>>
>>> We have to teach can_split_folio() that we are not holding an additional
>>> reference.
>>>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> ---
>>>   include/linux/huge_mm.h | 4 ++--
>>>   mm/huge_memory.c        | 8 ++++----
>>>   mm/vmscan.c             | 2 +-
>>>   3 files changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..ce44caa40eed 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
>>>   		unsigned long len, unsigned long pgoff, unsigned long flags,
>>>   		vm_flags_t vm_flags);
>>>   -bool can_split_folio(struct folio *folio, int *pextra_pins);
>>> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>>   int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>   		unsigned int new_order);
>>>   static inline int split_huge_page(struct page *page)
>>> @@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
>>>   }
>>>    static inline bool
>>> -can_split_folio(struct folio *folio, int *pextra_pins)
>>> +can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>>   {
>>>   	return false;
>>>   }
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 697fcf89f975..c40b0dcc205b 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3021,7 +3021,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>>   }
>>>    /* Racy check whether the huge page can be split */
>>> -bool can_split_folio(struct folio *folio, int *pextra_pins)
>>> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>>   {
>>>   	int extra_pins;
>>>   @@ -3033,7 +3033,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
>>>   		extra_pins = folio_nr_pages(folio);
>>>   	if (pextra_pins)
>>>   		*pextra_pins = extra_pins;
>>> -	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
>>> +	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins;
>>>   }
>>>    /*
>>> @@ -3201,7 +3201,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>   	 * Racy check if we can split the page, before unmap_folio() will
>>>   	 * split PMDs
>>>   	 */
>>> -	if (!can_split_folio(folio, &extra_pins)) {
>>> +	if (!can_split_folio(folio, 1, &extra_pins)) {
>>>   		ret = -EAGAIN;
>>>   		goto out_unlock;
>>>   	}
>>> @@ -3537,7 +3537,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>>   		 * can be split or not. So skip the check here.
>>>   		 */
>>>   		if (!folio_test_private(folio) &&
>>> -		    !can_split_folio(folio, NULL))
>>> +		    !can_split_folio(folio, 0, NULL))
>>>   			goto next;
>>>    		if (!folio_trylock(folio))
>>
>> The diff below can skip a folio with private and extra pin(s) early instead
>> of trying to lock and split it then failing at can_split_folio() inside
>> split_huge_page_to_list_to_order().
>>
>> Maybe worth applying on top of yours?
>>
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index a218320a9233..ce992d54f1da 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3532,13 +3532,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>                          goto next;
>>
>>                  total++;
>> -               /*
>> -                * For folios with private, split_huge_page_to_list_to_order()
>> -                * will try to drop it before split and then check if the folio
>> -                * can be split or not. So skip the check here.
>> -                */
>> -               if (!folio_test_private(folio) &&
>> -                   !can_split_folio(folio, 0, NULL))
>> +
>> +               if (!can_split_folio(folio,
>> +                                    folio_test_private(folio) ? 1 : 0,
>> +                                    NULL))
>
> Hmm, it does look a bit odd. It's not something from the caller (caller_pins), but a
> folio property. Likely should be handled differently.
>
> In vmscan code, we only call can_split_folio() on anon folios where
> folio_test_private() does not apply.
>
> But indeed, in split_huge_page_to_list_to_order() we'd have to fail if
> folio_test_private() still applies after
>
> Not sure if that is really better:

Yeah, not worth the code churn to optimize for that debugfs code.

As I looked at this patch and the fix long enough, feel free to add
Reviewed-by: Zi Yan <ziy@nvidia.com>


Best Regards,
Yan, Zi
David Hildenbrand Aug. 7, 2024, 2:52 p.m. UTC | #9
On 07.08.24 16:45, Zi Yan wrote:
> On 7 Aug 2024, at 5:57, David Hildenbrand wrote:
> 
>> On 06.08.24 17:36, Zi Yan wrote:
>>> On 6 Aug 2024, at 6:24, David Hildenbrand wrote:
>>>
>>>> On 06.08.24 12:03, David Hildenbrand wrote:
>>>>> On 06.08.24 11:56, David Hildenbrand wrote:
>>>>>> On 06.08.24 11:46, Ryan Roberts wrote:
>>>>>>> On 02/08/2024 16:55, David Hildenbrand wrote:
>>>>>>>> Let's remove yet another follow_page() user. Note that we have to do the
>>>>>>>> split without holding the PTL, after folio_walk_end(). We don't care
>>>>>>>> about losing the secretmem check in follow_page().
>>>>>>>
>>>>>>> Hi David,
>>>>>>>
>>>>>>> Our (arm64) CI is showing a regression in split_huge_page_test from mm selftests from next-20240805 onwards. Navigating around a couple of other lurking bugs, I was able to bisect to this change (which smells about right).
>>>>>>>
>>>>>>> Newly failing test:
>>>>>>>
>>>>>>> # # ------------------------------
>>>>>>> # # running ./split_huge_page_test
>>>>>>> # # ------------------------------
>>>>>>> # # TAP version 13
>>>>>>> # # 1..12
>>>>>>> # # Bail out! Still AnonHugePages not split
>>>>>>> # # # Planned tests != run tests (12 != 0)
>>>>>>> # # # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0
>>>>>>> # # [FAIL]
>>>>>>> # not ok 52 split_huge_page_test # exit=1
>>>>>>>
>>>>>>> It's trying to split some pmd-mapped THPs then checking and finding that they are not split. The split is requested via /sys/kernel/debug/split_huge_pages, which I believe ends up in this function you are modifying here. Although I'll admit that looking at the change, there is nothing obviously wrong! Any ideas?
>>>>>>
>>>>>> Nothing jumps at me as well. Let me fire up the debugger :)
>>>>>
>>>>> Ah, very likely the can_split_folio() check expects a raised refcount
>>>>> already.
>>>>
>>>> Indeed, the following does the trick! Thanks Ryan, I could have sworn
>>>> I ran that selftest as well.
>>>>
>>>> TAP version 13
>>>> 1..12
>>>> ok 1 Split huge pages successful
>>>> ok 2 Split PTE-mapped huge pages successful
>>>> # Please enable pr_debug in split_huge_pages_in_file() for more info.
>>>> # Please check dmesg for more information
>>>> ok 3 File-backed THP split test done
>>>>
>>>> ...
>>>>
>>>>
>>>> @Andrew, can you squash the following?
>>>>
>>>>
>>>>   From e5ea585de3e089ea89bf43d8447ff9fc9b371286 Mon Sep 17 00:00:00 2001
>>>> From: David Hildenbrand <david@redhat.com>
>>>> Date: Tue, 6 Aug 2024 12:08:17 +0200
>>>> Subject: [PATCH] fixup: mm/huge_memory: convert split_huge_pages_pid() from
>>>>    follow_page() to folio_walk
>>>>
>>>> We have to teach can_split_folio() that we are not holding an additional
>>>> reference.
>>>>
>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>> ---
>>>>    include/linux/huge_mm.h | 4 ++--
>>>>    mm/huge_memory.c        | 8 ++++----
>>>>    mm/vmscan.c             | 2 +-
>>>>    3 files changed, 7 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index e25d9ebfdf89..ce44caa40eed 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
>>>>    		unsigned long len, unsigned long pgoff, unsigned long flags,
>>>>    		vm_flags_t vm_flags);
>>>>    -bool can_split_folio(struct folio *folio, int *pextra_pins);
>>>> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>>>    int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>    		unsigned int new_order);
>>>>    static inline int split_huge_page(struct page *page)
>>>> @@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
>>>>    }
>>>>     static inline bool
>>>> -can_split_folio(struct folio *folio, int *pextra_pins)
>>>> +can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>>>    {
>>>>    	return false;
>>>>    }
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 697fcf89f975..c40b0dcc205b 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3021,7 +3021,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>>>    }
>>>>     /* Racy check whether the huge page can be split */
>>>> -bool can_split_folio(struct folio *folio, int *pextra_pins)
>>>> +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>>>    {
>>>>    	int extra_pins;
>>>>    @@ -3033,7 +3033,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
>>>>    		extra_pins = folio_nr_pages(folio);
>>>>    	if (pextra_pins)
>>>>    		*pextra_pins = extra_pins;
>>>> -	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
>>>> +	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins;
>>>>    }
>>>>     /*
>>>> @@ -3201,7 +3201,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>    	 * Racy check if we can split the page, before unmap_folio() will
>>>>    	 * split PMDs
>>>>    	 */
>>>> -	if (!can_split_folio(folio, &extra_pins)) {
>>>> +	if (!can_split_folio(folio, 1, &extra_pins)) {
>>>>    		ret = -EAGAIN;
>>>>    		goto out_unlock;
>>>>    	}
>>>> @@ -3537,7 +3537,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>>>    		 * can be split or not. So skip the check here.
>>>>    		 */
>>>>    		if (!folio_test_private(folio) &&
>>>> -		    !can_split_folio(folio, NULL))
>>>> +		    !can_split_folio(folio, 0, NULL))
>>>>    			goto next;
>>>>     		if (!folio_trylock(folio))
>>>
>>> The diff below can skip a folio with private and extra pin(s) early instead
>>> of trying to lock and split it then failing at can_split_folio() inside
>>> split_huge_page_to_list_to_order().
>>>
>>> Maybe worth applying on top of yours?
>>>
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index a218320a9233..ce992d54f1da 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3532,13 +3532,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>>                           goto next;
>>>
>>>                   total++;
>>> -               /*
>>> -                * For folios with private, split_huge_page_to_list_to_order()
>>> -                * will try to drop it before split and then check if the folio
>>> -                * can be split or not. So skip the check here.
>>> -                */
>>> -               if (!folio_test_private(folio) &&
>>> -                   !can_split_folio(folio, 0, NULL))
>>> +
>>> +               if (!can_split_folio(folio,
>>> +                                    folio_test_private(folio) ? 1 : 0,
>>> +                                    NULL))
>>
>> Hmm, it does look a bit odd. It's not something from the caller (caller_pins), but a
>> folio property. Likely should be handled differently.
>>
>> In vmscan code, we only call can_split_folio() on anon folios where
>> folio_test_private() does not apply.
>>
>> But indeed, in split_huge_page_to_list_to_order() we'd have to fail if
>> folio_test_private() still applies after
>>
>> Not sure if that is really better:
> 
> Yeah, not worth the code churn to optimize for that debugfs code.
> 
> As I looked at this patch and the fix long enough, feel free to add
> Reviewed-by: Zi Yan <ziy@nvidia.com>

Thanks! :)
Pankaj Raghav Aug. 15, 2024, 10:04 a.m. UTC | #10
Hi David,

On Fri, Aug 02, 2024 at 05:55:20PM +0200, David Hildenbrand wrote:
>  			continue;
>  		}
>  
> -		/* FOLL_DUMP to ignore special (like zero) pages */
> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> -
> -		if (IS_ERR_OR_NULL(page))
> +		folio = folio_walk_start(&fw, vma, addr, 0);
> +		if (!folio)
>  			continue;
>  
> -		folio = page_folio(page);
>  		if (!is_transparent_hugepage(folio))
>  			goto next;
>  
> @@ -3544,13 +3542,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>  
>  		if (!folio_trylock(folio))
>  			goto next;
> +		folio_get(folio);

Shouldn't we lock the folio after we increase the refcount on the folio?
i.e we do folio_get() first and then folio_trylock()?

That is how it was done before (through follow_page) and this patch changes
that. Maybe it doesn't matter? To me increasing the refcount and then
locking sounds more logical but I do see this ordering getting mixed all
over the kernel.

> +		folio_walk_end(&fw, vma);
>  
>  		if (!split_folio_to_order(folio, new_order))
>  			split++;
>  
>  		folio_unlock(folio);
> -next:
>  		folio_put(folio);
> +
> +		cond_resched();
> +		continue;
> +next:
> +		folio_walk_end(&fw, vma);
>  		cond_resched();
>  	}
>  	mmap_read_unlock(mm);
> -- 
> 2.45.2
David Hildenbrand Aug. 15, 2024, 10:20 a.m. UTC | #11
On 15.08.24 12:04, Pankaj Raghav wrote:
> Hi David,
> 
> On Fri, Aug 02, 2024 at 05:55:20PM +0200, David Hildenbrand wrote:
>>   			continue;
>>   		}
>>   
>> -		/* FOLL_DUMP to ignore special (like zero) pages */
>> -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> -
>> -		if (IS_ERR_OR_NULL(page))
>> +		folio = folio_walk_start(&fw, vma, addr, 0);
>> +		if (!folio)
>>   			continue;
>>   
>> -		folio = page_folio(page);
>>   		if (!is_transparent_hugepage(folio))
>>   			goto next;
>>   
>> @@ -3544,13 +3542,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>>   
>>   		if (!folio_trylock(folio))
>>   			goto next;
>> +		folio_get(folio);
> 
> Shouldn't we lock the folio after we increase the refcount on the folio?
> i.e we do folio_get() first and then folio_trylock()?
> 
> That is how it was done before (through follow_page) and this patch changes
> that. Maybe it doesn't matter? To me increasing the refcount and then
> locking sounds more logical but I do see this ordering getting mixed all
> over the kernel.

There is no need to grab a folio reference if we hold an implicit 
reference through the mapping that cannot go away (not that we hold the 
page table lock). Locking the folio is not special in that regard: we 
just have to make sure that the folio cannot get freed concurrently, 
which is the case here.

So here, we really only grab a reference if we have to -- when we are 
about to drop the page table lock and will continue using the folio 
afterwards.
Pankaj Raghav (Samsung) Aug. 15, 2024, 1:43 p.m. UTC | #12
On Thu, Aug 15, 2024 at 12:20:04PM +0200, David Hildenbrand wrote:
> On 15.08.24 12:04, Pankaj Raghav wrote:
> > Hi David,
> > 
> > On Fri, Aug 02, 2024 at 05:55:20PM +0200, David Hildenbrand wrote:
> > >   			continue;
> > >   		}
> > > -		/* FOLL_DUMP to ignore special (like zero) pages */
> > > -		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> > > -
> > > -		if (IS_ERR_OR_NULL(page))
> > > +		folio = folio_walk_start(&fw, vma, addr, 0);
> > > +		if (!folio)
> > >   			continue;
> > > -		folio = page_folio(page);
> > >   		if (!is_transparent_hugepage(folio))
> > >   			goto next;
> > > @@ -3544,13 +3542,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> > >   		if (!folio_trylock(folio))
> > >   			goto next;
> > > +		folio_get(folio);
> > 
> > Shouldn't we lock the folio after we increase the refcount on the folio?
> > i.e we do folio_get() first and then folio_trylock()?
> > 
> > That is how it was done before (through follow_page) and this patch changes
> > that. Maybe it doesn't matter? To me increasing the refcount and then
> > locking sounds more logical but I do see this ordering getting mixed all
> > over the kernel.
> 
> There is no need to grab a folio reference if we hold an implicit reference
> through the mapping that cannot go away (not that we hold the page table
> lock). Locking the folio is not special in that regard: we just have to make
> sure that the folio cannot get freed concurrently, which is the case here.
> 
> So here, we really only grab a reference if we have to -- when we are about
> to drop the page table lock and will continue using the folio afterwards.
Got it. Thanks!
> 
> -- 
> Cheers,
> 
> David / dhildenb
>
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0167dc27e365..697fcf89f975 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -40,6 +40,7 @@ 
 #include <linux/memory-tiers.h>
 #include <linux/compat.h>
 #include <linux/pgalloc_tag.h>
+#include <linux/pagewalk.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -3507,7 +3508,7 @@  static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 	 */
 	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
 		struct vm_area_struct *vma = vma_lookup(mm, addr);
-		struct page *page;
+		struct folio_walk fw;
 		struct folio *folio;
 
 		if (!vma)
@@ -3519,13 +3520,10 @@  static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 			continue;
 		}
 
-		/* FOLL_DUMP to ignore special (like zero) pages */
-		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
-		if (IS_ERR_OR_NULL(page))
+		folio = folio_walk_start(&fw, vma, addr, 0);
+		if (!folio)
 			continue;
 
-		folio = page_folio(page);
 		if (!is_transparent_hugepage(folio))
 			goto next;
 
@@ -3544,13 +3542,19 @@  static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 
 		if (!folio_trylock(folio))
 			goto next;
+		folio_get(folio);
+		folio_walk_end(&fw, vma);
 
 		if (!split_folio_to_order(folio, new_order))
 			split++;
 
 		folio_unlock(folio);
-next:
 		folio_put(folio);
+
+		cond_resched();
+		continue;
+next:
+		folio_walk_end(&fw, vma);
 		cond_resched();
 	}
 	mmap_read_unlock(mm);