
[hotfix,1/2] mm/thp: fix deferred split queue not partially_mapped

Message ID 760237a3-69d6-9197-432d-0306d52c048a@google.com (mailing list archive)
State New
Series [hotfix,1/2] mm/thp: fix deferred split queue not partially_mapped

Commit Message

Hugh Dickins Oct. 24, 2024, 4:10 a.m. UTC
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without).  The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.

The new unlocked list_del_init() in deferred_split_scan() is buggy.
I gave bad advice, it looks plausible since that's a local on-stack
list, but the fact is that it can race with a third party freeing or
migrating the preceding folio (properly unqueueing it with refcount 0
while holding split_queue_lock), thereby corrupting the list linkage.
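
To illustrate, one bad interleaving (there are others), where "prev" is a
folio left earlier on the local list, already processed and folio_put()
by this scan:

	CPU A: deferred_split_scan(), now at the folio right after prev
	CPU B: third party freeing prev, whose refcount has reached 0

	CPU B:	spin_lock(&ds_queue->split_queue_lock)
	CPU B:	list_del_init(&prev->_deferred_list)
			/* rewrites prev's and its neighbours' link pointers */
	CPU A:	list_del_init(&folio->_deferred_list)
			/* unlocked: rewrites prev->next = folio->next */
	CPU B:	spin_unlock(&ds_queue->split_queue_lock)

Both unlinks modify the same link pointers, CPU A's without the lock,
so the list linkage ends up corrupted.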

The obvious answer would be to take split_queue_lock there: but it has
a long history of contention, so I'm reluctant to add to that. Instead,
make sure that there is always one safe (raised refcount) folio before,
by delaying its folio_put().  (And of course I was wrong to suggest
updating split_queue_len without the lock: leave that until the splice.)

And remove two over-eager partially_mapped checks, restoring those tests
to how they were before: if uncharge_folio() or free_tail_page_prepare()
finds _deferred_list non-empty, it's in trouble whether or not that folio
is partially_mapped (and the flag was already cleared in the latter case).

Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <hughd@google.com>
---
 mm/huge_memory.c | 21 +++++++++++++++++----
 mm/memcontrol.c  |  3 +--
 mm/page_alloc.c  |  5 ++---
 3 files changed, 20 insertions(+), 9 deletions(-)

Comments

Usama Arif Oct. 24, 2024, 10:20 a.m. UTC | #1
On 24/10/2024 05:10, Hugh Dickins wrote:
> Recent changes are putting more pressure on THP deferred split queues:
> under load revealing long-standing races, causing list_del corruptions,
> "Bad page state"s and worse (I keep BUGs in both of those, so usually
> don't get to see how badly they end up without).  The relevant recent
> changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
> improved swap allocation, and underused THP splitting.
> 
> The new unlocked list_del_init() in deferred_split_scan() is buggy.
> I gave bad advice, it looks plausible since that's a local on-stack
> list, but the fact is that it can race with a third party freeing or
> migrating the preceding folio (properly unqueueing it with refcount 0
> while holding split_queue_lock), thereby corrupting the list linkage.
> 
> The obvious answer would be to take split_queue_lock there: but it has
> a long history of contention, so I'm reluctant to add to that. Instead,
> make sure that there is always one safe (raised refcount) folio before,
> by delaying its folio_put().  (And of course I was wrong to suggest
> updating split_queue_len without the lock: leave that until the splice.)
> 

Thanks for this; I imagine this was quite tricky to debug.

Avoiding taking the split_queue_lock by keeping a reference to the
preceding folio is really nice.

And I should have spotted in the original patch that split_queue_len
shouldn't be changed without holding split_queue_lock.

Acked-by: Usama Arif <usamaarif642@gmail.com>

> And remove two over-eager partially_mapped checks, restoring those tests
> to how they were before: if uncharge_folio() or free_tail_page_prepare()
> finds _deferred_list non-empty, it's in trouble whether or not that folio
> is partially_mapped (and the flag was already cleared in the latter case).
> 
> Fixes: dafff3f4c850 ("mm: split underused THPs")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---
>  mm/huge_memory.c | 21 +++++++++++++++++----
>  mm/memcontrol.c  |  3 +--
>  mm/page_alloc.c  |  5 ++---
>  3 files changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2fb328880b50..a1d345f1680c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3718,8 +3718,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>  	unsigned long flags;
>  	LIST_HEAD(list);
> -	struct folio *folio, *next;
> -	int split = 0;
> +	struct folio *folio, *next, *prev = NULL;
> +	int split = 0, removed = 0;
>  
>  #ifdef CONFIG_MEMCG
>  	if (sc->memcg)
> @@ -3775,15 +3775,28 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		 */
>  		if (!did_split && !folio_test_partially_mapped(folio)) {
>  			list_del_init(&folio->_deferred_list);
> -			ds_queue->split_queue_len--;
> +			removed++;
> +		} else {
> +			/*
> +			 * That unlocked list_del_init() above would be unsafe,
> +			 * unless its folio is separated from any earlier folios
> +			 * left on the list (which may be concurrently unqueued)
> +			 * by one safe folio with refcount still raised.
> +			 */
> +			swap(folio, prev);
>  		}
> -		folio_put(folio);
> +		if (folio)
> +			folio_put(folio);
>  	}
>  
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>  	list_splice_tail(&list, &ds_queue->split_queue);
> +	ds_queue->split_queue_len -= removed;
>  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  
> +	if (prev)
> +		folio_put(prev);
> +
>  	/*
>  	 * Stop shrinker if we didn't split any page, but the queue is empty.
>  	 * This can happen if pages were freed under us.
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7845c64a2c57..2703227cce88 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4631,8 +4631,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
>  	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>  	VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
>  			!folio_test_hugetlb(folio) &&
> -			!list_empty(&folio->_deferred_list) &&
> -			folio_test_partially_mapped(folio), folio);
> +			!list_empty(&folio->_deferred_list), folio);
>  
>  	/*
>  	 * Nobody should be changing or seriously looking at
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8afab64814dc..4b21a368b4e2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -961,9 +961,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
>  		break;
>  	case 2:
>  		/* the second tail page: deferred_list overlaps ->mapping */
> -		if (unlikely(!list_empty(&folio->_deferred_list) &&
> -		    folio_test_partially_mapped(folio))) {
> -			bad_page(page, "partially mapped folio on deferred list");
> +		if (unlikely(!list_empty(&folio->_deferred_list))) {
> +			bad_page(page, "on deferred list");
>  			goto out;
>  		}
>  		break;
David Hildenbrand Oct. 24, 2024, 8:39 p.m. UTC | #2
On 24.10.24 06:10, Hugh Dickins wrote:
> Recent changes are putting more pressure on THP deferred split queues:
> under load revealing long-standing races, causing list_del corruptions,
> "Bad page state"s and worse (I keep BUGs in both of those, so usually
> don't get to see how badly they end up without).  The relevant recent
> changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
> improved swap allocation, and underused THP splitting.
> 
> The new unlocked list_del_init() in deferred_split_scan() is buggy.
> I gave bad advice, it looks plausible since that's a local on-stack
> list, but the fact is that it can race with a third party freeing or
> migrating the preceding folio (properly unqueueing it with refcount 0
> while holding split_queue_lock), thereby corrupting the list linkage.
> 
> The obvious answer would be to take split_queue_lock there: but it has
> a long history of contention, so I'm reluctant to add to that. Instead,
> make sure that there is always one safe (raised refcount) folio before,
> by delaying its folio_put().  (And of course I was wrong to suggest
> updating split_queue_len without the lock: leave that until the splice.)
> 
> And remove two over-eager partially_mapped checks, restoring those tests
> to how they were before: if uncharge_folio() or free_tail_page_prepare()
> finds _deferred_list non-empty, it's in trouble whether or not that folio
> is partially_mapped (and the flag was already cleared in the latter case).
> 
> Fixes: dafff3f4c850 ("mm: split underused THPs")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---

Challenging problem and elegant fix!

Reviewed-by: David Hildenbrand <david@redhat.com>

At some point you have to tell me your secrets: how you are able to
trigger all these hard-to-trigger problems (6.8 stuff and no reports on
the list) :)
Zi Yan Oct. 24, 2024, 10:37 p.m. UTC | #3
On 24 Oct 2024, at 0:10, Hugh Dickins wrote:

> Recent changes are putting more pressure on THP deferred split queues:
> under load revealing long-standing races, causing list_del corruptions,
> "Bad page state"s and worse (I keep BUGs in both of those, so usually
> don't get to see how badly they end up without).  The relevant recent
> changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
> improved swap allocation, and underused THP splitting.
>
> The new unlocked list_del_init() in deferred_split_scan() is buggy.
> I gave bad advice, it looks plausible since that's a local on-stack
> list, but the fact is that it can race with a third party freeing or
> migrating the preceding folio (properly unqueueing it with refcount 0
> while holding split_queue_lock), thereby corrupting the list linkage.
>
> The obvious answer would be to take split_queue_lock there: but it has
> a long history of contention, so I'm reluctant to add to that. Instead,
> make sure that there is always one safe (raised refcount) folio before,
> by delaying its folio_put().  (And of course I was wrong to suggest
> updating split_queue_len without the lock: leave that until the splice.)

I feel like this is not the right approach, since it breaks the existing
convention for changing folio->_deferred_list, namely taking
ds_queue->split_queue_lock for serialization. The contention might not be
as high as you think, since if a folio is split, the split_queue_lock
needs to be taken during the split anyway. So the worst case is the same
as when all folios are split. Do you see significant perf degradation due to
taking the lock when doing list_del_init()?
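
I.e., something like this in deferred_split_scan() (just a sketch of
what I mean, untested):

	if (!did_split && !folio_test_partially_mapped(folio)) {
		spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
		list_del_init(&folio->_deferred_list);
		ds_queue->split_queue_len--;
		spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
	}
	folio_put(folio);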

I am afraid if we take this route, we might hit hard-to-debug bugs
in the future when someone touches the code.

Thanks.

>
> And remove two over-eager partially_mapped checks, restoring those tests
> to how they were before: if uncharge_folio() or free_tail_page_prepare()
> finds _deferred_list non-empty, it's in trouble whether or not that folio
> is partially_mapped (and the flag was already cleared in the latter case).
>
> Fixes: dafff3f4c850 ("mm: split underused THPs")
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---
>  mm/huge_memory.c | 21 +++++++++++++++++----
>  mm/memcontrol.c  |  3 +--
>  mm/page_alloc.c  |  5 ++---
>  3 files changed, 20 insertions(+), 9 deletions(-)

Best Regards,
Yan, Zi
Baolin Wang Oct. 25, 2024, 1:56 a.m. UTC | #4
On 2024/10/24 12:10, Hugh Dickins wrote:
> Recent changes are putting more pressure on THP deferred split queues:
> under load revealing long-standing races, causing list_del corruptions,
> "Bad page state"s and worse (I keep BUGs in both of those, so usually
> don't get to see how badly they end up without).  The relevant recent
> changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
> improved swap allocation, and underused THP splitting.
> 
> The new unlocked list_del_init() in deferred_split_scan() is buggy.
> I gave bad advice, it looks plausible since that's a local on-stack
> list, but the fact is that it can race with a third party freeing or
> migrating the preceding folio (properly unqueueing it with refcount 0
> while holding split_queue_lock), thereby corrupting the list linkage.
> 
> The obvious answer would be to take split_queue_lock there: but it has
> a long history of contention, so I'm reluctant to add to that. Instead,
> make sure that there is always one safe (raised refcount) folio before,
> by delaying its folio_put().  (And of course I was wrong to suggest
> updating split_queue_len without the lock: leave that until the splice.)
> 
> And remove two over-eager partially_mapped checks, restoring those tests
> to how they were before: if uncharge_folio() or free_tail_page_prepare()
> finds _deferred_list non-empty, it's in trouble whether or not that folio
> is partially_mapped (and the flag was already cleared in the latter case).
> 
> Fixes: dafff3f4c850 ("mm: split underused THPs")
> Signed-off-by: Hugh Dickins <hughd@google.com>

Good catch. LGTM.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Hugh Dickins Oct. 25, 2024, 5:41 a.m. UTC | #5
On Thu, 24 Oct 2024, Zi Yan wrote:
> On 24 Oct 2024, at 0:10, Hugh Dickins wrote:
> 
> > The new unlocked list_del_init() in deferred_split_scan() is buggy.
> > I gave bad advice, it looks plausible since that's a local on-stack
> > list, but the fact is that it can race with a third party freeing or
> > migrating the preceding folio (properly unqueueing it with refcount 0
> > while holding split_queue_lock), thereby corrupting the list linkage.
> >
> > The obvious answer would be to take split_queue_lock there: but it has
> > a long history of contention, so I'm reluctant to add to that. Instead,
> > make sure that there is always one safe (raised refcount) folio before,
> > by delaying its folio_put().  (And of course I was wrong to suggest
> > updating split_queue_len without the lock: leave that until the splice.)
> 
> I feel like this is not the right approach, since it breaks the existing
> convention for changing folio->_deferred_list, namely taking
> ds_queue->split_queue_lock for serialization. The contention might not be
> as high as you think, since if a folio is split, the split_queue_lock
> needs to be taken during the split anyway. So the worst case is the same
> as when all folios are split. Do you see significant perf degradation due to
> taking the lock when doing list_del_init()?
> 
> I am afraid if we take this route, we might hit hard-to-debug bugs
> in the future when someone touches the code.

You have a good point: I am adding another element of trickiness
to that already-tricky local-but-not-quite list - which has tripped
us up a few times in the past.

I do still feel that this solution is right in the spirit of that list;
but I've certainly not done any performance measurement to justify it,
nor would I ever trust my skill to do so.  I just tried to solve the
corruptions in what I thought was the best way.

(To be honest, I found this solution to the corruptions first, and thought
the bug went back to the original implementation: that its put_page() at the
end of the loop was premature all along.  It was only when writing the
commit message two days ago, that I came to realize that even put_page()
or folio_put() would be safely using the lock to unqueue: that it is only
this new list_del_init() which is the exception which introduces the bug.)

Looking at vmstats, I'm coming to believe that the performance advantage
of this way is likely to be in the noise: that mTHPs alone, and the
!partially_mapped case on top, are greatly increasing the split_deferred
stats: and may give rise to renewed complaints of lock contention, with
or without this optimization.

While I still prefer to stick with what's posted and most tested, I am
giving the locked version a run overnight.  Thanks a lot for the reviews
and acks everyone: at present Zi Yan is in the minority preferring a
locked version, but please feel free to change your vote if you wish.

Hugh
Zi Yan Oct. 25, 2024, 3:32 p.m. UTC | #6
On 25 Oct 2024, at 1:41, Hugh Dickins wrote:

> On Thu, 24 Oct 2024, Zi Yan wrote:
>> On 24 Oct 2024, at 0:10, Hugh Dickins wrote:
>>
>>> The new unlocked list_del_init() in deferred_split_scan() is buggy.
>>> I gave bad advice, it looks plausible since that's a local on-stack
>>> list, but the fact is that it can race with a third party freeing or
>>> migrating the preceding folio (properly unqueueing it with refcount 0
>>> while holding split_queue_lock), thereby corrupting the list linkage.
>>>
>>> The obvious answer would be to take split_queue_lock there: but it has
>>> a long history of contention, so I'm reluctant to add to that. Instead,
>>> make sure that there is always one safe (raised refcount) folio before,
>>> by delaying its folio_put().  (And of course I was wrong to suggest
>>> updating split_queue_len without the lock: leave that until the splice.)
>>
>> I feel like this is not the right approach, since it breaks the existing
>> convention for changing folio->_deferred_list, namely taking
>> ds_queue->split_queue_lock for serialization. The contention might not be
>> as high as you think, since if a folio is split, the split_queue_lock
>> needs to be taken during the split anyway. So the worst case is the same
>> as when all folios are split. Do you see significant perf degradation due to
>> taking the lock when doing list_del_init()?
>>
>> I am afraid if we take this route, we might hit hard-to-debug bugs
>> in the future when someone touches the code.
>
> You have a good point: I am adding another element of trickiness
> to that already-tricky local-but-not-quite list - which has tripped
> us up a few times in the past.
>
> I do still feel that this solution is right in the spirit of that list;
> but I've certainly not done any performance measurement to justify it,
> nor would I ever trust my skill to do so.  I just tried to solve the
> corruptions in what I thought was the best way.
>
> (To be honest, I found this solution to the corruptions first, and thought
> the bug went back to the original implementation: that its put_page() at the
> end of the loop was premature all along.  It was only when writing the
> commit message two days ago, that I came to realize that even put_page()
> or folio_put() would be safely using the lock to unqueue: that it is only
> this new list_del_init() which is the exception which introduces the bug.)
>
> Looking at vmstats, I'm coming to believe that the performance advantage
> of this way is likely to be in the noise: that mTHPs alone, and the
> !partially_mapped case on top, are greatly increasing the split_deferred
> stats: and may give rise to renewed complaints of lock contention, with
> or without this optimization.
>
> While I still prefer to stick with what's posted and most tested, I am
> giving the locked version a run overnight.  Thanks a lot for the reviews
> and acks everyone: at present Zi Yan is in the minority preferring a
> locked version, but please feel free to change your vote if you wish.

Thank you a lot for taking the time to check the locked version. Looking
forward to the result. BTW, I am not going to block this patch since it
fixes the bug.

The tricky part in deferred_split_scan() is always the use of
folio->_deferred_list without taking split_queue_lock. I am thinking about
using folio_batch to store the out-of-split_queue folios, so that _deferred_list
will not be touched while we try to split these folios. Basically,

1. loop through split_queue and move folios to a folio_batch until the
   folio_batch is full;
2. loop through the folio_batch to try to split each folio;
3. move the remaining folios back to split_queue.

With this approach, split_queue_lock might be taken more often if there are
more than 31 (the folio_batch max size) folios on split_queue, and split_queue_lock
will be held longer in step 3, since the remaining folios need to be
added back to split_queue one by one instead of with a single list splice.
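
In code, the three steps above might look roughly like this (an untested
sketch; nr_to_scan handling and the stop-shrinker logic are omitted, and
names are as in today's deferred_split_scan()):

	struct folio_batch fbatch;
	struct folio *folio;
	int i;

	folio_batch_init(&fbatch);

	/* 1. take up to 31 folios off split_queue, under the lock */
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	while (!list_empty(&ds_queue->split_queue)) {
		folio = list_first_entry(&ds_queue->split_queue,
					 struct folio, _deferred_list);
		list_del_init(&folio->_deferred_list);
		ds_queue->split_queue_len--;
		if (!folio_try_get(folio))
			continue;	/* raced with freeing: skip it */
		if (!folio_batch_add(&fbatch, folio))
			break;		/* batch is full */
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	/* 2. split attempts now touch no shared list linkage at all */
	for (i = 0; i < folio_batch_count(&fbatch); i++) {
		folio = fbatch.folios[i];
		if (!folio_trylock(folio))
			continue;
		split_folio(folio);
		folio_unlock(folio);
	}

	/* 3. requeue whatever is still unsplit, one folio at a time */
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	for (i = 0; i < folio_batch_count(&fbatch); i++) {
		folio = fbatch.folios[i];
		if (folio_test_large(folio) &&
		    folio_test_partially_mapped(folio) &&
		    list_empty(&folio->_deferred_list)) {
			/* list_empty: not already requeued by someone else */
			list_add_tail(&folio->_deferred_list,
				      &ds_queue->split_queue);
			ds_queue->split_queue_len++;
		}
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	folios_put(&fbatch);	/* drop the references taken in step 1 */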

Let me know your thoughts. I can look into this if this approach sounds
promising. Thanks.


Best Regards,
Yan, Zi
Yang Shi Oct. 25, 2024, 6:36 p.m. UTC | #7
On Fri, Oct 25, 2024 at 8:32 AM Zi Yan <ziy@nvidia.com> wrote:
>
> On 25 Oct 2024, at 1:41, Hugh Dickins wrote:
>
> > On Thu, 24 Oct 2024, Zi Yan wrote:
> >> On 24 Oct 2024, at 0:10, Hugh Dickins wrote:
> >>
> >>> The new unlocked list_del_init() in deferred_split_scan() is buggy.
> >>> I gave bad advice, it looks plausible since that's a local on-stack
> >>> list, but the fact is that it can race with a third party freeing or
> >>> migrating the preceding folio (properly unqueueing it with refcount 0
> >>> while holding split_queue_lock), thereby corrupting the list linkage.
> >>>
> >>> The obvious answer would be to take split_queue_lock there: but it has
> >>> a long history of contention, so I'm reluctant to add to that. Instead,
> >>> make sure that there is always one safe (raised refcount) folio before,
> >>> by delaying its folio_put().  (And of course I was wrong to suggest
> >>> updating split_queue_len without the lock: leave that until the splice.)
> >>
> >> I feel like this is not the right approach, since it breaks the existing
> >> convention for changing folio->_deferred_list, namely taking
> >> ds_queue->split_queue_lock for serialization. The contention might not be
> >> as high as you think, since if a folio is split, the split_queue_lock
> >> needs to be taken during the split anyway. So the worst case is the same
> >> as when all folios are split. Do you see significant perf degradation due to
> >> taking the lock when doing list_del_init()?
> >>
> >> I am afraid if we take this route, we might hit hard-to-debug bugs
> >> in the future when someone touches the code.
> >
> > You have a good point: I am adding another element of trickiness
> > to that already-tricky local-but-not-quite list - which has tripped
> > us up a few times in the past.
> >
> > I do still feel that this solution is right in the spirit of that list;
> > but I've certainly not done any performance measurement to justify it,
> > nor would I ever trust my skill to do so.  I just tried to solve the
> > corruptions in what I thought was the best way.
> >
> > (To be honest, I found this solution to the corruptions first, and thought
> > the bug went back to the original implementation: that its put_page() at the
> > end of the loop was premature all along.  It was only when writing the
> > commit message two days ago, that I came to realize that even put_page()
> > or folio_put() would be safely using the lock to unqueue: that it is only
> > this new list_del_init() which is the exception which introduces the bug.)
> >
> > Looking at vmstats, I'm coming to believe that the performance advantage
> > of this way is likely to be in the noise: that mTHPs alone, and the
> > !partially_mapped case on top, are greatly increasing the split_deferred
> > stats: and may give rise to renewed complaints of lock contention, with
> > or without this optimization.
> >
> > While I still prefer to stick with what's posted and most tested, I am
> > giving the locked version a run overnight.  Thanks a lot for the reviews
> > and acks everyone: at present Zi Yan is in the minority preferring a
> > locked version, but please feel free to change your vote if you wish.
>
> Thank you a lot for taking the time to check the locked version. Looking
> forward to the result. BTW, I am not going to block this patch since it
> fixes the bug.
>
> The tricky part in deferred_split_scan() is always the use of
> folio->_deferred_list without taking split_queue_lock. I am thinking about
> using folio_batch to store the out-of-split_queue folios, so that _deferred_list
> will not be touched while we try to split these folios. Basically,
>
> 1. loop through split_queue and move folios to a folio_batch until the
>    folio_batch is full;
> 2. loop through the folio_batch to try to split each folio;
> 3. move the remaining folios back to split_queue.
>
> With this approach, split_queue_lock might be taken more often if there are
> more than 31 (the folio_batch max size) folios on split_queue, and split_queue_lock
> will be held longer in step 3, since the remaining folios need to be
> added back to split_queue one by one instead of with a single list splice.

IMHO, the folio_batch approach is worth trying. The deferred list lock
is just held when deleting a folio from the deferred list and updating the
list length. Re-acquiring the lock every 31 folios seems not too bad. Of
course, some benchmarking is needed.

The other subtle thing is folio->_deferred_list is reused when the
folio is moved to the local on-stack list. And some
list_empty(deferred_list) checks return true even though the folio is
actually on the local on-stack list. Some code may depend on or
inadvertently depend on this behavior. Using folio_batch may break
some assumptions, but depending on this subtle behavior is definitely
not reliable IMHO.

>
> Let me know your thoughts. I can look into this if this approach sounds
> promising. Thanks.
>
>
> Best Regards,
> Yan, Zi
Hugh Dickins Oct. 27, 2024, 4:43 a.m. UTC | #8
On Fri, 25 Oct 2024, Zi Yan wrote:
> 
> Thank you a lot for taking the time to check the locked version. Looking
> forward to the result.

The locked version worked fine (I did see some unusual filesystem-on-loop
errors on this laptop, but very much doubt that was related to the extra
deferred split queue locking).  But I do still prefer the original version.

> BTW, I am not going to block this patch since it fixes the bug.

Thanks!  I'll put out the same as v2 1/2.

> 
> The tricky part in deferred_split_scan() is always the use of
> folio->_deferred_list without taking split_queue_lock. I am thinking about
> using folio_batch to store the out-of-split_queue folios, so that _deferred_list
> will not be touched while we try to split these folios. Basically,
> 
> 1. loop through split_queue and move folios to a folio_batch until the
>    folio_batch is full;
> 2. loop through the folio_batch to try to split each folio;
> 3. move the remaining folios back to split_queue.
> 
> With this approach, split_queue_lock might be taken more often if there are
> more than 31 (the folio_batch max size) folios on split_queue, and split_queue_lock
> will be held longer in step 3, since the remaining folios need to be
> added back to split_queue one by one instead of with a single list splice.
> 
> Let me know your thoughts. I can look into this if this approach sounds
> promising. Thanks.

TBH, I just don't see the point: we have a working mechanism, and
complicating it to rely on more infrastructure, just to reach the
same endpoint, seems a waste of developer time to me.  It's not as
if a folio_batch there would make the split handling more efficient.

It would disambiguate the list_empty(), sure; but as it stands,
there's nothing in the handling which needs that disambiguated.

I can half-imagine that a folio_batch might become a better technique,
if we go on to need less restricted use of the deferred split queue:
if for some reason we grow to want to unqueue a THP while it's still
in use (as memcg move wanted in 2/2, but was not worth recoding for).

But I'm not sure whether a folio_batch would actually help there,
and I wouldn't worry about it unless there's a need.

Hugh
Hugh Dickins Oct. 27, 2024, 5:08 a.m. UTC | #9
On Fri, 25 Oct 2024, Yang Shi wrote:
> 
> The other subtle thing is folio->_deferred_list is reused when the
> folio is moved to the local on-stack list. And some

Yes.

> list_empty(deferred_list) checks return true even though the folio is
> actually on the local on-stack list. Some code may depend on or

The code definitely depends on that behaviour: that's how folios get
unqueued when refcount reaches 0, whether they are on the public list
or on the local list at that time.
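
(For reference, the freeing-side unqueue is essentially this - simplified
from memory of folio_undo_large_rmappable(), so treat it as illustration
rather than quotation:

	if (data_race(list_empty(&folio->_deferred_list)))
		return;

	ds_queue = get_deferred_split_queue(folio);
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (!list_empty(&folio->_deferred_list)) {
		ds_queue->split_queue_len--;
		list_del_init(&folio->_deferred_list);
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

and that locked list_del_init() unlinks correctly whether the folio is
linked into the public queue or into the scanner's local list.)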

> inadvertently depend on this behavior. Using folio_batch may break
> some assumptions, but depending on this subtle behavior is definitely
> not reliable IMHO.
Yang Shi Oct. 28, 2024, 6:36 p.m. UTC | #10
On Sat, Oct 26, 2024 at 10:08 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Fri, 25 Oct 2024, Yang Shi wrote:
> >
> > The other subtle thing is folio->_deferred_list is reused when the
> > folio is moved to the local on-stack list. And some
>
> Yes.
>
> > list_empty(deferred_list) checks return true even though the folio is
> > actually on the local on-stack list. Some code may depend on or
>
> The code definitely depends on that behaviour: that's how folios get
> unqueued when refcount reaches 0, whether they are on the public list
> or on the local list at that time.

Yeah, a folio may have a 0 refcount on the local list after that
folio_put(), before it is moved back to the deferred list.

The main purpose of using folio_batch is to disambiguate list_empty()
so that we don't rely on this subtle behavior. But I soon realized
this may make deferred list lock contention worse when moving the
folios back to the deferred list. Currently we just need to do a list
splice, but with folio_batch we would have to add every single folio
back to the deferred list one by one. It depends on how often folio
splits fail.

>
> > inadvertently depend on this behavior. Using folio_batch may break
> > some assumptions, but depending on this subtle behavior is definitely
> > not reliable IMHO.

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2fb328880b50..a1d345f1680c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3718,8 +3718,8 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
 	unsigned long flags;
 	LIST_HEAD(list);
-	struct folio *folio, *next;
-	int split = 0;
+	struct folio *folio, *next, *prev = NULL;
+	int split = 0, removed = 0;
 
 #ifdef CONFIG_MEMCG
 	if (sc->memcg)
@@ -3775,15 +3775,28 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 		 */
 		if (!did_split && !folio_test_partially_mapped(folio)) {
 			list_del_init(&folio->_deferred_list);
-			ds_queue->split_queue_len--;
+			removed++;
+		} else {
+			/*
+			 * That unlocked list_del_init() above would be unsafe,
+			 * unless its folio is separated from any earlier folios
+			 * left on the list (which may be concurrently unqueued)
+			 * by one safe folio with refcount still raised.
+			 */
+			swap(folio, prev);
 		}
-		folio_put(folio);
+		if (folio)
+			folio_put(folio);
 	}
 
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	list_splice_tail(&list, &ds_queue->split_queue);
+	ds_queue->split_queue_len -= removed;
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
+	if (prev)
+		folio_put(prev);
+
 	/*
 	 * Stop shrinker if we didn't split any page, but the queue is empty.
 	 * This can happen if pages were freed under us.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7845c64a2c57..2703227cce88 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4631,8 +4631,7 @@  static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 	VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
 			!folio_test_hugetlb(folio) &&
-			!list_empty(&folio->_deferred_list) &&
-			folio_test_partially_mapped(folio), folio);
+			!list_empty(&folio->_deferred_list), folio);
 
 	/*
 	 * Nobody should be changing or seriously looking at
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8afab64814dc..4b21a368b4e2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -961,9 +961,8 @@  static int free_tail_page_prepare(struct page *head_page, struct page *page)
 		break;
 	case 2:
 		/* the second tail page: deferred_list overlaps ->mapping */
-		if (unlikely(!list_empty(&folio->_deferred_list) &&
-		    folio_test_partially_mapped(folio))) {
-			bad_page(page, "partially mapped folio on deferred list");
+		if (unlikely(!list_empty(&folio->_deferred_list))) {
+			bad_page(page, "on deferred list");
 			goto out;
 		}
 		break;