
[v2] mm/migrate: put dest folio on deferred split list if source was there.

Message ID 20240311195848.135067-1-zi.yan@sent.com (mailing list archive)
State New
Series [v2] mm/migrate: put dest folio on deferred split list if source was there.

Commit Message

Zi Yan March 11, 2024, 7:58 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

Commit 616b8371539a6 ("mm: thp: enable thp migration in generic path")
did not check if a THP is on the deferred split list before migration,
thus the destination THP is never put on the deferred split list even if
the source THP was. The opportunity of reclaiming free pages in a
partially mapped THP during deferred list scanning is lost, but no other
harmful consequence is present[1]. Check the source folio's deferred
split list status before it is unmapped, and add the destination folio
to the list after migration if the source folio was on it.

[1]: https://lore.kernel.org/linux-mm/03CE3A00-917C-48CC-8E1C-6A98713C817C@nvidia.com/

From v1:
1. Used dst to get correct deferred split list after migration
   (per Ryan Roberts).

Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path")
Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/huge_memory.c | 22 ----------------------
 mm/internal.h    | 23 +++++++++++++++++++++++
 mm/migrate.c     | 26 +++++++++++++++++++++++++-
 3 files changed, 48 insertions(+), 23 deletions(-)

Comments

Matthew Wilcox (Oracle) March 12, 2024, 3:45 a.m. UTC | #1
On Mon, Mar 11, 2024 at 03:58:48PM -0400, Zi Yan wrote:
> @@ -1168,6 +1172,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
>  		folio_lock(src);
>  	}
>  	locked = true;
> +	if (folio_test_large_rmappable(src) &&
> +		!list_empty(&src->_deferred_list)) {
> +		struct deferred_split *ds_queue = get_deferred_split_queue(src);
> +
> +		spin_lock(&ds_queue->split_queue_lock);
> +		ds_queue->split_queue_len--;
> +		list_del_init(&src->_deferred_list);
> +		spin_unlock(&ds_queue->split_queue_lock);
> +		old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
> +	}

I have a few problems with this ...

Trivial: your whitespace is utterly broken.  You can't use a single tab
for both indicating control flow change and for line-too-long.

Slightly more important: You're checking list_empty outside the lock
(which is fine in order to avoid unnecessarily acquiring the lock),
but you need to re-check it inside the lock in case of a race.  And you
didn't mark it as data_race(), so KCSAN will whinge.
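
I.e. something like this (untested sketch):

	if (folio_test_large_rmappable(src) &&
	    data_race(!list_empty(&src->_deferred_list))) {
		struct deferred_split *ds_queue = get_deferred_split_queue(src);

		spin_lock(&ds_queue->split_queue_lock);
		/* somebody may have raced with us; re-check under the lock */
		if (!list_empty(&src->_deferred_list)) {
			ds_queue->split_queue_len--;
			list_del_init(&src->_deferred_list);
			old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
		}
		spin_unlock(&ds_queue->split_queue_lock);
	}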

Much more important: You're doing this with a positive refcount, which
breaks the (undocumented) logic in deferred_split_scan() that a folio
with a positive refcount will not be removed from the list.

Maximally important: We shouldn't be doing any of this!  This folio is
on the deferred split list.  We shouldn't be migrating it as a single
entity; we should be splitting it now that we're in a context where we
can do the right thing and split it.  Documentation/mm/transhuge.rst
is clear that we don't split it straight away due to locking context.
Splitting it on migration is clearly the right thing to do.

If splitting fails, we should just fail the migration; splitting fails
due to excess references, and if the source folio has excess references,
then migration would fail too.
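
I.e. roughly this in migrate_folio_unmap(), instead of the list surgery
(untested; assuming split_folio() here, which also takes the folio off
the deferred list when it succeeds):

	if (folio_test_large(src) && folio_test_large_rmappable(src) &&
	    data_race(!list_empty(&src->_deferred_list))) {
		/* src is already locked at this point, as the split requires */
		rc = split_folio(src);
		if (rc)
			goto out;	/* excess references; migration would fail too */
	}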
Baolin Wang March 12, 2024, 7:27 a.m. UTC | #2
On 2024/3/12 03:58, Zi Yan wrote:
> From: Zi Yan <ziy@nvidia.com>
> 
> Commit 616b8371539a6 ("mm: thp: enable thp migration in generic path")
> did not check if a THP is on the deferred split list before migration,
> thus the destination THP is never put on the deferred split list even if
> the source THP was. The opportunity of reclaiming free pages in a
> partially mapped THP during deferred list scanning is lost, but no other
> harmful consequence is present[1]. Check the source folio's deferred
> split list status before it is unmapped, and add the destination folio
> to the list after migration if the source folio was on it.
> 
> [1]: https://lore.kernel.org/linux-mm/03CE3A00-917C-48CC-8E1C-6A98713C817C@nvidia.com/
> 
>  From v1:
> 1. Used dst to get correct deferred split list after migration
>     (per Ryan Roberts).
> 
> Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path")
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>   mm/huge_memory.c | 22 ----------------------
>   mm/internal.h    | 23 +++++++++++++++++++++++
>   mm/migrate.c     | 26 +++++++++++++++++++++++++-
>   3 files changed, 48 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9859aa4f7553..c6d4d0cdf4b3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -766,28 +766,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>   	return pmd;
>   }
>   
> -#ifdef CONFIG_MEMCG
> -static inline
> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
> -{
> -	struct mem_cgroup *memcg = folio_memcg(folio);
> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
> -
> -	if (memcg)
> -		return &memcg->deferred_split_queue;
> -	else
> -		return &pgdat->deferred_split_queue;
> -}
> -#else
> -static inline
> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
> -{
> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
> -
> -	return &pgdat->deferred_split_queue;
> -}
> -#endif
> -
>   void folio_prep_large_rmappable(struct folio *folio)
>   {
>   	if (!folio || !folio_test_large(folio))
> diff --git a/mm/internal.h b/mm/internal.h
> index d1c69119b24f..8fa36e84463a 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1107,6 +1107,29 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>   				   unsigned long addr, pmd_t *pmd,
>   				   unsigned int flags);
>   
> +#ifdef CONFIG_MEMCG
> +static inline
> +struct deferred_split *get_deferred_split_queue(struct folio *folio)
> +{
> +	struct mem_cgroup *memcg = folio_memcg(folio);
> +	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
> +
> +	if (memcg)
> +		return &memcg->deferred_split_queue;
> +	else
> +		return &pgdat->deferred_split_queue;
> +}
> +#else
> +static inline
> +struct deferred_split *get_deferred_split_queue(struct folio *folio)
> +{
> +	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
> +
> +	return &pgdat->deferred_split_queue;
> +}
> +#endif
> +
> +
>   /*
>    * mm/mmap.c
>    */
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 73a052a382f1..591e65658535 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -20,6 +20,7 @@
>   #include <linux/pagemap.h>
>   #include <linux/buffer_head.h>
>   #include <linux/mm_inline.h>
> +#include <linux/mmzone.h>
>   #include <linux/nsproxy.h>
>   #include <linux/ksm.h>
>   #include <linux/rmap.h>
> @@ -1037,7 +1038,10 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
>   enum {
>   	PAGE_WAS_MAPPED = BIT(0),
>   	PAGE_WAS_MLOCKED = BIT(1),
> -	PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
> +	PAGE_WAS_ON_DEFERRED_LIST = BIT(2),
> +	PAGE_OLD_STATES = PAGE_WAS_MAPPED |
> +			  PAGE_WAS_MLOCKED |
> +			  PAGE_WAS_ON_DEFERRED_LIST,
>   };
>   
>   static void __migrate_folio_record(struct folio *dst,
> @@ -1168,6 +1172,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
>   		folio_lock(src);
>   	}
>   	locked = true;
> +	if (folio_test_large_rmappable(src) &&

IMO, you should check folio_test_large() before calling 
folio_test_large_rmappable(), since the PG_large_rmappable flag is 
stored in the first tail page.
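
Something like (just illustrating the ordering):

	if (folio_test_large(src) && folio_test_large_rmappable(src) &&
	    !list_empty(&src->_deferred_list)) {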

> +		!list_empty(&src->_deferred_list)) {
> +		struct deferred_split *ds_queue = get_deferred_split_queue(src);
> +
> +		spin_lock(&ds_queue->split_queue_lock);
> +		ds_queue->split_queue_len--;
> +		list_del_init(&src->_deferred_list);
> +		spin_unlock(&ds_queue->split_queue_lock);
> +		old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
> +	}
> +
>   	if (folio_test_mlocked(src))
>   		old_page_state |= PAGE_WAS_MLOCKED;
>   
> @@ -1307,6 +1322,15 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
>   	if (old_page_state & PAGE_WAS_MAPPED)
>   		remove_migration_ptes(src, dst, false);
>   
> +	if (old_page_state & PAGE_WAS_ON_DEFERRED_LIST) {
> +		struct deferred_split *ds_queue = get_deferred_split_queue(dst);
> +
> +		spin_lock(&ds_queue->split_queue_lock);
> +		ds_queue->split_queue_len++;
> +		list_add(&dst->_deferred_list, &ds_queue->split_queue);
> +		spin_unlock(&ds_queue->split_queue_lock);
> +	}
> +
>   out_unlock_both:
>   	folio_unlock(dst);
>   	set_page_owner_migrate_reason(&dst->page, reason);
Ryan Roberts March 12, 2024, 8:05 a.m. UTC | #3
On 12/03/2024 03:45, Matthew Wilcox wrote:
> On Mon, Mar 11, 2024 at 03:58:48PM -0400, Zi Yan wrote:
>> @@ -1168,6 +1172,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
>>  		folio_lock(src);
>>  	}
>>  	locked = true;
>> +	if (folio_test_large_rmappable(src) &&

I think you also need to check that the order > 1, now that we support order-1
pagecache folios? _deferred_list only exists if order > 1.
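
e.g. (a sketch; folio_order() returns 0 for small folios, so the order
check also covers the large check):

	if (folio_order(src) > 1 &&
	    folio_test_large_rmappable(src) &&
	    !list_empty(&src->_deferred_list)) {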

>> +		!list_empty(&src->_deferred_list)) {
>> +		struct deferred_split *ds_queue = get_deferred_split_queue(src);
>> +
>> +		spin_lock(&ds_queue->split_queue_lock);
>> +		ds_queue->split_queue_len--;
>> +		list_del_init(&src->_deferred_list);
>> +		spin_unlock(&ds_queue->split_queue_lock);
>> +		old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
>> +	}
> 
> I have a few problems with this ...
> 
> Trivial: your whitespace is utterly broken.  You can't use a single tab
> for both indicating control flow change and for line-too-long.
> 
> Slightly more important: You're checking list_empty outside the lock
> (which is fine in order to avoid unnecessarily acquiring the lock),
> but you need to re-check it inside the lock in case of a race.  And you
> didn't mark it as data_race(), so KCSAN will whinge.

I've seen data_race() used around list_empty() without the lock held
inconsistently (see deferred_split_folio()). What are the rules? Given that we
are not doing a memory access here, I don't really understand why it is needed?
list_empty() uses READ_ONCE() which I thought would be sufficient? (I've just
added a similar lockless check in my swap-out series - will add data_race() if
needed, but previously concluded it's not).
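
For reference, list_empty() is just (include/linux/list.h):

	static inline int list_empty(const struct list_head *head)
	{
		return READ_ONCE(head->next) == head;
	}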

> 
> Much more important: You're doing this with a positive refcount, which
> breaks the (undocumented) logic in deferred_split_scan() that a folio
> with a positive refcount will not be removed from the list.
> 
> Maximally important: We shouldn't be doing any of this!  This folio is
> on the deferred split list.  We shouldn't be migrating it as a single
> entity; we should be splitting it now that we're in a context where we
> can do the right thing and split it.  Documentation/mm/transhuge.rst
> is clear that we don't split it straight away due to locking context.
> Splitting it on migration is clearly the right thing to do.
> 
> If splitting fails, we should just fail the migration; splitting fails
> due to excess references, and if the source folio has excess references,
> then migration would fail too.

This comment makes me wonder what we do in split_huge_page_to_list_to_order() if
the target order is greater than 1 and the input folio is on the deferred split
list. Looks like we currently just remove it from the deferred list. Is there a
case for putting any output folios that are still partially mapped back on the
deferred list, or splitting them to a lower order such that all output folios
are fully mapped, and all unmapped portions are freed?
Zi Yan March 12, 2024, 1:49 p.m. UTC | #4
On 12 Mar 2024, at 3:27, Baolin Wang wrote:

> On 2024/3/12 03:58, Zi Yan wrote:
>> From: Zi Yan <ziy@nvidia.com>
>>
>> Commit 616b8371539a6 ("mm: thp: enable thp migration in generic path")
>> did not check if a THP is on the deferred split list before migration,
>> thus the destination THP is never put on the deferred split list even if
>> the source THP was. The opportunity of reclaiming free pages in a
>> partially mapped THP during deferred list scanning is lost, but no other
>> harmful consequence is present[1]. Check the source folio's deferred
>> split list status before it is unmapped, and add the destination folio
>> to the list after migration if the source folio was on it.
>>
>> [1]: https://lore.kernel.org/linux-mm/03CE3A00-917C-48CC-8E1C-6A98713C817C@nvidia.com/
>>
>>  From v1:
>> 1. Used dst to get correct deferred split list after migration
>>     (per Ryan Roberts).
>>
>> Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path")
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>   mm/huge_memory.c | 22 ----------------------
>>   mm/internal.h    | 23 +++++++++++++++++++++++
>>   mm/migrate.c     | 26 +++++++++++++++++++++++++-
>>   3 files changed, 48 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 9859aa4f7553..c6d4d0cdf4b3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -766,28 +766,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>>   	return pmd;
>>   }
>>  -#ifdef CONFIG_MEMCG
>> -static inline
>> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
>> -{
>> -	struct mem_cgroup *memcg = folio_memcg(folio);
>> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
>> -
>> -	if (memcg)
>> -		return &memcg->deferred_split_queue;
>> -	else
>> -		return &pgdat->deferred_split_queue;
>> -}
>> -#else
>> -static inline
>> -struct deferred_split *get_deferred_split_queue(struct folio *folio)
>> -{
>> -	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
>> -
>> -	return &pgdat->deferred_split_queue;
>> -}
>> -#endif
>> -
>>   void folio_prep_large_rmappable(struct folio *folio)
>>   {
>>   	if (!folio || !folio_test_large(folio))
>> diff --git a/mm/internal.h b/mm/internal.h
>> index d1c69119b24f..8fa36e84463a 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -1107,6 +1107,29 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>>   				   unsigned long addr, pmd_t *pmd,
>>   				   unsigned int flags);
>>  +#ifdef CONFIG_MEMCG
>> +static inline
>> +struct deferred_split *get_deferred_split_queue(struct folio *folio)
>> +{
>> +	struct mem_cgroup *memcg = folio_memcg(folio);
>> +	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
>> +
>> +	if (memcg)
>> +		return &memcg->deferred_split_queue;
>> +	else
>> +		return &pgdat->deferred_split_queue;
>> +}
>> +#else
>> +static inline
>> +struct deferred_split *get_deferred_split_queue(struct folio *folio)
>> +{
>> +	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
>> +
>> +	return &pgdat->deferred_split_queue;
>> +}
>> +#endif
>> +
>> +
>>   /*
>>    * mm/mmap.c
>>    */
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 73a052a382f1..591e65658535 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -20,6 +20,7 @@
>>   #include <linux/pagemap.h>
>>   #include <linux/buffer_head.h>
>>   #include <linux/mm_inline.h>
>> +#include <linux/mmzone.h>
>>   #include <linux/nsproxy.h>
>>   #include <linux/ksm.h>
>>   #include <linux/rmap.h>
>> @@ -1037,7 +1038,10 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
>>   enum {
>>   	PAGE_WAS_MAPPED = BIT(0),
>>   	PAGE_WAS_MLOCKED = BIT(1),
>> -	PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
>> +	PAGE_WAS_ON_DEFERRED_LIST = BIT(2),
>> +	PAGE_OLD_STATES = PAGE_WAS_MAPPED |
>> +			  PAGE_WAS_MLOCKED |
>> +			  PAGE_WAS_ON_DEFERRED_LIST,
>>   };
>>    static void __migrate_folio_record(struct folio *dst,
>> @@ -1168,6 +1172,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
>>   		folio_lock(src);
>>   	}
>>   	locked = true;
>> +	if (folio_test_large_rmappable(src) &&
>
> IMO, you should check folio_test_large() before calling folio_test_large_rmappable(), since the PG_large_rmappable flag is stored in the first tail page.

You are right. Ryan also pointed this out in another email. Will fix. Thanks.


--
Best Regards,
Yan, Zi
Zi Yan March 12, 2024, 2:13 p.m. UTC | #5
On 11 Mar 2024, at 23:45, Matthew Wilcox wrote:

> On Mon, Mar 11, 2024 at 03:58:48PM -0400, Zi Yan wrote:
>> @@ -1168,6 +1172,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
>>  		folio_lock(src);
>>  	}
>>  	locked = true;
>> +	if (folio_test_large_rmappable(src) &&
>> +		!list_empty(&src->_deferred_list)) {
>> +		struct deferred_split *ds_queue = get_deferred_split_queue(src);
>> +
>> +		spin_lock(&ds_queue->split_queue_lock);
>> +		ds_queue->split_queue_len--;
>> +		list_del_init(&src->_deferred_list);
>> +		spin_unlock(&ds_queue->split_queue_lock);
>> +		old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
>> +	}
>
> I have a few problems with this ...
>
> Trivial: your whitespace is utterly broken.  You can't use a single tab
> for both indicating control flow change and for line-too-long.

Got it. Will not do this any more.

>
> Slightly more important: You're checking list_empty outside the lock
> (which is fine in order to avoid unnecessarily acquiring the lock),
> but you need to re-check it inside the lock in case of a race.  And you
> didn't mark it as data_race(), so KCSAN will whinge.

Got it and will check data_race() related changes. Will fix.

>
> Much more important: You're doing this with a positive refcount, which
> breaks the (undocumented) logic in deferred_split_scan() that a folio
> with a positive refcount will not be removed from the list.

What is the issue here? I thought as long as the split_queue_lock is held,
it should be OK to manipulate the list.

>
> Maximally important: We shouldn't be doing any of this!  This folio is
> on the deferred split list.  We shouldn't be migrating it as a single
> entity; we should be splitting it now that we're in a context where we
> can do the right thing and split it.  Documentation/mm/transhuge.rst
> is clear that we don't split it straight away due to locking context.
> Splitting it on migration is clearly the right thing to do.
>
> If splitting fails, we should just fail the migration; splitting fails
> due to excess references, and if the source folio has excess references,
> then migration would fail too.

You are suggesting:
1. checking if the folio is on deferred split list or not
2. if yes, split the folio
3. if split fails, fail the migration as well.

It sounds reasonable to me. The split folios should be migrated since
the before-split folio wants to be migrated. This split is not because
a new page cannot be allocated, thus the split folios should go
into ret_folios list instead of split_folios list.

Thank you for the comments.




--
Best Regards,
Yan, Zi
Matthew Wilcox (Oracle) March 12, 2024, 2:19 p.m. UTC | #6
On Tue, Mar 12, 2024 at 10:13:16AM -0400, Zi Yan wrote:
> On 11 Mar 2024, at 23:45, Matthew Wilcox wrote:
> > Much more important: You're doing this with a positive refcount, which
> > breaks the (undocumented) logic in deferred_split_scan() that a folio
> > with a positive refcount will not be removed from the list.
> 
> What is the issue here? I thought as long as the split_queue_lock is held,
> it should be OK to manipulate the list.

I just worked this out yesterday:
https://lore.kernel.org/linux-mm/Ze9EFdFLXQEUVtKl@casper.infradead.org/
(the last chunk, starting with Ryan asking me "what about the first bug
you found")

> > Maximally important: We shouldn't be doing any of this!  This folio is
> > on the deferred split list.  We shouldn't be migrating it as a single
> > entity; we should be splitting it now that we're in a context where we
> > can do the right thing and split it.  Documentation/mm/transhuge.rst
> > is clear that we don't split it straight away due to locking context.
> > Splitting it on migration is clearly the right thing to do.
> >
> > If splitting fails, we should just fail the migration; splitting fails
> > due to excess references, and if the source folio has excess references,
> > then migration would fail too.
> 
> You are suggesting:
> 1. checking if the folio is on deferred split list or not
> 2. if yes, split the folio
> 3. if split fails, fail the migration as well.
> 
> It sounds reasonable to me. The split folios should be migrated since
> the before-split folio wants to be migrated. This split is not because
> a new page cannot be allocated, thus the split folios should go
> into ret_folios list instead of split_folios list.

Yes, I'm happy for the split folios to be migrated.  Bonus points if you
want to figure out what order to split the folio to ;-)  I don't think
it's critical.
Zi Yan March 12, 2024, 2:26 p.m. UTC | #7
On 12 Mar 2024, at 4:05, Ryan Roberts wrote:

> On 12/03/2024 03:45, Matthew Wilcox wrote:
>> On Mon, Mar 11, 2024 at 03:58:48PM -0400, Zi Yan wrote:
>>> @@ -1168,6 +1172,17 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
>>>  		folio_lock(src);
>>>  	}
>>>  	locked = true;
>>> +	if (folio_test_large_rmappable(src) &&
>
> I think you also need to check that the order > 1, now that we support order-1
> pagecache folios? _deferred_list only exists if order > 1.
>
>>> +		!list_empty(&src->_deferred_list)) {
>>> +		struct deferred_split *ds_queue = get_deferred_split_queue(src);
>>> +
>>> +		spin_lock(&ds_queue->split_queue_lock);
>>> +		ds_queue->split_queue_len--;
>>> +		list_del_init(&src->_deferred_list);
>>> +		spin_unlock(&ds_queue->split_queue_lock);
>>> +		old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
>>> +	}
>>
>> I have a few problems with this ...
>>
>> Trivial: your whitespace is utterly broken.  You can't use a single tab
>> for both indicating control flow change and for line-too-long.
>>
>> Slightly more important: You're checking list_empty outside the lock
>> (which is fine in order to avoid unnecessarily acquiring the lock),
>> but you need to re-check it inside the lock in case of a race.  And you
>> didn't mark it as data_race(), so KCSAN will whinge.
>
> I've seen data_race() used around list_empty() without the lock held
> inconsistently (see deferred_split_folio()). What are the rules? Given that we
> are not doing a memory access here, I don't really understand why it is needed?
> list_empty() uses READ_ONCE() which I thought would be sufficient? (I've just
> added a similar lockless check in my swap-out series - will add data_race() if
> needed, but previously concluded it's not).
>
>>
>> Much more important: You're doing this with a positive refcount, which
>> breaks the (undocumented) logic in deferred_split_scan() that a folio
>> with a positive refcount will not be removed from the list.
>>
>> Maximally important: We shouldn't be doing any of this!  This folio is
>> on the deferred split list.  We shouldn't be migrating it as a single
>> entity; we should be splitting it now that we're in a context where we
>> can do the right thing and split it.  Documentation/mm/transhuge.rst
>> is clear that we don't split it straight away due to locking context.
>> Splitting it on migration is clearly the right thing to do.
>>
>> If splitting fails, we should just fail the migration; splitting fails
>> due to excess references, and if the source folio has excess references,
>> then migration would fail too.
>
> This comment makes me wonder what we do in split_huge_page_to_list_to_order() if
> the target order is greater than 1 and the input folio is on the deferred split
> list. Looks like we currently just remove it from the deferred list. Is there a
> case for putting any output folios that are still partially mapped back on the
> deferred list, or splitting them to a lower order such that all output folios
> are fully mapped, and all unmapped portions are freed?

I probably would let the caller of split_huge_page_to_list_to_order() decide
whether output folios should be put back in deferred list. The caller should
determine the right order to split. Letting split_huge_page_to_list_to_order()
change new_order will confuse the caller and complicate the handling of
output folios.



--
Best Regards,
Yan, Zi
Zi Yan March 12, 2024, 3:51 p.m. UTC | #8
On 12 Mar 2024, at 10:19, Matthew Wilcox wrote:

> On Tue, Mar 12, 2024 at 10:13:16AM -0400, Zi Yan wrote:
>> On 11 Mar 2024, at 23:45, Matthew Wilcox wrote:
>>> Much more important: You're doing this with a positive refcount, which
>>> breaks the (undocumented) logic in deferred_split_scan() that a folio
>>> with a positive refcount will not be removed from the list.
>>
>> What is the issue here? I thought as long as the split_queue_lock is held,
>> it should be OK to manipulate the list.
>
> I just worked this out yesterday:
> https://lore.kernel.org/linux-mm/Ze9EFdFLXQEUVtKl@casper.infradead.org/
> (the last chunk, starting with Ryan asking me "what about the first bug
> you found")

Hmm, like you said a folio with a positive refcount will not be removed
from ds_queue->split_queue, it will have no chance going to the separate
list in deferred_split_scan() and list_del_init() will not corrupt
that list. So it should be safe. Or the issue is that before migration
adds a refcount, the folio is removed from ds_queue->split_queue
and put on the list in deferred_split_scan(), as a result, any manipulation
of folio->_deferred_list could corrupt the list. Basically,
!list_empty(&folio->_deferred_list) cannot tell if the folio is on
ds_queue->split_queue or another list. I am not sure about why "a positive
refcount" is related here.

That makes me wonder whether ds_queue->split_queue_lock is also needed
for list_for_each_entry_safe() in deferred_split_scan(). Basically,
ds_queue->split_queue_lock protects folio->_deferred_list in addition to
ds_queue->split_queue.



>>> Maximally important: We shouldn't be doing any of this!  This folio is
>>> on the deferred split list.  We shouldn't be migrating it as a single
>>> entity; we should be splitting it now that we're in a context where we
>>> can do the right thing and split it.  Documentation/mm/transhuge.rst
>>> is clear that we don't split it straight away due to locking context.
>>> Splitting it on migration is clearly the right thing to do.
>>>
>>> If splitting fails, we should just fail the migration; splitting fails
>>> due to excess references, and if the source folio has excess references,
>>> then migration would fail too.
>>
>> You are suggesting:
>> 1. checking if the folio is on deferred split list or not
>> 2. if yes, split the folio
>> 3. if split fails, fail the migration as well.
>>
>> It sounds reasonable to me. The split folios should be migrated since
>> the before-split folio wants to be migrated. This split is not because
>> a new page cannot be allocated, thus the split folios should go
>> into ret_folios list instead of split_folios list.
>
> Yes, I'm happy for the split folios to be migrated.  Bonus points if you
> want to figure out what order to split the folio to ;-)  I don't think
> it's critical.


--
Best Regards,
Yan, Zi
Matthew Wilcox (Oracle) March 12, 2024, 4:38 p.m. UTC | #9
On Tue, Mar 12, 2024 at 11:51:13AM -0400, Zi Yan wrote:
> On 12 Mar 2024, at 10:19, Matthew Wilcox wrote:
> 
> > On Tue, Mar 12, 2024 at 10:13:16AM -0400, Zi Yan wrote:
> >> On 11 Mar 2024, at 23:45, Matthew Wilcox wrote:
> >>> Much more important: You're doing this with a positive refcount, which
> >>> breaks the (undocumented) logic in deferred_split_scan() that a folio
> >>> with a positive refcount will not be removed from the list.
> >>
> >> What is the issue here? I thought as long as the split_queue_lock is held,
> >> it should be OK to manipulate the list.
> >
> > I just worked this out yesterday:
> > https://lore.kernel.org/linux-mm/Ze9EFdFLXQEUVtKl@casper.infradead.org/
> > (the last chunk, starting with Ryan asking me "what about the first bug
> > you found")
> 
> Hmm, like you said a folio with a positive refcount will not be removed
> from ds_queue->split_queue, it will have no chance going to the separate
> list in deferred_split_scan() and list_del_init() will not corrupt
> that list.

You've misread it.  Folios with a _zero_ refcount are not removed from
the list in deferred_split_scan.  Folios with a positive refcount are
removed from the per-node or per-cgroup list _at which point there is
an undocumented assumption_ that they will not be removed from the
local list because they have a positive refcount.
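
For reference, the first pass of deferred_split_scan() over the queue is
roughly:

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
							_deferred_list) {
		if (folio_try_get(folio)) {
			/* positive refcount: move to the local list */
			list_move(&folio->_deferred_list, &list);
		} else {
			/* We lost race with folio_put() */
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);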

> So it should be safe. Or the issue is that before migration
> adds a refcount, the folio is removed from ds_queue->split_queue
> and put on the list in deferred_split_scan(), as a result, any manipulation
> of folio->_deferred_list could corrupt the list. Basically,
> !list_empty(&folio->_deferred_list) cannot tell if the folio is on
> ds_queue->split_queue or another list. I am not sure about why "a positive
> refcount" is related here.
> 
> That makes me wonder whether ds_queue->split_queue_lock is also needed
> for list_for_each_entry_safe() in deferred_split_scan(). Basically,
> ds_queue->split_queue_lock protects folio->_deferred_list in addition to
> ds_queue->split_queue.
> 
> 
> 
> >>> Maximally important: We shouldn't be doing any of this!  This folio is
> >>> on the deferred split list.  We shouldn't be migrating it as a single
> >>> entity; we should be splitting it now that we're in a context where we
> >>> can do the right thing and split it.  Documentation/mm/transhuge.rst
> >>> is clear that we don't split it straight away due to locking context.
> >>> Splitting it on migration is clearly the right thing to do.
> >>>
> >>> If splitting fails, we should just fail the migration; splitting fails
> >>> due to excess references, and if the source folio has excess references,
> >>> then migration would fail too.
> >>
> >> You are suggesting:
> >> 1. checking if the folio is on deferred split list or not
> >> 2. if yes, split the folio
> >> 3. if split fails, fail the migration as well.
> >>
> >> It sounds reasonable to me. The split folios should be migrated since
> >> the before-split folio wants to be migrated. This split is not because
> >> a new page cannot be allocated, thus the split folios should go
> >> into ret_folios list instead of split_folios list.
> >
> > Yes, I'm happy for the split folios to be migrated.  Bonus points if you
> > want to figure out what order to split the folio to ;-)  I don't think
> > it's critical.
> 
> 
> --
> Best Regards,
> Yan, Zi
Zi Yan March 12, 2024, 6:32 p.m. UTC | #10
On 12 Mar 2024, at 12:38, Matthew Wilcox wrote:

> On Tue, Mar 12, 2024 at 11:51:13AM -0400, Zi Yan wrote:
>> On 12 Mar 2024, at 10:19, Matthew Wilcox wrote:
>>
>>> On Tue, Mar 12, 2024 at 10:13:16AM -0400, Zi Yan wrote:
>>>> On 11 Mar 2024, at 23:45, Matthew Wilcox wrote:
>>>>> Much more important: You're doing this with a positive refcount, which
>>>>> breaks the (undocumented) logic in deferred_split_scan() that a folio
>>>>> with a positive refcount will not be removed from the list.
>>>>
>>>> What is the issue here? I thought as long as the split_queue_lock is held,
>>>> it should be OK to manipulate the list.
>>>
>>> I just worked this out yesterday:
>>> https://lore.kernel.org/linux-mm/Ze9EFdFLXQEUVtKl@casper.infradead.org/
>>> (the last chunk, starting with Ryan asking me "what about the first bug
>>> you found")
>>
>> Hmm, like you said a folio with a positive refcount will not be removed
>> from ds_queue->split_queue, it will have no chance going to the separate
>> list in deferred_split_scan() and list_del_init() will not corrupt
>> that list.
>
> You've misread it.  Folios with a _zero_ refcount are not removed from
> the list in deferred_split_scan.  Folios with a positive refcount are
> removed from the per-node or per-cgroup list _at which point there is
> an undocumented assumption_ that they will not be removed from the
> local list because they have a positive refcount.

But that sounds very subtle if not broken. As an outsider of
deferred_split_scan(), only !list_empty(folio->_deferred_list) is checked.
The condition can be true if the folio is on split_queue or
local list of deferred_split_scan() with elevated refcount. In that case,
the folio cannot be removed from the list (either split_queue or local list)
even if split_queue_lock is held, since local list manipulation is not under
split_queue_lock. This makes _deferred_list a one-way train to anyone
except deferred_split_scan(), namely folios can only be added into
_deferred_list until they are freed or split by deferred_split_scan().

Is that intended? If yes, maybe we should document it. If not, using
split_queue_lock to protect local list, or more explicitly folio->_deferred_list
might be better?


>> So it should be safe. Or the issue is that before migration
>> adds a refcount, the folio is removed from ds_queue->split_queue
>> and put on the list in deferred_split_scan(), as a result, any manipulation
>> of folio->_deferred_list could corrupt the list. Basically,
>> !list_empty(&folio->_deferred_list) cannot tell if the folio is on
>> ds_queue->split_queue or another list. I am not sure about why "a positive
>> refcount" is related here.
>>
>> That makes me wonder whether ds_queue->split_queue_lock is also needed
>> for list_for_each_entry_safe() in deferred_split_scan(). Basically,
>> ds_queue->split_queue_lock protects folio->_deferred_list in addition to
>> ds_queue->split_queue.
>>
>>
>>
>>>>> Maximally important: We shouldn't be doing any of this!  This folio is
>>>>> on the deferred split list.  We shouldn't be migrating it as a single
>>>>> entity; we should be splitting it now that we're in a context where we
>>>>> can do the right thing and split it.  Documentation/mm/transhuge.rst
>>>>> is clear that we don't split it straight away due to locking context.
>>>>> Splitting it on migration is clearly the right thing to do.
>>>>>
>>>>> If splitting fails, we should just fail the migration; splitting fails
>>>>> due to excess references, and if the source folio has excess references,
>>>>> then migration would fail too.
>>>>
>>>> You are suggesting:
>>>> 1. checking if the folio is on deferred split list or not
>>>> 2. if yes, split the folio
>>>> 3. if split fails, fail the migration as well.
>>>>
>>>> It sounds reasonable to me. The split folios should be migrated since
>>>> the before-split folio wants to be migrated. This split is not because
>>>> a new page cannot be allocated, thus the split folios should go
>>>> into ret_folios list instead of split_folios list.
>>>
>>> Yes, I'm happy for the split folios to be migrated.  Bonus points if you
>>> want to figure out what order to split the folio to ;-)  I don't think
>>> it's critical.
>>
>>
>> --
>> Best Regards,
>> Yan, Zi


--
Best Regards,
Yan, Zi
Matthew Wilcox (Oracle) March 12, 2024, 6:46 p.m. UTC | #11
On Tue, Mar 12, 2024 at 02:32:43PM -0400, Zi Yan wrote:
> On 12 Mar 2024, at 12:38, Matthew Wilcox wrote:
> > Folios with a positive refcount are
> > removed from the per-node or per-cgroup list _at which point there is
> > an undocumented assumption_ that they will not be removed from the
> > local list because they have a positive refcount.
> 
> But that sounds very subtle if not broken. As an outsider of

I merely deduced this requirement; I didn't come up with it ...

> deferred_split_scan(), only !list_empty(folio->_deferred_list) is checked.
> The condition can be true if the folio is on split_queue or
> local list of deferred_split_scan() with elevated refcount. In that case,
> the folio cannot be removed from the list (either split_queue or local list)
> even if split_queue_lock is held, since local list manipulation is not under
> split_queue_lock. This makes _deferred_list a one-way train to anyone
> except deferred_split_scan(), namely folios can only be added into
> _deferred_list until they are freed or split by deferred_split_scan().
> 
> Is that intended? If yes, maybe we should document it. If not, using
> split_queue_lock to protect local list, or more explicitly folio->_deferred_list
> might be better?

To be fair, the folio can be split by anybody as
split_huge_page_to_list_to_order() is careful to only manipulate the
deferred list while the refcount is frozen at 0.  I'm still trying to
figure out where to document this behaviour of the deferred list that
someone (for example, your good self) would actually see it.
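
The part of split_huge_page_to_list_to_order() I mean is roughly:

	/* Prevent deferred_split_scan() touching ->_refcount */
	spin_lock(&ds_queue->split_queue_lock);
	if (folio_ref_freeze(folio, 1 + extra_pins)) {
		if (!list_empty(&folio->_deferred_list)) {
			ds_queue->split_queue_len--;
			list_del(&folio->_deferred_list);
		}
		...
	}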
Zi Yan March 12, 2024, 7:45 p.m. UTC | #12
On 12 Mar 2024, at 14:46, Matthew Wilcox wrote:

> On Tue, Mar 12, 2024 at 02:32:43PM -0400, Zi Yan wrote:
>> On 12 Mar 2024, at 12:38, Matthew Wilcox wrote:
>>> Folios with a positive refcount are
>>> removed from the per-node or per-cgroup list _at which point there is
>>> an undocumented assumption_ that they will not be removed from the
>>> local list because they have a positive refcount.
>>
>> But that sounds very subtle if not broken. As an outsider of
>
> I merely deduced this requirement; I didn't come up with it ...
>
>> deferred_split_scan(), only !list_empty(folio->_deferred_list) is checked.
>> The condition can be true if the folio is on split_queue or
>> local list of deferred_split_scan() with elevated refcount. In that case,
>> the folio cannot be removed from the list (either split_queue or local list)
>> even if split_queue_lock is held, since local list manipulation is not under
>> split_queue_lock. This makes _deferred_list a one-way train to anyone
>> except deferred_split_scan(), namely folios can only be added into
>> _deferred_list until they are freed or split by deferred_split_scan().
>>
>> Is that intended? If yes, maybe we should document it. If not, using
>> split_queue_lock to protect local list, or more explicitly folio->_deferred_list
>> might be better?
>
> To be fair, the folio can be split by anybody as
> split_huge_page_to_list_to_order() is careful to only manipulate the
> deferred list while the refcount is frozen at 0.  I'm still trying to
> figure out where to document this behaviour of the deferred list that
> someone (for example, your good self) would actually see it.

Yes, split is not affected. For migration, hypothetically, if a folio on
the deferred_list wants to be migrated without a split, it needs to be
removed and the dst folio needs to be added during the frozen phase. That
seems to be a very heavy operation just to get a folio off the
deferred_list. But we can change it when we want to do that in other
scenarios.

BTW, s/_deferred_list/_deferred_list_only_removable_during_scan/ works
best for me in terms of documentation. ;)


--
Best Regards,
Yan, Zi
Yin Fengwei March 13, 2024, 2:07 a.m. UTC | #13
On 3/13/2024 2:46 AM, Matthew Wilcox wrote:
> On Tue, Mar 12, 2024 at 02:32:43PM -0400, Zi Yan wrote:
>> On 12 Mar 2024, at 12:38, Matthew Wilcox wrote:
>>> Folios with a positive refcount are
>>> removed from the per-node or per-cgroup list _at which point there is
>>> an undocumented assumption_ that they will not be removed from the
>>> local list because they have a positive refcount.
>>
>> But that sounds very subtle if not broken. As an outsider of
> 
> I merely deduced this requirement; I didn't come up with it ...
My understanding is that this requirement exists only because of the local
list in deferred_split_scan().

Could using an fbatch instead of the local list here, as you created for
debugging that issue, eliminate this subtlety?
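
A rough sketch of that idea (hypothetical; folio_batch_add() returns the
number of slots still available, so 0 means the batch is full):

	struct folio_batch fbatch;

	folio_batch_init(&fbatch);
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
							_deferred_list) {
		if (folio_try_get(folio)) {
			/* emptying _deferred_list here removes the subtlety */
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
			if (!folio_batch_add(&fbatch, folio))
				break;
		}
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);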


Regards
Yin, Fengwei

> 
>> deferred_split_scan(), only !list_empty(folio->_deferred_list) is checked.
>> The condition can be true if the folio is on split_queue or
>> local list of deferred_split_scan() with elevated refcount. In that case,
>> the folio cannot be removed from the list (either split_queue or local list)
>> even if split_queue_lock is held, since local list manipulation is not under
>> split_queue_lock. This makes _deferred_list a one-way train to anyone
>> except deferred_split_scan(), namely folios can only be added into
>> _deferred_list until they are freed or split by deferred_split_scan().
>>
>> Is that intended? If yes, maybe we should document it. If not, using
>> split_queue_lock to protect local list, or more explicitly folio->_deferred_list
>> might be better?
> 
> To be fair, the folio can be split by anybody as
> split_huge_page_to_list_to_order() is careful to only manipulate the
> deferred list while the refcount is frozen at 0.  I'm still trying to
> figure out where to document this behaviour of the deferred list that
> someone (for example, your good self) would actually see it.
> 
> 
>
Yin Fengwei March 13, 2024, 2:33 a.m. UTC | #14
On 3/13/2024 10:07 AM, Yin, Fengwei wrote:
> 
> 
> On 3/13/2024 2:46 AM, Matthew Wilcox wrote:
>> On Tue, Mar 12, 2024 at 02:32:43PM -0400, Zi Yan wrote:
>>> On 12 Mar 2024, at 12:38, Matthew Wilcox wrote:
>>>> Folios with a positive refcount are
>>>> removed from the per-node or per-cgroup list _at which point there is
>>>> an undocumented assumption_ that they will not be removed from the
>>>> local list because they have a positive refcount.
>>>
>>> But that sounds very subtle if not broken. As an outsider of
>>
>> I merely deduced this requirement; I didn't come up with it ...
> My understanding is that this requirement exists only because of the local
> list in deferred_split_scan().
> 
> Could using an fbatch instead of the local list here, as you created for
> debugging that issue, eliminate this subtlety?
It may not be a good idea because it's possible the folios in the fbatch
can be removed from the deferred_list by migration.

> 
> 
> Regards
> Yin, Fengwei
> 
>>
>>> deferred_split_scan(), only !list_empty(folio->_deferred_list) is 
>>> checked.
>>> The condition can be true if the folio is on split_queue or
>>> local list of deferred_split_scan() with elevated refcount. In that 
>>> case,
>>> the folio cannot be removed from the list (either split_queue or 
>>> local list)
>>> even if split_queue_lock is held, since local list manipulation is 
>>> not under
>>> split_queue_lock. This makes _deferred_list a one-way train to anyone
>>> except deferred_split_scan(), namely folios can only be added into
>>> _deferred_list until they are freed or split by deferred_split_scan().
>>>
>>> Is that intended? If yes, maybe we should document it. If not, using
>>> split_queue_lock to protect local list, or more explicitly 
>>> folio->_deferred_list
>>> might be better?
>>
>> To be fair, the folio can be split by anybody as
>> split_huge_page_to_list_to_order() is careful to only manipulate the
>> deferred list while the refcount is frozen at 0.  I'm still trying to
>> figure out where to document this behaviour of the deferred list that
>> someone (for example, your good self) would actually see it.
>>
>>
>>
>

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9859aa4f7553..c6d4d0cdf4b3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -766,28 +766,6 @@  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
-#ifdef CONFIG_MEMCG
-static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
-{
-	struct mem_cgroup *memcg = folio_memcg(folio);
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
-
-	if (memcg)
-		return &memcg->deferred_split_queue;
-	else
-		return &pgdat->deferred_split_queue;
-}
-#else
-static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
-{
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
-
-	return &pgdat->deferred_split_queue;
-}
-#endif
-
 void folio_prep_large_rmappable(struct folio *folio)
 {
 	if (!folio || !folio_test_large(folio))
diff --git a/mm/internal.h b/mm/internal.h
index d1c69119b24f..8fa36e84463a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1107,6 +1107,29 @@  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmd,
 				   unsigned int flags);
 
+#ifdef CONFIG_MEMCG
+static inline
+struct deferred_split *get_deferred_split_queue(struct folio *folio)
+{
+	struct mem_cgroup *memcg = folio_memcg(folio);
+	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+
+	if (memcg)
+		return &memcg->deferred_split_queue;
+	else
+		return &pgdat->deferred_split_queue;
+}
+#else
+static inline
+struct deferred_split *get_deferred_split_queue(struct folio *folio)
+{
+	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+
+	return &pgdat->deferred_split_queue;
+}
+#endif
+
+
 /*
  * mm/mmap.c
  */
diff --git a/mm/migrate.c b/mm/migrate.c
index 73a052a382f1..591e65658535 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -20,6 +20,7 @@ 
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
+#include <linux/mmzone.h>
 #include <linux/nsproxy.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
@@ -1037,7 +1038,10 @@  static int move_to_new_folio(struct folio *dst, struct folio *src,
 enum {
 	PAGE_WAS_MAPPED = BIT(0),
 	PAGE_WAS_MLOCKED = BIT(1),
-	PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
+	PAGE_WAS_ON_DEFERRED_LIST = BIT(2),
+	PAGE_OLD_STATES = PAGE_WAS_MAPPED |
+			  PAGE_WAS_MLOCKED |
+			  PAGE_WAS_ON_DEFERRED_LIST,
 };
 
 static void __migrate_folio_record(struct folio *dst,
@@ -1168,6 +1172,17 @@  static int migrate_folio_unmap(new_folio_t get_new_folio,
 		folio_lock(src);
 	}
 	locked = true;
+	if (folio_test_large_rmappable(src) &&
+		!list_empty(&src->_deferred_list)) {
+		struct deferred_split *ds_queue = get_deferred_split_queue(src);
+
+		spin_lock(&ds_queue->split_queue_lock);
+		ds_queue->split_queue_len--;
+		list_del_init(&src->_deferred_list);
+		spin_unlock(&ds_queue->split_queue_lock);
+		old_page_state |= PAGE_WAS_ON_DEFERRED_LIST;
+	}
+
 	if (folio_test_mlocked(src))
 		old_page_state |= PAGE_WAS_MLOCKED;
 
@@ -1307,6 +1322,15 @@  static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
 	if (old_page_state & PAGE_WAS_MAPPED)
 		remove_migration_ptes(src, dst, false);
 
+	if (old_page_state & PAGE_WAS_ON_DEFERRED_LIST) {
+		struct deferred_split *ds_queue = get_deferred_split_queue(dst);
+
+		spin_lock(&ds_queue->split_queue_lock);
+		ds_queue->split_queue_len++;
+		list_add(&dst->_deferred_list, &ds_queue->split_queue);
+		spin_unlock(&ds_queue->split_queue_lock);
+	}
+
 out_unlock_both:
 	folio_unlock(dst);
 	set_page_owner_migrate_reason(&dst->page, reason);