
[mm-unstable,v2,1/3] mm/contig_alloc: support __GFP_COMP

Message ID 20240814035451.773331-2-yuzhao@google.com (mailing list archive)
State New
Series mm/hugetlb: alloc/free gigantic folios

Commit Message

Yu Zhao Aug. 14, 2024, 3:54 a.m. UTC
Support __GFP_COMP in alloc_contig_range(). When the flag is set, upon
success the function returns a large folio prepared by
prep_new_page(), rather than a range of order-0 pages prepared by
split_free_pages() (which is renamed from split_map_pages()).

alloc_contig_range() can be used to allocate folios larger than
MAX_PAGE_ORDER, e.g., gigantic hugeTLB folios. So on the free path,
free_one_page() needs to handle that by split_large_buddy().

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/gfp.h |  23 +++++++++
 mm/compaction.c     |  41 ++--------------
 mm/page_alloc.c     | 111 +++++++++++++++++++++++++++++++-------------
 3 files changed, 108 insertions(+), 67 deletions(-)
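
For reference, a minimal caller-side sketch of the helper this series adds to
gfp.h. The function name example_alloc_gigantic and the GFP_KERNEL/nid/nodemask
arguments are illustrative only; the pairing with folio_put() follows the
comment added by the patch below.

static int example_alloc_gigantic(int order, int nid, nodemask_t *nodemask)
{
	struct folio *folio;

	/* __GFP_COMP is required by the v2 helper as posted. */
	folio = folio_alloc_gigantic(order, GFP_KERNEL | __GFP_COMP, nid, nodemask);
	if (!folio)
		return -ENOMEM;

	/* ... use the gigantic folio ... */

	/* Pair with folio_put(), not free_contig_range(). */
	folio_put(folio);
	return 0;
}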

Comments

Yu Liao Aug. 22, 2024, 8:21 a.m. UTC | #1
On 2024/8/14 11:54, Yu Zhao wrote:
> Support __GFP_COMP in alloc_contig_range(). When the flag is set, upon
> success the function returns a large folio prepared by
> prep_new_page(), rather than a range of order-0 pages prepared by
> split_free_pages() (which is renamed from split_map_pages()).
> 
> alloc_contig_range() can be used to allocate folios larger than
> MAX_PAGE_ORDER, e.g., gigantic hugeTLB folios. So on the free path,
> free_one_page() needs to handle that by split_large_buddy().
> 
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> ---
>  include/linux/gfp.h |  23 +++++++++
>  mm/compaction.c     |  41 ++--------------
>  mm/page_alloc.c     | 111 +++++++++++++++++++++++++++++++-------------
>  3 files changed, 108 insertions(+), 67 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index f53f76e0b17e..59266df56aeb 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_
>  #endif
>  void free_contig_range(unsigned long pfn, unsigned long nr_pages);
>  
> +#ifdef CONFIG_CONTIG_ALLOC
> +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
> +							int nid, nodemask_t *node)
> +{
> +	struct page *page;
> +
> +	if (WARN_ON(!order || !(gfp | __GFP_COMP)))

It doesn't seem right, it should be !(gfp & __GFP_COMP).

Best Regards,
Yu
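
To spell the report out: __GFP_COMP is a nonzero bit, so gfp | __GFP_COMP is
nonzero for any gfp and the negated check can never fire; only a bitwise AND
tests whether the caller actually passed the flag. A minimal illustration (the
two helper names are made up for this example):

/* Always false: OR-ing the flag in makes the result nonzero for any gfp. */
static inline bool comp_missing_broken(gfp_t gfp)
{
	return !(gfp | __GFP_COMP);
}

/* True exactly when the caller did not pass __GFP_COMP -- the intended check. */
static inline bool comp_missing_intended(gfp_t gfp)
{
	return !(gfp & __GFP_COMP);
}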
Yu Zhao Aug. 22, 2024, 5:23 p.m. UTC | #2
On Tue, Aug 13, 2024 at 09:54:49PM -0600, Yu Zhao wrote:
> Support __GFP_COMP in alloc_contig_range(). When the flag is set, upon
> success the function returns a large folio prepared by
> prep_new_page(), rather than a range of order-0 pages prepared by
> split_free_pages() (which is renamed from split_map_pages()).
> 
> alloc_contig_range() can be used to allocate folios larger than
> MAX_PAGE_ORDER, e.g., gigantic hugeTLB folios. So on the free path,
> free_one_page() needs to handle that by split_large_buddy().
> 
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> ---
>  include/linux/gfp.h |  23 +++++++++
>  mm/compaction.c     |  41 ++--------------
>  mm/page_alloc.c     | 111 +++++++++++++++++++++++++++++++-------------
>  3 files changed, 108 insertions(+), 67 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index f53f76e0b17e..59266df56aeb 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_
>  #endif
>  void free_contig_range(unsigned long pfn, unsigned long nr_pages);
>  
> +#ifdef CONFIG_CONTIG_ALLOC
> +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
> +							int nid, nodemask_t *node)
> +{
> +	struct page *page;
> +
> +	if (WARN_ON(!order || !(gfp | __GFP_COMP)))

Andrew, could you patch up the line above? This is what it's supposed
to check:

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 59266df56aeb..03ba9563c6db 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -452,7 +452,7 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
 {
 	struct page *page;
 
-	if (WARN_ON(!order || !(gfp | __GFP_COMP)))
+	if (WARN_ON(!order || !(gfp & __GFP_COMP)))
 		return NULL;
 
 	page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
Yu Zhao Aug. 22, 2024, 5:25 p.m. UTC | #3
On Thu, Aug 22, 2024 at 2:21 AM Yu Liao <liaoyu15@huawei.com> wrote:
>
> On 2024/8/14 11:54, Yu Zhao wrote:
> > Support __GFP_COMP in alloc_contig_range(). When the flag is set, upon
> > success the function returns a large folio prepared by
> > prep_new_page(), rather than a range of order-0 pages prepared by
> > split_free_pages() (which is renamed from split_map_pages()).
> >
> > alloc_contig_range() can be used to allocate folios larger than
> > MAX_PAGE_ORDER, e.g., gigantic hugeTLB folios. So on the free path,
> > free_one_page() needs to handle that by split_large_buddy().
> >
> > Signed-off-by: Yu Zhao <yuzhao@google.com>
> > ---
> >  include/linux/gfp.h |  23 +++++++++
> >  mm/compaction.c     |  41 ++--------------
> >  mm/page_alloc.c     | 111 +++++++++++++++++++++++++++++++-------------
> >  3 files changed, 108 insertions(+), 67 deletions(-)
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index f53f76e0b17e..59266df56aeb 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_
> >  #endif
> >  void free_contig_range(unsigned long pfn, unsigned long nr_pages);
> >
> > +#ifdef CONFIG_CONTIG_ALLOC
> > +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
> > +                                                     int nid, nodemask_t *node)
> > +{
> > +     struct page *page;
> > +
> > +     if (WARN_ON(!order || !(gfp | __GFP_COMP)))
>
> It doesn't seem right, it should be !(gfp & __GFP_COMP).

Thanks. I've asked Andrew to patch this up (and another place in mm/cma.c).
Andrew Morton Aug. 22, 2024, 6:36 p.m. UTC | #4
On Thu, 22 Aug 2024 16:21:27 +0800 Yu Liao <liaoyu15@huawei.com> wrote:

> > +#ifdef CONFIG_CONTIG_ALLOC
> > +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
> > +							int nid, nodemask_t *node)
> > +{
> > +	struct page *page;
> > +
> > +	if (WARN_ON(!order || !(gfp | __GFP_COMP)))
> 
> It doesn't seem right, it should be !(gfp & __GFP_COMP).

Yup.  Although I'm wondering why we're checking __GFP_COMP at all? 
It's a gigantic page - we know it's compound, so why require that the
caller tell us something we already know?

--- a/include/linux/gfp.h~mm-contig_alloc-support-__gfp_comp-fix
+++ a/include/linux/gfp.h
@@ -452,7 +452,7 @@ static inline struct folio *folio_alloc_
 {
 	struct page *page;
 
-	if (WARN_ON(!order || !(gfp | __GFP_COMP)))
+	if (WARN_ON(!order || !(gfp & __GFP_COMP)))
 		return NULL;
 
 	page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
Matthew Wilcox Aug. 22, 2024, 6:42 p.m. UTC | #5
On Thu, Aug 22, 2024 at 11:23:04AM -0600, Yu Zhao wrote:
> Andrew, could you patch up the line above? This is what it's supposed
> to check:
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 59266df56aeb..03ba9563c6db 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -452,7 +452,7 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
>  {
>  	struct page *page;
>  
> -	if (WARN_ON(!order || !(gfp | __GFP_COMP)))
> +	if (WARN_ON(!order || !(gfp & __GFP_COMP)))
>  		return NULL;

I don't think we should do this at all.  Just this should be enough:

	gfp |= __GFP_COMP;

same as folio_alloc() (or now folio_alloc_noprof()).
Do we really care if somebody tries to allocate a gigantic page with an
order of 0?  It's weird, but would work, so I don't see the need for the
warning.

>  	page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
>
Yu Zhao Sept. 2, 2024, 5:02 p.m. UTC | #6
On Thu, Aug 22, 2024 at 07:42:20PM +0100, Matthew Wilcox wrote:
> On Thu, Aug 22, 2024 at 11:23:04AM -0600, Yu Zhao wrote:
> > Andrew, could you patch up the line above? This is what it's supposed
> > to check:
> > 
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 59266df56aeb..03ba9563c6db 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -452,7 +452,7 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
> >  {
> >  	struct page *page;
> >  
> > -	if (WARN_ON(!order || !(gfp | __GFP_COMP)))
> > +	if (WARN_ON(!order || !(gfp & __GFP_COMP)))
> >  		return NULL;
> 
> I don't think we should do this at all.  Just this should be enough:
> 
> 	gfp |= __GFP_COMP;
> 
> same as folio_alloc() (or now folio_alloc_noprof()).
> Do we really care if somebody tries to allocate a gigantic page with an
> order of 0?

If this ever happens, I'd bet it's a bug.

> It's weird, but would work, so I don't see the need for the
> warning.

So the warning could catch that, but if we think it's too verbose, then
please fold the following in:

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a951de920e20..b43934d79dd9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -452,10 +452,7 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
 {
 	struct page *page;
 
-	if (WARN_ON(!order || !(gfp & __GFP_COMP)))
-		return NULL;
-
-	page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
+	page = alloc_contig_pages_noprof(1 << order, gfp | __GFP_COMP, nid, node);
 
 	return page ? page_folio(page) : NULL;
 }
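
For readability, this is how the helper reads with the fold-in above applied on
top of Andrew's -fix, reconstructed from the hunks in this thread rather than
taken from a separately posted patch:

#ifdef CONFIG_CONTIG_ALLOC
static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
							int nid, nodemask_t *node)
{
	struct page *page;

	/* Force __GFP_COMP, same as folio_alloc(); no WARN_ON() needed. */
	page = alloc_contig_pages_noprof(1 << order, gfp | __GFP_COMP, nid, node);

	return page ? page_folio(page) : NULL;
}
#endif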
David Hildenbrand Nov. 19, 2024, 3:29 p.m. UTC | #7
> +/* Split a multi-block free page into its individual pageblocks. */
> +static void split_large_buddy(struct zone *zone, struct page *page,
> +			      unsigned long pfn, int order, fpi_t fpi)
> +{
> +	unsigned long end = pfn + (1 << order);
> +
> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
> +	/* Caller removed page from freelist, buddy info cleared! */
> +	VM_WARN_ON_ONCE(PageBuddy(page));
> +
> +	if (order > pageblock_order)
> +		order = pageblock_order;
> +
> +	while (pfn != end) {
> +		int mt = get_pfnblock_migratetype(page, pfn);
> +
> +		__free_one_page(page, pfn, zone, order, mt, fpi);
> +		pfn += 1 << order;
> +		page = pfn_to_page(pfn);
> +	}
> +}

Hi,

stumbling over this while digging through the code ....

> +
>   static void free_one_page(struct zone *zone, struct page *page,
>   			  unsigned long pfn, unsigned int order,
>   			  fpi_t fpi_flags)
>   {
>   	unsigned long flags;
> -	int migratetype;
>   
>   	spin_lock_irqsave(&zone->lock, flags);
> -	migratetype = get_pfnblock_migratetype(page, pfn);
> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);

This change is rather undesired:

via __free_pages_core()->__free_pages_ok() we can easily end up here 
with order=MAX_PAGE_ORDER.

What your new code will do is split this perfectly reasonable 
MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized 
chunks, and let the buddy merging logic undo our unnecessary splitting.

Is there a way to avoid this and just process the whole MAX_PAGE_ORDER 
chunk like we used to?
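
To make the concern concrete, assuming typical x86-64 values (pageblock_order
= 9, MAX_PAGE_ORDER = 10), freeing one max-order page during boot now takes
this detour (an annotated trace of the call chain named above, not code from
the patch):

/*
 *   __free_pages_core()                   order = 10 (MAX_PAGE_ORDER)
 *     __free_pages_ok()
 *       free_one_page()
 *         split_large_buddy()             clamps order to pageblock_order
 *           __free_one_page(..., 9, ...)  first pageblock
 *           __free_one_page(..., 9, ...)  second pageblock; buddy merging
 *                                         re-coalesces both into order 10
 */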
Zi Yan Nov. 19, 2024, 4:12 p.m. UTC | #8
On 19 Nov 2024, at 10:29, David Hildenbrand wrote:

>> +/* Split a multi-block free page into its individual pageblocks. */
>> +static void split_large_buddy(struct zone *zone, struct page *page,
>> +			      unsigned long pfn, int order, fpi_t fpi)
>> +{
>> +	unsigned long end = pfn + (1 << order);
>> +
>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>> +	/* Caller removed page from freelist, buddy info cleared! */
>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>> +
>> +	if (order > pageblock_order)
>> +		order = pageblock_order;
>> +
>> +	while (pfn != end) {
>> +		int mt = get_pfnblock_migratetype(page, pfn);
>> +
>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>> +		pfn += 1 << order;
>> +		page = pfn_to_page(pfn);
>> +	}
>> +}
>
> Hi,
>
> stumbling over this while digging through the code ....
>
>> +
>>   static void free_one_page(struct zone *zone, struct page *page,
>>   			  unsigned long pfn, unsigned int order,
>>   			  fpi_t fpi_flags)
>>   {
>>   	unsigned long flags;
>> -	int migratetype;
>>    	spin_lock_irqsave(&zone->lock, flags);
>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>
> This change is rather undesired:
>
> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.

Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
We do not have PMD level mTHP yet. Any other possible source?

>
> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>
> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?

Probably split_large_buddy() can check the migratetypes of the to-be-freed
page, if order > pageblock_order. If all migratetypes are the same, the page
can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.

Best Regards,
Yan, Zi
David Hildenbrand Nov. 19, 2024, 4:25 p.m. UTC | #9
On 19.11.24 17:12, Zi Yan wrote:
> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
> 
>>> +/* Split a multi-block free page into its individual pageblocks. */
>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>> +{
>>> +	unsigned long end = pfn + (1 << order);
>>> +
>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>> +
>>> +	if (order > pageblock_order)
>>> +		order = pageblock_order;
>>> +
>>> +	while (pfn != end) {
>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>> +
>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>> +		pfn += 1 << order;
>>> +		page = pfn_to_page(pfn);
>>> +	}
>>> +}
>>
>> Hi,
>>
>> stumbling over this while digging through the code ....
>>
>>> +
>>>    static void free_one_page(struct zone *zone, struct page *page,
>>>    			  unsigned long pfn, unsigned int order,
>>>    			  fpi_t fpi_flags)
>>>    {
>>>    	unsigned long flags;
>>> -	int migratetype;
>>>     	spin_lock_irqsave(&zone->lock, flags);
>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>
>> This change is rather undesired:
>>
>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
> 
> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
> We do not have PMD level mTHP yet. Any other possible source?

Memory init during boot. See deferred_free_pages() and 
__free_pages_memory()->memblock_free_pages().

So this is used for exposing most memory during boot to the buddy in 
MAX_PAGE_ORDER granularity.

The other is memory hotplug via generic_online_pages().
David Hildenbrand Nov. 19, 2024, 4:31 p.m. UTC | #10
On 19.11.24 17:12, Zi Yan wrote:
> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
> 
>>> +/* Split a multi-block free page into its individual pageblocks. */
>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>> +{
>>> +	unsigned long end = pfn + (1 << order);
>>> +
>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>> +
>>> +	if (order > pageblock_order)
>>> +		order = pageblock_order;
>>> +
>>> +	while (pfn != end) {
>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>> +
>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>> +		pfn += 1 << order;
>>> +		page = pfn_to_page(pfn);
>>> +	}
>>> +}
>>
>> Hi,
>>
>> stumbling over this while digging through the code ....
>>
>>> +
>>>    static void free_one_page(struct zone *zone, struct page *page,
>>>    			  unsigned long pfn, unsigned int order,
>>>    			  fpi_t fpi_flags)
>>>    {
>>>    	unsigned long flags;
>>> -	int migratetype;
>>>     	spin_lock_irqsave(&zone->lock, flags);
>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>
>> This change is rather undesired:
>>
>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
> 
> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
> We do not have PMD level mTHP yet. Any other possible source?
> 
>>
>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>
>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
> 
> Probably split_large_buddy() can check the migratetypes of the to-be-freed
> page, if order > pageblock_order. If all migratetypes are the same, the page
> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.

Thinking about this: why do we care about the migratetype?

We only have to fallback to pageblocks if any pageblock is 
"MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we 
can just ignore the migratetype (or rather overwrite it)
Zi Yan Nov. 19, 2024, 4:41 p.m. UTC | #11
On 19 Nov 2024, at 11:31, David Hildenbrand wrote:

> On 19.11.24 17:12, Zi Yan wrote:
>> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
>>
>>>> +/* Split a multi-block free page into its individual pageblocks. */
>>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>>> +{
>>>> +	unsigned long end = pfn + (1 << order);
>>>> +
>>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>>> +
>>>> +	if (order > pageblock_order)
>>>> +		order = pageblock_order;
>>>> +
>>>> +	while (pfn != end) {
>>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>>> +
>>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>>> +		pfn += 1 << order;
>>>> +		page = pfn_to_page(pfn);
>>>> +	}
>>>> +}
>>>
>>> Hi,
>>>
>>> stumbling over this while digging through the code ....
>>>
>>>> +
>>>>    static void free_one_page(struct zone *zone, struct page *page,
>>>>    			  unsigned long pfn, unsigned int order,
>>>>    			  fpi_t fpi_flags)
>>>>    {
>>>>    	unsigned long flags;
>>>> -	int migratetype;
>>>>     	spin_lock_irqsave(&zone->lock, flags);
>>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>>
>>> This change is rather undesired:
>>>
>>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
>>
>> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
>> We do not have PMD level mTHP yet. Any other possible source?
>>
>>>
>>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>>
>>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
>>
>> Probably split_large_buddy() can check the migratetypes of the to-be-freed
>> page, if order > pageblock_order. If all migratetypes are the same, the page
>> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.
>
> Thinking about this: why do we care about the migratetype?
>
> We only have to fallback to pageblocks if any pageblock is "MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we can just ignore the migratetype (or rather overwrite it)

There are VM_WARN_ONCEs around *_free_list() operations to make sure
page migratetype matches the migratetype of the list it is on. Ignoring
migratetype would trigger these WARNs. Overwriting it would work but
free page migratetype accounting needs to be taken care of.

An implicit reason is that __free_one_page() does not support >MAX_PAGE_ORDER
and gigantic hugetlb folios are freed via free_one_page(), where
split_large_buddy() is used to work around the limitation.

For the two memory init cases you mentioned in the other email, maybe a new
fpi flag to make free_one_page() use __free_one_page() for them,
since migratetypes should be the same across MAX_PAGE_ORDER range there, right?


Best Regards,
Yan, Zi
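
To make the fpi-flag idea above concrete, here is one possible sketch. The flag
name FPI_UNSPLIT_BUDDY, its bit, and the dispatch are hypothetical and were
never posted; callers setting the flag would have to guarantee a single
migratetype across the whole range:

/* Hypothetical flag for callers that guarantee one migratetype per range. */
#define FPI_UNSPLIT_BUDDY	((__force fpi_t)BIT(3))

static void free_one_page(struct zone *zone, struct page *page,
			  unsigned long pfn, unsigned int order,
			  fpi_t fpi_flags)
{
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	if (fpi_flags & FPI_UNSPLIT_BUDDY) {
		int mt = get_pfnblock_migratetype(page, pfn);

		/* Caller guarantees one migratetype, free the whole chunk. */
		__free_one_page(page, pfn, zone, order, mt, fpi_flags);
	} else {
		split_large_buddy(zone, page, pfn, order, fpi_flags);
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	__count_vm_events(PGFREE, 1 << order);
}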
David Hildenbrand Nov. 19, 2024, 4:49 p.m. UTC | #12
On 19.11.24 17:41, Zi Yan wrote:
> On 19 Nov 2024, at 11:31, David Hildenbrand wrote:
> 
>> On 19.11.24 17:12, Zi Yan wrote:
>>> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
>>>
>>>>> +/* Split a multi-block free page into its individual pageblocks. */
>>>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>>>> +{
>>>>> +	unsigned long end = pfn + (1 << order);
>>>>> +
>>>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>>>> +
>>>>> +	if (order > pageblock_order)
>>>>> +		order = pageblock_order;
>>>>> +
>>>>> +	while (pfn != end) {
>>>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>>>> +
>>>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>>>> +		pfn += 1 << order;
>>>>> +		page = pfn_to_page(pfn);
>>>>> +	}
>>>>> +}
>>>>
>>>> Hi,
>>>>
>>>> stumbling over this while digging through the code ....
>>>>
>>>>> +
>>>>>     static void free_one_page(struct zone *zone, struct page *page,
>>>>>     			  unsigned long pfn, unsigned int order,
>>>>>     			  fpi_t fpi_flags)
>>>>>     {
>>>>>     	unsigned long flags;
>>>>> -	int migratetype;
>>>>>      	spin_lock_irqsave(&zone->lock, flags);
>>>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>>>
>>>> This change is rather undesired:
>>>>
>>>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
>>>
>>> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
>>> We do not have PMD level mTHP yet. Any other possible source?
>>>
>>>>
>>>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>>>
>>>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
>>>
>>> Probably split_large_buddy() can check the migratetypes of the to-be-freed
>>> page, if order > pageblock_order. If all migratetypes are the same, the page
>>> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.
>>
>> Thinking about this: why do we care about the migratetype?
>>
>> We only have to fallback to pageblocks if any pageblock is "MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we can just ignore the migratetype (or rather overwrite it)
> 
> There are VM_WARN_ONCEs around *_free_list() operations to make sure
> page migratetype matches the migratetype of the list it is on. Ignoring
> migratetype would trigger these WARNs. Overwriting it would work but
> free page migratetype accounting needs to be taken care of.
> 
> An implicit reason is that __free_one_page() does not support >MAX_PAGE_ORDER
> and gigantic hugetlb folios are freed via free_one_page(), where
> split_large_buddy() is used to work around the limitation.

Yes, I saw that change. But that can be easily identified by
unlikely(order > MAX_PAGE_ORDER) to handle that rare case in a special way.

> For the two memory init cases you mentioned in the other email, maybe a new
> fpi flag to make free_one_page() use __free_one_page() for them,
> since migratetypes should be the same across MAX_PAGE_ORDER range there, right?

In the context of alloc_frozen_range()/free_frozen_range() I want to 
free MAX_PAGE_ORDER chunks when possible, and not have some odd logic in 
the freeing path undo some of that effort.

In the common case, the pageblocks will all have the same type when
freeing a MAX_PAGE_ORDER page, so we should identify the odd case and 
only special-case that.
David Hildenbrand Nov. 19, 2024, 4:52 p.m. UTC | #13
On 19.11.24 17:49, David Hildenbrand wrote:
> On 19.11.24 17:41, Zi Yan wrote:
>> On 19 Nov 2024, at 11:31, David Hildenbrand wrote:
>>
>>> On 19.11.24 17:12, Zi Yan wrote:
>>>> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
>>>>
>>>>>> +/* Split a multi-block free page into its individual pageblocks. */
>>>>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>>>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>>>>> +{
>>>>>> +	unsigned long end = pfn + (1 << order);
>>>>>> +
>>>>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>>>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>>>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>>>>> +
>>>>>> +	if (order > pageblock_order)
>>>>>> +		order = pageblock_order;
>>>>>> +
>>>>>> +	while (pfn != end) {
>>>>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>>>>> +
>>>>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>>>>> +		pfn += 1 << order;
>>>>>> +		page = pfn_to_page(pfn);
>>>>>> +	}
>>>>>> +}
>>>>>
>>>>> Hi,
>>>>>
>>>>> stumbling over this while digging through the code ....
>>>>>
>>>>>> +
>>>>>>      static void free_one_page(struct zone *zone, struct page *page,
>>>>>>      			  unsigned long pfn, unsigned int order,
>>>>>>      			  fpi_t fpi_flags)
>>>>>>      {
>>>>>>      	unsigned long flags;
>>>>>> -	int migratetype;
>>>>>>       	spin_lock_irqsave(&zone->lock, flags);
>>>>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>>>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>>>>
>>>>> This change is rather undesired:
>>>>>
>>>>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
>>>>
>>>> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
>>>> We do not have PMD level mTHP yet. Any other possible source?
>>>>
>>>>>
>>>>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>>>>
>>>>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
>>>>
>>>> Probably split_large_buddy() can check the migratetypes of the to-be-freed
>>>> page, if order > pageblock_order. If all migratetypes are the same, the page
>>>> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.
>>>
>>> Thinking about this: why do we care about the migratetype?
>>>
>>> We only have to fallback to pageblocks if any pageblock is "MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we can just ignore the migratetype (or rather overwrite it)
>>
>> There are VM_WARN_ONCEs around *_free_list() operations to make sure
>> page migratetype matches the migratetype of the list it is on. Ignoring
>> migratetype would trigger these WARNs. Overwriting it would work but
>> free page migratetype accounting needs to be taken care of.
>>
>> An implicit reason is that __free_one_page() does not support >MAX_PAGE_ORDER
>> and gigantic hugetlb folios are freed via free_one_page(), where
>> split_large_buddy() is used to work around the limitation.
> 
> Yes, I saw that change. But that can be easily identified by
> unlikely(order > MAX_PAGE_ORDER) to handle that rare case in a special way.
> 
>> For the two memory init cases you mentioned in the other email, maybe a new
>> fpi flag to make free_one_page() use __free_one_page() for them,
>> since migratetypes should be the same across MAX_PAGE_ORDER range there, right?
> 
> In the context of alloc_frozen_range()/free_frozen_range() I want to
> free MAX_PAGE_ORDER chunks when possible, and not have some odd logic in
> the freeing path undo some of that effort.

Adding a pointer to that discussion:

https://lkml.kernel.org/r/ZzZdnuZBf-xgiwD2@casper.infradead.org
Zi Yan Nov. 20, 2024, 3:55 p.m. UTC | #14
On 19 Nov 2024, at 11:52, David Hildenbrand wrote:

> On 19.11.24 17:49, David Hildenbrand wrote:
>> On 19.11.24 17:41, Zi Yan wrote:
>>> On 19 Nov 2024, at 11:31, David Hildenbrand wrote:
>>>
>>>> On 19.11.24 17:12, Zi Yan wrote:
>>>>> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
>>>>>
>>>>>>> +/* Split a multi-block free page into its individual pageblocks. */
>>>>>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>>>>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>>>>>> +{
>>>>>>> +	unsigned long end = pfn + (1 << order);
>>>>>>> +
>>>>>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>>>>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>>>>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>>>>>> +
>>>>>>> +	if (order > pageblock_order)
>>>>>>> +		order = pageblock_order;
>>>>>>> +
>>>>>>> +	while (pfn != end) {
>>>>>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>>>>>> +
>>>>>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>>>>>> +		pfn += 1 << order;
>>>>>>> +		page = pfn_to_page(pfn);
>>>>>>> +	}
>>>>>>> +}
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> stumbling over this while digging through the code ....
>>>>>>
>>>>>>> +
>>>>>>>      static void free_one_page(struct zone *zone, struct page *page,
>>>>>>>      			  unsigned long pfn, unsigned int order,
>>>>>>>      			  fpi_t fpi_flags)
>>>>>>>      {
>>>>>>>      	unsigned long flags;
>>>>>>> -	int migratetype;
>>>>>>>       	spin_lock_irqsave(&zone->lock, flags);
>>>>>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>>>>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>>>>>
>>>>>> This change is rather undesired:
>>>>>>
>>>>>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
>>>>>
>>>>> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
>>>>> We do not have PMD level mTHP yet. Any other possible source?
>>>>>
>>>>>>
>>>>>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>>>>>
>>>>>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
>>>>>
>>>>> Probably split_large_buddy() can check the migratetypes of the to-be-freed
>>>>> page, if order > pageblock_order. If all migratetypes are the same, the page
>>>>> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.
>>>>
>>>> Thinking about this: why do we care about the migratetype?
>>>>
>>>> We only have to fallback to pageblocks if any pageblock is "MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we can just ignore the migratetype (or rather overwrite it)
>>>
>>> There are VM_WARN_ONCEs around *_free_list() operations to make sure
>>> page migratetype matches the migratetype of the list it is on. Ignoring
>>> migratetype would trigger these WARNs. Overwriting it would work but
>>> free page migratetype accounting needs to be taken care of.
>>>
>>> An implicit reason is that __free_one_page() does not support >MAX_PAGE_ORDER
>>> and gigantic hugetlb folios are freed via free_one_page(), where
>>> split_large_buddy() is used to work around the limitation.
>>
>> Yes, I saw that change. But that can be easily identified by
>> unlikely(order > MAX_PAGE_ORDER) to handle that rare case in a special way.
>>
>>> For the two memory init cases you mentioned in the other email, maybe a new
>>> fpi flag to make free_one_page() use __free_one_page() for them,
>>> since migratetypes should be the same across MAX_PAGE_ORDER range there, right?
>>
>> In the context of alloc_frozen_range()/free_frozen_range() I want to
>> free MAX_PAGE_ORDER chunks when possible, and not have some odd logic in
>> the freeing path undo some of that effort.
>
> Adding a pointer to that discussion:
>
> https://lkml.kernel.org/r/ZzZdnuZBf-xgiwD2@casper.infradead.org

Thanks.

So you are thinking about something like this:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b6958333054d..3d3341dc1ad1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1254,7 +1254,12 @@ static void free_one_page(struct zone *zone, struct page *page,
 	unsigned long flags;

 	spin_lock_irqsave(&zone->lock, flags);
-	split_large_buddy(zone, page, pfn, order, fpi_flags);
+	if (unlikely(order > MAX_PAGE_ORDER))
+		split_large_buddy(zone, page, pfn, order, fpi_flags);
+	else {
+		int migratetype = get_pfnblock_migratetype(page, pfn);
+		__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+	}
 	spin_unlock_irqrestore(&zone->lock, flags);

 	__count_vm_events(PGFREE, 1 << order);


Is it possible to have a MAX_PAGE_ORDER hugetlb folio? If no, we are good.
If yes, alloc_contig_range() can change the migratetype of one half of that folio
and during the migration phase, that folio will be freed via __free_one_page(),
causing a migratetype mismatch.

Best Regards,
Yan, Zi
David Hildenbrand Nov. 20, 2024, 4:13 p.m. UTC | #15
On 20.11.24 16:55, Zi Yan wrote:
> On 19 Nov 2024, at 11:52, David Hildenbrand wrote:
> 
>> On 19.11.24 17:49, David Hildenbrand wrote:
>>> On 19.11.24 17:41, Zi Yan wrote:
>>>> On 19 Nov 2024, at 11:31, David Hildenbrand wrote:
>>>>
>>>>> On 19.11.24 17:12, Zi Yan wrote:
>>>>>> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
>>>>>>
>>>>>>>> +/* Split a multi-block free page into its individual pageblocks. */
>>>>>>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>>>>>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>>>>>>> +{
>>>>>>>> +	unsigned long end = pfn + (1 << order);
>>>>>>>> +
>>>>>>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>>>>>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>>>>>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>>>>>>> +
>>>>>>>> +	if (order > pageblock_order)
>>>>>>>> +		order = pageblock_order;
>>>>>>>> +
>>>>>>>> +	while (pfn != end) {
>>>>>>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>>>>>>> +
>>>>>>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>>>>>>> +		pfn += 1 << order;
>>>>>>>> +		page = pfn_to_page(pfn);
>>>>>>>> +	}
>>>>>>>> +}
>>>>>>>
>>>>>>> Hi,
>>>>>>>
>>>>>>> stumbling over this while digging through the code ....
>>>>>>>
>>>>>>>> +
>>>>>>>>       static void free_one_page(struct zone *zone, struct page *page,
>>>>>>>>       			  unsigned long pfn, unsigned int order,
>>>>>>>>       			  fpi_t fpi_flags)
>>>>>>>>       {
>>>>>>>>       	unsigned long flags;
>>>>>>>> -	int migratetype;
>>>>>>>>        	spin_lock_irqsave(&zone->lock, flags);
>>>>>>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>>>>>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>>>>>>
>>>>>>> This change is rather undesired:
>>>>>>>
>>>>>>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
>>>>>>
>>>>>> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
>>>>>> We do not have PMD level mTHP yet. Any other possible source?
>>>>>>
>>>>>>>
>>>>>>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>>>>>>
>>>>>>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
>>>>>>
>>>>>> Probably split_large_buddy() can check the migratetypes of the to-be-freed
>>>>>> page, if order > pageblock_order. If all migratetypes are the same, the page
>>>>>> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.
>>>>>
>>>>> Thinking about this: why do we care about the migratetype?
>>>>>
>>>>> We only have to fallback to pageblocks if any pageblock is "MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we can just ignore the migratetype (or rather overwrite it)
>>>>
>>>> There are VM_WARN_ONCEs around *_free_list() operations to make sure
>>>> page migratetype matches the migratetype of the list it is on. Ignoring
>>>> migratetype would trigger these WARNs. Overwriting it would work but
>>>> free page migratetype accounting needs to be taken care of.
>>>>
>>>> An implicit reason is that __free_one_page() does not support >MAX_PAGE_ORDER
>>>> and gigantic hugetlb folios are freed via free_one_page(), where
>>>> split_large_buddy() is used to work around the limitation.
>>>
>>> Yes, I saw that change. But that can be easily identified by
>>> unlikely(order > MAX_PAGE_ORDER) to handle that rare case in a special way.
>>>
>>>> For the two memory init cases you mentioned in the other email, maybe a new
>>>> fpi flag to make free_one_page() use __free_one_page() for them,
>>>> since migratetypes should be the same across MAX_PAGE_ORDER range there, right?
>>>
>>> In the context of alloc_frozen_range()/free_frozen_range() I want to
>>> free MAX_PAGE_ORDER chunks when possible, and not have some odd logic in
>>> the freeing path undo some of that effort.
>>
>> Adding a pointer to that discussion:
>>
>> https://lkml.kernel.org/r/ZzZdnuZBf-xgiwD2@casper.infradead.org
> 
> Thanks.
> 
> So you are thinking about something like this:
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b6958333054d..3d3341dc1ad1 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1254,7 +1254,12 @@ static void free_one_page(struct zone *zone, struct page *page,
>   	unsigned long flags;
> 
>   	spin_lock_irqsave(&zone->lock, flags);
> -	split_large_buddy(zone, page, pfn, order, fpi_flags);
> +	if (unlikely(order > MAX_PAGE_ORDER))
> +		split_large_buddy(zone, page, pfn, order, fpi_flags);
> +	else {
> +		int migratetype = get_pfnblock_migratetype(page, pfn);
> +		__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
> +	}
>   	spin_unlock_irqrestore(&zone->lock, flags);
> 
>   	__count_vm_events(PGFREE, 1 << order);
> 

Something in that direction, but likely something like:

if (likely(order <= pageblock_order)) {
	int migratetype = get_pfnblock_migratetype(page, pfn);
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
} else if (order <= MAX_PAGE_ORDER) {
	/* We might have to split when some pageblocks differ. */
	maybe_split_large_buddy(zone, page, pfn, order, fpi_flags);
} else {
	/* The buddy works in max MAX_PAGE_ORDER chunks. */
	for /* each MAX_PAGE_ORDER chunk */
		maybe_split_large_buddy(zone, page, pfn, MAX_PAGE_ORDER, fpi_flags);
		pfn += MAX_ORDER_NR_PAGES;
		page = ...
	}	
}

Whereby maybe_split_large_buddy would check if it has to do anything with the
pageblocks, or whether it can just call __free_one_page(order). It would only
accept order <=

In the future with free_frozen_pages(), the last else case would vanish.

> 
> Is it possible to have a MAX_PAGE_ORDER hugetlb folio? If no, we are good.

Hm, I wouldn't rule it out. To be future proof, we likely should add a
way to handle it. Shouldn't be too hard and expensive: simply read all
pageblock migratetypes.

I wonder if there are configs (no page isolation?) where we don't have to
check anything.

> If yes, alloc_contig_range() can change the migratetype of one half of that folio
> and during the migration phase, that folio will be freed via __free_one_page(),
> causing a migratetype mismatch.

Can you remind me how that happens?

Likely, we isolate a single pageblock, and shortly after we free a larger
page that covers multiple pageblocks? So this could affect other
alloc_contig_range() users as well, I assume?

Thanks!
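
One possible reading of the maybe_split_large_buddy() idea above, purely as a
sketch of the dispatch being discussed: free the whole chunk in one
__free_one_page() call when every pageblock in it has the same migratetype,
and fall back to the per-pageblock split otherwise.

/* Hypothetical sketch; expects order <= MAX_PAGE_ORDER. */
static void maybe_split_large_buddy(struct zone *zone, struct page *page,
				    unsigned long pfn, int order, fpi_t fpi)
{
	int mt = get_pfnblock_migratetype(page, pfn);
	unsigned long cur, end = pfn + (1 << order);

	if (order > pageblock_order) {
		/* Any pageblock with a different type forces the split path. */
		for (cur = pfn + pageblock_nr_pages; cur != end;
		     cur += pageblock_nr_pages) {
			if (get_pfnblock_migratetype(pfn_to_page(cur), cur) != mt) {
				split_large_buddy(zone, page, pfn, order, fpi);
				return;
			}
		}
	}

	__free_one_page(page, pfn, zone, order, mt, fpi);
}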
Zi Yan Nov. 20, 2024, 4:46 p.m. UTC | #16
On 20 Nov 2024, at 11:13, David Hildenbrand wrote:

> On 20.11.24 16:55, Zi Yan wrote:
>> On 19 Nov 2024, at 11:52, David Hildenbrand wrote:
>>
>>> On 19.11.24 17:49, David Hildenbrand wrote:
>>>> On 19.11.24 17:41, Zi Yan wrote:
>>>>> On 19 Nov 2024, at 11:31, David Hildenbrand wrote:
>>>>>
>>>>>> On 19.11.24 17:12, Zi Yan wrote:
>>>>>>> On 19 Nov 2024, at 10:29, David Hildenbrand wrote:
>>>>>>>
>>>>>>>>> +/* Split a multi-block free page into its individual pageblocks. */
>>>>>>>>> +static void split_large_buddy(struct zone *zone, struct page *page,
>>>>>>>>> +			      unsigned long pfn, int order, fpi_t fpi)
>>>>>>>>> +{
>>>>>>>>> +	unsigned long end = pfn + (1 << order);
>>>>>>>>> +
>>>>>>>>> +	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
>>>>>>>>> +	/* Caller removed page from freelist, buddy info cleared! */
>>>>>>>>> +	VM_WARN_ON_ONCE(PageBuddy(page));
>>>>>>>>> +
>>>>>>>>> +	if (order > pageblock_order)
>>>>>>>>> +		order = pageblock_order;
>>>>>>>>> +
>>>>>>>>> +	while (pfn != end) {
>>>>>>>>> +		int mt = get_pfnblock_migratetype(page, pfn);
>>>>>>>>> +
>>>>>>>>> +		__free_one_page(page, pfn, zone, order, mt, fpi);
>>>>>>>>> +		pfn += 1 << order;
>>>>>>>>> +		page = pfn_to_page(pfn);
>>>>>>>>> +	}
>>>>>>>>> +}
>>>>>>>>
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> stumbling over this while digging through the code ....
>>>>>>>>
>>>>>>>>> +
>>>>>>>>>       static void free_one_page(struct zone *zone, struct page *page,
>>>>>>>>>       			  unsigned long pfn, unsigned int order,
>>>>>>>>>       			  fpi_t fpi_flags)
>>>>>>>>>       {
>>>>>>>>>       	unsigned long flags;
>>>>>>>>> -	int migratetype;
>>>>>>>>>        	spin_lock_irqsave(&zone->lock, flags);
>>>>>>>>> -	migratetype = get_pfnblock_migratetype(page, pfn);
>>>>>>>>> -	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>>>>>>>>
>>>>>>>> This change is rather undesired:
>>>>>>>>
>>>>>>>> via __free_pages_core()->__free_pages_ok() we can easily end up here with order=MAX_PAGE_ORDER.
>>>>>>>
>>>>>>> Do you have a concrete example? PMD THP on x86_64 is pageblock_order.
>>>>>>> We do not have PMD level mTHP yet. Any other possible source?
>>>>>>>
>>>>>>>>
>>>>>>>> What your new code will do is split this perfectly reasonable MAX_PAGE_ORDER chunk via split_large_buddy() into pageblock-sized chunks, and let the buddy merging logic undo our unnecessary splitting.
>>>>>>>>
>>>>>>>> Is there a way to avoid this and just process the whole MAX_PAGE_ORDER chunk like we used to?
>>>>>>>
>>>>>>> Probably split_large_buddy() can check the migratetypes of the to-be-freed
>>>>>>> page, if order > pageblock_order. If all migratetypes are the same, the page
>>>>>>> can be freed at MAX_PAGE_ORDER, otherwise pageblock_order.
>>>>>>
>>>>>> Thinking about this: why do we care about the migratetype?
>>>>>>
>>>>>> We only have to fallback to pageblocks if any pageblock is "MIGRATE_ISOLATE" (and maybe MIGRATE_CMA), but not all. Otherwise, we can just ignore the migratetype (or rather overwrite it)
>>>>>
>>>>> There are VM_WARN_ONCEs around *_free_list() operations to make sure
>>>>> page migratetype matches the migratetype of the list it is on. Ignoring
>>>>> migratetype would trigger these WARNs. Overwriting it would work but
>>>>> free page migratetype accounting needs to be taken care of.
>>>>>
>>>>> An implicit reason is that __free_one_page() does not support >MAX_PAGE_ORDER
>>>>> and gigantic hugetlb folios are freed via free_one_page(), where
>>>>> split_large_buddy() is used to work around the limitation.
>>>>
>>>> Yes, I saw that change. But that can be easily identified by
>>>> unlikely(order > MAX_PAGE_ORDER) to handle that rare case in a special way.
>>>>
>>>>> For the two memory init cases you mentioned in the other email, maybe a new
>>>>> fpi flag to make free_one_page() use __free_one_page() for them,
>>>>> since migratetypes should be the same across MAX_PAGE_ORDER range there, right?
>>>>
>>>> In the context of alloc_frozen_range()/free_frozen_range() I want to
>>>> free MAX_PAGE_ORDER chunks when possible, and not have some odd logic in
>>>> the freeing path undo some of that effort.
>>>
>>> Adding a pointer to that discussion:
>>>
>>> https://lkml.kernel.org/r/ZzZdnuZBf-xgiwD2@casper.infradead.org
>>
>> Thanks.
>>
>> So you are thinking about something like this:
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index b6958333054d..3d3341dc1ad1 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1254,7 +1254,12 @@ static void free_one_page(struct zone *zone, struct page *page,
>>   	unsigned long flags;
>>
>>   	spin_lock_irqsave(&zone->lock, flags);
>> -	split_large_buddy(zone, page, pfn, order, fpi_flags);
>> +	if (unlikely(order > MAX_PAGE_ORDER))
>> +		split_large_buddy(zone, page, pfn, order, fpi_flags);
>> +	else {
>> +		int migratetype = get_pfnblock_migratetype(page, pfn);
>> +		__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
>> +	}
>>   	spin_unlock_irqrestore(&zone->lock, flags);
>>
>>   	__count_vm_events(PGFREE, 1 << order);
>>
>
> Something in that direction, but likely something like:
>
> if (likely(order <= pageblock_order)) {
> 	int migratetype = get_pfnblock_migratetype(page, pfn);
> 	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
> } else if (order <= MAX_PAGE_ORDER) {
> 	/* We might have to split when some pageblocks differ. */
> 	maybe_split_large_buddy(zone, page, pfn, order, fpi_flags);
> } else {
> 	/* The buddy works in max MAX_PAGE_ORDER chunks. */
> 	for /* each MAX_PAGE_ORDER chunk */
> 		maybe_split_large_buddy(zone, page, pfn, MAX_PAGE_ORDER, fpi_flags);
> 		pfn += MAX_ORDER_NR_PAGES;
> 		page = ...
> 	}	
> }
>
> Whereby maybe_split_large_buddy would check if it has to do anything with the
> pageblocks, or whether it can just call __free_one_page(order). It would only
> accept order <=
>
> In the future with free_frozen_pages(), the last else case would vanish.
>
>>
>> Is it possible to have a MAX_PAGE_ORDER hugetlb folio? If no, we are good.
>
> Hm, I wouldn't rule it out. To be future proof, we likely should add a
> way to handle it. Shouldn't be too hard and expensive: simply read all
> pageblock migratetypes.
>
> I wonder if there are configs (no page isolation?) where we don't have to
> check anything.

CONFIG_CONTIG_ALLOC? CONFIG_MEMORY_ISOLATION seems too much.

>
>> If yes, alloc_contig_range() can change the migratetype of one half of that folio
>> and during the migration phase, that folio will be freed via __free_one_page(),
>> causing a migratetype mismatch.
>
> Can you remind me how that happens?

There is a MAX_PAGE_ORDER hugetlb folio across two pageblocks and
alloc_contig_range() wants to get a range that overlaps with only the first
pageblock of that hugetlb folio. All pageblocks in the range are marked as
isolated first during start_isolate_page_range(), then
__alloc_contig_migrate_range() isolates and migrates the hugetlb folio.
During migration, the original hugetlb folio is freed, and since it is
MAX_PAGE_ORDER, based on my code above it is freed via a single
__free_one_page() call even though its two pageblocks have different
migratetypes.

But with your maybe_split_large_buddy(), it would work.

>
> Likely, we isolate a single pageblock, and shortly after we free a larger
> page that covers multiple pageblocks? So this could affect other
> alloc_contig_range() users as well, I assume?



Best Regards,
Yan, Zi
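
The scenario above, spelled out with illustrative numbers (pageblock_order = 9,
MAX_PAGE_ORDER = 10, so a pageblock is 0x200 pfns and an order-10 folio is
0x400 pfns):

/*
 *   order-10 hugetlb folio:  pfn [0x1000, 0x1400)  -> pageblocks A and B
 *   alloc_contig_range():    pfn [0x0c00, 0x1200)  -> overlaps only A
 *
 *   start_isolate_page_range() sets A to MIGRATE_ISOLATE; B keeps its type.
 *   When migration frees the source folio, the free path sees order = 10
 *   with two different pageblock migratetypes, so a single __free_one_page()
 *   call with one migratetype would be wrong; a per-pageblock fallback
 *   (split_large_buddy() today, maybe_split_large_buddy() in the sketch
 *   above) handles it.
 */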

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f53f76e0b17e..59266df56aeb 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -446,4 +446,27 @@  extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_
 #endif
 void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
+#ifdef CONFIG_CONTIG_ALLOC
+static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
+							int nid, nodemask_t *node)
+{
+	struct page *page;
+
+	if (WARN_ON(!order || !(gfp | __GFP_COMP)))
+		return NULL;
+
+	page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
+
+	return page ? page_folio(page) : NULL;
+}
+#else
+static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
+							int nid, nodemask_t *node)
+{
+	return NULL;
+}
+#endif
+/* This should be paired with folio_put() rather than free_contig_range(). */
+#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
+
 #endif /* __LINUX_GFP_H */
diff --git a/mm/compaction.c b/mm/compaction.c
index eb95e9b435d0..d1041fbce679 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -86,33 +86,6 @@  static struct page *mark_allocated_noprof(struct page *page, unsigned int order,
 }
 #define mark_allocated(...)	alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
 
-static void split_map_pages(struct list_head *freepages)
-{
-	unsigned int i, order;
-	struct page *page, *next;
-	LIST_HEAD(tmp_list);
-
-	for (order = 0; order < NR_PAGE_ORDERS; order++) {
-		list_for_each_entry_safe(page, next, &freepages[order], lru) {
-			unsigned int nr_pages;
-
-			list_del(&page->lru);
-
-			nr_pages = 1 << order;
-
-			mark_allocated(page, order, __GFP_MOVABLE);
-			if (order)
-				split_page(page, order);
-
-			for (i = 0; i < nr_pages; i++) {
-				list_add(&page->lru, &tmp_list);
-				page++;
-			}
-		}
-		list_splice_init(&tmp_list, &freepages[0]);
-	}
-}
-
 static unsigned long release_free_list(struct list_head *freepages)
 {
 	int order;
@@ -742,11 +715,11 @@  static unsigned long isolate_freepages_block(struct compact_control *cc,
  *
  * Non-free pages, invalid PFNs, or zone boundaries within the
  * [start_pfn, end_pfn) range are considered errors, cause function to
- * undo its actions and return zero.
+ * undo its actions and return zero. cc->freepages[] are empty.
  *
  * Otherwise, function returns one-past-the-last PFN of isolated page
  * (which may be greater then end_pfn if end fell in a middle of
- * a free page).
+ * a free page). cc->freepages[] contain free pages isolated.
  */
 unsigned long
 isolate_freepages_range(struct compact_control *cc,
@@ -754,10 +727,9 @@  isolate_freepages_range(struct compact_control *cc,
 {
 	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
 	int order;
-	struct list_head tmp_freepages[NR_PAGE_ORDERS];
 
 	for (order = 0; order < NR_PAGE_ORDERS; order++)
-		INIT_LIST_HEAD(&tmp_freepages[order]);
+		INIT_LIST_HEAD(&cc->freepages[order]);
 
 	pfn = start_pfn;
 	block_start_pfn = pageblock_start_pfn(pfn);
@@ -788,7 +760,7 @@  isolate_freepages_range(struct compact_control *cc,
 			break;
 
 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
-					block_end_pfn, tmp_freepages, 0, true);
+					block_end_pfn, cc->freepages, 0, true);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -807,13 +779,10 @@  isolate_freepages_range(struct compact_control *cc,
 
 	if (pfn < end_pfn) {
 		/* Loop terminated early, cleanup. */
-		release_free_list(tmp_freepages);
+		release_free_list(cc->freepages);
 		return 0;
 	}
 
-	/* __isolate_free_page() does not map the pages */
-	split_map_pages(tmp_freepages);
-
 	/* We don't use freelists for anything. */
 	return pfn;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5841bbea482a..0a43e4ea29e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1197,16 +1197,36 @@  static void free_pcppages_bulk(struct zone *zone, int count,
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
+/* Split a multi-block free page into its individual pageblocks. */
+static void split_large_buddy(struct zone *zone, struct page *page,
+			      unsigned long pfn, int order, fpi_t fpi)
+{
+	unsigned long end = pfn + (1 << order);
+
+	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
+	/* Caller removed page from freelist, buddy info cleared! */
+	VM_WARN_ON_ONCE(PageBuddy(page));
+
+	if (order > pageblock_order)
+		order = pageblock_order;
+
+	while (pfn != end) {
+		int mt = get_pfnblock_migratetype(page, pfn);
+
+		__free_one_page(page, pfn, zone, order, mt, fpi);
+		pfn += 1 << order;
+		page = pfn_to_page(pfn);
+	}
+}
+
 static void free_one_page(struct zone *zone, struct page *page,
 			  unsigned long pfn, unsigned int order,
 			  fpi_t fpi_flags)
 {
 	unsigned long flags;
-	int migratetype;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	migratetype = get_pfnblock_migratetype(page, pfn);
-	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+	split_large_buddy(zone, page, pfn, order, fpi_flags);
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
@@ -1698,27 +1718,6 @@  static unsigned long find_large_buddy(unsigned long start_pfn)
 	return start_pfn;
 }
 
-/* Split a multi-block free page into its individual pageblocks */
-static void split_large_buddy(struct zone *zone, struct page *page,
-			      unsigned long pfn, int order)
-{
-	unsigned long end_pfn = pfn + (1 << order);
-
-	VM_WARN_ON_ONCE(order <= pageblock_order);
-	VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1));
-
-	/* Caller removed page from freelist, buddy info cleared! */
-	VM_WARN_ON_ONCE(PageBuddy(page));
-
-	while (pfn != end_pfn) {
-		int mt = get_pfnblock_migratetype(page, pfn);
-
-		__free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE);
-		pfn += pageblock_nr_pages;
-		page = pfn_to_page(pfn);
-	}
-}
-
 /**
  * move_freepages_block_isolate - move free pages in block for page isolation
  * @zone: the zone
@@ -1759,7 +1758,7 @@  bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 		del_page_from_free_list(buddy, zone, order,
 					get_pfnblock_migratetype(buddy, pfn));
 		set_pageblock_migratetype(page, migratetype);
-		split_large_buddy(zone, buddy, pfn, order);
+		split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
 		return true;
 	}
 
@@ -1770,7 +1769,7 @@  bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 		del_page_from_free_list(page, zone, order,
 					get_pfnblock_migratetype(page, pfn));
 		set_pageblock_migratetype(page, migratetype);
-		split_large_buddy(zone, page, pfn, order);
+		split_large_buddy(zone, page, pfn, order, FPI_NONE);
 		return true;
 	}
 move:
@@ -6440,6 +6439,31 @@  int __alloc_contig_migrate_range(struct compact_control *cc,
 	return (ret < 0) ? ret : 0;
 }
 
+static void split_free_pages(struct list_head *list)
+{
+	int order;
+
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
+		struct page *page, *next;
+		int nr_pages = 1 << order;
+
+		list_for_each_entry_safe(page, next, &list[order], lru) {
+			int i;
+
+			post_alloc_hook(page, order, __GFP_MOVABLE);
+			if (!order)
+				continue;
+
+			split_page(page, order);
+
+			/* Add all subpages to the order-0 head, in sequence. */
+			list_del(&page->lru);
+			for (i = 0; i < nr_pages; i++)
+				list_add_tail(&page[i].lru, &list[0]);
+		}
+	}
+}
+
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
@@ -6552,12 +6576,25 @@  int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 		goto done;
 	}
 
-	/* Free head and tail (if any) */
-	if (start != outer_start)
-		free_contig_range(outer_start, start - outer_start);
-	if (end != outer_end)
-		free_contig_range(end, outer_end - end);
+	if (!(gfp_mask & __GFP_COMP)) {
+		split_free_pages(cc.freepages);
 
+		/* Free head and tail (if any) */
+		if (start != outer_start)
+			free_contig_range(outer_start, start - outer_start);
+		if (end != outer_end)
+			free_contig_range(end, outer_end - end);
+	} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
+		struct page *head = pfn_to_page(start);
+		int order = ilog2(end - start);
+
+		check_new_pages(head, order);
+		prep_new_page(head, order, gfp_mask, 0);
+	} else {
+		ret = -EINVAL;
+		WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
+		     start, end, outer_start, outer_end);
+	}
 done:
 	undo_isolate_page_range(start, end, migratetype);
 	return ret;
@@ -6666,6 +6703,18 @@  struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
 void free_contig_range(unsigned long pfn, unsigned long nr_pages)
 {
 	unsigned long count = 0;
+	struct folio *folio = pfn_folio(pfn);
+
+	if (folio_test_large(folio)) {
+		int expected = folio_nr_pages(folio);
+
+		if (nr_pages == expected)
+			folio_put(folio);
+		else
+			WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
+			     pfn, nr_pages, expected);
+		return;
+	}
 
 	for (; nr_pages--; pfn++) {
 		struct page *page = pfn_to_page(pfn);