diff mbox series

[net-next,v13,08/14] mm: page_frag: some minor refactoring before adding new API

Message ID 20240808123714.462740-9-linyunsheng@huawei.com (mailing list archive)
State Deferred
Delegated to: Netdev Maintainers
Headers show
Series Replace page_frag with page_frag_cache for sk_page_frag() | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 72 this patch: 72
netdev/build_tools success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers warning 1 maintainers not CCed: chuck.lever@oracle.com
netdev/build_clang success Errors and warnings before: 131 this patch: 131
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4982 this patch: 4982
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-08-09--09-00 (tests: 705)

Commit Message

Yunsheng Lin Aug. 8, 2024, 12:37 p.m. UTC
Refactor common code from __page_frag_alloc_va_align()
into __page_frag_cache_reload(), so that the new API can
make use of it.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |   2 +-
 mm/page_frag_cache.c            | 138 ++++++++++++++++++--------------
 2 files changed, 81 insertions(+), 59 deletions(-)

Comments

Alexander Duyck Aug. 14, 2024, 5:54 p.m. UTC | #1
On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> Refactor common code from __page_frag_alloc_va_align()
> into __page_frag_cache_reload(), so that the new API can
> make use of it.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h |   2 +-
>  mm/page_frag_cache.c            | 138 ++++++++++++++++++--------------
>  2 files changed, 81 insertions(+), 59 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 4ce924eaf1b1..0abffdd10a1c 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -52,7 +52,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>  
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>  {
> -	nc->encoded_va = 0;
> +	memset(nc, 0, sizeof(*nc));
>  }
>  

Still not a fan of this. Just setting encoded_va to 0 should be enough
as the other fields will automatically be overwritten when the new page
is allocated.

Relying on memset is problematic at best since you then introduce the
potential for issues where remaining somehow gets corrupted but
encoded_va/page is 0. I would rather have both of these checked
as a part of allocation than just assuming it is valid if
remaining is set.

I would prefer to keep the check for a non-0 encoded_page value and
then check remaining, rather than relying on remaining alone, as that creates a
single point of failure. With that we can safely tear away a page and
the next caller trying to allocate will populate a new page and the
associated fields.
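
Roughly the shape I have in mind, as an untested sketch built from the
helpers in this patch (bias/remaining would still be reset afterwards
as today):

	if (unlikely(!encoded_va)) {
		/* no page yet: refill populates encoded_va and friends */
		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
			return NULL;
	} else if (unlikely(nc->remaining < fragsz)) {
		/* page exhausted: try to reuse it, else grab a new one */
		if (!__page_frag_cache_reuse(encoded_va, nc->pagecnt_bias) &&
		    unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
			return NULL;
	}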

>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 2544b292375a..4e6b1c4684f0 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -19,8 +19,27 @@
>  #include <linux/page_frag_cache.h>
>  #include "internal.h"
>  
> -static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> -					     gfp_t gfp_mask)
> +static bool __page_frag_cache_reuse(unsigned long encoded_va,
> +				    unsigned int pagecnt_bias)
> +{
> +	struct page *page;
> +
> +	page = virt_to_page((void *)encoded_va);
> +	if (!page_ref_sub_and_test(page, pagecnt_bias))
> +		return false;
> +
> +	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +		free_unref_page(page, encoded_page_order(encoded_va));
> +		return false;
> +	}
> +
> +	/* OK, page count is 0, we can safely set it */
> +	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +	return true;
> +}
> +
> +static bool __page_frag_cache_refill(struct page_frag_cache *nc,
> +				     gfp_t gfp_mask)
>  {
>  	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
>  	struct page *page = NULL;
> @@ -35,8 +54,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	if (unlikely(!page)) {
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>  		if (unlikely(!page)) {
> -			nc->encoded_va = 0;
> -			return NULL;
> +			memset(nc, 0, sizeof(*nc));
> +			return false;
>  		}
>  
>  		order = 0;
> @@ -45,7 +64,33 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  	nc->encoded_va = encode_aligned_va(page_address(page), order,
>  					   page_is_pfmemalloc(page));
>  
> -	return page;
> +	/* Even if we own the page, we do not use atomic_set().
> +	 * This would break get_page_unless_zero() users.
> +	 */
> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> +
> +	return true;
> +}
> +
> +/* Reload cache by reusing the old cache if it is possible, or
> + * refilling from the page allocator.
> + */
> +static bool __page_frag_cache_reload(struct page_frag_cache *nc,
> +				     gfp_t gfp_mask)
> +{
> +	if (likely(nc->encoded_va)) {
> +		if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
> +			goto out;
> +	}
> +
> +	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> +		return false;
> +
> +out:
> +	/* reset page count bias and remaining to start of new frag */
> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> +	nc->remaining = page_frag_cache_page_size(nc->encoded_va);

One thought I am having is that it might be better to have the
pagecnt_bias get set at the same time as the page_ref_add or the
set_page_count call. In addition setting the remaining value at the
same time probably would make sense as in the refill case you can make
use of the "order" value directly instead of having to write/read it
out of the encoded va/page.

With that we could simplify this function and get something closer to
what we had for the original alloc_va_align code.
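
Untested sketch of what I mean, showing just the tail of
__page_frag_cache_refill() doing the whole page init while "order" is
still at hand:

	nc->encoded_va = encode_aligned_va(page_address(page), order,
					   page_is_pfmemalloc(page));

	/* Even if we own the page, we do not use atomic_set().
	 * This would break get_page_unless_zero() users.
	 */
	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);

	/* reset page count bias and remaining here, using "order"
	 * directly instead of re-reading it from encoded_va later
	 */
	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
	nc->remaining = PAGE_SIZE << order;

	return true;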

> +	return true;
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc)
> @@ -55,7 +100,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
>  
>  	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
>  				nc->pagecnt_bias);
> -	nc->encoded_va = 0;
> +	memset(nc, 0, sizeof(*nc));
>  }
>  EXPORT_SYMBOL(page_frag_cache_drain);
>  
> @@ -73,67 +118,44 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int align_mask)
>  {
>  	unsigned long encoded_va = nc->encoded_va;
> -	unsigned int size, remaining;
> -	struct page *page;
> -
> -	if (unlikely(!encoded_va)) {

We should still be checking this before we even touch remaining.
Otherwise we greatly increase the risk of providing a bad virtual
address and greatly decrease the likelihood of us catching
potential errors gracefully.

> -refill:
> -		page = __page_frag_cache_refill(nc, gfp_mask);
> -		if (!page)
> -			return NULL;
> -
> -		encoded_va = nc->encoded_va;
> -		size = page_frag_cache_page_size(encoded_va);
> -
> -		/* Even if we own the page, we do not use atomic_set().
> -		 * This would break get_page_unless_zero() users.
> -		 */
> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> -
> -		/* reset page count bias and remaining to start of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->remaining = size;

With my suggested change above you could essentially just drop the
block starting from the comment and this function wouldn't need to
change as much as it does.

> -	} else {
> -		size = page_frag_cache_page_size(encoded_va);
> -	}
> +	unsigned int remaining;
>  
>  	remaining = nc->remaining & align_mask;
> -	if (unlikely(remaining < fragsz)) {
> -		if (unlikely(fragsz > PAGE_SIZE)) {
> -			/*
> -			 * The caller is trying to allocate a fragment
> -			 * with fragsz > PAGE_SIZE but the cache isn't big
> -			 * enough to satisfy the request, this may
> -			 * happen in low memory conditions.
> -			 * We don't release the cache page because
> -			 * it could make memory pressure worse
> -			 * so we simply return NULL here.
> -			 */
> -			return NULL;
> -		}
> -
> -		page = virt_to_page((void *)encoded_va);
>  
> -		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> -			goto refill;
> -
> -		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> -			free_unref_page(page, encoded_page_order(encoded_va));
> -			goto refill;
> -		}

Likewise for this block here. We can essentially just make use of the
__page_frag_cache_reuse function without the need to do a complete
rework of the code.

> +	/* As we have ensured that remaining is zero when initializing and draining
> +	 * the old cache, checking 'remaining >= fragsz' is enough to indicate there
> +	 * is enough space available for the new fragment allocation.
> +	 */
> +	if (likely(remaining >= fragsz)) {
> +		nc->pagecnt_bias--;
> +		nc->remaining = remaining - fragsz;
>  
> -		/* OK, page count is 0, we can safely set it */
> -		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> +		return encoded_page_address(encoded_va) +
> +			(page_frag_cache_page_size(encoded_va) - remaining);
> +	}
>  
> -		/* reset page count bias and remaining to start of new frag */
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		remaining = size;
> +	if (unlikely(fragsz > PAGE_SIZE)) {
> +		/*
> +		 * The caller is trying to allocate a fragment with
> +		 * fragsz > PAGE_SIZE but the cache isn't big enough to satisfy
> +		 * the request, this may happen in low memory conditions. We don't
> +		 * release the cache page because it could make memory pressure
> +		 * worse so we simply return NULL here.
> +		 */
> +		return NULL;
>  	}
>  
> +	if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
> +		return NULL;
> +
> +	/* As we are allocating fragments from the cache in a count-up way, the
> +	 * offset of the fragment allocated from the just-reloaded cache is zero,
> +	 * so aligning remaining and calculating the offset are not needed.
> +	 */
>  	nc->pagecnt_bias--;
> -	nc->remaining = remaining - fragsz;
> +	nc->remaining -= fragsz;
>  
> -	return encoded_page_address(encoded_va) + (size - remaining);
> +	return encoded_page_address(nc->encoded_va);
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_va_align);
>
Yunsheng Lin Aug. 15, 2024, 3:04 a.m. UTC | #2
On 2024/8/15 1:54, Alexander H Duyck wrote:
> On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
>> Refactor common code from __page_frag_alloc_va_align()
>> into __page_frag_cache_reload(), so that the new API can
>> make use of it.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  include/linux/page_frag_cache.h |   2 +-
>>  mm/page_frag_cache.c            | 138 ++++++++++++++++++--------------
>>  2 files changed, 81 insertions(+), 59 deletions(-)
>>
>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>> index 4ce924eaf1b1..0abffdd10a1c 100644
>> --- a/include/linux/page_frag_cache.h
>> +++ b/include/linux/page_frag_cache.h
>> @@ -52,7 +52,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>>  
>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>  {
>> -	nc->encoded_va = 0;
>> +	memset(nc, 0, sizeof(*nc));
>>  }
>>  
> 
> Still not a fan of this. Just setting encoded_va to 0 should be enough
> as the other fields will automatically be overwritten when the new page
> is allocated.
> 
> Relying on memset is problematic at best since you then introduce the
> potential for issues where remaining somehow gets corrupted but
> encoded_va/page is 0. I would rather have both of these checked
> as a part of allocation than just assuming it is valid if
> remaining is set.

Does adding something like VM_BUG_ON(!nc->encoded_va && nc->remaining) to
catch the above problem address your above concern?
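
I.e. something like the below (untested) at the start of
__page_frag_alloc_va_align():

	/* a non-zero remaining must imply a valid encoded_va */
	VM_BUG_ON(!nc->encoded_va && nc->remaining);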

> 
> I would prefer to keep the check for a non-0 encoded_page value and
> then check remaining, rather than relying on remaining alone, as that creates a
> single point of failure. With that we can safely tear away a page and
> the next caller trying to allocate will populate a new page and the
> associated fields.

As mentioned before, the memset() is used mainly to:
1. avoid a check in the fast path.
2. avoid duplicating the checking pattern you mentioned above for the
   new API.
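
For reference, the single check this enables in the fast path is just
the below, taken from this patch:

	remaining = nc->remaining & align_mask;
	if (likely(remaining >= fragsz)) {
		nc->pagecnt_bias--;
		nc->remaining = remaining - fragsz;

		return encoded_page_address(encoded_va) +
			(page_frag_cache_page_size(encoded_va) - remaining);
	}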

> 
>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>> index 2544b292375a..4e6b1c4684f0 100644
>> --- a/mm/page_frag_cache.c
>> +++ b/mm/page_frag_cache.c
>> @@ -19,8 +19,27 @@
>>  #include <linux/page_frag_cache.h>
>>  #include "internal.h"
>>  

...

>> +
>> +/* Reload cache by reusing the old cache if it is possible, or
>> + * refilling from the page allocator.
>> + */
>> +static bool __page_frag_cache_reload(struct page_frag_cache *nc,
>> +				     gfp_t gfp_mask)
>> +{
>> +	if (likely(nc->encoded_va)) {
>> +		if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
>> +			goto out;
>> +	}
>> +
>> +	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>> +		return false;
>> +
>> +out:
>> +	/* reset page count bias and remaining to start of new frag */
>> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> +	nc->remaining = page_frag_cache_page_size(nc->encoded_va);
> 
> One thought I am having is that it might be better to have the
> pagecnt_bias get set at the same time as the page_ref_add or the
> set_page_count call. In addition setting the remaining value at the
> same time probably would make sense as in the refill case you can make
> use of the "order" value directly instead of having to write/read it
> out of the encoded va/page.

Probably, but there is always a tradeoff to make between avoiding code
duplication and avoiding reading the order. I am not sure it matters
either way, so I would rather keep the above pattern if there
is no obvious benefit to the other pattern.

> 
> With that we could simplify this function and get something closer to
> what we had for the original alloc_va_align code.
> 
>> +	return true;
>>  }
>>  
>>  void page_frag_cache_drain(struct page_frag_cache *nc)
>> @@ -55,7 +100,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
>>  
>>  	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
>>  				nc->pagecnt_bias);
>> -	nc->encoded_va = 0;
>> +	memset(nc, 0, sizeof(*nc));
>>  }
>>  EXPORT_SYMBOL(page_frag_cache_drain);
>>  
>> @@ -73,67 +118,44 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>>  				 unsigned int align_mask)
>>  {
>>  	unsigned long encoded_va = nc->encoded_va;
>> -	unsigned int size, remaining;
>> -	struct page *page;
>> -
>> -	if (unlikely(!encoded_va)) {
> 
> We should still be checking this before we even touch remaining.
> Otherwise we greatly increase the risk of providing a bad virtual
> address and greatly decrease the likelihood of us catching
> potential errors gracefully.
> 
>> -refill:
>> -		page = __page_frag_cache_refill(nc, gfp_mask);
>> -		if (!page)
>> -			return NULL;
>> -
>> -		encoded_va = nc->encoded_va;
>> -		size = page_frag_cache_page_size(encoded_va);
>> -
>> -		/* Even if we own the page, we do not use atomic_set().
>> -		 * This would break get_page_unless_zero() users.
>> -		 */
>> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>> -
>> -		/* reset page count bias and remaining to start of new frag */
>> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>> -		nc->remaining = size;
> 
> With my suggested change above you could essentially just drop the
> block starting from the comment and this function wouldn't need to
> change as much as it does.

It seems you are still suggesting that the new API should also duplicate
the old checking pattern in __page_frag_alloc_va_align()?

I would rather avoid the above if something like VM_BUG_ON() can address
your above concern.
Alexander Duyck Aug. 15, 2024, 3:09 p.m. UTC | #3
On Wed, Aug 14, 2024 at 8:04 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/8/15 1:54, Alexander H Duyck wrote:
> > On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> >> Refactor common code from __page_frag_alloc_va_align()
> >> into __page_frag_cache_reload(), so that the new API can
> >> make use of it.
> >>
> >> CC: Alexander Duyck <alexander.duyck@gmail.com>
> >> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> >> ---
> >>  include/linux/page_frag_cache.h |   2 +-
> >>  mm/page_frag_cache.c            | 138 ++++++++++++++++++--------------
> >>  2 files changed, 81 insertions(+), 59 deletions(-)
> >>
> >> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> >> index 4ce924eaf1b1..0abffdd10a1c 100644
> >> --- a/include/linux/page_frag_cache.h
> >> +++ b/include/linux/page_frag_cache.h
> >> @@ -52,7 +52,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
> >>
> >>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
> >>  {
> >> -    nc->encoded_va = 0;
> >> +    memset(nc, 0, sizeof(*nc));
> >>  }
> >>
> >
> > Still not a fan of this. Just setting encoded_va to 0 should be enough
> > as the other fields will automatically be overwritten when the new page
> > is allocated.
> >
> > Relying on memset is problematic at best since you then introduce the
> > potential for issues where remaining somehow gets corrupted but
> > encoded_va/page is 0. I would rather have both of these checked
> > as a part of allocation than just assuming it is valid if
> > remaining is set.
>
> Does adding something like VM_BUG_ON(!nc->encoded_va && nc->remaining) to
> catch the above problem address your above concern?

Not really. I would prefer to just retain the existing behavior.

> >
> > I would prefer to keep the check for a non-0 encoded_page value and
> > then check remaining, rather than relying on remaining alone, as that creates a
> > single point of failure. With that we can safely tear away a page and
> > the next caller trying to allocate will populate a new page and the
> > associated fields.
>
> As mentioned before, the memset() is used mainly to:
> 1. avoid a check in the fast path.
> 2. avoid duplicating the checking pattern you mentioned above for the
>    new API.

I'm not a fan of the new code flow after getting rid of the checking
in the fast path. The code is becoming a tangled mess of spaghetti
code in my opinion. Arguably the patches don't help, as you are taking
huge steps in many of them, which makes the series hard to read.
In addition, the code becomes more obfuscated with each patch, which is
one of the reasons why I would have preferred to see this set broken
into a couple of sets so we can give it some time for any of the kinks to
get worked out.

> >
> >>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> >> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> >> index 2544b292375a..4e6b1c4684f0 100644
> >> --- a/mm/page_frag_cache.c
> >> +++ b/mm/page_frag_cache.c
> >> @@ -19,8 +19,27 @@
> >>  #include <linux/page_frag_cache.h>
> >>  #include "internal.h"
> >>
>
> ...
>
> >> +
> >> +/* Reload cache by reusing the old cache if it is possible, or
> >> + * refilling from the page allocator.
> >> + */
> >> +static bool __page_frag_cache_reload(struct page_frag_cache *nc,
> >> +                                 gfp_t gfp_mask)
> >> +{
> >> +    if (likely(nc->encoded_va)) {
> >> +            if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
> >> +                    goto out;
> >> +    }
> >> +
> >> +    if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> >> +            return false;
> >> +
> >> +out:
> >> +    /* reset page count bias and remaining to start of new frag */
> >> +    nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> +    nc->remaining = page_frag_cache_page_size(nc->encoded_va);
> >
> > One thought I am having is that it might be better to have the
> > pagecnt_bias get set at the same time as the page_ref_add or the
> > set_page_count call. In addition setting the remaining value at the
> > same time probably would make sense as in the refill case you can make
> > use of the "order" value directly instead of having to write/read it
> > out of the encoded va/page.
>
> Probably, but there is always a tradeoff to make between avoiding code
> duplication and avoiding reading the order. I am not sure it matters
> either way, so I would rather keep the above pattern if there
> is no obvious benefit to the other pattern.

Part of it is more about keeping the functions focused on generating
self-contained objects. I am not a fan of us splitting up the page
init into a few sections as it makes it much easier to mess up a page
by changing one spot and overlooking the fact that an additional change
is needed somewhere else.

> >
> > With that we could simplify this function and get something closer to
> > what we had for the original alloc_va_align code.
> >
> >> +    return true;
> >>  }
> >>
> >>  void page_frag_cache_drain(struct page_frag_cache *nc)
> >> @@ -55,7 +100,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
> >>
> >>      __page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
> >>                              nc->pagecnt_bias);
> >> -    nc->encoded_va = 0;
> >> +    memset(nc, 0, sizeof(*nc));
> >>  }
> >>  EXPORT_SYMBOL(page_frag_cache_drain);
> >>
> >> @@ -73,67 +118,44 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
> >>                               unsigned int align_mask)
> >>  {
> >>      unsigned long encoded_va = nc->encoded_va;
> >> -    unsigned int size, remaining;
> >> -    struct page *page;
> >> -
> >> -    if (unlikely(!encoded_va)) {
> >
> > We should still be checking this before we even touch remaining.
> > Otherwise we greatly increase the risk of providing a bad virtual
> > address and greatly decrease the likelihood of us catching
> > potential errors gracefully.
> >
> >> -refill:
> >> -            page = __page_frag_cache_refill(nc, gfp_mask);
> >> -            if (!page)
> >> -                    return NULL;
> >> -
> >> -            encoded_va = nc->encoded_va;
> >> -            size = page_frag_cache_page_size(encoded_va);
> >> -
> >> -            /* Even if we own the page, we do not use atomic_set().
> >> -             * This would break get_page_unless_zero() users.
> >> -             */
> >> -            page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> >> -
> >> -            /* reset page count bias and remaining to start of new frag */
> >> -            nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >> -            nc->remaining = size;
> >
> > With my suggested change above you could essentially just drop the
> > block starting from the comment and this function wouldn't need to
> > change as much as it does.
>
> It seems you are still suggesting that the new API should also duplicate
> the old checking pattern in __page_frag_alloc_va_align()?
>
> I would rather avoid the above if something like VM_BUG_ON() can address
> your above concern.

Yes, that is what I am suggesting. It makes the code much less prone
to any sort of possible races, as resetting encoded_va would make it so
that reads of all the other fields are skipped, versus having to
use a memset whose implementation differs depending on the
architecture.
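
I.e. keep something like the pre-patch form of drain, where the single
encoded_va store gates all the other fields (untested sketch):

	void page_frag_cache_drain(struct page_frag_cache *nc)
	{
		if (!nc->encoded_va)
			return;

		__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
					nc->pagecnt_bias);
		nc->encoded_va = 0;
	}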
Yunsheng Lin Aug. 16, 2024, 11:58 a.m. UTC | #4
On 2024/8/15 23:09, Alexander Duyck wrote:
> On Wed, Aug 14, 2024 at 8:04 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/8/15 1:54, Alexander H Duyck wrote:
>>> On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
>>>> Refactor common code from __page_frag_alloc_va_align()
>>>> into __page_frag_cache_reload(), so that the new API can
>>>> make use of it.
>>>>
>>>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>>>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>>>> ---
>>>>  include/linux/page_frag_cache.h |   2 +-
>>>>  mm/page_frag_cache.c            | 138 ++++++++++++++++++--------------
>>>>  2 files changed, 81 insertions(+), 59 deletions(-)
>>>>
>>>> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
>>>> index 4ce924eaf1b1..0abffdd10a1c 100644
>>>> --- a/include/linux/page_frag_cache.h
>>>> +++ b/include/linux/page_frag_cache.h
>>>> @@ -52,7 +52,7 @@ static inline void *encoded_page_address(unsigned long encoded_va)
>>>>
>>>>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>>>>  {
>>>> -    nc->encoded_va = 0;
>>>> +    memset(nc, 0, sizeof(*nc));
>>>>  }
>>>>
>>>
>>> Still not a fan of this. Just setting encoded_va to 0 should be enough
>>> as the other fields will automatically be overwritten when the new page
>>> is allocated.
>>>
>>> Relying on memset is problematic at best since you then introduce the
>>> potential for issues where remaining somehow gets corrupted but
>>> encoded_va/page is 0. I would rather have both of these checked
>>> as a part of allocation than just assuming it is valid if
>>> remaining is set.
>>
>> Does adding something like VM_BUG_ON(!nc->encoded_va && nc->remaining) to
>> catch the above problem address your above concern?
> 
> Not really. I would prefer to just retain the existing behavior.

As I understand it, it is an implementation detail that the API caller does
not need to care about if the API is used correctly. If not, we have a bigger
problem than the above.

If there is an error in that implementation, it would be good to point it
out. And there is already a comment explaining that implementation detail
in this patch; doesn't adding an explicit VM_BUG_ON() make it more obvious?

> 
>>>
>>> I would prefer to keep the check for a non-0 encoded_page value and
>>> then check remaining, rather than relying on remaining alone, as that creates a
>>> single point of failure. With that we can safely tear away a page and
>>> the next caller trying to allocate will populate a new page and the
>>> associated fields.
>>
>> As mentioned before, the memset() is used mainly to:
>> 1. avoid a check in the fast path.
>> 2. avoid duplicating the checking pattern you mentioned above for the
>>    new API.
> 
> I'm not a fan of the new code flow after getting rid of the checking
> in the fast path. The code is becoming a tangled mess of spaghetti

I am not sure if you get the point that getting rid of the nc->encoded_va
checking in the fast path is the reason we are able to refactor the
common code into __page_frag_cache_reload(), so that both the old and
new APIs can reuse it.
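
I.e. something like the below (untested; the helper name is made up for
illustration), which both the old API and the new ones can call as
their shared slow path:

	static void *__page_frag_slow_path(struct page_frag_cache *nc,
					   unsigned int fragsz,
					   gfp_t gfp_mask)
	{
		if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
			return NULL;

		/* the fragment starts at offset zero in the reloaded cache */
		nc->pagecnt_bias--;
		nc->remaining -= fragsz;

		return encoded_page_address(nc->encoded_va);
	}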

> code in my opinion. Arguably the patches don't help, as you are taking
> huge steps in many of them, which makes the series hard to read.
> In addition, the code becomes more obfuscated with each patch, which is
> one of the reasons why I would have preferred to see this set broken
> into a couple of sets so we can give it some time for any of the kinks to
> get worked out.

If no new APIs were being added, I guess I would agree with your above
argument.
With the new API for the new use case, the refactoring in this patch makes
the code more reusable and maintainable, which is why I would have preferred
not to break this patchset into more patchsets, as it is already hard to
argue the reason behind the refactoring.

> 
>>>
>>>>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>>>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
>>>> index 2544b292375a..4e6b1c4684f0 100644
>>>> --- a/mm/page_frag_cache.c
>>>> +++ b/mm/page_frag_cache.c
>>>> @@ -19,8 +19,27 @@
>>>>  #include <linux/page_frag_cache.h>
>>>>  #include "internal.h"
>>>>
>>
>> ...
>>
>>>> +
>>>> +/* Reload cache by reusing the old cache if it is possible, or
>>>> + * refilling from the page allocator.
>>>> + */
>>>> +static bool __page_frag_cache_reload(struct page_frag_cache *nc,
>>>> +                                 gfp_t gfp_mask)
>>>> +{
>>>> +    if (likely(nc->encoded_va)) {
>>>> +            if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
>>>> +                    goto out;
>>>> +    }
>>>> +
>>>> +    if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>>>> +            return false;
>>>> +
>>>> +out:
>>>> +    /* reset page count bias and remaining to start of new frag */
>>>> +    nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> +    nc->remaining = page_frag_cache_page_size(nc->encoded_va);
>>>
>>> One thought I am having is that it might be better to have the
>>> pagecnt_bias get set at the same time as the page_ref_add or the
>>> set_page_count call. In addition setting the remaining value at the
>>> same time probably would make sense as in the refill case you can make
>>> use of the "order" value directly instead of having to write/read it
>>> out of the encoded va/page.
>>
>> Probably, but there is always a tradeoff to make between avoiding code
>> duplication and avoiding reading the order. I am not sure it matters
>> either way, so I would rather keep the above pattern if there
>> is no obvious benefit to the other pattern.
> 
> Part of it is more about keeping the functions focused on generating
> self-contained objects. I am not a fan of us splitting up the page
> init into a few sections as it makes it much easier to mess up a page
> by changing one spot and overlooking the fact that an additional change
> is needed somewhere else.

To be honest, I am not so obsessed with where pagecnt_bias and
remaining are set.
I am obsessed with whether __page_frag_cache_reload() is needed.
Let's be more specific about your suggestion here: are you suggesting
removing __page_frag_cache_reload()?
If yes, are you really expecting both the old and new APIs to duplicate the
below checking pattern? Why?

if (likely(nc->encoded_va)) {
	if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
		...
}

if (unlikely(remaining < fragsz)) {
	page = __page_frag_cache_refill(nc, gfp_mask);
	....
}

If no, doesn't it make sense to call __page_frag_cache_reload() for both
the old and new APIs?

> 
>>>
>>> With that we could simplify this function and get something closer to
>>> what we had for the original alloc_va_align code.
>>>
>>>> +    return true;
>>>>  }
>>>>
>>>>  void page_frag_cache_drain(struct page_frag_cache *nc)
>>>> @@ -55,7 +100,7 @@ void page_frag_cache_drain(struct page_frag_cache *nc)
>>>>
>>>>      __page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
>>>>                              nc->pagecnt_bias);
>>>> -    nc->encoded_va = 0;
>>>> +    memset(nc, 0, sizeof(*nc));
>>>>  }
>>>>  EXPORT_SYMBOL(page_frag_cache_drain);
>>>>
>>>> @@ -73,67 +118,44 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>>>>                               unsigned int align_mask)
>>>>  {
>>>>      unsigned long encoded_va = nc->encoded_va;
>>>> -    unsigned int size, remaining;
>>>> -    struct page *page;
>>>> -
>>>> -    if (unlikely(!encoded_va)) {
>>>
>>> We should still be checking this before we even touch remaining.
>>> Otherwise we greatly increase the risk of providing a bad virtual
>>> address and greatly decrease the likelihood of us catching
>>> potential errors gracefully.

I would argue that duplicating the above checking for both the old
and new APIs will make the code less maintainable; for example, when
fixing a bug or making a change to the duplicated checking, it is more
likely that some of the copies will be missed.

>>>
>>>> -refill:
>>>> -            page = __page_frag_cache_refill(nc, gfp_mask);
>>>> -            if (!page)
>>>> -                    return NULL;
>>>> -
>>>> -            encoded_va = nc->encoded_va;
>>>> -            size = page_frag_cache_page_size(encoded_va);
>>>> -
>>>> -            /* Even if we own the page, we do not use atomic_set().
>>>> -             * This would break get_page_unless_zero() users.
>>>> -             */
>>>> -            page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>>>> -
>>>> -            /* reset page count bias and remaining to start of new frag */
>>>> -            nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>> -            nc->remaining = size;
>>>
>>> With my suggested change above you could essentially just drop the
>>> block starting from the comment and this function wouldn't need to
>>> change as much as it does.
>>
>> It seems you are still suggesting that the new API should also duplicate
>> the old checking pattern in __page_frag_alloc_va_align()?
>>
>> I would rather avoid the above if something like VM_BUG_ON() can address
>> your above concern.
> 
> Yes, that is what I am suggesting. It makes the code much less prone
> to any sort of possible races, as resetting encoded_va would make it so
> that reads of all the other fields are skipped, versus having to
> use a memset whose implementation differs depending on the
> architecture.

It would be good to be more specific about what the race here is; if it does
exist, we can fix it.

And it would be good to have a more specific suggestion and argument too,
otherwise it is just you arguing for preserving the old behavior to make
the code much less prone to any sort of possible races, and me arguing
for making the old code more reusable and maintainable for the new API.
diff mbox series

Patch

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 4ce924eaf1b1..0abffdd10a1c 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -52,7 +52,7 @@  static inline void *encoded_page_address(unsigned long encoded_va)
 
 static inline void page_frag_cache_init(struct page_frag_cache *nc)
 {
-	nc->encoded_va = 0;
+	memset(nc, 0, sizeof(*nc));
 }
 
 static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 2544b292375a..4e6b1c4684f0 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -19,8 +19,27 @@ 
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
+static bool __page_frag_cache_reuse(unsigned long encoded_va,
+				    unsigned int pagecnt_bias)
+{
+	struct page *page;
+
+	page = virt_to_page((void *)encoded_va);
+	if (!page_ref_sub_and_test(page, pagecnt_bias))
+		return false;
+
+	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
+		free_unref_page(page, encoded_page_order(encoded_va));
+		return false;
+	}
+
+	/* OK, page count is 0, we can safely set it */
+	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+	return true;
+}
+
+static bool __page_frag_cache_refill(struct page_frag_cache *nc,
+				     gfp_t gfp_mask)
 {
 	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
 	struct page *page = NULL;
@@ -35,8 +54,8 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	if (unlikely(!page)) {
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 		if (unlikely(!page)) {
-			nc->encoded_va = 0;
-			return NULL;
+			memset(nc, 0, sizeof(*nc));
+			return false;
 		}
 
 		order = 0;
@@ -45,7 +64,33 @@  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	nc->encoded_va = encode_aligned_va(page_address(page), order,
 					   page_is_pfmemalloc(page));
 
-	return page;
+	/* Even if we own the page, we do not use atomic_set().
+	 * This would break get_page_unless_zero() users.
+	 */
+	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+	return true;
+}
+
+/* Reload cache by reusing the old cache if it is possible, or
+ * refilling from the page allocator.
+ */
+static bool __page_frag_cache_reload(struct page_frag_cache *nc,
+				     gfp_t gfp_mask)
+{
+	if (likely(nc->encoded_va)) {
+		if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
+			goto out;
+	}
+
+	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
+		return false;
+
+out:
+	/* reset page count bias and remaining to start of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	nc->remaining = page_frag_cache_page_size(nc->encoded_va);
+	return true;
 }
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
@@ -55,7 +100,7 @@  void page_frag_cache_drain(struct page_frag_cache *nc)
 
 	__page_frag_cache_drain(virt_to_head_page((void *)nc->encoded_va),
 				nc->pagecnt_bias);
-	nc->encoded_va = 0;
+	memset(nc, 0, sizeof(*nc));
 }
 EXPORT_SYMBOL(page_frag_cache_drain);
 
@@ -73,67 +118,44 @@  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int align_mask)
 {
 	unsigned long encoded_va = nc->encoded_va;
-	unsigned int size, remaining;
-	struct page *page;
-
-	if (unlikely(!encoded_va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-		encoded_va = nc->encoded_va;
-		size = page_frag_cache_page_size(encoded_va);
-
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
-		/* reset page count bias and remaining to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->remaining = size;
-	} else {
-		size = page_frag_cache_page_size(encoded_va);
-	}
+	unsigned int remaining;
 
 	remaining = nc->remaining & align_mask;
-	if (unlikely(remaining < fragsz)) {
-		if (unlikely(fragsz > PAGE_SIZE)) {
-			/*
-			 * The caller is trying to allocate a fragment
-			 * with fragsz > PAGE_SIZE but the cache isn't big
-			 * enough to satisfy the request, this may
-			 * happen in low memory conditions.
-			 * We don't release the cache page because
-			 * it could make memory pressure worse
-			 * so we simply return NULL here.
-			 */
-			return NULL;
-		}
-
-		page = virt_to_page((void *)encoded_va);
 
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
-			free_unref_page(page, encoded_page_order(encoded_va));
-			goto refill;
-		}
+	/* As we have ensured that remaining is zero when initializing and draining
+	 * the old cache, checking 'remaining >= fragsz' is enough to indicate there
+	 * is enough space available for the new fragment allocation.
+	 */
+	if (likely(remaining >= fragsz)) {
+		nc->pagecnt_bias--;
+		nc->remaining = remaining - fragsz;
 
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+		return encoded_page_address(encoded_va) +
+			(page_frag_cache_page_size(encoded_va) - remaining);
+	}
 
-		/* reset page count bias and remaining to start of new frag */
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		remaining = size;
+	if (unlikely(fragsz > PAGE_SIZE)) {
+		/*
+		 * The caller is trying to allocate a fragment with
+		 * fragsz > PAGE_SIZE but the cache isn't big enough to satisfy
+		 * the request, this may happen in low memory conditions. We don't
+		 * release the cache page because it could make memory pressure
+		 * worse so we simply return NULL here.
+		 */
+		return NULL;
 	}
 
+	if (unlikely(!__page_frag_cache_reload(nc, gfp_mask)))
+		return NULL;
+
+	/* As we are allocating fragments from the cache in a count-up way, the
+	 * offset of the fragment allocated from the just-reloaded cache is zero,
+	 * so aligning remaining and calculating the offset are not needed.
+	 */
 	nc->pagecnt_bias--;
-	nc->remaining = remaining - fragsz;
+	nc->remaining -= fragsz;
 
-	return encoded_page_address(encoded_va) + (size - remaining);
+	return encoded_page_address(nc->encoded_va);
 }
 EXPORT_SYMBOL(__page_frag_alloc_va_align);