[net-next,v13,11/14] mm: page_frag: introduce prepare/probe/commit API

Message ID 20240808123714.462740-12-linyunsheng@huawei.com (mailing list archive)
State Deferred
Delegated to: Netdev Maintainers
Series Replace page_frag with page_frag_cache for sk_page_frag()

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 72 this patch: 72
netdev/build_tools success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers success CCed 3 of 3 maintainers
netdev/build_clang success Errors and warnings before: 131 this patch: 131
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4982 this patch: 4982
netdev/checkpatch warning WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants; WARNING: line length of 81 exceeds 80 columns; WARNING: line length of 82 exceeds 80 columns; WARNING: line length of 88 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-08-09--09-00 (tests: 705)

Commit Message

Yunsheng Lin Aug. 8, 2024, 12:37 p.m. UTC
There are many use cases that need a minimum amount of memory in
order to make forward progress, but can perform better if more
memory is available, or need to probe the cache info to use any
available memory for frag coalescing.

Currently the skb_page_frag_refill() API is used to handle the
above use cases, but the caller needs to know about the internal
details and access the data fields of 'struct page_frag' directly
to meet those requirements, and its implementation is similar to
the one in the mm subsystem.

To unify those two page_frag implementations, introduce a prepare
API to ensure the minimum memory is satisfied and return how much
memory is actually available to the caller, and a probe API to
report the currently available memory to the caller without doing
any cache refilling. The caller then needs to either call the
commit API to report how much memory it actually used, or skip
that call if it decided not to use any memory.
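
A typical caller flow looks roughly like below (a minimal sketch
only; MIN_NEEDED and produce_data() are hypothetical stand-ins
for the caller's own minimum size and logic):

	unsigned int fragsz = MIN_NEEDED;
	unsigned int used;
	void *va;

	/* ensure at least MIN_NEEDED bytes are available; on
	 * return fragsz holds the actual size available
	 */
	va = page_frag_alloc_va_prepare(nc, &fragsz, GFP_KERNEL);
	if (!va)
		return -ENOMEM;

	/* use up to fragsz bytes starting at va */
	used = produce_data(va, fragsz);
	if (used)
		page_frag_alloc_commit(nc, used);

The probe API follows the same pattern, except that it only
reports what is already in the cache and never triggers a refill.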

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/linux/page_frag_cache.h |  75 ++++++++++++++++
 mm/page_frag_cache.c            | 152 ++++++++++++++++++++++++++++----
 2 files changed, 212 insertions(+), 15 deletions(-)

Comments

Alexander Duyck Aug. 14, 2024, 9 p.m. UTC | #1
On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> There are many use cases that need a minimum amount of memory in
> order to make forward progress, but can perform better if more
> memory is available, or need to probe the cache info to use any
> available memory for frag coalescing.
> 
> Currently the skb_page_frag_refill() API is used to handle the
> above use cases, but the caller needs to know about the internal
> details and access the data fields of 'struct page_frag' directly
> to meet those requirements, and its implementation is similar to
> the one in the mm subsystem.
> 
> To unify those two page_frag implementations, introduce a prepare
> API to ensure the minimum memory is satisfied and return how much
> memory is actually available to the caller, and a probe API to
> report the currently available memory to the caller without doing
> any cache refilling. The caller then needs to either call the
> commit API to report how much memory it actually used, or skip
> that call if it decided not to use any memory.
> 
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  include/linux/page_frag_cache.h |  75 ++++++++++++++++
>  mm/page_frag_cache.c            | 152 ++++++++++++++++++++++++++++----
>  2 files changed, 212 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 0abffdd10a1c..ba5d7f8a03cd 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -7,6 +7,8 @@
>  #include <linux/build_bug.h>
>  #include <linux/log2.h>
>  #include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/mmdebug.h>
>  #include <linux/mm_types_task.h>
>  
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> @@ -67,6 +69,9 @@ static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc);
>  void __page_frag_cache_drain(struct page *page, unsigned int count);
> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> +				unsigned int *offset, unsigned int fragsz,
> +				gfp_t gfp);
>  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask);
> @@ -79,12 +84,82 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
>  }
>  
> +static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
> +{
> +	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
> +}
> +
>  static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
>  				       unsigned int fragsz, gfp_t gfp_mask)
>  {
>  	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
>  }
>  
> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
> +				 gfp_t gfp);
> +
> +static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
> +						     unsigned int *fragsz,
> +						     gfp_t gfp,
> +						     unsigned int align)
> +{
> +	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
> +	nc->remaining = nc->remaining & -align;
> +	return page_frag_alloc_va_prepare(nc, fragsz, gfp);
> +}
> +
> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
> +					unsigned int *offset,
> +					unsigned int *fragsz, gfp_t gfp);
> +
> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
> +				     unsigned int *offset,
> +				     unsigned int *fragsz,
> +				     void **va, gfp_t gfp);
> +
> +static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
> +						 unsigned int *offset,
> +						 unsigned int *fragsz,
> +						 void **va)
> +{
> +	unsigned long encoded_va = nc->encoded_va;
> +	struct page *page;
> +
> +	VM_BUG_ON(!*fragsz);
> +	if (unlikely(nc->remaining < *fragsz))
> +		return NULL;
> +
> +	*va = encoded_page_address(encoded_va);
> +	page = virt_to_page(*va);
> +	*fragsz = nc->remaining;
> +	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
> +	*va += *offset;
> +
> +	return page;
> +}
> +

I still think this should be populating a bio_vec instead of passing
multiple arguments by pointer. With that you would be able to get all
the fields without as many arguments having to be passed (see the
sketch at the end of this message).

> +static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
> +					  unsigned int fragsz)
> +{
> +	VM_BUG_ON(fragsz > nc->remaining || !nc->pagecnt_bias);
> +	nc->pagecnt_bias--;
> +	nc->remaining -= fragsz;
> +}
> +

I would really like to see this accept a bio_vec as well. With that you
could verify the page and offset match the expected values before
applying fragsz.

> +static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
> +						unsigned int fragsz)
> +{
> +	VM_BUG_ON(fragsz > nc->remaining);
> +	nc->remaining -= fragsz;
> +}
> +

Same here.

> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
> +					 unsigned int fragsz)
> +{
> +	nc->pagecnt_bias++;
> +	nc->remaining += fragsz;
> +}
> +

This doesn't add up. Why would you need abort if you have commit? Isn't
this more of a revert? I wouldn't think that would be valid as it is
possible you took some sort of action that might have resulted in this
memory already being shared. We shouldn't allow rewinding the offset
pointer without knowing that there are no other entities sharing the
page.

>  void page_frag_free_va(void *addr);
>  
>  #endif
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 27596b84b452..f8fad7d2cca8 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -19,27 +19,27 @@
>  #include <linux/page_frag_cache.h>
>  #include "internal.h"
>  
> -static bool __page_frag_cache_reuse(unsigned long encoded_va,
> -				    unsigned int pagecnt_bias)
> +static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
> +					    unsigned int pagecnt_bias)
>  {
>  	struct page *page;
>  
>  	page = virt_to_page((void *)encoded_va);
>  	if (!page_ref_sub_and_test(page, pagecnt_bias))
> -		return false;
> +		return NULL;
>  
>  	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
>  		free_unref_page(page, encoded_page_order(encoded_va));
> -		return false;
> +		return NULL;
>  	}
>  
>  	/* OK, page count is 0, we can safely set it */
>  	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> -	return true;
> +	return page;
>  }
>  
> -static bool __page_frag_cache_refill(struct page_frag_cache *nc,
> -				     gfp_t gfp_mask)
> +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> +					     gfp_t gfp_mask)
>  {
>  	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
>  	struct page *page = NULL;
> @@ -55,7 +55,7 @@ static bool __page_frag_cache_refill(struct page_frag_cache *nc,
>  		page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
>  		if (unlikely(!page)) {
>  			memset(nc, 0, sizeof(*nc));
> -			return false;
> +			return NULL;
>  		}
>  
>  		order = 0;
> @@ -69,29 +69,151 @@ static bool __page_frag_cache_refill(struct page_frag_cache *nc,
>  	 */
>  	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>  
> -	return true;
> +	return page;
>  }
>  
>  /* Reload cache by reusing the old cache if it is possible, or
>   * refilling from the page allocator.
>   */
> -static bool __page_frag_cache_reload(struct page_frag_cache *nc,
> -				     gfp_t gfp_mask)
> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
> +					     gfp_t gfp_mask)
>  {
> +	struct page *page;
> +
>  	if (likely(nc->encoded_va)) {
> -		if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
> +		page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
> +		if (page)
>  			goto out;
>  	}
>  
> -	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> -		return false;
> +	page = __page_frag_cache_refill(nc, gfp_mask);
> +	if (unlikely(!page))
> +		return NULL;
>  
>  out:
>  	/* reset page count bias and remaining to start of new frag */
>  	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>  	nc->remaining = page_frag_cache_page_size(nc->encoded_va);
> -	return true;
> +	return page;
> +}
> +

None of the functions above need to be returning a page.

> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
> +				 unsigned int *fragsz, gfp_t gfp)
> +{
> +	unsigned int remaining = nc->remaining;
> +
> +	VM_BUG_ON(!*fragsz);
> +	if (likely(remaining >= *fragsz)) {
> +		unsigned long encoded_va = nc->encoded_va;
> +
> +		*fragsz = remaining;
> +
> +		return encoded_page_address(encoded_va) +
> +			(page_frag_cache_page_size(encoded_va) - remaining);
> +	}
> +
> +	if (unlikely(*fragsz > PAGE_SIZE))
> +		return NULL;
> +
> +	/* When reload fails, nc->encoded_va and nc->remaining are both reset
> +	 * to zero, so there is no need to check the return value here.
> +	 */
> +	__page_frag_cache_reload(nc, gfp);
> +
> +	*fragsz = nc->remaining;
> +	return encoded_page_address(nc->encoded_va);
> +}
> +EXPORT_SYMBOL(page_frag_alloc_va_prepare);
> +
> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
> +					unsigned int *offset,
> +					unsigned int *fragsz, gfp_t gfp)
> +{
> +	unsigned int remaining = nc->remaining;
> +	struct page *page;
> +
> +	VM_BUG_ON(!*fragsz);
> +	if (likely(remaining >= *fragsz)) {
> +		unsigned long encoded_va = nc->encoded_va;
> +
> +		*offset = page_frag_cache_page_size(encoded_va) - remaining;
> +		*fragsz = remaining;
> +
> +		return virt_to_page((void *)encoded_va);
> +	}
> +
> +	if (unlikely(*fragsz > PAGE_SIZE))
> +		return NULL;
> +
> +	page = __page_frag_cache_reload(nc, gfp);
> +	*offset = 0;
> +	*fragsz = nc->remaining;
> +	return page;
> +}
> +EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
> +
> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
> +				     unsigned int *offset,
> +				     unsigned int *fragsz,
> +				     void **va, gfp_t gfp)
> +{
> +	unsigned int remaining = nc->remaining;
> +	struct page *page;
> +
> +	VM_BUG_ON(!*fragsz);
> +	if (likely(remaining >= *fragsz)) {
> +		unsigned long encoded_va = nc->encoded_va;
> +
> +		*offset = page_frag_cache_page_size(encoded_va) - remaining;
> +		*va = encoded_page_address(encoded_va) + *offset;
> +		*fragsz = remaining;
> +
> +		return virt_to_page((void *)encoded_va);
> +	}
> +
> +	if (unlikely(*fragsz > PAGE_SIZE))
> +		return NULL;
> +
> +	page = __page_frag_cache_reload(nc, gfp);
> +	*offset = 0;
> +	*fragsz = nc->remaining;
> +	*va = encoded_page_address(nc->encoded_va);
> +
> +	return page;
> +}
> +EXPORT_SYMBOL(page_frag_alloc_prepare);
> +
> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> +				unsigned int *offset, unsigned int fragsz,
> +				gfp_t gfp)
> +{
> +	unsigned int remaining = nc->remaining;
> +	struct page *page;
> +
> +	VM_BUG_ON(!fragsz);
> +	if (likely(remaining >= fragsz)) {
> +		unsigned long encoded_va = nc->encoded_va;
> +
> +		*offset = page_frag_cache_page_size(encoded_va) -
> +				remaining;
> +
> +		return virt_to_page((void *)encoded_va);
> +	}
> +
> +	if (unlikely(fragsz > PAGE_SIZE))
> +		return NULL;
> +
> +	page = __page_frag_cache_reload(nc, gfp);
> +	if (unlikely(!page))
> +		return NULL;
> +
> +	*offset = 0;
> +	nc->remaining = remaining - fragsz;
> +	nc->pagecnt_bias--;
> +
> +	return page;
>  }
> +EXPORT_SYMBOL(page_frag_alloc_pg);

Again, this isn't returning a page. It is essentially returning a
bio_vec without calling it as such. You might as well pass the bio_vec
pointer as an argument and just have it populate it directly.

It would be identical to the existing page_frag for all intents and
purposes. In addition you could use that as an intermediate value
between the page_frag_cache and your prepare/commit call setup, as you
could limit the size/bv_len to being the only item that can be
adjusted, specifically reduced, between the prepare and commit calls.
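
Concretely, that would look something like the below (a sketch of
the idea only; these bio_vec-based prepare/commit signatures are
hypothetical, not code from this series):

	struct bio_vec bv;

	if (!page_frag_alloc_prepare(nc, &bv, GFP_KERNEL))
		return -ENOMEM;

	/* use up to bv.bv_len bytes of bv.bv_page at bv.bv_offset */

	bv.bv_len = used;		/* may only be reduced */
	page_frag_alloc_commit(nc, &bv);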
Yunsheng Lin Aug. 15, 2024, 3:05 a.m. UTC | #2
On 2024/8/15 5:00, Alexander H Duyck wrote:

...

>
>> +static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
>> +						 unsigned int *offset,
>> +						 unsigned int *fragsz,
>> +						 void **va)
>> +{
>> +	unsigned long encoded_va = nc->encoded_va;
>> +	struct page *page;
>> +
>> +	VM_BUG_ON(!*fragsz);
>> +	if (unlikely(nc->remaining < *fragsz))
>> +		return NULL;
>> +
>> +	*va = encoded_page_address(encoded_va);
>> +	page = virt_to_page(*va);
>> +	*fragsz = nc->remaining;
>> +	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
>> +	*va += *offset;
>> +
>> +	return page;
>> +}
>> +
> 
> I still think this should be populating a bio_vec instead of passing
> multiple arguments by pointer. With that you would be able to get all
> the fields without as many arguments having to be passed.

As I already argued in [1]:
If most of the page_frag API callers didn't access 'struct bio_vec'
directly and instead used something like the bvec_iter_* API to do the
accessing, then I would agree with the above argument.

But right now, most of the page_frag API callers access 'va' directly
to do the memcpy'ing, and access 'page & off & len' directly to do the
skb frag filling, so I am not really sure what the point of the
'struct bio_vec' indirection is here.

And adding 'struct bio_vec' to page_frag and accessing its fields
directly may go against the design choice of 'struct bio_vec', as there
seems to be no inline helper defined in bvec.h for accessing the fields
of 'struct bio_vec' directly.

1. https://lore.kernel.org/all/ca6be29e-ab53-4673-9624-90d41616a154@huawei.com/

> 
>> +static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
>> +					  unsigned int fragsz)
>> +{
>> +	VM_BUG_ON(fragsz > nc->remaining || !nc->pagecnt_bias);
>> +	nc->pagecnt_bias--;
>> +	nc->remaining -= fragsz;
>> +}
>> +
> 

> 
>> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
>> +					 unsigned int fragsz)
>> +{
>> +	nc->pagecnt_bias++;
>> +	nc->remaining += fragsz;
>> +}
>> +
> 
> This doesn't add up. Why would you need abort if you have commit? Isn't
> this more of a revert? I wouldn't think that would be valid as it is
> possible you took some sort of action that might have resulted in this
> memory already being shared. We shouldn't allow rewinding the offset
> pointer without knowing that there are no other entities sharing the
> page.

This is used for __tun_build_skb() in drivers/net/tun.c as below,
mainly to avoid a performance penalty in the XDP drop case:

--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1598,21 +1598,19 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 }

 static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
-				       struct page_frag *alloc_frag, char *buf,
-				       int buflen, int len, int pad)
+				       char *buf, int buflen, int len, int pad)
 {
 	struct sk_buff *skb = build_skb(buf, buflen);

-	if (!skb)
+	if (!skb) {
+		page_frag_free_va(buf);
 		return ERR_PTR(-ENOMEM);
+	}

 	skb_reserve(skb, pad);
 	skb_put(skb, len);
 	skb_set_owner_w(skb, tfile->socket.sk);

-	get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
-
 	return skb;
 }

@@ -1660,7 +1658,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     struct virtio_net_hdr *hdr,
 				     int len, int *skb_xdp)
 {
-	struct page_frag *alloc_frag = &current->task_frag;
+	struct page_frag_cache *alloc_frag = &current->task_frag;
 	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
@@ -1676,16 +1674,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	buflen += SKB_DATA_ALIGN(len + pad);
 	rcu_read_unlock();

-	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
-	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+	buf = page_frag_alloc_va_align(alloc_frag, buflen, GFP_KERNEL,
+				       SMP_CACHE_BYTES);
+	if (unlikely(!buf))
 		return ERR_PTR(-ENOMEM);

-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied = copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + pad,
-				     len, from);
-	if (copied != len)
+	copied = copy_from_iter(buf + pad, len, from);
+	if (copied != len) {
+		page_frag_alloc_abort(alloc_frag, buflen);
 		return ERR_PTR(-EFAULT);
+	}

 	/* There's a small window that XDP may be set after the check
 	 * of xdp_prog above, this should be rare and for simplicity
@@ -1693,8 +1691,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	 */
 	if (hdr->gso_type || !xdp_prog) {
 		*skb_xdp = 1;
-		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
-				       pad);
+		return __tun_build_skb(tfile, buf, buflen, len, pad);
 	}

 	*skb_xdp = 0;
@@ -1711,21 +1708,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		xdp_prepare_buff(&xdp, buf, pad, len, false);

 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
-		if (act == XDP_REDIRECT || act == XDP_TX) {
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
-		}
 		err = tun_xdp_act(tun, xdp_prog, &xdp, act);
-		if (err < 0) {
-			if (act == XDP_REDIRECT || act == XDP_TX)
-				put_page(alloc_frag->page);
-			goto out;
-		}
-
 		if (err == XDP_REDIRECT)
 			xdp_do_flush();
-		if (err != XDP_PASS)
+
+		if (err == XDP_REDIRECT || err == XDP_TX) {
+			goto out;
+		} else if (err < 0 || err != XDP_PASS) {
+			page_frag_alloc_abort(alloc_frag, buflen);
 			goto out;
+		}

 		pad = xdp.data - xdp.data_hard_start;
 		len = xdp.data_end - xdp.data;
@@ -1734,7 +1726,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	rcu_read_unlock();
 	local_bh_enable();

-	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
+	return __tun_build_skb(tfile, buf, buflen, len, pad);

 out:
 	bpf_net_ctx_clear(bpf_net_ctx);


> 
>>  void page_frag_free_va(void *addr);
>>  
>>  #endif
>> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c

...

>> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
>> +					     gfp_t gfp_mask)
>>  {
>> +	struct page *page;
>> +
>>  	if (likely(nc->encoded_va)) {
>> -		if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
>> +		page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
>> +		if (page)
>>  			goto out;
>>  	}
>>  
>> -	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>> -		return false;
>> +	page = __page_frag_cache_refill(nc, gfp_mask);
>> +	if (unlikely(!page))
>> +		return NULL;
>>  
>>  out:
>>  	/* reset page count bias and remaining to start of new frag */
>>  	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>  	nc->remaining = page_frag_cache_page_size(nc->encoded_va);
>> -	return true;
>> +	return page;
>> +}
>> +
> 
> None of the functions above need to be returning page.

Are you still suggesting that we always use virt_to_page() even when it
is not really necessary? Why not return the page here to avoid the
virt_to_page() call?

> 
>> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
>> +				 unsigned int *fragsz, gfp_t gfp)
>> +{
>> +	unsigned int remaining = nc->remaining;
>> +
>> +	VM_BUG_ON(!*fragsz);
>> +	if (likely(remaining >= *fragsz)) {
>> +		unsigned long encoded_va = nc->encoded_va;
>> +
>> +		*fragsz = remaining;
>> +
>> +		return encoded_page_address(encoded_va) +
>> +			(page_frag_cache_page_size(encoded_va) - remaining);
>> +	}
>> +
>> +	if (unlikely(*fragsz > PAGE_SIZE))
>> +		return NULL;
>> +
>> +	/* When reload fails, nc->encoded_va and nc->remaining are both reset
>> +	 * to zero, so there is no need to check the return value here.
>> +	 */
>> +	__page_frag_cache_reload(nc, gfp);
>> +
>> +	*fragsz = nc->remaining;
>> +	return encoded_page_address(nc->encoded_va);
>> +}
>> +EXPORT_SYMBOL(page_frag_alloc_va_prepare);

...

>> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
>> +				unsigned int *offset, unsigned int fragsz,
>> +				gfp_t gfp)
>> +{
>> +	unsigned int remaining = nc->remaining;
>> +	struct page *page;
>> +
>> +	VM_BUG_ON(!fragsz);
>> +	if (likely(remaining >= fragsz)) {
>> +		unsigned long encoded_va = nc->encoded_va;
>> +
>> +		*offset = page_frag_cache_page_size(encoded_va) -
>> +				remaining;
>> +
>> +		return virt_to_page((void *)encoded_va);
>> +	}
>> +
>> +	if (unlikely(fragsz > PAGE_SIZE))
>> +		return NULL;
>> +
>> +	page = __page_frag_cache_reload(nc, gfp);
>> +	if (unlikely(!page))
>> +		return NULL;
>> +
>> +	*offset = 0;
>> +	nc->remaining = remaining - fragsz;
>> +	nc->pagecnt_bias--;
>> +
>> +	return page;
>>  }
>> +EXPORT_SYMBOL(page_frag_alloc_pg);
> 
> Again, this isn't returning a page. It is essentially returning a
> bio_vec without calling it as such. You might as well pass the bio_vec
> pointer as an argument and just have it populate it directly.

I really don't think your bio_vec suggestion makes much sense for now,
for the reasons mentioned below:

"Through a quick look, there seem to be at least three structs which
hold similar values: struct bio_vec & struct skb_frag & struct page_frag.

Going by your above argument about using bio_vec, it seems it is OK to
use any one of them, as each one of them seems to have almost all the
values we are using?

Personally, my preference among them is: 'struct page_frag' >
'struct skb_frag' > 'struct bio_vec'. The naming of 'struct page_frag'
seems to best match the page_frag API; 'struct skb_frag' is the second
preference because we mostly need to fill skb frags anyway; and
'struct bio_vec' is the last preference because it just happens to have
almost all the values needed.

Is there any specific reason, other than the above "almost all the
values you are using are exposed by that structure already", that you
prefer bio_vec?"

1. https://lore.kernel.org/all/ca6be29e-ab53-4673-9624-90d41616a154@huawei.com/

>
Alexander Duyck Aug. 15, 2024, 3:25 p.m. UTC | #3
On Wed, Aug 14, 2024 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/8/15 5:00, Alexander H Duyck wrote:

...

> >> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
> >> +                                     unsigned int fragsz)
> >> +{
> >> +    nc->pagecnt_bias++;
> >> +    nc->remaining += fragsz;
> >> +}
> >> +
> >
> > This doesn't add up. Why would you need abort if you have commit? Isn't
> > this more of a revert? I wouldn't think that would be valid as it is
> > possible you took some sort of action that might have resulted in this
> > memory already being shared. We shouldn't allow rewinding the offset
> > pointer without knowing that there are no other entities sharing the
> > page.
>
> This is used for __tun_build_skb() in drivers/net/tun.c as below, mainly
> used to avoid performance penalty for XDP drop case:

Yeah, I reviewed that patch. As I said there, rewinding the offset
should be avoided unless you can verify you are the only owner of the
page, as you have no guarantee that somebody else didn't take a
reference to the page/data to send it off somewhere else. Once you
expose the page to any other entity it should be written off or, in
your case, committed, and you should move on to the next block.


>
> >> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
> >> +                                         gfp_t gfp_mask)
> >>  {
> >> +    struct page *page;
> >> +
> >>      if (likely(nc->encoded_va)) {
> >> -            if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
> >> +            page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
> >> +            if (page)
> >>                      goto out;
> >>      }
> >>
> >> -    if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> >> -            return false;
> >> +    page = __page_frag_cache_refill(nc, gfp_mask);
> >> +    if (unlikely(!page))
> >> +            return NULL;
> >>
> >>  out:
> >>      /* reset page count bias and remaining to start of new frag */
> >>      nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >>      nc->remaining = page_frag_cache_page_size(nc->encoded_va);
> >> -    return true;
> >> +    return page;
> >> +}
> >> +
> >
> > None of the functions above need to be returning page.
>
> Are you still suggesting to always use virt_to_page() even when it is
> not really necessary? why not return the page here to avoid the
> virt_to_page()?

Yes. The likelihood of you needing to pass this out as a page should
be low, as most cases will just be you using the virtual address
anyway. You are essentially trading off branching for not having to
use virt_to_page. It is an unnecessary optimization.


>
> >> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> >> +                            unsigned int *offset, unsigned int fragsz,
> >> +                            gfp_t gfp)
> >> +{
> >> +    unsigned int remaining = nc->remaining;
> >> +    struct page *page;
> >> +
> >> +    VM_BUG_ON(!fragsz);
> >> +    if (likely(remaining >= fragsz)) {
> >> +            unsigned long encoded_va = nc->encoded_va;
> >> +
> >> +            *offset = page_frag_cache_page_size(encoded_va) -
> >> +                            remaining;
> >> +
> >> +            return virt_to_page((void *)encoded_va);
> >> +    }
> >> +
> >> +    if (unlikely(fragsz > PAGE_SIZE))
> >> +            return NULL;
> >> +
> >> +    page = __page_frag_cache_reload(nc, gfp);
> >> +    if (unlikely(!page))
> >> +            return NULL;
> >> +
> >> +    *offset = 0;
> >> +    nc->remaining = remaining - fragsz;
> >> +    nc->pagecnt_bias--;
> >> +
> >> +    return page;
> >>  }
> >> +EXPORT_SYMBOL(page_frag_alloc_pg);
> >
> > Again, this isn't returning a page. It is essentially returning a
> > bio_vec without calling it as such. You might as well pass the bio_vec
> > pointer as an argument and just have it populate it directly.
>
> I really don't think your bio_vec suggestion make much sense  for now as
> the reason mentioned in below:
>
> "Through a quick look, there seems to be at least three structs which have
> similar values: struct bio_vec & struct skb_frag & struct page_frag.
>
> As your above agrument about using bio_vec, it seems it is ok to use any
> one of them as each one of them seems to have almost all the values we
> are using?
>
> Personally, my preference over them: 'struct page_frag' > 'struct skb_frag'
> > 'struct bio_vec', as the naming of 'struct page_frag' seems to best match
> the page_frag API, 'struct skb_frag' is the second preference because we
> mostly need to fill skb frag anyway, and 'struct bio_vec' is the last
> preference because it just happen to have almost all the values needed.

That is why I said I would be okay with us passing page_frag in patch
12 after looking closer at the code. The fact is it should make the
review of that patch set much easier if you essentially just pass the
page_frag back out of the call. Then it could be used in exactly the
same way it was before and should reduce the total number of lines of
code that need to be changed.

> Is there any specific reason other than the above "almost all the values you
> are using are exposed by that structure already " that you prefer bio_vec?"
>
> 1. https://lore.kernel.org/all/ca6be29e-ab53-4673-9624-90d41616a154@huawei.com/

My reason for preferring bio_vec is that, of the 3, it is the best set
up to be used as a local variable, versus something stored in a struct
such as page_frag or used for some specialty use case such as
skb_frag_t. In addition it already has a set of helpers for converting
it to a virtual address or copying data to and from it, which would
make it easier to get rid of a bunch of duplicate code.
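
For instance, given a filled-in bio_vec, existing helpers such as
bvec_virt() in bvec.h and memcpy_to_bvec() in highmem.h already cover
the common accesses (usage sketch; page/offset/fragsz are assumed to
come from the cache, and src is a hypothetical source buffer):

	struct bio_vec bv = {
		.bv_page   = page,
		.bv_offset = offset,
		.bv_len    = fragsz,
	};

	void *va = bvec_virt(&bv);	/* page_address() + bv_offset */

	memcpy_to_bvec(&bv, src);	/* copy bv_len bytes from src */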
Yunsheng Lin Aug. 16, 2024, 12:01 p.m. UTC | #4
On 2024/8/15 23:25, Alexander Duyck wrote:
> On Wed, Aug 14, 2024 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/8/15 5:00, Alexander H Duyck wrote:
> 
> ...
> 
>>>> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
>>>> +                                     unsigned int fragsz)
>>>> +{
>>>> +    nc->pagecnt_bias++;
>>>> +    nc->remaining += fragsz;
>>>> +}
>>>> +
>>>
>>> This doesn't add up. Why would you need abort if you have commit? Isn't
>>> this more of a revert? I wouldn't think that would be valid as it is
>>> possible you took some sort of action that might have resulted in this
>>> memory already being shared. We shouldn't allow rewinding the offset
>>> pointer without knowing that there are no other entities sharing the
>>> page.
>>
>> This is used for __tun_build_skb() in drivers/net/tun.c as below, mainly
>> used to avoid performance penalty for XDP drop case:
> 
> Yeah, I reviewed that patch. As I said there, rewinding the offset
> should be avoided unless you can verify you are the only owner of the
> page as you have no guarantees that somebody else didn't take an
> access to the page/data to send it off somewhere else. Once you expose
> the page to any other entity it should be written off or committed in
> your case and you should move on to the next block.

Yes, the expectation is that somebody else doesn't take a reference to
the page/data to send it off somewhere else between page_frag_alloc_va()
and page_frag_alloc_abort(). Did you see that expectation being broken
in that patch? If yes, we should fix that by using the
page_frag_free_va() related API instead of page_frag_alloc_abort().

> 
> 
>>
>>>> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
>>>> +                                         gfp_t gfp_mask)
>>>>  {
>>>> +    struct page *page;
>>>> +
>>>>      if (likely(nc->encoded_va)) {
>>>> -            if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
>>>> +            page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
>>>> +            if (page)
>>>>                      goto out;
>>>>      }
>>>>
>>>> -    if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>>>> -            return false;
>>>> +    page = __page_frag_cache_refill(nc, gfp_mask);
>>>> +    if (unlikely(!page))
>>>> +            return NULL;
>>>>
>>>>  out:
>>>>      /* reset page count bias and remaining to start of new frag */
>>>>      nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>>      nc->remaining = page_frag_cache_page_size(nc->encoded_va);
>>>> -    return true;
>>>> +    return page;
>>>> +}
>>>> +
>>>
>>> None of the functions above need to be returning page.
>>
>> Are you still suggesting to always use virt_to_page() even when it is
>> not really necessary? why not return the page here to avoid the
>> virt_to_page()?
> 
> Yes. The likelihood of you needing to pass this out as a page should
> be low as most cases will just be you using the virtual address
> anyway. You are essentially trading off branching for not having to
> use virt_to_page. It is unnecessary optimization.

To my understanding, I am not trading off branching for not having to
use virt_to_page(); the branching is already needed whether or not we
utilize it to avoid calling virt_to_page(). Please be more specific
about which branching is traded off for not having to use
virt_to_page() here.


> 
> 
>>
>>>> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
>>>> +                            unsigned int *offset, unsigned int fragsz,
>>>> +                            gfp_t gfp)
>>>> +{
>>>> +    unsigned int remaining = nc->remaining;
>>>> +    struct page *page;
>>>> +
>>>> +    VM_BUG_ON(!fragsz);
>>>> +    if (likely(remaining >= fragsz)) {
>>>> +            unsigned long encoded_va = nc->encoded_va;
>>>> +
>>>> +            *offset = page_frag_cache_page_size(encoded_va) -
>>>> +                            remaining;
>>>> +
>>>> +            return virt_to_page((void *)encoded_va);
>>>> +    }
>>>> +
>>>> +    if (unlikely(fragsz > PAGE_SIZE))
>>>> +            return NULL;
>>>> +
>>>> +    page = __page_frag_cache_reload(nc, gfp);
>>>> +    if (unlikely(!page))
>>>> +            return NULL;
>>>> +
>>>> +    *offset = 0;
>>>> +    nc->remaining = remaining - fragsz;
>>>> +    nc->pagecnt_bias--;
>>>> +
>>>> +    return page;
>>>>  }
>>>> +EXPORT_SYMBOL(page_frag_alloc_pg);
>>>
>>> Again, this isn't returning a page. It is essentially returning a
>>> bio_vec without calling it as such. You might as well pass the bio_vec
>>> pointer as an argument and just have it populate it directly.
>>
>> I really don't think your bio_vec suggestion make much sense  for now as
>> the reason mentioned in below:
>>
>> "Through a quick look, there seems to be at least three structs which have
>> similar values: struct bio_vec & struct skb_frag & struct page_frag.
>>
>> As your above agrument about using bio_vec, it seems it is ok to use any
>> one of them as each one of them seems to have almost all the values we
>> are using?
>>
>> Personally, my preference over them: 'struct page_frag' > 'struct skb_frag'
>>> 'struct bio_vec', as the naming of 'struct page_frag' seems to best match
>> the page_frag API, 'struct skb_frag' is the second preference because we
>> mostly need to fill skb frag anyway, and 'struct bio_vec' is the last
>> preference because it just happen to have almost all the values needed.
> 
> That is why I said I would be okay with us passing page_frag in patch
> 12 after looking closer at the code. The fact is it should make the
> review of that patch set much easier if you essentially just pass the
> page_frag back out of the call. Then it could be used in exactly the
> same way it was before and should reduce the total number of lines of
> code that need to be changed.

So your suggestion has changed to something like below?

int page_frag_alloc_pfrag(struct page_frag_cache *nc, struct page_frag *pfrag);

The API naming of 'page_frag_alloc_pfrag' seems a little odd to me; do
you have a better one in mind?

> 
>> Is there any specific reason other than the above "almost all the values you
>> are using are exposed by that structure already " that you prefer bio_vec?"
>>
>> 1. https://lore.kernel.org/all/ca6be29e-ab53-4673-9624-90d41616a154@huawei.com/
> 
> My reason for preferring bio_vec is that of the 3 it is the most setup
> to be used as a local variable versus something stored in a struct
> such as page_frag or used for some specialty user case such as
> skb_frag_t. In addition it already has a set of helpers for converting
> it to a virtual address or copying data to and from it which would
> make it easier to get rid of a bunch of duplicate code.
Alexander Duyck Aug. 19, 2024, 3:52 p.m. UTC | #5
On Fri, Aug 16, 2024 at 5:01 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/8/15 23:25, Alexander Duyck wrote:
> > On Wed, Aug 14, 2024 at 8:05 PM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2024/8/15 5:00, Alexander H Duyck wrote:
> >
> > ...
> >
> >>>> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
> >>>> +                                     unsigned int fragsz)
> >>>> +{
> >>>> +    nc->pagecnt_bias++;
> >>>> +    nc->remaining += fragsz;
> >>>> +}
> >>>> +
> >>>
> >>> This doesn't add up. Why would you need abort if you have commit? Isn't
> >>> this more of a revert? I wouldn't think that would be valid as it is
> >>> possible you took some sort of action that might have resulted in this
> >>> memory already being shared. We shouldn't allow rewinding the offset
> >>> pointer without knowing that there are no other entities sharing the
> >>> page.
> >>
> >> This is used for __tun_build_skb() in drivers/net/tun.c as below, mainly
> >> used to avoid performance penalty for XDP drop case:
> >
> > Yeah, I reviewed that patch. As I said there, rewinding the offset
> > should be avoided unless you can verify you are the only owner of the
> > page as you have no guarantees that somebody else didn't take an
> > access to the page/data to send it off somewhere else. Once you expose
> > the page to any other entity it should be written off or committed in
> > your case and you should move on to the next block.
>
> Yes, the expectation is that somebody else didn't take an access to the
> page/data to send it off somewhere else between page_frag_alloc_va()
> and page_frag_alloc_abort(), did you see expectation was broken in that
> patch? If yes, we should fix that by using page_frag_free_va() related
> API instead of using page_frag_alloc_abort().

The problem is that when you expose it to XDP, there are a number of
different paths it can take. As such you shouldn't be expecting XDP
not to do something like that. Basically you have to check the
reference count before you can rewind the page.
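
In other words, something along these lines (a sketch only; the exact
check depends on this patch's bias accounting, where one reference is
implicitly accounted to the outstanding fragment):

	/* rewind only while the caller's fragment is the sole user */
	if (page_ref_count(page) == nc->pagecnt_bias + 1)
		page_frag_alloc_abort(nc, fragsz);
	/* otherwise leave the allocation committed and move on */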

> >
> >
> >>
> >>>> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
> >>>> +                                         gfp_t gfp_mask)
> >>>>  {
> >>>> +    struct page *page;
> >>>> +
> >>>>      if (likely(nc->encoded_va)) {
> >>>> -            if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
> >>>> +            page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
> >>>> +            if (page)
> >>>>                      goto out;
> >>>>      }
> >>>>
> >>>> -    if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> >>>> -            return false;
> >>>> +    page = __page_frag_cache_refill(nc, gfp_mask);
> >>>> +    if (unlikely(!page))
> >>>> +            return NULL;
> >>>>
> >>>>  out:
> >>>>      /* reset page count bias and remaining to start of new frag */
> >>>>      nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> >>>>      nc->remaining = page_frag_cache_page_size(nc->encoded_va);
> >>>> -    return true;
> >>>> +    return page;
> >>>> +}
> >>>> +
> >>>
> >>> None of the functions above need to be returning page.
> >>
> >> Are you still suggesting to always use virt_to_page() even when it is
> >> not really necessary? why not return the page here to avoid the
> >> virt_to_page()?
> >
> > Yes. The likelihood of you needing to pass this out as a page should
> > be low as most cases will just be you using the virtual address
> > anyway. You are essentially trading off branching for not having to
> > use virt_to_page. It is unnecessary optimization.
>
> As my understanding, I am not trading off branching for not having to
> use virt_to_page, the branching is already needed no matter we utilize
> it to avoid calling virt_to_page() or not, please be more specific about
> which branching is traded off for not having to use virt_to_page() here.

The virt_to_page overhead isn't that high. It would be better to just
use a consistent path rather than try to optimize for an unlikely
branch in your datapath.

> >
> >
> >>
> >>>> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> >>>> +                            unsigned int *offset, unsigned int fragsz,
> >>>> +                            gfp_t gfp)
> >>>> +{
> >>>> +    unsigned int remaining = nc->remaining;
> >>>> +    struct page *page;
> >>>> +
> >>>> +    VM_BUG_ON(!fragsz);
> >>>> +    if (likely(remaining >= fragsz)) {
> >>>> +            unsigned long encoded_va = nc->encoded_va;
> >>>> +
> >>>> +            *offset = page_frag_cache_page_size(encoded_va) -
> >>>> +                            remaining;
> >>>> +
> >>>> +            return virt_to_page((void *)encoded_va);
> >>>> +    }
> >>>> +
> >>>> +    if (unlikely(fragsz > PAGE_SIZE))
> >>>> +            return NULL;
> >>>> +
> >>>> +    page = __page_frag_cache_reload(nc, gfp);
> >>>> +    if (unlikely(!page))
> >>>> +            return NULL;
> >>>> +
> >>>> +    *offset = 0;
> >>>> +    nc->remaining = remaining - fragsz;
> >>>> +    nc->pagecnt_bias--;
> >>>> +
> >>>> +    return page;
> >>>>  }
> >>>> +EXPORT_SYMBOL(page_frag_alloc_pg);
> >>>
> >>> Again, this isn't returning a page. It is essentially returning a
> >>> bio_vec without calling it as such. You might as well pass the bio_vec
> >>> pointer as an argument and just have it populate it directly.
> >>
> >> I really don't think your bio_vec suggestion make much sense  for now as
> >> the reason mentioned in below:
> >>
> >> "Through a quick look, there seems to be at least three structs which have
> >> similar values: struct bio_vec & struct skb_frag & struct page_frag.
> >>
> >> As your above agrument about using bio_vec, it seems it is ok to use any
> >> one of them as each one of them seems to have almost all the values we
> >> are using?
> >>
> >> Personally, my preference over them: 'struct page_frag' > 'struct skb_frag'
> >>> 'struct bio_vec', as the naming of 'struct page_frag' seems to best match
> >> the page_frag API, 'struct skb_frag' is the second preference because we
> >> mostly need to fill skb frag anyway, and 'struct bio_vec' is the last
> >> preference because it just happen to have almost all the values needed.
> >
> > That is why I said I would be okay with us passing page_frag in patch
> > 12 after looking closer at the code. The fact is it should make the
> > review of that patch set much easier if you essentially just pass the
> > page_frag back out of the call. Then it could be used in exactly the
> > same way it was before and should reduce the total number of lines of
> > code that need to be changed.
>
> So the your suggestion changed to something like below?
>
> int page_frag_alloc_pfrag(struct page_frag_cache *nc, struct page_frag *pfrag);
>
> The API naming of 'page_frag_alloc_pfrag' seems a little odd to me, any better
> one in your mind?

Well at this point we are populating/getting/pulling a page frag from
the page frag cache. Maybe look for a word other than alloc, such as
populate. Essentially what you are doing is populating the pfrag from
the frag cache, although I thought there was a size value you passed
for that, isn't there?
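
Something along the lines of the below, say (naming and signature
purely illustrative):

	/* populate pfrag from the cache, ensuring at least fragsz
	 * bytes; pfrag->size reports what is actually available
	 */
	bool page_frag_populate_pfrag(struct page_frag_cache *nc,
				      struct page_frag *pfrag,
				      unsigned int fragsz);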
Yunsheng Lin Aug. 20, 2024, 1:08 p.m. UTC | #6
On 2024/8/19 23:52, Alexander Duyck wrote:

>>
>> Yes, the expectation is that somebody else didn't take an access to the
>> page/data to send it off somewhere else between page_frag_alloc_va()
>> and page_frag_alloc_abort(), did you see expectation was broken in that
>> patch? If yes, we should fix that by using page_frag_free_va() related
>> API instead of using page_frag_alloc_abort().
> 
> The problem is when you expose it to XDP there are a number of
> different paths it can take. As such you shouldn't be expecting XDP to
> not do something like that. Basically you have to check the reference

Even if XDP operations like xdp_do_redirect() or tun_xdp_xmit() return
failure, we still cannot do that? It seems odd if that happens.
If not, can we use page_frag_alloc_abort() with fragsz being zero to
avoid the atomic operation?

> count before you can rewind the page.
> 
>>>
>>>
>>>>
>>>>>> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
>>>>>> +                                         gfp_t gfp_mask)
>>>>>>  {
>>>>>> +    struct page *page;
>>>>>> +
>>>>>>      if (likely(nc->encoded_va)) {
>>>>>> -            if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
>>>>>> +            page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
>>>>>> +            if (page)
>>>>>>                      goto out;
>>>>>>      }
>>>>>>
>>>>>> -    if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>>>>>> -            return false;
>>>>>> +    page = __page_frag_cache_refill(nc, gfp_mask);
>>>>>> +    if (unlikely(!page))
>>>>>> +            return NULL;
>>>>>>
>>>>>>  out:
>>>>>>      /* reset page count bias and remaining to start of new frag */
>>>>>>      nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
>>>>>>      nc->remaining = page_frag_cache_page_size(nc->encoded_va);
>>>>>> -    return true;
>>>>>> +    return page;
>>>>>> +}
>>>>>> +
>>>>>
>>>>> None of the functions above need to be returning page.
>>>>
>>>> Are you still suggesting to always use virt_to_page() even when it is
>>>> not really necessary? why not return the page here to avoid the
>>>> virt_to_page()?
>>>
>>> Yes. The likelihood of you needing to pass this out as a page should
>>> be low as most cases will just be you using the virtual address
>>> anyway. You are essentially trading off branching for not having to
>>> use virt_to_page. It is unnecessary optimization.
>>
>> As my understanding, I am not trading off branching for not having to
>> use virt_to_page, the branching is already needed no matter we utilize
>> it to avoid calling virt_to_page() or not, please be more specific about
>> which branching is traded off for not having to use virt_to_page() here.
> 
> The virt_to_page overhead isn't that high. It would be better to just
> use a consistent path rather than try to optimize for an unlikely
> branch in your datapath.

I am not sure I understand what you mean by 'consistent path' here.
If I understand your comment correctly, the path is already not
consistent, in order to avoid having to fetch the size multiple times
in multiple ways, as mentioned in [1]. As below, doesn't it seem
natural to avoid the virt_to_page() call while also avoiding the
page_frag_cache_page_size() call, even if it is an unlikely case as
you mentioned:

struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
                                unsigned int *offset, unsigned int fragsz,
                                gfp_t gfp)
{
        unsigned int remaining = nc->remaining;
        struct page *page;

        VM_BUG_ON(!fragsz);
        if (likely(remaining >= fragsz)) {
                unsigned long encoded_va = nc->encoded_va;

                *offset = page_frag_cache_page_size(encoded_va) -
                                remaining;

                return virt_to_page((void *)encoded_va);
        }

        if (unlikely(fragsz > PAGE_SIZE))
                return NULL;

        page = __page_frag_cache_reload(nc, gfp);
        if (unlikely(!page))
                return NULL;

        *offset = 0;
        nc->remaining -= fragsz;
        nc->pagecnt_bias--;

        return page;
}

1. https://lore.kernel.org/all/CAKgT0UeQ9gwYo7qttak0UgXC9+kunO2gedm_yjtPiMk4VJp9yQ@mail.gmail.com/

> 
>>>
>>>
>>>>
>>>>>> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
>>>>>> +                            unsigned int *offset, unsigned int fragsz,
>>>>>> +                            gfp_t gfp)
>>>>>> +{
>>>>>> +    unsigned int remaining = nc->remaining;
>>>>>> +    struct page *page;
>>>>>> +
>>>>>> +    VM_BUG_ON(!fragsz);
>>>>>> +    if (likely(remaining >= fragsz)) {
>>>>>> +            unsigned long encoded_va = nc->encoded_va;
>>>>>> +
>>>>>> +            *offset = page_frag_cache_page_size(encoded_va) -
>>>>>> +                            remaining;
>>>>>> +
>>>>>> +            return virt_to_page((void *)encoded_va);
>>>>>> +    }
>>>>>> +
>>>>>> +    if (unlikely(fragsz > PAGE_SIZE))
>>>>>> +            return NULL;
>>>>>> +
>>>>>> +    page = __page_frag_cache_reload(nc, gfp);
>>>>>> +    if (unlikely(!page))
>>>>>> +            return NULL;
>>>>>> +
>>>>>> +    *offset = 0;
>>>>>> +    nc->remaining = remaining - fragsz;
>>>>>> +    nc->pagecnt_bias--;
>>>>>> +
>>>>>> +    return page;
>>>>>>  }
>>>>>> +EXPORT_SYMBOL(page_frag_alloc_pg);
>>>>>
>>>>> Again, this isn't returning a page. It is essentially returning a
>>>>> bio_vec without calling it as such. You might as well pass the bio_vec
>>>>> pointer as an argument and just have it populate it directly.
>>>>
>>>> I really don't think your bio_vec suggestion make much sense  for now as
>>>> the reason mentioned in below:
>>>>
>>>> "Through a quick look, there seems to be at least three structs which have
>>>> similar values: struct bio_vec & struct skb_frag & struct page_frag.
>>>>
>>>> As your above agrument about using bio_vec, it seems it is ok to use any
>>>> one of them as each one of them seems to have almost all the values we
>>>> are using?
>>>>
>>>> Personally, my preference over them: 'struct page_frag' > 'struct skb_frag'
>>>>> 'struct bio_vec', as the naming of 'struct page_frag' seems to best match
>>>> the page_frag API, 'struct skb_frag' is the second preference because we
>>>> mostly need to fill skb frag anyway, and 'struct bio_vec' is the last
>>>> preference because it just happen to have almost all the values needed.
>>>
>>> That is why I said I would be okay with us passing page_frag in patch
>>> 12 after looking closer at the code. The fact is it should make the
>>> review of that patch set much easier if you essentially just pass the
>>> page_frag back out of the call. Then it could be used in exactly the
>>> same way it was before and should reduce the total number of lines of
>>> code that need to be changed.
>>
>> So the your suggestion changed to something like below?
>>
>> int page_frag_alloc_pfrag(struct page_frag_cache *nc, struct page_frag *pfrag);
>>
>> The API naming of 'page_frag_alloc_pfrag' seems a little odd to me, any better
>> one in your mind?
> 
> Well at this point we are populating/getting/pulling a page frag from
> the page frag cache. Maybe look for a word other than alloc such as
> populate. Essentially what you are doing is populating the pfrag from
> the frag cache, although I thought there was a size value you passed
> for that isn't there?

'struct page_frag' does have a size field, but I am not sure I
understand what you mean by 'although I thought there was a size
value you passed for that, isn't there?' yet.

Patch

diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index 0abffdd10a1c..ba5d7f8a03cd 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,8 @@ 
 #include <linux/build_bug.h>
 #include <linux/log2.h>
 #include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mmdebug.h>
 #include <linux/mm_types_task.h>
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -67,6 +69,9 @@  static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
 
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
+struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
+				unsigned int *offset, unsigned int fragsz,
+				gfp_t gfp);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
 				 unsigned int fragsz, gfp_t gfp_mask,
 				 unsigned int align_mask);
@@ -79,12 +84,82 @@  static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
 }
 
+static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
+{
+	return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
+}
+
 static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
 				       unsigned int fragsz, gfp_t gfp_mask)
 {
 	return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
 }
 
+void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
+				 gfp_t gfp);
+
+static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
+						     unsigned int *fragsz,
+						     gfp_t gfp,
+						     unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
+	nc->remaining = nc->remaining & -align;
+	return page_frag_alloc_va_prepare(nc, fragsz, gfp);
+}
+
+struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
+					unsigned int *offset,
+					unsigned int *fragsz, gfp_t gfp);
+
+struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
+				     unsigned int *offset,
+				     unsigned int *fragsz,
+				     void **va, gfp_t gfp);
+
+static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
+						 unsigned int *offset,
+						 unsigned int *fragsz,
+						 void **va)
+{
+	unsigned long encoded_va = nc->encoded_va;
+	struct page *page;
+
+	VM_BUG_ON(!*fragsz);
+	if (unlikely(nc->remaining < *fragsz))
+		return NULL;
+
+	*va = encoded_page_address(encoded_va);
+	page = virt_to_page(*va);
+	*fragsz = nc->remaining;
+	*offset = page_frag_cache_page_size(encoded_va) - *fragsz;
+	*va += *offset;
+
+	return page;
+}
+
+static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
+					  unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->remaining || !nc->pagecnt_bias);
+	nc->pagecnt_bias--;
+	nc->remaining -= fragsz;
+}
+
+static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
+						unsigned int fragsz)
+{
+	VM_BUG_ON(fragsz > nc->remaining);
+	nc->remaining -= fragsz;
+}
+
+static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
+					 unsigned int fragsz)
+{
+	nc->pagecnt_bias++;
+	nc->remaining += fragsz;
+}
+
 void page_frag_free_va(void *addr);
 
 #endif
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 27596b84b452..f8fad7d2cca8 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -19,27 +19,27 @@ 
 #include <linux/page_frag_cache.h>
 #include "internal.h"
 
-static bool __page_frag_cache_reuse(unsigned long encoded_va,
-				    unsigned int pagecnt_bias)
+static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
+					    unsigned int pagecnt_bias)
 {
 	struct page *page;
 
 	page = virt_to_page((void *)encoded_va);
 	if (!page_ref_sub_and_test(page, pagecnt_bias))
-		return false;
+		return NULL;
 
 	if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
 		free_unref_page(page, encoded_page_order(encoded_va));
-		return false;
+		return NULL;
 	}
 
 	/* OK, page count is 0, we can safely set it */
 	set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-	return true;
+	return page;
 }
 
-static bool __page_frag_cache_refill(struct page_frag_cache *nc,
-				     gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
 {
 	unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
 	struct page *page = NULL;
@@ -55,7 +55,7 @@  static bool __page_frag_cache_refill(struct page_frag_cache *nc,
 		page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
 		if (unlikely(!page)) {
 			memset(nc, 0, sizeof(*nc));
-			return false;
+			return NULL;
 		}
 
 		order = 0;
@@ -69,29 +69,151 @@  static bool __page_frag_cache_refill(struct page_frag_cache *nc,
 	 */
 	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
-	return true;
+	return page;
 }
 
 /* Reload cache by reusing the old cache if it is possible, or
  * refilling from the page allocator.
  */
-static bool __page_frag_cache_reload(struct page_frag_cache *nc,
-				     gfp_t gfp_mask)
+static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
 {
+	struct page *page;
+
 	if (likely(nc->encoded_va)) {
-		if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
+		page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
+		if (page)
 			goto out;
 	}
 
-	if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
-		return false;
+	page = __page_frag_cache_refill(nc, gfp_mask);
+	if (unlikely(!page))
+		return NULL;
 
 out:
 	/* reset page count bias and remaining to start of new frag */
 	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 	nc->remaining = page_frag_cache_page_size(nc->encoded_va);
-	return true;
+	return page;
+}
+
+void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
+				 unsigned int *fragsz, gfp_t gfp)
+{
+	unsigned int remaining = nc->remaining;
+
+	VM_BUG_ON(!*fragsz);
+	if (likely(remaining >= *fragsz)) {
+		unsigned long encoded_va = nc->encoded_va;
+
+		*fragsz = remaining;
+
+		return encoded_page_address(encoded_va) +
+			(page_frag_cache_page_size(encoded_va) - remaining);
+	}
+
+	if (unlikely(*fragsz > PAGE_SIZE))
+		return NULL;
+
+	/* When reload fails, nc->encoded_va and nc->remaining are both reset
+	 * to zero, so there is no need to check the return value here.
+	 */
+	__page_frag_cache_reload(nc, gfp);
+
+	*fragsz = nc->remaining;
+	return encoded_page_address(nc->encoded_va);
+}
+EXPORT_SYMBOL(page_frag_alloc_va_prepare);
+
+struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
+					unsigned int *offset,
+					unsigned int *fragsz, gfp_t gfp)
+{
+	unsigned int remaining = nc->remaining;
+	struct page *page;
+
+	VM_BUG_ON(!*fragsz);
+	if (likely(remaining >= *fragsz)) {
+		unsigned long encoded_va = nc->encoded_va;
+
+		*offset = page_frag_cache_page_size(encoded_va) - remaining;
+		*fragsz = remaining;
+
+		return virt_to_page((void *)encoded_va);
+	}
+
+	if (unlikely(*fragsz > PAGE_SIZE))
+		return NULL;
+
+	page = __page_frag_cache_reload(nc, gfp);
+	*offset = 0;
+	*fragsz = nc->remaining;
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
+
+struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
+				     unsigned int *offset,
+				     unsigned int *fragsz,
+				     void **va, gfp_t gfp)
+{
+	unsigned int remaining = nc->remaining;
+	struct page *page;
+
+	VM_BUG_ON(!*fragsz);
+	if (likely(remaining >= *fragsz)) {
+		unsigned long encoded_va = nc->encoded_va;
+
+		*offset = page_frag_cache_page_size(encoded_va) - remaining;
+		*va = encoded_page_address(encoded_va) + *offset;
+		*fragsz = remaining;
+
+		return virt_to_page((void *)encoded_va);
+	}
+
+	if (unlikely(*fragsz > PAGE_SIZE))
+		return NULL;
+
+	page = __page_frag_cache_reload(nc, gfp);
+	*offset = 0;
+	*fragsz = nc->remaining;
+	*va = encoded_page_address(nc->encoded_va);
+
+	return page;
+}
+EXPORT_SYMBOL(page_frag_alloc_prepare);
+
+struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
+				unsigned int *offset, unsigned int fragsz,
+				gfp_t gfp)
+{
+	unsigned int remaining = nc->remaining;
+	struct page *page;
+
+	VM_BUG_ON(!fragsz);
+	if (likely(remaining >= fragsz)) {
+		unsigned long encoded_va = nc->encoded_va;
+
+		*offset = page_frag_cache_page_size(encoded_va) -
+				remaining;
+
+		return virt_to_page((void *)encoded_va);
+	}
+
+	if (unlikely(fragsz > PAGE_SIZE))
+		return NULL;
+
+	page = __page_frag_cache_reload(nc, gfp);
+	if (unlikely(!page))
+		return NULL;
+
+	*offset = 0;
+	nc->remaining = remaining - fragsz;
+	nc->pagecnt_bias--;
+
+	return page;
 }
+EXPORT_SYMBOL(page_frag_alloc_pg);
 
 void page_frag_cache_drain(struct page_frag_cache *nc)
 {