[net-next,v3,09/18] page_pool: allow mixing PPs within one bulk

Message ID 20241030165201.442301-10-aleksander.lobakin@intel.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series: xdp: a fistful of generic changes (+libeth_xdp)

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7 this patch: 7
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 3 maintainers not CCed: horms@kernel.org hawk@kernel.org ilias.apalodimas@linaro.org
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 249 this patch: 249
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 189 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0
netdev/contest warning net-next-2024-10-31--21-00 (tests: 779)

Commit Message

Alexander Lobakin Oct. 30, 2024, 4:51 p.m. UTC
The main reason for this change was to allow mixing pages from different
&page_pools within one &xdp_buff/&xdp_frame. Why not?
Adjust xdp_return_frame_bulk() and page_pool_put_page_bulk() so that
they are no longer tied to a particular pool. Let the latter create a
separate bulk of pages whose PP differs and flush it recursively.
This greatly optimizes xdp_return_frame_bulk(): no more hashtable
lookups. Also make xdp_flush_frame_bulk() inline, as it's just one if +
one function call + one u32 read, not worth extending the call ladder.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 include/net/page_pool/types.h |  7 +++--
 include/net/xdp.h             | 16 ++++++++---
 net/core/page_pool.c          | 50 ++++++++++++++++++++++++++++++-----
 net/core/xdp.c                | 29 +-------------------
 4 files changed, 59 insertions(+), 43 deletions(-)
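
A minimal caller-side sketch of what the reworked API enables (not part
of the patch; the function name and driver context are hypothetical):
frames backed by different page_pools can now go through a single
on-stack bulk, with the per-pool grouping handled inside
page_pool_put_page_bulk().

/* Hypothetical driver Tx-completion path, illustrative only.
 * Needs <net/xdp.h>, which now pulls in <net/page_pool/types.h>.
 */
static void example_xdp_tx_complete(struct xdp_frame **frames, u32 n)
{
	struct xdp_frame_bulk bq;
	u32 i;

	xdp_frame_bulk_init(&bq);

	rcu_read_lock();	/* xdp_return_frame_bulk() requires RCU */

	/* Frames may be backed by any number of different page_pools */
	for (i = 0; i < n; i++)
		xdp_return_frame_bulk(frames[i], &bq);

	/* Now an inline: one branch + one call if the bulk is non-empty */
	xdp_flush_frame_bulk(&bq);

	rcu_read_unlock();
}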

Comments

Toke Høiland-Jørgensen Nov. 1, 2024, 1:09 p.m. UTC | #1
Alexander Lobakin <aleksander.lobakin@intel.com> writes:

> The main reason for this change was to allow mixing pages from different
> &page_pools within one &xdp_buff/&xdp_frame. Why not?
> Adjust xdp_return_frame_bulk() and page_pool_put_page_bulk(), so that
> they won't be tied to a particular pool. Let the latter create a
> separate bulk of pages which's PP is different and flush it recursively.
> This greatly optimizes xdp_return_frame_bulk(): no more hashtable
> lookups. Also make xdp_flush_frame_bulk() inline, as it's just one if +
> function call + one u32 read, not worth extending the call ladder.
>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Neat idea, but one comment, see below:

[...]

>  /**
>   * page_pool_put_page_bulk() - release references on multiple pages
> - * @pool:	pool from which pages were allocated
>   * @data:	array holding page pointers
>   * @count:	number of pages in @data
> + * @rec:	whether it's called recursively by itself
>   *
>   * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
>   * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
> @@ -854,21 +865,43 @@ EXPORT_SYMBOL(page_pool_put_unrefed_page);
>   * Please note the caller must not use data area after running
>   * page_pool_put_page_bulk(), as this function overwrites it.
>   */
> -void page_pool_put_page_bulk(struct page_pool *pool, struct page **data,
> -			     u32 count)
> +void page_pool_put_page_bulk(struct page **data, u32 count, bool rec)
>  {
> +	struct page_pool *pool = NULL;
> +	struct xdp_frame_bulk sub;
>  	int i, bulk_len = 0;
>  	bool allow_direct;
>  	bool in_softirq;
>  
> -	allow_direct = page_pool_napi_local(pool);
> +	xdp_frame_bulk_init(&sub);
>  
>  	for (i = 0; i < count; i++) {
> -		netmem_ref netmem = page_to_netmem(compound_head(data[i]));
> +		struct page *page;
> +		netmem_ref netmem;
> +
> +		if (!rec) {
> +			page = compound_head(data[i]);
> +			netmem = page_to_netmem(page);
>  
> -		/* It is not the last user for the page frag case */
> -		if (!page_pool_is_last_ref(netmem))
> +			/* It is not the last user for the page frag case */
> +			if (!page_pool_is_last_ref(netmem))
> +				continue;
> +		} else {
> +			page = data[i];
> +			netmem = page_to_netmem(page);
> +		}
> +
> +		if (unlikely(!pool)) {
> +			pool = page->pp;
> +			allow_direct = page_pool_napi_local(pool);
> +		} else if (page->pp != pool) {
> +			/*
> +			 * If the page belongs to a different page_pool, save
> +			 * it to handle recursively after the main loop.
> +			 */
> +			page_pool_bulk_rec_add(&sub, page);
>  			continue;
> +		}
>  
>  		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
>  		/* Approved for bulk recycling in ptr_ring cache */
> @@ -876,6 +909,9 @@ void page_pool_put_page_bulk(struct page_pool *pool, struct page **data,
>  			data[bulk_len++] = (__force void *)netmem;
>  	}
>  
> +	if (sub.count)
> +		page_pool_put_page_bulk(sub.q, sub.count, true);
> +

In the worst case here, this function can recursively call itself
XDP_BULK_QUEUE_SIZE (=16) times, which will blow through ~2.5k of stack
and add a lot of function call overhead. I'm not saying this level of
recursion is likely to happen today, but who knows about future uses? So
why not make it iterative instead of recursive (same basic idea, but
with some kind of 'goto begin', or a loop, instead of the recursive call)?
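
A rough back-of-the-envelope for that ~2.5k figure (assuming a 64-bit
build and the on-stack xdp_frame_bulk from this patch):

/* Per recursion level, approximately:
 *
 *   struct xdp_frame_bulk sub:   4 B count + 4 B padding
 *                              128 B q[XDP_BULK_QUEUE_SIZE] pointers
 *   pool, allow_direct, in_softirq, i, bulk_len + frame overhead: ~24 B
 *
 * ~160 B per level * 16 nested calls ~= 2.5 KiB of stack.
 */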

Something like:


void page_pool_put_page_bulk(struct page **data, u32 count)
{
	struct page *bulk_prod[XDP_BULK_QUEUE_SIZE];
	int page_count = 0, pages_left, bulk_len, i;
	struct page_pool *pool;
	bool allow_direct;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *p = compound_head(data[i]);

		if (page_pool_is_last_ref(page_to_netmem(p)))
			data[page_count++] = p;
	}

begin:
	pool = data[0]->pp;
	allow_direct = page_pool_napi_local(pool);
	pages_left = 0;
	bulk_len = 0;

	for (i = 0; i < page_count; i++) {
		struct page *p = data[i];
		netmem_ref netmem;

		if (unlikely(p->pp != pool)) {
			data[pages_left++] = p;
			continue;
		}

		netmem = __page_pool_put_page(pool, page_to_netmem(p), -1, allow_direct);
		/* Approved for bulk recycling in ptr_ring cache */
		if (netmem)
			bulk_prod[bulk_len++] = (__force void *)netmem;
	}

	if (!bulk_len)
		goto out;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, bulk_prod[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		goto out;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, (__force netmem_ref)bulk_prod[i]);

out:
	if (pages_left) {
		page_count = pages_left;
		goto begin;
	}
}

Personally I also think this is easier to read, and it gets rid of the
'rec' function parameter wart... :)

-Toke
Alexander Lobakin Nov. 4, 2024, 2:32 p.m. UTC | #2
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 01 Nov 2024 14:09:59 +0100

> Alexander Lobakin <aleksander.lobakin@intel.com> writes:
> 
>> The main reason for this change was to allow mixing pages from different
>> &page_pools within one &xdp_buff/&xdp_frame. Why not?
>> Adjust xdp_return_frame_bulk() and page_pool_put_page_bulk(), so that
>> they won't be tied to a particular pool. Let the latter create a
>> separate bulk of pages which's PP is different and flush it recursively.
>> This greatly optimizes xdp_return_frame_bulk(): no more hashtable
>> lookups. Also make xdp_flush_frame_bulk() inline, as it's just one if +
>> function call + one u32 read, not worth extending the call ladder.
>>
>> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> 
> Neat idea, but one comment, see below:

[...]

>> +	if (sub.count)
>> +		page_pool_put_page_bulk(sub.q, sub.count, true);
>> +
> 
> In the worst case here, this function can recursively call itself
> XDP_BULK_QUEUE_SIZE (=16) times. Which will blow ~2.5k of stack size,
> and lots of function call overhead. I'm not saying this level of
> recursion is likely to happen today, but who knows about future uses? So
> why not make it iterative instead of recursive (same basic idea, but
> some kind of 'goto begin', or loop, instead of the recursive call)?

Oh, great idea!
I was also unsure about the recursion here. Initially, I wanted header
split frames, which usually have the linear/header part from one PP and
the frag/payload part from a second PP, to be efficiently recycled in
bulks. Currently, that's not possible, as a bulk will look like
[1, 2, 1, 2, ...], IOW it will be flushed on every frame.
But I realize the recursion is not really optimal here, it was just the
first approach that came to mind. I'll give you Suggested-by here (or
Co-developed-by?); really liked your approach :>

Thanks,
Olek
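
To make the header-split scenario above concrete, a sketch of the page
order hitting the bulk (illustrative only; the pool names are made up):

/* Header-split Rx: the linear part comes from hdr_pool, the payload
 * frag from pay_pool, so pages reach the bulk interleaved:
 *
 *   bq->q = { hdr, pay, hdr, pay, hdr, pay, ... }
 *
 * With the old single-pool bulk, the mem.id check forces a flush on
 * almost every frame, so no real batching happens. With the
 * pool-agnostic bulk, the main loop recycles all hdr_pool pages in one
 * pass and defers the pay_pool pages to a sub-bulk, which is flushed
 * as a second batch.
 */
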
Toke Høiland-Jørgensen Nov. 4, 2024, 4:22 p.m. UTC | #3
Alexander Lobakin <aleksander.lobakin@intel.com> writes:

> From: Toke Høiland-Jørgensen <toke@redhat.com>
> Date: Fri, 01 Nov 2024 14:09:59 +0100
>
>> Alexander Lobakin <aleksander.lobakin@intel.com> writes:
>> 
>>> The main reason for this change was to allow mixing pages from different
>>> &page_pools within one &xdp_buff/&xdp_frame. Why not?
>>> Adjust xdp_return_frame_bulk() and page_pool_put_page_bulk(), so that
>>> they won't be tied to a particular pool. Let the latter create a
>>> separate bulk of pages which's PP is different and flush it recursively.
>>> This greatly optimizes xdp_return_frame_bulk(): no more hashtable
>>> lookups. Also make xdp_flush_frame_bulk() inline, as it's just one if +
>>> function call + one u32 read, not worth extending the call ladder.
>>>
>>> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
>> 
>> Neat idea, but one comment, see below:
>
> [...]
>
>>> +	if (sub.count)
>>> +		page_pool_put_page_bulk(sub.q, sub.count, true);
>>> +
>> 
>> In the worst case here, this function can recursively call itself
>> XDP_BULK_QUEUE_SIZE (=16) times. Which will blow ~2.5k of stack size,
>> and lots of function call overhead. I'm not saying this level of
>> recursion is likely to happen today, but who knows about future uses? So
>> why not make it iterative instead of recursive (same basic idea, but
>> some kind of 'goto begin', or loop, instead of the recursive call)?
>
> Oh, great idea!
> I was also unsure about the recursion here. Initially, I wanted header
> split frames, which usually have linear/header part from one PP and
> frag/payload part from second PP, to be efficiently recycled in bulks.
> Currently, it's not possible, as a bulk will look like [1, 2, 1, 2, ...]
> IOW will be flush every frame.
> But I realize the recursion is not really optimal here, just the first
> that came to my mind. I'll give you Suggested-by here (or
> Co-developed-by?), really liked your approach :>

Sure, co-developed-by SGTM :)

-Toke

Patch

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 6c1be99a5959..9a01ce864afa 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -259,8 +259,7 @@  void page_pool_disable_direct_recycling(struct page_pool *pool);
 void page_pool_destroy(struct page_pool *pool);
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   const struct xdp_mem_info *mem);
-void page_pool_put_page_bulk(struct page_pool *pool, struct page **data,
-			     u32 count);
+void page_pool_put_page_bulk(struct page **data, u32 count, bool rec);
 #else
 static inline void page_pool_destroy(struct page_pool *pool)
 {
@@ -272,8 +271,8 @@  static inline void page_pool_use_xdp_mem(struct page_pool *pool,
 {
 }
 
-static inline void page_pool_put_page_bulk(struct page_pool *pool,
-					   struct page **data, u32 count)
+static inline void page_pool_put_page_bulk(struct page **data, u32 count,
+					   bool rec)
 {
 }
 #endif
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 4416cd4b5086..49f596513435 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -11,6 +11,8 @@ 
 #include <linux/netdevice.h>
 #include <linux/skbuff.h> /* skb_shared_info */
 
+#include <net/page_pool/types.h>
+
 /**
  * DOC: XDP RX-queue information
  *
@@ -193,14 +195,12 @@  xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame)
 #define XDP_BULK_QUEUE_SIZE	16
 struct xdp_frame_bulk {
 	int count;
-	void *xa;
 	struct page *q[XDP_BULK_QUEUE_SIZE];
 };
 
 static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq)
 {
-	/* bq->count will be zero'ed when bq->xa gets updated */
-	bq->xa = NULL;
+	bq->count = 0;
 }
 
 static inline struct skb_shared_info *
@@ -317,10 +317,18 @@  void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq);
 void xdp_return_frame_bulk(struct xdp_frame *xdpf,
 			   struct xdp_frame_bulk *bq);
 
+static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
+{
+	if (unlikely(!bq->count))
+		return;
+
+	page_pool_put_page_bulk(bq->q, bq->count, false);
+	bq->count = 0;
+}
+
 static __always_inline unsigned int
 xdp_get_frame_len(const struct xdp_frame *xdpf)
 {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ad219206ee8d..22b44f86dfa0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -839,11 +839,22 @@  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 }
 EXPORT_SYMBOL(page_pool_put_unrefed_page);
 
+static void page_pool_bulk_rec_add(struct xdp_frame_bulk *bulk,
+				   struct page *page)
+{
+	bulk->q[bulk->count++] = page;
+
+	if (unlikely(bulk->count == ARRAY_SIZE(bulk->q))) {
+		page_pool_put_page_bulk(bulk->q, ARRAY_SIZE(bulk->q), true);
+		bulk->count = 0;
+	}
+}
+
 /**
  * page_pool_put_page_bulk() - release references on multiple pages
- * @pool:	pool from which pages were allocated
  * @data:	array holding page pointers
  * @count:	number of pages in @data
+ * @rec:	whether it's called recursively by itself
  *
  * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
  * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
@@ -854,21 +865,43 @@  EXPORT_SYMBOL(page_pool_put_unrefed_page);
  * Please note the caller must not use data area after running
  * page_pool_put_page_bulk(), as this function overwrites it.
  */
-void page_pool_put_page_bulk(struct page_pool *pool, struct page **data,
-			     u32 count)
+void page_pool_put_page_bulk(struct page **data, u32 count, bool rec)
 {
+	struct page_pool *pool = NULL;
+	struct xdp_frame_bulk sub;
 	int i, bulk_len = 0;
 	bool allow_direct;
 	bool in_softirq;
 
-	allow_direct = page_pool_napi_local(pool);
+	xdp_frame_bulk_init(&sub);
 
 	for (i = 0; i < count; i++) {
-		netmem_ref netmem = page_to_netmem(compound_head(data[i]));
+		struct page *page;
+		netmem_ref netmem;
+
+		if (!rec) {
+			page = compound_head(data[i]);
+			netmem = page_to_netmem(page);
 
-		/* It is not the last user for the page frag case */
-		if (!page_pool_is_last_ref(netmem))
+			/* It is not the last user for the page frag case */
+			if (!page_pool_is_last_ref(netmem))
+				continue;
+		} else {
+			page = data[i];
+			netmem = page_to_netmem(page);
+		}
+
+		if (unlikely(!pool)) {
+			pool = page->pp;
+			allow_direct = page_pool_napi_local(pool);
+		} else if (page->pp != pool) {
+			/*
+			 * If the page belongs to a different page_pool, save
+			 * it to handle recursively after the main loop.
+			 */
+			page_pool_bulk_rec_add(&sub, page);
 			continue;
+		}
 
 		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
 		/* Approved for bulk recycling in ptr_ring cache */
@@ -876,6 +909,9 @@  void page_pool_put_page_bulk(struct page_pool *pool, struct page **data,
 			data[bulk_len++] = (__force void *)netmem;
 	}
 
+	if (sub.count)
+		page_pool_put_page_bulk(sub.q, sub.count, true);
+
 	if (!bulk_len)
 		return;
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 779e646f347b..0fde1bb54192 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -508,46 +508,19 @@  EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
  * xdp_frame_bulk is usually stored/allocated on the function
  * call-stack to avoid locking penalties.
  */
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
-{
-	struct xdp_mem_allocator *xa = bq->xa;
-
-	if (unlikely(!xa || !bq->count))
-		return;
-
-	page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
-	/* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
-	bq->count = 0;
-}
-EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
 
 /* Must be called with rcu_read_lock held */
 void xdp_return_frame_bulk(struct xdp_frame *xdpf,
 			   struct xdp_frame_bulk *bq)
 {
-	struct xdp_mem_info *mem = &xdpf->mem;
-	struct xdp_mem_allocator *xa;
-
-	if (mem->type != MEM_TYPE_PAGE_POOL) {
+	if (xdpf->mem.type != MEM_TYPE_PAGE_POOL) {
 		xdp_return_frame(xdpf);
 		return;
 	}
 
-	xa = bq->xa;
-	if (unlikely(!xa)) {
-		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-		bq->count = 0;
-		bq->xa = xa;
-	}
-
 	if (bq->count == XDP_BULK_QUEUE_SIZE)
 		xdp_flush_frame_bulk(bq);
 
-	if (unlikely(mem->id != xa->mem.id)) {
-		xdp_flush_frame_bulk(bq);
-		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-	}
-
 	if (unlikely(xdp_frame_has_frags(xdpf))) {
 		struct skb_shared_info *sinfo;
 		int i;