diff mbox series

[net-next,v8,11/17] io_uring/zcrx: implement zerocopy receive pp memory provider

Message ID 20241204172204.4180482-12-dw@davidwei.uk (mailing list archive)
State New
Headers show
Series io_uring zero copy rx | expand

Commit Message

David Wei Dec. 4, 2024, 5:21 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

Implement a page pool memory provider for io_uring to receieve in a
zero copy fashion. For that, the provider allocates user pages wrapped
around into struct net_iovs, that are stored in a previously registered
struct net_iov_area.

Unlike with traditional receives, for which pages from a page pool can
be deallocated right after the user receives data, e.g. via recv(2),
we extend the lifetime by recycling buffers only after the user space
acknowledges that it's done processing the data via the refill queue.
Before handing buffers to the user, we mark them by bumping the refcount
by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
returned back. When the corresponding io_uring instance and/or page pool
are destroyed, we'll force back all buffers that are currently in the
user space in ->io_pp_zc_scrub by clearing the bias.

Refcounting and lifetime:

Initially, all buffers are considered unallocated and stored in
->freelist, at which point they are not yet directly exposed to the core
page pool code and not accounted to page pool's pages_state_hold_cnt.
The ->alloc_netmems callback will allocate them by placing into the
page pool's cache, setting the refcount to 1 as usual and adjusting
pages_state_hold_cnt.

Then, either the buffer is dropped and returns back to the page pool
into the ->freelist via io_pp_zc_release_netmem, in which case the page
pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
likely the buffer will go through the network/protocol stacks and end up
in the corresponding socket's receive queue. From there the user can get
it via an new io_uring request implemented in following patches. As
mentioned above, before giving a buffer to the user we bump the refcount
by IO_ZC_RX_UREF.

Once the user is done with the buffer processing, it must return it back
via the refill queue, from where our ->alloc_netmems implementation can
grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
there are no more users left. As we place such buffers right back into
the page pools fast cache and they didn't go through the normal pp
release path, they are still considered "allocated" and no pp hold_cnt
is required. For the same reason we dma sync buffers for the device
in io_zc_add_pp_cache().

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 io_uring/zcrx.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++
 io_uring/zcrx.h |   5 ++
 2 files changed, 220 insertions(+)

Comments

Jakub Kicinski Dec. 10, 2024, 4:01 a.m. UTC | #1
On Wed,  4 Dec 2024 09:21:50 -0800 David Wei wrote:
> Then, either the buffer is dropped and returns back to the page pool
> into the ->freelist via io_pp_zc_release_netmem, in which case the page
> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
> likely the buffer will go through the network/protocol stacks and end up
> in the corresponding socket's receive queue. From there the user can get
> it via an new io_uring request implemented in following patches. As
> mentioned above, before giving a buffer to the user we bump the refcount
> by IO_ZC_RX_UREF.
> 
> Once the user is done with the buffer processing, it must return it back
> via the refill queue, from where our ->alloc_netmems implementation can
> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> there are no more users left. As we place such buffers right back into
> the page pools fast cache and they didn't go through the normal pp
> release path, they are still considered "allocated" and no pp hold_cnt
> is required. For the same reason we dma sync buffers for the device
> in io_zc_add_pp_cache().

Can you say more about the IO_ZC_RX_UREF bias? net_iov is not the page
struct, we can add more fields. In fact we have 8B of padding in it
that can be allocated without growing the struct. So why play with
biases? You can add a 32b atomic counter for how many refs have been
handed out to the user.
Pavel Begunkov Dec. 10, 2024, 4:45 a.m. UTC | #2
On 12/10/24 04:01, Jakub Kicinski wrote:
> On Wed,  4 Dec 2024 09:21:50 -0800 David Wei wrote:
>> Then, either the buffer is dropped and returns back to the page pool
>> into the ->freelist via io_pp_zc_release_netmem, in which case the page
>> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
>> likely the buffer will go through the network/protocol stacks and end up
>> in the corresponding socket's receive queue. From there the user can get
>> it via an new io_uring request implemented in following patches. As
>> mentioned above, before giving a buffer to the user we bump the refcount
>> by IO_ZC_RX_UREF.
>>
>> Once the user is done with the buffer processing, it must return it back
>> via the refill queue, from where our ->alloc_netmems implementation can
>> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
>> there are no more users left. As we place such buffers right back into
>> the page pools fast cache and they didn't go through the normal pp
>> release path, they are still considered "allocated" and no pp hold_cnt
>> is required. For the same reason we dma sync buffers for the device
>> in io_zc_add_pp_cache().
> 
> Can you say more about the IO_ZC_RX_UREF bias? net_iov is not the page
> struct, we can add more fields. In fact we have 8B of padding in it
> that can be allocated without growing the struct. So why play with

I guess we can, though it's growing it for everyone not just
io_uring considering how indexing works, i.e. no embedding into
a larger struct.

> biases? You can add a 32b atomic counter for how many refs have been
> handed out to the user.

This set does it in a stupid way, but the bias allows to coalesce
operations with it into a single atomic. Regardless, it can be
placed separately, though we still need a good way to optimise
counting. Take a look at my reply with questions in the v7 thread,
I outlined what can work quite well in terms of performance but
needs a clear api for that from net/
Pavel Begunkov Dec. 10, 2024, 4:50 a.m. UTC | #3
On 12/10/24 04:45, Pavel Begunkov wrote:
> On 12/10/24 04:01, Jakub Kicinski wrote:
>> On Wed,  4 Dec 2024 09:21:50 -0800 David Wei wrote:
>>> Then, either the buffer is dropped and returns back to the page pool
>>> into the ->freelist via io_pp_zc_release_netmem, in which case the page
>>> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
>>> likely the buffer will go through the network/protocol stacks and end up
>>> in the corresponding socket's receive queue. From there the user can get
>>> it via an new io_uring request implemented in following patches. As
>>> mentioned above, before giving a buffer to the user we bump the refcount
>>> by IO_ZC_RX_UREF.
>>>
>>> Once the user is done with the buffer processing, it must return it back
>>> via the refill queue, from where our ->alloc_netmems implementation can
>>> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
>>> there are no more users left. As we place such buffers right back into
>>> the page pools fast cache and they didn't go through the normal pp
>>> release path, they are still considered "allocated" and no pp hold_cnt
>>> is required. For the same reason we dma sync buffers for the device
>>> in io_zc_add_pp_cache().
>>
>> Can you say more about the IO_ZC_RX_UREF bias? net_iov is not the page
>> struct, we can add more fields. In fact we have 8B of padding in it
>> that can be allocated without growing the struct. So why play with
> 
> I guess we can, though it's growing it for everyone not just
> io_uring considering how indexing works, i.e. no embedding into
> a larger struct.
> 
>> biases? You can add a 32b atomic counter for how many refs have been
>> handed out to the user.
> 
> This set does it in a stupid way, but the bias allows to coalesce
> operations with it into a single atomic. Regardless, it can be
> placed separately, though we still need a good way to optimise
> counting. Take a look at my reply with questions in the v7 thread,
> I outlined what can work quite well in terms of performance but
> needs a clear api for that from net/

FWIW, I tried it and placed user refs into a separate array.
Without optimisations it'll be additional atomics + cache
bouncing, which is not great, but if we can somehow reuse the
frag ref as in replies to v7, that might work even better than
with the bias. Devmem might reuse that as well.
Mina Almasry Dec. 10, 2024, 4:55 p.m. UTC | #4
On Mon, Dec 9, 2024 at 8:01 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed,  4 Dec 2024 09:21:50 -0800 David Wei wrote:
> > Then, either the buffer is dropped and returns back to the page pool
> > into the ->freelist via io_pp_zc_release_netmem, in which case the page
> > pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
> > likely the buffer will go through the network/protocol stacks and end up
> > in the corresponding socket's receive queue. From there the user can get
> > it via an new io_uring request implemented in following patches. As
> > mentioned above, before giving a buffer to the user we bump the refcount
> > by IO_ZC_RX_UREF.
> >
> > Once the user is done with the buffer processing, it must return it back
> > via the refill queue, from where our ->alloc_netmems implementation can
> > grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> > there are no more users left. As we place such buffers right back into
> > the page pools fast cache and they didn't go through the normal pp
> > release path, they are still considered "allocated" and no pp hold_cnt
> > is required. For the same reason we dma sync buffers for the device
> > in io_zc_add_pp_cache().
>
> Can you say more about the IO_ZC_RX_UREF bias? net_iov is not the page
> struct, we can add more fields. In fact we have 8B of padding in it
> that can be allocated without growing the struct. So why play with
> biases? You can add a 32b atomic counter for how many refs have been
> handed out to the user.

Great idea IMO. I would prefer niov->pp_frag_ref to remain reserved
for pp refs used by dereferencing paths shared with pages and devmem
like napi_pp_put_page. Using an empty field in net_iov would alleviate
that concern.

I think I suggested something similar on v7, although maybe I
suggested putting it in an io_uring specific struct that hangs off the
net_iov to keep anything memory type specific outside of net_iov, but
a new field in net_iov is fine IMO.
Jakub Kicinski Dec. 11, 2024, 12:24 a.m. UTC | #5
On Tue, 10 Dec 2024 04:45:23 +0000 Pavel Begunkov wrote:
> > Can you say more about the IO_ZC_RX_UREF bias? net_iov is not the page
> > struct, we can add more fields. In fact we have 8B of padding in it
> > that can be allocated without growing the struct. So why play with  
> 
> I guess we can, though it's growing it for everyone not just
> io_uring considering how indexing works, i.e. no embedding into
> a larger struct.

Right but we literally have 8B of "padding". We only need 32b counter
here, so there will still be 4B of unused space. Not to mention that
net_iov is not cacheline aligned today. Space is not a concern.

> > biases? You can add a 32b atomic counter for how many refs have been
> > handed out to the user.  
> 
> This set does it in a stupid way, but the bias allows to coalesce
> operations with it into a single atomic. Regardless, it can be
> placed separately, though we still need a good way to optimise
> counting. Take a look at my reply with questions in the v7 thread,
> I outlined what can work quite well in terms of performance but
> needs a clear api for that from net/

I was thinking along the lines of transferring the ownership of
the frags. But let's work on that as a follow up. Atomic add on 
an exclusively owned cacheline is 2 cycles on AMD if I'm looking
correctly.
Pavel Begunkov Dec. 11, 2024, 2:42 p.m. UTC | #6
On 12/11/24 00:24, Jakub Kicinski wrote:
> On Tue, 10 Dec 2024 04:45:23 +0000 Pavel Begunkov wrote:
>>> Can you say more about the IO_ZC_RX_UREF bias? net_iov is not the page
>>> struct, we can add more fields. In fact we have 8B of padding in it
>>> that can be allocated without growing the struct. So why play with
>>
>> I guess we can, though it's growing it for everyone not just
>> io_uring considering how indexing works, i.e. no embedding into
>> a larger struct.
> 
> Right but we literally have 8B of "padding". We only need 32b counter
> here, so there will still be 4B of unused space. Not to mention that
> net_iov is not cacheline aligned today. Space is not a concern.
> 
>>> biases? You can add a 32b atomic counter for how many refs have been
>>> handed out to the user.
>>
>> This set does it in a stupid way, but the bias allows to coalesce
>> operations with it into a single atomic. Regardless, it can be
>> placed separately, though we still need a good way to optimise
>> counting. Take a look at my reply with questions in the v7 thread,
>> I outlined what can work quite well in terms of performance but
>> needs a clear api for that from net/
> 
> I was thinking along the lines of transferring the ownership of
> the frags. But let's work on that as a follow up. Atomic add on

That's fine to leave it out for now and deal later, but what's
important for me when going through preliminary shittification of
the project is to have a way to optimise it after and a clear
understanding that it can't be left w/o it, and that there are
no strong opinions that would block it.

The current cache situation is too unfortunate, understandably so
with it being aliased to struct page. pp_ref_count is in the
same line with ->pp and others. Here an iov usually gets modified
by napi, then refcounted from syscall, after deferred skb put will
put it down back at napi context, and in some time after it gets
washed out from the cache, the user will finally return it back
to page pool.

> an exclusively owned cacheline is 2 cycles on AMD if I'm looking
> correctly.

Sounds too good to be true considering x86 implies a full barrier
for atomics. I wonder where the data comes from?
Jakub Kicinski Dec. 12, 2024, 1:38 a.m. UTC | #7
On Wed, 11 Dec 2024 14:42:43 +0000 Pavel Begunkov wrote:
> > I was thinking along the lines of transferring the ownership of
> > the frags. But let's work on that as a follow up. Atomic add on  
> 
> That's fine to leave it out for now and deal later, but what's
> important for me when going through preliminary shittification of
> the project is to have a way to optimise it after and a clear
> understanding that it can't be left w/o it, and that there are
> no strong opinions that would block it.
> 
> The current cache situation is too unfortunate, understandably so
> with it being aliased to struct page. pp_ref_count is in the
> same line with ->pp and others. Here an iov usually gets modified
> by napi, then refcounted from syscall, after deferred skb put will
> put it down back at napi context, and in some time after it gets
> washed out from the cache, the user will finally return it back
> to page pool.

Let's not get distracted. It's very unusual to have arguments about
microoptimizations before the initial version of the code is merged :|

> > an exclusively owned cacheline is 2 cycles on AMD if I'm looking
> > correctly.  
> 
> Sounds too good to be true considering x86 implies a full barrier
> for atomics.

Right but two barriers back to back are hopefully similar impact as one.

> I wonder where the data comes from?

Agner's instruction tables. What source do you use?
Pavel Begunkov Dec. 12, 2024, 1:42 p.m. UTC | #8
On 12/12/24 01:38, Jakub Kicinski wrote:
> On Wed, 11 Dec 2024 14:42:43 +0000 Pavel Begunkov wrote:
>>> I was thinking along the lines of transferring the ownership of
>>> the frags. But let's work on that as a follow up. Atomic add on
>>
>> That's fine to leave it out for now and deal later, but what's
>> important for me when going through preliminary shittification of
>> the project is to have a way to optimise it after and a clear
>> understanding that it can't be left w/o it, and that there are
>> no strong opinions that would block it.
>>
>> The current cache situation is too unfortunate, understandably so
>> with it being aliased to struct page. pp_ref_count is in the
>> same line with ->pp and others. Here an iov usually gets modified
>> by napi, then refcounted from syscall, after deferred skb put will
>> put it down back at napi context, and in some time after it gets
>> washed out from the cache, the user will finally return it back
>> to page pool.
> 
> Let's not get distracted. It's very unusual to have arguments about
> microoptimizations before the initial version of the code is merged :|

I can't avoid it since one of the goals is to save cpu cycles,
and it's not that micro either, but I hear you.

>>> an exclusively owned cacheline is 2 cycles on AMD if I'm looking
>>> correctly.
>>
>> Sounds too good to be true considering x86 implies a full barrier
>> for atomics.
> 
> Right but two barriers back to back are hopefully similar impact as one.
> 
>> I wonder where the data comes from?
> 
> Agner's instruction tables. What source do you use?

Mostly observational and from scattered hw knowledge. Seems like
the table says that the best case chained latency is ~8 cycles
for zen4, pretty good!
diff mbox series

Patch

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 8f838add94a4..7919f5e52c73 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -2,7 +2,12 @@ 
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
+#include <linux/netdevice.h>
 #include <linux/io_uring.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -14,6 +19,16 @@ 
 
 #define IO_RQ_MAX_ENTRIES		32768
 
+__maybe_unused
+static const struct memory_provider_ops io_uring_pp_zc_ops;
+
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
 				 struct io_uring_region_desc *rd)
@@ -104,6 +119,9 @@  static int io_zcrx_create_area(struct io_ring_ctx *ctx,
 		goto err;
 
 	for (i = 0; i < nr_pages; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+
+		niov->owner = &area->nia;
 		area->freelist[i] = i;
 	}
 
@@ -238,3 +256,200 @@  void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 }
+
+static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
+{
+	return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
+}
+
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+	if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
+}
+
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct net_iov *niov)
+{
+}
+
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
+}
+
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+						 unsigned mask)
+{
+	unsigned int idx = ifq->cached_rq_head++ & mask;
+
+	return &ifq->rqes[idx];
+}
+
+static void io_zcrx_ring_refill(struct page_pool *pp,
+				struct io_zcrx_ifq *ifq)
+{
+	unsigned int entries = io_zcrx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+
+	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+	if (unlikely(!entries))
+		return;
+
+	do {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+		struct io_zcrx_area *area;
+		struct net_iov *niov;
+		unsigned niov_idx, area_idx;
+
+		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) / PAGE_SIZE;
+
+		if (unlikely(rqe->__pad || area_idx))
+			continue;
+		area = ifq->area;
+
+		if (unlikely(niov_idx >= area->nia.num_niovs))
+			continue;
+		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+		niov = &area->nia.niovs[niov_idx];
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
+	} while (--entries);
+
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+
+	spin_lock_bh(&area->freelist_lock);
+	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct net_iov *niov;
+		u32 pgid;
+
+		pgid = area->freelist[--area->free_count];
+		niov = &area->nia.niovs[pgid];
+
+		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
+
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, net_iov_to_netmem(niov),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_recycle_niov(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+
+	if (io_zcrx_niov_put(niov, 1))
+		io_zcrx_recycle_niov(niov);
+	return false;
+}
+
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int count;
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		io_zcrx_recycle_niov(niov);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, net_iov_to_netmem(niov), count);
+	}
+}
+
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int ret;
+
+	if (!ifq)
+		return -EINVAL;
+	if (pp->p.order != 0)
+		return -EINVAL;
+	if (!pp->p.napi)
+		return -EINVAL;
+
+	ret = page_pool_mp_init_paged_area(pp, &area->nia, area->pages);
+	if (ret)
+		return ret;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+
+	page_pool_mp_release_area(pp, &ifq->area->nia);
+
+	ifq->pp = NULL;
+
+	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+static const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_netmems		= io_pp_zc_alloc_netmems,
+	.release_netmem		= io_pp_zc_release_netmem,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.scrub			= io_pp_zc_scrub,
+};
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 07742c0cfcf3..8515cde78a2c 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -5,6 +5,9 @@ 
 #include <linux/io_uring_types.h>
 #include <net/page_pool/types.h>
 
+#define IO_ZC_RX_UREF			0x10000
+#define IO_ZC_RX_KREF_MASK		(IO_ZC_RX_UREF - 1)
+
 struct io_zcrx_area {
 	struct net_iov_area	nia;
 	struct io_zcrx_ifq	*ifq;
@@ -22,10 +25,12 @@  struct io_zcrx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
 	struct io_zcrx_area		*area;
+	struct page_pool		*pp;
 
 	struct io_uring			*rq_ring;
 	struct io_uring_zcrx_rqe 	*rqes;
 	u32				rq_entries;
+	u32				cached_rq_head;
 
 	u32				if_rxq;