diff mbox series

[v7,11/15] io_uring/zcrx: implement zerocopy receive pp memory provider

Message ID 20241029230521.2385749-12-dw@davidwei.uk (mailing list archive)
State New
Headers show
Series io_uring zero copy rx | expand

Commit Message

David Wei Oct. 29, 2024, 11:05 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

Implement a page pool memory provider for io_uring to receieve in a
zero copy fashion. For that, the provider allocates user pages wrapped
around into struct net_iovs, that are stored in a previously registered
struct net_iov_area.

Unlike with traditional receives, for which pages from a page pool can
be deallocated right after the user receives data, e.g. via recv(2),
we extend the lifetime by recycling buffers only after the user space
acknowledges that it's done processing the data via the refill queue.
Before handing buffers to the user, we mark them by bumping the refcount
by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
returned back. When the corresponding io_uring instance and/or page pool
are destroyed, we'll force back all buffers that are currently in the
user space in ->io_pp_zc_scrub by clearing the bias.

Refcounting and lifetime:

Initially, all buffers are considered unallocated and stored in
->freelist, at which point they are not yet directly exposed to the core
page pool code and not accounted to page pool's pages_state_hold_cnt.
The ->alloc_netmems callback will allocate them by placing into the
page pool's cache, setting the refcount to 1 as usual and adjusting
pages_state_hold_cnt.

Then, either the buffer is dropped and returns back to the page pool
into the ->freelist via io_pp_zc_release_netmem, in which case the page
pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
likely the buffer will go through the network/protocol stacks and end up
in the corresponding socket's receive queue. From there the user can get
it via an new io_uring request implemented in following patches. As
mentioned above, before giving a buffer to the user we bump the refcount
by IO_ZC_RX_UREF.

Once the user is done with the buffer processing, it must return it back
via the refill queue, from where our ->alloc_netmems implementation can
grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
there are no more users left. As we place such buffers right back into
the page pools fast cache and they didn't go through the normal pp
release path, they are still considered "allocated" and no pp hold_cnt
is required. For the same reason we dma sync buffers for the device
in io_zc_add_pp_cache().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 io_uring/zcrx.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++
 io_uring/zcrx.h |   5 ++
 2 files changed, 220 insertions(+)

Comments

Mina Almasry Nov. 1, 2024, 8:06 p.m. UTC | #1
On Tue, Oct 29, 2024 at 4:06 PM David Wei <dw@davidwei.uk> wrote:
>
> From: Pavel Begunkov <asml.silence@gmail.com>
>
> Implement a page pool memory provider for io_uring to receieve in a
> zero copy fashion. For that, the provider allocates user pages wrapped
> around into struct net_iovs, that are stored in a previously registered
> struct net_iov_area.
>
> Unlike with traditional receives, for which pages from a page pool can
> be deallocated right after the user receives data, e.g. via recv(2),
> we extend the lifetime by recycling buffers only after the user space
> acknowledges that it's done processing the data via the refill queue.
> Before handing buffers to the user, we mark them by bumping the refcount
> by a bias value IO_ZC_RX_UREF, which will be checked when the buffer is
> returned back. When the corresponding io_uring instance and/or page pool
> are destroyed, we'll force back all buffers that are currently in the
> user space in ->io_pp_zc_scrub by clearing the bias.
>
> Refcounting and lifetime:
>
> Initially, all buffers are considered unallocated and stored in
> ->freelist, at which point they are not yet directly exposed to the core
> page pool code and not accounted to page pool's pages_state_hold_cnt.
> The ->alloc_netmems callback will allocate them by placing into the
> page pool's cache, setting the refcount to 1 as usual and adjusting
> pages_state_hold_cnt.
>
> Then, either the buffer is dropped and returns back to the page pool
> into the ->freelist via io_pp_zc_release_netmem, in which case the page
> pool will match hold_cnt for us with ->pages_state_release_cnt. Or more
> likely the buffer will go through the network/protocol stacks and end up
> in the corresponding socket's receive queue. From there the user can get
> it via an new io_uring request implemented in following patches. As
> mentioned above, before giving a buffer to the user we bump the refcount
> by IO_ZC_RX_UREF.
>
> Once the user is done with the buffer processing, it must return it back
> via the refill queue, from where our ->alloc_netmems implementation can
> grab it, check references, put IO_ZC_RX_UREF, and recycle the buffer if
> there are no more users left. As we place such buffers right back into
> the page pools fast cache and they didn't go through the normal pp
> release path, they are still considered "allocated" and no pp hold_cnt
> is required. For the same reason we dma sync buffers for the device
> in io_zc_add_pp_cache().
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
>  io_uring/zcrx.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++++
>  io_uring/zcrx.h |   5 ++
>  2 files changed, 220 insertions(+)
>
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index a276572fe953..aad35676207e 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -2,7 +2,12 @@
>  #include <linux/kernel.h>
>  #include <linux/errno.h>
>  #include <linux/mm.h>
> +#include <linux/nospec.h>
> +#include <linux/netdevice.h>
>  #include <linux/io_uring.h>
> +#include <net/page_pool/helpers.h>
> +#include <net/page_pool/memory_provider.h>
> +#include <trace/events/page_pool.h>
>
>  #include <uapi/linux/io_uring.h>
>
> @@ -14,6 +19,16 @@
>
>  #define IO_RQ_MAX_ENTRIES              32768
>
> +__maybe_unused
> +static const struct memory_provider_ops io_uring_pp_zc_ops;
> +
> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> +{
> +       struct net_iov_area *owner = net_iov_owner(niov);
> +
> +       return container_of(owner, struct io_zcrx_area, nia);
> +}
> +

We discussed this before I disappeared on vacation but I'm not too
convinced to be honest, sorry.

It's invalid to call io_zcrx_iov_to_area on a devmem niov and vice
versa, right? So current and future code has to be very careful to
call the right helpers on the right niovs.

At the very least there needs to be a comment above all these
container_of helpers:

/* caller must have verified that this niov is devmem/io_zcrx */.

However I feel like even a comment is extremely error prone. These
container_of's are inside of the call stack of some helpers. I would
say we need a check. If we're concerned about performance, the check
can be behind DEBUG_NET_WARN_ON(), although even that is a bit iffy,
but could be fine. Doing this without a check seems too risky to me.

>  static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
>                                  struct io_uring_zcrx_ifq_reg *reg)
>  {
> @@ -99,6 +114,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>                 goto err;
>
>         for (i = 0; i < nr_pages; i++) {
> +               struct net_iov *niov = &area->nia.niovs[i];
> +
> +               niov->owner = &area->nia;
>                 area->freelist[i] = i;
>         }
>
> @@ -230,3 +248,200 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
>  {
>         lockdep_assert_held(&ctx->uring_lock);
>  }
> +
> +static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
> +{
> +       return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
> +}
> +
> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
> +{
> +       if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
> +               return false;
> +
> +       return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
> +}
> +

Sorry, I have to push back a bit against this. The refcounting of
netmem is already complicated. the paged netmem has 2 refcounts and
care needs to be taken when acquiring and dropping refcounts. net_iov
inherited the pp_ref_count but not the paged refcount, and again need
some special handling. skb_frag_unref takes very special care checking
pp->recycle, is_pp_netmem, and others to figure out the correct
refcount to put based on the type of the netmem and skb flag.

This code ignores all these generic code
skb_frag_unref/napi_pp_put_page/etc paths and uses raw access to
niv->pp_ref_count. If this is merged as-is, for posterity any changes
in netmem refcounting need to also account for this use case opting
out of these generic code paths that handle all other skb reffing
including devmem.

Additionally since you're opting out of the generic unreffing paths
you're also (as mentioned before) bypassing the pp recycling. AFAICT
that may be hurting your performance. IIUC you refill
PP_ALLOC_CACHE_REFILL (64) entries everytime _alloc_netmems is
entered, and you don't recycle netmem any other way, so your slow path
is entered 1/64 of the page_pool_alloc calls? That should be much
worse than what the normal pp recycling does, which returns all freed
netmem into its alloc.cache or the ptr_ring and hits *_alloc_netmems
much more rarely. There are also regular perf improvements and testing
to the generic pool recycling paths you're also opting out of.

I see a lot of downsides to opting out of the generic use cases. Is
there any reason the normal freeing paths are not applicable to your
use case?

> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
> +                                     struct net_iov *niov)
> +{
> +}
> +

Looks unused/empty.

> +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
> +{
> +       u32 entries;
> +
> +       entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
> +       return min(entries, ifq->rq_entries);
> +}
> +
> +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
> +                                                unsigned mask)
> +{
> +       unsigned int idx = ifq->cached_rq_head++ & mask;
> +
> +       return &ifq->rqes[idx];
> +}
> +
> +static void io_zcrx_ring_refill(struct page_pool *pp,
> +                               struct io_zcrx_ifq *ifq)
> +{
> +       unsigned int entries = io_zcrx_rqring_entries(ifq);
> +       unsigned int mask = ifq->rq_entries - 1;
> +
> +       entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
> +       if (unlikely(!entries))
> +               return;
> +
> +       do {
> +               struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
> +               struct io_zcrx_area *area;
> +               struct net_iov *niov;
> +               unsigned niov_idx, area_idx;
> +
> +               area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
> +               niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) / PAGE_SIZE;
> +
> +               if (unlikely(rqe->__pad || area_idx))
> +                       continue;
> +               area = ifq->area;
> +
> +               if (unlikely(niov_idx >= area->nia.num_niovs))
> +                       continue;
> +               niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
> +
> +               niov = &area->nia.niovs[niov_idx];
> +               if (!io_zcrx_put_niov_uref(niov))
> +                       continue;
> +               page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));

I'm sorry I pushed back against the provider filling the pp cache in
earlier iterations. That was very wrong.

*_alloc_netmems was meant to be a mirror of
__page_pool_alloc_pages_slow. Since the latter fills the pp->cache,
the former should also fill the pp->cache. The dmabuf mp is actually
deficient in this regard. I'll look into fixing it. This part is more
than fine for me.

I'm still unsure about opting out of the generic freeing paths as I
mentioned above though.

> +       } while (--entries);
> +
> +       smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
> +}
> +
> +static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
> +{
> +       struct io_zcrx_area *area = ifq->area;
> +
> +       spin_lock_bh(&area->freelist_lock);
> +       while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
> +               struct net_iov *niov;
> +               u32 pgid;
> +
> +               pgid = area->freelist[--area->free_count];
> +               niov = &area->nia.niovs[pgid];
> +
> +               page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
> +
> +               pp->pages_state_hold_cnt++;
> +               trace_page_pool_state_hold(pp, net_iov_to_netmem(niov),
> +                                          pp->pages_state_hold_cnt);
> +       }
> +       spin_unlock_bh(&area->freelist_lock);
> +}
> +
> +static void io_zcrx_recycle_niov(struct net_iov *niov)
> +{
> +       struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
> +
> +       spin_lock_bh(&area->freelist_lock);
> +       area->freelist[area->free_count++] = net_iov_idx(niov);
> +       spin_unlock_bh(&area->freelist_lock);
> +}
> +
> +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
> +{
> +       struct io_zcrx_ifq *ifq = pp->mp_priv;
> +
> +       /* pp should already be ensuring that */
> +       if (unlikely(pp->alloc.count))
> +               goto out_return;
> +
> +       io_zcrx_ring_refill(pp, ifq);
> +       if (likely(pp->alloc.count))
> +               goto out_return;
> +
> +       io_zcrx_refill_slow(pp, ifq);
> +       if (!pp->alloc.count)
> +               return 0;
> +out_return:
> +       return pp->alloc.cache[--pp->alloc.count];
> +}
> +
> +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
> +{
> +       struct net_iov *niov;
> +
> +       if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
> +               return false;
> +
> +       niov = netmem_to_net_iov(netmem);
> +
> +       if (io_zcrx_niov_put(niov, 1))
> +               io_zcrx_recycle_niov(niov);
> +       return false;
> +}
> +
> +static void io_pp_zc_scrub(struct page_pool *pp)
> +{
> +       struct io_zcrx_ifq *ifq = pp->mp_priv;
> +       struct io_zcrx_area *area = ifq->area;
> +       int i;
> +
> +       /* Reclaim back all buffers given to the user space. */
> +       for (i = 0; i < area->nia.num_niovs; i++) {
> +               struct net_iov *niov = &area->nia.niovs[i];
> +               int count;
> +
> +               if (!io_zcrx_put_niov_uref(niov))
> +                       continue;
> +               io_zcrx_recycle_niov(niov);
> +
> +               count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
> +               trace_page_pool_state_release(pp, net_iov_to_netmem(niov), count);
> +       }
> +}
> +
> +static int io_pp_zc_init(struct page_pool *pp)
> +{
> +       struct io_zcrx_ifq *ifq = pp->mp_priv;
> +       struct io_zcrx_area *area = ifq->area;
> +       int ret;
> +
> +       if (!ifq)
> +               return -EINVAL;
> +       if (pp->p.order != 0)
> +               return -EINVAL;
> +       if (!pp->p.napi)
> +               return -EINVAL;
> +
> +       ret = page_pool_mp_init_paged_area(pp, &area->nia, area->pages);
> +       if (ret)
> +               return ret;
> +
> +       percpu_ref_get(&ifq->ctx->refs);
> +       ifq->pp = pp;
> +       return 0;
> +}
> +
> +static void io_pp_zc_destroy(struct page_pool *pp)
> +{
> +       struct io_zcrx_ifq *ifq = pp->mp_priv;
> +       struct io_zcrx_area *area = ifq->area;
> +
> +       page_pool_mp_release_area(pp, &ifq->area->nia);
> +
> +       ifq->pp = NULL;
> +
> +       if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
> +               return;
> +       percpu_ref_put(&ifq->ctx->refs);
> +}
> +
> +static const struct memory_provider_ops io_uring_pp_zc_ops = {
> +       .alloc_netmems          = io_pp_zc_alloc_netmems,
> +       .release_netmem         = io_pp_zc_release_netmem,
> +       .init                   = io_pp_zc_init,
> +       .destroy                = io_pp_zc_destroy,
> +       .scrub                  = io_pp_zc_scrub,
> +};
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
> index a8db61498c67..464b4bd89b64 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -5,6 +5,9 @@
>  #include <linux/io_uring_types.h>
>  #include <net/page_pool/types.h>
>
> +#define IO_ZC_RX_UREF                  0x10000
> +#define IO_ZC_RX_KREF_MASK             (IO_ZC_RX_UREF - 1)
> +
>  struct io_zcrx_area {
>         struct net_iov_area     nia;
>         struct io_zcrx_ifq      *ifq;
> @@ -22,10 +25,12 @@ struct io_zcrx_ifq {
>         struct io_ring_ctx              *ctx;
>         struct net_device               *dev;
>         struct io_zcrx_area             *area;
> +       struct page_pool                *pp;
>
>         struct io_uring                 *rq_ring;
>         struct io_uring_zcrx_rqe        *rqes;
>         u32                             rq_entries;
> +       u32                             cached_rq_head;
>
>         unsigned short                  n_rqe_pages;
>         struct page                     **rqe_pages;
> --
> 2.43.5
>


--
Thanks,
Mina
Pavel Begunkov Nov. 1, 2024, 9:09 p.m. UTC | #2
On 11/1/24 20:06, Mina Almasry wrote:
...
>> +__maybe_unused
>> +static const struct memory_provider_ops io_uring_pp_zc_ops;
>> +
>> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
>> +{
>> +       struct net_iov_area *owner = net_iov_owner(niov);
>> +
>> +       return container_of(owner, struct io_zcrx_area, nia);
>> +}
>> +
> 
> We discussed this before I disappeared on vacation but I'm not too
> convinced to be honest, sorry.
> 
> It's invalid to call io_zcrx_iov_to_area on a devmem niov and vice
> versa, right? So current and future code has to be very careful to

Yes

> call the right helpers on the right niovs.
> 
> At the very least there needs to be a comment above all these
> container_of helpers:
> 
> /* caller must have verified that this niov is devmem/io_zcrx */.
> 
> However I feel like even a comment is extremely error prone. These
> container_of's are inside of the call stack of some helpers. I would
> say we need a check. If we're concerned about performance, the check
> can be behind DEBUG_NET_WARN_ON(), although even that is a bit iffy,
> but could be fine. Doing this without a check seems too risky to me.

No, it doesn't need a check nor it needs a comment. The very
essence of virtual function tables is that they're coupled
together with objects for which those function make sense and
called only for those objects. The only way to get here with
invalid net_iovs is to take one page pool and feed it with
net_iovs from other another page pool that won't be sane in
the first place.

That would be an equivalent of:

struct file *f1 = ...;
struct file *f2 = ...;

f1->f_op->read(f2, ...);

Maybe it looks strange for you in C, but it's same as putting
comments that a virtual function that it should be called only
for objects of that class:

struct A {
	virtual void foo() = 0;
};
struct B: public A {
	void foo() override {
		// we should only be called with objects of type
		// struct B (or anything inheriting it), check that
		if (!reinterpret_cast<struct B*>(this))
			throw;
		...
	}
}


>>   static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
>>                                   struct io_uring_zcrx_ifq_reg *reg)
>>   {
>> @@ -99,6 +114,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>>                  goto err;
>>
>>          for (i = 0; i < nr_pages; i++) {
>> +               struct net_iov *niov = &area->nia.niovs[i];
>> +
>> +               niov->owner = &area->nia;
>>                  area->freelist[i] = i;
>>          }
>>
>> @@ -230,3 +248,200 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
>>   {
>>          lockdep_assert_held(&ctx->uring_lock);
>>   }
>> +
>> +static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
>> +{
>> +       return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
>> +}
>> +
>> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
>> +{
>> +       if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
>> +               return false;
>> +
>> +       return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
>> +}
>> +
> 
> Sorry, I have to push back a bit against this. The refcounting of
> netmem is already complicated. the paged netmem has 2 refcounts and
> care needs to be taken when acquiring and dropping refcounts. net_iov
> inherited the pp_ref_count but not the paged refcount, and again need
> some special handling. skb_frag_unref takes very special care checking

Which is why it's using net_iovs.

> pp->recycle, is_pp_netmem, and others to figure out the correct

pp->recycle has nothing to do with the series. We don't add
it in any special way, and if it's broken it's broken even
for non-proivder buffers.

> refcount to put based on the type of the netmem and skb flag.

Just same as with the ->[un]readable flag, which is not
functionally needed, and if it's screwed many things can
go very wrong.

> This code ignores all these generic code
> skb_frag_unref/napi_pp_put_page/etc paths and uses raw access to

I don't see the point, they are not used because they're not
needed. Instead of checking whether it came from a page pool
and whether it's net_iov or not, in the path io_uring returns
it we already apriori know that they're from a specific page
pool, net_iov and from the current provider.

Same for optimisations provided by those helpers, they are
useful when you're transferring buffers from one context to
another, e.g. task recieve path -> napi / page_pool. In this
case they're already fetched in the right context without any
need to additionally jumping through the hoops. If anything,
it'd be odd to jump out of a window to climb a rope on the
other side of the building when you could've just walked 5
meters to the other room.

> niv->pp_ref_count. If this is merged as-is, for posterity any changes

Ok, let's add a helper then

> in netmem refcounting need to also account for this use case opting
> out of these generic code paths that handle all other skb reffing
> including devmem.
> 
> Additionally since you're opting out of the generic unreffing paths
> you're also (as mentioned before) bypassing the pp recycling. AFAICT
> that may be hurting your performance. IIUC you refill
> PP_ALLOC_CACHE_REFILL (64) entries everytime _alloc_netmems is
> entered, and you don't recycle netmem any other way, so your slow path
> is entered 1/64 of the page_pool_alloc calls? That should be much

One virtual call per 64 buffers gets enough of ammortisation. The
cache size can be extended if necessary.

> worse than what the normal pp recycling does, which returns all freed
> netmem into its alloc.cache or the ptr_ring and hits *_alloc_netmems

You return it from a syscall (a special sockopt), I'm pretty sure
overhead of just that syscall without any handling would be more
expensive than one virtual function call. Then you need to hit the
fast cache, and it's not unconditional, it has to be lucky enough
so that napi is not run or scheduled, and even then it has to
be very careful to avoid races. That's the best case for <64 entries
recycling, otherwise it's ptr_ring and spinlocks.

Note, the normal (non-zc) recycling happens in the receive
syscall, but it's not the normal path, and just like devmem we
have to give the buffer to the user and wait until it's returned
back.
  
> much more rarely. There are also regular perf improvements and testing
> to the generic pool recycling paths you're also opting out of.

For performance, see above. As for testing, tests come after code
functionality, not the other way around. Why we're even adding any
zero copy and interface when it could be old good and well tested
non-zerocopy recv(2)

> I see a lot of downsides to opting out of the generic use cases. Is
> there any reason the normal freeing paths are not applicable to your
> use case?
> 
>> +static inline void io_zc_add_pp_cache(struct page_pool *pp,
>> +                                     struct net_iov *niov)
>> +{
>> +}
>> +
> 
> Looks unused/empty.

Indeed, slipped through.
...
Pavel Begunkov Nov. 3, 2024, 9:52 p.m. UTC | #3
On 11/1/24 21:09, Pavel Begunkov wrote:
> On 11/1/24 20:06, Mina Almasry wrote:
> ...
>>> +__maybe_unused
>>> +static const struct memory_provider_ops io_uring_pp_zc_ops;
>>> +
>>> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
>>> +{
>>> +       struct net_iov_area *owner = net_iov_owner(niov);
>>> +
>>> +       return container_of(owner, struct io_zcrx_area, nia);
>>> +}
>>> +
>>
>> We discussed this before I disappeared on vacation but I'm not too
>> convinced to be honest, sorry.

To expand on this one, a few weeks ago I outlined how you can employ
the compiler and verify correctness, and I don't really see a way to
convince you unless you're willing to check your claim that it can
go wrong. Turning it the other way around, if you see a path where it
could go wrong, please do let me know, and it'll certainly get fixed,
but until then I don't believe it's anyhow a blocker for the series.
Mina Almasry Nov. 4, 2024, 7:54 p.m. UTC | #4
On Fri, Nov 1, 2024 at 2:09 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 11/1/24 20:06, Mina Almasry wrote:
> ...
> >> +__maybe_unused
> >> +static const struct memory_provider_ops io_uring_pp_zc_ops;
> >> +
> >> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
> >> +{
> >> +       struct net_iov_area *owner = net_iov_owner(niov);
> >> +
> >> +       return container_of(owner, struct io_zcrx_area, nia);
> >> +}
> >> +
> >
> > We discussed this before I disappeared on vacation but I'm not too
> > convinced to be honest, sorry.
> >
> > It's invalid to call io_zcrx_iov_to_area on a devmem niov and vice
> > versa, right? So current and future code has to be very careful to
>
> Yes
>
> > call the right helpers on the right niovs.
> >
> > At the very least there needs to be a comment above all these
> > container_of helpers:
> >
> > /* caller must have verified that this niov is devmem/io_zcrx */.
> >
> > However I feel like even a comment is extremely error prone. These
> > container_of's are inside of the call stack of some helpers. I would
> > say we need a check. If we're concerned about performance, the check
> > can be behind DEBUG_NET_WARN_ON(), although even that is a bit iffy,
> > but could be fine. Doing this without a check seems too risky to me.
>
> No, it doesn't need a check nor it needs a comment. The very
> essence of virtual function tables is that they're coupled
> together with objects for which those function make sense and
> called only for those objects. The only way to get here with
> invalid net_iovs is to take one page pool and feed it with
> net_iovs from other another page pool that won't be sane in
> the first place.
>

That could happen. In fact the whole devmem tcp paths are very
carefully written to handle that

net_iovs are allocated from the page_pool, put in skbs, and then sit
in the sk receive queue. In pathological cases (user is
re/misconfiguring flow steering) we can have 1 sk receive queue that
has a mix of page skbs, devmem skbs, and io_uring skbs, and other
skbs.

Code that is processing the skbs in the receive queue has no idea
whether what kind of skb it is. That's why that code needs to check
whether the skb has readable frags, and that's why in this very series
you needed to add a check in tcp_recvmsg_dmabuf to make sure that its
a dmabuf skb, and you need to add a check to io_zcrx_recv_frag that
the frag inside it is io_uring niov. The code would be wrong without
it.

All I'm trying to say is that it's very error prone to rely on folks
writing and reviewing code to check that whenever dmabuf/io_rcrx/etc
handling is done, somewhere in the call stack a type verification
check has been made, and a DEBUG_NET_WARN could help avoid some subtle
memory corruption bugs.

> That would be an equivalent of:
>
> struct file *f1 = ...;
> struct file *f2 = ...;
>
> f1->f_op->read(f2, ...);
>
> Maybe it looks strange for you in C, but it's same as putting
> comments that a virtual function that it should be called only
> for objects of that class:
>
> struct A {
>         virtual void foo() = 0;
> };
> struct B: public A {
>         void foo() override {
>                 // we should only be called with objects of type
>                 // struct B (or anything inheriting it), check that
>                 if (!reinterpret_cast<struct B*>(this))
>                         throw;
>                 ...
>         }
> }
>
>

I'm not really sure I followed here. We do not get any type of
compiler or type safety from this code because the dma-buf niovs and
io_uring niovs are the same net_iov type.

We can get type safety by defining new types for dmabuf_net_iov and
io_uring_net_iov, then provide helpers:

dmabuf_net_iov *net_iov_to_dmabuf();
io_uring_net_iov *net_iov_to_io_uring();

The helpers can check the niov is of the right type once and do a
cast,  then the object with the specific type can be passed to all
future heplers without additional checks. This is one way to do it I
guess.

> >>   static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
> >>                                   struct io_uring_zcrx_ifq_reg *reg)
> >>   {
> >> @@ -99,6 +114,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx,
> >>                  goto err;
> >>
> >>          for (i = 0; i < nr_pages; i++) {
> >> +               struct net_iov *niov = &area->nia.niovs[i];
> >> +
> >> +               niov->owner = &area->nia;
> >>                  area->freelist[i] = i;
> >>          }
> >>
> >> @@ -230,3 +248,200 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
> >>   {
> >>          lockdep_assert_held(&ctx->uring_lock);
> >>   }
> >> +
> >> +static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
> >> +{
> >> +       return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
> >> +}
> >> +
> >> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
> >> +{
> >> +       if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
> >> +               return false;
> >> +
> >> +       return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
> >> +}
> >> +
> >
> > Sorry, I have to push back a bit against this. The refcounting of
> > netmem is already complicated. the paged netmem has 2 refcounts and
> > care needs to be taken when acquiring and dropping refcounts. net_iov
> > inherited the pp_ref_count but not the paged refcount, and again need
> > some special handling. skb_frag_unref takes very special care checking
>
> Which is why it's using net_iovs.
>
> > pp->recycle, is_pp_netmem, and others to figure out the correct
>
> pp->recycle has nothing to do with the series. We don't add
> it in any special way, and if it's broken it's broken even
> for non-proivder buffers.
>
> > refcount to put based on the type of the netmem and skb flag.
>
> Just same as with the ->[un]readable flag, which is not
> functionally needed, and if it's screwed many things can
> go very wrong.
>
> > This code ignores all these generic code
> > skb_frag_unref/napi_pp_put_page/etc paths and uses raw access to
>
> I don't see the point, they are not used because they're not
> needed. Instead of checking whether it came from a page pool
> and whether it's net_iov or not, in the path io_uring returns
> it we already apriori know that they're from a specific page
> pool, net_iov and from the current provider.
>
> Same for optimisations provided by those helpers, they are
> useful when you're transferring buffers from one context to
> another, e.g. task recieve path -> napi / page_pool. In this
> case they're already fetched in the right context without any
> need to additionally jumping through the hoops. If anything,
> it'd be odd to jump out of a window to climb a rope on the
> other side of the building when you could've just walked 5
> meters to the other room.
>

For me, "they are not used because they're not needed." is not enough
justification to ignore the generic code paths that support generic
use cases and add your own freeing path and recycling that needs to
work adjacent to generic paths for posterity. You need to provide
concrete reasons why the current code paths don't work for you and
can't be made to work for you.

Is it very complicated to napi_pp_put_page() niovs as the user puts
them in the refill queue without adding a new syscall? If so, is it
possible to do a niov equivalent of page_pool_put_page_bulk() of the
refill queue while/as you process the RX path?

If you've tested the generic code paths to be performance deficient
and your recycling is indeed better, you could improve the page_pool
to pull netmems when it needs to like you're doing here, but in a
generic way that applies to the page allocator and other providers.
Not a one-off implementation that only applies to your provider.

If you're absolutely set on ignoring the currently supported reffing
and implementing your own reffing and recycling for your use case,
sure, that could work, but please don't overload the
niov->pp_ref_count reserved for the generic use cases for this. Add
io_zcrx_area->io_uring_ref or something and do whatever you want with
it. Since it's not sharing the pp_ref_count with the generic code
paths I don't see them conflicting in the future.

--
Thanks,
Mina
Pavel Begunkov Nov. 4, 2024, 9:14 p.m. UTC | #5
On 11/4/24 19:54, Mina Almasry wrote:
> On Fri, Nov 1, 2024 at 2:09 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>>> However I feel like even a comment is extremely error prone. These
>>> container_of's are inside of the call stack of some helpers. I would
>>> say we need a check. If we're concerned about performance, the check
>>> can be behind DEBUG_NET_WARN_ON(), although even that is a bit iffy,
>>> but could be fine. Doing this without a check seems too risky to me.
>>
>> No, it doesn't need a check nor it needs a comment. The very
>> essence of virtual function tables is that they're coupled
>> together with objects for which those function make sense and
>> called only for those objects. The only way to get here with
>> invalid net_iovs is to take one page pool and feed it with
>> net_iovs from other another page pool that won't be sane in
>> the first place.
>>
> 
> That could happen. In fact the whole devmem tcp paths are very
> carefully written to handle that

What could happen? Calling ops of one page pool with net iovs
of another? Right, you can force yourself to write it this way,
but it's not sane code.

> net_iovs are allocated from the page_pool, put in skbs, and then sit
> in the sk receive queue. In pathological cases (user is
> re/misconfiguring flow steering) we can have 1 sk receive queue that
> has a mix of page skbs, devmem skbs, and io_uring skbs, and other
> skbs.
> 
> Code that is processing the skbs in the receive queue has no idea
> whether what kind of skb it is. That's why that code needs to check
> whether the skb has readable frags, and that's why in this very series
> you needed to add a check in tcp_recvmsg_dmabuf to make sure that its
> a dmabuf skb, and you need to add a check to io_zcrx_recv_frag that
> the frag inside it is io_uring niov. The code would be wrong without
> it.

Right, it's expanded to support multiple possible types instead of
"it's a devmem TCP thing and nothing else can ever use it". And it's
not new, devmem forks off the generic path, it just does it based on
skb->readable, which is no more than an optimisation and could've
been on the type of the buffer, e.g. is_net_iov(netmem).

> All I'm trying to say is that it's very error prone to rely on folks

It's really not, especially comparing to lots of other bits that
are much easier to screw up, skb->readable would be a stark
example, which we did catch failing many times.

> writing and reviewing code to check that whenever dmabuf/io_rcrx/etc
> handling is done, somewhere in the call stack a type verification
> check has been made, and a DEBUG_NET_WARN could help avoid some subtle
> memory corruption bugs.
> 
>> That would be an equivalent of:
>>
>> struct file *f1 = ...;
>> struct file *f2 = ...;
>>
>> f1->f_op->read(f2, ...);
>>
>> Maybe it looks strange for you in C, but it's same as putting
>> comments that a virtual function that it should be called only
>> for objects of that class:
>>
>> struct A {
>>          virtual void foo() = 0;
>> };
>> struct B: public A {
>>          void foo() override {
>>                  // we should only be called with objects of type
>>                  // struct B (or anything inheriting it), check that
>>                  if (!reinterpret_cast<struct B*>(this))
>>                          throw;
>>                  ...
>>          }
>> }
>>
>>
> 
> I'm not really sure I followed here. We do not get any type of
> compiler or type safety from this code because the dma-buf niovs and
> io_uring niovs are the same net_iov type.

Right, because it's C, but the basic idea of virtual dispatch
is in there.

> We can get type safety by defining new types for dmabuf_net_iov and
> io_uring_net_iov, then provide helpers:
> 
> dmabuf_net_iov *net_iov_to_dmabuf();
> io_uring_net_iov *net_iov_to_io_uring();

Directly aliasing it to parts of struct page doesn't leave
much space to extending types. The only option is for all
those types to have exactly same layout, but then there is
no much point in doing so.

> The helpers can check the niov is of the right type once and do a
> cast,  then the object with the specific type can be passed to all
> future heplers without additional checks. This is one way to do it I
> guess.
> 
...
>> Same for optimisations provided by those helpers, they are
>> useful when you're transferring buffers from one context to
>> another, e.g. task recieve path -> napi / page_pool. In this
>> case they're already fetched in the right context without any
>> need to additionally jumping through the hoops. If anything,
>> it'd be odd to jump out of a window to climb a rope on the
>> other side of the building when you could've just walked 5
>> meters to the other room.
>>
> 
> For me, "they are not used because they're not needed." is not enough
> justification to ignore the generic code paths that support generic
> use cases and add your own freeing path and recycling that needs to
> work adjacent to generic paths for posterity. You need to provide
> concrete reasons why the current code paths don't work for you and
> can't be made to work for you.

No, it more than justifies it, it's neither needed nor makes sense for
the chosen API, and the API is chosen so that it avoids those extra
steps of crossing contexts an extra time.

What you're saying is that It should work in a less efficient way and
(perhaps arguably) be less convenient to the user as it now needs to
care about batching, because that's how devmem TCP does it. It's not
really a good argument.

Let me give you a devmem TCP example of what you're saying. Why can't
you use the generic (copy) TCP path for devmem TCP? It's well
tested. The reason that it's about zero copy and copying adds... hmm...
a "copy" doesn't justify avoiding the generic path.

> Is it very complicated to napi_pp_put_page() niovs as the user puts
> them in the refill queue without adding a new syscall? If so, is it
> possible to do a niov equivalent of page_pool_put_page_bulk() of the
> refill queue while/as you process the RX path?

That adds an extra jump from one context to another for no apparent
reason just as mentioned above.

> If you've tested the generic code paths to be performance deficient
> and your recycling is indeed better, you could improve the page_pool
> to pull netmems when it needs to like you're doing here, but in a
> generic way that applies to the page allocator and other providers.
> Not a one-off implementation that only applies to your provider.

If I read it right, you're saying you need to improve devmem TCP
instead of doing an io_uring API, just like you indirectly declared
in the very beginning a couple of weeks ago. Again, if you're
against having an io_uring user API in general or against some
particular aspects of the API, then please state it clearly. If not,
I can leave the idea to you to entertain once it's merged.

> If you're absolutely set on ignoring the currently supported reffing
> and implementing your own reffing and recycling for your use case,
> sure, that could work, but please don't overload the
> niov->pp_ref_count reserved for the generic use cases for this. Add
> io_zcrx_area->io_uring_ref or something and do whatever you want with
> it. Since it's not sharing the pp_ref_count with the generic code
> paths I don't see them conflicting in the future.

That would be a performance problem, I don't believe they can't
live together.
Pavel Begunkov Nov. 4, 2024, 9:53 p.m. UTC | #6
On 11/4/24 21:14, Pavel Begunkov wrote:
> On 11/4/24 19:54, Mina Almasry wrote:
>> On Fri, Nov 1, 2024 at 2:09 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
...
>> If you've tested the generic code paths to be performance deficient
>> and your recycling is indeed better, you could improve the page_pool
>> to pull netmems when it needs to like you're doing here, but in a
>> generic way that applies to the page allocator and other providers.
>> Not a one-off implementation that only applies to your provider.
> 
> If I read it right, you're saying you need to improve devmem TCP
> instead of doing an io_uring API, just like you indirectly declared
> in the very beginning a couple of weeks ago. Again, if you're
> against having an io_uring user API in general or against some
> particular aspects of the API, then please state it clearly. If not,
> I can leave the idea to you to entertain once it's merged.

On top of it, that wouldn't make sense for the normal page pool path,
it already pushes pages via a ring (ptr_ring + caches) from one
context to another. The difference is that buffers with these zero
copy interfaces make an extra stop in the user space, from where
we directly push into the page pool, just like you can directly push
via ptr_ring when you're already in the kernel, even though it
requires more logic to handle untrusted user space.

It only makes for zero copy providers, and I do remember you was
suggested to take the same approach, I think it was Stan, but since
it didn't materialise I assume it's not of interest to devmem TCP.
David Wei Nov. 11, 2024, 9:15 p.m. UTC | #7
On 2024-11-04 11:54, Mina Almasry wrote:
> On Fri, Nov 1, 2024 at 2:09 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 11/1/24 20:06, Mina Almasry wrote:
>> ...
>>>> +__maybe_unused
>>>> +static const struct memory_provider_ops io_uring_pp_zc_ops;
>>>> +
>>>> +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
>>>> +{
>>>> +       struct net_iov_area *owner = net_iov_owner(niov);
>>>> +
>>>> +       return container_of(owner, struct io_zcrx_area, nia);
>>>> +}
>>>> +
>>>
>>> We discussed this before I disappeared on vacation but I'm not too
>>> convinced to be honest, sorry.
>>>
>>> It's invalid to call io_zcrx_iov_to_area on a devmem niov and vice
>>> versa, right? So current and future code has to be very careful to
>>
>> Yes
>>
>>> call the right helpers on the right niovs.
>>>
>>> At the very least there needs to be a comment above all these
>>> container_of helpers:
>>>
>>> /* caller must have verified that this niov is devmem/io_zcrx */.
>>>
>>> However I feel like even a comment is extremely error prone. These
>>> container_of's are inside of the call stack of some helpers. I would
>>> say we need a check. If we're concerned about performance, the check
>>> can be behind DEBUG_NET_WARN_ON(), although even that is a bit iffy,
>>> but could be fine. Doing this without a check seems too risky to me.
>>
>> No, it doesn't need a check nor it needs a comment. The very
>> essence of virtual function tables is that they're coupled
>> together with objects for which those function make sense and
>> called only for those objects. The only way to get here with
>> invalid net_iovs is to take one page pool and feed it with
>> net_iovs from other another page pool that won't be sane in
>> the first place.
>>
> 
> That could happen. In fact the whole devmem tcp paths are very
> carefully written to handle that
> 
> net_iovs are allocated from the page_pool, put in skbs, and then sit
> in the sk receive queue. In pathological cases (user is
> re/misconfiguring flow steering) we can have 1 sk receive queue that
> has a mix of page skbs, devmem skbs, and io_uring skbs, and other
> skbs.
> 
> Code that is processing the skbs in the receive queue has no idea
> whether what kind of skb it is. That's why that code needs to check
> whether the skb has readable frags, and that's why in this very series
> you needed to add a check in tcp_recvmsg_dmabuf to make sure that its
> a dmabuf skb, and you need to add a check to io_zcrx_recv_frag that
> the frag inside it is io_uring niov. The code would be wrong without
> it.

The checks are already there in e.g. io_zcrx_recv_frag() to prevent
io_zcrx_iov_to_area() from being called on a devmem niov.

io_zcrx_copy_chunk() does not need a check because it is guaranteed to
only call io_zcrx_iov_to_area() on an io_uring niov. Copying happens in
two cases:

1. Not a niov
2. If offset is in the linearized part of an skb

Both cases do not apply to devmem so it is safe even in the case of an
skb rcvbuf with a mixture of non-niov, devmem niov and io_uring niov.

> 
> All I'm trying to say is that it's very error prone to rely on folks
> writing and reviewing code to check that whenever dmabuf/io_rcrx/etc
> handling is done, somewhere in the call stack a type verification
> check has been made, and a DEBUG_NET_WARN could help avoid some subtle
> memory corruption bugs.

This is a fair ask. I'll address it in the next iteration.

> 
>> That would be an equivalent of:
>>
>> struct file *f1 = ...;
>> struct file *f2 = ...;
>>
>> f1->f_op->read(f2, ...);
>>
>> Maybe it looks strange for you in C, but it's same as putting
>> comments that a virtual function that it should be called only
>> for objects of that class:
>>
>> struct A {
>>         virtual void foo() = 0;
>> };
>> struct B: public A {
>>         void foo() override {
>>                 // we should only be called with objects of type
>>                 // struct B (or anything inheriting it), check that
>>                 if (!reinterpret_cast<struct B*>(this))
>>                         throw;
>>                 ...
>>         }
>> }
>>
>>
> 
> I'm not really sure I followed here. We do not get any type of
> compiler or type safety from this code because the dma-buf niovs and
> io_uring niovs are the same net_iov type.
> 
> We can get type safety by defining new types for dmabuf_net_iov and
> io_uring_net_iov, then provide helpers:
> 
> dmabuf_net_iov *net_iov_to_dmabuf();
> io_uring_net_iov *net_iov_to_io_uring();
> 
> The helpers can check the niov is of the right type once and do a
> cast,  then the object with the specific type can be passed to all
> future heplers without additional checks. This is one way to do it I
> guess.
> 
>>>>   static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
>>>>                                   struct io_uring_zcrx_ifq_reg *reg)
>>>>   {
>>>> @@ -99,6 +114,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx,
>>>>                  goto err;
>>>>
>>>>          for (i = 0; i < nr_pages; i++) {
>>>> +               struct net_iov *niov = &area->nia.niovs[i];
>>>> +
>>>> +               niov->owner = &area->nia;
>>>>                  area->freelist[i] = i;
>>>>          }
>>>>
>>>> @@ -230,3 +248,200 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
>>>>   {
>>>>          lockdep_assert_held(&ctx->uring_lock);
>>>>   }
>>>> +
>>>> +static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
>>>> +{
>>>> +       return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
>>>> +}
>>>> +
>>>> +static bool io_zcrx_put_niov_uref(struct net_iov *niov)
>>>> +{
>>>> +       if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
>>>> +               return false;
>>>> +
>>>> +       return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
>>>> +}
>>>> +
>>>
>>> Sorry, I have to push back a bit against this. The refcounting of
>>> netmem is already complicated. the paged netmem has 2 refcounts and
>>> care needs to be taken when acquiring and dropping refcounts. net_iov
>>> inherited the pp_ref_count but not the paged refcount, and again need
>>> some special handling. skb_frag_unref takes very special care checking
>>
>> Which is why it's using net_iovs.
>>
>>> pp->recycle, is_pp_netmem, and others to figure out the correct
>>
>> pp->recycle has nothing to do with the series. We don't add
>> it in any special way, and if it's broken it's broken even
>> for non-proivder buffers.
>>
>>> refcount to put based on the type of the netmem and skb flag.
>>
>> Just same as with the ->[un]readable flag, which is not
>> functionally needed, and if it's screwed many things can
>> go very wrong.
>>
>>> This code ignores all these generic code
>>> skb_frag_unref/napi_pp_put_page/etc paths and uses raw access to
>>
>> I don't see the point, they are not used because they're not
>> needed. Instead of checking whether it came from a page pool
>> and whether it's net_iov or not, in the path io_uring returns
>> it we already apriori know that they're from a specific page
>> pool, net_iov and from the current provider.
>>
>> Same for optimisations provided by those helpers, they are
>> useful when you're transferring buffers from one context to
>> another, e.g. task recieve path -> napi / page_pool. In this
>> case they're already fetched in the right context without any
>> need to additionally jumping through the hoops. If anything,
>> it'd be odd to jump out of a window to climb a rope on the
>> other side of the building when you could've just walked 5
>> meters to the other room.
>>
> 
> For me, "they are not used because they're not needed." is not enough
> justification to ignore the generic code paths that support generic
> use cases and add your own freeing path and recycling that needs to
> work adjacent to generic paths for posterity. You need to provide
> concrete reasons why the current code paths don't work for you and
> can't be made to work for you.

We are already using io_uring specific code, though. The entire feature
requires the use of io_uring, and sockets that are set up for it can
only be (properly) consumed via io_uring specific functions. The return
path is no different.

Consuming via standard syscalls is still functionally correct. In that
case, the skbs when freed will go back to the page pool via the generic
paths e.g. napi_pp_put_page(). But in the intended fast path, returning
via the io_uring specific refill ring, there is no reason why we must
use the generic return functions.

> 
> Is it very complicated to napi_pp_put_page() niovs as the user puts
> them in the refill queue without adding a new syscall? If so, is it
> possible to do a niov equivalent of page_pool_put_page_bulk() of the
> refill queue while/as you process the RX path?
> 
> If you've tested the generic code paths to be performance deficient
> and your recycling is indeed better, you could improve the page_pool
> to pull netmems when it needs to like you're doing here, but in a
> generic way that applies to the page allocator and other providers.
> Not a one-off implementation that only applies to your provider.
> 
> If you're absolutely set on ignoring the currently supported reffing
> and implementing your own reffing and recycling for your use case,
> sure, that could work, but please don't overload the
> niov->pp_ref_count reserved for the generic use cases for this. Add
> io_zcrx_area->io_uring_ref or something and do whatever you want with
> it. Since it's not sharing the pp_ref_count with the generic code
> paths I don't see them conflicting in the future.

Why insist on this? Both page/niov and devmem/io_uring niov are mutually
exclusive. There is no strong technical reason to not re-use
pp_ref_count.

> 
> --
> Thanks,
> Mina
Mina Almasry Nov. 14, 2024, 8:56 p.m. UTC | #8
On Mon, Nov 11, 2024 at 1:15 PM David Wei <dw@davidwei.uk> wrote:
> > Is it very complicated to napi_pp_put_page() niovs as the user puts
> > them in the refill queue without adding a new syscall? If so, is it
> > possible to do a niov equivalent of page_pool_put_page_bulk() of the
> > refill queue while/as you process the RX path?
> >
> > If you've tested the generic code paths to be performance deficient
> > and your recycling is indeed better, you could improve the page_pool
> > to pull netmems when it needs to like you're doing here, but in a
> > generic way that applies to the page allocator and other providers.
> > Not a one-off implementation that only applies to your provider.
> >
> > If you're absolutely set on ignoring the currently supported reffing
> > and implementing your own reffing and recycling for your use case,
> > sure, that could work, but please don't overload the
> > niov->pp_ref_count reserved for the generic use cases for this. Add
> > io_zcrx_area->io_uring_ref or something and do whatever you want with
> > it. Since it's not sharing the pp_ref_count with the generic code
> > paths I don't see them conflicting in the future.
>
> Why insist on this? Both page/niov and devmem/io_uring niov are mutually
> exclusive. There is no strong technical reason to not re-use
> pp_ref_count.
>

Conflict between devmem (specifically) and io_uring is not my concern.
My concern is possible future conflict between io_uring refcounting
and page/devmem refcounting.

Currently net_iov refcounting and page refcounting is unified (as much
as possible). net_iov->pp_ref_count is an exact mirror in usage of
page->pp_ref_count and the intention is for it to remain so. This
patch reuses net_iov->pp_ref_count in a way that doesn't really apply
to page->pp_ref_count and already some deviation in use case is
happening which may lead to conflicting requirements in the future
(and to be fair, may not).

But I've been bringing this up a lot in the past (and offering
alternatives that don't introduce this overloading) and I think this
conversation has run its course. I'm unsure about this approach and
this could use another pair of eyes. Jakub, sorry to bother you but
you probably are the one that reviewed the whole net_iov stuff most
closely. Any chance you would take a look and provide direction here?
Maybe my concern is overblown...

--
Thanks,
Mina
Jakub Kicinski Nov. 15, 2024, 11:14 p.m. UTC | #9
On Thu, 14 Nov 2024 12:56:14 -0800 Mina Almasry wrote:
> But I've been bringing this up a lot in the past (and offering
> alternatives that don't introduce this overloading) and I think this
> conversation has run its course. I'm unsure about this approach and
> this could use another pair of eyes. Jakub, sorry to bother you but
> you probably are the one that reviewed the whole net_iov stuff most
> closely. Any chance you would take a look and provide direction here?
> Maybe my concern is overblown...

Sorry I haven't read this code very closely (still!?) really hoping 
for next version to come during the merge window when time is more
abundant :S

From scanning the quoted context I gather you're asking about using 
the elevated ->pp_ref_count for user-owned pages? If yes - I also
find that part.. borderline incorrect. The page pool stats will show
these pages as allocated which normally means held by the driver or 
the stack. Pages given to the user should effectively leave the pool.
So I'm guessing this is some optimization. Same for patch 8.

But let me not get sucked into this before we wrap up the net-next PR..
diff mbox series

Patch

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index a276572fe953..aad35676207e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -2,7 +2,12 @@ 
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
+#include <linux/netdevice.h>
 #include <linux/io_uring.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -14,6 +19,16 @@ 
 
 #define IO_RQ_MAX_ENTRIES		32768
 
+__maybe_unused
+static const struct memory_provider_ops io_uring_pp_zc_ops;
+
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg)
 {
@@ -99,6 +114,9 @@  static int io_zcrx_create_area(struct io_ring_ctx *ctx,
 		goto err;
 
 	for (i = 0; i < nr_pages; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+
+		niov->owner = &area->nia;
 		area->freelist[i] = i;
 	}
 
@@ -230,3 +248,200 @@  void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 }
+
+static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
+{
+	return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
+}
+
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+	if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
+}
+
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct net_iov *niov)
+{
+}
+
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
+}
+
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+						 unsigned mask)
+{
+	unsigned int idx = ifq->cached_rq_head++ & mask;
+
+	return &ifq->rqes[idx];
+}
+
+static void io_zcrx_ring_refill(struct page_pool *pp,
+				struct io_zcrx_ifq *ifq)
+{
+	unsigned int entries = io_zcrx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+
+	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+	if (unlikely(!entries))
+		return;
+
+	do {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+		struct io_zcrx_area *area;
+		struct net_iov *niov;
+		unsigned niov_idx, area_idx;
+
+		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) / PAGE_SIZE;
+
+		if (unlikely(rqe->__pad || area_idx))
+			continue;
+		area = ifq->area;
+
+		if (unlikely(niov_idx >= area->nia.num_niovs))
+			continue;
+		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+		niov = &area->nia.niovs[niov_idx];
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
+	} while (--entries);
+
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+
+	spin_lock_bh(&area->freelist_lock);
+	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct net_iov *niov;
+		u32 pgid;
+
+		pgid = area->freelist[--area->free_count];
+		niov = &area->nia.niovs[pgid];
+
+		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
+
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, net_iov_to_netmem(niov),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_recycle_niov(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+
+	if (io_zcrx_niov_put(niov, 1))
+		io_zcrx_recycle_niov(niov);
+	return false;
+}
+
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int count;
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		io_zcrx_recycle_niov(niov);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, net_iov_to_netmem(niov), count);
+	}
+}
+
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int ret;
+
+	if (!ifq)
+		return -EINVAL;
+	if (pp->p.order != 0)
+		return -EINVAL;
+	if (!pp->p.napi)
+		return -EINVAL;
+
+	ret = page_pool_mp_init_paged_area(pp, &area->nia, area->pages);
+	if (ret)
+		return ret;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+
+	page_pool_mp_release_area(pp, &ifq->area->nia);
+
+	ifq->pp = NULL;
+
+	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+static const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_netmems		= io_pp_zc_alloc_netmems,
+	.release_netmem		= io_pp_zc_release_netmem,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.scrub			= io_pp_zc_scrub,
+};
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index a8db61498c67..464b4bd89b64 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -5,6 +5,9 @@ 
 #include <linux/io_uring_types.h>
 #include <net/page_pool/types.h>
 
+#define IO_ZC_RX_UREF			0x10000
+#define IO_ZC_RX_KREF_MASK		(IO_ZC_RX_UREF - 1)
+
 struct io_zcrx_area {
 	struct net_iov_area	nia;
 	struct io_zcrx_ifq	*ifq;
@@ -22,10 +25,12 @@  struct io_zcrx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
 	struct io_zcrx_area		*area;
+	struct page_pool		*pp;
 
 	struct io_uring			*rq_ring;
 	struct io_uring_zcrx_rqe 	*rqes;
 	u32				rq_entries;
+	u32				cached_rq_head;
 
 	unsigned short			n_rqe_pages;
 	struct page			**rqe_pages;