[net-next,v8,11/17] io_uring/zcrx: implement zerocopy receive pp memory provider

Message ID	20241204172204.4180482-12-dw@davidwei.uk (mailing list archive)
State	New
Headers	show Received: from mail-pf1-f170.google.com (mail-pf1-f170.google.com [209.85.210.170]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AB0E9215F4C for <io-uring@vger.kernel.org>; Wed, 4 Dec 2024 17:22:56 +0000 (UTC) From: David Wei <dw@davidwei.uk> To: io-uring@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe <axboe@kernel.dk>, Pavel Begunkov <asml.silence@gmail.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jesper Dangaard Brouer <hawk@kernel.org>, David Ahern <dsahern@kernel.org>, Mina Almasry <almasrymina@google.com>, Stanislav Fomichev <stfomichev@gmail.com>, Joe Damato <jdamato@fastly.com>, Pedro Tammela <pctammela@mojatatu.com> Subject: [PATCH net-next v8 11/17] io_uring/zcrx: implement zerocopy receive pp memory provider Date: Wed, 4 Dec 2024 09:21:50 -0800 Message-ID: <20241204172204.4180482-12-dw@davidwei.uk> In-Reply-To: <20241204172204.4180482-1-dw@davidwei.uk> References: <20241204172204.4180482-1-dw@davidwei.uk> Precedence: bulk MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	io_uring zero copy rx \| expand [net-next,v8,00/17] io_uring zero copy rx [net-next,v8,01/17] net: prefix devmem specific helpers [net-next,v8,02/17] net: generalise net_iov chunk owners [net-next,v8,03/17] net: page_pool: create hooks for custom page providers [net-next,v8,04/17] net: prepare for non devmem TCP memory providers [net-next,v8,05/17] net: page_pool: add ->scrub mem provider callback [net-next,v8,06/17] net: page pool: add helper creating area from pages [net-next,v8,07/17] net: page_pool: introduce page_pool_mp_return_in_cache [net-next,v8,08/17] net: add helper executing custom callback from napi [net-next,v8,09/17] io_uring/zcrx: add interface queue and refill queue [net-next,v8,10/17] io_uring/zcrx: add io_zcrx_area [net-next,v8,11/17] io_uring/zcrx: implement zerocopy receive pp memory provider [net-next,v8,12/17] io_uring/zcrx: add io_recvzc request [net-next,v8,13/17] io_uring/zcrx: set pp memory provider for an rx queue [net-next,v8,14/17] io_uring/zcrx: add copy fallback [net-next,v8,15/17] io_uring/zcrx: throttle receive requests [net-next,v8,16/17] net: add documentation for io_uring zcrx [net-next,v8,17/17] io_uring/zcrx: add selftest

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 8f838add94a4..7919f5e52c73 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -2,7 +2,12 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
+#include <linux/netdevice.h>
 #include <linux/io_uring.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -14,6 +19,18 @@
 
 #define IO_RQ_MAX_ENTRIES 32768
 
+/* Forward declaration of the provider ops defined further down in this file. */
+__maybe_unused
+static const struct memory_provider_ops io_uring_pp_zc_ops;
+
+/* Map a net_iov to the io_zcrx_area that embeds its owning net_iov_area. */
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
 				 struct io_uring_region_desc *rd)
@@ -104,6 +121,9 @@ static int io_zcrx_create_area(struct io_ring_ctx *ctx,
 		goto err;
 
 	for (i = 0; i < nr_pages; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+
+		niov->owner = &area->nia;
 		area->freelist[i] = i;
 	}
 
@@ -238,3 +258,247 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 }
+
+/* Drop @nr references from @niov; returns true if the count reached zero. */
+static bool io_zcrx_niov_put(struct net_iov *niov, int nr)
+{
+	return atomic_long_sub_and_test(nr, &niov->pp_ref_count);
+}
+
+/*
+ * Drop one user reference (IO_ZC_RX_UREF) from @niov. Returns false without
+ * touching the count if it is below IO_ZC_RX_UREF; otherwise returns true
+ * only when the subtraction brings the count to zero.
+ */
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+	if (atomic_long_read(&niov->pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zcrx_niov_put(niov, IO_ZC_RX_UREF);
+}
+
+/* Empty placeholder; it has no body in this patch. */
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct net_iov *niov)
+{
+}
+
+/* Number of refill queue entries available to consume, capped at rq_entries. */
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
+}
+
+/* Return the entry at the cached head and advance the cached head. */
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+						 unsigned mask)
+{
+	unsigned int idx = ifq->cached_rq_head++ & mask;
+
+	return &ifq->rqes[idx];
+}
+
+/*
+ * Consume refill queue entries and hand the niovs they name to
+ * page_pool_mp_return_in_cache(). Entries with non-zero padding, a non-zero
+ * area index, an out-of-range niov index, or a user reference that cannot be
+ * dropped to zero are skipped. The new head is published once at the end.
+ */
+static void io_zcrx_ring_refill(struct page_pool *pp,
+				struct io_zcrx_ifq *ifq)
+{
+	unsigned int entries = io_zcrx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+
+	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+	if (unlikely(!entries))
+		return;
+
+	do {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+		struct io_zcrx_area *area;
+		struct net_iov *niov;
+		unsigned niov_idx, area_idx;
+
+		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) / PAGE_SIZE;
+
+		if (unlikely(rqe->__pad || area_idx))
+			continue;
+		area = ifq->area;
+
+		if (unlikely(niov_idx >= area->nia.num_niovs))
+			continue;
+		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+		niov = &area->nia.niovs[niov_idx];
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
+	} while (--entries);
+
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+/*
+ * Fallback refill: under freelist_lock, pop niovs off the area freelist and
+ * hand them to page_pool_mp_return_in_cache() until the freelist is empty or
+ * pp->alloc.count reaches PP_ALLOC_CACHE_REFILL.
+ */
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+
+	spin_lock_bh(&area->freelist_lock);
+	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct net_iov *niov;
+		u32 pgid;
+
+		pgid = area->freelist[--area->free_count];
+		niov = &area->nia.niovs[pgid];
+
+		page_pool_mp_return_in_cache(pp, net_iov_to_netmem(niov));
+
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, net_iov_to_netmem(niov),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+/* Push @niov's index back onto its area's freelist under freelist_lock. */
+static void io_zcrx_recycle_niov(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+/*
+ * ->alloc_netmems: pop one netmem from pp->alloc.cache, refilling it first
+ * from the refill ring and, if that yields nothing, from the area freelist.
+ * Returns 0 when the cache is still empty afterwards.
+ */
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zcrx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+/*
+ * ->release_netmem: drop one reference on the net_iov and put it back on the
+ * area freelist if that was the last one. Always returns false.
+ */
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+
+	if (io_zcrx_niov_put(niov, 1))
+		io_zcrx_recycle_niov(niov);
+	return false;
+}
+
+/*
+ * ->scrub: for every niov in the area, try to drop a user reference and
+ * recycle the niov when that brings its count to zero.
+ */
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int count;
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		io_zcrx_recycle_niov(niov);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, net_iov_to_netmem(niov), count);
+	}
+}
+
+/*
+ * ->init: reject a pool without mp_priv, with a non-zero page order or without
+ * napi, then set up the area, take a ctx reference and record the pool in
+ * ifq->pp. Returns 0, -EINVAL or page_pool_mp_init_paged_area()'s error.
+ */
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area;
+	int ret;
+
+	/* Check ifq before dereferencing it to fetch the area. */
+	if (!ifq)
+		return -EINVAL;
+	if (pp->p.order != 0)
+		return -EINVAL;
+	if (!pp->p.napi)
+		return -EINVAL;
+
+	area = ifq->area;
+	ret = page_pool_mp_init_paged_area(pp, &area->nia, area->pages);
+	if (ret)
+		return ret;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+/*
+ * ->destroy: call page_pool_mp_release_area() and clear ifq->pp. The ctx
+ * reference is dropped only if free_count equals num_niovs; otherwise a
+ * one-time warning fires and the reference is kept.
+ */
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_area *area = ifq->area;
+
+	page_pool_mp_release_area(pp, &ifq->area->nia);
+
+	ifq->pp = NULL;
+
+	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+/* Memory provider callbacks for the io_uring zero copy receive area. */
+static const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_netmems = io_pp_zc_alloc_netmems,
+	.release_netmem = io_pp_zc_release_netmem,
+	.init = io_pp_zc_init,
+	.destroy = io_pp_zc_destroy,
+	.scrub = io_pp_zc_scrub,
+};
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 07742c0cfcf3..8515cde78a2c 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -5,6 +5,10 @@
 #include <linux/io_uring_types.h>
 #include <net/page_pool/types.h>
 
+/* A user space reference is accounted as IO_ZC_RX_UREF in pp_ref_count. */
+#define IO_ZC_RX_UREF 0x10000
+#define IO_ZC_RX_KREF_MASK (IO_ZC_RX_UREF - 1)
+
 struct io_zcrx_area {
 	struct net_iov_area nia;
 	struct io_zcrx_ifq *ifq;
@@ -22,10 +26,12 @@ struct io_zcrx_ifq {
 	struct io_ring_ctx *ctx;
 	struct net_device *dev;
 	struct io_zcrx_area *area;
+	struct page_pool *pp;
 
 	struct io_uring *rq_ring;
 	struct io_uring_zcrx_rqe *rqes;
 	u32 rq_entries;
+	u32 cached_rq_head;
 
 	u32 if_rxq;
 

[net-next,v8,11/17] io_uring/zcrx: implement zerocopy receive pp memory provider

Commit Message

Comments

Patch