diff mbox series

[RFC,v4,11/16] io_uring: implement pp memory provider for zc rx

Message ID 20240312214430.2923019-12-dw@davidwei.uk (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series Zero copy Rx using io_uring | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply, async

Commit Message

David Wei March 12, 2024, 9:44 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

Implement a new pp memory provider for io_uring zerocopy receive.

All buffers are backed by struct io_zc_rx_buf, which is a thin extension
of struct net_iov. Initially, all of them are unallocated and placed in
a spinlock protected ->freelist. Then, they will be allocate via
the ->alloc_pages callback, which sets refcount to 1.

Later, buffers would either be dropped by the net stack and recycled
back into page pool / released by ->release_page, or, more likely, get
transferred to the userspace by posting a corresponding CQE and
elevating refcount by IO_ZC_RX_UREF. When the user is done with a buffer,
it should be put into the refill ring.

Next time io_pp_zc_alloc_pages() runs it'll check the ring, put user
refs and ultimately grab buffers from there. That's done in the attached
napi context and so doesn't need any additional synchronisation. That is
the second hottest path after getting a buffer from the pp lockless cache.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/linux/io_uring/net.h  |   5 +
 include/net/page_pool/types.h |   1 +
 io_uring/zc_rx.c              | 202 ++++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h              |   5 +
 net/core/page_pool.c          |   2 +-
 5 files changed, 214 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
index 05d5a6a97264..a225d7090b6b 100644
--- a/include/linux/io_uring/net.h
+++ b/include/linux/io_uring/net.h
@@ -12,6 +12,11 @@  struct io_zc_rx_buf {
 };
 
 #if defined(CONFIG_IO_URING)
+
+#if defined(CONFIG_PAGE_POOL)
+extern const struct memory_provider_ops io_uring_pp_zc_ops;
+#endif
+
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 #else
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 347837b83d36..9e91f2cdbe61 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -227,6 +227,7 @@  netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
 					  int cpuid);
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
 
 struct xdp_mem_info;
 
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 326ae3fcc643..b2507df121fb 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -8,6 +8,7 @@ 
 #include <linux/nospec.h>
 #include <net/tcp.h>
 #include <net/af_unix.h>
+#include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -357,4 +358,205 @@  int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+static inline struct io_zc_rx_buf *io_niov_to_buf(struct net_iov *niov)
+{
+	return container_of(niov, struct io_zc_rx_buf, niov);
+}
+
+static inline unsigned io_buf_pgid(struct io_zc_rx_pool *pool,
+				   struct io_zc_rx_buf *buf)
+{
+	return buf - pool->bufs;
+}
+
+static __maybe_unused void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
+{
+	atomic_long_add(IO_ZC_RX_UREF, &buf->niov.pp_ref_count);
+}
+
+static bool io_zc_rx_buf_put(struct io_zc_rx_buf *buf, int nr)
+{
+	return atomic_long_sub_and_test(nr, &buf->niov.pp_ref_count);
+}
+
+static bool io_zc_rx_put_buf_uref(struct io_zc_rx_buf *buf)
+{
+	if (atomic_long_read(&buf->niov.pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zc_rx_buf_put(buf, IO_ZC_RX_UREF);
+}
+
+static inline netmem_ref io_zc_buf_to_netmem(struct io_zc_rx_buf *buf)
+{
+	return net_iov_to_netmem(&buf->niov);
+}
+
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct io_zc_rx_buf *buf)
+{
+	netmem_ref netmem = io_zc_buf_to_netmem(buf);
+
+	page_pool_set_pp_info(pp, netmem);
+	pp->alloc.cache[pp->alloc.count++] = netmem;
+}
+
+static inline u32 io_zc_rx_rqring_entries(struct io_zc_rx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
+}
+
+static void io_zc_rx_ring_refill(struct page_pool *pp,
+				 struct io_zc_rx_ifq *ifq)
+{
+	unsigned int entries = io_zc_rx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	if (unlikely(!entries))
+		return;
+
+	while (entries--) {
+		unsigned int rq_idx = ifq->cached_rq_head++ & mask;
+		struct io_uring_rbuf_rqe *rqe = &ifq->rqes[rq_idx];
+		u32 pgid = rqe->off / PAGE_SIZE;
+		struct io_zc_rx_buf *buf = &pool->bufs[pgid];
+
+		if (!io_zc_rx_put_buf_uref(buf))
+			continue;
+		io_zc_add_pp_cache(pp, buf);
+		if (pp->alloc.count >= PP_ALLOC_CACHE_REFILL)
+			break;
+	}
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+static void io_zc_rx_refill_slow(struct page_pool *pp, struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	spin_lock_bh(&pool->freelist_lock);
+	while (pool->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct io_zc_rx_buf *buf;
+		u32 pgid;
+
+		pgid = pool->freelist[--pool->free_count];
+		buf = &pool->bufs[pgid];
+
+		io_zc_add_pp_cache(pp, buf);
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, io_zc_buf_to_netmem(buf),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&pool->freelist_lock);
+}
+
+static void io_zc_rx_recycle_buf(struct io_zc_rx_pool *pool,
+				 struct io_zc_rx_buf *buf)
+{
+	spin_lock_bh(&pool->freelist_lock);
+	pool->freelist[pool->free_count++] = io_buf_pgid(pool, buf);
+	spin_unlock_bh(&pool->freelist_lock);
+}
+
+static netmem_ref io_pp_zc_alloc_pages(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zc_rx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zc_rx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+static bool io_pp_zc_release_page(struct page_pool *pp, netmem_ref netmem)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_buf *buf;
+	struct net_iov *niov;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+	buf = io_niov_to_buf(niov);
+
+	if (io_zc_rx_buf_put(buf, 1))
+		io_zc_rx_recycle_buf(ifq->pool, buf);
+	return false;
+}
+
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_pool *pool = ifq->pool;
+	int i;
+
+	for (i = 0; i < pool->nr_bufs; i++) {
+		struct io_zc_rx_buf *buf = &pool->bufs[i];
+		int count;
+
+		if (!io_zc_rx_put_buf_uref(buf))
+			continue;
+		io_zc_rx_recycle_buf(pool, buf);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, io_zc_buf_to_netmem(buf), count);
+	}
+}
+
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+
+	if (!ifq)
+		return -EINVAL;
+	if (pp->p.order != 0)
+		return -EINVAL;
+	if (!pp->p.napi)
+		return -EINVAL;
+	if (pp->p.flags & PP_FLAG_DMA_MAP)
+		return -EOPNOTSUPP;
+	if (pp->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		return -EOPNOTSUPP;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	ifq->pp = NULL;
+
+	if (WARN_ON_ONCE(pool->free_count != pool->nr_bufs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_pages		= io_pp_zc_alloc_pages,
+	.release_page		= io_pp_zc_release_page,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.scrub			= io_pp_zc_scrub,
+};
+EXPORT_SYMBOL(io_uring_pp_zc_ops);
+
+
 #endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 466b2b8f9813..c02bf8cabc6c 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -10,6 +10,9 @@ 
 #define IO_ZC_IFQ_IDX_OFFSET		16
 #define IO_ZC_IFQ_IDX_MASK		((1U << IO_ZC_IFQ_IDX_OFFSET) - 1)
 
+#define IO_ZC_RX_UREF			0x10000
+#define IO_ZC_RX_KREF_MASK		(IO_ZC_RX_UREF - 1)
+
 struct io_zc_rx_pool {
 	struct io_zc_rx_ifq	*ifq;
 	struct io_zc_rx_buf	*bufs;
@@ -26,10 +29,12 @@  struct io_zc_rx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
 	struct io_zc_rx_pool		*pool;
+	struct page_pool		*pp;
 
 	struct io_uring			*rq_ring;
 	struct io_uring_rbuf_rqe 	*rqes;
 	u32				rq_entries;
+	u32				cached_rq_head;
 
 	/* hw rx descriptor ring id */
 	u32				if_rxq_id;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index fc92e551ed13..f83ddbb4ebd8 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -460,7 +460,7 @@  static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 	return false;
 }
 
-static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 {
 	netmem_set_pp(netmem, pool);
 	netmem_or_pp_magic(netmem, PP_SIGNATURE);