
[net-next,v8,08/17] net: add helper executing custom callback from napi

Message ID 20241204172204.4180482-9-dw@davidwei.uk (mailing list archive)
State New
Series io_uring zero copy rx

Commit Message

David Wei Dec. 4, 2024, 5:21 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

It's useful to have napi-private bits and pieces, like the page pool's
fast allocation cache, so that the hot allocation path doesn't have to
do any additional synchronisation. In case of the io_uring memory
provider introduced in the following patches, we keep the consumer end
of the io_uring refill queue private to napi as it's a hot path.

However, from time to time we need to synchronise with the napi, for
example to add more user memory or to allocate fallback buffers. Add a
helper function, napi_execute, that runs a custom callback from within
napi context so that it can access and modify the napi-protected parts
of io_uring. It works similarly to busy polling and stops napi from
running in the meantime, so it's intended to be a slow control path.
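
A minimal usage sketch of the helper from a hypothetical caller; the
io_zc_provider struct and the io_zc_refill* names below are purely
illustrative and not part of this patch:

#include <net/busy_poll.h>

struct io_zc_provider {
	unsigned int napi_id;	/* napi instance backing this provider */
	/* ... napi-protected refill state ... */
};

/* Invoked by napi_execute() from within napi context, so it may touch
 * napi-private state (e.g. the page pool's fast cache) without any
 * additional synchronisation.
 */
static void io_zc_refill_cb(void *arg)
{
	struct io_zc_provider *p = arg;

	/* ... push new user buffers into the napi-private cache ... */
	(void)p;
}

/* Slow control path: stalls the napi while the callback runs. */
static void io_zc_refill(struct io_zc_provider *p)
{
	napi_execute(p->napi_id, io_zc_refill_cb, p);
}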

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/net/busy_poll.h |  6 +++
 net/core/dev.c          | 81 ++++++++++++++++++++++++++++++++---------
 2 files changed, 70 insertions(+), 17 deletions(-)

Comments

Jakub Kicinski Dec. 10, 2024, 3:44 a.m. UTC | #1
On Wed,  4 Dec 2024 09:21:47 -0800 David Wei wrote:
> It's useful to have napi-private bits and pieces, like the page pool's
> fast allocation cache, so that the hot allocation path doesn't have to
> do any additional synchronisation. In case of the io_uring memory
> provider introduced in the following patches, we keep the consumer end
> of the io_uring refill queue private to napi as it's a hot path.
> 
> However, from time to time we need to synchronise with the napi, for
> example to add more user memory or to allocate fallback buffers. Add a
> helper function, napi_execute, that runs a custom callback from within
> napi context so that it can access and modify the napi-protected parts
> of io_uring. It works similarly to busy polling and stops napi from
> running in the meantime, so it's intended to be a slow control path.

Let's leave this out, please. I seriously doubt this works reliably
and the bar for adding complexity to NAPI is fairly high. A commit
which doesn't even quote any perf numbers will certainly not clear it.
Pavel Begunkov Dec. 10, 2024, 4:11 a.m. UTC | #2
On 12/10/24 03:44, Jakub Kicinski wrote:
> On Wed,  4 Dec 2024 09:21:47 -0800 David Wei wrote:
>> It's useful to have napi-private bits and pieces, like the page pool's
>> fast allocation cache, so that the hot allocation path doesn't have to
>> do any additional synchronisation. In case of the io_uring memory
>> provider introduced in the following patches, we keep the consumer end
>> of the io_uring refill queue private to napi as it's a hot path.
>>
>> However, from time to time we need to synchronise with the napi, for
>> example to add more user memory or to allocate fallback buffers. Add a
>> helper function, napi_execute, that runs a custom callback from within
>> napi context so that it can access and modify the napi-protected parts
>> of io_uring. It works similarly to busy polling and stops napi from
>> running in the meantime, so it's intended to be a slow control path.
> 
> Let's leave this out, please. I seriously doubt this works reliably
> and the bar for adding complexity to NAPI is fairly high. A commit
> which doesn't even quote any perf numbers will certainly not clear it.

I reworked it and got rid of the patch, though I don't see why
performance would matter here. It's a very slow path, taken only once
all the other paths have failed; batching inside io_uring wasn't done
properly for simplicity's sake, but a follow-up for that was certainly
planned.

Patch

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c858270141bc..d2ae1b4bf20c 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -47,6 +47,7 @@  bool sk_busy_loop_end(void *p, unsigned long start_time);
 void napi_busy_loop(unsigned int napi_id,
 		    bool (*loop_end)(void *, unsigned long),
 		    void *loop_end_arg, bool prefer_busy_poll, u16 budget);
+void napi_execute(unsigned napi_id, void (*cb)(void *), void *cb_arg);
 
 void napi_busy_loop_rcu(unsigned int napi_id,
 			bool (*loop_end)(void *, unsigned long),
@@ -66,6 +67,11 @@  static inline bool sk_can_busy_loop(struct sock *sk)
 	return false;
 }
 
+static inline void napi_execute(unsigned napi_id,
+				void (*cb)(void *), void *cb_arg)
+{
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 static inline unsigned long busy_loop_current_time(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index 13d00fc10f55..590ded8cc544 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6351,6 +6351,30 @@  enum {
 	NAPI_F_END_ON_RESCHED	= 2,
 };
 
+static inline bool napi_state_start_busy_polling(struct napi_struct *napi,
+						 unsigned flags)
+{
+	unsigned long val = READ_ONCE(napi->state);
+
+	/* If multiple threads are competing for this napi,
+	 * we avoid dirtying napi->state as much as we can.
+	 */
+	if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+		   NAPIF_STATE_IN_BUSY_POLL))
+		goto fail;
+
+	if (cmpxchg(&napi->state, val,
+		    val | NAPIF_STATE_IN_BUSY_POLL |
+			  NAPIF_STATE_SCHED) != val)
+		goto fail;
+
+	return true;
+fail:
+	if (flags & NAPI_F_PREFER_BUSY_POLL)
+		set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
+	return false;
+}
+
 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 			   unsigned flags, u16 budget)
 {
@@ -6426,24 +6450,8 @@  static void __napi_busy_loop(unsigned int napi_id,
 		local_bh_disable();
 		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 		if (!napi_poll) {
-			unsigned long val = READ_ONCE(napi->state);
-
-			/* If multiple threads are competing for this napi,
-			 * we avoid dirtying napi->state as much as we can.
-			 */
-			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
-				   NAPIF_STATE_IN_BUSY_POLL)) {
-				if (flags & NAPI_F_PREFER_BUSY_POLL)
-					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
+			if (!napi_state_start_busy_polling(napi, flags))
 				goto count;
-			}
-			if (cmpxchg(&napi->state, val,
-				    val | NAPIF_STATE_IN_BUSY_POLL |
-					  NAPIF_STATE_SCHED) != val) {
-				if (flags & NAPI_F_PREFER_BUSY_POLL)
-					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
-				goto count;
-			}
 			have_poll_lock = netpoll_poll_lock(napi);
 			napi_poll = napi->poll;
 		}
@@ -6507,6 +6515,45 @@  void napi_busy_loop(unsigned int napi_id,
 }
 EXPORT_SYMBOL(napi_busy_loop);
 
+void napi_execute(unsigned napi_id,
+		  void (*cb)(void *), void *cb_arg)
+{
+	unsigned flags = NAPI_F_PREFER_BUSY_POLL;
+	void *have_poll_lock = NULL;
+	struct napi_struct *napi;
+
+	rcu_read_lock();
+	napi = napi_by_id(napi_id);
+	if (!napi) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+
+	for (;;) {
+		local_bh_disable();
+
+		if (napi_state_start_busy_polling(napi, flags)) {
+			have_poll_lock = netpoll_poll_lock(napi);
+			cb(cb_arg);
+			local_bh_enable();
+			busy_poll_stop(napi, have_poll_lock, flags, 1);
+			break;
+		}
+
+		local_bh_enable();
+		if (unlikely(need_resched()))
+			break;
+		cpu_relax();
+	}
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	rcu_read_unlock();
+}
+
 void napi_suspend_irqs(unsigned int napi_id)
 {
 	struct napi_struct *napi;