Message ID | 20241007221603.1703699-9-dw@davidwei.uk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | io_uring zero copy rx | expand |
On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote: > From: Pavel Begunkov <asml.silence@gmail.com> [...] > However, from time to time we need to synchronise with the napi, for > example to add more user memory or allocate fallback buffers. Add a > helper function napi_execute that allows to run a custom callback from > under napi context so that it can access and modify napi protected > parts of io_uring. It works similar to busy polling and stops napi from > running in the meantime, so it's supposed to be a slow control path. > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> > Signed-off-by: David Wei <dw@davidwei.uk> [...] > diff --git a/net/core/dev.c b/net/core/dev.c > index 1e740faf9e78..ba2f43cf5517 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id, > } > EXPORT_SYMBOL(napi_busy_loop); > > +void napi_execute(unsigned napi_id, > + void (*cb)(void *), void *cb_arg) > +{ > + struct napi_struct *napi; > + bool done = false; > + unsigned long val; > + void *have_poll_lock = NULL; > + > + rcu_read_lock(); > + > + napi = napi_by_id(napi_id); > + if (!napi) { > + rcu_read_unlock(); > + return; > + } > + > + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > + preempt_disable(); > + for (;;) { > + local_bh_disable(); > + val = READ_ONCE(napi->state); > + > + /* If multiple threads are competing for this napi, > + * we avoid dirtying napi->state as much as we can. > + */ > + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | > + NAPIF_STATE_IN_BUSY_POLL)) > + goto restart; > + > + if (cmpxchg(&napi->state, val, > + val | NAPIF_STATE_IN_BUSY_POLL | > + NAPIF_STATE_SCHED) != val) > + goto restart; > + > + have_poll_lock = netpoll_poll_lock(napi); > + cb(cb_arg); A lot of the above code seems quite similar to __napi_busy_loop, as you mentioned. It might be too painful, but I can't help but wonder if there's a way to refactor this to use common helpers or something? I had been thinking that the napi->state check / cmpxchg could maybe be refactored to avoid being repeated in both places?
On 10/8/24 23:25, Joe Damato wrote: > On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote: >> From: Pavel Begunkov <asml.silence@gmail.com> > > [...] > >> However, from time to time we need to synchronise with the napi, for >> example to add more user memory or allocate fallback buffers. Add a >> helper function napi_execute that allows to run a custom callback from >> under napi context so that it can access and modify napi protected >> parts of io_uring. It works similar to busy polling and stops napi from >> running in the meantime, so it's supposed to be a slow control path. >> >> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> >> Signed-off-by: David Wei <dw@davidwei.uk> > > [...] > >> diff --git a/net/core/dev.c b/net/core/dev.c >> index 1e740faf9e78..ba2f43cf5517 100644 >> --- a/net/core/dev.c >> +++ b/net/core/dev.c >> @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id, >> } >> EXPORT_SYMBOL(napi_busy_loop); >> >> +void napi_execute(unsigned napi_id, >> + void (*cb)(void *), void *cb_arg) >> +{ >> + struct napi_struct *napi; >> + bool done = false; >> + unsigned long val; >> + void *have_poll_lock = NULL; >> + >> + rcu_read_lock(); >> + >> + napi = napi_by_id(napi_id); >> + if (!napi) { >> + rcu_read_unlock(); >> + return; >> + } >> + >> + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >> + preempt_disable(); >> + for (;;) { >> + local_bh_disable(); >> + val = READ_ONCE(napi->state); >> + >> + /* If multiple threads are competing for this napi, >> + * we avoid dirtying napi->state as much as we can. >> + */ >> + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | >> + NAPIF_STATE_IN_BUSY_POLL)) >> + goto restart; >> + >> + if (cmpxchg(&napi->state, val, >> + val | NAPIF_STATE_IN_BUSY_POLL | >> + NAPIF_STATE_SCHED) != val) >> + goto restart; >> + >> + have_poll_lock = netpoll_poll_lock(napi); >> + cb(cb_arg); > > A lot of the above code seems quite similar to __napi_busy_loop, as > you mentioned. > > It might be too painful, but I can't help but wonder if there's a > way to refactor this to use common helpers or something? > > I had been thinking that the napi->state check / > cmpxchg could maybe be refactored to avoid being repeated in both > places? Yep, I can add a helper for that, but I'm not sure how to deduplicate it further while trying not to pollute the napi polling path.
On Wed, Oct 09, 2024 at 04:09:53PM +0100, Pavel Begunkov wrote: > On 10/8/24 23:25, Joe Damato wrote: > > On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote: > > > From: Pavel Begunkov <asml.silence@gmail.com> > > > > [...] > > > > > However, from time to time we need to synchronise with the napi, for > > > example to add more user memory or allocate fallback buffers. Add a > > > helper function napi_execute that allows to run a custom callback from > > > under napi context so that it can access and modify napi protected > > > parts of io_uring. It works similar to busy polling and stops napi from > > > running in the meantime, so it's supposed to be a slow control path. > > > > > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> > > > Signed-off-by: David Wei <dw@davidwei.uk> > > > > [...] > > > > > diff --git a/net/core/dev.c b/net/core/dev.c > > > index 1e740faf9e78..ba2f43cf5517 100644 > > > --- a/net/core/dev.c > > > +++ b/net/core/dev.c > > > @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id, > > > } > > > EXPORT_SYMBOL(napi_busy_loop); > > > +void napi_execute(unsigned napi_id, > > > + void (*cb)(void *), void *cb_arg) > > > +{ > > > + struct napi_struct *napi; > > > + bool done = false; > > > + unsigned long val; > > > + void *have_poll_lock = NULL; > > > + > > > + rcu_read_lock(); > > > + > > > + napi = napi_by_id(napi_id); > > > + if (!napi) { > > > + rcu_read_unlock(); > > > + return; > > > + } > > > + > > > + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) > > > + preempt_disable(); > > > + for (;;) { > > > + local_bh_disable(); > > > + val = READ_ONCE(napi->state); > > > + > > > + /* If multiple threads are competing for this napi, > > > + * we avoid dirtying napi->state as much as we can. > > > + */ > > > + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | > > > + NAPIF_STATE_IN_BUSY_POLL)) > > > + goto restart; > > > + > > > + if (cmpxchg(&napi->state, val, > > > + val | NAPIF_STATE_IN_BUSY_POLL | > > > + NAPIF_STATE_SCHED) != val) > > > + goto restart; > > > + > > > + have_poll_lock = netpoll_poll_lock(napi); > > > + cb(cb_arg); > > > > A lot of the above code seems quite similar to __napi_busy_loop, as > > you mentioned. > > > > It might be too painful, but I can't help but wonder if there's a > > way to refactor this to use common helpers or something? > > > > I had been thinking that the napi->state check / > > cmpxchg could maybe be refactored to avoid being repeated in both > > places? > > Yep, I can add a helper for that, but I'm not sure how to > deduplicate it further while trying not to pollute the > napi polling path. It was just a minor nit; I wouldn't want to hold back this important work just for that. I'm still looking at the code myself to see if I can see a better arrangement of the code. But that could always come later as a cleanup for -next ?
On 10/9/24 17:13, Joe Damato wrote: > On Wed, Oct 09, 2024 at 04:09:53PM +0100, Pavel Begunkov wrote: >> On 10/8/24 23:25, Joe Damato wrote: >>> On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote: >>>> From: Pavel Begunkov <asml.silence@gmail.com> >>> >>> [...] >>> >>>> However, from time to time we need to synchronise with the napi, for >>>> example to add more user memory or allocate fallback buffers. Add a >>>> helper function napi_execute that allows to run a custom callback from >>>> under napi context so that it can access and modify napi protected >>>> parts of io_uring. It works similar to busy polling and stops napi from >>>> running in the meantime, so it's supposed to be a slow control path. >>>> >>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> >>>> Signed-off-by: David Wei <dw@davidwei.uk> >>> >>> [...] >>> >>>> diff --git a/net/core/dev.c b/net/core/dev.c >>>> index 1e740faf9e78..ba2f43cf5517 100644 >>>> --- a/net/core/dev.c >>>> +++ b/net/core/dev.c >>>> @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id, >>>> } >>>> EXPORT_SYMBOL(napi_busy_loop); >>>> +void napi_execute(unsigned napi_id, >>>> + void (*cb)(void *), void *cb_arg) >>>> +{ >>>> + struct napi_struct *napi; >>>> + bool done = false; >>>> + unsigned long val; >>>> + void *have_poll_lock = NULL; >>>> + >>>> + rcu_read_lock(); >>>> + >>>> + napi = napi_by_id(napi_id); >>>> + if (!napi) { >>>> + rcu_read_unlock(); >>>> + return; >>>> + } >>>> + >>>> + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) >>>> + preempt_disable(); >>>> + for (;;) { >>>> + local_bh_disable(); >>>> + val = READ_ONCE(napi->state); >>>> + >>>> + /* If multiple threads are competing for this napi, >>>> + * we avoid dirtying napi->state as much as we can. >>>> + */ >>>> + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | >>>> + NAPIF_STATE_IN_BUSY_POLL)) >>>> + goto restart; >>>> + >>>> + if (cmpxchg(&napi->state, val, >>>> + val | NAPIF_STATE_IN_BUSY_POLL | >>>> + NAPIF_STATE_SCHED) != val) >>>> + goto restart; >>>> + >>>> + have_poll_lock = netpoll_poll_lock(napi); >>>> + cb(cb_arg); >>> >>> A lot of the above code seems quite similar to __napi_busy_loop, as >>> you mentioned. >>> >>> It might be too painful, but I can't help but wonder if there's a >>> way to refactor this to use common helpers or something? >>> >>> I had been thinking that the napi->state check / >>> cmpxchg could maybe be refactored to avoid being repeated in both >>> places? >> >> Yep, I can add a helper for that, but I'm not sure how to >> deduplicate it further while trying not to pollute the >> napi polling path. > > It was just a minor nit; I wouldn't want to hold back this important > work just for that. > > I'm still looking at the code myself to see if I can see a better > arrangement of the code. > > But that could always come later as a cleanup for -next ? It's still early, there will be a v6 anyway. And thanks for taking a look.
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f03040baaefd..3fd9e65731e9 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -47,6 +47,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_execute(unsigned napi_id, void (*cb)(void *), void *cb_arg); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), @@ -63,6 +64,11 @@ static inline bool sk_can_busy_loop(struct sock *sk) return false; } +static inline void napi_execute(unsigned napi_id, + void (*cb)(void *), void *cb_arg) +{ +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) diff --git a/net/core/dev.c b/net/core/dev.c index 1e740faf9e78..ba2f43cf5517 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_execute(unsigned napi_id, + void (*cb)(void *), void *cb_arg) +{ + struct napi_struct *napi; + bool done = false; + unsigned long val; + void *have_poll_lock = NULL; + + rcu_read_lock(); + + napi = napi_by_id(napi_id); + if (!napi) { + rcu_read_unlock(); + return; + } + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + for (;;) { + local_bh_disable(); + val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) + goto restart; + + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) + goto restart; + + have_poll_lock = netpoll_poll_lock(napi); + cb(cb_arg); + done = true; + gro_normal_list(napi); + local_bh_enable(); + break; +restart: + local_bh_enable(); + if (unlikely(need_resched())) + break; + cpu_relax(); + } + if (done) + busy_poll_stop(napi, have_poll_lock, false, 1); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ static void napi_hash_add(struct napi_struct *napi)