@@ -2,6 +2,7 @@
#define IO_URING_TYPES_H
#include <linux/blkdev.h>
+#include <linux/hashtable.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
@@ -276,6 +277,15 @@ struct io_ring_ctx {
struct xarray personalities;
u32 pers_next;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ struct list_head napi_list; /* track busy poll napi_id */
+ spinlock_t napi_lock; /* napi_list lock */
+
+ DECLARE_HASHTABLE(napi_ht, 4); /* napi_id -> io_napi_ht_entry */
+ unsigned int napi_busy_poll_to; /* napi busy poll default timeout */
+ bool napi_prefer_busy_poll;
+#endif
+
struct {
/*
* We cache a range of free CQEs we can use, once exhausted it
@@ -9,3 +9,4 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
obj-$(CONFIG_IO_WQ) += io-wq.o
+obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
@@ -90,6 +90,7 @@
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
+#include "napi.h"
#include "notif.h"
#include "timeout.h"
@@ -335,6 +336,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ io_napi_init(ctx);
+
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -2566,15 +2569,31 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
iowq.timeout = KTIME_MAX;
- if (uts) {
- struct timespec64 ts;
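+ /*
+ * If any napi ids have been registered, carve the busy poll budget out
+ * of the wait timeout and busy poll before going to sleep.
+ */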
+ if (!io_napi(ctx)) {
+ if (uts) {
+ struct timespec64 ts;
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
+ } else {
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+
+ io_napi_adjust_timeout(ctx, &iowq, &ts);
+ iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ } else {
+ io_napi_adjust_timeout(ctx, &iowq, NULL);
+ }
+ io_napi_busy_loop(ctx, &iowq);
}
trace_io_uring_cqring_wait(ctx, min_events);
+
do {
unsigned long check_cq;
@@ -2806,6 +2825,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_req_caches_free(ctx);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
+ io_napi_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->dummy_ubuf);
@@ -34,6 +34,10 @@ struct io_wait_queue {
unsigned nr_timeouts;
ktime_t timeout;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned int napi_busy_poll_to;
+ bool napi_prefer_busy_poll;
+#endif
};
static inline bool io_should_wake(struct io_wait_queue *iowq)
new file mode 100644
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "io_uring.h"
+#include "napi.h"
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+/* Timeout for cleanup of stale entries. */
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct io_napi_ht_entry {
+ unsigned int napi_id;
+ struct list_head list;
+
+ /* Covered by the napi_lock spinlock. */
+ unsigned long timeout;
+ struct hlist_node node;
+};
+
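+/*
+ * Record the napi id of the socket behind @file: refresh the timeout if the
+ * id is already tracked, otherwise add a new entry to the hash table and list.
+ */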
+void __io_napi_add(struct io_ring_ctx *ctx, struct file *file)
+{
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ struct io_napi_ht_entry *he;
+
+ sock = sock_from_file(file);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+
+ /* Non-NAPI IDs can be rejected. */
+ if (napi_id < MIN_NAPI_ID)
+ return;
+
+ spin_lock(&ctx->napi_lock);
+ hash_for_each_possible(ctx->napi_ht, he, node, napi_id) {
+ if (he->napi_id == napi_id) {
+ he->timeout = jiffies + NAPI_TIMEOUT;
+ goto out;
+ }
+ }
+
+ he = kmalloc(sizeof(*he), GFP_NOWAIT);
+ if (!he)
+ goto out;
+
+ he->napi_id = napi_id;
+ he->timeout = jiffies + NAPI_TIMEOUT;
+ hash_add(ctx->napi_ht, &he->node, napi_id);
+
+ list_add_tail(&he->list, &ctx->napi_list);
+
+out:
+ spin_unlock(&ctx->napi_lock);
+}
+
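+/*
+ * Split the user supplied wait time between busy polling and sleeping: if
+ * the wait time exceeds the busy poll timeout, busy poll for the full budget
+ * and sleep for the remainder; otherwise spend the entire wait busy polling.
+ * E.g. with a 200 usec busy poll timeout and a 1 msec wait, busy poll for
+ * 200 usec and leave 800 usec for the regular wait.
+ */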
+static inline void adjust_timeout(unsigned int poll_to, struct timespec64 *ts,
+ unsigned int *new_poll_to)
+{
+ struct timespec64 pollto = ns_to_timespec64(1000 * (s64)poll_to);
+
+ if (timespec64_compare(ts, &pollto) > 0) {
+ *ts = timespec64_sub(*ts, pollto);
+ *new_poll_to = poll_to;
+ } else {
+ u64 to = timespec64_to_ns(ts);
+
+ do_div(to, 1000);
+ *new_poll_to = to;
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ }
+}
+
+static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
+ unsigned long bp_usec)
+{
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ }
+
+ return true;
+}
+
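+/*
+ * Loop termination condition for napi_busy_loop(): stop once a signal is
+ * pending, enough completions have posted (io_should_wake), or the busy
+ * poll budget has been used up.
+ */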
+static bool io_napi_busy_loop_should_end(void *p, unsigned long start_time)
+{
+ struct io_wait_queue *iowq = p;
+
+ return signal_pending(current) ||
+ io_should_wake(iowq) ||
+ io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to);
+}
+
+static bool __io_napi_busy_loop(struct list_head *napi_list, bool prefer_busy_poll)
+{
+ struct io_napi_ht_entry *e;
+ struct io_napi_ht_entry *n;
+
+ list_for_each_entry_safe(e, n, napi_list, list) {
+ napi_busy_loop(e->napi_id, NULL, NULL, prefer_busy_poll,
+ BUSY_POLL_BUDGET);
+ }
+
+ return !list_empty(napi_list);
+}
+
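+/*
+ * With more than one napi id registered, do one busy poll pass across all
+ * ids per iteration and keep going until the termination condition above
+ * is met.
+ */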
+static void io_napi_multi_busy_loop(struct list_head *napi_list,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time = busy_loop_current_time();
+
+ do {
+ if (list_is_singular(napi_list))
+ break;
+ if (!__io_napi_busy_loop(napi_list, iowq->napi_prefer_busy_poll))
+ break;
+ } while (!io_napi_busy_loop_should_end(iowq, start_time));
+}
+
+static void io_napi_blocking_busy_loop(struct list_head *napi_list,
+ struct io_wait_queue *iowq)
+{
+ if (!list_is_singular(napi_list))
+ io_napi_multi_busy_loop(napi_list, iowq);
+
+ if (list_is_singular(napi_list)) {
+ struct io_napi_ht_entry *ne;
+
+ ne = list_first_entry(napi_list, struct io_napi_ht_entry, list);
+ napi_busy_loop(ne->napi_id, io_napi_busy_loop_should_end, iowq,
+ iowq->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+ }
+}
+
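+/*
+ * Prune ids that have not been refreshed by __io_napi_add() within
+ * NAPI_TIMEOUT (60 seconds).
+ */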
+static void io_napi_remove_stale(struct io_ring_ctx *ctx)
+{
+ unsigned int i;
+ struct hlist_node *tmp;
+ struct io_napi_ht_entry *he;
+
+ hash_for_each_safe(ctx->napi_ht, i, tmp, he, node) {
+ if (time_after(jiffies, he->timeout)) {
+ list_del(&he->list);
+ hash_del(&he->node);
+ kfree(he);
+ }
+ }
+}
+
+static void io_napi_merge_lists(struct io_ring_ctx *ctx,
+ struct list_head *napi_list)
+{
+ spin_lock(&ctx->napi_lock);
+ list_splice(napi_list, &ctx->napi_list);
+ io_napi_remove_stale(ctx);
+ spin_unlock(&ctx->napi_lock);
+}
+
+/*
+ * io_napi_init() - Init napi settings
+ * @ctx: pointer to io-uring context structure
+ *
+ * Init napi settings in the io-uring context.
+ */
+void io_napi_init(struct io_ring_ctx *ctx)
+{
+ INIT_LIST_HEAD(&ctx->napi_list);
+ spin_lock_init(&ctx->napi_lock);
+ ctx->napi_prefer_busy_poll = false;
+ ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+}
+
+/*
+ * io_napi_free() - Deallocate napi
+ * @ctx: pointer to io-uring context structure
+ *
+ * Free the napi list and the hash table in the io-uring context.
+ */
+void io_napi_free(struct io_ring_ctx *ctx)
+{
+ unsigned int i;
+ struct hlist_node *tmp;
+ struct io_napi_ht_entry *he;
+
+ spin_lock(&ctx->napi_lock);
+ hash_for_each_safe(ctx->napi_ht, i, tmp, he, node) {
+ hash_del(&he->node);
+ kfree(he);
+ }
+ spin_unlock(&ctx->napi_lock);
+}
+
+/*
+ * io_napi_adjust_timeout() - Adjust the napi busy poll timeout
+ * @ctx: pointer to io-uring context structure
+ * @iowq: pointer to io wait queue
+ * @ts: pointer to timespec or NULL
+ *
+ * Adjust the busy loop timeout according to timespec and busy poll timeout.
+ */
+void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
+ struct timespec64 *ts)
+{
+ if (ts)
+ adjust_timeout(READ_ONCE(ctx->napi_busy_poll_to), ts,
+ &iowq->napi_busy_poll_to);
+ else
+ iowq->napi_busy_poll_to = READ_ONCE(ctx->napi_busy_poll_to);
+}
+
+/*
+ * io_napi_busy_loop() - Execute the busy poll loop
+ * @ctx: pointer to io-uring context structure
+ * @iowq: pointer to io wait queue
+ *
+ * Execute the busy poll loop and merge the spliced off list.
+ */
+void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
+{
+ iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+
+ /* SQPOLL is handled in sqthread. */
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ LIST_HEAD(napi_list);
+
+ spin_lock(&ctx->napi_lock);
+ list_splice_init(&ctx->napi_list, &napi_list);
+ spin_unlock(&ctx->napi_lock);
+
+ if (iowq->napi_busy_poll_to)
+ io_napi_blocking_busy_loop(&napi_list, iowq);
+
+ io_napi_merge_lists(ctx, &napi_list);
+ }
+}
+
+#endif
new file mode 100644
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef IOU_NAPI_H
+#define IOU_NAPI_H
+
+#include <linux/kernel.h>
+#include <linux/io_uring.h>
+#include <net/busy_poll.h>
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+void io_napi_init(struct io_ring_ctx *ctx);
+void io_napi_free(struct io_ring_ctx *ctx);
+
+void __io_napi_add(struct io_ring_ctx *ctx, struct file *file);
+
+void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq, struct timespec64 *ts);
+void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
+
+static inline bool io_napi(struct io_ring_ctx *ctx)
+{
+ return !list_empty(&ctx->napi_list);
+}
+
+/*
+ * io_napi_add() - Add napi id to the busy poll list
+ * @req: pointer to io_kiocb request
+ *
+ * Add the napi id of the socket to the napi busy poll list and hash table.
+ */
+static inline void io_napi_add(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!READ_ONCE(ctx->napi_busy_poll_to))
+ return;
+
+ __io_napi_add(ctx, req->file);
+}
+
+#else
+
+static inline void io_napi_init(struct io_ring_ctx *ctx)
+{
+}
+
+static inline void io_napi_free(struct io_ring_ctx *ctx)
+{
+}
+
+static inline bool io_napi(struct io_ring_ctx *ctx)
+{
+ return false;
+}
+
+static inline void io_napi_add(struct io_kiocb *req)
+{
+}
+
+#define io_napi_adjust_timeout(ctx, iowq, ts) do {} while (0)
+#define io_napi_busy_loop(ctx, iowq) do {} while (0)
+
+#endif
+
+#endif
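
As background on how this is exercised from user space: io_napi_init() seeds the
per-ring busy poll timeout from the existing net.core.busy_poll sysctl,
io_napi_add() records a socket's napi id when its poll handler is armed, and
io_cqring_wait() busy polls those ids before sleeping. Below is a rough liburing
sketch of the resulting behaviour, illustrative only: it assumes net.core.busy_poll
has been set to a non-zero number of microseconds (e.g. sysctl -w
net.core.busy_poll=200) and that connected_fd is a connected socket that has
already received traffic on a NAPI-capable device, so it carries a valid napi id.

    #include <liburing.h>
    #include <stdio.h>

    int wait_for_data(int connected_fd)
    {
            struct io_uring ring;
            struct io_uring_sqe *sqe;
            struct io_uring_cqe *cqe;
            char buf[4096];
            int ret;

            ret = io_uring_queue_init(8, &ring, 0);
            if (ret < 0)
                    return ret;

            sqe = io_uring_get_sqe(&ring);
            /* A recv that cannot complete inline arms poll and records the napi id. */
            io_uring_prep_recv(sqe, connected_fd, buf, sizeof(buf), 0);
            io_uring_submit(&ring);

            /* Busy polls the recorded napi id, then falls back to sleeping. */
            ret = io_uring_wait_cqe(&ring, &cqe);
            if (!ret) {
                    printf("recv result: %d\n", cqe->res);
                    io_uring_cqe_seen(&ring, cqe);
            }

            io_uring_queue_exit(&ring);
            return ret;
    }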
@@ -15,6 +15,7 @@
#include "io_uring.h"
#include "refs.h"
+#include "napi.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
@@ -629,6 +630,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
__io_poll_execute(req, mask);
return 0;
}
+ io_napi_add(req);
if (ipt->owning) {
/*