
[v5] io_uring: releasing CPU resources when polling

Message ID 20240619071826.1553543-1-xue01.he@samsung.com (mailing list archive)
State New
Series [v5] io_uring: releasing CPU resources when polling

Commit Message

hexue June 19, 2024, 7:18 a.m. UTC
io_uring's polling mode can improve IO performance, but it spends 100% of
a CPU's time doing the polling.

This adds a new setup flag, IORING_SETUP_HY_POLL, which gives the
application an interface to enable hybrid polling at the io_uring level.
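
For illustration, enabling it from userspace would look roughly like the
sketch below (io_uring_queue_init_params() and IORING_SETUP_IOPOLL are
existing liburing/io_uring API; IORING_SETUP_HY_POLL is the flag added
by this patch):

	#include <liburing.h>

	static int setup_hybrid_poll_ring(struct io_uring *ring)
	{
		struct io_uring_params p = { };

		/* hybrid poll only applies on top of IOPOLL */
		p.flags = IORING_SETUP_IOPOLL | IORING_SETUP_HY_POLL;

		/* kernels without this patch reject the flag with -EINVAL */
		return io_uring_queue_init_params(64, ring, &p);
	}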

The hybrid poll is implemented in the io_uring layer. Once an IO is
issued, the task does not start polling immediately; it sleeps first,
wakes up shortly before the IO is expected to complete, and then polls
to reap it. This keeps most of polling's performance while freeing up
some CPU time.

More complex situations were taken into account, such as high
concurrency and multiple disks with different processing speeds.

Test results:
set 8 poll queues, fio-3.35, Gen5 SSD, 8 CPU VM

per CPU utilization:
    read(128k, QD64, 1Job)     53%   write(128k, QD64, 1Job)     45%
    randread(4k, QD64, 16Job)  70%   randwrite(4k, QD64, 16Job)  16%
performance reduction:
    read  0.92%     write  0.92%    randread  1.61%    randwrite  0%

--

changes since v4:
- Rewrote the commit message
- Updated the test results
- Reorganized the code based on 6.11

changes since v3:
- Simplified the commit message
- Added some comments to the code

changes since v2:
- Fixed some formatting errors
- Moved the check into the poll path

changes since v1:
- Extended hybrid poll to async polled IO

Signed-off-by: hexue <xue01.he@samsung.com>
---
 include/linux/io_uring_types.h |  14 ++++
 include/uapi/linux/io_uring.h  |   1 +
 io_uring/io_uring.c            |   4 +-
 io_uring/io_uring.h            |   3 +
 io_uring/rw.c                  | 115 ++++++++++++++++++++++++++++++++-
 5 files changed, 135 insertions(+), 2 deletions(-)

Comments

Pavel Begunkov June 19, 2024, 3:22 p.m. UTC | #1
On 6/19/24 08:18, hexue wrote:
> io_uring's polling mode can improve IO performance, but it spends 100% of
> a CPU's time doing the polling.
> 
> This adds a new setup flag, IORING_SETUP_HY_POLL, which gives the
> application an interface to enable hybrid polling at the io_uring level.
> 
> The hybrid poll is implemented in the io_uring layer. Once an IO is
> issued, the task does not start polling immediately; it sleeps first,
> wakes up shortly before the IO is expected to complete, and then polls
> to reap it. This keeps most of polling's performance while freeing up
> some CPU time.
> 
> More complex situations were taken into account, such as high
> concurrency and multiple disks with different processing speeds.
> 
> Test results:
> set 8 poll queues, fio-3.35, Gen5 SSD, 8 CPU VM
> 
> per CPU utilization:
>      read(128k, QD64, 1Job)     53%   write(128k, QD64, 1Job)     45%
>      randread(4k, QD64, 16Job)  70%   randwrite(4k, QD64, 16Job)  16%
> performance reduction:
>      read  0.92%     write  0.92%    randread  1.61%    randwrite  0%

What are numbers for normal / non-IOPOLL runs?



> --
> 
> changes since v4:
> - Rewrote the commit message
> - Updated the test results
> - Reorganized the code based on 6.11
> 
> changes since v3:
> - Simplified the commit message
> - Added some comments to the code
> 
> changes since v2:
> - Fixed some formatting errors
> - Moved the check into the poll path
> 
> changes since v1:
> - Extended hybrid poll to async polled IO
> 
> Signed-off-by: hexue <xue01.he@samsung.com>
> ---
>   include/linux/io_uring_types.h |  14 ++++
>   include/uapi/linux/io_uring.h  |   1 +
>   io_uring/io_uring.c            |   4 +-
>   io_uring/io_uring.h            |   3 +
>   io_uring/rw.c                  | 115 ++++++++++++++++++++++++++++++++-
>   5 files changed, 135 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 91224bbcfa73..8eab99c4122e 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -226,6 +226,11 @@ struct io_alloc_cache {
>   	size_t			elem_size;
>   };
>   
> +struct iopoll_info {
> +	long		last_runtime;
> +	long		last_irqtime;
> +};

Please follow the naming convention for all types you
add: "io_...".
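
i.e. something along the lines of (the exact name is just a suggestion):

	struct io_iopoll_info {
		long		last_runtime;
		long		last_irqtime;
	};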

> +
>   struct io_ring_ctx {
>   	/* const or read-mostly hot data */
>   	struct {
> @@ -428,6 +433,7 @@ struct io_ring_ctx {
>   	unsigned short			n_sqe_pages;
>   	struct page			**ring_pages;
>   	struct page			**sqe_pages;
> +	struct xarray		poll_array;
>   };
>   
>   struct io_tw_state {
> @@ -591,6 +597,12 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
>   )
>   #define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
>   
> +struct hy_poll_time {
> +	int		poll_state;
> +	struct timespec64		iopoll_start;
> +	struct timespec64		iopoll_end;

why not u64 and ktime_get_ns()?
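
For reference, a sketch of what that could look like (names are only
suggestions, also folding in the io_ prefix and turning poll_state into
a bool as noted below):

	struct io_hybrid_poll_time {
		bool	sleep_done;	/* already slept once for this request */
		u64	poll_start;	/* ktime_get_ns() at submission */
		u64	poll_end;	/* ktime_get_ns() after the iopoll */
	};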

> +};
> +
>   struct io_kiocb {
>   	union {
>   		/*
> @@ -665,6 +677,8 @@ struct io_kiocb {
>   		u64			extra1;
>   		u64			extra2;
>   	} big_cqe;
> +    /* for hybrid iopoll */
> +	struct hy_poll_time		*hy_poll;
>   };
>   
>   struct io_overflow_cqe {
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 994bf7af0efe..ef32ec319d1f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -199,6 +199,7 @@ enum io_uring_sqe_flags_bit {
>    * Removes indirection through the SQ index array.
>    */
>   #define IORING_SETUP_NO_SQARRAY		(1U << 16)
> +#define IORING_SETUP_HY_POLL	(1U << 17)
>   
>   enum io_uring_op {
>   	IORING_OP_NOP,
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 816e93e7f949..a1015ce6dde7 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -299,6 +299,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>   		goto err;
>   
>   	ctx->flags = p->flags;
> +	xa_init(&ctx->poll_array);
>   	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
>   	init_waitqueue_head(&ctx->sqo_sq_wait);
>   	INIT_LIST_HEAD(&ctx->sqd_list);
> @@ -2679,6 +2680,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>   	kfree(ctx->cancel_table.hbs);
>   	kfree(ctx->cancel_table_locked.hbs);
>   	xa_destroy(&ctx->io_bl_xa);
> +	xa_destroy(&ctx->poll_array);
>   	kfree(ctx);
>   }
>   
> @@ -3637,7 +3639,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
>   			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
>   			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
>   			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
> -			IORING_SETUP_NO_SQARRAY))
> +			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
>   		return -EINVAL;
>   
>   	return io_uring_create(entries, &p, params);
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 624ca9076a50..665093c048ba 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -148,6 +148,9 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
>   		__io_submit_flush_completions(ctx);
>   }
>   
> +/* if sleep time less than 1us, then do not do the schedule op */
> +#define MIN_SCHETIME 1000

Add prefix IO_, and name should be less ambiguous.

e.g. IO_HYBRID_POLL_MIN_SLEEP
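
Spelled out with the value from the patch:

	/* don't bother scheduling out for sleeps shorter than 1us */
	#define IO_HYBRID_POLL_MIN_SLEEP	1000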

> +
>   #define io_for_each_link(pos, head) \
>   	for (pos = (head); pos; pos = pos->link)
>   
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 1a2128459cb4..48b162becfa2 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -772,6 +772,46 @@ static bool need_complete_io(struct io_kiocb *req)
>   		S_ISBLK(file_inode(req->file)->i_mode);
>   }
>   
> +static void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req)
> +{
> +	/*
> +	 * In multiple concurrency, a thread may operate several files
> +	 * under different file systems, the inode numbers may be
> +	 * duplicated. Each device has a different IO command processing
> +	 * capability, so using device number to record the running time
> +	 * of device
> +	 */
> +	u32 index = req->file->f_inode->i_rdev;
> +	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
> +	struct hy_poll_time *hpt = kmalloc(sizeof(struct hy_poll_time), GFP_KERNEL);

Just a note, it's a performance hazard for anything running
fast enough. Per IO allocation + xarray + ktime...

> +
> +	/* if alloc fail, go to regular poll */
> +	if (!hpt) {
> +		ctx->flags &= ~IORING_SETUP_HY_POLL;

Please don't modify ctx->flags, if you want to fall back
to normal polling do it per IO.
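
A minimal way to do that, assuming a NULL req->hy_poll is taken to mean
"no hybrid poll for this request" (sketch only, not what the patch
currently does):

	/* init_hybrid_poll(): on allocation failure just leave
	 * req->hy_poll NULL and return, without touching ctx->flags */

	/* io_do_iopoll(): only requests that got their state allocated
	 * take the hybrid path, everything else polls as before */
	if ((ctx->flags & IORING_SETUP_HY_POLL) && req->hy_poll)
		ret = io_uring_hybrid_poll(req, &iob, poll_flags);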

> +		return;
> +	}
> +	hpt->poll_state = 0;

What's 0/1? Just turn it into a bool with the right name
or add named constants for states.

> +	req->hy_poll = hpt;
> +
> +	if (!entry) {
> +		entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
> +		if (!entry) {
> +			ctx->flags &= ~IORING_SETUP_HY_POLL;
> +			return;
> +		}
> +		entry->last_runtime = 0;
> +		entry->last_irqtime = 0;
> +		xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
> +	}
> +
> +	/*
> +	 * Here we need nanosecond timestamps, some ways of reading
> +	 * timestamps directly are only accurate to microseconds, so
> +	 * there's no better alternative here for now
> +	 */
> +	ktime_get_ts64(&hpt->iopoll_start);
> +}
> +
>   static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
>   {
>   	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> @@ -809,6 +849,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
>   		kiocb->ki_flags |= IOCB_HIPRI;
>   		kiocb->ki_complete = io_complete_rw_iopoll;
>   		req->iopoll_completed = 0;
> +		if (ctx->flags & IORING_SETUP_HY_POLL)
> +			init_hybrid_poll(ctx, req);
>   	} else {
>   		if (kiocb->ki_flags & IOCB_HIPRI)
>   			return -EINVAL;
> @@ -1106,6 +1148,75 @@ void io_rw_fail(struct io_kiocb *req)
>   	io_req_set_res(req, res, req->cqe.flags);
>   }
>   
> +static void io_delay(struct hy_poll_time *hpt, struct iopoll_info *entry)
> +{
> +	struct hrtimer_sleeper timer;
> +	struct timespec64 tc, oldtc;
> +	enum hrtimer_mode mode;
> +	ktime_t kt;
> +	long sleep_ti;
> +
> +	if (hpt->poll_state == 1)
> +		return;
> +
> +	if (entry->last_runtime <= entry->last_irqtime)
> +		return;
> +
> +	/*
> +	 * Avoid excessive scheduling time affecting performance
> +	 * by using only 25 per cent of the remaining time
> +	 */
> +	sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;

Let's say you've got a new entry, which sets both to 0. Then
user does sleep(8 min) and iopolls it once, which will update
last_runtime=8 min. Next time it tries to iopoll,
(8 min - 0) / 4 = 2 min. It'll try to sleep for 2 minutes,
with UNINTERRUPTIBLE (below) at that.

Extreme, but shows there can be all kinds of fun with this scheme.
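
One possible mitigation, purely as a sketch (IO_HYBRID_POLL_MAX_SLEEP is
hypothetical, nothing in the patch defines it), would be to clamp the
computed sleep:

	/* don't trust a stale or absurd sample for an unbounded sleep */
	sleep_ti = min_t(long, sleep_ti, IO_HYBRID_POLL_MAX_SLEEP);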

> +
> +	/*
> +	 * If the time available for sleep is too short, i.e. the
> +	 * totle running time and the context switching loss time
> +	 * are very close to each other, the scheduling operation
> +	 * is not performed to avoid increasing latency
> +	 */
> +	if (sleep_ti < MIN_SCHETIME)
> +		return;
> +
> +	ktime_get_ts64(&oldtc);
> +	kt = ktime_set(0, sleep_ti);
> +	hpt->poll_state = 1;
> +
> +	mode = HRTIMER_MODE_REL;
> +	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
> +	hrtimer_set_expires(&timer.timer, kt);
> +	set_current_state(TASK_UNINTERRUPTIBLE);

TASK_INTERRUPTIBLE, we don't want it to make userspace
unresponsive.

> +	hrtimer_sleeper_start_expires(&timer, mode);
> +
> +	if (timer.task)
> +		io_schedule();
> +
> +	hrtimer_cancel(&timer.timer);
> +	mode = HRTIMER_MODE_ABS;

What's this for? It's a local variable.

> +	__set_current_state(TASK_RUNNING);
> +	destroy_hrtimer_on_stack(&timer.timer);
> +
> +	ktime_get_ts64(&tc);
> +	entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;

Where did seconds go? And you can also get negative values and all
kinds of inconsistent timings.

Why is it even called irq time? Anyone would think it's an interrupt
from the IO, but it's rather from the timer.
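
With 64-bit nanosecond timestamps the bookkeeping would reduce to
something like (sketch, keeping the current field name):

	u64 before = ktime_get_ns();

	/* ... hrtimer sleep as above ... */

	entry->last_irqtime = ktime_get_ns() - before - sleep_ti;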

> +}
> +
> +static int io_uring_hybrid_poll(struct io_kiocb *req,
> +				struct io_comp_batch *iob, unsigned int poll_flags)
> +{
> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +	struct io_ring_ctx *ctx = req->ctx;
> +	struct hy_poll_time *hpt = req->hy_poll;
> +	u32 index = req->file->f_inode->i_rdev;
> +	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);

Why is it per file/device/etc instead of globally per ring?
It'd make some sense if you calculate an aggregate, e.g.
minimum sleep time for all of them, but otherwise not much.
Take one slow device, one fast device, while sleeping for the
slow one, you can get lots of completions from the fast one.
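
If the stats do stay per device, the aggregate could be as simple as
taking the minimum over the ring's poll_array before deciding how long
to sleep (sketch only):

	struct iopoll_info *it;
	unsigned long idx;
	long min_sleep = LONG_MAX;

	xa_for_each(&ctx->poll_array, idx, it) {
		long t = (it->last_runtime - it->last_irqtime) / 4;

		if (t > 0 && t < min_sleep)
			min_sleep = t;
	}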

> +	int ret;

Why is it even per inode but not per ring? Take one very
slow device,

> +
> +	io_delay(hpt, entry);
> +	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);

IORING_OP_URING_CMD uses uring_cmd_iopoll
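
i.e. the hybrid path would need to dispatch on the opcode as well,
roughly (sketch, mirroring what io_do_iopoll() already does):

	if (req->opcode == IORING_OP_URING_CMD) {
		struct io_uring_cmd *ioucmd;

		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
		ret = req->file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
	} else {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
	}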

> +
> +	ktime_get_ts64(&hpt->iopoll_end);
> +	entry->last_runtime = hpt->iopoll_end.tv_nsec - hpt->iopoll_start.tv_nsec;
> +	return ret;
> +}
> +
>   int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   {
>   	struct io_wq_work_node *pos, *start, *prev;
> @@ -1133,7 +1244,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   		if (READ_ONCE(req->iopoll_completed))
>   			break;
>   
> -		if (req->opcode == IORING_OP_URING_CMD) {
> +		if (ctx->flags & IORING_SETUP_HY_POLL) {
> +			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
> +		} else if (req->opcode == IORING_OP_URING_CMD) {
>   			struct io_uring_cmd *ioucmd;
>   
>   			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
Jens Axboe June 19, 2024, 3:51 p.m. UTC | #2
On 6/19/24 1:18 AM, hexue wrote:
> io_uring's polling mode can improve IO performance, but it spends 100% of
> a CPU's time doing the polling.
> 
> This adds a new setup flag, IORING_SETUP_HY_POLL, which gives the
> application an interface to enable hybrid polling at the io_uring level.
> 
> The hybrid poll is implemented in the io_uring layer. Once an IO is
> issued, the task does not start polling immediately; it sleeps first,
> wakes up shortly before the IO is expected to complete, and then polls
> to reap it. This keeps most of polling's performance while freeing up
> some CPU time.
> 
> More complex situations were taken into account, such as high
> concurrency and multiple disks with different processing speeds.
> 
> Test results:
> set 8 poll queues, fio-3.35, Gen5 SSD, 8 CPU VM
> 
> per CPU utilization:
>     read(128k, QD64, 1Job)     53%   write(128k, QD64, 1Job)     45%
>     randread(4k, QD64, 16Job)  70%   randwrite(4k, QD64, 16Job)  16%
> performance reduction:
>     read  0.92%     write  0.92%    randread  1.61%    randwrite  0%

Haven't tried this on slower storage yet, but my usual 122M IOPS polled
test case (24 drives, each using a single thread to load up a drive)
yields the following with hybrid polling enabled:

IOPS=57.08M, BW=27.87GiB/s, IOS/call=32/31
IOPS=56.91M, BW=27.79GiB/s, IOS/call=32/32
IOPS=57.93M, BW=28.29GiB/s, IOS/call=31/31
IOPS=57.82M, BW=28.23GiB/s, IOS/call=32/32

which is even slower than IRQ driven.

It does use less CPU, about 1900% compared to 2400% before as it's
polling. And obviously this is not the best case for this scenario, as
these devices have low latencies. Like I predicted in earlier replies,
most of the added overhead here is TSC reading, outside of the obvious
one of now having wakeups and context switches, about 1M/sec of the
latter for this test.

If we move to regular flash, here's another box I have with 32 flash
drives in it. For a similar test, we get:

IOPS=104.01M, BW=50.78GiB/s, IOS/call=31/31
IOPS=103.92M, BW=50.74GiB/s, IOS/call=31/31
IOPS=103.99M, BW=50.78GiB/s, IOS/call=31/31
IOPS=103.97M, BW=50.77GiB/s, IOS/call=31/31
IOPS=104.01M, BW=50.79GiB/s, IOS/call=31/31
IOPS=104.02M, BW=50.79GiB/s, IOS/call=31/31
IOPS=103.62M, BW=50.59GiB/s, IOS/call=31/31

using 3200% CPU (32 drives, 32 threads polling) with regular polling,
and enabling hybrid polling:

IOPS=53.62M, BW=26.18GiB/s, IOS/call=32/32
IOPS=53.37M, BW=26.06GiB/s, IOS/call=31/31
IOPS=53.45M, BW=26.10GiB/s, IOS/call=32/31
IOPS=53.43M, BW=26.09GiB/s, IOS/call=32/32
IOPS=53.11M, BW=25.93GiB/s, IOS/call=32/32

and again a lot of TSC overhead (> 10%), plus overhead from your extra
allocations (8%).

If we just do a single flash drive, it'll do 3.25M with 100% with normal
polling, and 2.0M with 50% CPU usage with hybrid polling.

While I do suspect there are cases where hybrid polling will be more
efficient, not sure there are many of them. And you're most likely
better off just doing IRQ driven IO at that point? Particularly with the
fairly substantial overhead of maintaining the data you need, and time
querying.
hexue June 24, 2024, 2:54 a.m. UTC | #3
On 6/19/24 16:22, Pavel Begunkov wrote:
>On 6/19/24 08:18, hexue wrote:
>
>> Test results:
>> set 8 poll queues, fio-3.35, Gen5 SSD, 8 CPU VM
>>
>> per CPU utilization:
>>      read(128k, QD64, 1Job)     53%   write(128k, QD64, 1Job)     45%
>>      randread(4k, QD64, 16Job)  70%   randwrite(4k, QD64, 16Job)  16%
>> performance reduction:
>>      read  0.92%     write  0.92%    randread  1.61%    randwrite  0%
>
>What are numbers for normal / non-IOPOLL runs?

For normal polling, per-CPU utilization is 100% on each of the 8 CPUs.
Performance looks like this:

              read           write          randread     randwrite
normal poll   BW=10.9GiB/s   BW=6202MiB/s   IOPS=1682k   IOPS=255k
hybrid poll   BW=10.8GiB/s   BW=6145MiB/s   IOPS=1655k   IOPS=255k

--
hexue
hexue June 25, 2024, 8:39 a.m. UTC | #4
On 6/19/24 15:51, Jens Axboe wrote:
>On 6/19/24 08:18, hexue wrote:
>
>While I do suspect there are cases where hybrid polling will be more
>efficient, not sure there are many of them. And you're most likely
>better off just doing IRQ driven IO at that point? Particularly with the
>fairly substantial overhead of maintaining the data you need, and time
>querying.

I rebuilt the test cases based on your information, i.e. each drive
driven by a single thread at high pressure, and there is indeed a
significant performance loss. The data I provided previously was for a
single drive with multiple threads, where performance stays stable and
CPU is saved.

Thanks for your data, I will reduce the per-IO cost and submit the next
version.

--
hexue
Jens Axboe June 28, 2024, 9:13 p.m. UTC | #5
On 6/25/24 2:39 AM, hexue wrote:
> On 6/19/24 15:51, Jens Axboe wrote:
>> On 6/19/24 08:18, hexue wrote:
>>
>> While I do suspect there are cases where hybrid polling will be more
>> efficient, not sure there are many of them. And you're most likely
>> better off just doing IRQ driven IO at that point? Particularly with the
>> fairly substantial overhead of maintaining the data you need, and time
>> querying.
> 
> I rebuilt the test cases based on your information, i.e. each drive
> driven by a single thread at high pressure, and there is indeed a
> significant performance loss. The data I provided previously was for a
> single drive with multiple threads, where performance stays stable and
> CPU is saved.
> 
> Thanks for your data, I will reduce the per-IO cost and submit the next
> version.

Sounds good, thanks. I don't doubt it's useful for some cases as-is, but
I think there's the potential for it to be more useful.

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 91224bbcfa73..8eab99c4122e 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -226,6 +226,11 @@  struct io_alloc_cache {
 	size_t			elem_size;
 };
 
+struct iopoll_info {
+	long		last_runtime;
+	long		last_irqtime;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -428,6 +433,7 @@  struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+	struct xarray		poll_array;
 };
 
 struct io_tw_state {
@@ -591,6 +597,12 @@  static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
 )
 #define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
 
+struct hy_poll_time {
+	int		poll_state;
+	struct timespec64		iopoll_start;
+	struct timespec64		iopoll_end;
+};
+
 struct io_kiocb {
 	union {
 		/*
@@ -665,6 +677,8 @@  struct io_kiocb {
 		u64			extra1;
 		u64			extra2;
 	} big_cqe;
+    /* for hybrid iopoll */
+	struct hy_poll_time		*hy_poll;
 };
 
 struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 994bf7af0efe..ef32ec319d1f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -199,6 +199,7 @@  enum io_uring_sqe_flags_bit {
  * Removes indirection through the SQ index array.
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
+#define IORING_SETUP_HY_POLL	(1U << 17)
 
 enum io_uring_op {
 	IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 816e93e7f949..a1015ce6dde7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -299,6 +299,7 @@  static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	xa_init(&ctx->poll_array);
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -2679,6 +2680,7 @@  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	kfree(ctx->cancel_table.hbs);
 	kfree(ctx->cancel_table_locked.hbs);
 	xa_destroy(&ctx->io_bl_xa);
+	xa_destroy(&ctx->poll_array);
 	kfree(ctx);
 }
 
@@ -3637,7 +3639,7 @@  static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 624ca9076a50..665093c048ba 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -148,6 +148,9 @@  static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 		__io_submit_flush_completions(ctx);
 }
 
+/* if sleep time less than 1us, then do not do the schedule op */
+#define MIN_SCHETIME 1000
+
 #define io_for_each_link(pos, head) \
 	for (pos = (head); pos; pos = pos->link)
 
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a2128459cb4..48b162becfa2 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -772,6 +772,46 @@  static bool need_complete_io(struct io_kiocb *req)
 		S_ISBLK(file_inode(req->file)->i_mode);
 }
 
+static void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	/*
+	 * In multiple concurrency, a thread may operate several files
+	 * under different file systems, the inode numbers may be
+	 * duplicated. Each device has a different IO command processing
+	 * capability, so using device number to record the running time
+	 * of device
+	 */
+	u32 index = req->file->f_inode->i_rdev;
+	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+	struct hy_poll_time *hpt = kmalloc(sizeof(struct hy_poll_time), GFP_KERNEL);
+
+	/* if alloc fail, go to regular poll */
+	if (!hpt) {
+		ctx->flags &= ~IORING_SETUP_HY_POLL;
+		return;
+	}
+	hpt->poll_state = 0;
+	req->hy_poll = hpt;
+
+	if (!entry) {
+		entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
+		if (!entry) {
+			ctx->flags &= ~IORING_SETUP_HY_POLL;
+			return;
+		}
+		entry->last_runtime = 0;
+		entry->last_irqtime = 0;
+		xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
+	}
+
+	/*
+	 * Here we need nanosecond timestamps, some ways of reading
+	 * timestamps directly are only accurate to microseconds, so
+	 * there's no better alternative here for now
+	 */
+	ktime_get_ts64(&hpt->iopoll_start);
+}
+
 static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -809,6 +849,8 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HY_POLL)
+			init_hybrid_poll(ctx, req);
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -1106,6 +1148,75 @@  void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+static void io_delay(struct hy_poll_time *hpt, struct iopoll_info *entry)
+{
+	struct hrtimer_sleeper timer;
+	struct timespec64 tc, oldtc;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	long sleep_ti;
+
+	if (hpt->poll_state == 1)
+		return;
+
+	if (entry->last_runtime <= entry->last_irqtime)
+		return;
+
+	/*
+	 * Avoid excessive scheduling time affecting performance
+	 * by using only 25 per cent of the remaining time
+	 */
+	sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;
+
+	/*
+	 * If the time available for sleep is too short, i.e. the
+	 * totle running time and the context switching loss time
+	 * are very close to each other, the scheduling operation
+	 * is not performed to avoid increasing latency
+	 */
+	if (sleep_ti < MIN_SCHETIME)
+		return;
+
+	ktime_get_ts64(&oldtc);
+	kt = ktime_set(0, sleep_ti);
+	hpt->poll_state = 1;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	mode = HRTIMER_MODE_ABS;
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+
+	ktime_get_ts64(&tc);
+	entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct hy_poll_time *hpt = req->hy_poll;
+	u32 index = req->file->f_inode->i_rdev;
+	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+	int ret;
+
+	io_delay(hpt, entry);
+	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+
+	ktime_get_ts64(&hpt->iopoll_end);
+	entry->last_runtime = hpt->iopoll_end.tv_nsec - hpt->iopoll_start.tv_nsec;
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1133,7 +1244,9 @@  int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		if (req->opcode == IORING_OP_URING_CMD) {
+		if (ctx->flags & IORING_SETUP_HY_POLL) {
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		} else if (req->opcode == IORING_OP_URING_CMD) {
 			struct io_uring_cmd *ioucmd;
 
 			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);