[PATCHv3] io_uring: set plug tags for same file

Message ID	20230731203932.2083468-1-kbusch@meta.com (mailing list archive)
State	New
Headers	show Return-Path: <io-uring-owner@vger.kernel.org> From: Keith Busch <kbusch@meta.com> To: <axboe@kernel.dk>, <asml.silence@gmail.com>, <linux-block@vger.kernel.org>, <io-uring@vger.kernel.org> CC: Keith Busch <kbusch@kernel.org> Subject: [PATCHv3] io_uring: set plug tags for same file Date: Mon, 31 Jul 2023 13:39:32 -0700 Message-ID: <20230731203932.2083468-1-kbusch@meta.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain Precedence: bulk
Series	[PATCHv3] io_uring: set plug tags for same file \| expand [PATCHv3] io_uring: set plug tags for same file

Message ID

20230731203932.2083468-1-kbusch@meta.com (mailing list archive)

State

New

Headers

From: Keith Busch <kbusch@meta.com>
To: <axboe@kernel.dk>, <asml.silence@gmail.com>,
        <linux-block@vger.kernel.org>, <io-uring@vger.kernel.org>
CC: Keith Busch <kbusch@kernel.org>
Subject: [PATCHv3] io_uring: set plug tags for same file
Date: Mon, 31 Jul 2023 13:39:32 -0700
Message-ID: <20230731203932.2083468-1-kbusch@meta.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain
Precedence: bulk

Series

[PATCHv3] io_uring: set plug tags for same file | expand

Commit Message

Keith Busch July 31, 2023, 8:39 p.m. UTC

From: Keith Busch <kbusch@kernel.org>

io_uring tries to optimize allocating tags by hinting to the plug how
many it expects to need for a batch instead of allocating each tag
individually. But io_uring submission queueus may have a mix of many
devices for io, so the number of io's counted may be overestimated. This
can lead to allocating too many tags, which adds overhead to finding
that many contiguous tags, freeing up the ones we didn't use, and may
starve out other users that can actually use them.

When starting a new batch of uring commands, count only commands that
match the file descriptor of the first seen for this optimization. This
avoids have to call the unlikely "blk_mq_free_plug_rqs()" at the end of
a submission when multiple devices are used in a batch.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2->v3

  The previous attempted to split the setup and submit further, but was
  requested to go back to this simpler version. 

 block/blk-core.c               | 49 +++++++++++++++-------------------
 block/blk-mq.c                 |  6 +++--
 include/linux/blkdev.h         |  6 -----
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 37 ++++++++++++++++++-------
 5 files changed, 54 insertions(+), 45 deletions(-)

Comments

Pavel Begunkov Aug. 9, 2023, 12:37 p.m. UTC | #1

On 7/31/23 21:39, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> io_uring tries to optimize allocating tags by hinting to the plug how
> many it expects to need for a batch instead of allocating each tag
> individually. But io_uring submission queueus may have a mix of many
> devices for io, so the number of io's counted may be overestimated. This
> can lead to allocating too many tags, which adds overhead to finding
> that many contiguous tags, freeing up the ones we didn't use, and may
> starve out other users that can actually use them.
> 
> When starting a new batch of uring commands, count only commands that
> match the file descriptor of the first seen for this optimization. This
> avoids have to call the unlikely "blk_mq_free_plug_rqs()" at the end of
> a submission when multiple devices are used in a batch.

looks good and doesn't blow up on tests

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>


> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> v2->v3
> 
>    The previous attempted to split the setup and submit further, but was
>    requested to go back to this simpler version.
> 
>   block/blk-core.c               | 49 +++++++++++++++-------------------
>   block/blk-mq.c                 |  6 +++--
>   include/linux/blkdev.h         |  6 -----
>   include/linux/io_uring_types.h |  1 +
>   io_uring/io_uring.c            | 37 ++++++++++++++++++-------
>   5 files changed, 54 insertions(+), 45 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 99d8b9812b18f..b8f8aa1376e60 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1043,32 +1043,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
>   }
>   EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
>   
> -void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
> -{
> -	struct task_struct *tsk = current;
> -
> -	/*
> -	 * If this is a nested plug, don't actually assign it.
> -	 */
> -	if (tsk->plug)
> -		return;
> -
> -	plug->mq_list = NULL;
> -	plug->cached_rq = NULL;
> -	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
> -	plug->rq_count = 0;
> -	plug->multiple_queues = false;
> -	plug->has_elevator = false;
> -	plug->nowait = false;
> -	INIT_LIST_HEAD(&plug->cb_list);
> -
> -	/*
> -	 * Store ordering should not be needed here, since a potential
> -	 * preempt will imply a full memory barrier
> -	 */
> -	tsk->plug = plug;
> -}
> -
>   /**
>    * blk_start_plug - initialize blk_plug and track it inside the task_struct
>    * @plug:	The &struct blk_plug that needs to be initialized
> @@ -1094,7 +1068,28 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
>    */
>   void blk_start_plug(struct blk_plug *plug)
>   {
> -	blk_start_plug_nr_ios(plug, 1);
> +	struct task_struct *tsk = current;
> +
> +	/*
> +	 * If this is a nested plug, don't actually assign it.
> +	 */
> +	if (tsk->plug)
> +		return;
> +
> +	plug->mq_list = NULL;
> +	plug->cached_rq = NULL;
> +	plug->nr_ios = 1;
> +	plug->rq_count = 0;
> +	plug->multiple_queues = false;
> +	plug->has_elevator = false;
> +	plug->nowait = false;
> +	INIT_LIST_HEAD(&plug->cb_list);
> +
> +	/*
> +	 * Store ordering should not be needed here, since a potential
> +	 * preempt will imply a full memory barrier
> +	 */
> +	tsk->plug = plug;
>   }
>   EXPORT_SYMBOL(blk_start_plug);
>   
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d50b1d62a3d92..fc75fb9ef34ed 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -523,7 +523,8 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
>   		.q		= q,
>   		.flags		= flags,
>   		.cmd_flags	= opf,
> -		.nr_tags	= plug->nr_ios,
> +		.nr_tags	= min_t(unsigned int, plug->nr_ios,
> +					BLK_MAX_REQUEST_COUNT),
>   		.cached_rq	= &plug->cached_rq,
>   	};
>   	struct request *rq;
> @@ -2859,7 +2860,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
>   	rq_qos_throttle(q, bio);
>   
>   	if (plug) {
> -		data.nr_tags = plug->nr_ios;
> +		data.nr_tags = min_t(unsigned int, plug->nr_ios,
> +				     BLK_MAX_REQUEST_COUNT);
>   		plug->nr_ios = 1;
>   		data.cached_rq = &plug->cached_rq;
>   	}
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index ed44a997f629f..a2a022957cd96 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -984,7 +984,6 @@ struct blk_plug_cb {
>   extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
>   					     void *data, int size);
>   extern void blk_start_plug(struct blk_plug *);
> -extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
>   extern void blk_finish_plug(struct blk_plug *);
>   
>   void __blk_flush_plug(struct blk_plug *plug, bool from_schedule);
> @@ -1000,11 +999,6 @@ long nr_blockdev_pages(void);
>   struct blk_plug {
>   };
>   
> -static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
> -					 unsigned short nr_ios)
> -{
> -}
> -
>   static inline void blk_start_plug(struct blk_plug *plug)
>   {
>   }
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index f04ce513fadba..109d4530bccbf 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -175,6 +175,7 @@ struct io_submit_state {
>   	bool			need_plug;
>   	unsigned short		submit_nr;
>   	unsigned int		cqes_count;
> +	int			fd;
>   	struct blk_plug		plug;
>   	struct io_uring_cqe	cqes[16];
>   };
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 135da2fd0edab..36f45d234fe49 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2195,18 +2195,25 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   		return -EINVAL;
>   
>   	if (def->needs_file) {
> -		struct io_submit_state *state = &ctx->submit_state;
> -
>   		req->cqe.fd = READ_ONCE(sqe->fd);
>   
>   		/*
>   		 * Plug now if we have more than 2 IO left after this, and the
>   		 * target is potentially a read/write to block based storage.
>   		 */
> -		if (state->need_plug && def->plug) {
> -			state->plug_started = true;
> -			state->need_plug = false;
> -			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
> +	        if (def->plug) {
> +			struct io_submit_state *state = &ctx->submit_state;
> +
> +			if (state->need_plug) {
> +			        state->plug_started = true;
> +			        state->need_plug = false;
> +			        state->fd = req->cqe.fd;
> +			        blk_start_plug(&state->plug);
> +			} else if (state->plug_started &&
> +			           state->fd == req->cqe.fd &&
> +			           !state->link.head) {
> +			        state->plug.nr_ios++;
> +			}
>   		}
>   	}
>   
> @@ -2267,7 +2274,8 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
>   }
>   
>   static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
> -			 const struct io_uring_sqe *sqe)
> +			 const struct io_uring_sqe *sqe,
> +			 struct io_wq_work_list *req_list)
>   	__must_hold(&ctx->uring_lock)
>   {
>   	struct io_submit_link *link = &ctx->submit_state.link;
> @@ -2315,7 +2323,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   		return 0;
>   	}
>   
> -	io_queue_sqe(req);
> +	wq_list_add_tail(&req->comp_list, req_list);
>   	return 0;
>   }
>   
> @@ -2400,6 +2408,8 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>   	__must_hold(&ctx->uring_lock)
>   {
>   	unsigned int entries = io_sqring_entries(ctx);
> +	struct io_wq_work_list req_list;
> +	struct io_kiocb *req;
>   	unsigned int left;
>   	int ret;
>   
> @@ -2410,9 +2420,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>   	io_get_task_refs(left);
>   	io_submit_state_start(&ctx->submit_state, left);
>   
> +	INIT_WQ_LIST(&req_list);
>   	do {
>   		const struct io_uring_sqe *sqe;
> -		struct io_kiocb *req;
>   
>   		if (unlikely(!io_alloc_req(ctx, &req)))
>   			break;
> @@ -2425,13 +2435,20 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>   		 * Continue submitting even for sqe failure if the
>   		 * ring was setup with IORING_SETUP_SUBMIT_ALL
>   		 */
> -		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
> +		if (unlikely(io_submit_sqe(ctx, req, sqe, &req_list)) &&
>   		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
>   			left--;
>   			break;
>   		}
>   	} while (--left);
>   
> +	while (req_list.first) {
> +		req = container_of(req_list.first, struct io_kiocb, comp_list);
> +		req_list.first = req->comp_list.next;
> +		req->comp_list.next = NULL;
> +		io_queue_sqe(req);
> +	}
> +
>   	if (unlikely(left)) {
>   		ret -= left;
>   		/* try again if it submitted nothing and can't allocate a req */

Jens Axboe Aug. 11, 2023, 7:24 p.m. UTC | #2

On 7/31/23 2:39 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> io_uring tries to optimize allocating tags by hinting to the plug how
> many it expects to need for a batch instead of allocating each tag
> individually. But io_uring submission queueus may have a mix of many
> devices for io, so the number of io's counted may be overestimated. This
> can lead to allocating too many tags, which adds overhead to finding
> that many contiguous tags, freeing up the ones we didn't use, and may
> starve out other users that can actually use them.
> 
> When starting a new batch of uring commands, count only commands that
> match the file descriptor of the first seen for this optimization. This
> avoids have to call the unlikely "blk_mq_free_plug_rqs()" at the end of
> a submission when multiple devices are used in a batch.

Wanted to run this through both the peak IOPS and networking testing,
started with the former. Here's a peak run with -git + pending 6.5
changes + pending 6.6 changes:

IOPS=125.88M, BW=61.46GiB/s, IOS/call=32/31
IOPS=125.39M, BW=61.23GiB/s, IOS/call=31/31
IOPS=124.97M, BW=61.02GiB/s, IOS/call=32/32
IOPS=124.60M, BW=60.84GiB/s, IOS/call=32/32
IOPS=124.27M, BW=60.68GiB/s, IOS/call=31/31
IOPS=124.00M, BW=60.54GiB/s, IOS/call=32/31

and here's one with the patch:

IOPS=121.69M, BW=59.42GiB/s, IOS/call=31/32
IOPS=121.26M, BW=59.21GiB/s, IOS/call=32/32
IOPS=120.87M, BW=59.02GiB/s, IOS/call=31/31
IOPS=120.87M, BW=59.02GiB/s, IOS/call=32/32
IOPS=121.02M, BW=59.09GiB/s, IOS/call=32/32
IOPS=121.63M, BW=59.39GiB/s, IOS/call=31/32
IOPS=121.48M, BW=59.32GiB/s, IOS/call=31/31

Running a quick profile, here's the top diff:

# Baseline  Delta Abs  Shared Object     Symbol                                     
# ........  .........  ................  ...........................................
#
     6.69%     -3.03%  [kernel.vmlinux]  [k] io_issue_sqe
     9.65%     +2.30%  [nvme]            [k] nvme_poll_cq
     4.86%     -1.55%  [kernel.vmlinux]  [k] io_submit_sqes
     4.61%     +1.40%  [kernel.vmlinux]  [k] blk_mq_submit_bio
     4.79%     -0.98%  [kernel.vmlinux]  [k] io_read
     0.56%     +0.97%  [kernel.vmlinux]  [k] blkdev_dio_unaligned.isra.0
     3.61%     +0.52%  [kernel.vmlinux]  [k] dma_unmap_page_attrs
     2.04%     -0.45%  [kernel.vmlinux]  [k] blk_add_rq_to_plug

Note that this is perf.data.old being the kernel with your patch, and
perf.data being the "stock" kernel mentioned above. The main thing looks
like spending more time in io_issue_sqe() and io_submit_sqes(), and
converserly we're spending less time polling. Usually for profiling a
polled workload, having more time in the polling function is good, as it
shows us we're spending less time everywhere else.

This is what I'm using:

sudo t/io_uring -p1 -d128 -b512 -s32 -c32 -F1 -B1 -R0 -X1 -n24 -P1 /dev/nvme0n1 /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1 /dev/nvme6n1 /dev/nvme7n1 /dev/nvme8n1 /dev/nvme9n1 /dev/nvme10n1 /dev/nvme11n1 /dev/nvme12n1 /dev/nvme13n1 /dev/nvme14n1 /dev/nvme15n1 /dev/nvme16n1 /dev/nvme17n1 /dev/nvme18n1 /dev/nvme19n1 /dev/nvme20n1 /dev/nvme21n1 /dev/nvme22n1 /dev/nvme23n1

which is submitting 32 requests at the time. Obviously we don't expect a
win in this case, as each thread is handling just a single NVMe device.
The stock kernel will not over-allocate in this case.

If we change that to -n12 instead, meaning each will drive two devices,
here's what the stock kernel gets:

IOPS=60.95M, BW=29.76GiB/s, IOS/call=31/32
IOPS=60.99M, BW=29.78GiB/s, IOS/call=31/32
IOPS=60.96M, BW=29.77GiB/s, IOS/call=31/31
IOPS=60.95M, BW=29.76GiB/s, IOS/call=31/31
IOPS=60.91M, BW=29.74GiB/s, IOS/call=32/32

and with the patch:

IOPS=59.64M, BW=29.12GiB/s, IOS/call=32/31
IOPS=59.63M, BW=29.12GiB/s, IOS/call=31/32
IOPS=59.57M, BW=29.09GiB/s, IOS/call=31/31
IOPS=59.57M, BW=29.09GiB/s, IOS/call=32/32
IOPS=59.65M, BW=29.12GiB/s, IOS/call=31/31

Now these are both obviously lower, but I haven't done anything to
ensure that the two-drives-per-poller workload is optimized. For all I
know, the numa layout is now messed up too. Just as a caveat, but they
are comparable to each other.

Perf diff again looks similar, note that this time it's perf.data.old
that's the stock kernel and perf.data that's the one with your patch:

     3.51%     +2.84%  [kernel.vmlinux]  [k] io_issue_sqe
     3.24%     +1.35%  [kernel.vmlinux]  [k] io_submit_sqes

With the kernel without your patch, I was looking for tag flush overhead
but didn't find much:

     0.02%  io_uring  [kernel.vmlinux]  [k] blk_mq_free_plug_rqs

Outside of the peak worry with the patch, do you have a workload that we
should test this on?

Keith Busch Aug. 14, 2023, 4:50 p.m. UTC | #3

On Fri, Aug 11, 2023 at 01:24:17PM -0600, Jens Axboe wrote:
> 
>      3.51%     +2.84%  [kernel.vmlinux]  [k] io_issue_sqe
>      3.24%     +1.35%  [kernel.vmlinux]  [k] io_submit_sqes
> 
> With the kernel without your patch, I was looking for tag flush overhead
> but didn't find much:
> 
>      0.02%  io_uring  [kernel.vmlinux]  [k] blk_mq_free_plug_rqs
> 
> Outside of the peak worry with the patch, do you have a workload that we
> should test this on?

Thanks for the insights! I had tested simply 4 nvme drives with 1
thread, default everything:

   ./t/io_uring /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1

Which appeared to show a very small improvement with this patch on my
test vm. I'll test more to see where the tipping point is, and also see
if there's any other ways to reduce time spent in io_issue_sqe.

diff --git a/block/blk-core.c b/block/blk-core.c
index 99d8b9812b18f..b8f8aa1376e60 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1043,32 +1043,6 @@  int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
 
-void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
-{
-	struct task_struct *tsk = current;
-
-	/*
-	 * If this is a nested plug, don't actually assign it.
-	 */
-	if (tsk->plug)
-		return;
-
-	plug->mq_list = NULL;
-	plug->cached_rq = NULL;
-	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
-	plug->rq_count = 0;
-	plug->multiple_queues = false;
-	plug->has_elevator = false;
-	plug->nowait = false;
-	INIT_LIST_HEAD(&plug->cb_list);
-
-	/*
-	 * Store ordering should not be needed here, since a potential
-	 * preempt will imply a full memory barrier
-	 */
-	tsk->plug = plug;
-}
-
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
  * @plug:	The &struct blk_plug that needs to be initialized
@@ -1094,7 +1068,28 @@  void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
  */
 void blk_start_plug(struct blk_plug *plug)
 {
-	blk_start_plug_nr_ios(plug, 1);
+	struct task_struct *tsk = current;
+
+	/*
+	 * If this is a nested plug, don't actually assign it.
+	 */
+	if (tsk->plug)
+		return;
+
+	plug->mq_list = NULL;
+	plug->cached_rq = NULL;
+	plug->nr_ios = 1;
+	plug->rq_count = 0;
+	plug->multiple_queues = false;
+	plug->has_elevator = false;
+	plug->nowait = false;
+	INIT_LIST_HEAD(&plug->cb_list);
+
+	/*
+	 * Store ordering should not be needed here, since a potential
+	 * preempt will imply a full memory barrier
+	 */
+	tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d50b1d62a3d92..fc75fb9ef34ed 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -523,7 +523,8 @@  static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
 		.q		= q,
 		.flags		= flags,
 		.cmd_flags	= opf,
-		.nr_tags	= plug->nr_ios,
+		.nr_tags	= min_t(unsigned int, plug->nr_ios,
+					BLK_MAX_REQUEST_COUNT),
 		.cached_rq	= &plug->cached_rq,
 	};
 	struct request *rq;
@@ -2859,7 +2860,8 @@  static struct request *blk_mq_get_new_requests(struct request_queue *q,
 	rq_qos_throttle(q, bio);
 
 	if (plug) {
-		data.nr_tags = plug->nr_ios;
+		data.nr_tags = min_t(unsigned int, plug->nr_ios,
+				     BLK_MAX_REQUEST_COUNT);
 		plug->nr_ios = 1;
 		data.cached_rq = &plug->cached_rq;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629f..a2a022957cd96 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -984,7 +984,6 @@  struct blk_plug_cb {
 extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
 					     void *data, int size);
 extern void blk_start_plug(struct blk_plug *);
-extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
 extern void blk_finish_plug(struct blk_plug *);
 
 void __blk_flush_plug(struct blk_plug *plug, bool from_schedule);
@@ -1000,11 +999,6 @@  long nr_blockdev_pages(void);
 struct blk_plug {
 };
 
-static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
-					 unsigned short nr_ios)
-{
-}
-
 static inline void blk_start_plug(struct blk_plug *plug)
 {
 }
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f04ce513fadba..109d4530bccbf 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -175,6 +175,7 @@  struct io_submit_state {
 	bool			need_plug;
 	unsigned short		submit_nr;
 	unsigned int		cqes_count;
+	int			fd;
 	struct blk_plug		plug;
 	struct io_uring_cqe	cqes[16];
 };
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 135da2fd0edab..36f45d234fe49 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2195,18 +2195,25 @@  static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return -EINVAL;
 
 	if (def->needs_file) {
-		struct io_submit_state *state = &ctx->submit_state;
-
 		req->cqe.fd = READ_ONCE(sqe->fd);
 
 		/*
 		 * Plug now if we have more than 2 IO left after this, and the
 		 * target is potentially a read/write to block based storage.
 		 */
-		if (state->need_plug && def->plug) {
-			state->plug_started = true;
-			state->need_plug = false;
-			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+	        if (def->plug) {
+			struct io_submit_state *state = &ctx->submit_state;
+
+			if (state->need_plug) {
+			        state->plug_started = true;
+			        state->need_plug = false;
+			        state->fd = req->cqe.fd;
+			        blk_start_plug(&state->plug);
+			} else if (state->plug_started &&
+			           state->fd == req->cqe.fd &&
+			           !state->link.head) {
+			        state->plug.nr_ios++;
+			}
 		}
 	}
 
@@ -2267,7 +2274,8 @@  static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
 }
 
 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			 const struct io_uring_sqe *sqe)
+			 const struct io_uring_sqe *sqe,
+			 struct io_wq_work_list *req_list)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_link *link = &ctx->submit_state.link;
@@ -2315,7 +2323,7 @@  static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return 0;
 	}
 
-	io_queue_sqe(req);
+	wq_list_add_tail(&req->comp_list, req_list);
 	return 0;
 }
 
@@ -2400,6 +2408,8 @@  int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
 	unsigned int entries = io_sqring_entries(ctx);
+	struct io_wq_work_list req_list;
+	struct io_kiocb *req;
 	unsigned int left;
 	int ret;
 
@@ -2410,9 +2420,9 @@  int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	io_get_task_refs(left);
 	io_submit_state_start(&ctx->submit_state, left);
 
+	INIT_WQ_LIST(&req_list);
 	do {
 		const struct io_uring_sqe *sqe;
-		struct io_kiocb *req;
 
 		if (unlikely(!io_alloc_req(ctx, &req)))
 			break;
@@ -2425,13 +2435,20 @@  int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		 * Continue submitting even for sqe failure if the
 		 * ring was setup with IORING_SETUP_SUBMIT_ALL
 		 */
-		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+		if (unlikely(io_submit_sqe(ctx, req, sqe, &req_list)) &&
 		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
 			left--;
 			break;
 		}
 	} while (--left);
 
+	while (req_list.first) {
+		req = container_of(req_list.first, struct io_kiocb, comp_list);
+		req_list.first = req->comp_list.next;
+		req->comp_list.next = NULL;
+		io_queue_sqe(req);
+	}
+
 	if (unlikely(left)) {
 		ret -= left;
 		/* try again if it submitted nothing and can't allocate a req */

[PATCHv3] io_uring: set plug tags for same file

Commit Message

Comments

Patch