diff mbox series

[RESEND,4/5] loop: try to handle loop aio command via NOWAIT IO first

Message ID 20250308162312.1640828-5-ming.lei@redhat.com (mailing list archive)
State New
Headers show
Series loop: improve loop aio perf by IOCB_NOWAIT | expand

Commit Message

Ming Lei March 8, 2025, 4:23 p.m. UTC
Try to handle the loop aio command via NOWAIT IO first; then we can avoid
queueing the aio command into the workqueue.

Fall back to the workqueue in case of -EAGAIN.

BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
.write_iter() which might sleep even though it is NOWAIT.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/loop.c | 47 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

Comments

Christoph Hellwig March 10, 2025, 11:14 a.m. UTC | #1
On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> queue the aio command into workqueue.
> 
> Fallback to workqueue in case of -EAGAIN.
> 
> BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> .write_iter() which might sleep even though it is NOWAIT.

This needs performance numbers (or other reasons) justifying the
change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.

>  static DEFINE_IDR(loop_index_idr);
>  static DEFINE_MUTEX(loop_ctl_mutex);
>  static DEFINE_MUTEX(loop_validate_mutex);
> @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
>  
>  	if (!atomic_dec_and_test(&cmd->ref))
>  		return;
> +
> +	if (cmd->ret == -EAGAIN) {
> +		struct loop_device *lo = rq->q->queuedata;
> +
> +		loop_queue_work(lo, cmd);
> +		return;
> +	}

This looks like the wrong place for the retry, as -EAGAIN can only come from
the submission path, i.e. we should never make it to the full completion
path for that case.

>  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
> +{
> +	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
> +	int ret;
> +
> +	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
> +	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
> +	if (ret != -EIOCBQUEUED)
> +		lo_rw_aio_complete(&cmd->iocb, ret);
> +	return 0;

This needs an explanation that it is for the fallback path and thus
clears the nowait flag.

> +}
> +
> +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)

Overly long line.

> @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		break;
>  	}
>  
> +	if (cmd->use_aio) {
> +		loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
> +		int ret = lo_rw_aio_nowait(lo, cmd, pos);
> +
> +		if (!ret)
> +			return BLK_STS_OK;
> +		if (ret != -EAGAIN)
> +			return BLK_STS_IOERR;
> +		/* fallback to workqueue for handling aio */
> +	}

Why isn't all the logic in this branch in lo_rw_aio_nowait?
Ming Lei March 11, 2025, 1:33 a.m. UTC | #2
On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote:
> On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> > Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> > queue the aio command into workqueue.
> > 
> > Fallback to workqueue in case of -EAGAIN.
> > 
> > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> > .write_iter() which might sleep even though it is NOWAIT.
> 
> This needs performance numbers (or other reasons) justifying the
> change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.

The difference is just rcu_read_lock() vs. srcu_read_lock(), and I don't
see any difference in a typical fio workload on the loop device, while the
gain is pretty obvious: bandwidth is increased by > 4X in aio workloads:

https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6

> 
> >  static DEFINE_IDR(loop_index_idr);
> >  static DEFINE_MUTEX(loop_ctl_mutex);
> >  static DEFINE_MUTEX(loop_validate_mutex);
> > @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
> >  
> >  	if (!atomic_dec_and_test(&cmd->ref))
> >  		return;
> > +
> > +	if (cmd->ret == -EAGAIN) {
> > +		struct loop_device *lo = rq->q->queuedata;
> > +
> > +		loop_queue_work(lo, cmd);
> > +		return;
> > +	}
> 
> This looks like the wrong place for the rety, as -EAGAIN can only come from
> the submissions path.  i.e. we should never make it to the full completion
> path for that case.

That is not true, at least for XFS:

[root@ktest-40 io]# bpftrace -e 'kretfunc:lo_rw_aio_complete /args->ret == -11/ { @eagain[kstack] = count() } '
Attaching 1 probe...
^C

@eagain[
    bpf_prog_6deef7357e7b4530_sd_fw_ingress+28250
    bpf_prog_6deef7357e7b4530_sd_fw_ingress+28250
    bpf_trampoline_367219848433+108
    lo_rw_aio_complete+9
    blkdev_bio_end_io_async+63
    bio_submit_split+347
    blk_mq_submit_bio+395
    __submit_bio+116
    submit_bio_noacct_nocheck+773
    submit_bio_wait+87
    xfs_rw_bdev+348
    xlog_do_io+131
    xlog_write_log_records+451
    xlog_find_tail+829
    xlog_recover+61
    xfs_log_mount+259
    xfs_mountfs+1232
    xfs_fs_fill_super+1507
    get_tree_bdev_flags+303
    vfs_get_tree+38
    vfs_cmd_create+89
    __do_sys_fsconfig+1286
    do_syscall_64+130
    entry_SYSCALL_64_after_hwframe+118
]: 2


> 
> >  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
> > +{
> > +	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
> > +	int ret;
> > +
> > +	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
> > +	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
> > +	if (ret != -EIOCBQUEUED)
> > +		lo_rw_aio_complete(&cmd->iocb, ret);
> > +	return 0;
> 
> This needs an explanation that it is for the fallback path and thus
> clears the nowait flag.

OK.

> 
> > +}
> > +
> > +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
> 
> Overly long line.
> 
> > @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  		break;
> >  	}
> >  
> > +	if (cmd->use_aio) {
> > +		loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
> > +		int ret = lo_rw_aio_nowait(lo, cmd, pos);
> > +
> > +		if (!ret)
> > +			return BLK_STS_OK;
> > +		if (ret != -EAGAIN)
> > +			return BLK_STS_IOERR;
> > +		/* fallback to workqueue for handling aio */
> > +	}
> 
> Why isn't all the logic in this branch in lo_rw_aio_nowait?

Good catch, I just found we have BLK_STS_AGAIN.


Thanks,
Ming
Christoph Hellwig March 11, 2025, 7:58 a.m. UTC | #3
On Tue, Mar 11, 2025 at 09:33:01AM +0800, Ming Lei wrote:
> On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote:
> > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> > > queue the aio command into workqueue.
> > > 
> > > Fallback to workqueue in case of -EAGAIN.
> > > 
> > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> > > .write_iter() which might sleep even though it is NOWAIT.
> > 
> > This needs performance numbers (or other reasons) justifying the
> > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.
> 
> The difference is just rcu_read_lock() vs. srcu_read_lock(), and not

No, it also includes offloading to kblockd in more cases.

>
>
> see any difference in typical fio workload on loop device, and the gain
> is pretty obvious, bandwidth is increased by > 4X in aio workloads:
> 
> https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6

Please document that in the commit log.

> > > +	if (cmd->ret == -EAGAIN) {
> > > +		struct loop_device *lo = rq->q->queuedata;
> > > +
> > > +		loop_queue_work(lo, cmd);
> > > +		return;
> > > +	}
> > 
> > This looks like the wrong place for the rety, as -EAGAIN can only come from
> > the submissions path.  i.e. we should never make it to the full completion
> > path for that case.
> 
> That is not true, at least for XFS:

Your trace sees lo_rw_aio_complete called from the block layer
splitting called from loop, I see nothing about XFS there.  But yes,
this shows the issue discussed last week in the iomap IOCB_NOWAIT
thread.
Ming Lei March 11, 2025, 10:55 a.m. UTC | #4
On Tue, Mar 11, 2025 at 12:58:46AM -0700, Christoph Hellwig wrote:
> On Tue, Mar 11, 2025 at 09:33:01AM +0800, Ming Lei wrote:
> > On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote:
> > > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote:
> > > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to
> > > > queue the aio command into workqueue.
> > > > 
> > > > Fallback to workqueue in case of -EAGAIN.
> > > > 
> > > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
> > > > .write_iter() which might sleep even though it is NOWAIT.
> > > 
> > > This needs performance numbers (or other reasons) justifying the
> > > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead.
> > 
> > The difference is just rcu_read_lock() vs. srcu_read_lock(), and not
> 
> Not, it also includes offloading to kblockd in more cases.

But loop doesn't run into these cases:

blk_execute_rq_nowait():
	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);

blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);

blk_mq_start_stopped_hw_queues
	...

> 
> >
> >
> > see any difference in typical fio workload on loop device, and the gain
> > is pretty obvious, bandwidth is increased by > 4X in aio workloads:
> > 
> > https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6
> 
> Please document that in the commit log.
> 
> > > > +	if (cmd->ret == -EAGAIN) {
> > > > +		struct loop_device *lo = rq->q->queuedata;
> > > > +
> > > > +		loop_queue_work(lo, cmd);
> > > > +		return;
> > > > +	}
> > > 
> > > This looks like the wrong place for the rety, as -EAGAIN can only come from
> > > the submissions path.  i.e. we should never make it to the full completion
> > > path for that case.
> > 
> > That is not true, at least for XFS:
> 
> Your trace sees lo_rw_aio_complete called from the block layer
> splitting called from loop, I see nothing about XFS there.  But yes,
> this shows the issue discussed last week in the iomap IOCB_NOWAIT
> thread.

Looks like I mis-parsed the stack, but -EAGAIN is still returned from blkdev's
->ki_complete() and needs to be handled.



Thanks,
Ming
diff mbox series

Patch

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9f8d32d2dc4d..46be0c8e75a6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -92,6 +92,8 @@  struct loop_cmd {
 #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
 #define LOOP_DEFAULT_HW_Q_DEPTH 128
 
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd);
+
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_ctl_mutex);
 static DEFINE_MUTEX(loop_validate_mutex);
@@ -380,8 +382,17 @@  static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
 
 	if (!atomic_dec_and_test(&cmd->ref))
 		return;
+
+	if (cmd->ret == -EAGAIN) {
+		struct loop_device *lo = rq->q->queuedata;
+
+		loop_queue_work(lo, cmd);
+		return;
+	}
+
 	kfree(cmd->bvec);
 	cmd->bvec = NULL;
+
 	if (likely(!blk_should_fake_timeout(rq->q)))
 		blk_mq_complete_request(rq);
 }
@@ -478,16 +489,34 @@  static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
+{
+	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
+	int ret;
+
+	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
+	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
+	if (ret != -EIOCBQUEUED)
+		lo_rw_aio_complete(&cmd->iocb, ret);
+	return 0;
+}
+
+static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos)
 {
 	unsigned int nr_bvec = lo_cmd_nr_bvec(cmd);
 	int ret = lo_rw_aio_prep(lo, cmd, nr_bvec);
 
 	if (ret < 0)
 		return ret;
+
+	cmd->iocb.ki_flags |= IOCB_NOWAIT;
 	ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec);
-	if (ret != -EIOCBQUEUED)
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	if (ret != -EAGAIN) {
 		lo_rw_aio_complete(&cmd->iocb, ret);
-	return 0;
+		return 0;
+	}
+	return ret;
 }
 
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
@@ -1926,6 +1955,17 @@  static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		break;
 	}
 
+	if (cmd->use_aio) {
+		loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
+		int ret = lo_rw_aio_nowait(lo, cmd, pos);
+
+		if (!ret)
+			return BLK_STS_OK;
+		if (ret != -EAGAIN)
+			return BLK_STS_IOERR;
+		/* fallback to workqueue for handling aio */
+	}
+
 	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
@@ -2076,7 +2116,8 @@  static int loop_add(int i)
 	lo->tag_set.queue_depth = hw_queue_depth;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
+	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT |
+		BLK_MQ_F_BLOCKING;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);