Message ID | 20250308162312.1640828-5-ming.lei@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | loop: improve loop aio perf by IOCB_NOWAIT | expand |
On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote: > Try to handle loop aio command via NOWAIT IO first, then we can avoid to > queue the aio command into workqueue. > > Fallback to workqueue in case of -EAGAIN. > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or > .write_iter() which might sleep even though it is NOWAIT. This needs performance numbers (or other reasons) justifying the change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead. > static DEFINE_IDR(loop_index_idr); > static DEFINE_MUTEX(loop_ctl_mutex); > static DEFINE_MUTEX(loop_validate_mutex); > @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) > > if (!atomic_dec_and_test(&cmd->ref)) > return; > + > + if (cmd->ret == -EAGAIN) { > + struct loop_device *lo = rq->q->queuedata; > + > + loop_queue_work(lo, cmd); > + return; > + } This looks like the wrong place for the rety, as -EAGAIN can only come from the submissions path. i.e. we should never make it to the full completion path for that case. > static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos) > +{ > + unsigned int nr_bvec = lo_cmd_nr_bvec(cmd); > + int ret; > + > + cmd->iocb.ki_flags &= ~IOCB_NOWAIT; > + ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec); > + if (ret != -EIOCBQUEUED) > + lo_rw_aio_complete(&cmd->iocb, ret); > + return 0; This needs an explanation that it is for the fallback path and thus clears the nowait flag. > +} > + > +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos) Overly long line. > @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, > break; > } > > + if (cmd->use_aio) { > + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; > + int ret = lo_rw_aio_nowait(lo, cmd, pos); > + > + if (!ret) > + return BLK_STS_OK; > + if (ret != -EAGAIN) > + return BLK_STS_IOERR; > + /* fallback to workqueue for handling aio */ > + } Why isn't all the logic in this branch in lo_rw_aio_nowait?
On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote: > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote: > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to > > queue the aio command into workqueue. > > > > Fallback to workqueue in case of -EAGAIN. > > > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or > > .write_iter() which might sleep even though it is NOWAIT. > > This needs performance numbers (or other reasons) justifying the > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead. The difference is just rcu_read_lock() vs. srcu_read_lock(), and not see any difference in typical fio workload on loop device, and the gain is pretty obvious, bandwidth is increased by > 4X in aio workloads: https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6 > > > static DEFINE_IDR(loop_index_idr); > > static DEFINE_MUTEX(loop_ctl_mutex); > > static DEFINE_MUTEX(loop_validate_mutex); > > @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) > > > > if (!atomic_dec_and_test(&cmd->ref)) > > return; > > + > > + if (cmd->ret == -EAGAIN) { > > + struct loop_device *lo = rq->q->queuedata; > > + > > + loop_queue_work(lo, cmd); > > + return; > > + } > > This looks like the wrong place for the rety, as -EAGAIN can only come from > the submissions path. i.e. we should never make it to the full completion > path for that case. That is not true, at least for XFS: [root@ktest-40 io]# bpftrace -e 'kretfunc:lo_rw_aio_complete /args->ret == -11/ { @eagain[kstack] = count() } ' Attaching 1 probe... ^C @eagain[ bpf_prog_6deef7357e7b4530_sd_fw_ingress+28250 bpf_prog_6deef7357e7b4530_sd_fw_ingress+28250 bpf_trampoline_367219848433+108 lo_rw_aio_complete+9 blkdev_bio_end_io_async+63 bio_submit_split+347 blk_mq_submit_bio+395 __submit_bio+116 submit_bio_noacct_nocheck+773 submit_bio_wait+87 xfs_rw_bdev+348 xlog_do_io+131 xlog_write_log_records+451 xlog_find_tail+829 xlog_recover+61 xfs_log_mount+259 xfs_mountfs+1232 xfs_fs_fill_super+1507 get_tree_bdev_flags+303 vfs_get_tree+38 vfs_cmd_create+89 __do_sys_fsconfig+1286 do_syscall_64+130 entry_SYSCALL_64_after_hwframe+118 ]: 2 > > > static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos) > > +{ > > + unsigned int nr_bvec = lo_cmd_nr_bvec(cmd); > > + int ret; > > + > > + cmd->iocb.ki_flags &= ~IOCB_NOWAIT; > > + ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec); > > + if (ret != -EIOCBQUEUED) > > + lo_rw_aio_complete(&cmd->iocb, ret); > > + return 0; > > This needs an explanation that it is for the fallback path and thus > clears the nowait flag. OK. > > > +} > > + > > +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos) > > Overly long line. > > > @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, > > break; > > } > > > > + if (cmd->use_aio) { > > + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; > > + int ret = lo_rw_aio_nowait(lo, cmd, pos); > > + > > + if (!ret) > > + return BLK_STS_OK; > > + if (ret != -EAGAIN) > > + return BLK_STS_IOERR; > > + /* fallback to workqueue for handling aio */ > > + } > > Why isn't all the logic in this branch in lo_rw_aio_nowait? Good catch, I just found we have BLK_STS_AGAIN. Thanks, Ming
On Tue, Mar 11, 2025 at 09:33:01AM +0800, Ming Lei wrote: > On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote: > > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote: > > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to > > > queue the aio command into workqueue. > > > > > > Fallback to workqueue in case of -EAGAIN. > > > > > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or > > > .write_iter() which might sleep even though it is NOWAIT. > > > > This needs performance numbers (or other reasons) justifying the > > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead. > > The difference is just rcu_read_lock() vs. srcu_read_lock(), and not Not, it also includes offloading to kblockd in more cases. > > > see any difference in typical fio workload on loop device, and the gain > is pretty obvious, bandwidth is increased by > 4X in aio workloads: > > https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6 Please document that in the commit log. > > > + if (cmd->ret == -EAGAIN) { > > > + struct loop_device *lo = rq->q->queuedata; > > > + > > > + loop_queue_work(lo, cmd); > > > + return; > > > + } > > > > This looks like the wrong place for the rety, as -EAGAIN can only come from > > the submissions path. i.e. we should never make it to the full completion > > path for that case. > > That is not true, at least for XFS: Your trace sees lo_rw_aio_complete called from the block layer splitting called from loop, I see nothing about XFS there. But yes, this shows the issue discussed last week in the iomap IOCB_NOWAIT thread.
On Tue, Mar 11, 2025 at 12:58:46AM -0700, Christoph Hellwig wrote: > On Tue, Mar 11, 2025 at 09:33:01AM +0800, Ming Lei wrote: > > On Mon, Mar 10, 2025 at 12:14:44PM +0100, Christoph Hellwig wrote: > > > On Sun, Mar 09, 2025 at 12:23:08AM +0800, Ming Lei wrote: > > > > Try to handle loop aio command via NOWAIT IO first, then we can avoid to > > > > queue the aio command into workqueue. > > > > > > > > Fallback to workqueue in case of -EAGAIN. > > > > > > > > BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or > > > > .write_iter() which might sleep even though it is NOWAIT. > > > > > > This needs performance numbers (or other reasons) justifying the > > > change, especially as BLK_MQ_F_BLOCKING is a bit of an overhead. > > > > The difference is just rcu_read_lock() vs. srcu_read_lock(), and not > > Not, it also includes offloading to kblockd in more cases. But loop doesn't run into these cases: blk_execute_rq_nowait(): blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); blk_mq_start_stopped_hw_queues ... > > > > > > > see any difference in typical fio workload on loop device, and the gain > > is pretty obvious, bandwidth is increased by > 4X in aio workloads: > > > > https://lore.kernel.org/linux-block/f7c9d956-2b9b-8bb4-aa49-d57323fc8eb0@redhat.com/T/#md3a6154218cb6619d8af5432cf2dd3a4a7a3dcc6 > > Please document that in the commit log. > > > > > + if (cmd->ret == -EAGAIN) { > > > > + struct loop_device *lo = rq->q->queuedata; > > > > + > > > > + loop_queue_work(lo, cmd); > > > > + return; > > > > + } > > > > > > This looks like the wrong place for the rety, as -EAGAIN can only come from > > > the submissions path. i.e. we should never make it to the full completion > > > path for that case. > > > > That is not true, at least for XFS: > > Your trace sees lo_rw_aio_complete called from the block layer > splitting called from loop, I see nothing about XFS there. But yes, > this shows the issue discussed last week in the iomap IOCB_NOWAIT > thread. Looks I mis-parse the stack, but it is still returned from blkdev's ->ki_complete(), and need to be handled. Thanks, Ming
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9f8d32d2dc4d..46be0c8e75a6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -92,6 +92,8 @@ struct loop_cmd { #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH 128 +static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd); + static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); @@ -380,8 +382,17 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) if (!atomic_dec_and_test(&cmd->ref)) return; + + if (cmd->ret == -EAGAIN) { + struct loop_device *lo = rq->q->queuedata; + + loop_queue_work(lo, cmd); + return; + } + kfree(cmd->bvec); cmd->bvec = NULL; + if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } @@ -478,16 +489,34 @@ static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos) +{ + unsigned int nr_bvec = lo_cmd_nr_bvec(cmd); + int ret; + + cmd->iocb.ki_flags &= ~IOCB_NOWAIT; + ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec); + if (ret != -EIOCBQUEUED) + lo_rw_aio_complete(&cmd->iocb, ret); + return 0; +} + +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos) { unsigned int nr_bvec = lo_cmd_nr_bvec(cmd); int ret = lo_rw_aio_prep(lo, cmd, nr_bvec); if (ret < 0) return ret; + + cmd->iocb.ki_flags |= IOCB_NOWAIT; ret = lo_submit_rw_aio(lo, cmd, pos, nr_bvec); - if (ret != -EIOCBQUEUED) + if (ret == -EIOCBQUEUED) + return 0; + if (ret != -EAGAIN) { lo_rw_aio_complete(&cmd->iocb, ret); - return 0; + return 0; + } + return ret; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) @@ -1926,6 +1955,17 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, break; } + if (cmd->use_aio) { + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; + int ret = lo_rw_aio_nowait(lo, cmd, pos); + + if (!ret) + return BLK_STS_OK; + if (ret != -EAGAIN) + return BLK_STS_IOERR; + /* fallback to workqueue for handling aio */ + } + loop_queue_work(lo, cmd); return BLK_STS_OK; @@ -2076,7 +2116,8 @@ static int loop_add(int i) lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT | + BLK_MQ_F_BLOCKING; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set);
Try to handle loop aio command via NOWAIT IO first, then we can avoid to queue the aio command into workqueue. Fallback to workqueue in case of -EAGAIN. BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or .write_iter() which might sleep even though it is NOWAIT. Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/block/loop.c | 47 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-)