@@ -83,6 +83,8 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_ABORTED 0x04
+#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+
struct ublk_io {
/* userspace buffer address from io cmd */
__u64 addr;
@@ -160,11 +162,19 @@ static struct lock_class_key ublk_bio_compl_lkclass;
static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
{
if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
+ !(ubq->flags & UBLK_F_NEED_GET_DATA) &&
!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
return true;
return false;
}
+static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
+{
+ if (ubq->flags & UBLK_F_NEED_GET_DATA)
+ return true;
+ return false;
+}
+
static struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
@@ -514,6 +524,21 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
}
}
+static void ubq_complete_io_cmd(struct ublk_io *io, int res)
+{
+ /* mark this cmd owned by ublksrv */
+ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
+
+ /*
+ * clear ACTIVE since we are done with this sqe/cmd slot
+ * We can only accept io cmd in case of being not active.
+ */
+ io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+
+ /* tell ublksrv one io request is coming */
+ io_uring_cmd_done(io->cmd, res, 0);
+}
+
#define UBLK_REQUEUE_DELAY_MS 3
static inline void __ublk_rq_task_work(struct request *req)
@@ -536,6 +561,20 @@ static inline void __ublk_rq_task_work(struct request *req)
return;
}
+ if (ublk_need_get_data(ubq) &&
+ (req_op(req) == REQ_OP_WRITE ||
+ req_op(req) == REQ_OP_FLUSH) &&
+ !(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
+
+ pr_devel("%s: ublk_need_get_data. op %d, qid %d tag %d io_flags %x\n",
+ __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags);
+
+ io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+
+ ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
+ return;
+ }
+
mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
@@ -558,17 +597,13 @@ static inline void __ublk_rq_task_work(struct request *req)
mapped_bytes >> 9;
}
- /* mark this cmd owned by ublksrv */
- io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
-
/*
- * clear ACTIVE since we are done with this sqe/cmd slot
- * We can only accept io cmd in case of being not active.
+ * Anyway, we have handled UBLK_IO_NEED_GET_DATA for WRITE/FLUSH requests,
+ * or we did nothing for other types requests.
*/
- io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
- /* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
+ ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
}
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
@@ -860,6 +895,16 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
mutex_unlock(&ub->mutex);
}
+static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
+ int tag, struct io_uring_cmd *cmd)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+
+ pdu->req = req;
+ io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+}
+
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@@ -898,6 +943,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
goto out;
}
+ /*
+ * ensure that the user issues UBLK_IO_NEED_GET_DATA
+ * if the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
+ */
+ if ((io->flags & UBLK_IO_FLAG_NEED_GET_DATA)
+ && (cmd_op != UBLK_IO_NEED_GET_DATA))
+ goto out;
+
switch (cmd_op) {
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
@@ -931,6 +984,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
io->cmd = cmd;
ublk_commit_completion(ub, ub_cmd);
break;
+ case UBLK_IO_NEED_GET_DATA:
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+ goto out;
+ io->addr = ub_cmd->addr;
+ io->cmd = cmd;
+ io->flags |= UBLK_IO_FLAG_ACTIVE;
+ ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
+ break;
default:
goto out;
}
@@ -28,12 +28,22 @@
* this IO request, request's handling result is committed to ublk
* driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
* handled before completing io request.
+ *
+ * NEED_GET_DATA: only used for write requests to set io addr and copy data
+ * When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA
+ * command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA.
+ *
+ * It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
+ * while starting a ublk device.
*/
#define UBLK_IO_FETCH_REQ 0x20
#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21
+#define UBLK_IO_NEED_GET_DATA 0x22
+
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
+#define UBLK_IO_RES_NEED_GET_DATA 1
#define UBLK_IO_RES_ABORT (-ENODEV)
#define UBLKSRV_CMD_BUF_OFFSET 0
@@ -54,6 +64,15 @@
*/
#define UBLK_F_URING_CMD_COMP_IN_TASK (1UL << 1)
+/*
+ * User should issue io cmd again for write requests to
+ * set io buffer address and copy data from bio vectors
+ * to the userspace io buffer.
+ *
+ * In this mode, task_work is not used.
+ */
+#define UBLK_F_NEED_GET_DATA (1UL << 3)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
Add one new ublk command: UBLK_IO_NEED_GET_DATA. It is designed for a user application who wants to allocate IO buffer and set IO buffer address only after it receives an IO request. 1. Background: For now, ublk requires the user to set IO buffer address in advance(with last UBLK_IO_COMMIT_AND_FETCH_REQ command)so the user has to pre-allocate IO buffer. For READ requests, this work flow looks good because the data copy happens after user application gets a cqe and the kernel copies data. So user application can allocate IO buffer, copy data to be read into it, and issues a sqe with the newly allocated IO buffer. However, for WRITE requests, this work flow looks weird because the data copy happens in a task_work before the user application gets one cqe. So it is inconvenient for users who allocates(or fetch from a memory pool)buffer after it gets one request(and know the actual data size). 2. Design: Consider add a new feature flag: UBLK_F_NEED_GET_DATA. If user sets this new flag(through libublksrv) and pass it to kernel driver, ublk kernel driver should returns a cqe with UBLK_IO_RES_NEED_GET_DATA after a new blk-mq WRITE request comes. A user application now can allocate data buffer for writing and pass its address in UBLK_IO_NEED_GET_DATA command after ublk kernel driver returns cqe with UBLK_IO_RES_NEED_GET_DATA. After the kernel side gets the sqe (UBLK_IO_NEED_GET_DATA command), it now copies(address pinned, indeed) data to be written from bio vectors to newly returned IO data buffer. Finally, the kernel side returns UBLK_IO_RES_OK and ublksrv handles the IO request as usual.The new feature: UBLK_F_NEED_GET_DATA is enabled on demand ublksrv still can pre-allocate data buffers with task_work. 3. Evaluation: We modify ublksrv to support UBLK_F_NEED_GET_DATA and the modification will be PR-ed to Ming Lei's github repository soon if this patch is okay. We have tested write latency with: (1) No UBLK_F_NEED_GET_DATA(the old commit) as baseline (2) UBLK_F_NEED_GET_DATA enabled/disabled on demo_null and demo_event of newest ublksrv project. Config of fio:bs=4k, iodepth=1, numjobs=1, rw=write/randwrite, direct=1, ioengine=libaio. Here is the comparison of lat(usec) in fio: demo_null: write: 28.74(baseline) -- 28.77(disable) -- 57.20(enable) randwrite: 27.81(baseline) -- 28.51(disable) -- 54.81(enable) demo_event: write: 46.45(baseline) -- 43.31(disable) -- 75.50(enable) randwrite: 45.39(baseline) -- 43.66(disable) -- 76.02(enable) Looks like: (1) UBLK_F_NEED_GET_DATA does not introduce additional overhead when comparing baseline and disable. (2) enabling UBLK_F_NEED_GET_DATA adds about two times more latency than disabling it. And it is reasonable since the IO goes through the total ublk IO stack(ubd_drv <--> ublksrv) once again. (3) demo_null and demo_event are both null targets. And I think this overhead is not too heavy if real data handling backend is used. Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com> --- drivers/block/ublk_drv.c | 77 +++++++++++++++++++++++++++++++---- include/uapi/linux/ublk_cmd.h | 19 +++++++++ 2 files changed, 88 insertions(+), 8 deletions(-)