diff mbox

[PATCHv6,0/6] ublk zero copy support

Message ID 20250226181002.2574148-1-kbusch@meta.com (mailing list archive)
State New
Headers show

Commit Message

Keith Busch Feb. 26, 2025, 6:09 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Changes from v5:

  Merged up to latest block for-next tree

  Fixed up the io_uring read/write fixed prep to not set do_import, and
  actually use the issue_flags when importing the buffer node (Pavel,
  Caleb).

  Used unambiguous names for the read/write permissions of registered
  kernel vectors, defined them using their symbolic names instead of
  literals, and added a BUILD_BUG_ON to ensure the flags fit in the
  type (Ming, Pavel).

  Limit the io cache size to 64 elements (Pavel).

  Enforce that unprivileged ublk devices can't use zero copy (Ming).

  Various cleanups.

  Added reviews

Keith Busch (5):
  io_uring/rw: move fixed buffer import to issue path
  io_uring: add support for kernel registered bvecs
  ublk: zc register/unregister bvec
  io_uring: add abstraction for buf_table rsrc data
  io_uring: cache nodes and mapped buffers

Xinyu Zhang (1):
  nvme: map uring_cmd data even if address is 0

 drivers/block/ublk_drv.c       | 119 +++++++++-----
 drivers/nvme/host/ioctl.c      |   2 +-
 include/linux/io_uring/cmd.h   |   7 +
 include/linux/io_uring_types.h |  24 +--
 include/uapi/linux/ublk_cmd.h  |   4 +
 io_uring/fdinfo.c              |   8 +-
 io_uring/filetable.c           |   2 +-
 io_uring/io_uring.c            |   3 +
 io_uring/nop.c                 |   2 +-
 io_uring/opdef.c               |   4 +-
 io_uring/register.c            |   2 +-
 io_uring/rsrc.c                | 280 ++++++++++++++++++++++++++-------
 io_uring/rsrc.h                |  10 +-
 io_uring/rw.c                  |  39 +++--
 io_uring/rw.h                  |   2 +
 15 files changed, 389 insertions(+), 119 deletions(-)

Comments

Keith Busch Feb. 26, 2025, 6:17 p.m. UTC | #1
I duplicated the format-patch command into the same directory, so I
completely screwed up this patch thread. Please disregard this one, and
we'll start a new one. Sorry for the noise.
diff mbox

Patch

diff --git a/include/ublk_cmd.h b/include/ublk_cmd.h
index 0150003..07439be 100644
--- a/include/ublk_cmd.h
+++ b/include/ublk_cmd.h
@@ -94,6 +94,10 @@ 
 	_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
 #define	UBLK_U_IO_NEED_GET_DATA		\
 	_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+#define UBLK_U_IO_REGISTER_IO_BUF	\
+	_IOWR('u', 0x23, struct ublksrv_io_cmd)
+#define UBLK_U_IO_UNREGISTER_IO_BUF	\
+	_IOWR('u', 0x24, struct ublksrv_io_cmd)
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
diff --git a/include/ublksrv_tgt.h b/include/ublksrv_tgt.h
index 1deee2b..c331963 100644
--- a/include/ublksrv_tgt.h
+++ b/include/ublksrv_tgt.h
@@ -99,6 +99,7 @@  struct ublk_io_tgt {
 	co_handle_type co;
 	const struct io_uring_cqe *tgt_io_cqe;
 	int queued_tgt_io;	/* obsolete */
+	bool needs_unregister;
 };
 
 static inline struct ublk_io_tgt *__ublk_get_io_tgt_data(const struct ublk_io_data *io)
diff --git a/lib/ublksrv.c b/lib/ublksrv.c
index 16a9e13..7205247 100644
--- a/lib/ublksrv.c
+++ b/lib/ublksrv.c
@@ -619,6 +619,15 @@  skip_alloc_buf:
 		goto fail;
 	}
 
+	if (ctrl_dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+		if (ret) {
+			ublk_err("ublk dev %d queue %d register spare buffers failed %d",
+					q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret);
+			goto fail;
+		}
+	}
+
 	io_uring_register_ring_fd(&q->ring);
 
 	/*
diff --git a/tgt_loop.cpp b/tgt_loop.cpp
index 0f16676..91f8c81 100644
--- a/tgt_loop.cpp
+++ b/tgt_loop.cpp
@@ -246,12 +246,70 @@  static inline int loop_fallocate_mode(const struct ublksrv_io_desc *iod)
        return mode;
 }
 
+static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
+		int dev_fd, int tag, int q_id, __u64 index)
+{
+	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
+	sqe->opcode		= IORING_OP_URING_CMD;
+	sqe->flags		|= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE;
+	sqe->cmd_op		= UBLK_U_IO_REGISTER_IO_BUF;
+
+	cmd->tag		= tag;
+	cmd->addr		= index;
+	cmd->q_id		= q_id;
+}
+
+static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
+		int dev_fd, int tag, int q_id, __u64 index)
+{
+	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
+	sqe->opcode             = IORING_OP_URING_CMD;
+	sqe->flags              |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE;
+	sqe->cmd_op             = UBLK_U_IO_UNREGISTER_IO_BUF;
+
+	cmd->tag                = tag;
+	cmd->addr               = index;
+	cmd->q_id               = q_id;
+}
+
+static void loop_unregister(const struct ublksrv_queue *q, int tag)
+{
+	struct io_uring_sqe *sqe;
+
+	ublk_get_sqe_pair(q->ring_ptr, &sqe, NULL);
+	io_uring_prep_buf_unregister(sqe, 0, tag, q->q_id, tag);
+}
+
 static void loop_queue_tgt_read(const struct ublksrv_queue *q,
-		const struct ublksrv_io_desc *iod, int tag)
+		const struct ublk_io_data *data, int tag)
 {
+	struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
+	const struct ublksrv_io_desc *iod = data->iod;
+	const struct ublksrv_ctrl_dev_info *info =
+		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev));
 	unsigned ublk_op = ublksrv_get_op(iod);
 
-	if (user_copy) {
+	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		struct io_uring_sqe *reg;
+		struct io_uring_sqe *read;
+
+		ublk_get_sqe_pair(q->ring_ptr, &reg, &read);
+
+		io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag);
+
+		io_uring_prep_read_fixed(read, 1 /*fds[1]*/,
+			0,
+			iod->nr_sectors << 9,
+			iod->start_sector << 9,
+			tag);
+		io_uring_sqe_set_flags(read, IOSQE_FIXED_FILE);
+		read->user_data = build_user_data(tag, ublk_op, 0, 1);
+		io->needs_unregister = true;
+	} else if (user_copy) {
 		struct io_uring_sqe *sqe, *sqe2;
 		__u64 pos = ublk_pos(q->q_id, tag, 0);
 		void *buf = ublksrv_queue_get_io_buf(q, tag);
@@ -284,11 +342,31 @@  static void loop_queue_tgt_read(const struct ublksrv_queue *q,
 }
 
 static void loop_queue_tgt_write(const struct ublksrv_queue *q,
-		const struct ublksrv_io_desc *iod, int tag)
+		const struct ublk_io_data *data, int tag)
 {
+	const struct ublksrv_io_desc *iod = data->iod;
+	const struct ublksrv_ctrl_dev_info *info =
+		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev));
 	unsigned ublk_op = ublksrv_get_op(iod);
 
-	if (user_copy) {
+	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
+		struct io_uring_sqe *reg;
+		struct io_uring_sqe *write;
+
+		ublk_get_sqe_pair(q->ring_ptr, &reg, &write);
+		io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag);
+
+		io_uring_prep_write_fixed(write, 1 /*fds[1]*/,
+			0,
+			iod->nr_sectors << 9,
+			iod->start_sector << 9,
+			tag);
+		io_uring_sqe_set_flags(write, IOSQE_FIXED_FILE);
+		write->user_data = build_user_data(tag, ublk_op, 0, 1);
+
+		io->needs_unregister = true;
+	} else if (user_copy) {
 		struct io_uring_sqe *sqe, *sqe2;
 		__u64 pos = ublk_pos(q->q_id, tag, 0);
 		void *buf = ublksrv_queue_get_io_buf(q, tag);
@@ -352,10 +430,10 @@  static int loop_queue_tgt_io(const struct ublksrv_queue *q,
 		sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
 		break;
 	case UBLK_IO_OP_READ:
-		loop_queue_tgt_read(q, iod, tag);
+		loop_queue_tgt_read(q, data, tag);
 		break;
 	case UBLK_IO_OP_WRITE:
-		loop_queue_tgt_write(q, iod, tag);
+		loop_queue_tgt_write(q, data, tag);
 		break;
 	default:
 		return -EINVAL;
@@ -387,6 +465,10 @@  static co_io_job __loop_handle_io_async(const struct ublksrv_queue *q,
 		if (io->tgt_io_cqe->res == -EAGAIN)
 			goto again;
 
+		if (io->needs_unregister) {
+			io->needs_unregister = false;
+			loop_unregister(q, tag);
+		}
 		ublksrv_complete_io(q, tag, io->tgt_io_cqe->res);
 	} else if (ret < 0) {
 		ublk_err( "fail to queue io %d, ret %d\n", tag, tag);
diff --git a/ublksrv_tgt.cpp b/ublksrv_tgt.cpp
index 8f9cf28..f3ebe14 100644
--- a/ublksrv_tgt.cpp
+++ b/ublksrv_tgt.cpp
@@ -723,7 +723,7 @@  static int cmd_dev_add(int argc, char *argv[])
 			data.tgt_type = optarg;
 			break;
 		case 'z':
-			data.flags |= UBLK_F_SUPPORT_ZERO_COPY;
+			data.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
 			break;
 		case 'q':
 			data.nr_hw_queues = strtol(optarg, NULL, 10);