diff mbox series

[RFC,2/4] io_uring/rw: support read/write with metadata

Message ID 20240322185023.131697-3-joshi.k@samsung.com (mailing list archive)
State New
Headers show
Series Read/Write with meta buffer | expand

Commit Message

Kanchan Joshi March 22, 2024, 6:50 p.m. UTC
From: Anuj Gupta <anuj20.g@samsung.com>

This patch introduces IORING_OP_READ_META and IORING_OP_WRITE_META
opcodes which allow sending a meta buffer along with read/write.

Applications can do that by using the newly added meta_addr and meta_len
fields of the SQE.

These opcodes are supported only for direct IO.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
 include/linux/fs.h            |  1 +
 include/uapi/linux/io_uring.h |  6 +++
 io_uring/io_uring.c           |  2 +
 io_uring/opdef.c              | 29 ++++++++++++
 io_uring/rw.c                 | 86 +++++++++++++++++++++++++++++++++--
 io_uring/rw.h                 |  8 ++++
 6 files changed, 129 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0a22b7245982..c3a483a4fdac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -327,6 +327,7 @@  struct readahead_control;
 #define IOCB_NOIO		(1 << 20)
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
+#define IOCB_USE_META		(1 << 22)
 /*
  * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
  * iocb completion can be passed back to the owner for execution from a safe
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02b..87bd44098037 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -97,6 +97,10 @@  struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	meta_addr;
+			__u32	meta_len;
+		};
 		__u64	optval;
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
@@ -256,6 +260,8 @@  enum io_uring_op {
 	IORING_OP_FUTEX_WAITV,
 	IORING_OP_FIXED_FD_INSTALL,
 	IORING_OP_FTRUNCATE,
+	IORING_OP_READ_META,
+	IORING_OP_WRITE_META,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 49a124daa359..7c380cac4465 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4134,7 +4134,9 @@  static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
+	BUILD_BUG_SQE_ELEM(48, __u64,  meta_addr);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(56, __u32,  meta_len);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9c080aadc5a6..cb31573ac4ad 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -146,6 +146,26 @@  const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_READ_META] = {
+		.needs_file		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.prep			= io_prep_rw_meta,
+		.issue			= io_rw_meta,
+	},
+	[IORING_OP_WRITE_META] = {
+		.needs_file		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.prep			= io_prep_rw_meta,
+		.issue			= io_rw_meta,
+	},
 	[IORING_OP_RECVMSG] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -501,6 +521,15 @@  const struct io_cold_def io_cold_defs[] = {
 		.cleanup		= io_readv_writev_cleanup,
 		.fail			= io_rw_fail,
 	},
+	[IORING_OP_READ_META] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "READ_META",
+		.fail			= io_rw_fail,
+	},
+	[IORING_OP_WRITE_META] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "WRITE_META",
+		/* same failure handler as READ_META, so both meta opcodes fail alike */
+		.fail			= io_rw_fail,
+	},
 	[IORING_OP_FSYNC] = {
 		.name			= "FSYNC",
 	},
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 40f6c2a59928..87a6304052f0 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -27,6 +27,7 @@  struct io_rw {
 	struct kiocb			kiocb;
 	u64				addr;
 	u32				len;
+	u32				meta_len;
 };
 
 static inline bool io_file_supports_nowait(struct io_kiocb *req)
@@ -107,6 +108,22 @@  int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+/*
+ * Prep handler for IORING_OP_READ_META / IORING_OP_WRITE_META.
+ *
+ * Runs the common io_prep_rw() setup first and bails out with its error if
+ * that fails. Afterwards the SQE's meta_addr is parked in kiocb->private
+ * (as a user pointer), meta_len is stored in the io_rw command, and the
+ * kiocb is tagged with IOCB_USE_META.
+ *
+ * Returns 0 on success, or the error returned by io_prep_rw().
+ *
+ * NOTE(review): confirm IOCB_USE_META is not lost if ki_flags gets
+ * re-initialised at issue time.
+ */
+int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	int err = io_prep_rw(req, sqe);
+
+	if (unlikely(err))
+		return err;
+
+	rw->kiocb.private = u64_to_user_ptr(READ_ONCE(sqe->meta_addr));
+	rw->meta_len = READ_ONCE(sqe->meta_len);
+	rw->kiocb.ki_flags |= IOCB_USE_META;
+	return 0;
+}
+
 int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	int ret;
@@ -571,9 +588,18 @@  static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 	}
 }
 
+/*
+ * Copy the meta-buffer iterator from @sm into the async data @iorw and
+ * snapshot the copied iterator's state alongside it.
+ */
+static inline void io_req_map_meta(struct io_async_rw *iorw, struct io_rw_state_meta *sm)
+{
+	struct io_rw_state_meta *dst = &iorw->s_meta;
+
+	memcpy(&dst->iter_meta, &sm->iter_meta, sizeof(dst->iter_meta));
+	iov_iter_save_state(&dst->iter_meta, &dst->iter_state_meta);
+}
+
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     struct io_rw_state *s, bool force)
 {
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct kiocb *kiocb = &rw->kiocb;
+
 	if (!force && !io_cold_defs[req->opcode].prep_async)
 		return 0;
 	/* opcode type doesn't need async data */
@@ -591,6 +617,11 @@  static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 		iorw = req->async_data;
 		/* we've copied and mapped the iter, ensure state is saved */
 		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
+		if (unlikely(kiocb->ki_flags & IOCB_USE_META)) {
+			struct io_rw_state_meta *sm = kiocb->private;
+
+			io_req_map_meta(iorw, sm);
+		}
 	}
 	return 0;
 }
@@ -747,7 +778,8 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
 			return -EOPNOTSUPP;
 
-		kiocb->private = NULL;
+		if (likely(!(kiocb->ki_flags & IOCB_USE_META)))
+			kiocb->private = NULL;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
@@ -766,6 +798,7 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_rw_state __s, *s = &__s;
 	struct iovec *iovec;
 	struct kiocb *kiocb = &rw->kiocb;
+	struct io_rw_state_meta *sm = kiocb->private;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	struct io_async_rw *io;
 	ssize_t ret, ret2;
@@ -840,13 +873,16 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 		/* no retry on NONBLOCK nor RWF_NOWAIT */
 		if (req->flags & REQ_F_NOWAIT)
 			goto done;
+		if (kiocb->ki_flags & IOCB_USE_META)
+			kiocb->private = sm;
 		ret = 0;
 	} else if (ret == -EIOCBQUEUED) {
 		if (iovec)
 			kfree(iovec);
 		return IOU_ISSUE_SKIP_COMPLETE;
 	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
-		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
+		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
+		   (kiocb->ki_flags & IOCB_USE_META)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
@@ -857,6 +893,12 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&s->iter, &s->iter_state);
+	if (unlikely(kiocb->ki_flags & IOCB_USE_META)) {
+		/* don't handle partial completion for read + meta */
+		if (ret > 0)
+			goto done;
+		iov_iter_restore(&sm->iter_meta, &sm->iter_state_meta);
+	}
 
 	ret2 = io_setup_async_rw(req, iovec, s, true);
 	iovec = NULL;
@@ -1070,7 +1112,8 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
 			goto copy_iov;
 
-		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
+		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)
+				&& !(kiocb->ki_flags & IOCB_USE_META)) {
 			struct io_async_rw *io;
 
 			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
@@ -1111,6 +1154,43 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
+/*
+ * Issue handler shared by IORING_OP_READ_META and IORING_OP_WRITE_META.
+ *
+ * Only files opened with O_DIRECT are accepted; anything else gets
+ * -EOPNOTSUPP. On the first issue the user meta buffer (address left in
+ * kiocb->private by io_prep_rw_meta(), length in rw->meta_len) is imported
+ * into an on-stack iterator; when async data already exists, the iterator
+ * stored there is rewound and reused instead. kiocb->private is then pointed
+ * at that iterator state before calling __io_read() or io_write().
+ *
+ * Returns the result of kiocb_done() for a read that returned >= 0, otherwise
+ * the value returned by __io_read()/io_write(). On -EAGAIN kiocb->private is
+ * switched back to the user meta address.
+ *
+ * NOTE(review): for return values other than -EAGAIN, kiocb->private may
+ * still point at the on-stack __sm; confirm nothing dereferences it after
+ * this function returns.
+ */
+int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct kiocb *kiocb = &rw->kiocb;
+	/* go through unsigned long so the pointer->u64 cast is size-safe */
+	void __user *meta_addr = u64_to_user_ptr((u64)(unsigned long)kiocb->private);
+	bool is_read = req->opcode == IORING_OP_READ_META;
+	struct io_rw_state_meta __sm, *sm = &__sm;
+	int ret;
+
+	if (!(req->file->f_flags & O_DIRECT))
+		return -EOPNOTSUPP;
+	/* prepare iter for meta-buffer */
+	if (!req_has_async_data(req)) {
+		/* a read fills the meta buffer, a write consumes it */
+		ret = import_ubuf(is_read ? ITER_DEST : ITER_SOURCE, meta_addr,
+				  rw->meta_len, &sm->iter_meta);
+		if (unlikely(ret < 0))
+			return ret;
+		/* only snapshot an iterator that was set up successfully */
+		iov_iter_save_state(&sm->iter_meta, &sm->iter_state_meta);
+	} else {
+		struct io_async_rw *io = req->async_data;
+
+		sm = &io->s_meta;
+		iov_iter_restore(&sm->iter_meta, &sm->iter_state_meta);
+	}
+	/* Store iter for meta-buf in private, will be used later */
+	kiocb->private = sm;
+	if (is_read) {
+		ret = __io_read(req, issue_flags);
+		if (ret >= 0)
+			return kiocb_done(req, ret, issue_flags);
+	} else {
+		ret = io_write(req, issue_flags);
+	}
+	/* put the user address back so the next issue reads it from private */
+	if (ret == -EAGAIN)
+		kiocb->private = meta_addr;
+	return ret;
+}
+
 void io_rw_fail(struct io_kiocb *req)
 {
 	int res;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index f9e89b4fe4da..7c12216776bc 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -8,19 +8,27 @@  struct io_rw_state {
 	struct iovec			fast_iov[UIO_FASTIOV];
 };
 
+/*
+ * Iterator over the meta buffer together with a saved copy of its state,
+ * so the iterator can be rewound with iov_iter_restore() before a retry.
+ */
+struct io_rw_state_meta {
+	struct iov_iter			iter_meta;
+	struct iov_iter_state		iter_state_meta;
+};
+
 struct io_async_rw {
 	struct io_rw_state		s;
+	/* meta-buffer iterator and saved state; filled by io_req_map_meta() */
+	struct io_rw_state_meta		s_meta;
 	const struct iovec		*free_iovec;
 	size_t				bytes_done;
 	struct wait_page_queue		wpq;
 };
 
 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_read(struct io_kiocb *req, unsigned int issue_flags);
 int io_readv_prep_async(struct io_kiocb *req);
 int io_write(struct io_kiocb *req, unsigned int issue_flags);
+int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags);
 int io_writev_prep_async(struct io_kiocb *req);
 void io_readv_writev_cleanup(struct io_kiocb *req);
 void io_rw_fail(struct io_kiocb *req);