
[v8,06/10] io_uring/rw: add support to send metadata along with read/write

Message ID 20241106121842.5004-7-anuj20.g@samsung.com (mailing list archive)
State Superseded
Series [v8,01/10] block: define set of integrity flags to be inherited by cloned bip

Commit Message

Anuj Gupta Nov. 6, 2024, 12:18 p.m. UTC
This patch adds the capability of passing integrity metadata along with
read/write. A new ext_cap (extended capability) field is introduced in the
SQE to indicate the type of extra information being sent. A new
'struct io_uring_sqe_ext' represents the secondary SQE space for
read/write. In the future, if another extension needs to be added, one
needs to:
1. Add extra fields in the SQE/secondary SQE
2. Introduce an ext_cap flag indicating the additional values that have
been passed

The last 32 bytes of the secondary SQE are used to pass the following
PI-related information:

- flags: integrity check flags, namely
IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of the PI/metadata buffer
- addr: address of the metadata buffer
- seed: seed value for reftag remapping
- app_tag: application-defined 16-bit value

The application sets up an SQE128 ring, prepares the PI information within
the second SQE and sets the ext_cap field to EXT_CAP_PI.  The patch
processes this information to prepare a uio_meta descriptor and passes it
down using kiocb->private.

Metadata exchange is supported only for direct IO.
Vectored read/write operations with metadata are not currently
supported.
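
For illustration, a minimal userspace sketch of the flow above might look as
follows. This is not part of the patch: it assumes liburing built against the
updated uapi header from this series, uses a placeholder device path and
metadata-buffer size, relies on the IO_INTEGRITY_CHK_* flags introduced
earlier in the series, and elides all error handling.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <liburing.h>

/* submit one O_DIRECT read with PI carried in the second half of an SQE128 entry */
static int read_with_pi(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_sqe_ext *ext;
	size_t pi_len = 64;			/* placeholder: metadata bytes for this IO */
	void *data, *pi;
	int fd;

	/* PI travels in the second SQE half, so the ring must be set up with SQE128 */
	io_uring_queue_init(8, &ring, IORING_SETUP_SQE128);

	fd = open(path, O_RDONLY | O_DIRECT);
	posix_memalign(&data, 4096, 4096);
	posix_memalign(&pi, 4096, pi_len);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, data, 4096, 0);
	sqe->ext_cap = EXT_CAP_PI;		/* PI info follows in the second SQE half */

	ext = (struct io_uring_sqe_ext *)(sqe + 1);
	memset(ext, 0, sizeof(*ext));		/* reserved fields must be zero */
	ext->rw_pi.flags = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG;
	ext->rw_pi.app_tag = 0;
	ext->rw_pi.len = pi_len;
	ext->rw_pi.addr = (__u64)(uintptr_t)pi;
	ext->rw_pi.seed = 0;			/* initial reftag for remapping */

	return io_uring_submit(&ring);
}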

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 include/uapi/linux/io_uring.h | 34 ++++++++++++++
 io_uring/io_uring.c           |  8 ++++
 io_uring/rw.c                 | 88 ++++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 | 14 +++++-
 4 files changed, 141 insertions(+), 3 deletions(-)

Comments

Christoph Hellwig Nov. 7, 2024, 5:55 a.m. UTC | #1
> +enum io_uring_sqe_ext_cap_bits {
> +	EXT_CAP_PI_BIT,
> +	/*
> +	 * not a real extended capability; just to make sure that we don't
> +	 * overflow
> +	 */
> +	EXT_CAP_LAST_BIT,
> +};
> +
> +/* extended capability flags */
> +#define EXT_CAP_PI	(1U << EXT_CAP_PI_BIT)

This is getting into nitpicking, but is there a good reason to have that
enum, which is never used as a type and whose values are only defined to
set the bit positions below?  That's a bit confusing to
me.

Also please document the ABI for EXT_CAP_PI, right now this is again
entirely undocumented.

> +/* Second half of SQE128 for IORING_OP_READ/WRITE */
> +struct io_uring_sqe_ext {
> +	__u64	rsvd0[4];
> +	/* if sqe->ext_cap is EXT_CAP_PI, last 32 bytes are for PI */
> +	union {
> +		__u64	rsvd1[4];
> +		struct {
> +			__u16	flags;
> +			__u16	app_tag;
> +			__u32	len;
> +			__u64	addr;
> +			__u64	seed;
> +			__u64	rsvd;
> +		} rw_pi;
> +	};

And this is not what I thought we discussed before.  By having a
union here you imply some kind of "type" again that is switched
on a value, and not flags indicating the presence of potentially
multiple optional and combinable features.  This is what I would
have expected here based on the previous discussion:

struct io_uring_sqe_ext {
	/*
	 * Reserved for (please tell me what), and why it is at the beginning
	 * and not at the end:
	 */
	__u64	rsvd0[4];

	/*
	 * Only valid when EXT_CAP_PI is set:
	 */
	__u16	pi_flags; /* or make this generic flags, dunno? */
	__u16	app_tag;
	__u32	pi_len;
	__u64	pi_addr;
	__u64	pi_seed;

	__u64	rsvd1;
};
Anuj gupta Nov. 7, 2024, 7:26 a.m. UTC | #2
On Thu, Nov 7, 2024 at 11:25 AM Christoph Hellwig <hch@lst.de> wrote:
>
> > +enum io_uring_sqe_ext_cap_bits {
> > +     EXT_CAP_PI_BIT,
> > +     /*
> > +      * not a real extended capability; just to make sure that we don't
> > +      * overflow
> > +      */
> > +     EXT_CAP_LAST_BIT,
> > +};
> > +
> > +/* extended capability flags */
> > +#define EXT_CAP_PI   (1U << EXT_CAP_PI_BIT)
>
> This is getting into nitpicking, but is there a good reason to have that
> enum, which is never used as a type and whose values are only defined to
> set the bit positions below?  That's a bit confusing to
> me.

The enum is added to keep a check on the number of flags that can
be added, and make sure that we don't overflow.
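
For reference, the compile-time guard this enum enables is the check added to
io_uring_init() by this patch (quoted here from the full diff at the end of
the page):

	BUILD_BUG_ON(EXT_CAP_LAST_BIT >
		     8 * sizeof_field(struct io_uring_sqe, ext_cap));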

>
> Also please document the ABI for EXT_CAP_PI, right now this is again
> entirely undocumented.
>

We are planning to document this in man/io_uring_enter.2 in the liburing
repo, right after this series goes in. Or should it go somewhere else?

> > +/* Second half of SQE128 for IORING_OP_READ/WRITE */
> > +struct io_uring_sqe_ext {
> > +     __u64   rsvd0[4];
> > +     /* if sqe->ext_cap is EXT_CAP_PI, last 32 bytes are for PI */
> > +     union {
> > +             __u64   rsvd1[4];
> > +             struct {
> > +                     __u16   flags;
> > +                     __u16   app_tag;
> > +                     __u32   len;
> > +                     __u64   addr;
> > +                     __u64   seed;
> > +                     __u64   rsvd;
> > +             } rw_pi;
> > +     };
>
> And this is not what I thought we discussed before.  By having a
> union here you imply some kind of "type" again that is switched
> on a value, and not flags indicating the presence of potentially
> multiple optional and combinable features.  This is what I would
> have expected here based on the previous discussion:

The attempt here is that if two extended capabilities are known not to
co-exist, then they can be kept in the same place. Since each extended
capability is now a flag, we can check what combinations are valid and
throw an error in case of incompatibility. Do you see this differently?

>
> struct io_uring_sqe_ext {
>         /*
>          * Reserved for (please tell me what), and why it is at the beginning
>          * and not at the end:
>          */
>         __u64   rsvd0[4];

This space is reserved for extended capabilities that might be added down
the line. It was at the end in earlier versions, but it has been moved to
the beginning now to maintain contiguity with the free space (18 bytes)
available in the first SQE, based on previous discussions [1].

[1] https://lore.kernel.org/linux-block/ceb58d97-b2e3-4d36-898d-753ba69476be@samsung.com/

>
>         /*
>          * Only valid when EXT_CAP_PI is set:
>          */
>         __u16   pi_flags; /* or make this generic flags, dunno? */
>         __u16   app_tag;
>         __u32   pi_len;
>         __u64   pi_addr;
>         __u64   pi_seed;
>
>         __u64   rsvd1;
> };
>
Christoph Hellwig Nov. 7, 2024, 7:38 a.m. UTC | #3
On Thu, Nov 07, 2024 at 12:56:03PM +0530, Anuj gupta wrote:
> > > +/* extended capability flags */
> > > +#define EXT_CAP_PI   (1U << EXT_CAP_PI_BIT)
> >
> > This is getting into nitpicking, but is there a good reason to have that
> > enum, which is never used as a type and whose values are only defined to
> > set the bit positions below?  That's a bit confusing to
> > me.
> 
> The enum is added to keep a check on the number of flags that can
> be added, and make sure that we don't overflow.

Umm, it is pretty clear you overflow when you do a

#define EXT_CAP_FOO   (1U << 16)

and assign it to a u16.  Just about every static checker will tell you,
even if you don't instantly see it.  Basic testing will also show
you it won't work.

> > Also please document the ABI for EXT_CAP_PI, right now this is again
> > entirely undocumented.
> >
> 
> We are planning to document this in man/io_uring_enter.2 in the liburing
> repo, right after this series goes in. Or should it go somewhere else?

Well, it needs to go into the code actually explaining what the flag
does.  Throwing an undocumented flag into a uapi is just asking for
trouble.

> The attempt here is that if two extended capabilities are known not to
> co-exist, then they can be kept in the same place. Since each extended
> capability is now a flag, we can check what combinations are valid and
> throw an error in case of incompatibility. Do you see this differently?

You only know they can't co-exist when you add them, and at that point
you can add a union.

> 
> >
> > struct io_uring_sqe_ext {
> >         /*
> >          * Reservered for please tell me what and why it is in the beginning
> >          * Reserved for (please tell me what), and why it is at the beginning
> >          * and not at the end:
> >         __u64   rsvd0[4];
> 
> This space is reserved for extended capabilities that might be added down
> the line. It was at the end in the earlier versions, but it is moved
> to the beginning
> now to maintain contiguity with the free space (18b) available in the first SQE,
> based on previous discussions [1].

I can't follow the argument.  But if you reserve space at the beginning
of the structure instead of the usual end you'd better add a comment
explaining it.
Anuj Gupta Nov. 7, 2024, 10:40 a.m. UTC | #4
I addressed your feedback in the patch below; does this look fine?

From e03fe5fe8ea057d01f5986b8add3769d1095da07 Mon Sep 17 00:00:00 2001
From: Anuj Gupta <anuj20.g@samsung.com>
Date: Wed, 6 Nov 2024 17:48:38 +0530
Subject: [PATCH] io_uring/rw: add support to send metadata along with
 read/write

This patch adds the capability of passing integrity metadata along with
read/write. A new ext_cap (extended capability) field is introduced in the
SQE to indicate the type of extra information being sent. A new
'struct io_uring_sqe_ext' represents the secondary SQE space for
read/write. In the future, if another extension needs to be added, one
needs to:
1. Add extra fields in the SQE/secondary SQE
2. Introduce an ext_cap flag indicating the additional values that have
been passed

The last 32 bytes of the secondary SQE are used to pass the following
PI-related information:

- flags: integrity check flags, namely
IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- pi_len: length of the PI/metadata buffer
- pi_addr: address of the metadata buffer
- pi_seed: seed value for reftag remapping
- pi_app_tag: application-defined 16-bit value

The application sets up an SQE128 ring, prepares the PI information within
the second SQE and sets the ext_cap field to EXT_CAP_PI.  The patch
processes this information to prepare a uio_meta descriptor and passes it
down using kiocb->private.

Metadata exchange is supported only for direct IO.
Vectored read/write operations with metadata are not currently
supported.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 include/uapi/linux/io_uring.h | 28 +++++++++++
 io_uring/io_uring.c           |  5 ++
 io_uring/rw.c                 | 88 ++++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 | 14 +++++-
 4 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 56cf30b49ef5..29f0b742004b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,11 @@ struct io_uring_sqe {
 			__u16	addr_len;
 			__u16	__pad3[1];
 		};
+		struct {
+			/* flags indicating additional information being passed */
+			__u16	ext_cap;
+			__u16	__pad4[1];
+		};
 	};
 	union {
 		struct {
@@ -107,6 +112,29 @@ struct io_uring_sqe {
 	};
 };
 
+/*
+ * If sqe->ext_cap is set to this for IORING_OP_READ/WRITE, then the SQE
+ * contains protection information, and the ring needs to be set up with SQE128
+ */
+#define EXT_CAP_PI	(1U << 0)
+
+/* Second half of SQE128 for IORING_OP_READ/WRITE */
+struct io_uring_sqe_ext {
+	/*
+	 * Reserved space for extended capabilities that may be added down the
+	 * line. Kept at the beginning to maintain contiguity with the free
+	 * space in the first SQE
+	 */
+	__u64	rsvd0[4];
+	/* only valid when EXT_CAP_PI is set */
+	__u16	flags;
+	__u16	pi_app_tag;
+	__u32	pi_len;
+	__u64	pi_addr;
+	__u64	pi_seed;
+	__u64	rsvd1;
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 076171977d5e..5aa16bb60313 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4166,7 +4166,9 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
+	BUILD_BUG_SQE_ELEM(44, __u16,  ext_cap);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
+	BUILD_BUG_SQE_ELEM(46, __u16,  __pad4[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
@@ -4193,6 +4195,9 @@ static int __init io_uring_init(void)
 	/* top 8bits are for internal use */
 	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
 
+	BUILD_BUG_ON(sizeof(struct io_uring_sqe_ext) !=
+		     sizeof(struct io_uring_sqe));
+
 	io_uring_optable_init();
 
 	/*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 768a908ca2a8..4f8b7952d9be 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,64 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+	io->meta.seed = io->meta_state.seed;
+	iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline const void *io_uring_sqe_ext(const struct io_uring_sqe *sqe)
+{
+	return (sqe + 1);
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   struct io_rw *rw, int ddir)
+{
+	const struct io_uring_sqe_ext *sqe_ext;
+	const struct io_issue_def *def;
+	struct io_async_rw *io;
+	int ret;
+
+	if (!(req->ctx->flags & IORING_SETUP_SQE128))
+		return -EINVAL;
+
+	sqe_ext = io_uring_sqe_ext(sqe);
+	if (READ_ONCE(sqe_ext->rsvd0[0]) || READ_ONCE(sqe_ext->rsvd0[1])
+	    || READ_ONCE(sqe_ext->rsvd0[2]) || READ_ONCE(sqe_ext->rsvd0[3]))
+		return -EINVAL;
+	if (READ_ONCE(sqe_ext->rsvd1))
+		return -EINVAL;
+
+	def = &io_issue_defs[req->opcode];
+	if (def->vectored)
+		return -EOPNOTSUPP;
+
+	io = req->async_data;
+	io->meta.flags = READ_ONCE(sqe_ext->flags);
+	io->meta.app_tag = READ_ONCE(sqe_ext->pi_app_tag);
+	io->meta.seed = READ_ONCE(sqe_ext->pi_seed);
+	ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe_ext->pi_addr)),
+			  READ_ONCE(sqe_ext->pi_len), &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u16 ext_cap;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +332,23 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	ext_cap = READ_ONCE(sqe->ext_cap);
+	if (ext_cap) {
+		if (READ_ONCE(sqe->__pad4[0]) || !(ext_cap & EXT_CAP_PI))
+			return -EINVAL;
+		ret = io_prep_rw_pi(req, sqe, rw, ddir);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -410,7 +475,10 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -795,7 +863,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -829,6 +897,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -903,6 +983,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 
 	do {
 		/*
@@ -1126,6 +1208,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		if (kiocb->ki_flags & IOCB_HAS_METADATA)
+			io_meta_restore(io);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
Christoph Hellwig Nov. 7, 2024, 11:44 a.m. UTC | #5
> +/*
> + * If sqe->ext_cap is set to this for IORING_OP_READ/WRITE, then the SQE
> + * contains protection information, and the ring needs to be set up with SQE128
> + */
> +#define EXT_CAP_PI	(1U << 0)
> +
> +/* Second half of SQE128 for IORING_OP_READ/WRITE */
> +struct io_uring_sqe_ext {
> +	/*
> +	 * Reserved space for extended capabilities that are added down the
> +	 * Reserved space for extended capabilities that may be added down the
> +	 * line. Kept at the beginning to maintain contiguity with the free
> +	 * space in the first SQE
> +	__u64	rsvd0[4];

Thanks for documenting the design decision.  But I still don't understand
it.  Due to the layout it will be a bit hard to have fields spreading
from the "normal" SQE into the extended area anyway.  Note that this
is not a rejection of the approach, but I don't understand the argument
for it.

> +	/* only valid when EXT_CAP_PI is set */
> +	__u16	flags;
> +	__u16	pi_app_tag;
> +	__u32	pi_len;
> +	__u64	pi_addr;
> +	__u64	pi_seed;
> +	__u64	rsvd1;

... but either way it would probably make sense to keep the reserved
areas together instead of spreading them out.

Otherwise this looks good to me.
Pavel Begunkov Nov. 12, 2024, 12:54 a.m. UTC | #6
On 11/7/24 07:26, Anuj gupta wrote:
> On Thu, Nov 7, 2024 at 11:25 AM Christoph Hellwig <hch@lst.de> wrote:
...
>>
>> struct io_uring_sqe_ext {
>>          /*
>>           * Reservered for please tell me what and why it is in the beginning
>>           * and not the end:
>>           */
>>          __u64   rsvd0[4];
> 
> This space is reserved for extended capabilities that might be added down
> the line. It was at the end in the earlier versions, but it is moved
> to the beginning
> now to maintain contiguity with the free space (18b) available in the first SQE,
> based on previous discussions [1].
> 
> [1] https://lore.kernel.org/linux-block/ceb58d97-b2e3-4d36-898d-753ba69476be@samsung.com/

I don't believe it helps much of anything, and placing a structure on the
border between SQEs also feels a bit odd.
Anuj Gupta Nov. 12, 2024, 6:51 a.m. UTC | #7
On Tue, Nov 12, 2024 at 12:54:23AM +0000, Pavel Begunkov wrote:
> On 11/7/24 07:26, Anuj gupta wrote:
> > On Thu, Nov 7, 2024 at 11:25 AM Christoph Hellwig <hch@lst.de> wrote:
> ...
> > > 
> > > struct io_uring_sqe_ext {
> > >          /*
> > >           * Reservered for please tell me what and why it is in the beginning
> > >           * and not the end:
> > >           */
> > >          __u64   rsvd0[4];
> > 
> > This space is reserved for extended capabilities that might be added down
> > the line. It was at the end in the earlier versions, but it is moved
> > to the beginning
> > now to maintain contiguity with the free space (18b) available in the first SQE,
> > based on previous discussions [1].
> > 
> > [1] https://lore.kernel.org/linux-block/ceb58d97-b2e3-4d36-898d-753ba69476be@samsung.com/
> 
> I don't believe it helps much of anything, and placing a structure on the
> border between SQEs also feels a bit odd.

In the next version, I can move it to the beginning of the second SQE.

ext_cap also keeps it open to pass the same or different attributes via a
user pointer.
Is that fine, or do you want anything else to be changed?

> 
> -- 
> Pavel Begunkov
>

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 56cf30b49ef5..449e7627b1b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,11 @@  struct io_uring_sqe {
 			__u16	addr_len;
 			__u16	__pad3[1];
 		};
+		struct {
+			/* flags indicating additional information being passed */
+			__u16	ext_cap;
+			__u16	__pad4[1];
+		};
 	};
 	union {
 		struct {
@@ -107,6 +112,35 @@  struct io_uring_sqe {
 	};
 };
 
+enum io_uring_sqe_ext_cap_bits {
+	EXT_CAP_PI_BIT,
+	/*
+	 * not a real extended capability; just to make sure that we don't
+	 * overflow
+	 */
+	EXT_CAP_LAST_BIT,
+};
+
+/* extended capability flags */
+#define EXT_CAP_PI	(1U << EXT_CAP_PI_BIT)
+
+/* Second half of SQE128 for IORING_OP_READ/WRITE */
+struct io_uring_sqe_ext {
+	__u64	rsvd0[4];
+	/* if sqe->ext_cap is EXT_CAP_PI, last 32 bytes are for PI */
+	union {
+		__u64	rsvd1[4];
+		struct {
+			__u16	flags;
+			__u16	app_tag;
+			__u32	len;
+			__u64	addr;
+			__u64	seed;
+			__u64	rsvd;
+		} rw_pi;
+	};
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b590e50f09e7..6e582fe93bc4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4165,7 +4165,9 @@  static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
+	BUILD_BUG_SQE_ELEM(44, __u16,  ext_cap);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
+	BUILD_BUG_SQE_ELEM(46, __u16,  __pad4[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
@@ -4192,6 +4194,12 @@  static int __init io_uring_init(void)
 	/* top 8bits are for internal use */
 	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
 
+	BUILD_BUG_ON(sizeof(struct io_uring_sqe_ext) !=
+		     sizeof(struct io_uring_sqe));
+
+	BUILD_BUG_ON(EXT_CAP_LAST_BIT >
+		     8 * sizeof_field(struct io_uring_sqe, ext_cap));
+
 	io_uring_optable_init();
 
 	/*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 768a908ca2a8..e60bf0ed4c4f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,64 @@  static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+	io->meta.seed = io->meta_state.seed;
+	iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline const void *io_uring_sqe_ext(const struct io_uring_sqe *sqe)
+{
+	return (sqe + 1);
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   struct io_rw *rw, int ddir)
+{
+	const struct io_uring_sqe_ext *sqe_ext;
+	const struct io_issue_def *def;
+	struct io_async_rw *io;
+	int ret;
+
+	if (!(req->ctx->flags & IORING_SETUP_SQE128))
+		return -EINVAL;
+
+	sqe_ext = io_uring_sqe_ext(sqe);
+	if (READ_ONCE(sqe_ext->rsvd0[0]) || READ_ONCE(sqe_ext->rsvd0[1])
+	    || READ_ONCE(sqe_ext->rsvd0[2]) || READ_ONCE(sqe_ext->rsvd0[3]))
+		return -EINVAL;
+	if (READ_ONCE(sqe_ext->rw_pi.rsvd))
+		return -EINVAL;
+
+	def = &io_issue_defs[req->opcode];
+	if (def->vectored)
+		return -EOPNOTSUPP;
+
+	io = req->async_data;
+	io->meta.flags = READ_ONCE(sqe_ext->rw_pi.flags);
+	io->meta.app_tag = READ_ONCE(sqe_ext->rw_pi.app_tag);
+	io->meta.seed = READ_ONCE(sqe_ext->rw_pi.seed);
+	ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe_ext->rw_pi.addr)),
+			  READ_ONCE(sqe_ext->rw_pi.len), &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u16 ext_cap;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +332,23 @@  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	ext_cap = READ_ONCE(sqe->ext_cap);
+	if (ext_cap) {
+		if (READ_ONCE(sqe->__pad4[0]) || !(ext_cap & EXT_CAP_PI))
+			return -EINVAL;
+		ret = io_prep_rw_pi(req, sqe, rw, ddir);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -410,7 +475,10 @@  static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -795,7 +863,7 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -829,6 +897,18 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -903,6 +983,8 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 
 	do {
 		/*
@@ -1126,6 +1208,8 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		if (kiocb->ki_flags & IOCB_HAS_METADATA)
+			io_meta_restore(io);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@ 
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@  struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);