diff mbox series

[1/2] iov: introduce ITER_BVEC_FLAG_FIXED

Message ID de27dbca08f8005a303e5efd81612c9a5cdcf196.1607477897.git.asml.silence@gmail.com (mailing list archive)
State New, archived
Headers show
Series nocopy bvec for direct IO | expand

Commit Message

Pavel Begunkov Dec. 9, 2020, 2:19 a.m. UTC
Add ITER_BVEC_FLAG_FIXED iov iter flag, which will allow us to reuse
passed in bvec instead of copying it. In particular it means that
iter->bvec won't be freed and page references are taken remain so
until callees don't need them, including asynchronous execution.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c       |  1 +
 include/linux/uio.h | 14 +++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

Comments

Christoph Hellwig Dec. 9, 2020, 8:36 a.m. UTC | #1
Ok, seems like the patches made it to the lists, while oyu only
send the cover letter to my address which is very strange.

> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 72d88566694e..af626eb970cf 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -18,6 +18,8 @@ struct kvec {
>  };
>  
>  enum iter_type {
> +	ITER_BVEC_FLAG_FIXED = 2,
> +
>  	/* iter types */
>  	ITER_IOVEC = 4,
>  	ITER_KVEC = 8,

This is making the iter type even more of a mess than it already is.
I think we at least need placeholders for 0/1 here and an explicit
flags namespace, preferably after the types.

Then again I'd much prefer if we didn't even add the flag or at best
just add it for a short-term transition and move everyone over to the
new scheme.  Otherwise the amount of different interfaces and supporting
code keeps exploding.

> @@ -29,8 +31,9 @@ enum iter_type {
>  struct iov_iter {
>  	/*
>  	 * Bit 0 is the read/write bit, set if we're writing.
> -	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
> -	 * the caller isn't expecting to drop a page reference when done.
> +	 * Bit 1 is the BVEC_FLAG_FIXED bit, set if type is a bvec and the
> +	 * caller ensures that page references and memory baking bvec won't
> +	 * go away until callees finish with them.
>  	 */
>  	unsigned int type;

I think the comment needs to move to the enum.
Christoph Hellwig Dec. 9, 2020, 9:06 a.m. UTC | #2
On Wed, Dec 09, 2020 at 08:36:45AM +0000, Christoph Hellwig wrote:
> This is making the iter type even more of a mess than it already is.
> I think we at least need placeholders for 0/1 here and an explicit
> flags namespace, preferably after the types.
> 
> Then again I'd much prefer if we didn't even add the flag or at best
> just add it for a short-term transition and move everyone over to the
> new scheme.  Otherwise the amount of different interfaces and supporting
> code keeps exploding.

Note that the only other callers that use iov_iter_bvec and asynchronous
read/write are loop, target and nvme-target, so this should actually
be pretty simple.  Out of these only target needs something like this
trivial change to keep the bvec available over the duration of the I/O,
the other two should be fine already:

---
From 581a8eabbb1759e3dcfee4b1d2e419f814a8cb80 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 Dec 2020 10:05:04 +0100
Subject: target/file: allocate the bvec array as part of struct target_core_file_cmd

This saves one memory allocation, and ensures the bvecs aren't freed
before the AIO completion.  This will allow the lower level code to be
optimized so that it can avoid allocating another bvec array.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/target/target_core_file.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 7143d03f0e027e..ed0c39a1f7c649 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -241,6 +241,7 @@ struct target_core_file_cmd {
 	unsigned long	len;
 	struct se_cmd	*cmd;
 	struct kiocb	iocb;
+	struct bio_vec	bvecs[];
 };
 
 static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
@@ -268,29 +269,22 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	struct target_core_file_cmd *aio_cmd;
 	struct iov_iter iter = {};
 	struct scatterlist *sg;
-	struct bio_vec *bvec;
 	ssize_t len = 0;
 	int ret = 0, i;
 
-	aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), GFP_KERNEL);
+	aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL);
 	if (!aio_cmd)
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
-	bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL);
-	if (!bvec) {
-		kfree(aio_cmd);
-		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
-	}
-
 	for_each_sg(sgl, sg, sgl_nents, i) {
-		bvec[i].bv_page = sg_page(sg);
-		bvec[i].bv_len = sg->length;
-		bvec[i].bv_offset = sg->offset;
+		aio_cmd->bvecs[i].bv_page = sg_page(sg);
+		aio_cmd->bvecs[i].bv_len = sg->length;
+		aio_cmd->bvecs[i].bv_offset = sg->offset;
 
 		len += sg->length;
 	}
 
-	iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len);
+	iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len);
 
 	aio_cmd->cmd = cmd;
 	aio_cmd->len = len;
@@ -307,8 +301,6 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	else
 		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
 
-	kfree(bvec);
-
 	if (ret != -EIOCBQUEUED)
 		cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0);
Pavel Begunkov Dec. 9, 2020, 11:54 a.m. UTC | #3
On 09/12/2020 09:06, Christoph Hellwig wrote:
> On Wed, Dec 09, 2020 at 08:36:45AM +0000, Christoph Hellwig wrote:
>> This is making the iter type even more of a mess than it already is.
>> I think we at least need placeholders for 0/1 here and an explicit
>> flags namespace, preferably after the types.
>>
>> Then again I'd much prefer if we didn't even add the flag or at best
>> just add it for a short-term transition and move everyone over to the
>> new scheme.  Otherwise the amount of different interfaces and supporting
>> code keeps exploding.

At least the flag can be ignored. Anyway sounds good to me. I'll take
your patch below to the series, thanks!

> 
> Note that the only other callers that use iov_iter_bvec and asynchronous
> read/write are loop, target and nvme-target, so this should actually
> be pretty simple.  Out of these only target needs something like this
> trivial change to keep the bvec available over the duration of the I/O,
> the other two should be fine already:
> 
> ---
> From 581a8eabbb1759e3dcfee4b1d2e419f814a8cb80 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Wed, 9 Dec 2020 10:05:04 +0100
> Subject: target/file: allocate the bvec array as part of struct target_core_file_cmd
> 
> This saves one memory allocation, and ensures the bvecs aren't freed
> before the AIO completion.  This will allow the lower level code to be
> optimized so that it can avoid allocating another bvec array.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/target/target_core_file.c | 20 ++++++--------------
>  1 file changed, 6 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
> index 7143d03f0e027e..ed0c39a1f7c649 100644
> --- a/drivers/target/target_core_file.c
> +++ b/drivers/target/target_core_file.c
> @@ -241,6 +241,7 @@ struct target_core_file_cmd {
>  	unsigned long	len;
>  	struct se_cmd	*cmd;
>  	struct kiocb	iocb;
> +	struct bio_vec	bvecs[];
>  };
>  
>  static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
> @@ -268,29 +269,22 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
>  	struct target_core_file_cmd *aio_cmd;
>  	struct iov_iter iter = {};
>  	struct scatterlist *sg;
> -	struct bio_vec *bvec;
>  	ssize_t len = 0;
>  	int ret = 0, i;
>  
> -	aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), GFP_KERNEL);
> +	aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL);
>  	if (!aio_cmd)
>  		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
>  
> -	bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL);
> -	if (!bvec) {
> -		kfree(aio_cmd);
> -		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
> -	}
> -
>  	for_each_sg(sgl, sg, sgl_nents, i) {
> -		bvec[i].bv_page = sg_page(sg);
> -		bvec[i].bv_len = sg->length;
> -		bvec[i].bv_offset = sg->offset;
> +		aio_cmd->bvecs[i].bv_page = sg_page(sg);
> +		aio_cmd->bvecs[i].bv_len = sg->length;
> +		aio_cmd->bvecs[i].bv_offset = sg->offset;
>  
>  		len += sg->length;
>  	}
>  
> -	iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len);
> +	iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len);
>  
>  	aio_cmd->cmd = cmd;
>  	aio_cmd->len = len;
> @@ -307,8 +301,6 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
>  	else
>  		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
>  
> -	kfree(bvec);
> -
>  	if (ret != -EIOCBQUEUED)
>  		cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0);
>  
>
Al Viro Dec. 9, 2020, 1:07 p.m. UTC | #4
On Wed, Dec 09, 2020 at 08:36:45AM +0000, Christoph Hellwig wrote:
> 
> This is making the iter type even more of a mess than it already is.
> I think we at least need placeholders for 0/1 here and an explicit
> flags namespace, preferably after the types.
> 
> Then again I'd much prefer if we didn't even add the flag or at best
> just add it for a short-term transition and move everyone over to the
> new scheme.  Otherwise the amount of different interfaces and supporting
> code keeps exploding.

Yes.  The only problem I see is how to describe the rules - "bdev-backed
iterators need the bvec array to stay allocated until IO completes"?
And one way or another, that needs to be documented - D/f/porting with
"mandatory" for priority.
Pavel Begunkov Dec. 9, 2020, 1:37 p.m. UTC | #5
On 09/12/2020 13:07, Al Viro wrote:
> On Wed, Dec 09, 2020 at 08:36:45AM +0000, Christoph Hellwig wrote:
>>
>> This is making the iter type even more of a mess than it already is.
>> I think we at least need placeholders for 0/1 here and an explicit
>> flags namespace, preferably after the types.
>>
>> Then again I'd much prefer if we didn't even add the flag or at best
>> just add it for a short-term transition and move everyone over to the
>> new scheme.  Otherwise the amount of different interfaces and supporting
>> code keeps exploding.
> 
> Yes.  The only problem I see is how to describe the rules - "bdev-backed
> iterators need the bvec array to stay allocated until IO completes"?
> And one way or another, that needs to be documented - D/f/porting with
> "mandatory" for priority.

Yeah, I had troubles to put comments around, and it's still open.

For current cases it can be bound to kiocb, e.g. "if an bvec iter passed
"together" with kiocb then the vector should stay intact up to 
->ki_complete()". But that "together" is rather full of holes.
Christoph Hellwig Dec. 9, 2020, 5:55 p.m. UTC | #6
On Wed, Dec 09, 2020 at 01:37:05PM +0000, Pavel Begunkov wrote:
> Yeah, I had troubles to put comments around, and it's still open.
> 
> For current cases it can be bound to kiocb, e.g. "if an bvec iter passed
> "together" with kiocb then the vector should stay intact up to 
> ->ki_complete()". But that "together" is rather full of holes.

What about: "For bvec based iters the bvec must not be freed until the
I/O has completed.  For asynchronous I/O that means it must be freed
no earlier than from ->ki_complete."
Matthew Wilcox Dec. 9, 2020, 6:24 p.m. UTC | #7
On Wed, Dec 09, 2020 at 05:55:53PM +0000, Christoph Hellwig wrote:
> On Wed, Dec 09, 2020 at 01:37:05PM +0000, Pavel Begunkov wrote:
> > Yeah, I had troubles to put comments around, and it's still open.
> > 
> > For current cases it can be bound to kiocb, e.g. "if an bvec iter passed
> > "together" with kiocb then the vector should stay intact up to 
> > ->ki_complete()". But that "together" is rather full of holes.
> 
> What about: "For bvec based iters the bvec must not be freed until the
> I/O has completed.  For asynchronous I/O that means it must be freed
> no earlier than from ->ki_complete."

Perhaps for the second sentence "If the I/O is completed asynchronously,
the bvec must not be freed before ->ki_complete() has been called"?
Pavel Begunkov Dec. 13, 2020, 10:09 p.m. UTC | #8
On 09/12/2020 18:24, Matthew Wilcox wrote:
> On Wed, Dec 09, 2020 at 05:55:53PM +0000, Christoph Hellwig wrote:
>> On Wed, Dec 09, 2020 at 01:37:05PM +0000, Pavel Begunkov wrote:
>>> Yeah, I had troubles to put comments around, and it's still open.
>>>
>>> For current cases it can be bound to kiocb, e.g. "if an bvec iter passed
>>> "together" with kiocb then the vector should stay intact up to 
>>> ->ki_complete()". But that "together" is rather full of holes.
>>
>> What about: "For bvec based iters the bvec must not be freed until the
>> I/O has completed.  For asynchronous I/O that means it must be freed
>> no earlier than from ->ki_complete."
> 
> Perhaps for the second sentence "If the I/O is completed asynchronously,
> the bvec must not be freed before ->ki_complete() has been called"?

Sounds good, I'll use it. Thanks!
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c536462920a3..9ff2805d0075 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2920,6 +2920,7 @@  static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
 		}
 	}
 
+	iter->type |= ITER_BVEC_FLAG_FIXED;
 	return len;
 }
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 72d88566694e..af626eb970cf 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -18,6 +18,8 @@  struct kvec {
 };
 
 enum iter_type {
+	ITER_BVEC_FLAG_FIXED = 2,
+
 	/* iter types */
 	ITER_IOVEC = 4,
 	ITER_KVEC = 8,
@@ -29,8 +31,9 @@  enum iter_type {
 struct iov_iter {
 	/*
 	 * Bit 0 is the read/write bit, set if we're writing.
-	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
-	 * the caller isn't expecting to drop a page reference when done.
+	 * Bit 1 is the BVEC_FLAG_FIXED bit, set if type is a bvec and the
+	 * caller ensures that page references and memory baking bvec won't
+	 * go away until callees finish with them.
 	 */
 	unsigned int type;
 	size_t iov_offset;
@@ -52,7 +55,7 @@  struct iov_iter {
 
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
-	return i->type & ~(READ | WRITE);
+	return i->type & ~(READ | WRITE | ITER_BVEC_FLAG_FIXED);
 }
 
 static inline bool iter_is_iovec(const struct iov_iter *i)
@@ -85,6 +88,11 @@  static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
 
+static inline unsigned char iov_iter_bvec_fixed(const struct iov_iter *i)
+{
+	return i->type & ITER_BVEC_FLAG_FIXED;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *