diff mbox series

[v13,4/9] fs, block: copy_file_range for def_blk_ops for direct block device

Message ID 20230627183629.26571-5-nj.shetty@samsung.com (mailing list archive)
State New, archived
Headers show
Series Implement copy offload support | expand

Commit Message

Nitesh Shetty June 27, 2023, 6:36 p.m. UTC
For direct block device opened with O_DIRECT, use copy_file_range to
issue device copy offload, and fall back to generic_copy_file_range in case
device copy offload capability is absent.
Modify checks to allow bdevs to use copy_file_range.

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
 block/blk-lib.c        | 26 ++++++++++++++++++++++++++
 block/fops.c           | 20 ++++++++++++++++++++
 fs/read_write.c        |  7 +++++--
 include/linux/blkdev.h |  4 ++++
 4 files changed, 55 insertions(+), 2 deletions(-)

Comments

Damien Le Moal June 28, 2023, 6:51 a.m. UTC | #1
On 6/28/23 03:36, Nitesh Shetty wrote:
> For direct block device opened with O_DIRECT, use copy_file_range to
> issue device copy offload, and fallback to generic_copy_file_range incase
> device copy offload capability is absent.

...if the device does not support copy offload or the device files are not open
with O_DIRECT.

No ?

> Modify checks to allow bdevs to use copy_file_range.
> 
> Suggested-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> ---
>  block/blk-lib.c        | 26 ++++++++++++++++++++++++++
>  block/fops.c           | 20 ++++++++++++++++++++
>  fs/read_write.c        |  7 +++++--
>  include/linux/blkdev.h |  4 ++++
>  4 files changed, 55 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 09e0d5d51d03..7d8e09a99254 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -473,6 +473,32 @@ ssize_t blkdev_copy_offload(
>  }
>  EXPORT_SYMBOL_GPL(blkdev_copy_offload);
>  
> +/* Copy source offset from source block device to destination block
> + * device. Returns the length of bytes copied.
> + */

Multi-line comment style: start with a "/*" line please.

> +ssize_t blkdev_copy_offload_failfast(

What is the "failfast" in the name for ?

> +		struct block_device *bdev_in, loff_t pos_in,
> +		struct block_device *bdev_out, loff_t pos_out,
> +		size_t len, gfp_t gfp_mask)
> +{
> +	struct request_queue *in_q = bdev_get_queue(bdev_in);
> +	struct request_queue *out_q = bdev_get_queue(bdev_out);
> +	ssize_t ret = 0;

You do not need this initialization.

> +
> +	if (blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len))
> +		return 0;
> +
> +	if (blk_queue_copy(in_q) && blk_queue_copy(out_q)) {

Given that I think we do not allow copies between different devices, in_q and
out_q should always be the same, no ?

> +		ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out,
> +				len, NULL, NULL, gfp_mask);

Same here. Why pass 2 bdevs if we only allow copies within the same device ?

> +		if (ret < 0)
> +			return 0;
> +	}
> +
> +	return ret;

return 0;

> +}
> +EXPORT_SYMBOL_GPL(blkdev_copy_offload_failfast);
> +
>  static int __blkdev_issue_write_zeroes(struct block_device *bdev,
>  		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>  		struct bio **biop, unsigned flags)
> diff --git a/block/fops.c b/block/fops.c
> index a286bf3325c5..a1576304f269 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -621,6 +621,25 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	return ret;
>  }
>  
> +static ssize_t blkdev_copy_file_range(struct file *file_in, loff_t pos_in,
> +				struct file *file_out, loff_t pos_out,
> +				size_t len, unsigned int flags)
> +{
> +	struct block_device *in_bdev = I_BDEV(bdev_file_inode(file_in));
> +	struct block_device *out_bdev = I_BDEV(bdev_file_inode(file_out));
> +	ssize_t comp_len = 0;
> +
> +	if ((file_in->f_iocb_flags & IOCB_DIRECT) &&
> +		(file_out->f_iocb_flags & IOCB_DIRECT))
> +		comp_len = blkdev_copy_offload_failfast(in_bdev, pos_in,
> +				out_bdev, pos_out, len, GFP_KERNEL);
> +	if (comp_len != len)
> +		comp_len = generic_copy_file_range(file_in, pos_in + comp_len,
> +			file_out, pos_out + comp_len, len - comp_len, flags);
> +
> +	return comp_len;
> +}
> +
>  #define	BLKDEV_FALLOC_FL_SUPPORTED					\
>  		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
>  		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
> @@ -714,6 +733,7 @@ const struct file_operations def_blk_fops = {
>  	.splice_read	= filemap_splice_read,
>  	.splice_write	= iter_file_splice_write,
>  	.fallocate	= blkdev_fallocate,
> +	.copy_file_range = blkdev_copy_file_range,
>  };
>  
>  static __init int blkdev_init(void)
> diff --git a/fs/read_write.c b/fs/read_write.c
> index b07de77ef126..d27148a2543f 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1447,7 +1447,8 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>  		return -EOVERFLOW;
>  
>  	/* Shorten the copy to EOF */
> -	size_in = i_size_read(inode_in);
> +	size_in = i_size_read(file_in->f_mapping->host);
> +
>  	if (pos_in >= size_in)
>  		count = 0;
>  	else
> @@ -1708,7 +1709,9 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
>  	/* Don't copy dirs, pipes, sockets... */
>  	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>  		return -EISDIR;
> -	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> +
> +	if ((!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) &&
> +		(!S_ISBLK(inode_in->i_mode) || !S_ISBLK(inode_out->i_mode)))
>  		return -EINVAL;
>  
>  	if (!(file_in->f_mode & FMODE_READ) ||
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c176bf6173c5..850168cad080 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1047,6 +1047,10 @@ ssize_t blkdev_copy_offload(
>  		struct block_device *bdev_in, loff_t pos_in,
>  		struct block_device *bdev_out, loff_t pos_out,
>  		size_t len, cio_iodone_t end_io, void *private, gfp_t gfp_mask);
> +ssize_t blkdev_copy_offload_failfast(
> +		struct block_device *bdev_in, loff_t pos_in,
> +		struct block_device *bdev_out, loff_t pos_out,
> +		size_t len, gfp_t gfp_mask);
>  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  		gfp_t gfp_mask);
>  void bio_map_kern_endio(struct bio *bio);
Nitesh Shetty June 28, 2023, 4:39 p.m. UTC | #2
On 23/06/28 03:51PM, Damien Le Moal wrote:
>On 6/28/23 03:36, Nitesh Shetty wrote:
>> For direct block device opened with O_DIRECT, use copy_file_range to
>> issue device copy offload, and fallback to generic_copy_file_range incase
>> device copy offload capability is absent.
>
>...if the device does not support copy offload or the device files are not open
>with O_DIRECT.
>
>No ?
>
Yes, you're right. We will fall back to generic_copy_file_range in either of
these cases.

>> Modify checks to allow bdevs to use copy_file_range.
>>
>> Suggested-by: Ming Lei <ming.lei@redhat.com>
>> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> ---
>>  block/blk-lib.c        | 26 ++++++++++++++++++++++++++
>>  block/fops.c           | 20 ++++++++++++++++++++
>>  fs/read_write.c        |  7 +++++--
>>  include/linux/blkdev.h |  4 ++++
>>  4 files changed, 55 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index 09e0d5d51d03..7d8e09a99254 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -473,6 +473,32 @@ ssize_t blkdev_copy_offload(
>>  }
>>  EXPORT_SYMBOL_GPL(blkdev_copy_offload);
>>
>> +/* Copy source offset from source block device to destination block
>> + * device. Returns the length of bytes copied.
>> + */
>
>Multi-line comment style: start with a "/*" line please.
>
acked

>> +ssize_t blkdev_copy_offload_failfast(
>
>What is the "failfast" in the name for ?

We don't want failed copy offload IOs to fall back to block layer copy emulation.
We wanted an API that returns an error if offload fails.

>
>> +		struct block_device *bdev_in, loff_t pos_in,
>> +		struct block_device *bdev_out, loff_t pos_out,
>> +		size_t len, gfp_t gfp_mask)
>> +{
>> +	struct request_queue *in_q = bdev_get_queue(bdev_in);
>> +	struct request_queue *out_q = bdev_get_queue(bdev_out);
>> +	ssize_t ret = 0;
>
>You do not need this initialization.
>

We need this initialization, because __blkdev_copy_offload returns the number of
bytes copied or an error value.
So we cannot return 0 in case of success/partial completion.
blkdev_copy_offload_failfast is expected to return the number of bytes copied.

>> +
>> +	if (blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len))
>> +		return 0;
>> +
>> +	if (blk_queue_copy(in_q) && blk_queue_copy(out_q)) {
>
>Given that I think we do not allow copies between different devices, in_q and
>out_q should always be the same, no ?

acked, will update this.

>
>> +		ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out,
>> +				len, NULL, NULL, gfp_mask);
>
>Same here. Why pass 2 bdevs if we only allow copies within the same device ?
>

acked, will update function arguments to take single bdev.

>> +		if (ret < 0)
>> +			return 0;
>> +	}
>> +
>> +	return ret;
>
>return 0;
>

Nack, explained above.

Thank you,
Nitesh Shetty
--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Christoph Hellwig July 20, 2023, 7:57 a.m. UTC | #3
> +/* Copy source offset from source block device to destination block
> + * device. Returns the length of bytes copied.
> + */
> +ssize_t blkdev_copy_offload_failfast(
> +		struct block_device *bdev_in, loff_t pos_in,
> +		struct block_device *bdev_out, loff_t pos_out,
> +		size_t len, gfp_t gfp_mask)

This is an odd and very misnamed interface.

Either we have a blkdev_copy() interface that automatically falls back
to emulation (maybe with an opt-out), or we have separate
blkdev_copy_offload/blkdev_copy_emulated interfaces and let the caller
decide.  But none of that really is "failfast".

Also this needs to go into the helpers patch and not a patch that is
supposed to just wire copying up for block device node.

> index b07de77ef126..d27148a2543f 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1447,7 +1447,8 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>  		return -EOVERFLOW;
>  
>  	/* Shorten the copy to EOF */
> -	size_in = i_size_read(inode_in);
> +	size_in = i_size_read(file_in->f_mapping->host);

generic_copy_file_checks needs to be fixed to use ->mapping->host both
for inode_in and inode_out at the top of the file instead of this
band aid.  And that needs to be a separate patch with a Fixes tag.

> @@ -1708,7 +1709,9 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
>  	/* Don't copy dirs, pipes, sockets... */
>  	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>  		return -EISDIR;
> -	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> +
> +	if ((!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) &&
> +		(!S_ISBLK(inode_in->i_mode) || !S_ISBLK(inode_out->i_mode)))

This is using weird indentation, and might also not be doing
exactly what we want.  I think the better thing to do here is to:

 1) check for the acceptable types only on the in inode
 2) have a check that the mode matches for the in and out inodes

And please do this as a separate prep patch instead of hiding it here.

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Nitesh Shetty July 24, 2023, 5:46 a.m. UTC | #4
On 23/07/20 09:57AM, Christoph Hellwig wrote:
>> +/* Copy source offset from source block device to destination block
>> + * device. Returns the length of bytes copied.
>> + */
>> +ssize_t blkdev_copy_offload_failfast(
>> +		struct block_device *bdev_in, loff_t pos_in,
>> +		struct block_device *bdev_out, loff_t pos_out,
>> +		size_t len, gfp_t gfp_mask)
>
>This is an odd and very misnamed interface.
>
>Either we have a klkdev_copy() interface that automatically falls back
>to a fallback (maybe with an opt-out), or we have separate
>blkdev_copy_offload/blkdev_copy_emulated interface and let the caller
>decide.  But none of that really is "failfast".
>
>Also this needs to go into the helpers patch and not a patch that is
>supposed to just wire copying up for block device node.
>
Acked.

>> index b07de77ef126..d27148a2543f 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1447,7 +1447,8 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
>>  		return -EOVERFLOW;
>>
>>  	/* Shorten the copy to EOF */
>> -	size_in = i_size_read(inode_in);
>> +	size_in = i_size_read(file_in->f_mapping->host);
>
>generic_copy_file_checks needs to be fixed to use ->mapping->host both
>or inode_in and inode_out at the top of the file instead of this
>band aid.  And that needs to be a separate patch with a Fixes tag.
>
Addressed below.

>> @@ -1708,7 +1709,9 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
>>  	/* Don't copy dirs, pipes, sockets... */
>>  	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>>  		return -EISDIR;
>> -	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> +
>> +	if ((!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) &&
>> +		(!S_ISBLK(inode_in->i_mode) || !S_ISBLK(inode_out->i_mode)))
>
>This is using weird indentation, and might also not be doing
>exactly what we want.  I think the better thing to do here is to:
>
> 1) check for the accetable types only on the in inode
> 2) have a check that the mode matches for the in and out inodes
>
>And please do this as a separate prep patch instead of hiding it here.
>
Agreed. We will send a separate patch, that enables copy_file_range on
block devices.

Thank you,
Nitesh Shetty
--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
diff mbox series

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 09e0d5d51d03..7d8e09a99254 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -473,6 +473,32 @@  ssize_t blkdev_copy_offload(
 }
 EXPORT_SYMBOL_GPL(blkdev_copy_offload);
 
+/* Copy source offset from source block device to destination block
+ * device. Returns the length of bytes copied.
+ */
+ssize_t blkdev_copy_offload_failfast(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, gfp_t gfp_mask)
+{
+	struct request_queue *in_q = bdev_get_queue(bdev_in);
+	struct request_queue *out_q = bdev_get_queue(bdev_out);
+	ssize_t ret = 0;
+
+	if (blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len))
+		return 0;
+
+	if (blk_queue_copy(in_q) && blk_queue_copy(out_q)) {
+		ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out,
+				len, NULL, NULL, gfp_mask);
+		if (ret < 0)
+			return 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload_failfast);
+
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned flags)
diff --git a/block/fops.c b/block/fops.c
index a286bf3325c5..a1576304f269 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -621,6 +621,25 @@  static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
+static ssize_t blkdev_copy_file_range(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out,
+				size_t len, unsigned int flags)
+{
+	struct block_device *in_bdev = I_BDEV(bdev_file_inode(file_in));
+	struct block_device *out_bdev = I_BDEV(bdev_file_inode(file_out));
+	ssize_t comp_len = 0;
+
+	if ((file_in->f_iocb_flags & IOCB_DIRECT) &&
+		(file_out->f_iocb_flags & IOCB_DIRECT))
+		comp_len = blkdev_copy_offload_failfast(in_bdev, pos_in,
+				out_bdev, pos_out, len, GFP_KERNEL);
+	if (comp_len != len)
+		comp_len = generic_copy_file_range(file_in, pos_in + comp_len,
+			file_out, pos_out + comp_len, len - comp_len, flags);
+
+	return comp_len;
+}
+
 #define	BLKDEV_FALLOC_FL_SUPPORTED					\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
@@ -714,6 +733,7 @@  const struct file_operations def_blk_fops = {
 	.splice_read	= filemap_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
+	.copy_file_range = blkdev_copy_file_range,
 };
 
 static __init int blkdev_init(void)
diff --git a/fs/read_write.c b/fs/read_write.c
index b07de77ef126..d27148a2543f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1447,7 +1447,8 @@  static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
 		return -EOVERFLOW;
 
 	/* Shorten the copy to EOF */
-	size_in = i_size_read(inode_in);
+	size_in = i_size_read(file_in->f_mapping->host);
+
 	if (pos_in >= size_in)
 		count = 0;
 	else
@@ -1708,7 +1709,9 @@  int generic_file_rw_checks(struct file *file_in, struct file *file_out)
 	/* Don't copy dirs, pipes, sockets... */
 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
 		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+
+	if ((!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) &&
+		(!S_ISBLK(inode_in->i_mode) || !S_ISBLK(inode_out->i_mode)))
 		return -EINVAL;
 
 	if (!(file_in->f_mode & FMODE_READ) ||
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c176bf6173c5..850168cad080 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1047,6 +1047,10 @@  ssize_t blkdev_copy_offload(
 		struct block_device *bdev_in, loff_t pos_in,
 		struct block_device *bdev_out, loff_t pos_out,
 		size_t len, cio_iodone_t end_io, void *private, gfp_t gfp_mask);
+ssize_t blkdev_copy_offload_failfast(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, gfp_t gfp_mask);
 struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 		gfp_t gfp_mask);
 void bio_map_kern_endio(struct bio *bio);