
[v4,02/10] block: Add copy offload support infrastructure

Message ID 20220426101241.30100-3-nj.shetty@samsung.com (mailing list archive)
State Not Applicable
Series [v4,01/10] block: Introduce queue limits for copy-offload support

Commit Message

Nitesh Shetty April 26, 2022, 10:12 a.m. UTC
Introduce blkdev_issue_copy, which takes source and destination bdevs
and an array of (source, destination, copy length) tuples.
Introduce the REQ_COPY copy offload operation flag. A read-write bio
pair is created with a token as payload and submitted to the device in
order: the read request populates the token with source-specific
information, which is then passed along with the write request.
This design is courtesy of Mikulas Patocka's token-based copy.

Larger copies are split according to the max_copy_sectors and
max_copy_range_sectors limits.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
---
 block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
 block/blk.h               |   2 +
 include/linux/blk_types.h |  21 ++++
 include/linux/blkdev.h    |   2 +
 include/uapi/linux/fs.h   |  14 +++
 5 files changed, 271 insertions(+)
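
For context, a minimal sketch of how a caller might use the new interface.
The offsets and lengths below are hypothetical and error handling is elided;
this is not part of the patch:

	struct range_entry ranges[2] = {
		{ .src = 0,       .dst = 8 << 20,  .len = 1 << 20 },
		{ .src = 4 << 20, .dst = 12 << 20, .len = 1 << 20 },
	};
	int ret;

	/* copy two 1 MiB extents from src_bdev to dst_bdev */
	ret = blkdev_issue_copy(src_bdev, 2, ranges, dst_bdev, GFP_KERNEL);
	if (ret)
		pr_err("copy offload failed: %d\n", ret);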

Comments

kernel test robot April 27, 2022, 12:11 a.m. UTC | #1
Hi Nitesh,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on next-20220422]
[cannot apply to axboe-block/for-next device-mapper-dm/for-next linus/master v5.18-rc4 v5.18-rc3 v5.18-rc2 v5.18-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Nitesh-Shetty/block-Introduce-queue-limits-for-copy-offload-support/20220426-201825
base:    e7d6987e09a328d4a949701db40ef63fbb970670
config: hexagon-randconfig-r041-20220425 (https://download.01.org/0day-ci/archive/20220427/202204270754.pM0Ewhl5-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 1cddcfdc3c683b393df1a5c9063252eb60e52818)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/3e91cba65ef73ba116953031d5548da7fd33a150
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Nitesh-Shetty/block-Introduce-queue-limits-for-copy-offload-support/20220426-201825
        git checkout 3e91cba65ef73ba116953031d5548da7fd33a150
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> block/blk-lib.c:178:5: warning: no previous prototype for function 'blk_copy_offload' [-Wmissing-prototypes]
   int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
       ^
   block/blk-lib.c:178:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
   ^
   static 
   1 warning generated.


vim +/blk_copy_offload +178 block/blk-lib.c

   173	
   174	/*
   175	 * blk_copy_offload	- Use device's native copy offload feature
   176	 * Go through user provide payload, prepare new payload based on device's copy offload limits.
   177	 */
 > 178	int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
   179			struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
   180	{
   181		struct request_queue *sq = bdev_get_queue(src_bdev);
   182		struct request_queue *dq = bdev_get_queue(dst_bdev);
   183		struct bio *read_bio, *write_bio;
   184		struct copy_ctx *ctx;
   185		struct cio *cio;
   186		struct page *token;
   187		sector_t src_blk, copy_len, dst_blk;
   188		sector_t remaining, max_copy_len = LONG_MAX;
   189		unsigned long flags;
   190		int ri = 0, ret = 0;
   191	
   192		cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
   193		if (!cio)
   194			return -ENOMEM;
   195		cio->rlist = rlist;
   196		spin_lock_init(&cio->lock);
   197	
   198		max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
   199		max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
   200				(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
   201	
   202		for (ri = 0; ri < nr_srcs; ri++) {
   203			cio->rlist[ri].comp_len = rlist[ri].len;
   204			src_blk = rlist[ri].src;
   205			dst_blk = rlist[ri].dst;
   206			for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
   207				copy_len = min(remaining, max_copy_len);
   208	
   209				token = alloc_page(gfp_mask);
   210				if (unlikely(!token)) {
   211					ret = -ENOMEM;
   212					goto err_token;
   213				}
   214	
   215				ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
   216				if (!ctx) {
   217					ret = -ENOMEM;
   218					goto err_ctx;
   219				}
   220				ctx->cio = cio;
   221				ctx->range_idx = ri;
   222				ctx->start_sec = dst_blk;
   223	
   224				read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
   225						gfp_mask);
   226				if (!read_bio) {
   227					ret = -ENOMEM;
   228					goto err_read_bio;
   229				}
   230				read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
   231				__bio_add_page(read_bio, token, PAGE_SIZE, 0);
   232				/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
   233				read_bio->bi_iter.bi_size = copy_len;
   234				ret = submit_bio_wait(read_bio);
   235				bio_put(read_bio);
   236				if (ret)
   237					goto err_read_bio;
   238	
   239				write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
   240						gfp_mask);
   241				if (!write_bio) {
   242					ret = -ENOMEM;
   243					goto err_read_bio;
   244				}
   245				write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
   246				__bio_add_page(write_bio, token, PAGE_SIZE, 0);
   247				/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
   248				write_bio->bi_iter.bi_size = copy_len;
   249				write_bio->bi_end_io = bio_copy_end_io;
   250				write_bio->bi_private = ctx;
   251	
   252				spin_lock_irqsave(&cio->lock, flags);
   253				++cio->refcount;
   254				spin_unlock_irqrestore(&cio->lock, flags);
   255	
   256				submit_bio(write_bio);
   257				src_blk += copy_len;
   258				dst_blk += copy_len;
   259			}
   260		}
   261	
   262		/* Wait for completion of all IO's*/
   263		return cio_await_completion(cio);
   264	
   265	err_read_bio:
   266		kfree(ctx);
   267	err_ctx:
   268		__free_page(token);
   269	err_token:
   270		rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
   271	
   272		cio->io_err = ret;
   273		return cio_await_completion(cio);
   274	}
   275
Damien Le Moal April 27, 2022, 2:45 a.m. UTC | #2
On 4/26/22 19:12, Nitesh Shetty wrote:
> Introduce blkdev_issue_copy which supports source and destination bdevs,
> and an array of (source, destination and copy length) tuples.
> Introduce REQ_COPY copy offload operation flag. Create a read-write
> bio pair with a token as payload and submitted to the device in order.
> Read request populates token with source specific information which
> is then passed with write request.
> This design is courtesy Mikulas Patocka's token based copy
> 
> Larger copy will be divided, based on max_copy_sectors,
> max_copy_range_sector limits.
> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> ---
>  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
>  block/blk.h               |   2 +
>  include/linux/blk_types.h |  21 ++++
>  include/linux/blkdev.h    |   2 +
>  include/uapi/linux/fs.h   |  14 +++
>  5 files changed, 271 insertions(+)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 09b7e1200c0f..ba9da2d2f429 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  }
>  EXPORT_SYMBOL(blkdev_issue_discard);
>  
> +/*
> + * Wait on and process all in-flight BIOs.  This must only be called once
> + * all bios have been issued so that the refcount can only decrease.
> + * This just waits for all bios to make it through bio_copy_end_io. IO
> + * errors are propagated through cio->io_error.
> + */
> +static int cio_await_completion(struct cio *cio)
> +{
> +	int ret = 0;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cio->lock, flags);
> +	if (cio->refcount) {
> +		cio->waiter = current;
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +		spin_unlock_irqrestore(&cio->lock, flags);
> +		blk_io_schedule();
> +		/* wake up sets us TASK_RUNNING */
> +		spin_lock_irqsave(&cio->lock, flags);
> +		cio->waiter = NULL;
> +		ret = cio->io_err;
> +	}
> +	spin_unlock_irqrestore(&cio->lock, flags);
> +	kvfree(cio);

cio is allocated with kzalloc() == kmalloc(). So why the kvfree() here ?

> +
> +	return ret;
> +}
> +
> +static void bio_copy_end_io(struct bio *bio)
> +{
> +	struct copy_ctx *ctx = bio->bi_private;
> +	struct cio *cio = ctx->cio;
> +	sector_t clen;
> +	int ri = ctx->range_idx;
> +	unsigned long flags;
> +	bool wake = false;
> +
> +	if (bio->bi_status) {
> +		cio->io_err = bio->bi_status;
> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);

long line.

> +	}
> +	__free_page(bio->bi_io_vec[0].bv_page);
> +	kfree(ctx);
> +	bio_put(bio);
> +
> +	spin_lock_irqsave(&cio->lock, flags);
> +	if (((--cio->refcount) <= 0) && cio->waiter)
> +		wake = true;
> +	spin_unlock_irqrestore(&cio->lock, flags);
> +	if (wake)
> +		wake_up_process(cio->waiter);
> +}
> +
> +/*
> + * blk_copy_offload	- Use device's native copy offload feature
> + * Go through user provide payload, prepare new payload based on device's copy offload limits.

long line.

> + */
> +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)

long line.

rlist is an array, but rlist naming implies a list. Why not call that
argument "ranges" ?

The argument ordering is also strange. I would make that:

blk_copy_offload(struct block_device *src_bdev,
	         struct block_device *dst_bdev,
		 struct range_entry *rlist, int nr_srcs,
		 gfp_t gfp_mask)

> +{
> +	struct request_queue *sq = bdev_get_queue(src_bdev);
> +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> +	struct bio *read_bio, *write_bio;
> +	struct copy_ctx *ctx;
> +	struct cio *cio;
> +	struct page *token;
> +	sector_t src_blk, copy_len, dst_blk;
> +	sector_t remaining, max_copy_len = LONG_MAX;
> +	unsigned long flags;
> +	int ri = 0, ret = 0;
> +
> +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> +	if (!cio)
> +		return -ENOMEM;
> +	cio->rlist = rlist;
> +	spin_lock_init(&cio->lock);
> +
> +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;

But max_copy_range_sectors is for one range only, right ? So what is this
second min3() doing ? It is mixing up total length and one range length.
The device should not have reported a per-range max length larger than the
total length in the first place, right ? If it does, that would be a very
strange device...
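
One way to keep the two limits separate, following the reading above (a
sketch only, not the final code):

	/* per-range chunk limit: how much one REQ_COPY bio pair may carry */
	max_copy_len = min_t(sector_t, sq->limits.max_copy_range_sectors,
			     dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
	/*
	 * max_copy_sectors / MAX_COPY_TOTAL_LENGTH would then bound the
	 * total of all ranges, checked once before the copy loop.
	 */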

> +
> +	for (ri = 0; ri < nr_srcs; ri++) {
> +		cio->rlist[ri].comp_len = rlist[ri].len;
> +		src_blk = rlist[ri].src;
> +		dst_blk = rlist[ri].dst;
> +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> +			copy_len = min(remaining, max_copy_len);
> +
> +			token = alloc_page(gfp_mask);
> +			if (unlikely(!token)) {
> +				ret = -ENOMEM;
> +				goto err_token;
> +			}
> +
> +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> +			if (!ctx) {
> +				ret = -ENOMEM;
> +				goto err_ctx;
> +			}
> +			ctx->cio = cio;
> +			ctx->range_idx = ri;
> +			ctx->start_sec = dst_blk;
> +
> +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!read_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> +			read_bio->bi_iter.bi_size = copy_len;
> +			ret = submit_bio_wait(read_bio);
> +			bio_put(read_bio);
> +			if (ret)
> +				goto err_read_bio;
> +
> +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!write_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> +			write_bio->bi_iter.bi_size = copy_len;
> +			write_bio->bi_end_io = bio_copy_end_io;
> +			write_bio->bi_private = ctx;
> +
> +			spin_lock_irqsave(&cio->lock, flags);
> +			++cio->refcount;

Shouldn't this be an atomic_t ?

And wrap lines please. Many are too long.

> +			spin_unlock_irqrestore(&cio->lock, flags);
> +
> +			submit_bio(write_bio);
> +			src_blk += copy_len;
> +			dst_blk += copy_len;
> +		}
> +	}
> +
> +	/* Wait for completion of all IO's*/
> +	return cio_await_completion(cio);
> +
> +err_read_bio:
> +	kfree(ctx);
> +err_ctx:
> +	__free_page(token);
> +err_token:
> +	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
> +
> +	cio->io_err = ret;
> +	return cio_await_completion(cio);
> +}
> +
> +static inline int blk_copy_sanity_check(struct block_device *src_bdev,
> +		struct block_device *dst_bdev, struct range_entry *rlist, int nr)
> +{
> +	unsigned int align_mask = max(
> +			bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
> +	sector_t len = 0;
> +	int i;
> +
> +	for (i = 0; i < nr; i++) {
> +		if (rlist[i].len)
> +			len += rlist[i].len;
> +		else
> +			return -EINVAL;

Reverse the if condition and return to avoid the else.
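
For example (a sketch of the suggested change):

		if (!rlist[i].len)
			return -EINVAL;
		len += rlist[i].len;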

> +		if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
> +				(rlist[i].len & align_mask))
> +			return -EINVAL;
> +		rlist[i].comp_len = 0;
> +	}
> +
> +	if (len && len >= MAX_COPY_TOTAL_LENGTH)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static inline bool blk_check_copy_offload(struct request_queue *src_q,
> +		struct request_queue *dest_q)
> +{
> +	if (blk_queue_copy(dest_q) && blk_queue_copy(src_q))
> +		return true;
> +
> +	return false;

return blk_queue_copy(dest_q) && blk_queue_copy(src_q);

would be simpler.

> +}
> +
> +/*
> + * blkdev_issue_copy - queue a copy
> + * @src_bdev:	source block device
> + * @nr_srcs:	number of source ranges to copy
> + * @rlist:	array of source/dest/len
> + * @dest_bdev:	destination block device
> + * @gfp_mask:   memory allocation flags (for bio_alloc)
> + *
> + * Description:
> + *	Copy source ranges from source block device to destination block device.
> + *	length of a source range cannot be zero.
> + */
> +int blkdev_issue_copy(struct block_device *src_bdev, int nr,
> +		struct range_entry *rlist, struct block_device *dest_bdev, gfp_t gfp_mask)

same comment as above about args order and naming.

> +{
> +	struct request_queue *src_q = bdev_get_queue(src_bdev);
> +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> +	int ret = -EINVAL;
> +
> +	if (!src_q || !dest_q)
> +		return -ENXIO;
> +
> +	if (!nr)
> +		return -EINVAL;
> +
> +	if (nr >= MAX_COPY_NR_RANGE)
> +		return -EINVAL;

Where do you check the number of ranges against what the device can do ?

> +
> +	if (bdev_read_only(dest_bdev))
> +		return -EPERM;
> +
> +	ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
> +	if (ret)
> +		return ret;

nr check should be in this function...

> +
> +	if (blk_check_copy_offload(src_q, dest_q))

...which should be merged into a single function with this one.

> +		ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blkdev_issue_copy);
> +
>  static int __blkdev_issue_write_zeroes(struct block_device *bdev,
>  		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>  		struct bio **biop, unsigned flags)
> diff --git a/block/blk.h b/block/blk.h
> index 434017701403..6010eda58c70 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -291,6 +291,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
>  		break;
>  	}
>  
> +	if (unlikely(op_is_copy(bio->bi_opf)))
> +		return false;
>  	/*
>  	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
>  	 * This is a quick and dirty check that relies on the fact that
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index c62274466e72..f5b01f284c43 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -418,6 +418,7 @@ enum req_flag_bits {
>  	/* for driver use */
>  	__REQ_DRV,
>  	__REQ_SWAP,		/* swapping request. */
> +	__REQ_COPY,		/* copy request */
>  	__REQ_NR_BITS,		/* stops here */
>  };
>  
> @@ -443,6 +444,7 @@ enum req_flag_bits {
>  
>  #define REQ_DRV			(1ULL << __REQ_DRV)
>  #define REQ_SWAP		(1ULL << __REQ_SWAP)
> +#define REQ_COPY		(1ULL << __REQ_COPY)
>  
>  #define REQ_FAILFAST_MASK \
>  	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
> @@ -459,6 +461,11 @@ enum stat_group {
>  	NR_STAT_GROUPS
>  };
>  
> +static inline bool op_is_copy(unsigned int op)
> +{
> +	return (op & REQ_COPY);
> +}
> +
>  #define bio_op(bio) \
>  	((bio)->bi_opf & REQ_OP_MASK)
>  
> @@ -533,4 +540,18 @@ struct blk_rq_stat {
>  	u64 batch;
>  };
>  
> +struct cio {
> +	struct range_entry *rlist;

naming... This is an array, right ?

> +	struct task_struct *waiter;     /* waiting task (NULL if none) */
> +	spinlock_t lock;		/* protects refcount and waiter */
> +	int refcount;
> +	blk_status_t io_err;
> +};
> +
> +struct copy_ctx {
> +	int range_idx;
> +	sector_t start_sec;
> +	struct cio *cio;
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 3596fd37fae7..c6cb3fe82ba2 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1121,6 +1121,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
>  int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
>  		sector_t nr_sects, gfp_t gfp);
> +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *src_rlist, struct block_device *dest_bdev, gfp_t gfp_mask);
>  
>  #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
>  #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index bdf7b404b3e7..822c28cebf3a 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -64,6 +64,20 @@ struct fstrim_range {
>  	__u64 minlen;
>  };
>  
> +/* Maximum no of entries supported */
> +#define MAX_COPY_NR_RANGE	(1 << 12)

This value should be used also when setting the limits in the previous
patch. max_copy_nr_ranges and max_hw_copy_nr_ranges must be bounded by it.
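
For example, the previous patch could clamp its limits with something like
this (a sketch; field names as introduced there):

	q->limits.max_copy_nr_ranges =
		min_t(unsigned int, q->limits.max_copy_nr_ranges,
		      MAX_COPY_NR_RANGE);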

> +
> +/* maximum total copy length */
> +#define MAX_COPY_TOTAL_LENGTH	(1 << 27)

Same for this one. And where does this magic number come from ?

> +
> +/* Source range entry for copy */
> +struct range_entry {
> +	__u64 src;
> +	__u64 dst;
> +	__u64 len;
> +	__u64 comp_len;

Please describe the fields of this structure. The meaning of them is
really not clear from the names.
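
For instance, something along these lines (a sketch; the byte units are
inferred from the >> SECTOR_SHIFT conversions in blk_copy_offload):

	/* Source range entry for copy */
	struct range_entry {
		__u64 src;	/* source offset, in bytes */
		__u64 dst;	/* destination offset, in bytes */
		__u64 len;	/* number of bytes to copy */
		__u64 comp_len;	/* bytes actually copied, set on completion */
	};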

> +};
> +
>  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
>  #define FILE_DEDUPE_RANGE_SAME		0
>  #define FILE_DEDUPE_RANGE_DIFFERS	1
Hannes Reinecke April 27, 2022, 10:29 a.m. UTC | #3
On 4/26/22 12:12, Nitesh Shetty wrote:
> Introduce blkdev_issue_copy which supports source and destination bdevs,
> and an array of (source, destination and copy length) tuples.
> Introduce REQ_COPY copy offload operation flag. Create a read-write
> bio pair with a token as payload and submitted to the device in order.
> Read request populates token with source specific information which
> is then passed with write request.
> This design is courtesy Mikulas Patocka's token based copy
> 
> Larger copy will be divided, based on max_copy_sectors,
> max_copy_range_sector limits.
> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> ---
>   block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
>   block/blk.h               |   2 +
>   include/linux/blk_types.h |  21 ++++
>   include/linux/blkdev.h    |   2 +
>   include/uapi/linux/fs.h   |  14 +++
>   5 files changed, 271 insertions(+)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 09b7e1200c0f..ba9da2d2f429 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>   }
>   EXPORT_SYMBOL(blkdev_issue_discard);
>   
> +/*
> + * Wait on and process all in-flight BIOs.  This must only be called once
> + * all bios have been issued so that the refcount can only decrease.
> + * This just waits for all bios to make it through bio_copy_end_io. IO
> + * errors are propagated through cio->io_error.
> + */
> +static int cio_await_completion(struct cio *cio)
> +{
> +	int ret = 0;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cio->lock, flags);
> +	if (cio->refcount) {
> +		cio->waiter = current;
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +		spin_unlock_irqrestore(&cio->lock, flags);
> +		blk_io_schedule();
> +		/* wake up sets us TASK_RUNNING */
> +		spin_lock_irqsave(&cio->lock, flags);
> +		cio->waiter = NULL;
> +		ret = cio->io_err;
> +	}
> +	spin_unlock_irqrestore(&cio->lock, flags);
> +	kvfree(cio);
> +
> +	return ret;
> +}
> +
> +static void bio_copy_end_io(struct bio *bio)
> +{
> +	struct copy_ctx *ctx = bio->bi_private;
> +	struct cio *cio = ctx->cio;
> +	sector_t clen;
> +	int ri = ctx->range_idx;
> +	unsigned long flags;
> +	bool wake = false;
> +
> +	if (bio->bi_status) {
> +		cio->io_err = bio->bi_status;
> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> +	}
> +	__free_page(bio->bi_io_vec[0].bv_page);
> +	kfree(ctx);
> +	bio_put(bio);
> +
> +	spin_lock_irqsave(&cio->lock, flags);
> +	if (((--cio->refcount) <= 0) && cio->waiter)
> +		wake = true;
> +	spin_unlock_irqrestore(&cio->lock, flags);
> +	if (wake)
> +		wake_up_process(cio->waiter);
> +}
> +
> +/*
> + * blk_copy_offload	- Use device's native copy offload feature
> + * Go through user provide payload, prepare new payload based on device's copy offload limits.
> + */
> +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> +{
> +	struct request_queue *sq = bdev_get_queue(src_bdev);
> +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> +	struct bio *read_bio, *write_bio;
> +	struct copy_ctx *ctx;
> +	struct cio *cio;
> +	struct page *token;
> +	sector_t src_blk, copy_len, dst_blk;
> +	sector_t remaining, max_copy_len = LONG_MAX;
> +	unsigned long flags;
> +	int ri = 0, ret = 0;
> +
> +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> +	if (!cio)
> +		return -ENOMEM;
> +	cio->rlist = rlist;
> +	spin_lock_init(&cio->lock);
> +
> +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> +
> +	for (ri = 0; ri < nr_srcs; ri++) {
> +		cio->rlist[ri].comp_len = rlist[ri].len;
> +		src_blk = rlist[ri].src;
> +		dst_blk = rlist[ri].dst;
> +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> +			copy_len = min(remaining, max_copy_len);
> +
> +			token = alloc_page(gfp_mask);
> +			if (unlikely(!token)) {
> +				ret = -ENOMEM;
> +				goto err_token;
> +			}
> +
> +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> +			if (!ctx) {
> +				ret = -ENOMEM;
> +				goto err_ctx;
> +			}
> +			ctx->cio = cio;
> +			ctx->range_idx = ri;
> +			ctx->start_sec = dst_blk;
> +
> +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!read_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> +			read_bio->bi_iter.bi_size = copy_len;
> +			ret = submit_bio_wait(read_bio);
> +			bio_put(read_bio);
> +			if (ret)
> +				goto err_read_bio;
> +
> +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!write_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> +			write_bio->bi_iter.bi_size = copy_len;
> +			write_bio->bi_end_io = bio_copy_end_io;
> +			write_bio->bi_private = ctx;
> +
> +			spin_lock_irqsave(&cio->lock, flags);
> +			++cio->refcount;
> +			spin_unlock_irqrestore(&cio->lock, flags);
> +
> +			submit_bio(write_bio);
> +			src_blk += copy_len;
> +			dst_blk += copy_len;
> +		}
> +	}
> +

Hmm. I'm not sure if I like the copy loop.
What I would definitely do is allocate the write bio before reading
data; after all, if we can't allocate the write bio, reading is pretty
much pointless.

But the real issue I have with this is that it's doing synchronous
reads, thereby limiting performance.

Can't you submit the write bio from the end_io function of the read bio?
That would disentangle things, and we should get better performance.

Cheers,

Hannes
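
For illustration, a rough sketch of the asynchronous chaining Hannes
suggests. The ctx->write_bio field is hypothetical, and error handling /
refcounting are elided:

	static void bio_copy_read_end_io(struct bio *read_bio)
	{
		struct copy_ctx *ctx = read_bio->bi_private;

		/* the pre-allocated write bio carries the same token page */
		if (!read_bio->bi_status)
			submit_bio(ctx->write_bio);
		bio_put(read_bio);
	}

	...
	/* in blk_copy_offload(), instead of submit_bio_wait(read_bio): */
	read_bio->bi_end_io = bio_copy_read_end_io;
	read_bio->bi_private = ctx;
	submit_bio(read_bio);
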
Nitesh Shetty April 27, 2022, 3:15 p.m. UTC | #4
On Wed, Apr 27, 2022 at 11:45:26AM +0900, Damien Le Moal wrote:
> On 4/26/22 19:12, Nitesh Shetty wrote:
> > Introduce blkdev_issue_copy which supports source and destination bdevs,
> > and an array of (source, destination and copy length) tuples.
> > Introduce REQ_COPY copy offload operation flag. Create a read-write
> > bio pair with a token as payload and submitted to the device in order.
> > Read request populates token with source specific information which
> > is then passed with write request.
> > This design is courtesy Mikulas Patocka's token based copy
> > 
> > Larger copy will be divided, based on max_copy_sectors,
> > max_copy_range_sector limits.
> > 
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> > ---
> >  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
> >  block/blk.h               |   2 +
> >  include/linux/blk_types.h |  21 ++++
> >  include/linux/blkdev.h    |   2 +
> >  include/uapi/linux/fs.h   |  14 +++
> >  5 files changed, 271 insertions(+)
> > 
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index 09b7e1200c0f..ba9da2d2f429 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  }
> >  EXPORT_SYMBOL(blkdev_issue_discard);
> >  
> > +/*
> > + * Wait on and process all in-flight BIOs.  This must only be called once
> > + * all bios have been issued so that the refcount can only decrease.
> > + * This just waits for all bios to make it through bio_copy_end_io. IO
> > + * errors are propagated through cio->io_error.
> > + */
> > +static int cio_await_completion(struct cio *cio)
> > +{
> > +	int ret = 0;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (cio->refcount) {
> > +		cio->waiter = current;
> > +		__set_current_state(TASK_UNINTERRUPTIBLE);
> > +		spin_unlock_irqrestore(&cio->lock, flags);
> > +		blk_io_schedule();
> > +		/* wake up sets us TASK_RUNNING */
> > +		spin_lock_irqsave(&cio->lock, flags);
> > +		cio->waiter = NULL;
> > +		ret = cio->io_err;
> > +	}
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	kvfree(cio);
> 
> cio is allocated with kzalloc() == kmalloc(). So why the kvfree() here ?
>

acked.

> > +
> > +	return ret;
> > +}
> > +
> > +static void bio_copy_end_io(struct bio *bio)
> > +{
> > +	struct copy_ctx *ctx = bio->bi_private;
> > +	struct cio *cio = ctx->cio;
> > +	sector_t clen;
> > +	int ri = ctx->range_idx;
> > +	unsigned long flags;
> > +	bool wake = false;
> > +
> > +	if (bio->bi_status) {
> > +		cio->io_err = bio->bi_status;
> > +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> > +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> 
> long line.

Is it because the line is more than 80 characters? I thought the limit is
100 now, so I went with longer lines.

> 
> > +	}
> > +	__free_page(bio->bi_io_vec[0].bv_page);
> > +	kfree(ctx);
> > +	bio_put(bio);
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (((--cio->refcount) <= 0) && cio->waiter)
> > +		wake = true;
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	if (wake)
> > +		wake_up_process(cio->waiter);
> > +}
> > +
> > +/*
> > + * blk_copy_offload	- Use device's native copy offload feature
> > + * Go through user provide payload, prepare new payload based on device's copy offload limits.
> 
> long line.
> 

Same as above

> > + */
> > +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> 
> long line.
>

Same as above

> rlist is an array, but rlist naming implies a list. Why not call that
> argument "ranges" ?
> 
> The argument ordering is also strange. I would make that:
> 
> blk_copy_offload(struct block_device *src_bdev,
> 	         struct block_device *dst_bdev,
> 		 struct range_entry *rlist, int nr_srcs,
> 		 gfp_t gfp_mask)
> 

Yes, looks better. We will update in next version.
One doubt: is this argument ordering based on size?
We ordered it this way because nr_srcs gives the number of entries
in rlist (ranges).

> > +{
> > +	struct request_queue *sq = bdev_get_queue(src_bdev);
> > +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> > +	struct bio *read_bio, *write_bio;
> > +	struct copy_ctx *ctx;
> > +	struct cio *cio;
> > +	struct page *token;
> > +	sector_t src_blk, copy_len, dst_blk;
> > +	sector_t remaining, max_copy_len = LONG_MAX;
> > +	unsigned long flags;
> > +	int ri = 0, ret = 0;
> > +
> > +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> > +	if (!cio)
> > +		return -ENOMEM;
> > +	cio->rlist = rlist;
> > +	spin_lock_init(&cio->lock);
> > +
> > +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> > +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> > +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> 
> But max_copy_range_sectors is for one range only, right ? So what is this
> second min3() doing ? It is mixing up total length and one range length.
> The device should not have reported a per range max length larger than the
> total length in the first place, right ? If it does, that would be a very
> strange device...
> 

Yeah you are right, makes sense, will update in next version.

> > +
> > +	for (ri = 0; ri < nr_srcs; ri++) {
> > +		cio->rlist[ri].comp_len = rlist[ri].len;
> > +		src_blk = rlist[ri].src;
> > +		dst_blk = rlist[ri].dst;
> > +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> > +			copy_len = min(remaining, max_copy_len);
> > +
> > +			token = alloc_page(gfp_mask);
> > +			if (unlikely(!token)) {
> > +				ret = -ENOMEM;
> > +				goto err_token;
> > +			}
> > +
> > +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> > +			if (!ctx) {
> > +				ret = -ENOMEM;
> > +				goto err_ctx;
> > +			}
> > +			ctx->cio = cio;
> > +			ctx->range_idx = ri;
> > +			ctx->start_sec = dst_blk;
> > +
> > +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!read_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> > +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> > +			read_bio->bi_iter.bi_size = copy_len;
> > +			ret = submit_bio_wait(read_bio);
> > +			bio_put(read_bio);
> > +			if (ret)
> > +				goto err_read_bio;
> > +
> > +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!write_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> > +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> > +			write_bio->bi_iter.bi_size = copy_len;
> > +			write_bio->bi_end_io = bio_copy_end_io;
> > +			write_bio->bi_private = ctx;
> > +
> > +			spin_lock_irqsave(&cio->lock, flags);
> > +			++cio->refcount;
> 
> Shouldn't this be an atomic_t ?
> 

We changed it to a normal variable and used a single spinlock to avoid a
race between the refcount update and waking up the submitting process in
the completion path. Since the copy is asynchronous, we also need to store
the process context, so there was a possibility of a race condition.
https://lore.kernel.org/all/20220209102208.GB7698@test-zns/

> And wrap lines please. Many are too long.
> 
> > +			spin_unlock_irqrestore(&cio->lock, flags);
> > +
> > +			submit_bio(write_bio);
> > +			src_blk += copy_len;
> > +			dst_blk += copy_len;
> > +		}
> > +	}
> > +
> > +	/* Wait for completion of all IO's*/
> > +	return cio_await_completion(cio);
> > +
> > +err_read_bio:
> > +	kfree(ctx);
> > +err_ctx:
> > +	__free_page(token);
> > +err_token:
> > +	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
> > +
> > +	cio->io_err = ret;
> > +	return cio_await_completion(cio);
> > +}
> > +
> > +static inline int blk_copy_sanity_check(struct block_device *src_bdev,
> > +		struct block_device *dst_bdev, struct range_entry *rlist, int nr)
> > +{
> > +	unsigned int align_mask = max(
> > +			bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
> > +	sector_t len = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < nr; i++) {
> > +		if (rlist[i].len)
> > +			len += rlist[i].len;
> > +		else
> > +			return -EINVAL;
> 
> Reverse the if condition and return to avoid the else.
> 

acked

> > +		if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
> > +				(rlist[i].len & align_mask))
> > +			return -EINVAL;
> > +		rlist[i].comp_len = 0;
> > +	}
> > +
> > +	if (len && len >= MAX_COPY_TOTAL_LENGTH)
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> > +
> > +static inline bool blk_check_copy_offload(struct request_queue *src_q,
> > +		struct request_queue *dest_q)
> > +{
> > +	if (blk_queue_copy(dest_q) && blk_queue_copy(src_q))
> > +		return true;
> > +
> > +	return false;
> 
> return blk_queue_copy(dest_q) && blk_queue_copy(src_q);
> 
> would be simpler.
> 

acked

> > +}
> > +
> > +/*
> > + * blkdev_issue_copy - queue a copy
> > + * @src_bdev:	source block device
> > + * @nr_srcs:	number of source ranges to copy
> > + * @rlist:	array of source/dest/len
> > + * @dest_bdev:	destination block device
> > + * @gfp_mask:   memory allocation flags (for bio_alloc)
> > + *
> > + * Description:
> > + *	Copy source ranges from source block device to destination block device.
> > + *	length of a source range cannot be zero.
> > + */
> > +int blkdev_issue_copy(struct block_device *src_bdev, int nr,
> > +		struct range_entry *rlist, struct block_device *dest_bdev, gfp_t gfp_mask)
> 
> same comment as above about args order and naming.
> 

acked

> > +{
> > +	struct request_queue *src_q = bdev_get_queue(src_bdev);
> > +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> > +	int ret = -EINVAL;
> > +
> > +	if (!src_q || !dest_q)
> > +		return -ENXIO;
> > +
> > +	if (!nr)
> > +		return -EINVAL;
> > +
> > +	if (nr >= MAX_COPY_NR_RANGE)
> > +		return -EINVAL;
> 
> Where do you check the number of ranges against what the device can do ?
>

The present implementation submits only one range at a time. This was done to
make copy offload generic, so that other copy implementations such as
XCOPY can use the same infrastructure. The downside at present is that
NVMe copy offload is not optimal.

> > +
> > +	if (bdev_read_only(dest_bdev))
> > +		return -EPERM;
> > +
> > +	ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
> > +	if (ret)
> > +		return ret;
> 
> nr check should be in this function...
> 
> > +
> > +	if (blk_check_copy_offload(src_q, dest_q))
> 
> ...which should be only one function with this one.
> 

Sure, we can combine blk_copy_sanity_check and blk_check_copy_offload.

> > +		ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(blkdev_issue_copy);
> > +
> >  static int __blkdev_issue_write_zeroes(struct block_device *bdev,
> >  		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
> >  		struct bio **biop, unsigned flags)
> > diff --git a/block/blk.h b/block/blk.h
> > index 434017701403..6010eda58c70 100644
> > --- a/block/blk.h
> > +++ b/block/blk.h
> > @@ -291,6 +291,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
> >  		break;
> >  	}
> >  
> > +	if (unlikely(op_is_copy(bio->bi_opf)))
> > +		return false;
> >  	/*
> >  	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
> >  	 * This is a quick and dirty check that relies on the fact that
> > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > index c62274466e72..f5b01f284c43 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -418,6 +418,7 @@ enum req_flag_bits {
> >  	/* for driver use */
> >  	__REQ_DRV,
> >  	__REQ_SWAP,		/* swapping request. */
> > +	__REQ_COPY,		/* copy request */
> >  	__REQ_NR_BITS,		/* stops here */
> >  };
> >  
> > @@ -443,6 +444,7 @@ enum req_flag_bits {
> >  
> >  #define REQ_DRV			(1ULL << __REQ_DRV)
> >  #define REQ_SWAP		(1ULL << __REQ_SWAP)
> > +#define REQ_COPY		(1ULL << __REQ_COPY)
> >  
> >  #define REQ_FAILFAST_MASK \
> >  	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
> > @@ -459,6 +461,11 @@ enum stat_group {
> >  	NR_STAT_GROUPS
> >  };
> >  
> > +static inline bool op_is_copy(unsigned int op)
> > +{
> > +	return (op & REQ_COPY);
> > +}
> > +
> >  #define bio_op(bio) \
> >  	((bio)->bi_opf & REQ_OP_MASK)
> >  
> > @@ -533,4 +540,18 @@ struct blk_rq_stat {
> >  	u64 batch;
> >  };
> >  
> > +struct cio {
> > +	struct range_entry *rlist;
> 
> naming... This is an array, right ?
>

acked, will update in next version.

> > +	struct task_struct *waiter;     /* waiting task (NULL if none) */
> > +	spinlock_t lock;		/* protects refcount and waiter */
> > +	int refcount;
> > +	blk_status_t io_err;
> > +};
> > +
> > +struct copy_ctx {
> > +	int range_idx;
> > +	sector_t start_sec;
> > +	struct cio *cio;
> > +};
> > +
> >  #endif /* __LINUX_BLK_TYPES_H */
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 3596fd37fae7..c6cb3fe82ba2 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -1121,6 +1121,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
> >  int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> >  		sector_t nr_sects, gfp_t gfp);
> > +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *src_rlist, struct block_device *dest_bdev, gfp_t gfp_mask);
> >  
> >  #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
> >  #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index bdf7b404b3e7..822c28cebf3a 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -64,6 +64,20 @@ struct fstrim_range {
> >  	__u64 minlen;
> >  };
> >  
> > +/* Maximum no of entries supported */
> > +#define MAX_COPY_NR_RANGE	(1 << 12)
> 
> This value should be used also when setting the limits in the previous
> patch. max_copy_nr_ranges and max_hw_copy_nr_ranges must be bounded by it.
> 

acked.

> > +
> > +/* maximum total copy length */
> > +#define MAX_COPY_TOTAL_LENGTH	(1 << 27)
> 
> Same for this one. And where does this magic number come from ?
>

We used this as the max size for local testing, so as not to tie up resources
in the emulation case. Feel free to suggest better values if you have anything
in mind !!

> > +
> > +/* Source range entry for copy */
> > +struct range_entry {
> > +	__u64 src;
> > +	__u64 dst;
> > +	__u64 len;
> > +	__u64 comp_len;
> 
> Please describe the fields of this structure. The meaning of them is
> really not clear from the names.
> 

acked

> > +};
> > +
> >  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
> >  #define FILE_DEDUPE_RANGE_SAME		0
> >  #define FILE_DEDUPE_RANGE_DIFFERS	1
> 
> 
> -- 
> Damien Le Moal
> Western Digital Research
>
Nitesh Shetty April 27, 2022, 3:48 p.m. UTC | #5
On Wed, Apr 27, 2022 at 12:29:15PM +0200, Hannes Reinecke wrote:
> On 4/26/22 12:12, Nitesh Shetty wrote:
> > Introduce blkdev_issue_copy which supports source and destination bdevs,
> > and an array of (source, destination and copy length) tuples.
> > Introduce REQ_COPY copy offload operation flag. Create a read-write
> > bio pair with a token as payload and submitted to the device in order.
> > Read request populates token with source specific information which
> > is then passed with write request.
> > This design is courtesy Mikulas Patocka's token based copy
> > 
> > Larger copy will be divided, based on max_copy_sectors,
> > max_copy_range_sector limits.
> > 
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> > ---
> >   block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
> >   block/blk.h               |   2 +
> >   include/linux/blk_types.h |  21 ++++
> >   include/linux/blkdev.h    |   2 +
> >   include/uapi/linux/fs.h   |  14 +++
> >   5 files changed, 271 insertions(+)
> > 
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index 09b7e1200c0f..ba9da2d2f429 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >   }
> >   EXPORT_SYMBOL(blkdev_issue_discard);
> > +/*
> > + * Wait on and process all in-flight BIOs.  This must only be called once
> > + * all bios have been issued so that the refcount can only decrease.
> > + * This just waits for all bios to make it through bio_copy_end_io. IO
> > + * errors are propagated through cio->io_error.
> > + */
> > +static int cio_await_completion(struct cio *cio)
> > +{
> > +	int ret = 0;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (cio->refcount) {
> > +		cio->waiter = current;
> > +		__set_current_state(TASK_UNINTERRUPTIBLE);
> > +		spin_unlock_irqrestore(&cio->lock, flags);
> > +		blk_io_schedule();
> > +		/* wake up sets us TASK_RUNNING */
> > +		spin_lock_irqsave(&cio->lock, flags);
> > +		cio->waiter = NULL;
> > +		ret = cio->io_err;
> > +	}
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	kvfree(cio);
> > +
> > +	return ret;
> > +}
> > +
> > +static void bio_copy_end_io(struct bio *bio)
> > +{
> > +	struct copy_ctx *ctx = bio->bi_private;
> > +	struct cio *cio = ctx->cio;
> > +	sector_t clen;
> > +	int ri = ctx->range_idx;
> > +	unsigned long flags;
> > +	bool wake = false;
> > +
> > +	if (bio->bi_status) {
> > +		cio->io_err = bio->bi_status;
> > +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> > +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> > +	}
> > +	__free_page(bio->bi_io_vec[0].bv_page);
> > +	kfree(ctx);
> > +	bio_put(bio);
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (((--cio->refcount) <= 0) && cio->waiter)
> > +		wake = true;
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	if (wake)
> > +		wake_up_process(cio->waiter);
> > +}
> > +
> > +/*
> > + * blk_copy_offload	- Use device's native copy offload feature
> > + * Go through user provide payload, prepare new payload based on device's copy offload limits.
> > + */
> > +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> > +{
> > +	struct request_queue *sq = bdev_get_queue(src_bdev);
> > +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> > +	struct bio *read_bio, *write_bio;
> > +	struct copy_ctx *ctx;
> > +	struct cio *cio;
> > +	struct page *token;
> > +	sector_t src_blk, copy_len, dst_blk;
> > +	sector_t remaining, max_copy_len = LONG_MAX;
> > +	unsigned long flags;
> > +	int ri = 0, ret = 0;
> > +
> > +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> > +	if (!cio)
> > +		return -ENOMEM;
> > +	cio->rlist = rlist;
> > +	spin_lock_init(&cio->lock);
> > +
> > +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> > +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> > +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> > +
> > +	for (ri = 0; ri < nr_srcs; ri++) {
> > +		cio->rlist[ri].comp_len = rlist[ri].len;
> > +		src_blk = rlist[ri].src;
> > +		dst_blk = rlist[ri].dst;
> > +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> > +			copy_len = min(remaining, max_copy_len);
> > +
> > +			token = alloc_page(gfp_mask);
> > +			if (unlikely(!token)) {
> > +				ret = -ENOMEM;
> > +				goto err_token;
> > +			}
> > +
> > +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> > +			if (!ctx) {
> > +				ret = -ENOMEM;
> > +				goto err_ctx;
> > +			}
> > +			ctx->cio = cio;
> > +			ctx->range_idx = ri;
> > +			ctx->start_sec = dst_blk;
> > +
> > +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!read_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> > +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> > +			read_bio->bi_iter.bi_size = copy_len;
> > +			ret = submit_bio_wait(read_bio);
> > +			bio_put(read_bio);
> > +			if (ret)
> > +				goto err_read_bio;
> > +
> > +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!write_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> > +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> > +			write_bio->bi_iter.bi_size = copy_len;
> > +			write_bio->bi_end_io = bio_copy_end_io;
> > +			write_bio->bi_private = ctx;
> > +
> > +			spin_lock_irqsave(&cio->lock, flags);
> > +			++cio->refcount;
> > +			spin_unlock_irqrestore(&cio->lock, flags);
> > +
> > +			submit_bio(write_bio);
> > +			src_blk += copy_len;
> > +			dst_blk += copy_len;
> > +		}
> > +	}
> > +
> 
> Hmm. I'm not sure if I like the copy loop.
> What I definitely would do is to allocate the write bio before reading data;
> after all, if we can't allocate the write bio reading is pretty much
> pointless.
> 
> But the real issue I have with this is that it's doing synchronous reads,
> thereby limiting the performance.
> 
> Can't you submit the write bio from the end_io function of the read bio?
> That would disentangle things, and we should be getting a better
> performance.
> 

Agreed, it will make the code more efficient.

--
Thank you 
Nitesh Shetty
Damien Le Moal April 27, 2022, 10:04 p.m. UTC | #6
On 4/28/22 00:15, Nitesh Shetty wrote:
> On Wed, Apr 27, 2022 at 11:45:26AM +0900, Damien Le Moal wrote:
>> On 4/26/22 19:12, Nitesh Shetty wrote:
>>> Introduce blkdev_issue_copy which supports source and destination bdevs,
>>> and an array of (source, destination and copy length) tuples.
>>> Introduce REQ_COPY copy offload operation flag. Create a read-write
>>> bio pair with a token as payload and submitted to the device in order.
>>> Read request populates token with source specific information which
>>> is then passed with write request.
>>> This design is courtesy Mikulas Patocka's token based copy
>>>
>>> Larger copy will be divided, based on max_copy_sectors,
>>> max_copy_range_sector limits.
>>>
>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>> Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
>>> ---
>>>  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
>>>  block/blk.h               |   2 +
>>>  include/linux/blk_types.h |  21 ++++
>>>  include/linux/blkdev.h    |   2 +
>>>  include/uapi/linux/fs.h   |  14 +++
>>>  5 files changed, 271 insertions(+)
>>>
>>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>>> index 09b7e1200c0f..ba9da2d2f429 100644
>>> --- a/block/blk-lib.c
>>> +++ b/block/blk-lib.c
>>> @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>>  }
>>>  EXPORT_SYMBOL(blkdev_issue_discard);
>>>  
>>> +/*
>>> + * Wait on and process all in-flight BIOs.  This must only be called once
>>> + * all bios have been issued so that the refcount can only decrease.
>>> + * This just waits for all bios to make it through bio_copy_end_io. IO
>>> + * errors are propagated through cio->io_error.
>>> + */
>>> +static int cio_await_completion(struct cio *cio)
>>> +{
>>> +	int ret = 0;
>>> +	unsigned long flags;
>>> +
>>> +	spin_lock_irqsave(&cio->lock, flags);
>>> +	if (cio->refcount) {
>>> +		cio->waiter = current;
>>> +		__set_current_state(TASK_UNINTERRUPTIBLE);
>>> +		spin_unlock_irqrestore(&cio->lock, flags);
>>> +		blk_io_schedule();
>>> +		/* wake up sets us TASK_RUNNING */
>>> +		spin_lock_irqsave(&cio->lock, flags);
>>> +		cio->waiter = NULL;
>>> +		ret = cio->io_err;
>>> +	}
>>> +	spin_unlock_irqrestore(&cio->lock, flags);
>>> +	kvfree(cio);
>>
>> cio is allocated with kzalloc() == kmalloc(). So why the kvfree() here ?
>>
> 
> acked.
> 
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +static void bio_copy_end_io(struct bio *bio)
>>> +{
>>> +	struct copy_ctx *ctx = bio->bi_private;
>>> +	struct cio *cio = ctx->cio;
>>> +	sector_t clen;
>>> +	int ri = ctx->range_idx;
>>> +	unsigned long flags;
>>> +	bool wake = false;
>>> +
>>> +	if (bio->bi_status) {
>>> +		cio->io_err = bio->bi_status;
>>> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
>>> +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
>>
>> long line.
> 
> Is it because line is more than 80 character, I thought limit is 100 now, so
> went with longer lines ?

When it is easy to wrap the lines without losing readability, please do so to
keep things under 80 characters per line.


>>> +{
>>> +	struct request_queue *src_q = bdev_get_queue(src_bdev);
>>> +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
>>> +	int ret = -EINVAL;
>>> +
>>> +	if (!src_q || !dest_q)
>>> +		return -ENXIO;
>>> +
>>> +	if (!nr)
>>> +		return -EINVAL;
>>> +
>>> +	if (nr >= MAX_COPY_NR_RANGE)
>>> +		return -EINVAL;
>>
>> Where do you check the number of ranges against what the device can do ?
>>
> 
> The present implementation submits only one range at a time. This was done to 
> make copy offload generic, so that other types of copy implementation such as
> XCOPY should be able to use same infrastructure. Downside at present being
> NVMe copy offload is not optimal.

If you issue one range at a time without checking the number of ranges,
what is the point of the nr ranges queue limit ? The user can submit a
copy ioctl request exceeding it. Please use that limit and enforce it or
remove it entirely.
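
For example, enforcing it could look like this (a sketch, assuming the
max_copy_nr_ranges limit from the previous patch):

	if (nr > min(src_q->limits.max_copy_nr_ranges,
		     dest_q->limits.max_copy_nr_ranges))
		return -EINVAL;
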
Nitesh Shetty April 28, 2022, 8:01 a.m. UTC | #7
On Thu, Apr 28, 2022 at 07:04:13AM +0900, Damien Le Moal wrote:
> On 4/28/22 00:15, Nitesh Shetty wrote:
> > On Wed, Apr 27, 2022 at 11:45:26AM +0900, Damien Le Moal wrote:
> >> On 4/26/22 19:12, Nitesh Shetty wrote:
> >>> Introduce blkdev_issue_copy which supports source and destination bdevs,
> >>> and an array of (source, destination and copy length) tuples.
> >>> Introduce REQ_COPY copy offload operation flag. Create a read-write
> >>> bio pair with a token as payload and submitted to the device in order.
> >>> Read request populates token with source specific information which
> >>> is then passed with write request.
> >>> This design is courtesy Mikulas Patocka's token based copy
> >>>
> >>> Larger copy will be divided, based on max_copy_sectors,
> >>> max_copy_range_sector limits.
> >>>
> >>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> >>> Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> >>> ---
> >>>  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
> >>>  block/blk.h               |   2 +
> >>>  include/linux/blk_types.h |  21 ++++
> >>>  include/linux/blkdev.h    |   2 +
> >>>  include/uapi/linux/fs.h   |  14 +++
> >>>  5 files changed, 271 insertions(+)
> >>>
> >>> diff --git a/block/blk-lib.c b/block/blk-lib.c
> >>> index 09b7e1200c0f..ba9da2d2f429 100644
> >>> --- a/block/blk-lib.c
> >>> +++ b/block/blk-lib.c
> >>> @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >>>  }
> >>>  EXPORT_SYMBOL(blkdev_issue_discard);
> >>>  
> >>> +/*
> >>> + * Wait on and process all in-flight BIOs.  This must only be called once
> >>> + * all bios have been issued so that the refcount can only decrease.
> >>> + * This just waits for all bios to make it through bio_copy_end_io. IO
> >>> + * errors are propagated through cio->io_error.
> >>> + */
> >>> +static int cio_await_completion(struct cio *cio)
> >>> +{
> >>> +	int ret = 0;
> >>> +	unsigned long flags;
> >>> +
> >>> +	spin_lock_irqsave(&cio->lock, flags);
> >>> +	if (cio->refcount) {
> >>> +		cio->waiter = current;
> >>> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> >>> +		spin_unlock_irqrestore(&cio->lock, flags);
> >>> +		blk_io_schedule();
> >>> +		/* wake up sets us TASK_RUNNING */
> >>> +		spin_lock_irqsave(&cio->lock, flags);
> >>> +		cio->waiter = NULL;
> >>> +		ret = cio->io_err;
> >>> +	}
> >>> +	spin_unlock_irqrestore(&cio->lock, flags);
> >>> +	kvfree(cio);
> >>
> >> cio is allocated with kzalloc() == kmalloc(). So why the kvfree() here ?
> >>
> > 
> > acked.
> > 
> >>> +
> >>> +	return ret;
> >>> +}
> >>> +
> >>> +static void bio_copy_end_io(struct bio *bio)
> >>> +{
> >>> +	struct copy_ctx *ctx = bio->bi_private;
> >>> +	struct cio *cio = ctx->cio;
> >>> +	sector_t clen;
> >>> +	int ri = ctx->range_idx;
> >>> +	unsigned long flags;
> >>> +	bool wake = false;
> >>> +
> >>> +	if (bio->bi_status) {
> >>> +		cio->io_err = bio->bi_status;
> >>> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> >>> +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> >>
> >> long line.
> > 
> > Is it because line is more than 80 character, I thought limit is 100 now, so
> > went with longer lines ?
> 
> When it is easy to wrap the lines without readability loss, please do to
> keep things under 80 char per line.
> 
>

acked

> >>> +{
> >>> +	struct request_queue *src_q = bdev_get_queue(src_bdev);
> >>> +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> >>> +	int ret = -EINVAL;
> >>> +
> >>> +	if (!src_q || !dest_q)
> >>> +		return -ENXIO;
> >>> +
> >>> +	if (!nr)
> >>> +		return -EINVAL;
> >>> +
> >>> +	if (nr >= MAX_COPY_NR_RANGE)
> >>> +		return -EINVAL;
> >>
> >> Where do you check the number of ranges against what the device can do ?
> >>
> > 
> > The present implementation submits only one range at a time. This was done to
> > keep copy offload generic, so that other copy implementations such as XCOPY
> > can use the same infrastructure. The downside at present is that NVMe copy
> > offload is not optimal.
> 
> If you issue one range at a time without checking the number of ranges,
> what is the point of the nr ranges queue limit ? The user can submit a
> copy ioctl request exceeding it. Please use that limit and enforce it or
> remove it entirely.
> 

Sure, I will remove this limit in the next version.
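
For completeness, the enforcement alternative would look roughly like the
following in blkdev_issue_copy(), assuming the queues expose a per-device limit
on the number of copy ranges (the max_copy_nr_ranges field name below is
illustrative):

	if (nr <= 0)
		return -EINVAL;
	if ((unsigned int)nr > min(src_q->limits.max_copy_nr_ranges,
				   dest_q->limits.max_copy_nr_ranges))
		return -EINVAL;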

--
Thank you
Nitesh Shetty

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 09b7e1200c0f..ba9da2d2f429 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -117,6 +117,238 @@  int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/*
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through bio_copy_end_io. IO
+ * errors are propagated through cio->io_error.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cio->lock, flags);
+	if (cio->refcount) {
+		cio->waiter = current;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irqrestore(&cio->lock, flags);
+		blk_io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		spin_lock_irqsave(&cio->lock, flags);
+		cio->waiter = NULL;
+		ret = cio->io_err;
+	}
+	spin_unlock_irqrestore(&cio->lock, flags);
+	kvfree(cio);
+
+	return ret;
+}
+
+static void bio_copy_end_io(struct bio *bio)
+{
+	struct copy_ctx *ctx = bio->bi_private;
+	struct cio *cio = ctx->cio;
+	sector_t clen;
+	int ri = ctx->range_idx;
+	unsigned long flags;
+	bool wake = false;
+
+	if (bio->bi_status) {
+		cio->io_err = bio->bi_status;
+		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
+		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
+	}
+	__free_page(bio->bi_io_vec[0].bv_page);
+	kfree(ctx);
+	bio_put(bio);
+
+	spin_lock_irqsave(&cio->lock, flags);
+	if (((--cio->refcount) <= 0) && cio->waiter)
+		wake = true;
+	spin_unlock_irqrestore(&cio->lock, flags);
+	if (wake)
+		wake_up_process(cio->waiter);
+}
+
+/*
+ * blk_copy_offload	- Use device's native copy offload feature
+ * Go through user provide payload, prepare new payload based on device's copy offload limits.
+ */
+int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
+{
+	struct request_queue *sq = bdev_get_queue(src_bdev);
+	struct request_queue *dq = bdev_get_queue(dst_bdev);
+	struct bio *read_bio, *write_bio;
+	struct copy_ctx *ctx;
+	struct cio *cio;
+	struct page *token;
+	sector_t src_blk, copy_len, dst_blk;
+	sector_t remaining, max_copy_len = LONG_MAX;
+	unsigned long flags;
+	int ri = 0, ret = 0;
+
+	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	cio->rlist = rlist;
+	spin_lock_init(&cio->lock);
+
+	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
+	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
+			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
+
+	for (ri = 0; ri < nr_srcs; ri++) {
+		cio->rlist[ri].comp_len = rlist[ri].len;
+		src_blk = rlist[ri].src;
+		dst_blk = rlist[ri].dst;
+		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
+			copy_len = min(remaining, max_copy_len);
+
+			token = alloc_page(gfp_mask);
+			if (unlikely(!token)) {
+				ret = -ENOMEM;
+				goto err_token;
+			}
+
+			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
+			if (!ctx) {
+				ret = -ENOMEM;
+				goto err_ctx;
+			}
+			ctx->cio = cio;
+			ctx->range_idx = ri;
+			ctx->start_sec = dst_blk;
+
+			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
+					gfp_mask);
+			if (!read_bio) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
+			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
+			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
+			read_bio->bi_iter.bi_size = copy_len;
+			ret = submit_bio_wait(read_bio);
+			bio_put(read_bio);
+			if (ret)
+				goto err_read_bio;
+
+			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
+					gfp_mask);
+			if (!write_bio) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
+			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
+			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
+			write_bio->bi_iter.bi_size = copy_len;
+			write_bio->bi_end_io = bio_copy_end_io;
+			write_bio->bi_private = ctx;
+
+			spin_lock_irqsave(&cio->lock, flags);
+			++cio->refcount;
+			spin_unlock_irqrestore(&cio->lock, flags);
+
+			submit_bio(write_bio);
+			src_blk += copy_len;
+			dst_blk += copy_len;
+		}
+	}
+
+	/* Wait for completion of all IO's*/
+	return cio_await_completion(cio);
+
+err_read_bio:
+	kfree(ctx);
+err_ctx:
+	__free_page(token);
+err_token:
+	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
+
+	cio->io_err = ret;
+	return cio_await_completion(cio);
+}
+
+static inline int blk_copy_sanity_check(struct block_device *src_bdev,
+		struct block_device *dst_bdev, struct range_entry *rlist, int nr)
+{
+	unsigned int align_mask = max(
+			bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
+	sector_t len = 0;
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		if (rlist[i].len)
+			len += rlist[i].len;
+		else
+			return -EINVAL;
+		if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
+				(rlist[i].len & align_mask))
+			return -EINVAL;
+		rlist[i].comp_len = 0;
+	}
+
+	if (len && len >= MAX_COPY_TOTAL_LENGTH)
+		return -EINVAL;
+
+	return 0;
+}
+
+static inline bool blk_check_copy_offload(struct request_queue *src_q,
+		struct request_queue *dest_q)
+{
+	if (blk_queue_copy(dest_q) && blk_queue_copy(src_q))
+		return true;
+
+	return false;
+}
+
+/*
+ * blkdev_issue_copy - queue a copy
+ * @src_bdev:	source block device
+ * @nr_srcs:	number of source ranges to copy
+ * @rlist:	array of source/dest/len
+ * @dest_bdev:	destination block device
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *	Copy source ranges from source block device to destination block device.
+ *	length of a source range cannot be zero.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, int nr,
+		struct range_entry *rlist, struct block_device *dest_bdev, gfp_t gfp_mask)
+{
+	struct request_queue *src_q = bdev_get_queue(src_bdev);
+	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
+	int ret = -EINVAL;
+
+	if (!src_q || !dest_q)
+		return -ENXIO;
+
+	if (!nr)
+		return -EINVAL;
+
+	if (nr >= MAX_COPY_NR_RANGE)
+		return -EINVAL;
+
+	if (bdev_read_only(dest_bdev))
+		return -EPERM;
+
+	ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
+	if (ret)
+		return ret;
+
+	if (blk_check_copy_offload(src_q, dest_q))
+		ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_issue_copy);
+
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned flags)
diff --git a/block/blk.h b/block/blk.h
index 434017701403..6010eda58c70 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -291,6 +291,8 @@  static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
 		break;
 	}
 
+	if (unlikely(op_is_copy(bio->bi_opf)))
+		return false;
 	/*
 	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
 	 * This is a quick and dirty check that relies on the fact that
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c62274466e72..f5b01f284c43 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -418,6 +418,7 @@  enum req_flag_bits {
 	/* for driver use */
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
+	__REQ_COPY,		/* copy request */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -443,6 +444,7 @@  enum req_flag_bits {
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
+#define REQ_COPY		(1ULL << __REQ_COPY)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -459,6 +461,11 @@  enum stat_group {
 	NR_STAT_GROUPS
 };
 
+static inline bool op_is_copy(unsigned int op)
+{
+	return (op & REQ_COPY);
+}
+
 #define bio_op(bio) \
 	((bio)->bi_opf & REQ_OP_MASK)
 
@@ -533,4 +540,18 @@  struct blk_rq_stat {
 	u64 batch;
 };
 
+struct cio {
+	struct range_entry *rlist;
+	struct task_struct *waiter;     /* waiting task (NULL if none) */
+	spinlock_t lock;		/* protects refcount and waiter */
+	int refcount;
+	blk_status_t io_err;
+};
+
+struct copy_ctx {
+	int range_idx;
+	sector_t start_sec;
+	struct cio *cio;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3596fd37fae7..c6cb3fe82ba2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1121,6 +1121,8 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
 int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp);
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *src_rlist, struct block_device *dest_bdev, gfp_t gfp_mask);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index bdf7b404b3e7..822c28cebf3a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,20 @@  struct fstrim_range {
 	__u64 minlen;
 };
 
+/* Maximum no of entries supported */
+#define MAX_COPY_NR_RANGE	(1 << 12)
+
+/* maximum total copy length */
+#define MAX_COPY_TOTAL_LENGTH	(1 << 27)
+
+/* Source range entry for copy */
+struct range_entry {
+	__u64 src;
+	__u64 dst;
+	__u64 len;
+	__u64 comp_len;
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1
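
For illustration only (not part of the patch), a minimal sketch of how an
in-kernel caller might use the new interface. Offsets and lengths are byte
values, as suggested by the SECTOR_SHIFT conversions in blk_copy_offload(), and
must be aligned to the logical block size of both devices; the bdev variables
here are assumed to be already opened by the caller:

	struct range_entry re = {
		.src = 0,			/* byte offset on src_bdev */
		.dst = 8 * 1024 * 1024,		/* byte offset on dst_bdev */
		.len = 1024 * 1024,		/* bytes to copy */
	};
	int err;

	err = blkdev_issue_copy(src_bdev, 1, &re, dst_bdev, GFP_KERNEL);
	if (err)
		pr_err("copy offload failed: %d (completed %llu bytes)\n",
		       err, (unsigned long long)re.comp_len);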