diff mbox series

[v2,03/10] block: Add copy offload support infrastructure

Message ID 20220207141348.4235-4-nj.shetty@samsung.com (mailing list archive)
State New, archived
Headers show
Series Add Copy offload support | expand

Commit Message

Nitesh Shetty Feb. 7, 2022, 2:13 p.m. UTC
Introduce blkdev_issue_copy which supports source and destination bdevs,
and an array of (source, destination and copy length) tuples.
Introduce REQ_COPY copy offload operation flag. Create a read-write
bio pair with a token as payload and submit it to the device in order.
The read request populates the token with source-specific information, which
is then passed with the write request.
This design is courtesy of Mikulas Patocka<mpatocka@>'s token based copy

Larger copy operation may be divided if necessary by looking at device
limits.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
---
 block/blk-lib.c           | 216 ++++++++++++++++++++++++++++++++++++++
 block/blk-settings.c      |   2 +
 block/blk.h               |   2 +
 include/linux/blk_types.h |  20 ++++
 include/linux/blkdev.h    |   3 +
 include/uapi/linux/fs.h   |  14 +++
 6 files changed, 257 insertions(+)

Comments

kernel test robot Feb. 7, 2022, 10:45 p.m. UTC | #1
Hi Nitesh,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on next-20220207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nitesh-Shetty/block-make-bio_map_kern-non-static/20220207-231407
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: nios2-randconfig-r001-20220207 (https://download.01.org/0day-ci/archive/20220208/202202080650.48C9Ps00-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/12a9801a7301f1a1e2ea355c5a4438dab17894cf
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nitesh-Shetty/block-make-bio_map_kern-non-static/20220207-231407
        git checkout 12a9801a7301f1a1e2ea355c5a4438dab17894cf
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=nios2 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> block/blk-lib.c:185:5: warning: no previous prototype for 'blk_copy_offload' [-Wmissing-prototypes]
     185 | int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
         |     ^~~~~~~~~~~~~~~~


vim +/blk_copy_offload +185 block/blk-lib.c

   180	
   181	/*
   182	 * blk_copy_offload	- Use device's native copy offload feature
   183	 * Go through user provide payload, prepare new payload based on device's copy offload limits.
   184	 */
 > 185	int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
   186			struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
   187	{
   188		struct request_queue *sq = bdev_get_queue(src_bdev);
   189		struct request_queue *dq = bdev_get_queue(dst_bdev);
   190		struct bio *read_bio, *write_bio;
   191		struct copy_ctx *ctx;
   192		struct cio *cio;
   193		struct page *token;
   194		sector_t src_blk, copy_len, dst_blk;
   195		sector_t remaining, max_copy_len = LONG_MAX;
   196		int ri = 0, ret = 0;
   197	
   198		cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
   199		if (!cio)
   200			return -ENOMEM;
   201		atomic_set(&cio->refcount, 0);
   202		cio->rlist = rlist;
   203	
   204		max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
   205				(sector_t)dq->limits.max_copy_sectors);
   206		max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
   207				(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
   208	
   209		for (ri = 0; ri < nr_srcs; ri++) {
   210			cio->rlist[ri].comp_len = rlist[ri].len;
   211			for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
   212				remaining > 0;
   213				remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {
   214				copy_len = min(remaining, max_copy_len);
   215	
   216				token = alloc_page(gfp_mask);
   217				if (unlikely(!token)) {
   218					ret = -ENOMEM;
   219					goto err_token;
   220				}
   221	
   222				read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
   223						gfp_mask);
   224				if (!read_bio) {
   225					ret = -ENOMEM;
   226					goto err_read_bio;
   227				}
   228				read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
   229				read_bio->bi_iter.bi_size = copy_len;
   230				__bio_add_page(read_bio, token, PAGE_SIZE, 0);
   231				ret = submit_bio_wait(read_bio);
   232				if (ret) {
   233					bio_put(read_bio);
   234					goto err_read_bio;
   235				}
   236				bio_put(read_bio);
   237				ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
   238				if (!ctx) {
   239					ret = -ENOMEM;
   240					goto err_read_bio;
   241				}
   242				ctx->cio = cio;
   243				ctx->range_idx = ri;
   244				ctx->start_sec = rlist[ri].src;
   245	
   246				write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
   247						gfp_mask);
   248				if (!write_bio) {
   249					ret = -ENOMEM;
   250					goto err_read_bio;
   251				}
   252	
   253				write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
   254				write_bio->bi_iter.bi_size = copy_len;
   255				__bio_add_page(write_bio, token, PAGE_SIZE, 0);
   256				write_bio->bi_end_io = bio_copy_end_io;
   257				write_bio->bi_private = ctx;
   258				atomic_inc(&cio->refcount);
   259				submit_bio(write_bio);
   260			}
   261		}
   262	
   263		/* Wait for completion of all IO's*/
   264		return cio_await_completion(cio);
   265	
   266	err_read_bio:
   267		__free_page(token);
   268	err_token:
   269		rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
   270	
   271		cio->io_err = ret;
   272		return cio_await_completion(cio);
   273	}
   274	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Feb. 7, 2022, 11:26 p.m. UTC | #2
Hi Nitesh,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on next-20220207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nitesh-Shetty/block-make-bio_map_kern-non-static/20220207-231407
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: hexagon-randconfig-r045-20220207 (https://download.01.org/0day-ci/archive/20220208/202202080735.lyaEe5Bq-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 0d8850ae2cae85d49bea6ae0799fa41c7202c05c)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/12a9801a7301f1a1e2ea355c5a4438dab17894cf
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nitesh-Shetty/block-make-bio_map_kern-non-static/20220207-231407
        git checkout 12a9801a7301f1a1e2ea355c5a4438dab17894cf
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> block/blk-lib.c:185:5: warning: no previous prototype for function 'blk_copy_offload' [-Wmissing-prototypes]
   int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
       ^
   block/blk-lib.c:185:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
   ^
   static 
   1 warning generated.


vim +/blk_copy_offload +185 block/blk-lib.c

   180	
   181	/*
   182	 * blk_copy_offload	- Use device's native copy offload feature
   183	 * Go through user provide payload, prepare new payload based on device's copy offload limits.
   184	 */
 > 185	int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
   186			struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
   187	{
   188		struct request_queue *sq = bdev_get_queue(src_bdev);
   189		struct request_queue *dq = bdev_get_queue(dst_bdev);
   190		struct bio *read_bio, *write_bio;
   191		struct copy_ctx *ctx;
   192		struct cio *cio;
   193		struct page *token;
   194		sector_t src_blk, copy_len, dst_blk;
   195		sector_t remaining, max_copy_len = LONG_MAX;
   196		int ri = 0, ret = 0;
   197	
   198		cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
   199		if (!cio)
   200			return -ENOMEM;
   201		atomic_set(&cio->refcount, 0);
   202		cio->rlist = rlist;
   203	
   204		max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
   205				(sector_t)dq->limits.max_copy_sectors);
   206		max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
   207				(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
   208	
   209		for (ri = 0; ri < nr_srcs; ri++) {
   210			cio->rlist[ri].comp_len = rlist[ri].len;
   211			for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
   212				remaining > 0;
   213				remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {
   214				copy_len = min(remaining, max_copy_len);
   215	
   216				token = alloc_page(gfp_mask);
   217				if (unlikely(!token)) {
   218					ret = -ENOMEM;
   219					goto err_token;
   220				}
   221	
   222				read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
   223						gfp_mask);
   224				if (!read_bio) {
   225					ret = -ENOMEM;
   226					goto err_read_bio;
   227				}
   228				read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
   229				read_bio->bi_iter.bi_size = copy_len;
   230				__bio_add_page(read_bio, token, PAGE_SIZE, 0);
   231				ret = submit_bio_wait(read_bio);
   232				if (ret) {
   233					bio_put(read_bio);
   234					goto err_read_bio;
   235				}
   236				bio_put(read_bio);
   237				ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
   238				if (!ctx) {
   239					ret = -ENOMEM;
   240					goto err_read_bio;
   241				}
   242				ctx->cio = cio;
   243				ctx->range_idx = ri;
   244				ctx->start_sec = rlist[ri].src;
   245	
   246				write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
   247						gfp_mask);
   248				if (!write_bio) {
   249					ret = -ENOMEM;
   250					goto err_read_bio;
   251				}
   252	
   253				write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
   254				write_bio->bi_iter.bi_size = copy_len;
   255				__bio_add_page(write_bio, token, PAGE_SIZE, 0);
   256				write_bio->bi_end_io = bio_copy_end_io;
   257				write_bio->bi_private = ctx;
   258				atomic_inc(&cio->refcount);
   259				submit_bio(write_bio);
   260			}
   261		}
   262	
   263		/* Wait for completion of all IO's*/
   264		return cio_await_completion(cio);
   265	
   266	err_read_bio:
   267		__free_page(token);
   268	err_token:
   269		rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
   270	
   271		cio->io_err = ret;
   272		return cio_await_completion(cio);
   273	}
   274	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Damien Le Moal Feb. 8, 2022, 7:21 a.m. UTC | #3
On 2/7/22 23:13, Nitesh Shetty wrote:
> Introduce blkdev_issue_copy which supports source and destination bdevs,
> and a array of (source, destination and copy length) tuples.

s/a/an

> Introduce REQ_COP copy offload operation flag. Create a read-write

REQ_COPY ?

> bio pair with a token as payload and submitted to the device in order.
> the read request populates token with source specific information which
> is then passed with write request.
> Ths design is courtsey Mikulas Patocka<mpatocka@>'s token based copy

s/Ths design is courtsey/This design is courtesy of

> 
> Larger copy operation may be divided if necessary by looking at device
> limits.

may or will ?
by looking at -> depending on the ?

> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> ---
>  block/blk-lib.c           | 216 ++++++++++++++++++++++++++++++++++++++
>  block/blk-settings.c      |   2 +
>  block/blk.h               |   2 +
>  include/linux/blk_types.h |  20 ++++
>  include/linux/blkdev.h    |   3 +
>  include/uapi/linux/fs.h   |  14 +++
>  6 files changed, 257 insertions(+)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 1b8ced45e4e5..3ae2c27b566e 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -135,6 +135,222 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  }
>  EXPORT_SYMBOL(blkdev_issue_discard);
>  
> +/*
> + * Wait on and process all in-flight BIOs.  This must only be called once
> + * all bios have been issued so that the refcount can only decrease.
> + * This just waits for all bios to make it through bio_copy_end_io. IO
> + * errors are propagated through cio->io_error.
> + */
> +static int cio_await_completion(struct cio *cio)
> +{
> +	int ret = 0;
> +
> +	while (atomic_read(&cio->refcount)) {
> +		cio->waiter = current;
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +		blk_io_schedule();
> +		/* wake up sets us TASK_RUNNING */
> +		cio->waiter = NULL;
> +		ret = cio->io_err;

Why is this in the loop ?

> +	}
> +	kvfree(cio);
> +
> +	return ret;
> +}
> +
> +static void bio_copy_end_io(struct bio *bio)
> +{
> +	struct copy_ctx *ctx = bio->bi_private;
> +	struct cio *cio = ctx->cio;
> +	sector_t clen;
> +	int ri = ctx->range_idx;
> +
> +	if (bio->bi_status) {
> +		cio->io_err = bio->bi_status;
> +		clen = (bio->bi_iter.bi_sector - ctx->start_sec) << SECTOR_SHIFT;
> +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> +	}
> +	__free_page(bio->bi_io_vec[0].bv_page);
> +	kfree(ctx);
> +	bio_put(bio);
> +
> +	if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
> +		wake_up_process(cio->waiter);

This looks racy: the cio->waiter test and wakeup are not atomic.

> +}
> +
> +/*
> + * blk_copy_offload	- Use device's native copy offload feature
> + * Go through user provide payload, prepare new payload based on device's copy offload limits.
> + */
> +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> +{
> +	struct request_queue *sq = bdev_get_queue(src_bdev);
> +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> +	struct bio *read_bio, *write_bio;
> +	struct copy_ctx *ctx;
> +	struct cio *cio;
> +	struct page *token;
> +	sector_t src_blk, copy_len, dst_blk;
> +	sector_t remaining, max_copy_len = LONG_MAX;
> +	int ri = 0, ret = 0;
> +
> +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> +	if (!cio)
> +		return -ENOMEM;
> +	atomic_set(&cio->refcount, 0);
> +	cio->rlist = rlist;
> +
> +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
> +			(sector_t)dq->limits.max_copy_sectors);

sq->limits.max_copy_sectors is already by definition smaller than
LONG_MAX, so there is no need for the min3 here.

> +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> +
> +	for (ri = 0; ri < nr_srcs; ri++) {
> +		cio->rlist[ri].comp_len = rlist[ri].len;
> +		for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
> +			remaining > 0;
> +			remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {

This is unreadable.

> +			copy_len = min(remaining, max_copy_len);
> +
> +			token = alloc_page(gfp_mask);
> +			if (unlikely(!token)) {
> +				ret = -ENOMEM;
> +				goto err_token;
> +			}
> +
> +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!read_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> +			read_bio->bi_iter.bi_size = copy_len;
> +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> +			ret = submit_bio_wait(read_bio);
> +			if (ret) {
> +				bio_put(read_bio);
> +				goto err_read_bio;
> +			}
> +			bio_put(read_bio);
> +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> +			if (!ctx) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}

This should be done before the read.

> +			ctx->cio = cio;
> +			ctx->range_idx = ri;
> +			ctx->start_sec = rlist[ri].src;
> +
> +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!write_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +
> +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> +			write_bio->bi_iter.bi_size = copy_len;
> +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> +			write_bio->bi_end_io = bio_copy_end_io;
> +			write_bio->bi_private = ctx;
> +			atomic_inc(&cio->refcount);
> +			submit_bio(write_bio);
> +		}
> +	}
> +
> +	/* Wait for completion of all IO's*/
> +	return cio_await_completion(cio);
> +
> +err_read_bio:
> +	__free_page(token);
> +err_token:
> +	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
> +
> +	cio->io_err = ret;
> +	return cio_await_completion(cio);
> +}
> +
> +static inline int blk_copy_sanity_check(struct block_device *src_bdev,
> +		struct block_device *dst_bdev, struct range_entry *rlist, int nr)
> +{
> +	unsigned int align_mask = max(
> +			bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
> +	sector_t len = 0;
> +	int i;
> +
> +	for (i = 0; i < nr; i++) {
> +		if (rlist[i].len)
> +			len += rlist[i].len;
> +		else
> +			return -EINVAL;
> +		if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
> +				(rlist[i].len & align_mask))
> +			return -EINVAL;
> +		rlist[i].comp_len = 0;
> +	}
> +
> +	if (!len && len >= MAX_COPY_TOTAL_LENGTH)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static inline bool blk_check_copy_offload(struct request_queue *src_q,
> +		struct request_queue *dest_q)
> +{
> +	if (dest_q->limits.copy_offload == BLK_COPY_OFFLOAD &&
> +			src_q->limits.copy_offload == BLK_COPY_OFFLOAD)
> +		return true;
> +
> +	return false;
> +}
> +
> +/*
> + * blkdev_issue_copy - queue a copy
> + * @src_bdev:	source block device
> + * @nr_srcs:	number of source ranges to copy
> + * @src_rlist:	array of source ranges
> + * @dest_bdev:	destination block device
> + * @gfp_mask:   memory allocation flags (for bio_alloc)
> + * @flags:	BLKDEV_COPY_* flags to control behaviour
> + *
> + * Description:
> + *	Copy source ranges from source block device to destination block device.
> + *	length of a source range cannot be zero.
> + */
> +int blkdev_issue_copy(struct block_device *src_bdev, int nr,
> +		struct range_entry *rlist, struct block_device *dest_bdev,
> +		gfp_t gfp_mask, int flags)
> +{
> +	struct request_queue *src_q = bdev_get_queue(src_bdev);
> +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> +	int ret = -EINVAL;
> +
> +	if (!src_q || !dest_q)
> +		return -ENXIO;
> +
> +	if (!nr)
> +		return -EINVAL;
> +
> +	if (nr >= MAX_COPY_NR_RANGE)
> +		return -EINVAL;
> +
> +	if (bdev_read_only(dest_bdev))
> +		return -EPERM;
> +
> +	ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
> +	if (ret)
> +		return ret;
> +
> +	if (blk_check_copy_offload(src_q, dest_q))
> +		ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(blkdev_issue_copy);
> +
>  /**
>   * __blkdev_issue_write_same - generate number of bios with same page
>   * @bdev:	target blockdev
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 818454552cf8..4c8d48b8af25 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -545,6 +545,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>  	t->max_segment_size = min_not_zero(t->max_segment_size,
>  					   b->max_segment_size);
>  
> +	t->max_copy_sectors = min_not_zero(t->max_copy_sectors, b->max_copy_sectors);

Why min_not_zero ? If one of the underlying drive does not support copy
offload, you cannot report that the top drive does.

> +
>  	t->misaligned |= b->misaligned;
>  
>  	alignment = queue_limit_alignment_offset(b, start);
> diff --git a/block/blk.h b/block/blk.h
> index abb663a2a147..94d2b055750b 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -292,6 +292,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
>  		break;
>  	}
>  
> +	if (unlikely(op_is_copy(bio->bi_opf)))
> +		return false;
>  	/*
>  	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
>  	 * This is a quick and dirty check that relies on the fact that
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 5561e58d158a..0a3fee8ad61c 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -418,6 +418,7 @@ enum req_flag_bits {
>  	/* for driver use */
>  	__REQ_DRV,
>  	__REQ_SWAP,		/* swapping request. */
> +	__REQ_COPY,		/* copy request*/
>  	__REQ_NR_BITS,		/* stops here */
>  };
>  
> @@ -442,6 +443,7 @@ enum req_flag_bits {
>  
>  #define REQ_DRV			(1ULL << __REQ_DRV)
>  #define REQ_SWAP		(1ULL << __REQ_SWAP)
> +#define REQ_COPY		(1ULL << __REQ_COPY)
>  
>  #define REQ_FAILFAST_MASK \
>  	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
> @@ -498,6 +500,11 @@ static inline bool op_is_discard(unsigned int op)
>  	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
>  }
>  
> +static inline bool op_is_copy(unsigned int op)
> +{
> +	return (op & REQ_COPY);
> +}
> +
>  /*
>   * Check if a bio or request operation is a zone management operation, with
>   * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
> @@ -532,4 +539,17 @@ struct blk_rq_stat {
>  	u64 batch;
>  };
>  
> +struct cio {
> +	atomic_t refcount;
> +	blk_status_t io_err;
> +	struct range_entry *rlist;
> +	struct task_struct *waiter;     /* waiting task (NULL if none) */
> +};
> +
> +struct copy_ctx {
> +	int range_idx;
> +	sector_t start_sec;
> +	struct cio *cio;
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index f63ae50f1de3..15597488040c 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1120,6 +1120,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		struct bio **biop);
>  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  		gfp_t gfp_mask);
> +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *src_rlist, struct block_device *dest_bdev,
> +		gfp_t gfp_mask, int flags);
>  
>  #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
>  #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index bdf7b404b3e7..55bca8f6e8ed 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -64,6 +64,20 @@ struct fstrim_range {
>  	__u64 minlen;
>  };
>  
> +/* Maximum no of entries supported */
> +#define MAX_COPY_NR_RANGE	(1 << 12)
> +
> +/* maximum total copy length */
> +#define MAX_COPY_TOTAL_LENGTH	(1 << 21)
> +
> +/* Source range entry for copy */
> +struct range_entry {
> +	__u64 src;
> +	__u64 dst;
> +	__u64 len;
> +	__u64 comp_len;
> +};
> +
>  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
>  #define FILE_DEDUPE_RANGE_SAME		0
>  #define FILE_DEDUPE_RANGE_DIFFERS	1
Dan Carpenter Feb. 9, 2022, 7:48 a.m. UTC | #4
Hi Nitesh,

url:    https://github.com/0day-ci/linux/commits/Nitesh-Shetty/block-make-bio_map_kern-non-static/20220207-231407
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: i386-randconfig-m021-20220207 (https://download.01.org/0day-ci/archive/20220209/202202090703.U5riBMIn-lkp@intel.com/config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>

smatch warnings:
block/blk-lib.c:272 blk_copy_offload() warn: possible memory leak of 'ctx'

vim +/ctx +272 block/blk-lib.c

12a9801a7301f1 Nitesh Shetty 2022-02-07  185  int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
12a9801a7301f1 Nitesh Shetty 2022-02-07  186  		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
12a9801a7301f1 Nitesh Shetty 2022-02-07  187  {
12a9801a7301f1 Nitesh Shetty 2022-02-07  188  	struct request_queue *sq = bdev_get_queue(src_bdev);
12a9801a7301f1 Nitesh Shetty 2022-02-07  189  	struct request_queue *dq = bdev_get_queue(dst_bdev);
12a9801a7301f1 Nitesh Shetty 2022-02-07  190  	struct bio *read_bio, *write_bio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  191  	struct copy_ctx *ctx;
12a9801a7301f1 Nitesh Shetty 2022-02-07  192  	struct cio *cio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  193  	struct page *token;
12a9801a7301f1 Nitesh Shetty 2022-02-07  194  	sector_t src_blk, copy_len, dst_blk;
12a9801a7301f1 Nitesh Shetty 2022-02-07  195  	sector_t remaining, max_copy_len = LONG_MAX;
12a9801a7301f1 Nitesh Shetty 2022-02-07  196  	int ri = 0, ret = 0;
12a9801a7301f1 Nitesh Shetty 2022-02-07  197  
12a9801a7301f1 Nitesh Shetty 2022-02-07  198  	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
12a9801a7301f1 Nitesh Shetty 2022-02-07  199  	if (!cio)
12a9801a7301f1 Nitesh Shetty 2022-02-07  200  		return -ENOMEM;
12a9801a7301f1 Nitesh Shetty 2022-02-07  201  	atomic_set(&cio->refcount, 0);
12a9801a7301f1 Nitesh Shetty 2022-02-07  202  	cio->rlist = rlist;
12a9801a7301f1 Nitesh Shetty 2022-02-07  203  
12a9801a7301f1 Nitesh Shetty 2022-02-07  204  	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
12a9801a7301f1 Nitesh Shetty 2022-02-07  205  			(sector_t)dq->limits.max_copy_sectors);
12a9801a7301f1 Nitesh Shetty 2022-02-07  206  	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
12a9801a7301f1 Nitesh Shetty 2022-02-07  207  			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
12a9801a7301f1 Nitesh Shetty 2022-02-07  208  
12a9801a7301f1 Nitesh Shetty 2022-02-07  209  	for (ri = 0; ri < nr_srcs; ri++) {
12a9801a7301f1 Nitesh Shetty 2022-02-07  210  		cio->rlist[ri].comp_len = rlist[ri].len;
12a9801a7301f1 Nitesh Shetty 2022-02-07  211  		for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
12a9801a7301f1 Nitesh Shetty 2022-02-07  212  			remaining > 0;
12a9801a7301f1 Nitesh Shetty 2022-02-07  213  			remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {
12a9801a7301f1 Nitesh Shetty 2022-02-07  214  			copy_len = min(remaining, max_copy_len);
12a9801a7301f1 Nitesh Shetty 2022-02-07  215  
12a9801a7301f1 Nitesh Shetty 2022-02-07  216  			token = alloc_page(gfp_mask);
12a9801a7301f1 Nitesh Shetty 2022-02-07  217  			if (unlikely(!token)) {
12a9801a7301f1 Nitesh Shetty 2022-02-07  218  				ret = -ENOMEM;
12a9801a7301f1 Nitesh Shetty 2022-02-07  219  				goto err_token;
12a9801a7301f1 Nitesh Shetty 2022-02-07  220  			}
12a9801a7301f1 Nitesh Shetty 2022-02-07  221  
12a9801a7301f1 Nitesh Shetty 2022-02-07  222  			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
12a9801a7301f1 Nitesh Shetty 2022-02-07  223  					gfp_mask);
12a9801a7301f1 Nitesh Shetty 2022-02-07  224  			if (!read_bio) {
12a9801a7301f1 Nitesh Shetty 2022-02-07  225  				ret = -ENOMEM;
12a9801a7301f1 Nitesh Shetty 2022-02-07  226  				goto err_read_bio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  227  			}
12a9801a7301f1 Nitesh Shetty 2022-02-07  228  			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
12a9801a7301f1 Nitesh Shetty 2022-02-07  229  			read_bio->bi_iter.bi_size = copy_len;
12a9801a7301f1 Nitesh Shetty 2022-02-07  230  			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
12a9801a7301f1 Nitesh Shetty 2022-02-07  231  			ret = submit_bio_wait(read_bio);
12a9801a7301f1 Nitesh Shetty 2022-02-07  232  			if (ret) {
12a9801a7301f1 Nitesh Shetty 2022-02-07  233  				bio_put(read_bio);
12a9801a7301f1 Nitesh Shetty 2022-02-07  234  				goto err_read_bio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  235  			}
12a9801a7301f1 Nitesh Shetty 2022-02-07  236  			bio_put(read_bio);
12a9801a7301f1 Nitesh Shetty 2022-02-07  237  			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
12a9801a7301f1 Nitesh Shetty 2022-02-07  238  			if (!ctx) {
12a9801a7301f1 Nitesh Shetty 2022-02-07  239  				ret = -ENOMEM;
12a9801a7301f1 Nitesh Shetty 2022-02-07  240  				goto err_read_bio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  241  			}
12a9801a7301f1 Nitesh Shetty 2022-02-07  242  			ctx->cio = cio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  243  			ctx->range_idx = ri;
12a9801a7301f1 Nitesh Shetty 2022-02-07  244  			ctx->start_sec = rlist[ri].src;
12a9801a7301f1 Nitesh Shetty 2022-02-07  245  
12a9801a7301f1 Nitesh Shetty 2022-02-07  246  			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
12a9801a7301f1 Nitesh Shetty 2022-02-07  247  					gfp_mask);
12a9801a7301f1 Nitesh Shetty 2022-02-07  248  			if (!write_bio) {

Please call kfree(ctx) before the goto.

12a9801a7301f1 Nitesh Shetty 2022-02-07  249  				ret = -ENOMEM;
12a9801a7301f1 Nitesh Shetty 2022-02-07  250  				goto err_read_bio;
12a9801a7301f1 Nitesh Shetty 2022-02-07  251  			}
12a9801a7301f1 Nitesh Shetty 2022-02-07  252  
12a9801a7301f1 Nitesh Shetty 2022-02-07  253  			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
12a9801a7301f1 Nitesh Shetty 2022-02-07  254  			write_bio->bi_iter.bi_size = copy_len;
12a9801a7301f1 Nitesh Shetty 2022-02-07  255  			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
12a9801a7301f1 Nitesh Shetty 2022-02-07  256  			write_bio->bi_end_io = bio_copy_end_io;
12a9801a7301f1 Nitesh Shetty 2022-02-07  257  			write_bio->bi_private = ctx;
12a9801a7301f1 Nitesh Shetty 2022-02-07  258  			atomic_inc(&cio->refcount);
12a9801a7301f1 Nitesh Shetty 2022-02-07  259  			submit_bio(write_bio);
12a9801a7301f1 Nitesh Shetty 2022-02-07  260  		}
12a9801a7301f1 Nitesh Shetty 2022-02-07  261  	}
12a9801a7301f1 Nitesh Shetty 2022-02-07  262  
12a9801a7301f1 Nitesh Shetty 2022-02-07  263  	/* Wait for completion of all IO's*/
12a9801a7301f1 Nitesh Shetty 2022-02-07  264  	return cio_await_completion(cio);
12a9801a7301f1 Nitesh Shetty 2022-02-07  265  
12a9801a7301f1 Nitesh Shetty 2022-02-07  266  err_read_bio:
12a9801a7301f1 Nitesh Shetty 2022-02-07  267  	__free_page(token);
12a9801a7301f1 Nitesh Shetty 2022-02-07  268  err_token:
12a9801a7301f1 Nitesh Shetty 2022-02-07  269  	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
12a9801a7301f1 Nitesh Shetty 2022-02-07  270  
12a9801a7301f1 Nitesh Shetty 2022-02-07  271  	cio->io_err = ret;
12a9801a7301f1 Nitesh Shetty 2022-02-07 @272  	return cio_await_completion(cio);
12a9801a7301f1 Nitesh Shetty 2022-02-07  273  }

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Nitesh Shetty Feb. 9, 2022, 10:22 a.m. UTC | #5
On Tue, Feb 08, 2022 at 04:21:19PM +0900, Damien Le Moal wrote:
> On 2/7/22 23:13, Nitesh Shetty wrote:
> > Introduce blkdev_issue_copy which supports source and destination bdevs,
> > and a array of (source, destination and copy length) tuples.
> 
> s/a/an
>

acked

> > Introduce REQ_COP copy offload operation flag. Create a read-write
> 
> REQ_COPY ?
>

acked

> > bio pair with a token as payload and submitted to the device in order.
> > the read request populates token with source specific information which
> > is then passed with write request.
> > Ths design is courtsey Mikulas Patocka<mpatocka@>'s token based copy
> 
> s/Ths design is courtsey/This design is courtesy of
>

acked

> > 
> > Larger copy operation may be divided if necessary by looking at device
> > limits.
> 
> may or will ?
> by looking at -> depending on the ?
> 

Larger copies will be divided based on the max_copy_sectors and
max_copy_range_sectors limits. Will add this in the next series.

> > 
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
> > Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> > ---
> >  block/blk-lib.c           | 216 ++++++++++++++++++++++++++++++++++++++
> >  block/blk-settings.c      |   2 +
> >  block/blk.h               |   2 +
> >  include/linux/blk_types.h |  20 ++++
> >  include/linux/blkdev.h    |   3 +
> >  include/uapi/linux/fs.h   |  14 +++
> >  6 files changed, 257 insertions(+)
> > 
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index 1b8ced45e4e5..3ae2c27b566e 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -135,6 +135,222 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  }
> >  EXPORT_SYMBOL(blkdev_issue_discard);
> >  
> > +/*
> > + * Wait on and process all in-flight BIOs.  This must only be called once
> > + * all bios have been issued so that the refcount can only decrease.
> > + * This just waits for all bios to make it through bio_copy_end_io. IO
> > + * errors are propagated through cio->io_error.
> > + */
> > +static int cio_await_completion(struct cio *cio)
> > +{
> > +	int ret = 0;
> > +
> > +	while (atomic_read(&cio->refcount)) {
> > +		cio->waiter = current;
> > +		__set_current_state(TASK_UNINTERRUPTIBLE);
> > +		blk_io_schedule();
> > +		/* wake up sets us TASK_RUNNING */
> > +		cio->waiter = NULL;
> > +		ret = cio->io_err;
> 
> Why is this in the loop ?
>

agree.

> > +	}
> > +	kvfree(cio);
> > +
> > +	return ret;
> > +}
> > +
> > +static void bio_copy_end_io(struct bio *bio)
> > +{
> > +	struct copy_ctx *ctx = bio->bi_private;
> > +	struct cio *cio = ctx->cio;
> > +	sector_t clen;
> > +	int ri = ctx->range_idx;
> > +
> > +	if (bio->bi_status) {
> > +		cio->io_err = bio->bi_status;
> > +		clen = (bio->bi_iter.bi_sector - ctx->start_sec) << SECTOR_SHIFT;
> > +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> > +	}
> > +	__free_page(bio->bi_io_vec[0].bv_page);
> > +	kfree(ctx);
> > +	bio_put(bio);
> > +
> > +	if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
> > +		wake_up_process(cio->waiter);
> 
> This looks racy: the cio->waiter test and wakeup are not atomic.

Agreed. In the next version I will drop the atomic refcount and do the
check and the wakeup under a lock.

> > +}
> > +
> > +/*
> > + * blk_copy_offload	- Use device's native copy offload feature
> > + * Go through user provide payload, prepare new payload based on device's copy offload limits.
> > + */
> > +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> > +{
> > +	struct request_queue *sq = bdev_get_queue(src_bdev);
> > +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> > +	struct bio *read_bio, *write_bio;
> > +	struct copy_ctx *ctx;
> > +	struct cio *cio;
> > +	struct page *token;
> > +	sector_t src_blk, copy_len, dst_blk;
> > +	sector_t remaining, max_copy_len = LONG_MAX;
> > +	int ri = 0, ret = 0;
> > +
> > +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> > +	if (!cio)
> > +		return -ENOMEM;
> > +	atomic_set(&cio->refcount, 0);
> > +	cio->rlist = rlist;
> > +
> > +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
> > +			(sector_t)dq->limits.max_copy_sectors);
> 
> sq->limits.max_copy_sectors is already by definition smaller than
> LONG_MAX, so there is no need for the min3 here.
>

acked

> > +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> > +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> > +
> > +	for (ri = 0; ri < nr_srcs; ri++) {
> > +		cio->rlist[ri].comp_len = rlist[ri].len;
> > +		for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
> > +			remaining > 0;
> > +			remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {
> 
> This is unreadable.
> 

Sure, I will simplify the loops in next version.

> > +			copy_len = min(remaining, max_copy_len);
> > +
> > +			token = alloc_page(gfp_mask);
> > +			if (unlikely(!token)) {
> > +				ret = -ENOMEM;
> > +				goto err_token;
> > +			}
> > +
> > +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!read_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> > +			read_bio->bi_iter.bi_size = copy_len;
> > +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> > +			ret = submit_bio_wait(read_bio);
> > +			if (ret) {
> > +				bio_put(read_bio);
> > +				goto err_read_bio;
> > +			}
> > +			bio_put(read_bio);
> > +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> > +			if (!ctx) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> 
> This should be done before the read.
>

acked.

> > +			ctx->cio = cio;
> > +			ctx->range_idx = ri;
> > +			ctx->start_sec = rlist[ri].src;
> > +
> > +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!write_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +
> > +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> > +			write_bio->bi_iter.bi_size = copy_len;
> > +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> > +			write_bio->bi_end_io = bio_copy_end_io;
> > +			write_bio->bi_private = ctx;
> > +			atomic_inc(&cio->refcount);
> > +			submit_bio(write_bio);
> > +		}
> > +	}
> > +
> > +	/* Wait for completion of all IO's*/
> > +	return cio_await_completion(cio);
> > +
> > +err_read_bio:
> > +	__free_page(token);
> > +err_token:
> > +	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
> > +
> > +	cio->io_err = ret;
> > +	return cio_await_completion(cio);
> > +}
> > +
> > +static inline int blk_copy_sanity_check(struct block_device *src_bdev,
> > +		struct block_device *dst_bdev, struct range_entry *rlist, int nr)
> > +{
> > +	unsigned int align_mask = max(
> > +			bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
> > +	sector_t len = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < nr; i++) {
> > +		if (rlist[i].len)
> > +			len += rlist[i].len;
> > +		else
> > +			return -EINVAL;
> > +		if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
> > +				(rlist[i].len & align_mask))
> > +			return -EINVAL;
> > +		rlist[i].comp_len = 0;
> > +	}
> > +
> > +	if (!len && len >= MAX_COPY_TOTAL_LENGTH)
> > +		return -EINVAL;
> > +
> > +	return 0;
> > +}
> > +
> > +static inline bool blk_check_copy_offload(struct request_queue *src_q,
> > +		struct request_queue *dest_q)
> > +{
> > +	if (dest_q->limits.copy_offload == BLK_COPY_OFFLOAD &&
> > +			src_q->limits.copy_offload == BLK_COPY_OFFLOAD)
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +/*
> > + * blkdev_issue_copy - queue a copy
> > + * @src_bdev:	source block device
> > + * @nr_srcs:	number of source ranges to copy
> > + * @src_rlist:	array of source ranges
> > + * @dest_bdev:	destination block device
> > + * @gfp_mask:   memory allocation flags (for bio_alloc)
> > + * @flags:	BLKDEV_COPY_* flags to control behaviour
> > + *
> > + * Description:
> > + *	Copy source ranges from source block device to destination block device.
> > + *	length of a source range cannot be zero.
> > + */
> > +int blkdev_issue_copy(struct block_device *src_bdev, int nr,
> > +		struct range_entry *rlist, struct block_device *dest_bdev,
> > +		gfp_t gfp_mask, int flags)
> > +{
> > +	struct request_queue *src_q = bdev_get_queue(src_bdev);
> > +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> > +	int ret = -EINVAL;
> > +
> > +	if (!src_q || !dest_q)
> > +		return -ENXIO;
> > +
> > +	if (!nr)
> > +		return -EINVAL;
> > +
> > +	if (nr >= MAX_COPY_NR_RANGE)
> > +		return -EINVAL;
> > +
> > +	if (bdev_read_only(dest_bdev))
> > +		return -EPERM;
> > +
> > +	ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (blk_check_copy_offload(src_q, dest_q))
> > +		ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(blkdev_issue_copy);
> > +
> >  /**
> >   * __blkdev_issue_write_same - generate number of bios with same page
> >   * @bdev:	target blockdev
> > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > index 818454552cf8..4c8d48b8af25 100644
> > --- a/block/blk-settings.c
> > +++ b/block/blk-settings.c
> > @@ -545,6 +545,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
> >  	t->max_segment_size = min_not_zero(t->max_segment_size,
> >  					   b->max_segment_size);
> >  
> > +	t->max_copy_sectors = min_not_zero(t->max_copy_sectors, b->max_copy_sectors);
> 
> Why min_not_zero ? If one of the underlying drive does not support copy
> offload, you cannot report that the top drive does.
>

agreed. Will update in next series.

> > +
> >  	t->misaligned |= b->misaligned;
> >  
> >  	alignment = queue_limit_alignment_offset(b, start);
> > diff --git a/block/blk.h b/block/blk.h
> > index abb663a2a147..94d2b055750b 100644
> > --- a/block/blk.h
> > +++ b/block/blk.h
> > @@ -292,6 +292,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
> >  		break;
> >  	}
> >  
> > +	if (unlikely(op_is_copy(bio->bi_opf)))
> > +		return false;
> >  	/*
> >  	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
> >  	 * This is a quick and dirty check that relies on the fact that
> > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > index 5561e58d158a..0a3fee8ad61c 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -418,6 +418,7 @@ enum req_flag_bits {
> >  	/* for driver use */
> >  	__REQ_DRV,
> >  	__REQ_SWAP,		/* swapping request. */
> > +	__REQ_COPY,		/* copy request*/
> >  	__REQ_NR_BITS,		/* stops here */
> >  };
> >  
> > @@ -442,6 +443,7 @@ enum req_flag_bits {
> >  
> >  #define REQ_DRV			(1ULL << __REQ_DRV)
> >  #define REQ_SWAP		(1ULL << __REQ_SWAP)
> > +#define REQ_COPY		(1ULL << __REQ_COPY)
> >  
> >  #define REQ_FAILFAST_MASK \
> >  	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
> > @@ -498,6 +500,11 @@ static inline bool op_is_discard(unsigned int op)
> >  	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
> >  }
> >  
> > +static inline bool op_is_copy(unsigned int op)
> > +{
> > +	return (op & REQ_COPY);
> > +}
> > +
> >  /*
> >   * Check if a bio or request operation is a zone management operation, with
> >   * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
> > @@ -532,4 +539,17 @@ struct blk_rq_stat {
> >  	u64 batch;
> >  };
> >  
> > +struct cio {
> > +	atomic_t refcount;
> > +	blk_status_t io_err;
> > +	struct range_entry *rlist;
> > +	struct task_struct *waiter;     /* waiting task (NULL if none) */
> > +};
> > +
> > +struct copy_ctx {
> > +	int range_idx;
> > +	sector_t start_sec;
> > +	struct cio *cio;
> > +};
> > +
> >  #endif /* __LINUX_BLK_TYPES_H */
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index f63ae50f1de3..15597488040c 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -1120,6 +1120,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  		struct bio **biop);
> >  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
> >  		gfp_t gfp_mask);
> > +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *src_rlist, struct block_device *dest_bdev,
> > +		gfp_t gfp_mask, int flags);
> >  
> >  #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
> >  #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index bdf7b404b3e7..55bca8f6e8ed 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -64,6 +64,20 @@ struct fstrim_range {
> >  	__u64 minlen;
> >  };
> >  
> > +/* Maximum no of entries supported */
> > +#define MAX_COPY_NR_RANGE	(1 << 12)
> > +
> > +/* maximum total copy length */
> > +#define MAX_COPY_TOTAL_LENGTH	(1 << 21)
> > +
> > +/* Source range entry for copy */
> > +struct range_entry {
> > +	__u64 src;
> > +	__u64 dst;
> > +	__u64 len;
> > +	__u64 comp_len;
> > +};
> > +
> >  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
> >  #define FILE_DEDUPE_RANGE_SAME		0
> >  #define FILE_DEDUPE_RANGE_DIFFERS	1
> 
> 
> -- 
> Damien Le Moal
> Western Digital Research
>

 -- 
Thank you
Nitesh
Nitesh Shetty Feb. 9, 2022, 10:32 a.m. UTC | #6
On Wed, Feb 09, 2022 at 10:48:44AM +0300, Dan Carpenter wrote:
> Hi Nitesh,
> 
> url:    https://protect2.fireeye.com/v1/url?k=483798a4-17aca1b5-483613eb-0cc47a31cdbc-db5fd22936f47f46&q=1&e=e5a0c082-878d-4bbf-be36-3c8e34773475&u=https%3A%2F%2Fgithub.com%2F0day-ci%2Flinux%2Fcommits%2FNitesh-Shetty%2Fblock-make-bio_map_kern-non-static%2F20220207-231407
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
> config: i386-randconfig-m021-20220207 (https://protect2.fireeye.com/v1/url?k=24e309ba-7b7830ab-24e282f5-0cc47a31cdbc-9cc4e76aaefa8c0d&q=1&e=e5a0c082-878d-4bbf-be36-3c8e34773475&u=https%3A%2F%2Fdownload.01.org%2F0day-ci%2Farchive%2F20220209%2F202202090703.U5riBMIn-lkp%40intel.com%2Fconfig)
> compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> 
> smatch warnings:
> block/blk-lib.c:272 blk_copy_offload() warn: possible memory leak of 'ctx'
> 
> vim +/ctx +272 block/blk-lib.c
>

acked

> 12a9801a7301f1 Nitesh Shetty 2022-02-07  185  int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  186  		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  187  {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  188  	struct request_queue *sq = bdev_get_queue(src_bdev);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  189  	struct request_queue *dq = bdev_get_queue(dst_bdev);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  190  	struct bio *read_bio, *write_bio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  191  	struct copy_ctx *ctx;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  192  	struct cio *cio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  193  	struct page *token;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  194  	sector_t src_blk, copy_len, dst_blk;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  195  	sector_t remaining, max_copy_len = LONG_MAX;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  196  	int ri = 0, ret = 0;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  197  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  198  	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  199  	if (!cio)
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  200  		return -ENOMEM;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  201  	atomic_set(&cio->refcount, 0);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  202  	cio->rlist = rlist;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  203  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  204  	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  205  			(sector_t)dq->limits.max_copy_sectors);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  206  	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  207  			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  208  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  209  	for (ri = 0; ri < nr_srcs; ri++) {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  210  		cio->rlist[ri].comp_len = rlist[ri].len;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  211  		for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  212  			remaining > 0;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  213  			remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  214  			copy_len = min(remaining, max_copy_len);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  215  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  216  			token = alloc_page(gfp_mask);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  217  			if (unlikely(!token)) {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  218  				ret = -ENOMEM;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  219  				goto err_token;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  220  			}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  221  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  222  			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  223  					gfp_mask);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  224  			if (!read_bio) {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  225  				ret = -ENOMEM;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  226  				goto err_read_bio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  227  			}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  228  			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  229  			read_bio->bi_iter.bi_size = copy_len;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  230  			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  231  			ret = submit_bio_wait(read_bio);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  232  			if (ret) {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  233  				bio_put(read_bio);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  234  				goto err_read_bio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  235  			}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  236  			bio_put(read_bio);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  237  			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  238  			if (!ctx) {
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  239  				ret = -ENOMEM;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  240  				goto err_read_bio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  241  			}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  242  			ctx->cio = cio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  243  			ctx->range_idx = ri;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  244  			ctx->start_sec = rlist[ri].src;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  245  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  246  			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  247  					gfp_mask);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  248  			if (!write_bio) {
> 
> Please call kfree(ctx) before the goto.
> 
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  249  				ret = -ENOMEM;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  250  				goto err_read_bio;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  251  			}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  252  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  253  			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  254  			write_bio->bi_iter.bi_size = copy_len;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  255  			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  256  			write_bio->bi_end_io = bio_copy_end_io;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  257  			write_bio->bi_private = ctx;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  258  			atomic_inc(&cio->refcount);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  259  			submit_bio(write_bio);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  260  		}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  261  	}
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  262  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  263  	/* Wait for completion of all IO's*/
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  264  	return cio_await_completion(cio);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  265  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  266  err_read_bio:
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  267  	__free_page(token);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  268  err_token:
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  269  	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  270  
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  271  	cio->io_err = ret;
> 12a9801a7301f1 Nitesh Shetty 2022-02-07 @272  	return cio_await_completion(cio);
> 12a9801a7301f1 Nitesh Shetty 2022-02-07  273  }
> 
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://protect2.fireeye.com/v1/url?k=4cd82b59-13431248-4cd9a016-0cc47a31cdbc-7ef30a0abcb321a3&q=1&e=e5a0c082-878d-4bbf-be36-3c8e34773475&u=https%3A%2F%2Flists.01.org%2Fhyperkitty%2Flist%2Fkbuild-all%40lists.01.org
> 
> 
> 

--
Thank you
Nitesh
diff mbox series

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 1b8ced45e4e5..3ae2c27b566e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -135,6 +135,222 @@  int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/*
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through bio_copy_end_io. IO
+ * errors are propagated through cio->io_err.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+	int ret = 0;
+
+	while (atomic_read(&cio->refcount)) {
+		cio->waiter = current;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		blk_io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		cio->waiter = NULL;
+		ret = cio->io_err;
+	}
+	kvfree(cio);
+
+	return ret;
+}
+
+static void bio_copy_end_io(struct bio *bio)
+{
+	struct copy_ctx *ctx = bio->bi_private;
+	struct cio *cio = ctx->cio;
+	sector_t clen;
+	int ri = ctx->range_idx;
+
+	if (bio->bi_status) {
+		cio->io_err = bio->bi_status;
+		clen = (bio->bi_iter.bi_sector - ctx->start_sec) << SECTOR_SHIFT;
+		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
+	}
+	__free_page(bio->bi_io_vec[0].bv_page);
+	kfree(ctx);
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
+		wake_up_process(cio->waiter);
+}
+
+/*
+ * blk_copy_offload	- Use device's native copy offload feature
+ * Go through the user-provided payload and prepare a new payload based on the device's copy offload limits.
+ */
+int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
+{
+	struct request_queue *sq = bdev_get_queue(src_bdev);
+	struct request_queue *dq = bdev_get_queue(dst_bdev);
+	struct bio *read_bio, *write_bio;
+	struct copy_ctx *ctx;
+	struct cio *cio;
+	struct page *token;
+	sector_t src_blk, copy_len, dst_blk;
+	sector_t remaining, max_copy_len = LONG_MAX;
+	int ri = 0, ret = 0;
+
+	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	atomic_set(&cio->refcount, 0);
+	cio->rlist = rlist;
+
+	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
+			(sector_t)dq->limits.max_copy_sectors);
+	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
+			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
+
+	for (ri = 0; ri < nr_srcs; ri++) {
+		cio->rlist[ri].comp_len = rlist[ri].len;
+		for (remaining = rlist[ri].len, src_blk = rlist[ri].src, dst_blk = rlist[ri].dst;
+			remaining > 0;
+			remaining -= copy_len, src_blk += copy_len, dst_blk += copy_len) {
+			copy_len = min(remaining, max_copy_len);
+
+			token = alloc_page(gfp_mask);
+			if (unlikely(!token)) {
+				ret = -ENOMEM;
+				goto err_token;
+			}
+
+			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
+					gfp_mask);
+			if (!read_bio) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
+			read_bio->bi_iter.bi_size = copy_len;
+			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
+			ret = submit_bio_wait(read_bio);
+			if (ret) {
+				bio_put(read_bio);
+				goto err_read_bio;
+			}
+			bio_put(read_bio);
+			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
+			if (!ctx) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+			ctx->cio = cio;
+			ctx->range_idx = ri;
+			ctx->start_sec = rlist[ri].src;
+
+			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
+					gfp_mask);
+			if (!write_bio) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+
+			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
+			write_bio->bi_iter.bi_size = copy_len;
+			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
+			write_bio->bi_end_io = bio_copy_end_io;
+			write_bio->bi_private = ctx;
+			atomic_inc(&cio->refcount);
+			submit_bio(write_bio);
+		}
+	}
+
+	/* Wait for completion of all IO's*/
+	return cio_await_completion(cio);
+
+err_read_bio:
+	__free_page(token);
+err_token:
+	rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len - remaining));
+
+	cio->io_err = ret;
+	return cio_await_completion(cio);
+}
+
+static inline int blk_copy_sanity_check(struct block_device *src_bdev,
+		struct block_device *dst_bdev, struct range_entry *rlist, int nr)
+{
+	unsigned int align_mask = max(
+			bdev_logical_block_size(dst_bdev), bdev_logical_block_size(src_bdev)) - 1;
+	sector_t len = 0;
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		if (rlist[i].len)
+			len += rlist[i].len;
+		else
+			return -EINVAL;
+		if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) ||
+				(rlist[i].len & align_mask))
+			return -EINVAL;
+		rlist[i].comp_len = 0;
+	}
+
+	if (!len && len >= MAX_COPY_TOTAL_LENGTH)
+		return -EINVAL;
+
+	return 0;
+}
+
+static inline bool blk_check_copy_offload(struct request_queue *src_q,
+		struct request_queue *dest_q)
+{
+	if (dest_q->limits.copy_offload == BLK_COPY_OFFLOAD &&
+			src_q->limits.copy_offload == BLK_COPY_OFFLOAD)
+		return true;
+
+	return false;
+}
+
+/*
+ * blkdev_issue_copy - queue a copy
+ * @src_bdev:	source block device
+ * @nr:	number of source ranges to copy
+ * @rlist:	array of range entries (source, destination, length)
+ * @dest_bdev:	destination block device
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_COPY_* flags to control behaviour
+ *
+ * Description:
+ *	Copy source ranges from source block device to destination block device.
+ *	length of a source range cannot be zero.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, int nr,
+		struct range_entry *rlist, struct block_device *dest_bdev,
+		gfp_t gfp_mask, int flags)
+{
+	struct request_queue *src_q = bdev_get_queue(src_bdev);
+	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
+	int ret = -EINVAL;
+
+	if (!src_q || !dest_q)
+		return -ENXIO;
+
+	if (!nr)
+		return -EINVAL;
+
+	if (nr >= MAX_COPY_NR_RANGE)
+		return -EINVAL;
+
+	if (bdev_read_only(dest_bdev))
+		return -EPERM;
+
+	ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
+	if (ret)
+		return ret;
+
+	if (blk_check_copy_offload(src_q, dest_q))
+		ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, gfp_mask);
+
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
 /**
  * __blkdev_issue_write_same - generate number of bios with same page
  * @bdev:	target blockdev
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 818454552cf8..4c8d48b8af25 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -545,6 +545,8 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_segment_size = min_not_zero(t->max_segment_size,
 					   b->max_segment_size);
 
+	t->max_copy_sectors = min_not_zero(t->max_copy_sectors, b->max_copy_sectors);
+
 	t->misaligned |= b->misaligned;
 
 	alignment = queue_limit_alignment_offset(b, start);
diff --git a/block/blk.h b/block/blk.h
index abb663a2a147..94d2b055750b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -292,6 +292,8 @@  static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
 		break;
 	}
 
+	if (unlikely(op_is_copy(bio->bi_opf)))
+		return false;
 	/*
 	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
 	 * This is a quick and dirty check that relies on the fact that
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5561e58d158a..0a3fee8ad61c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -418,6 +418,7 @@  enum req_flag_bits {
 	/* for driver use */
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
+	__REQ_COPY,		/* copy request*/
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -442,6 +443,7 @@  enum req_flag_bits {
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
+#define REQ_COPY		(1ULL << __REQ_COPY)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -498,6 +500,11 @@  static inline bool op_is_discard(unsigned int op)
 	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
 
+static inline bool op_is_copy(unsigned int op)
+{
+	return (op & REQ_COPY);
+}
+
 /*
  * Check if a bio or request operation is a zone management operation, with
  * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
@@ -532,4 +539,17 @@  struct blk_rq_stat {
 	u64 batch;
 };
 
+struct cio {
+	atomic_t refcount;
+	blk_status_t io_err;
+	struct range_entry *rlist;
+	struct task_struct *waiter;     /* waiting task (NULL if none) */
+};
+
+struct copy_ctx {
+	int range_idx;
+	sector_t start_sec;
+	struct cio *cio;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f63ae50f1de3..15597488040c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1120,6 +1120,9 @@  extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		struct bio **biop);
 struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 		gfp_t gfp_mask);
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *src_rlist, struct block_device *dest_bdev,
+		gfp_t gfp_mask, int flags);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index bdf7b404b3e7..55bca8f6e8ed 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,20 @@  struct fstrim_range {
 	__u64 minlen;
 };
 
+/* Maximum no of entries supported */
+#define MAX_COPY_NR_RANGE	(1 << 12)
+
+/* maximum total copy length */
+#define MAX_COPY_TOTAL_LENGTH	(1 << 21)
+
+/* Source range entry for copy */
+struct range_entry {
+	__u64 src;
+	__u64 dst;
+	__u64 len;
+	__u64 comp_len;
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1