@@ -115,6 +115,249 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);
+/*
+ * For synchronous copy offload/emulation, wait until every in-flight bio has
+ * completed, then consume the result.
+ *
+ * This must only be called once all bios have been issued so that the refcount
+ * can only decrease. Each bio drops its reference in
+ * blkdev_copy_(offload/emulate)_write_endio (or in the read endio on error),
+ * and whoever drops the last one wakes cio->waiter.
+ *
+ * For an asynchronous copy (cio->endio set) nothing is waited for and 0 is
+ * returned; the completion path reports the result through cio->endio and
+ * frees @cio.
+ *
+ * Returns cio->comp_len for a synchronous copy. @cio is freed before returning
+ * in that case and must not be used by the caller afterwards.
+ */
+static ssize_t blkdev_copy_wait_completion(struct cio *cio)
+{
+	ssize_t ret;
+
+	if (cio->endio)
+		return 0;
+
+	/*
+	 * The task state has to be published *before* the refcount is tested,
+	 * otherwise a completion that lands between the test and the state
+	 * change is lost. The test is also repeated after every wakeup:
+	 * blk_io_schedule() may return before the final completion, and
+	 * freeing @cio at that point would leave the remaining bios with a
+	 * dangling pointer.
+	 */
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		/* Acquire pairs with the ordered decrement in the endio path. */
+		if (!atomic_read_acquire(&cio->refcount))
+			break;
+		blk_io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	ret = cio->comp_len;
+	kfree(cio);
+
+	return ret;
+}
+
+/*
+ * Completion handler for the write half of an offloaded copy chunk.
+ *
+ * On error, lowers cio->comp_len to the offset (relative to cio->pos_out) at
+ * which this chunk starts. Releases the token buffer, the write bio and the
+ * per-chunk context, then drops this chunk's reference on the cio. The handler
+ * that drops the last reference either calls cio->endio and frees the cio
+ * (asynchronous copy) or wakes the waiting task (synchronous copy).
+ *
+ * NOTE(review): cio->comp_len is updated without locking; confirm that
+ * completions of different chunks cannot run concurrently, or serialise it.
+ */
+static void blkdev_copy_offload_write_endio(struct bio *bio)
+{
+	struct copy_ctx *ctx = bio->bi_private;
+	struct cio *cio = ctx->cio;
+	/*
+	 * Sample these while this chunk still holds its reference. In the
+	 * synchronous case the waiter owns @cio and may free it as soon as
+	 * the refcount reaches zero, so @cio must not be dereferenced after
+	 * the final decrement on that path.
+	 */
+	cio_iodone_t *endio = cio->endio;
+	struct task_struct *waiter = cio->waiter;
+	sector_t clen;
+
+	if (bio->bi_status) {
+		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out;
+		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
+	}
+	/* The token buffer is the single page attached to the bio. */
+	kfree(bvec_virt(&bio->bi_io_vec[0]));
+	bio_put(bio);
+
+	kfree(ctx);
+	if (!atomic_dec_and_test(&cio->refcount))
+		return;
+	if (endio) {
+		/* Asynchronous copy: the last completion owns and frees @cio. */
+		endio(cio->private, cio->comp_len);
+		kfree(cio);
+	} else {
+		blk_wake_io_task(waiter);
+	}
+}
+
+/*
+ * Completion handler for the read half of an offloaded copy chunk.
+ *
+ * On success the paired write bio is handed to a work item for submission
+ * (see blkdev_copy_dispatch_work) and only the read bio is released here.
+ *
+ * On error the chunk is abandoned: cio->comp_len is lowered to the offset
+ * (relative to cio->pos_in) at which this chunk starts, the token buffer, both
+ * bios and the per-chunk context are released, and this chunk's reference on
+ * the cio is dropped. The handler that drops the last reference either calls
+ * cio->endio and frees the cio (asynchronous copy) or wakes the waiting task
+ * (synchronous copy).
+ */
+static void blkdev_copy_offload_read_endio(struct bio *read_bio)
+{
+	struct copy_ctx *ctx = read_bio->bi_private;
+	struct cio *cio = ctx->cio;
+	/*
+	 * Sample these while this chunk still holds its reference. In the
+	 * synchronous case the waiter owns @cio and may free it as soon as
+	 * the refcount reaches zero, so @cio must not be dereferenced after
+	 * the final decrement on that path.
+	 */
+	cio_iodone_t *endio = cio->endio;
+	struct task_struct *waiter = cio->waiter;
+	sector_t clen;
+
+	if (!read_bio->bi_status) {
+		/* @ctx belongs to the work item from here on; do not touch it. */
+		schedule_work(&ctx->dispatch_work);
+		bio_put(read_bio);
+		return;
+	}
+
+	clen = (read_bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_in;
+	cio->comp_len = min_t(sector_t, clen, cio->comp_len);
+	/* The token buffer is the single page shared by both bios. */
+	kfree(bvec_virt(&read_bio->bi_io_vec[0]));
+	bio_put(ctx->write_bio);
+	bio_put(read_bio);
+	kfree(ctx);
+
+	if (!atomic_dec_and_test(&cio->refcount))
+		return;
+	if (endio) {
+		/* Asynchronous copy: the last completion owns and frees @cio. */
+		endio(cio->private, cio->comp_len);
+		kfree(cio);
+	} else {
+		blk_wake_io_task(waiter);
+	}
+}
+
+/*
+ * Work item callback: submit the write bio recorded in the copy_ctx that
+ * embeds @work. Scheduled from blkdev_copy_offload_read_endio once the read
+ * half of a chunk has completed successfully.
+ */
+static void blkdev_copy_dispatch_work(struct work_struct *work)
+{
+	struct copy_ctx *chunk;
+	struct bio *wbio;
+
+	chunk = container_of(work, struct copy_ctx, dispatch_work);
+	wbio = chunk->write_bio;
+
+	submit_bio(wbio);
+}
+
+/*
+ * Drop the submitter's reference on @cio once no further bios will be issued.
+ *
+ * Synchronous copy (no endio): waits for the outstanding bios and returns the
+ * completed length. Asynchronous copy: returns 0; if every bio had already
+ * completed, the result is reported through cio->endio from here.
+ */
+static ssize_t blkdev_copy_submit_done(struct cio *cio)
+{
+	/*
+	 * Sample before the drop: an asynchronous cio may be freed by a
+	 * completion handler as soon as our reference is gone.
+	 */
+	cio_iodone_t *endio = cio->endio;
+	bool last = atomic_dec_and_test(&cio->refcount);
+
+	if (!endio)
+		return blkdev_copy_wait_completion(cio);
+
+	if (last) {
+		endio(cio->private, cio->comp_len);
+		kfree(cio);
+	}
+	return 0;
+}
+
+/*
+ * __blkdev_copy_offload - Use the device's native copy offload feature.
+ *
+ * The range is split into chunks no larger than the smaller of the two
+ * devices' copy limits. Each chunk is copied by sending two bios:
+ * 1. First we send a read bio with REQ_COPY flag along with a token and source
+ *    and length. Once the read bio reaches the driver layer, the device driver
+ *    adds all the source info to the token and does a fake completion.
+ * 2. Once the read operation completes, we issue a write with REQ_COPY flag
+ *    with the same token. In the driver layer, the token info is used to form
+ *    a copy offload command.
+ *
+ * The cio carries one reference per submitted chunk plus one held by this
+ * function while it is still submitting, so that an early completion can
+ * neither report the copy as finished nor free the cio under the submitter.
+ *
+ * Returns the number of bytes copied for a synchronous copy, 0 for an
+ * asynchronous copy whose result is delivered through @endio, -ENOMEM if
+ * nothing could be submitted, or -EOPNOTSUPP if the devices report a zero
+ * copy limit. If an allocation fails after some chunks were submitted, the
+ * result covers only the chunks issued before the failure.
+ */
+static ssize_t __blkdev_copy_offload(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)
+{
+	struct cio *cio;
+	struct copy_ctx *ctx;
+	struct bio *read_bio, *write_bio;
+	void *token;
+	sector_t copy_len;
+	sector_t rem, max_copy_len;
+
+	/* Widen before shifting so a large sector limit cannot overflow. */
+	max_copy_len = (sector_t)min(bdev_max_copy_sectors(bdev_in),
+			bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT;
+	/* A zero limit would make the submission loop below spin forever. */
+	if (!max_copy_len)
+		return -EOPNOTSUPP;
+
+	/* Honour the caller's allocation context for the cio as well. */
+	cio = kzalloc(sizeof(*cio), gfp_mask);
+	if (!cio)
+		return -ENOMEM;
+	/* Submitter's reference, dropped in blkdev_copy_submit_done(). */
+	atomic_set(&cio->refcount, 1);
+	cio->waiter = current;
+	cio->endio = endio;
+	cio->private = private;
+	cio->pos_in = pos_in;
+	cio->pos_out = pos_out;
+	/*
+	 * If there is an error, comp_len is lowered to the offset of the first
+	 * chunk that failed, i.e. the length known to have been copied.
+	 */
+	cio->comp_len = len;
+
+	for (rem = len; rem > 0; rem -= copy_len) {
+		copy_len = min(rem, max_copy_len);
+
+		token = kmalloc(COPY_TOKEN_SIZE, gfp_mask);
+		if (unlikely(!token))
+			goto err_token;
+
+		ctx = kzalloc(sizeof(*ctx), gfp_mask);
+		if (!ctx)
+			goto err_ctx;
+		read_bio = bio_alloc(bdev_in, 1, REQ_OP_READ | REQ_COPY
+				| REQ_SYNC | REQ_NOMERGE, gfp_mask);
+		if (!read_bio)
+			goto err_read_bio;
+		write_bio = bio_alloc(bdev_out, 1, REQ_OP_WRITE
+				| REQ_COPY | REQ_SYNC | REQ_NOMERGE, gfp_mask);
+		if (!write_bio)
+			goto err_write_bio;
+
+		ctx->cio = cio;
+		ctx->write_bio = write_bio;
+		INIT_WORK(&ctx->dispatch_work, blkdev_copy_dispatch_work);
+
+		/*
+		 * Both bios carry the token as their only page; bi_size is
+		 * then overridden with the length of the range to copy.
+		 */
+		__bio_add_page(read_bio, virt_to_page(token), COPY_TOKEN_SIZE,
+				offset_in_page(token));
+		read_bio->bi_iter.bi_size = copy_len;
+		read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
+		read_bio->bi_end_io = blkdev_copy_offload_read_endio;
+		read_bio->bi_private = ctx;
+
+		__bio_add_page(write_bio, virt_to_page(token), COPY_TOKEN_SIZE,
+				offset_in_page(token));
+		write_bio->bi_iter.bi_size = copy_len;
+		write_bio->bi_end_io = blkdev_copy_offload_write_endio;
+		write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
+		write_bio->bi_private = ctx;
+
+		/* One reference per chunk, dropped by its endio handler. */
+		atomic_inc(&cio->refcount);
+		submit_bio(read_bio);
+		pos_in += copy_len;
+		pos_out += copy_len;
+	}
+
+	/* All chunks issued: drop our reference and wait/report. */
+	return blkdev_copy_submit_done(cio);
+
+err_write_bio:
+	bio_put(read_bio);
+err_read_bio:
+	kfree(ctx);
+err_ctx:
+	kfree(token);
+err_token:
+	if (rem == len) {
+		/* Nothing was submitted, so nobody else can reference @cio. */
+		kfree(cio);
+		return -ENOMEM;
+	}
+	/* Only the chunks issued so far can contribute to the result. */
+	cio->comp_len = min_t(sector_t, cio->comp_len, (len - rem));
+	/* Drop our reference and wait for / report the submitted IOs. */
+	return blkdev_copy_submit_done(cio);
+}
+
+/*
+ * Validate the arguments of a copy request.
+ *
+ * Returns -EPERM if the destination device is read-only, -EINVAL if @len is
+ * zero or not below COPY_MAX_BYTES, or if @pos_in, @pos_out or @len is not a
+ * multiple of the larger of the two logical block sizes, and 0 otherwise.
+ *
+ * NOTE(review): a length of exactly COPY_MAX_BYTES is rejected; confirm that
+ * this exclusive bound is intended.
+ */
+static inline ssize_t blkdev_copy_sanity_check(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len)
+{
+	/* Low-bit mask of the stricter (larger) logical block size. */
+	unsigned int mask = max(bdev_logical_block_size(bdev_out),
+			bdev_logical_block_size(bdev_in)) - 1;
+
+	if (bdev_read_only(bdev_out))
+		return -EPERM;
+
+	if (!len || len >= COPY_MAX_BYTES)
+		return -EINVAL;
+
+	if ((pos_in & mask) || (pos_out & mask) || (len & mask))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * blkdev_copy_offload - copy a byte range between block devices using offload
+ * @bdev_in:	source block device
+ * @pos_in:	source offset
+ * @bdev_out:	destination block device
+ * @pos_out:	destination offset
+ * @len:	length in bytes to be copied
+ * @endio:	endio function to be called on completion of copy operation,
+ *		for synchronous operation this should be NULL
+ * @private:	endio function will be called with this private data, should be
+ *		NULL, if operation is synchronous in nature
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ *
+ * Returns the length of bytes copied or a negative error code:
+ * the error from blkdev_copy_sanity_check() if the arguments are invalid,
+ * -EOPNOTSUPP if either device's queue does not advertise copy support, or
+ * the error returned by the offload path.
+ *
+ * Description:
+ *	Copy @len bytes from @pos_in on @bdev_in to @pos_out on @bdev_out using
+ *	the devices' native copy offload. There is no emulation fallback in
+ *	this function; a caller that needs one must handle -EOPNOTSUPP itself.
+ *	The length accepted for a single call is bounded by COPY_MAX_BYTES
+ *	(enforced by blkdev_copy_sanity_check()).
+ */
+ssize_t blkdev_copy_offload(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)
+{
+	struct request_queue *q_in = bdev_get_queue(bdev_in);
+	struct request_queue *q_out = bdev_get_queue(bdev_out);
+	ssize_t ret;
+
+	ret = blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len);
+	if (ret)
+		return ret;
+
+	/*
+	 * Report the lack of support explicitly. Returning 0 here would look
+	 * like a successful zero-length copy, and an asynchronous caller would
+	 * wait for an @endio that is never invoked.
+	 */
+	if (!blk_queue_copy(q_in) || !blk_queue_copy(q_out))
+		return -EOPNOTSUPP;
+
+	return __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out,
+			len, endio, private, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
+
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned flags)
@@ -303,6 +303,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
break;
}
+ if (unlikely(op_is_copy(bio->bi_opf)))
+ return false;
/*
* All drivers must accept single-segments bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
@@ -427,6 +427,7 @@ enum req_flag_bits {
*/
/* for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
+ __REQ_COPY, /* copy request */
__REQ_NR_BITS, /* stops here */
};
@@ -451,6 +452,7 @@ enum req_flag_bits {
#define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED)
#define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE)
#define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP)
+#define REQ_COPY ((__force blk_opf_t)(1ULL << __REQ_COPY))
#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV)
#define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
@@ -481,6 +483,11 @@ static inline bool op_is_write(blk_opf_t op)
return !!(op & (__force blk_opf_t)1);
}
+/*
+ * Check whether the operation flags carry REQ_COPY, i.e. the bio or request
+ * is part of a copy request.
+ */
+static inline bool op_is_copy(blk_opf_t op)
+{
+	return !!(op & REQ_COPY);
+}
+
/*
* Check if the bio or request is one that needs special treatment in the
* flush state machine.
@@ -540,4 +547,22 @@ struct blk_rq_stat {
u64 batch;
};
+/*
+ * Completion callback for an asynchronous copy: invoked with the caller's
+ * private pointer and the completed length once the last bio has finished.
+ */
+typedef void (cio_iodone_t)(void *private, int comp_len);
+
+/*
+ * Per-copy state shared by all bios issued for one copy request.
+ */
+struct cio {
+ struct task_struct *waiter; /* task that issued the copy; woken on completion of a synchronous copy */
+ atomic_t refcount; /* outstanding references; dropping the last one completes the copy */
+ loff_t pos_in; /* starting source offset of the copy, in bytes */
+ loff_t pos_out; /* starting destination offset of the copy, in bytes */
+ ssize_t comp_len; /* starts as the requested length, lowered when a bio fails */
+ cio_iodone_t *endio; /* applicable for async operation */
+ void *private; /* applicable for async operation */
+};
+
+/*
+ * Per-chunk context of an offloaded copy; stored in bi_private of both the
+ * read and the write bio of the chunk.
+ */
+struct copy_ctx {
+ struct cio *cio; /* copy request this chunk belongs to */
+ struct work_struct dispatch_work; /* submits write_bio after the read completes */
+ struct bio *write_bio; /* write half of the chunk, paired with the read bio */
+};
+
#endif /* __LINUX_BLK_TYPES_H */
@@ -1053,6 +1053,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp);
+/*
+ * Copy len bytes from pos_in on bdev_in to pos_out on bdev_out using copy
+ * offload. Pass a NULL end_io for a synchronous copy; otherwise end_io is
+ * called with private once the copy has completed.
+ */
+ssize_t blkdev_copy_offload(
+ struct block_device *bdev_in, loff_t pos_in,
+ struct block_device *bdev_out, loff_t pos_out,
+ size_t len, cio_iodone_t end_io, void *private, gfp_t gfp_mask);
#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
@@ -64,9 +64,12 @@ struct fstrim_range {
__u64 minlen;
};
-/* maximum total copy length, this is set to 128 MB based on current testing */
+/* maximum copy offload length, this is set to 128MB based on current testing */
#define COPY_MAX_BYTES (1 << 27)
+/* copy offload token size */
+#define COPY_TOKEN_SIZE SECTOR_SIZE
+
/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
#define FILE_DEDUPE_RANGE_SAME 0
#define FILE_DEDUPE_RANGE_DIFFERS 1