[V2] qemu-img: make convert async

Message ID	1488193394-28453-1-git-send-email-pl@kamp.de (mailing list archive)
State	New, archived
Headers	show Return-Path: <qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org> avast: 1.2.2/17010300. spamassassin: 3.4.1. Clear:RC:1(195.62.97.28):. Processed in 0.102332 secs); 27 Feb 2017 11:03:19 -0000 From: Peter Lieven <pl@kamp.de> To: qemu-devel@nongnu.org Date: Mon, 27 Feb 2017 12:03:14 +0100 Message-Id: <1488193394-28453-1-git-send-email-pl@kamp.de> Subject: [Qemu-devel] [PATCH V2] qemu-img: make convert async Precedence: list Cc: kwolf@redhat.com, famz@redhat.com, qemu-block@nongnu.org, stefanha@gmail.com, Peter Lieven <pl@kamp.de>, ct@flyingcircus.io, mreitz@redhat.com Errors-To: qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+patchwork-qemu-devel=patchwork.kernel.org@nongnu.org>

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index f054599..9c9702c 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -40,9 +40,9 @@ STEXI ETEXI DEF("convert", img_convert, - "convert [--object objectdef] [--image-opts] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-o options] [-s snapshot_id_or_name] [-l snapshot_param] [-S sparse_size] filename [filename2 [...]] output_filename") + "convert [--object objectdef] [--image-opts] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-o options] [-s snapshot_id_or_name] [-l snapshot_param] [-S sparse_size] [-m num_coroutines] [-W] filename [filename2 [...]] output_filename") STEXI -@item convert [--object @var{objectdef}] [--image-opts] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_id_or_name}] [-l @var{snapshot_param}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename} +@item convert [--object @var{objectdef}] [--image-opts] [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_id_or_name}] [-l @var{snapshot_param}] [-S @var{sparse_size}] [-m @var{num_coroutines}] [-W] @var{filename} [@var{filename2} [...]] @var{output_filename} ETEXI DEF("dd", img_dd, diff --git a/qemu-img.c b/qemu-img.c index cff22e3..1826202 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -156,6 +156,11 @@ static void QEMU_NORETURN help(void) " kinds of errors, with a higher risk of choosing the wrong fix or\n" " hiding corruption that has already occurred.\n" "\n" + "Parameters to convert subcommand:\n" + " '-m' specifies how many coroutines work in parallel during the convert\n" + " process (defaults to 8)\n" + " '-W' allow to write to the target out of order rather than sequential\n" + "\n" "Parameters to snapshot subcommand:\n" " 'snapshot' is the name of the snapshot to create, apply or delete\n" " '-a' applies a snapshot (revert disk to saved state)\n" @@ -1448,48 +1453,61 @@ enum ImgConvertBlockStatus { BLK_BACKING_FILE, }; +#define MAX_COROUTINES 16 + typedef struct ImgConvertState { BlockBackend **src; int64_t *src_sectors; - int src_cur, src_num; - int64_t src_cur_offset; + int src_num; int64_t total_sectors; int64_t allocated_sectors; + int64_t allocated_done; + int64_t sector_num; + int64_t wr_offs; enum ImgConvertBlockStatus status; int64_t sector_next_status; BlockBackend *target; bool has_zero_init; bool compressed; bool target_has_backing; + bool wr_in_order; int min_sparse; size_t cluster_sectors; size_t buf_sectors; + int num_coroutines; + int running_coroutines; + Coroutine *co[MAX_COROUTINES]; + int64_t wait_sector_num[MAX_COROUTINES]; + CoMutex lock; + int ret; } ImgConvertState; -static void convert_select_part(ImgConvertState *s, int64_t sector_num) +static void convert_select_part(ImgConvertState *s, int64_t sector_num, + int *src_cur, int64_t *src_cur_offset) { - assert(sector_num >= s->src_cur_offset); - while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) { - s->src_cur_offset += s->src_sectors[s->src_cur]; - s->src_cur++; - assert(s->src_cur < s->src_num); + *src_cur = 0; + *src_cur_offset = 0; + while (sector_num - *src_cur_offset >= s->src_sectors[*src_cur]) { + *src_cur_offset += s->src_sectors[*src_cur]; + (*src_cur)++; + assert(*src_cur < s->src_num); } } static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) { - int64_t ret; - int n; + int64_t ret, src_cur_offset; + int n, src_cur; - convert_select_part(s, sector_num); + convert_select_part(s, sector_num, &src_cur, &src_cur_offset); assert(s->total_sectors > sector_num); n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); if (s->sector_next_status <= sector_num) { BlockDriverState *file; - ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]), - sector_num - s->src_cur_offset, + ret = bdrv_get_block_status(blk_bs(s->src[src_cur]), + sector_num - src_cur_offset, n, &n, &file); if (ret < 0) { return ret; @@ -1505,8 +1523,8 @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) /* Check block status of the backing file chain to avoid * needlessly reading zeroes and limiting the iteration to the * buffer size */ - ret = bdrv_get_block_status_above(blk_bs(s->src[s->src_cur]), NULL, - sector_num - s->src_cur_offset, + ret = bdrv_get_block_status_above(blk_bs(s->src[src_cur]), NULL, + sector_num - src_cur_offset, n, &n, &file); if (ret < 0) { return ret; @@ -1544,28 +1562,34 @@ static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) return n; } -static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, - uint8_t *buf) +static int coroutine_fn convert_co_read(ImgConvertState *s, int64_t sector_num, + int nb_sectors, uint8_t *buf) { - int n; - int ret; + int n, ret; + QEMUIOVector qiov; + struct iovec iov; assert(nb_sectors <= s->buf_sectors); while (nb_sectors > 0) { BlockBackend *blk; - int64_t bs_sectors; + int src_cur; + int64_t bs_sectors, src_cur_offset; /* In the case of compression with multiple source files, we can get a * nb_sectors that spreads into the next part. So we must be able to * read across multiple BDSes for one convert_read() call. */ - convert_select_part(s, sector_num); - blk = s->src[s->src_cur]; - bs_sectors = s->src_sectors[s->src_cur]; - - n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset)); - ret = blk_pread(blk, - (sector_num - s->src_cur_offset) << BDRV_SECTOR_BITS, - buf, n << BDRV_SECTOR_BITS); + convert_select_part(s, sector_num, &src_cur, &src_cur_offset); + blk = s->src[src_cur]; + bs_sectors = s->src_sectors[src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset)); + iov.iov_base = buf; + iov.iov_len = n << BDRV_SECTOR_BITS; + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = blk_co_preadv( + blk, (sector_num - src_cur_offset) << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, &qiov, 0); if (ret < 0) { return ret; } @@ -1578,15 +1602,18 @@ static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, return 0; } -static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, - const uint8_t *buf) + +static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num, + int nb_sectors, uint8_t *buf, + enum ImgConvertBlockStatus status) { int ret; + QEMUIOVector qiov; + struct iovec iov; while (nb_sectors > 0) { int n = nb_sectors; - - switch (s->status) { + switch (status) { case BLK_BACKING_FILE: /* If we have a backing file, leave clusters unallocated that are * unallocated in the source image, so that the backing file is @@ -1607,9 +1634,13 @@ static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, break; } - ret = blk_pwrite_compressed(s->target, - sector_num << BDRV_SECTOR_BITS, - buf, n << BDRV_SECTOR_BITS); + iov.iov_base = buf; + iov.iov_len = n << BDRV_SECTOR_BITS; + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = blk_co_pwritev(s->target, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, &qiov, + BDRV_REQ_WRITE_COMPRESSED); if (ret < 0) { return ret; } @@ -1622,8 +1653,12 @@ static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, if (!s->min_sparse || is_allocated_sectors_min(buf, n, &n, s->min_sparse)) { - ret = blk_pwrite(s->target, sector_num << BDRV_SECTOR_BITS, - buf, n << BDRV_SECTOR_BITS, 0); + iov.iov_base = buf; + iov.iov_len = n << BDRV_SECTOR_BITS; + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = blk_co_pwritev(s->target, sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, &qiov, 0); if (ret < 0) { return ret; } @@ -1635,8 +1670,9 @@ static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, if (s->has_zero_init) { break; } - ret = blk_pwrite_zeroes(s->target, sector_num << BDRV_SECTOR_BITS, - n << BDRV_SECTOR_BITS, 0); + ret = blk_co_pwrite_zeroes(s->target, + sector_num << BDRV_SECTOR_BITS, + n << BDRV_SECTOR_BITS, 0); if (ret < 0) { return ret; } @@ -1651,12 +1687,122 @@ static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, return 0; } -static int convert_do_copy(ImgConvertState *s) +static void coroutine_fn convert_co_do_copy(void *opaque) { + ImgConvertState *s = opaque; uint8_t *buf = NULL; - int64_t sector_num, allocated_done; - int ret; - int n; + int ret, i; + int index = -1; + + for (i = 0; i < s->num_coroutines; i++) { + if (s->co[i] == qemu_coroutine_self()) { + index = i; + break; + } + } + assert(index >= 0); + + s->running_coroutines++; + buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); + + while (1) { + int n; + int64_t sector_num; + enum ImgConvertBlockStatus status; + + qemu_co_mutex_lock(&s->lock); + if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) { + qemu_co_mutex_unlock(&s->lock); + goto out; + } + n = convert_iteration_sectors(s, s->sector_num); + if (n < 0) { + qemu_co_mutex_unlock(&s->lock); + s->ret = n; + goto out; + } + /* save current sector and allocation status to local variables */ + sector_num = s->sector_num; + status = s->status; + if (!s->min_sparse && s->status == BLK_ZERO) { + n = MIN(n, s->buf_sectors); + } + /* increment global sector counter so that other coroutines can + * already continue reading beyond this request */ + s->sector_num += n; + qemu_co_mutex_unlock(&s->lock); + + if (status == BLK_DATA || (!s->min_sparse && status == BLK_ZERO)) { + s->allocated_done += n; + qemu_progress_print(100.0 * s->allocated_done / + s->allocated_sectors, 0); + } + + if (status == BLK_DATA) { + ret = convert_co_read(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while reading sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + s->ret = ret; + goto out; + } + } else if (!s->min_sparse && status == BLK_ZERO) { + status = BLK_DATA; + memset(buf, 0x00, n * BDRV_SECTOR_SIZE); + } + + if (s->wr_in_order) { + /* keep writes in order */ + while (s->wr_offs != sector_num) { + if (s->ret != -EINPROGRESS) { + goto out; + } + s->wait_sector_num[index] = sector_num; + qemu_coroutine_yield(); + } + s->wait_sector_num[index] = -1; + } + + ret = convert_co_write(s, sector_num, n, buf, status); + if (ret < 0) { + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + s->ret = ret; + goto out; + } + + if (s->wr_in_order) { + /* reenter the coroutine that might have waited + * for this write to complete */ + s->wr_offs = sector_num + n; + for (i = 0; i < s->num_coroutines; i++) { + if (s->co[i] && s->wait_sector_num[i] == s->wr_offs) { + /* + * A -> B -> A cannot occur because A has + * s->wait_sector_num[i] == -1 during A -> B. Therefore + * B will never enter A during this time window. + */ + qemu_coroutine_enter(s->co[i]); + break; + } + } + } + } + +out: + qemu_vfree(buf); + s->co[index] = NULL; + s->running_coroutines--; + if (!s->running_coroutines && s->ret == -EINPROGRESS) { + /* the convert job finished successfully */ + s->ret = 0; + } +} + +static int convert_do_copy(ImgConvertState *s) +{ + int ret, i, n; + int64_t sector_num = 0; /* Check whether we have zero initialisation or can get it efficiently */ s->has_zero_init = s->min_sparse && !s->target_has_backing @@ -1677,21 +1823,15 @@ static int convert_do_copy(ImgConvertState *s) if (s->compressed) { if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) { error_report("invalid cluster size"); - ret = -EINVAL; - goto fail; + return -EINVAL; } s->buf_sectors = s->cluster_sectors; } - buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); - /* Calculate allocated sectors for progress */ - s->allocated_sectors = 0; - sector_num = 0; while (sector_num < s->total_sectors) { n = convert_iteration_sectors(s, sector_num); if (n < 0) { - ret = n; - goto fail; + return n; } if (s->status == BLK_DATA || (!s->min_sparse && s->status == BLK_ZERO)) { @@ -1701,61 +1841,29 @@ static int convert_do_copy(ImgConvertState *s) } /* Do the copy */ - s->src_cur = 0; - s->src_cur_offset = 0; s->sector_next_status = 0; + s->ret = -EINPROGRESS; - sector_num = 0; - allocated_done = 0; - - while (sector_num < s->total_sectors) { - n = convert_iteration_sectors(s, sector_num); - if (n < 0) { - ret = n; - goto fail; - } - if (s->status == BLK_DATA || (!s->min_sparse && s->status == BLK_ZERO)) - { - allocated_done += n; - qemu_progress_print(100.0 * allocated_done / s->allocated_sectors, - 0); - } - - if (s->status == BLK_DATA) { - ret = convert_read(s, sector_num, n, buf); - if (ret < 0) { - error_report("error while reading sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto fail; - } - } else if (!s->min_sparse && s->status == BLK_ZERO) { - n = MIN(n, s->buf_sectors); - memset(buf, 0, n * BDRV_SECTOR_SIZE); - s->status = BLK_DATA; - } - - ret = convert_write(s, sector_num, n, buf); - if (ret < 0) { - error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto fail; - } + qemu_co_mutex_init(&s->lock); + for (i = 0; i < s->num_coroutines; i++) { + s->co[i] = qemu_coroutine_create(convert_co_do_copy, s); + s->wait_sector_num[i] = -1; + qemu_coroutine_enter(s->co[i]); + } - sector_num += n; + while (s->ret == -EINPROGRESS) { + main_loop_wait(false); } - if (s->compressed) { + if (s->compressed && !s->ret) { /* signal EOF to align */ ret = blk_pwrite_compressed(s->target, 0, NULL, 0); if (ret < 0) { - goto fail; + return ret; } } - ret = 0; -fail: - qemu_vfree(buf); - return ret; + return s->ret; } static int img_convert(int argc, char **argv) @@ -1783,6 +1891,8 @@ static int img_convert(int argc, char **argv) QemuOpts *sn_opts = NULL; ImgConvertState state; bool image_opts = false; + bool wr_in_order = true; + int num_coroutines = 8; fmt = NULL; out_fmt = "raw"; @@ -1798,7 +1908,7 @@ static int img_convert(int argc, char **argv) {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "hf:O:B:ce6o:s:l:S:pt:T:qn", + c = getopt_long(argc, argv, "hf:O:B:ce6o:s:l:S:pt:T:qnm:W", long_options, NULL); if (c == -1) { break; @@ -1890,6 +2000,18 @@ static int img_convert(int argc, char **argv) case 'n': skip_create = 1; break; + case 'm': + num_coroutines = atoi(optarg); + if (num_coroutines < 1 || num_coroutines > MAX_COROUTINES) { + error_report("Allowed number of coroutines is between 1 and %d", + MAX_COROUTINES); + ret = -1; + goto fail_getopt; + } + break; + case 'W': + wr_in_order = false; + break; case OPTION_OBJECT: opts = qemu_opts_parse_noisily(&qemu_object_opts, optarg, true); @@ -1909,6 +2031,12 @@ static int img_convert(int argc, char **argv) goto fail_getopt; } + if (!wr_in_order && compress) { + error_report("Out of order write and compress are mutually exclusive"); + ret = -1; + goto fail_getopt; + } + /* Initialize before goto out */ if (quiet) { progress = 0; @@ -2149,6 +2277,8 @@ static int img_convert(int argc, char **argv) .min_sparse = min_sparse, .cluster_sectors = cluster_sectors, .buf_sectors = bufsectors, + .wr_in_order = wr_in_order, + .num_coroutines = num_coroutines, }; ret = convert_do_copy(&state); diff --git a/qemu-img.texi b/qemu-img.texi index 174aae3..42ea20d 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -137,6 +137,12 @@ Parameters to convert subcommand: @item -n Skip the creation of the target volume +@item -m +Number of parallel coroutines for the convert process +@item -W +Allow to write out of order to the destination. This is option improves performance, +but is only recommened for preallocated devices like host devices or other +raw block devices. @end table Parameters to dd subcommand: @@ -296,7 +302,7 @@ Error on reading data @end table -@item convert [-c] [-p] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_id_or_name}] [-l @var{snapshot_param}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename} +@item convert [-c] [-p] [-n] [-f @var{fmt}] [-t @var{cache}] [-T @var{src_cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_id_or_name}] [-l @var{snapshot_param}] [-m @var{num_coroutines}] [-W] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename} Convert the disk image @var{filename} or a snapshot @var{snapshot_param}(@var{snapshot_id_or_name} is deprecated) to disk image @var{output_filename} using format @var{output_fmt}. It can be optionally compressed (@code{-c} @@ -326,6 +332,14 @@ skipped. This is useful for formats such as @code{rbd} if the target volume has already been created with site specific options that cannot be supplied through qemu-img. +Out of order writes can be enabled with @code{-W} to improve performance. +This is only recommended for preallocated devices like host devices or other +raw block devices. Out of order write does not work in combination with +creating compressed images. + +@var{num_coroutines} specifies how many coroutines work in parallel during +the convert process (defaults to 8). + @item dd [-f @var{fmt}] [-O @var{output_fmt}] [bs=@var{block_size}] [count=@var{blocks}] [skip=@var{blocks}] if=@var{input} of=@var{output} Dd copies from @var{input} file to @var{output} file converting it from

[V2] qemu-img: make convert async

Commit Message

Comments

Patch