Message ID | 20190427113625.46594-1-sgarzare@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | block/rbd: add preallocation support | expand |
On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote: > > This patch adds the support of preallocation (off/full) for the RBD > block driver. > If available, we use rbd_writesame() to quickly fill the image when > full preallocation is required. > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > --- > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > qapi/block-core.json | 4 +- > 2 files changed, 136 insertions(+), 17 deletions(-) > > diff --git a/block/rbd.c b/block/rbd.c > index 0c549c9935..29dd1bb040 100644 > --- a/block/rbd.c > +++ b/block/rbd.c > @@ -13,6 +13,7 @@ > > #include "qemu/osdep.h" > > +#include "qemu/units.h" > #include <rbd/librbd.h> > #include "qapi/error.h" > #include "qemu/error-report.h" > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > } > } > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > + PreallocMode prealloc, Error **errp) > +{ > + uint64_t current_length; > + char *buf = NULL; > + int ret; > + > + ret = rbd_get_size(image, ¤t_length); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to get file length"); > + goto out; > + } > + > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > + error_setg(errp, "Cannot use preallocation for shrinking files"); > + ret = -ENOTSUP; > + goto out; > + } > + > + switch (prealloc) { > + case PREALLOC_MODE_FULL: { > + uint64_t current_offset = current_length; > + int buf_size = 64 * KiB; This should probably be 512B or 4KiB (which is the smallest striped unit size). Not only will this avoid sending unnecessary zeroes to the backing cluster, writesame silently turns into a standard write if your buffer isn't properly aligned with the min(object size, stripe unit size). > + ssize_t bytes; > + > + buf = g_malloc(buf_size); > + /* > + * Some versions of rbd_writesame() discards writes of buffers with > + * all zeroes. In order to avoid this behaviour, we set the first byte > + * to one. > + */ > + buf[0] = 1; You could also use "rados_conf_set(cluster, "rbd_discard_on_zeroed_write_same", "false"). > + ret = rbd_resize(image, offset); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to resize file"); > + goto out; > + } > + > +#ifdef LIBRBD_SUPPORTS_WRITESAME > + while (offset - current_offset > buf_size) { > + /* > + * rbd_writesame() supports only request where the size of the > + * operation is multiple of buffer size and it must be less or > + * equal to INT_MAX. > + */ > + bytes = MIN(offset - current_offset, INT_MAX); > + bytes -= bytes % buf_size; Using the default object size of 4MiB, this write size would result in up to 512 concurrent ops to the backing cluster. Perhaps the size should be bounded such that only a dozen or so concurrent requests are issued per write, always rounded next largest object / stripe period size. librbd and the rbd CLI usually try to bound themselves to the value in the "rbd_concurrent_management_ops" configuration setting (currently defaults to 10). > + bytes = rbd_writesame(image, current_offset, bytes, buf, buf_size, > + 0); > + if (bytes < 0) { > + ret = bytes; > + error_setg_errno(errp, -ret, > + "Failed to write for preallocation"); > + goto out; > + } > + > + current_offset += bytes; > + } > +#endif /* LIBRBD_SUPPORTS_WRITESAME */ > + > + while (current_offset < offset) { > + bytes = rbd_write(image, current_offset, > + MIN(offset - current_offset, buf_size), buf); > + if (bytes < 0) { > + ret = bytes; > + error_setg_errno(errp, -ret, > + "Failed to write for preallocation"); > + goto out; > + } > + > + current_offset += bytes; > + } > + > + ret = rbd_flush(image); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to flush the file"); > + goto out; > + } > + > + break; > + } > + case PREALLOC_MODE_OFF: > + ret = rbd_resize(image, offset); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to resize file"); > + goto out; > + } > + break; > + default: > + error_setg(errp, "Unsupported preallocation mode: %s", > + PreallocMode_str(prealloc)); > + ret = -ENOTSUP; > + goto out; > + } > + > + ret = 0; > + > +out: > + g_free(buf); > + return ret; > +} > + > static QemuOptsList runtime_opts = { > .name = "rbd", > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > @@ -376,6 +481,7 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options, > BlockdevCreateOptionsRbd *opts = &options->u.rbd; > rados_t cluster; > rados_ioctx_t io_ctx; > + rbd_image_t image; > int obj_order = 0; > int ret; > > @@ -404,13 +510,21 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options, > return ret; > } > > - ret = rbd_create(io_ctx, opts->location->image, opts->size, &obj_order); > + ret = rbd_create(io_ctx, opts->location->image, 0, &obj_order); > if (ret < 0) { > error_setg_errno(errp, -ret, "error rbd create"); > goto out; > } > > - ret = 0; > + ret = rbd_open(io_ctx, opts->location->image, &image, NULL); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "error rbd open"); > + goto out; > + } > + > + ret = qemu_rbd_do_truncate(image, opts->size, opts->preallocation, errp); > + > + rbd_close(image); > out: > rados_ioctx_destroy(io_ctx); > rados_shutdown(cluster); > @@ -431,6 +545,7 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, > BlockdevOptionsRbd *loc; > Error *local_err = NULL; > const char *keypairs, *password_secret; > + char *prealloc; > QDict *options = NULL; > int ret = 0; > > @@ -449,6 +564,16 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, > BLOCK_OPT_CLUSTER_SIZE, 0); > rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0); > > + prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); > + rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc, > + PREALLOC_MODE_OFF, &local_err); > + g_free(prealloc); > + if (local_err) { > + ret = -EINVAL; > + error_propagate(errp, local_err); > + goto exit; > + } > + > options = qdict_new(); > qemu_rbd_parse_filename(filename, options, &local_err); > if (local_err) { > @@ -1052,21 +1177,8 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, > Error **errp) > { > BDRVRBDState *s = bs->opaque; > - int r; > - > - if (prealloc != PREALLOC_MODE_OFF) { > - error_setg(errp, "Unsupported preallocation mode '%s'", > - PreallocMode_str(prealloc)); > - return -ENOTSUP; > - } > - > - r = rbd_resize(s->image, offset); > - if (r < 0) { > - error_setg_errno(errp, -r, "Failed to resize file"); > - return r; > - } > > - return 0; > + return qemu_rbd_do_truncate(s->image, offset, prealloc, errp); > } > > static int qemu_rbd_snap_create(BlockDriverState *bs, > @@ -1219,6 +1331,11 @@ static QemuOptsList qemu_rbd_create_opts = { > .type = QEMU_OPT_SIZE, > .help = "RBD object size" > }, > + { > + .name = BLOCK_OPT_PREALLOC, > + .type = QEMU_OPT_STRING, > + .help = "Preallocation mode (allowed values: off, full)" > + }, > { > .name = "password-secret", > .type = QEMU_OPT_STRING, > diff --git a/qapi/block-core.json b/qapi/block-core.json > index 7ccbfff9d0..db25a4065b 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -4277,13 +4277,15 @@ > # point to a snapshot. > # @size Size of the virtual disk in bytes > # @cluster-size RBD object size > +# @preallocation Preallocation mode (allowed values: off, full) > # > # Since: 2.12 > ## > { 'struct': 'BlockdevCreateOptionsRbd', > 'data': { 'location': 'BlockdevOptionsRbd', > 'size': 'size', > - '*cluster-size' : 'size' } } > + '*cluster-size' : 'size', > + '*preallocation': 'PreallocMode' } } > > ## > # @BlockdevVmdkSubformat: > -- > 2.20.1 > >
On Sat, Apr 27, 2019 at 08:43:26AM -0400, Jason Dillaman wrote: > On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote: > > > > This patch adds the support of preallocation (off/full) for the RBD > > block driver. > > If available, we use rbd_writesame() to quickly fill the image when > > full preallocation is required. > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > > --- > > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > > qapi/block-core.json | 4 +- > > 2 files changed, 136 insertions(+), 17 deletions(-) > > > > diff --git a/block/rbd.c b/block/rbd.c > > index 0c549c9935..29dd1bb040 100644 > > --- a/block/rbd.c > > +++ b/block/rbd.c > > @@ -13,6 +13,7 @@ > > > > #include "qemu/osdep.h" > > > > +#include "qemu/units.h" > > #include <rbd/librbd.h> > > #include "qapi/error.h" > > #include "qemu/error-report.h" > > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > > } > > } > > > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > > + PreallocMode prealloc, Error **errp) > > +{ > > + uint64_t current_length; > > + char *buf = NULL; > > + int ret; > > + > > + ret = rbd_get_size(image, ¤t_length); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Failed to get file length"); > > + goto out; > > + } > > + > > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > > + error_setg(errp, "Cannot use preallocation for shrinking files"); > > + ret = -ENOTSUP; > > + goto out; > > + } > > + > > + switch (prealloc) { > > + case PREALLOC_MODE_FULL: { > > + uint64_t current_offset = current_length; > > + int buf_size = 64 * KiB; > > This should probably be 512B or 4KiB (which is the smallest striped > unit size). Not only will this avoid sending unnecessary zeroes to the > backing cluster, writesame silently turns into a standard write if > your buffer isn't properly aligned with the min(object size, stripe > unit size). > Okay, I'll change it on v2. Should we query about the "stripe_unit" size or we simply use the smallest allowed? > > + ssize_t bytes; > > + > > + buf = g_malloc(buf_size); > > + /* > > + * Some versions of rbd_writesame() discards writes of buffers with > > + * all zeroes. In order to avoid this behaviour, we set the first byte > > + * to one. > > + */ > > + buf[0] = 1; > > You could also use "rados_conf_set(cluster, > "rbd_discard_on_zeroed_write_same", "false"). > I tried it, but it is not supported on all versions. (eg. I have Ceph v12.2.11 on my Fedora 29 and it is not supported, but rbd_writesame() is available) Maybe we can use both: "rbd_discard_on_zeroed_write_same = false" and "buf[0] = 1" > > + ret = rbd_resize(image, offset); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Failed to resize file"); > > + goto out; > > + } > > + > > +#ifdef LIBRBD_SUPPORTS_WRITESAME > > + while (offset - current_offset > buf_size) { > > + /* > > + * rbd_writesame() supports only request where the size of the > > + * operation is multiple of buffer size and it must be less or > > + * equal to INT_MAX. > > + */ > > + bytes = MIN(offset - current_offset, INT_MAX); > > + bytes -= bytes % buf_size; > > Using the default object size of 4MiB, this write size would result in > up to 512 concurrent ops to the backing cluster. Perhaps the size > should be bounded such that only a dozen or so concurrent requests are > issued per write, always rounded next largest object / stripe period > size. librbd and the rbd CLI usually try to bound themselves to the > value in the "rbd_concurrent_management_ops" configuration setting > (currently defaults to 10). > Do you suggest to use "rbd_concurrent_management_ops" to limit concurrent requests or use a new QEMU parameters for the RBD driver? Thanks for your comments, Stefano
On Mon, Apr 29, 2019 at 8:47 AM Stefano Garzarella <sgarzare@redhat.com> wrote: > > On Sat, Apr 27, 2019 at 08:43:26AM -0400, Jason Dillaman wrote: > > On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote: > > > > > > This patch adds the support of preallocation (off/full) for the RBD > > > block driver. > > > If available, we use rbd_writesame() to quickly fill the image when > > > full preallocation is required. > > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > > > --- > > > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > > > qapi/block-core.json | 4 +- > > > 2 files changed, 136 insertions(+), 17 deletions(-) > > > > > > diff --git a/block/rbd.c b/block/rbd.c > > > index 0c549c9935..29dd1bb040 100644 > > > --- a/block/rbd.c > > > +++ b/block/rbd.c > > > @@ -13,6 +13,7 @@ > > > > > > #include "qemu/osdep.h" > > > > > > +#include "qemu/units.h" > > > #include <rbd/librbd.h> > > > #include "qapi/error.h" > > > #include "qemu/error-report.h" > > > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > > > } > > > } > > > > > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > > > + PreallocMode prealloc, Error **errp) > > > +{ > > > + uint64_t current_length; > > > + char *buf = NULL; > > > + int ret; > > > + > > > + ret = rbd_get_size(image, ¤t_length); > > > + if (ret < 0) { > > > + error_setg_errno(errp, -ret, "Failed to get file length"); > > > + goto out; > > > + } > > > + > > > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > > > + error_setg(errp, "Cannot use preallocation for shrinking files"); > > > + ret = -ENOTSUP; > > > + goto out; > > > + } > > > + > > > + switch (prealloc) { > > > + case PREALLOC_MODE_FULL: { > > > + uint64_t current_offset = current_length; > > > + int buf_size = 64 * KiB; > > > > This should probably be 512B or 4KiB (which is the smallest striped > > unit size). Not only will this avoid sending unnecessary zeroes to the > > backing cluster, writesame silently turns into a standard write if > > your buffer isn't properly aligned with the min(object size, stripe > > unit size). > > > > Okay, I'll change it on v2. > Should we query about the "stripe_unit" size or we simply use the > smallest allowed? Technically we don't prevent a user from choosing terrible stripe unit sizes (e.g. 1 byte), so you are probably safe to just use 4KiB. > > > + ssize_t bytes; > > > + > > > + buf = g_malloc(buf_size); > > > + /* > > > + * Some versions of rbd_writesame() discards writes of buffers with > > > + * all zeroes. In order to avoid this behaviour, we set the first byte > > > + * to one. > > > + */ > > > + buf[0] = 1; > > > > You could also use "rados_conf_set(cluster, > > "rbd_discard_on_zeroed_write_same", "false"). > > > > I tried it, but it is not supported on all versions. (eg. I have Ceph > v12.2.11 on my Fedora 29 and it is not supported, but rbd_writesame() is > available) > > Maybe we can use both: "rbd_discard_on_zeroed_write_same = false" and > "buf[0] = 1" Probably not worth the effort if it's not supported across all releases. > > > + ret = rbd_resize(image, offset); > > > + if (ret < 0) { > > > + error_setg_errno(errp, -ret, "Failed to resize file"); > > > + goto out; > > > + } > > > + > > > +#ifdef LIBRBD_SUPPORTS_WRITESAME > > > + while (offset - current_offset > buf_size) { > > > + /* > > > + * rbd_writesame() supports only request where the size of the > > > + * operation is multiple of buffer size and it must be less or > > > + * equal to INT_MAX. > > > + */ > > > + bytes = MIN(offset - current_offset, INT_MAX); > > > + bytes -= bytes % buf_size; > > > > Using the default object size of 4MiB, this write size would result in > > up to 512 concurrent ops to the backing cluster. Perhaps the size > > should be bounded such that only a dozen or so concurrent requests are > > issued per write, always rounded next largest object / stripe period > > size. librbd and the rbd CLI usually try to bound themselves to the > > value in the "rbd_concurrent_management_ops" configuration setting > > (currently defaults to 10). > > > > Do you suggest to use "rbd_concurrent_management_ops" to limit > concurrent requests or use a new QEMU parameters for the RBD driver? I think it would be nicer to just query the "rbd_concurrent_management_ops" limit to derive your writesame size since the Ceph cluster admin can globally set that option to match the available parallelism of the cluster. > Thanks for your comments, > Stefano
On Mon, Apr 29, 2019 at 09:00:26AM -0400, Jason Dillaman wrote: > On Mon, Apr 29, 2019 at 8:47 AM Stefano Garzarella <sgarzare@redhat.com> wrote: > > > > On Sat, Apr 27, 2019 at 08:43:26AM -0400, Jason Dillaman wrote: > > > On Sat, Apr 27, 2019 at 7:37 AM Stefano Garzarella <sgarzare@redhat.com> wrote: > > > > > > > > This patch adds the support of preallocation (off/full) for the RBD > > > > block driver. > > > > If available, we use rbd_writesame() to quickly fill the image when > > > > full preallocation is required. > > > > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > > > > --- > > > > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > > > > qapi/block-core.json | 4 +- > > > > 2 files changed, 136 insertions(+), 17 deletions(-) > > > > > > > > diff --git a/block/rbd.c b/block/rbd.c > > > > index 0c549c9935..29dd1bb040 100644 > > > > --- a/block/rbd.c > > > > +++ b/block/rbd.c > > > > @@ -13,6 +13,7 @@ > > > > > > > > #include "qemu/osdep.h" > > > > > > > > +#include "qemu/units.h" > > > > #include <rbd/librbd.h> > > > > #include "qapi/error.h" > > > > #include "qemu/error-report.h" > > > > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > > > > } > > > > } > > > > > > > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > > > > + PreallocMode prealloc, Error **errp) > > > > +{ > > > > + uint64_t current_length; > > > > + char *buf = NULL; > > > > + int ret; > > > > + > > > > + ret = rbd_get_size(image, ¤t_length); > > > > + if (ret < 0) { > > > > + error_setg_errno(errp, -ret, "Failed to get file length"); > > > > + goto out; > > > > + } > > > > + > > > > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > > > > + error_setg(errp, "Cannot use preallocation for shrinking files"); > > > > + ret = -ENOTSUP; > > > > + goto out; > > > > + } > > > > + > > > > + switch (prealloc) { > > > > + case PREALLOC_MODE_FULL: { > > > > + uint64_t current_offset = current_length; > > > > + int buf_size = 64 * KiB; > > > > > > This should probably be 512B or 4KiB (which is the smallest striped > > > unit size). Not only will this avoid sending unnecessary zeroes to the > > > backing cluster, writesame silently turns into a standard write if > > > your buffer isn't properly aligned with the min(object size, stripe > > > unit size). > > > > > > > Okay, I'll change it on v2. > > Should we query about the "stripe_unit" size or we simply use the > > smallest allowed? > > Technically we don't prevent a user from choosing terrible stripe unit > sizes (e.g. 1 byte), so you are probably safe to just use 4KiB. > > > > > + ssize_t bytes; > > > > + > > > > + buf = g_malloc(buf_size); > > > > + /* > > > > + * Some versions of rbd_writesame() discards writes of buffers with > > > > + * all zeroes. In order to avoid this behaviour, we set the first byte > > > > + * to one. > > > > + */ > > > > + buf[0] = 1; > > > > > > You could also use "rados_conf_set(cluster, > > > "rbd_discard_on_zeroed_write_same", "false"). > > > > > > > I tried it, but it is not supported on all versions. (eg. I have Ceph > > v12.2.11 on my Fedora 29 and it is not supported, but rbd_writesame() is > > available) > > > > Maybe we can use both: "rbd_discard_on_zeroed_write_same = false" and > > "buf[0] = 1" > > Probably not worth the effort if it's not supported across all releases. > > > > > + ret = rbd_resize(image, offset); > > > > + if (ret < 0) { > > > > + error_setg_errno(errp, -ret, "Failed to resize file"); > > > > + goto out; > > > > + } > > > > + > > > > +#ifdef LIBRBD_SUPPORTS_WRITESAME > > > > + while (offset - current_offset > buf_size) { > > > > + /* > > > > + * rbd_writesame() supports only request where the size of the > > > > + * operation is multiple of buffer size and it must be less or > > > > + * equal to INT_MAX. > > > > + */ > > > > + bytes = MIN(offset - current_offset, INT_MAX); > > > > + bytes -= bytes % buf_size; > > > > > > Using the default object size of 4MiB, this write size would result in > > > up to 512 concurrent ops to the backing cluster. Perhaps the size > > > should be bounded such that only a dozen or so concurrent requests are > > > issued per write, always rounded next largest object / stripe period > > > size. librbd and the rbd CLI usually try to bound themselves to the > > > value in the "rbd_concurrent_management_ops" configuration setting > > > (currently defaults to 10). > > > > > > > Do you suggest to use "rbd_concurrent_management_ops" to limit > > concurrent requests or use a new QEMU parameters for the RBD driver? > > I think it would be nicer to just query the > "rbd_concurrent_management_ops" limit to derive your writesame size > since the Ceph cluster admin can globally set that option to match the > available parallelism of the cluster. > Thanks for the details, I'll send a v2 following yuor comments! Stefano
Cc: Peter for a libvirt perspective. Stefano Garzarella <sgarzare@redhat.com> writes: > This patch adds the support of preallocation (off/full) for the RBD > block driver. > If available, we use rbd_writesame() to quickly fill the image when > full preallocation is required. > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > --- > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > qapi/block-core.json | 4 +- > 2 files changed, 136 insertions(+), 17 deletions(-) > > diff --git a/block/rbd.c b/block/rbd.c > index 0c549c9935..29dd1bb040 100644 > --- a/block/rbd.c > +++ b/block/rbd.c > @@ -13,6 +13,7 @@ > > #include "qemu/osdep.h" > > +#include "qemu/units.h" > #include <rbd/librbd.h> > #include "qapi/error.h" > #include "qemu/error-report.h" > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > } > } > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > + PreallocMode prealloc, Error **errp) > +{ > + uint64_t current_length; > + char *buf = NULL; > + int ret; > + > + ret = rbd_get_size(image, ¤t_length); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Failed to get file length"); > + goto out; > + } > + > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > + error_setg(errp, "Cannot use preallocation for shrinking files"); > + ret = -ENOTSUP; > + goto out; > + } > + > + switch (prealloc) { > + case PREALLOC_MODE_FULL: { [...] > + case PREALLOC_MODE_OFF: [...] > + default: > + error_setg(errp, "Unsupported preallocation mode: %s", > + PreallocMode_str(prealloc)); > + ret = -ENOTSUP; > + goto out; > + } Other block drivers also accept only some values of PreallocMode. Okay. I wonder whether management applications need to know which values are supported. Let me review support in drivers: * file (file-win32.c) * iscsi * nfs * qed * ssh - Reject all but PREALLOC_MODE_OFF * copy-on-read * luks (crypto.c) * raw - Pass through only * file host_cdrom host_device (file-posix.c) - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular files - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE - Reject PREALLOC_MODE_METADATA * gluster - Reject all but PREALLOC_MODE_OFF when shrinking - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL - Reject PREALLOC_MODE_METADATA * qcow2 - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing file * rbd with this patch - Reject all but PREALLOC_MODE_OFF when shrinking - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC * sheepdog - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC - Doesn't support shrinking * vdi - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL - Doesn't support shrinking * blkdebug * blklogwrites * blkverify * bochs * cloop * dmg * ftp * ftps * http * https * luks * nbd * null-aio * null-co * nvme * parallels * qcow * quorum * replication * throttle * vhdx * vmdk * vpc * vvfat * vxhs - These appear not to use PreallocMode: they don't implement .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or implement it without a prealloc parameter. Looks good to me. > + > + ret = 0; > + > +out: > + g_free(buf); > + return ret; > +} > + > static QemuOptsList runtime_opts = { > .name = "rbd", > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), [...] > diff --git a/qapi/block-core.json b/qapi/block-core.json > index 7ccbfff9d0..db25a4065b 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -4277,13 +4277,15 @@ > # point to a snapshot. > # @size Size of the virtual disk in bytes > # @cluster-size RBD object size > +# @preallocation Preallocation mode (allowed values: off, full) > # > # Since: 2.12 > ## > { 'struct': 'BlockdevCreateOptionsRbd', > 'data': { 'location': 'BlockdevOptionsRbd', > 'size': 'size', > - '*cluster-size' : 'size' } } > + '*cluster-size' : 'size', > + '*preallocation': 'PreallocMode' } } > > ## > # @BlockdevVmdkSubformat: The non-support of values 'metadata' and 'falloc' is not visible in introspection, only in documentation. No reason to block this patch, as the other block drivers have the same introspection weakness (only sheepdog and vdi bother to document). Should we address the introspection weakness? Only if there's a use for the information, I think. Should we improve documentation for the other block drivers?
On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > Cc: Peter for a libvirt perspective. > > Stefano Garzarella <sgarzare@redhat.com> writes: > > > This patch adds the support of preallocation (off/full) for the RBD > > block driver. > > If available, we use rbd_writesame() to quickly fill the image when > > full preallocation is required. > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > > --- > > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > > qapi/block-core.json | 4 +- > > 2 files changed, 136 insertions(+), 17 deletions(-) > > > > diff --git a/block/rbd.c b/block/rbd.c > > index 0c549c9935..29dd1bb040 100644 > > --- a/block/rbd.c > > +++ b/block/rbd.c > > @@ -13,6 +13,7 @@ > > > > #include "qemu/osdep.h" > > > > +#include "qemu/units.h" > > #include <rbd/librbd.h> > > #include "qapi/error.h" > > #include "qemu/error-report.h" > > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > > } > > } > > > > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > > + PreallocMode prealloc, Error **errp) > > +{ > > + uint64_t current_length; > > + char *buf = NULL; > > + int ret; > > + > > + ret = rbd_get_size(image, ¤t_length); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Failed to get file length"); > > + goto out; > > + } > > + > > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > > + error_setg(errp, "Cannot use preallocation for shrinking files"); > > + ret = -ENOTSUP; > > + goto out; > > + } > > + > > + switch (prealloc) { > > + case PREALLOC_MODE_FULL: { > [...] > > + case PREALLOC_MODE_OFF: > [...] > > + default: > > + error_setg(errp, "Unsupported preallocation mode: %s", > > + PreallocMode_str(prealloc)); > > + ret = -ENOTSUP; > > + goto out; > > + } > > Other block drivers also accept only some values of PreallocMode. Okay. > > I wonder whether management applications need to know which values are > supported. Good point! > > Let me review support in drivers: > > * file (file-win32.c) > * iscsi > * nfs > * qed > * ssh > > - Reject all but PREALLOC_MODE_OFF > > * copy-on-read > * luks (crypto.c) > * raw > > - Pass through only > > * file host_cdrom host_device (file-posix.c) > > - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular > files > - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE > - Reject PREALLOC_MODE_METADATA > > * gluster > > - Reject all but PREALLOC_MODE_OFF when shrinking > - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE > - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL > - Reject PREALLOC_MODE_METADATA > > * qcow2 > > - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing > file > > * rbd with this patch > > - Reject all but PREALLOC_MODE_OFF when shrinking > - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > > * sheepdog > > - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > - Doesn't support shrinking > > * vdi > > - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL > - Doesn't support shrinking > > * blkdebug > * blklogwrites > * blkverify > * bochs > * cloop > * dmg > * ftp > * ftps > * http > * https > * luks > * nbd > * null-aio > * null-co > * nvme > * parallels > * qcow > * quorum > * replication > * throttle > * vhdx > * vmdk > * vpc > * vvfat > * vxhs > > - These appear not to use PreallocMode: they don't implement > .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or > implement it without a prealloc parameter. > > Looks good to me. > Thanks for the analysis! > > + > > + ret = 0; > > + > > +out: > > + g_free(buf); > > + return ret; > > +} > > + > > static QemuOptsList runtime_opts = { > > .name = "rbd", > > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > [...] > > diff --git a/qapi/block-core.json b/qapi/block-core.json > > index 7ccbfff9d0..db25a4065b 100644 > > --- a/qapi/block-core.json > > +++ b/qapi/block-core.json > > @@ -4277,13 +4277,15 @@ > > # point to a snapshot. > > # @size Size of the virtual disk in bytes > > # @cluster-size RBD object size > > +# @preallocation Preallocation mode (allowed values: off, full) > > # > > # Since: 2.12 > > ## > > { 'struct': 'BlockdevCreateOptionsRbd', > > 'data': { 'location': 'BlockdevOptionsRbd', > > 'size': 'size', > > - '*cluster-size' : 'size' } } > > + '*cluster-size' : 'size', > > + '*preallocation': 'PreallocMode' } } > > > > ## > > # @BlockdevVmdkSubformat: > > The non-support of values 'metadata' and 'falloc' is not visible in > introspection, only in documentation. No reason to block this patch, as > the other block drivers have the same introspection weakness (only > sheepdog and vdi bother to document). > > Should we address the introspection weakness? Only if there's a use for > the information, I think. If the management applications will use that information (or maybe also our help pages), could be useful to have an array of 'PreallocMode' supported per-driver. > > Should we improve documentation for the other block drivers? > Yes, e.g. for Gluster it is not updated. If you agree, I can check and update the documentation of all drivers following your analysis. Cheers, Stefano
Stefano Garzarella <sgarzare@redhat.com> writes: > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: >> Cc: Peter for a libvirt perspective. >> >> Stefano Garzarella <sgarzare@redhat.com> writes: >> >> > This patch adds the support of preallocation (off/full) for the RBD >> > block driver. >> > If available, we use rbd_writesame() to quickly fill the image when >> > full preallocation is required. >> > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> >> > --- >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- >> > qapi/block-core.json | 4 +- >> > 2 files changed, 136 insertions(+), 17 deletions(-) >> > >> > diff --git a/block/rbd.c b/block/rbd.c >> > index 0c549c9935..29dd1bb040 100644 >> > --- a/block/rbd.c >> > +++ b/block/rbd.c >> > @@ -13,6 +13,7 @@ >> > >> > #include "qemu/osdep.h" >> > >> > +#include "qemu/units.h" >> > #include <rbd/librbd.h> >> > #include "qapi/error.h" >> > #include "qemu/error-report.h" >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) >> > } >> > } >> > >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, >> > + PreallocMode prealloc, Error **errp) >> > +{ >> > + uint64_t current_length; >> > + char *buf = NULL; >> > + int ret; >> > + >> > + ret = rbd_get_size(image, ¤t_length); >> > + if (ret < 0) { >> > + error_setg_errno(errp, -ret, "Failed to get file length"); >> > + goto out; >> > + } >> > + >> > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { >> > + error_setg(errp, "Cannot use preallocation for shrinking files"); >> > + ret = -ENOTSUP; >> > + goto out; >> > + } >> > + >> > + switch (prealloc) { >> > + case PREALLOC_MODE_FULL: { >> [...] >> > + case PREALLOC_MODE_OFF: >> [...] >> > + default: >> > + error_setg(errp, "Unsupported preallocation mode: %s", >> > + PreallocMode_str(prealloc)); >> > + ret = -ENOTSUP; >> > + goto out; >> > + } >> >> Other block drivers also accept only some values of PreallocMode. Okay. >> >> I wonder whether management applications need to know which values are >> supported. > > Good point! We can continue to assume they don't until somebody tells us otherwise. >> Let me review support in drivers: >> >> * file (file-win32.c) >> * iscsi >> * nfs >> * qed >> * ssh >> >> - Reject all but PREALLOC_MODE_OFF >> >> * copy-on-read >> * luks (crypto.c) >> * raw >> >> - Pass through only >> >> * file host_cdrom host_device (file-posix.c) >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular >> files >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE >> - Reject PREALLOC_MODE_METADATA >> >> * gluster >> >> - Reject all but PREALLOC_MODE_OFF when shrinking >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL >> - Reject PREALLOC_MODE_METADATA >> >> * qcow2 >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing >> file >> >> * rbd with this patch >> >> - Reject all but PREALLOC_MODE_OFF when shrinking >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC >> >> * sheepdog >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC >> - Doesn't support shrinking >> >> * vdi >> >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL >> - Doesn't support shrinking >> >> * blkdebug >> * blklogwrites >> * blkverify >> * bochs >> * cloop >> * dmg >> * ftp >> * ftps >> * http >> * https >> * luks >> * nbd >> * null-aio >> * null-co >> * nvme >> * parallels >> * qcow >> * quorum >> * replication >> * throttle >> * vhdx >> * vmdk >> * vpc >> * vvfat >> * vxhs >> >> - These appear not to use PreallocMode: they don't implement >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or >> implement it without a prealloc parameter. >> >> Looks good to me. >> > > Thanks for the analysis! > >> > + >> > + ret = 0; >> > + >> > +out: >> > + g_free(buf); >> > + return ret; >> > +} >> > + >> > static QemuOptsList runtime_opts = { >> > .name = "rbd", >> > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), >> [...] >> > diff --git a/qapi/block-core.json b/qapi/block-core.json >> > index 7ccbfff9d0..db25a4065b 100644 >> > --- a/qapi/block-core.json >> > +++ b/qapi/block-core.json >> > @@ -4277,13 +4277,15 @@ >> > # point to a snapshot. >> > # @size Size of the virtual disk in bytes >> > # @cluster-size RBD object size >> > +# @preallocation Preallocation mode (allowed values: off, full) >> > # >> > # Since: 2.12 >> > ## >> > { 'struct': 'BlockdevCreateOptionsRbd', >> > 'data': { 'location': 'BlockdevOptionsRbd', >> > 'size': 'size', >> > - '*cluster-size' : 'size' } } >> > + '*cluster-size' : 'size', >> > + '*preallocation': 'PreallocMode' } } >> > >> > ## >> > # @BlockdevVmdkSubformat: >> >> The non-support of values 'metadata' and 'falloc' is not visible in >> introspection, only in documentation. No reason to block this patch, as >> the other block drivers have the same introspection weakness (only >> sheepdog and vdi bother to document). >> >> Should we address the introspection weakness? Only if there's a use for >> the information, I think. > > If the management applications will use that information (or maybe also > our help pages), could be useful to have an array of 'PreallocMode' > supported per-driver. Ideally, query-qmp-schema would show only the supported values. Not hard to do, just tedious: we'd get a number of sub-enums in addition to the full one, and we'd have to map from sub-enum to the full one. QAPI language support for sub-enums would remove most of the tedium. Not worthwhile unless the need for sub-enums is actually common. >> Should we improve documentation for the other block drivers? >> > > Yes, e.g. for Gluster it is not updated. > If you agree, I can check and update the documentation of all drivers following > your analysis. Yes, please!
On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote: > Stefano Garzarella <sgarzare@redhat.com> writes: > > > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > >> Cc: Peter for a libvirt perspective. > >> > >> Stefano Garzarella <sgarzare@redhat.com> writes: > >> > >> > This patch adds the support of preallocation (off/full) for the RBD > >> > block driver. > >> > If available, we use rbd_writesame() to quickly fill the image when > >> > full preallocation is required. > >> > > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > >> > --- > >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > >> > qapi/block-core.json | 4 +- > >> > 2 files changed, 136 insertions(+), 17 deletions(-) > >> > > >> > diff --git a/block/rbd.c b/block/rbd.c > >> > index 0c549c9935..29dd1bb040 100644 > >> > --- a/block/rbd.c > >> > +++ b/block/rbd.c > >> > @@ -13,6 +13,7 @@ > >> > > >> > #include "qemu/osdep.h" > >> > > >> > +#include "qemu/units.h" > >> > #include <rbd/librbd.h> > >> > #include "qapi/error.h" > >> > #include "qemu/error-report.h" > >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > >> > } > >> > } > >> > > >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > >> > + PreallocMode prealloc, Error **errp) > >> > +{ > >> > + uint64_t current_length; > >> > + char *buf = NULL; > >> > + int ret; > >> > + > >> > + ret = rbd_get_size(image, ¤t_length); > >> > + if (ret < 0) { > >> > + error_setg_errno(errp, -ret, "Failed to get file length"); > >> > + goto out; > >> > + } > >> > + > >> > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > >> > + error_setg(errp, "Cannot use preallocation for shrinking files"); > >> > + ret = -ENOTSUP; > >> > + goto out; > >> > + } > >> > + > >> > + switch (prealloc) { > >> > + case PREALLOC_MODE_FULL: { > >> [...] > >> > + case PREALLOC_MODE_OFF: > >> [...] > >> > + default: > >> > + error_setg(errp, "Unsupported preallocation mode: %s", > >> > + PreallocMode_str(prealloc)); > >> > + ret = -ENOTSUP; > >> > + goto out; > >> > + } > >> > >> Other block drivers also accept only some values of PreallocMode. Okay. > >> > >> I wonder whether management applications need to know which values are > >> supported. > > > > Good point! > > We can continue to assume they don't until somebody tells us otherwise. > > >> Let me review support in drivers: > >> > >> * file (file-win32.c) > >> * iscsi > >> * nfs > >> * qed > >> * ssh > >> > >> - Reject all but PREALLOC_MODE_OFF > >> > >> * copy-on-read > >> * luks (crypto.c) > >> * raw > >> > >> - Pass through only > >> > >> * file host_cdrom host_device (file-posix.c) > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular > >> files > >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE > >> - Reject PREALLOC_MODE_METADATA > >> > >> * gluster > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE > >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL > >> - Reject PREALLOC_MODE_METADATA > >> > >> * qcow2 > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing > >> file > >> > >> * rbd with this patch > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> > >> * sheepdog > >> > >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> - Doesn't support shrinking > >> > >> * vdi > >> > >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL > >> - Doesn't support shrinking > >> > >> * blkdebug > >> * blklogwrites > >> * blkverify > >> * bochs > >> * cloop > >> * dmg > >> * ftp > >> * ftps > >> * http > >> * https > >> * luks > >> * nbd > >> * null-aio > >> * null-co > >> * nvme > >> * parallels > >> * qcow > >> * quorum > >> * replication > >> * throttle > >> * vhdx > >> * vmdk > >> * vpc > >> * vvfat > >> * vxhs > >> > >> - These appear not to use PreallocMode: they don't implement > >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or > >> implement it without a prealloc parameter. > >> > >> Looks good to me. > >> > > > > Thanks for the analysis! > > > >> > + > >> > + ret = 0; > >> > + > >> > +out: > >> > + g_free(buf); > >> > + return ret; > >> > +} > >> > + > >> > static QemuOptsList runtime_opts = { > >> > .name = "rbd", > >> > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > >> [...] > >> > diff --git a/qapi/block-core.json b/qapi/block-core.json > >> > index 7ccbfff9d0..db25a4065b 100644 > >> > --- a/qapi/block-core.json > >> > +++ b/qapi/block-core.json > >> > @@ -4277,13 +4277,15 @@ > >> > # point to a snapshot. > >> > # @size Size of the virtual disk in bytes > >> > # @cluster-size RBD object size > >> > +# @preallocation Preallocation mode (allowed values: off, full) > >> > # > >> > # Since: 2.12 > >> > ## > >> > { 'struct': 'BlockdevCreateOptionsRbd', > >> > 'data': { 'location': 'BlockdevOptionsRbd', > >> > 'size': 'size', > >> > - '*cluster-size' : 'size' } } > >> > + '*cluster-size' : 'size', > >> > + '*preallocation': 'PreallocMode' } } > >> > > >> > ## > >> > # @BlockdevVmdkSubformat: > >> > >> The non-support of values 'metadata' and 'falloc' is not visible in > >> introspection, only in documentation. No reason to block this patch, as > >> the other block drivers have the same introspection weakness (only > >> sheepdog and vdi bother to document). > >> > >> Should we address the introspection weakness? Only if there's a use for > >> the information, I think. > > > > If the management applications will use that information (or maybe also > > our help pages), could be useful to have an array of 'PreallocMode' > > supported per-driver. > > Ideally, query-qmp-schema would show only the supported values. > > Not hard to do, just tedious: we'd get a number of sub-enums in addition > to the full one, and we'd have to map from sub-enum to the full one. > > QAPI language support for sub-enums would remove most of the tedium. > Not worthwhile unless the need for sub-enums is actually common. I should study better the QMP and QAPI to understand how to implement the sub-enums. If you agree, I'll put it as a background task, until somebody from management applications tell us his interest. > > >> Should we improve documentation for the other block drivers? > >> > > > > Yes, e.g. for Gluster it is not updated. > > If you agree, I can check and update the documentation of all drivers following > > your analysis. > > Yes, please! Okay, I'll send a patch to update it. Thanks, Stefano
Stefano Garzarella <sgarzare@redhat.com> writes: > On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote: >> Stefano Garzarella <sgarzare@redhat.com> writes: >> >> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: >> >> Cc: Peter for a libvirt perspective. >> >> >> >> Stefano Garzarella <sgarzare@redhat.com> writes: >> >> >> >> > This patch adds the support of preallocation (off/full) for the RBD >> >> > block driver. >> >> > If available, we use rbd_writesame() to quickly fill the image when >> >> > full preallocation is required. >> >> > >> >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> >> >> > --- >> >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- >> >> > qapi/block-core.json | 4 +- >> >> > 2 files changed, 136 insertions(+), 17 deletions(-) >> >> > >> >> > diff --git a/block/rbd.c b/block/rbd.c >> >> > index 0c549c9935..29dd1bb040 100644 >> >> > --- a/block/rbd.c >> >> > +++ b/block/rbd.c >> >> > @@ -13,6 +13,7 @@ >> >> > >> >> > #include "qemu/osdep.h" >> >> > >> >> > +#include "qemu/units.h" >> >> > #include <rbd/librbd.h> >> >> > #include "qapi/error.h" >> >> > #include "qemu/error-report.h" >> >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) >> >> > } >> >> > } >> >> > >> >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, >> >> > + PreallocMode prealloc, Error **errp) >> >> > +{ >> >> > + uint64_t current_length; >> >> > + char *buf = NULL; >> >> > + int ret; >> >> > + >> >> > + ret = rbd_get_size(image, ¤t_length); >> >> > + if (ret < 0) { >> >> > + error_setg_errno(errp, -ret, "Failed to get file length"); >> >> > + goto out; >> >> > + } >> >> > + >> >> > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { >> >> > + error_setg(errp, "Cannot use preallocation for shrinking files"); >> >> > + ret = -ENOTSUP; >> >> > + goto out; >> >> > + } >> >> > + >> >> > + switch (prealloc) { >> >> > + case PREALLOC_MODE_FULL: { >> >> [...] >> >> > + case PREALLOC_MODE_OFF: >> >> [...] >> >> > + default: >> >> > + error_setg(errp, "Unsupported preallocation mode: %s", >> >> > + PreallocMode_str(prealloc)); >> >> > + ret = -ENOTSUP; >> >> > + goto out; >> >> > + } >> >> >> >> Other block drivers also accept only some values of PreallocMode. Okay. >> >> >> >> I wonder whether management applications need to know which values are >> >> supported. >> > >> > Good point! >> >> We can continue to assume they don't until somebody tells us otherwise. >> >> >> Let me review support in drivers: >> >> >> >> * file (file-win32.c) >> >> * iscsi >> >> * nfs >> >> * qed >> >> * ssh >> >> >> >> - Reject all but PREALLOC_MODE_OFF >> >> >> >> * copy-on-read >> >> * luks (crypto.c) >> >> * raw >> >> >> >> - Pass through only >> >> >> >> * file host_cdrom host_device (file-posix.c) >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular >> >> files >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE >> >> - Reject PREALLOC_MODE_METADATA >> >> >> >> * gluster >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE >> >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL >> >> - Reject PREALLOC_MODE_METADATA >> >> >> >> * qcow2 >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing >> >> file >> >> >> >> * rbd with this patch >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC >> >> >> >> * sheepdog >> >> >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC >> >> - Doesn't support shrinking >> >> >> >> * vdi >> >> >> >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL >> >> - Doesn't support shrinking >> >> >> >> * blkdebug >> >> * blklogwrites >> >> * blkverify >> >> * bochs >> >> * cloop >> >> * dmg >> >> * ftp >> >> * ftps >> >> * http >> >> * https >> >> * luks >> >> * nbd >> >> * null-aio >> >> * null-co >> >> * nvme >> >> * parallels >> >> * qcow >> >> * quorum >> >> * replication >> >> * throttle >> >> * vhdx >> >> * vmdk >> >> * vpc >> >> * vvfat >> >> * vxhs >> >> >> >> - These appear not to use PreallocMode: they don't implement >> >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or >> >> implement it without a prealloc parameter. >> >> >> >> Looks good to me. >> >> >> > >> > Thanks for the analysis! >> > >> >> > + >> >> > + ret = 0; >> >> > + >> >> > +out: >> >> > + g_free(buf); >> >> > + return ret; >> >> > +} >> >> > + >> >> > static QemuOptsList runtime_opts = { >> >> > .name = "rbd", >> >> > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), >> >> [...] >> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json >> >> > index 7ccbfff9d0..db25a4065b 100644 >> >> > --- a/qapi/block-core.json >> >> > +++ b/qapi/block-core.json >> >> > @@ -4277,13 +4277,15 @@ >> >> > # point to a snapshot. >> >> > # @size Size of the virtual disk in bytes >> >> > # @cluster-size RBD object size >> >> > +# @preallocation Preallocation mode (allowed values: off, full) >> >> > # >> >> > # Since: 2.12 >> >> > ## >> >> > { 'struct': 'BlockdevCreateOptionsRbd', >> >> > 'data': { 'location': 'BlockdevOptionsRbd', >> >> > 'size': 'size', >> >> > - '*cluster-size' : 'size' } } >> >> > + '*cluster-size' : 'size', >> >> > + '*preallocation': 'PreallocMode' } } >> >> > >> >> > ## >> >> > # @BlockdevVmdkSubformat: >> >> >> >> The non-support of values 'metadata' and 'falloc' is not visible in >> >> introspection, only in documentation. No reason to block this patch, as >> >> the other block drivers have the same introspection weakness (only >> >> sheepdog and vdi bother to document). >> >> >> >> Should we address the introspection weakness? Only if there's a use for >> >> the information, I think. >> > >> > If the management applications will use that information (or maybe also >> > our help pages), could be useful to have an array of 'PreallocMode' >> > supported per-driver. >> >> Ideally, query-qmp-schema would show only the supported values. >> >> Not hard to do, just tedious: we'd get a number of sub-enums in addition >> to the full one, and we'd have to map from sub-enum to the full one. >> >> QAPI language support for sub-enums would remove most of the tedium. >> Not worthwhile unless the need for sub-enums is actually common. > > I should study better the QMP and QAPI to understand how to implement > the sub-enums. Sub-enums of { 'enum': 'PreallocMode', 'data': [ 'off', 'metadata', 'falloc', 'full' ] } done the obvious way: { 'enum': 'PreallocModeOff', 'data': [ 'off' ] } { 'enum': 'PreallocModeOffPosix', 'data': [ 'off', 'metadata', { 'name': 'falloc', 'if': 'defined(CONFIG_POSIX_FALLOCATE)' }, 'full' ] } and so forth. This generates a bunch of different C enum types in addition to PreallocMode: PreallocModeOff, PreallocModePosix, ... Common C code continues to use just PreallocMode. The QMP command handlers using sub-enums will have to map between the sub-enums and PreallocMode. Tedious. With QAPI language support for sub-enums, we could eliminate the additional C enums. > If you agree, I'll put it as a background task, until somebody from > management applications tell us his interest. Only act if there's a compelling use case. >> >> Should we improve documentation for the other block drivers? >> >> >> > >> > Yes, e.g. for Gluster it is not updated. >> > If you agree, I can check and update the documentation of all drivers following >> > your analysis. >> >> Yes, please! > > Okay, I'll send a patch to update it. > > Thanks, > Stefano
On Thu, May 09, 2019 at 10:26:46 +0200, Stefano Garzarella wrote: > On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote: > > Stefano Garzarella <sgarzare@redhat.com> writes: > > > > > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > > >> Cc: Peter for a libvirt perspective. > > >> > > >> Stefano Garzarella <sgarzare@redhat.com> writes: > > >> > > >> > This patch adds the support of preallocation (off/full) for the RBD > > >> > block driver. > > >> > If available, we use rbd_writesame() to quickly fill the image when > > >> > full preallocation is required. > > >> > > > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > > >> > --- > > >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > > >> > qapi/block-core.json | 4 +- > > >> > 2 files changed, 136 insertions(+), 17 deletions(-) [...] > > >> > ## > > >> > # @BlockdevVmdkSubformat: > > >> > > >> The non-support of values 'metadata' and 'falloc' is not visible in > > >> introspection, only in documentation. No reason to block this patch, as > > >> the other block drivers have the same introspection weakness (only > > >> sheepdog and vdi bother to document). > > >> > > >> Should we address the introspection weakness? Only if there's a use for > > >> the information, I think. > > > > > > If the management applications will use that information (or maybe also > > > our help pages), could be useful to have an array of 'PreallocMode' > > > supported per-driver. > > > > Ideally, query-qmp-schema would show only the supported values. > > > > Not hard to do, just tedious: we'd get a number of sub-enums in addition > > to the full one, and we'd have to map from sub-enum to the full one. > > > > QAPI language support for sub-enums would remove most of the tedium. > > Not worthwhile unless the need for sub-enums is actually common. > > I should study better the QMP and QAPI to understand how to implement > the sub-enums. > > If you agree, I'll put it as a background task, until somebody from > management applications tell us his interest. Sorry for the late response. Libvirt currently does not deal that much with the preallocation settings. Preallocation isn't in current state implemented at all for 'blockdev-create' and only the 'metadata' and 'falloc' modes are used in the storage driver via qemu-img. We currently hardcode the knowledge for which formats actually support it internally. I'd say it's not criticall to expose this in the QMP schema but obviously if we'll ever need to use it for a recent enough qemu it's welcome to have a way to check.
On Thu, May 09, 2019 at 02:07:34PM +0200, Markus Armbruster wrote: > Stefano Garzarella <sgarzare@redhat.com> writes: > > > On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote: > >> Stefano Garzarella <sgarzare@redhat.com> writes: > >> > >> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > >> >> Cc: Peter for a libvirt perspective. > >> >> > >> >> Stefano Garzarella <sgarzare@redhat.com> writes: > >> >> > >> >> > This patch adds the support of preallocation (off/full) for the RBD > >> >> > block driver. > >> >> > If available, we use rbd_writesame() to quickly fill the image when > >> >> > full preallocation is required. > >> >> > > >> >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > >> >> > --- > >> >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > >> >> > qapi/block-core.json | 4 +- > >> >> > 2 files changed, 136 insertions(+), 17 deletions(-) > >> >> > > >> >> > diff --git a/block/rbd.c b/block/rbd.c > >> >> > index 0c549c9935..29dd1bb040 100644 > >> >> > --- a/block/rbd.c > >> >> > +++ b/block/rbd.c > >> >> > @@ -13,6 +13,7 @@ > >> >> > > >> >> > #include "qemu/osdep.h" > >> >> > > >> >> > +#include "qemu/units.h" > >> >> > #include <rbd/librbd.h> > >> >> > #include "qapi/error.h" > >> >> > #include "qemu/error-report.h" > >> >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > >> >> > } > >> >> > } > >> >> > > >> >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > >> >> > + PreallocMode prealloc, Error **errp) > >> >> > +{ > >> >> > + uint64_t current_length; > >> >> > + char *buf = NULL; > >> >> > + int ret; > >> >> > + > >> >> > + ret = rbd_get_size(image, ¤t_length); > >> >> > + if (ret < 0) { > >> >> > + error_setg_errno(errp, -ret, "Failed to get file length"); > >> >> > + goto out; > >> >> > + } > >> >> > + > >> >> > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > >> >> > + error_setg(errp, "Cannot use preallocation for shrinking files"); > >> >> > + ret = -ENOTSUP; > >> >> > + goto out; > >> >> > + } > >> >> > + > >> >> > + switch (prealloc) { > >> >> > + case PREALLOC_MODE_FULL: { > >> >> [...] > >> >> > + case PREALLOC_MODE_OFF: > >> >> [...] > >> >> > + default: > >> >> > + error_setg(errp, "Unsupported preallocation mode: %s", > >> >> > + PreallocMode_str(prealloc)); > >> >> > + ret = -ENOTSUP; > >> >> > + goto out; > >> >> > + } > >> >> > >> >> Other block drivers also accept only some values of PreallocMode. Okay. > >> >> > >> >> I wonder whether management applications need to know which values are > >> >> supported. > >> > > >> > Good point! > >> > >> We can continue to assume they don't until somebody tells us otherwise. > >> > >> >> Let me review support in drivers: > >> >> > >> >> * file (file-win32.c) > >> >> * iscsi > >> >> * nfs > >> >> * qed > >> >> * ssh > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF > >> >> > >> >> * copy-on-read > >> >> * luks (crypto.c) > >> >> * raw > >> >> > >> >> - Pass through only > >> >> > >> >> * file host_cdrom host_device (file-posix.c) > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular > >> >> files > >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE > >> >> - Reject PREALLOC_MODE_METADATA > >> >> > >> >> * gluster > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE > >> >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL > >> >> - Reject PREALLOC_MODE_METADATA > >> >> > >> >> * qcow2 > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing > >> >> file > >> >> > >> >> * rbd with this patch > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> >> > >> >> * sheepdog > >> >> > >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> >> - Doesn't support shrinking > >> >> > >> >> * vdi > >> >> > >> >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL > >> >> - Doesn't support shrinking > >> >> > >> >> * blkdebug > >> >> * blklogwrites > >> >> * blkverify > >> >> * bochs > >> >> * cloop > >> >> * dmg > >> >> * ftp > >> >> * ftps > >> >> * http > >> >> * https > >> >> * luks > >> >> * nbd > >> >> * null-aio > >> >> * null-co > >> >> * nvme > >> >> * parallels > >> >> * qcow > >> >> * quorum > >> >> * replication > >> >> * throttle > >> >> * vhdx > >> >> * vmdk > >> >> * vpc > >> >> * vvfat > >> >> * vxhs > >> >> > >> >> - These appear not to use PreallocMode: they don't implement > >> >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or > >> >> implement it without a prealloc parameter. > >> >> > >> >> Looks good to me. > >> >> > >> > > >> > Thanks for the analysis! > >> > > >> >> > + > >> >> > + ret = 0; > >> >> > + > >> >> > +out: > >> >> > + g_free(buf); > >> >> > + return ret; > >> >> > +} > >> >> > + > >> >> > static QemuOptsList runtime_opts = { > >> >> > .name = "rbd", > >> >> > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > >> >> [...] > >> >> > diff --git a/qapi/block-core.json b/qapi/block-core.json > >> >> > index 7ccbfff9d0..db25a4065b 100644 > >> >> > --- a/qapi/block-core.json > >> >> > +++ b/qapi/block-core.json > >> >> > @@ -4277,13 +4277,15 @@ > >> >> > # point to a snapshot. > >> >> > # @size Size of the virtual disk in bytes > >> >> > # @cluster-size RBD object size > >> >> > +# @preallocation Preallocation mode (allowed values: off, full) > >> >> > # > >> >> > # Since: 2.12 > >> >> > ## > >> >> > { 'struct': 'BlockdevCreateOptionsRbd', > >> >> > 'data': { 'location': 'BlockdevOptionsRbd', > >> >> > 'size': 'size', > >> >> > - '*cluster-size' : 'size' } } > >> >> > + '*cluster-size' : 'size', > >> >> > + '*preallocation': 'PreallocMode' } } > >> >> > > >> >> > ## > >> >> > # @BlockdevVmdkSubformat: > >> >> > >> >> The non-support of values 'metadata' and 'falloc' is not visible in > >> >> introspection, only in documentation. No reason to block this patch, as > >> >> the other block drivers have the same introspection weakness (only > >> >> sheepdog and vdi bother to document). > >> >> > >> >> Should we address the introspection weakness? Only if there's a use for > >> >> the information, I think. > >> > > >> > If the management applications will use that information (or maybe also > >> > our help pages), could be useful to have an array of 'PreallocMode' > >> > supported per-driver. > >> > >> Ideally, query-qmp-schema would show only the supported values. > >> > >> Not hard to do, just tedious: we'd get a number of sub-enums in addition > >> to the full one, and we'd have to map from sub-enum to the full one. > >> > >> QAPI language support for sub-enums would remove most of the tedium. > >> Not worthwhile unless the need for sub-enums is actually common. > > > > I should study better the QMP and QAPI to understand how to implement > > the sub-enums. > > Sub-enums of > > { 'enum': 'PreallocMode', > 'data': [ 'off', 'metadata', 'falloc', 'full' ] } > > done the obvious way: > > { 'enum': 'PreallocModeOff', > 'data': [ 'off' ] } > { 'enum': 'PreallocModeOffPosix', > 'data': [ 'off', 'metadata', > { 'name': 'falloc', 'if': 'defined(CONFIG_POSIX_FALLOCATE)' }, > 'full' ] } > > and so forth. > > This generates a bunch of different C enum types in addition to > PreallocMode: PreallocModeOff, PreallocModePosix, ... > > Common C code continues to use just PreallocMode. The QMP command > handlers using sub-enums will have to map between the sub-enums and > PreallocMode. > > Tedious. > > With QAPI language support for sub-enums, we could eliminate the > additional C enums. > Okay, I understood your idea. Thanks for the explanation! > > If you agree, I'll put it as a background task, until somebody from > > management applications tell us his interest. > > Only act if there's a compelling use case. Sure. Thanks, Stefano
On Thu, May 09, 2019 at 03:29:13PM +0200, Peter Krempa wrote: > On Thu, May 09, 2019 at 10:26:46 +0200, Stefano Garzarella wrote: > > On Wed, May 08, 2019 at 01:44:27PM +0200, Markus Armbruster wrote: > > > Stefano Garzarella <sgarzare@redhat.com> writes: > > > > > > > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > > > >> Cc: Peter for a libvirt perspective. > > > >> > > > >> Stefano Garzarella <sgarzare@redhat.com> writes: > > > >> > > > >> > This patch adds the support of preallocation (off/full) for the RBD > > > >> > block driver. > > > >> > If available, we use rbd_writesame() to quickly fill the image when > > > >> > full preallocation is required. > > > >> > > > > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > > > >> > --- > > > >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > > > >> > qapi/block-core.json | 4 +- > > > >> > 2 files changed, 136 insertions(+), 17 deletions(-) > > [...] > > > > > >> > ## > > > >> > # @BlockdevVmdkSubformat: > > > >> > > > >> The non-support of values 'metadata' and 'falloc' is not visible in > > > >> introspection, only in documentation. No reason to block this patch, as > > > >> the other block drivers have the same introspection weakness (only > > > >> sheepdog and vdi bother to document). > > > >> > > > >> Should we address the introspection weakness? Only if there's a use for > > > >> the information, I think. > > > > > > > > If the management applications will use that information (or maybe also > > > > our help pages), could be useful to have an array of 'PreallocMode' > > > > supported per-driver. > > > > > > Ideally, query-qmp-schema would show only the supported values. > > > > > > Not hard to do, just tedious: we'd get a number of sub-enums in addition > > > to the full one, and we'd have to map from sub-enum to the full one. > > > > > > QAPI language support for sub-enums would remove most of the tedium. > > > Not worthwhile unless the need for sub-enums is actually common. > > > > I should study better the QMP and QAPI to understand how to implement > > the sub-enums. > > > > If you agree, I'll put it as a background task, until somebody from > > management applications tell us his interest. > > Sorry for the late response. Libvirt currently does not deal that much > with the preallocation settings. Preallocation isn't in current state > implemented at all for 'blockdev-create' and only the 'metadata' and > 'falloc' modes are used in the storage driver via qemu-img. > > We currently hardcode the knowledge for which formats actually support > it internally. > > I'd say it's not criticall to expose this in the QMP schema but > obviously if we'll ever need to use it for a recent enough qemu it's > welcome to have a way to check. Thank you for sharing this information! Cheers, Stefano
On Wed, May 8, 2019 at 1:44 PM Markus Armbruster <armbru@redhat.com> wrote: > > Stefano Garzarella <sgarzare@redhat.com> writes: > > > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > >> Cc: Peter for a libvirt perspective. > >> > >> Stefano Garzarella <sgarzare@redhat.com> writes: > >> > >> > This patch adds the support of preallocation (off/full) for the RBD > >> > block driver. > >> > If available, we use rbd_writesame() to quickly fill the image when > >> > full preallocation is required. > >> > > >> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> > >> > --- > >> > block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- > >> > qapi/block-core.json | 4 +- > >> > 2 files changed, 136 insertions(+), 17 deletions(-) > >> > > >> > diff --git a/block/rbd.c b/block/rbd.c > >> > index 0c549c9935..29dd1bb040 100644 > >> > --- a/block/rbd.c > >> > +++ b/block/rbd.c > >> > @@ -13,6 +13,7 @@ > >> > > >> > #include "qemu/osdep.h" > >> > > >> > +#include "qemu/units.h" > >> > #include <rbd/librbd.h> > >> > #include "qapi/error.h" > >> > #include "qemu/error-report.h" > >> > @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) > >> > } > >> > } > >> > > >> > +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, > >> > + PreallocMode prealloc, Error **errp) > >> > +{ > >> > + uint64_t current_length; > >> > + char *buf = NULL; > >> > + int ret; > >> > + > >> > + ret = rbd_get_size(image, ¤t_length); > >> > + if (ret < 0) { > >> > + error_setg_errno(errp, -ret, "Failed to get file length"); > >> > + goto out; > >> > + } > >> > + > >> > + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { > >> > + error_setg(errp, "Cannot use preallocation for shrinking files"); > >> > + ret = -ENOTSUP; > >> > + goto out; > >> > + } > >> > + > >> > + switch (prealloc) { > >> > + case PREALLOC_MODE_FULL: { > >> [...] > >> > + case PREALLOC_MODE_OFF: > >> [...] > >> > + default: > >> > + error_setg(errp, "Unsupported preallocation mode: %s", > >> > + PreallocMode_str(prealloc)); > >> > + ret = -ENOTSUP; > >> > + goto out; > >> > + } > >> > >> Other block drivers also accept only some values of PreallocMode. Okay. > >> > >> I wonder whether management applications need to know which values are > >> supported. > > > > Good point! > > We can continue to assume they don't until somebody tells us otherwise. > > >> Let me review support in drivers: > >> > >> * file (file-win32.c) > >> * iscsi > >> * nfs > >> * qed > >> * ssh > >> > >> - Reject all but PREALLOC_MODE_OFF > >> > >> * copy-on-read > >> * luks (crypto.c) > >> * raw > >> > >> - Pass through only > >> > >> * file host_cdrom host_device (file-posix.c) > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular > >> files > >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE > >> - Reject PREALLOC_MODE_METADATA > >> > >> * gluster > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE > >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL > >> - Reject PREALLOC_MODE_METADATA > >> > >> * qcow2 > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing > >> file > >> > >> * rbd with this patch > >> > >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> > >> * sheepdog > >> > >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> - Doesn't support shrinking > >> > >> * vdi > >> > >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL > >> - Doesn't support shrinking > >> > >> * blkdebug > >> * blklogwrites > >> * blkverify > >> * bochs > >> * cloop > >> * dmg > >> * ftp > >> * ftps > >> * http > >> * https > >> * luks > >> * nbd > >> * null-aio > >> * null-co > >> * nvme > >> * parallels > >> * qcow > >> * quorum > >> * replication > >> * throttle > >> * vhdx > >> * vmdk > >> * vpc > >> * vvfat > >> * vxhs > >> > >> - These appear not to use PreallocMode: they don't implement > >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or > >> implement it without a prealloc parameter. > >> > >> Looks good to me. > >> > > > > Thanks for the analysis! > > > >> > + > >> > + ret = 0; > >> > + > >> > +out: > >> > + g_free(buf); > >> > + return ret; > >> > +} > >> > + > >> > static QemuOptsList runtime_opts = { > >> > .name = "rbd", > >> > .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > >> [...] > >> > diff --git a/qapi/block-core.json b/qapi/block-core.json > >> > index 7ccbfff9d0..db25a4065b 100644 > >> > --- a/qapi/block-core.json > >> > +++ b/qapi/block-core.json > >> > @@ -4277,13 +4277,15 @@ > >> > # point to a snapshot. > >> > # @size Size of the virtual disk in bytes > >> > # @cluster-size RBD object size > >> > +# @preallocation Preallocation mode (allowed values: off, full) > >> > # > >> > # Since: 2.12 > >> > ## > >> > { 'struct': 'BlockdevCreateOptionsRbd', > >> > 'data': { 'location': 'BlockdevOptionsRbd', > >> > 'size': 'size', > >> > - '*cluster-size' : 'size' } } > >> > + '*cluster-size' : 'size', > >> > + '*preallocation': 'PreallocMode' } } > >> > > >> > ## > >> > # @BlockdevVmdkSubformat: > >> > >> The non-support of values 'metadata' and 'falloc' is not visible in > >> introspection, only in documentation. No reason to block this patch, as > >> the other block drivers have the same introspection weakness (only > >> sheepdog and vdi bother to document). > >> > >> Should we address the introspection weakness? Only if there's a use for > >> the information, I think. > > > > If the management applications will use that information (or maybe also > > our help pages), could be useful to have an array of 'PreallocMode' > > supported per-driver. > > Ideally, query-qmp-schema would show only the supported values. > > Not hard to do, just tedious: we'd get a number of sub-enums in addition > to the full one, and we'd have to map from sub-enum to the full one. > > QAPI language support for sub-enums would remove most of the tedium. > Not worthwhile unless the need for sub-enums is actually common. > > >> Should we improve documentation for the other block drivers? > >> > > > > Yes, e.g. for Gluster it is not updated. > > If you agree, I can check and update the documentation of all drivers following > > your analysis. > > Yes, please! Hi Markus, I'm finally updating the documentation of preallocation modes supported by block drivers and protocols in qapi/block-core.json. As sheepdog and vdi I'm adding the supported values for each driver or protocol that supports 'preallocation' parameter during the creation, I'm also updating the '.help' in the QemuOptsList. My doubt is: where is better to put the documentation about preallocation modes supported during the resize? (e.g. some drivers support only PREALLOC_MODE_OFF when shrinking) Thanks, Stefano
Stefano Garzarella <sgarzare@redhat.com> writes: > On Wed, May 8, 2019 at 1:44 PM Markus Armbruster <armbru@redhat.com> wrote: >> >> Stefano Garzarella <sgarzare@redhat.com> writes: >> >> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: [...] >> >> Let me review support in drivers: >> >> >> >> * file (file-win32.c) >> >> * iscsi >> >> * nfs >> >> * qed >> >> * ssh >> >> >> >> - Reject all but PREALLOC_MODE_OFF >> >> >> >> * copy-on-read >> >> * luks (crypto.c) >> >> * raw >> >> >> >> - Pass through only >> >> >> >> * file host_cdrom host_device (file-posix.c) >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular >> >> files >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE >> >> - Reject PREALLOC_MODE_METADATA >> >> >> >> * gluster >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE >> >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL >> >> - Reject PREALLOC_MODE_METADATA >> >> >> >> * qcow2 >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing >> >> file >> >> >> >> * rbd with this patch >> >> >> >> - Reject all but PREALLOC_MODE_OFF when shrinking >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC >> >> >> >> * sheepdog >> >> >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC >> >> - Doesn't support shrinking >> >> >> >> * vdi >> >> >> >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL >> >> - Doesn't support shrinking >> >> >> >> * blkdebug >> >> * blklogwrites >> >> * blkverify >> >> * bochs >> >> * cloop >> >> * dmg >> >> * ftp >> >> * ftps >> >> * http >> >> * https >> >> * luks >> >> * nbd >> >> * null-aio >> >> * null-co >> >> * nvme >> >> * parallels >> >> * qcow >> >> * quorum >> >> * replication >> >> * throttle >> >> * vhdx >> >> * vmdk >> >> * vpc >> >> * vvfat >> >> * vxhs >> >> >> >> - These appear not to use PreallocMode: they don't implement >> >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or >> >> implement it without a prealloc parameter. >> >> >> >> Looks good to me. >> >> >> > >> > Thanks for the analysis! [...] >> > If you agree, I can check and update the documentation of all drivers following >> > your analysis. >> >> Yes, please! > > > Hi Markus, > I'm finally updating the documentation of preallocation modes > supported by block drivers and protocols in qapi/block-core.json. > As sheepdog and vdi I'm adding the supported values for each driver or > protocol that supports 'preallocation' parameter during the creation, > I'm also updating the '.help' in the QemuOptsList. > > My doubt is: where is better to put the documentation about > preallocation modes supported during the resize? (e.g. some drivers > support only PREALLOC_MODE_OFF when shrinking) As far as I can tell, no driver supports anything but PREALLOC_MODE_OFF when shrinking. Suggest to ignore the shrinking case for now when documenting. I'm not sure I fully answered your question. Don't hesitate to ask for more advice.
On Wed, May 22, 2019 at 06:25:17PM +0200, Markus Armbruster wrote: > Stefano Garzarella <sgarzare@redhat.com> writes: > > > On Wed, May 8, 2019 at 1:44 PM Markus Armbruster <armbru@redhat.com> wrote: > >> > >> Stefano Garzarella <sgarzare@redhat.com> writes: > >> > >> > On Tue, May 07, 2019 at 08:34:51AM +0200, Markus Armbruster wrote: > [...] > >> >> Let me review support in drivers: > >> >> > >> >> * file (file-win32.c) > >> >> * iscsi > >> >> * nfs > >> >> * qed > >> >> * ssh > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF > >> >> > >> >> * copy-on-read > >> >> * luks (crypto.c) > >> >> * raw > >> >> > >> >> - Pass through only > >> >> > >> >> * file host_cdrom host_device (file-posix.c) > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and for non-regular > >> >> files > >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_POSIX_FALLOCATE > >> >> - Reject PREALLOC_MODE_METADATA > >> >> > >> >> * gluster > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> >> - Reject PREALLOC_MODE_FALLOC unless CONFIG_GLUSTERFS_FALLOCATE > >> >> - Reject PREALLOC_MODE_FULL unless CONFIG_GLUSTERFS_ZEROFILL > >> >> - Reject PREALLOC_MODE_METADATA > >> >> > >> >> * qcow2 > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking and with a backing > >> >> file > >> >> > >> >> * rbd with this patch > >> >> > >> >> - Reject all but PREALLOC_MODE_OFF when shrinking > >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> >> > >> >> * sheepdog > >> >> > >> >> - Reject PREALLOC_MODE_METADATA and PREALLOC_MODE_FALLOC > >> >> - Doesn't support shrinking > >> >> > >> >> * vdi > >> >> > >> >> - Reject PREALLOC_MODE_FALLOC and PREALLOC_MODE_FULL > >> >> - Doesn't support shrinking > >> >> > >> >> * blkdebug > >> >> * blklogwrites > >> >> * blkverify > >> >> * bochs > >> >> * cloop > >> >> * dmg > >> >> * ftp > >> >> * ftps > >> >> * http > >> >> * https > >> >> * luks > >> >> * nbd > >> >> * null-aio > >> >> * null-co > >> >> * nvme > >> >> * parallels > >> >> * qcow > >> >> * quorum > >> >> * replication > >> >> * throttle > >> >> * vhdx > >> >> * vmdk > >> >> * vpc > >> >> * vvfat > >> >> * vxhs > >> >> > >> >> - These appear not to use PreallocMode: they don't implement > >> >> .bdrv_co_truncate(), and either don't implement .bdrv_co_create() or > >> >> implement it without a prealloc parameter. > >> >> > >> >> Looks good to me. > >> >> > >> > > >> > Thanks for the analysis! > [...] > >> > If you agree, I can check and update the documentation of all drivers following > >> > your analysis. > >> > >> Yes, please! > > > > > > Hi Markus, > > I'm finally updating the documentation of preallocation modes > > supported by block drivers and protocols in qapi/block-core.json. > > As sheepdog and vdi I'm adding the supported values for each driver or > > protocol that supports 'preallocation' parameter during the creation, > > I'm also updating the '.help' in the QemuOptsList. > > > > My doubt is: where is better to put the documentation about > > preallocation modes supported during the resize? (e.g. some drivers > > support only PREALLOC_MODE_OFF when shrinking) > > As far as I can tell, no driver supports anything but PREALLOC_MODE_OFF > when shrinking. Suggest to ignore the shrinking case for now when > documenting. > Okay, I'll ignore it for now. > I'm not sure I fully answered your question. Don't hesitate to ask for > more advice. Yes, your answer is what I was looking for :) Thanks, Stefano
diff --git a/block/rbd.c b/block/rbd.c index 0c549c9935..29dd1bb040 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -13,6 +13,7 @@ #include "qemu/osdep.h" +#include "qemu/units.h" #include <rbd/librbd.h> #include "qapi/error.h" #include "qemu/error-report.h" @@ -331,6 +332,110 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) } } +static int qemu_rbd_do_truncate(rbd_image_t image, int64_t offset, + PreallocMode prealloc, Error **errp) +{ + uint64_t current_length; + char *buf = NULL; + int ret; + + ret = rbd_get_size(image, ¤t_length); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get file length"); + goto out; + } + + if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Cannot use preallocation for shrinking files"); + ret = -ENOTSUP; + goto out; + } + + switch (prealloc) { + case PREALLOC_MODE_FULL: { + uint64_t current_offset = current_length; + int buf_size = 64 * KiB; + ssize_t bytes; + + buf = g_malloc(buf_size); + /* + * Some versions of rbd_writesame() discards writes of buffers with + * all zeroes. In order to avoid this behaviour, we set the first byte + * to one. + */ + buf[0] = 1; + + ret = rbd_resize(image, offset); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to resize file"); + goto out; + } + +#ifdef LIBRBD_SUPPORTS_WRITESAME + while (offset - current_offset > buf_size) { + /* + * rbd_writesame() supports only request where the size of the + * operation is multiple of buffer size and it must be less or + * equal to INT_MAX. + */ + bytes = MIN(offset - current_offset, INT_MAX); + bytes -= bytes % buf_size; + + bytes = rbd_writesame(image, current_offset, bytes, buf, buf_size, + 0); + if (bytes < 0) { + ret = bytes; + error_setg_errno(errp, -ret, + "Failed to write for preallocation"); + goto out; + } + + current_offset += bytes; + } +#endif /* LIBRBD_SUPPORTS_WRITESAME */ + + while (current_offset < offset) { + bytes = rbd_write(image, current_offset, + MIN(offset - current_offset, buf_size), buf); + if (bytes < 0) { + ret = bytes; + error_setg_errno(errp, -ret, + "Failed to write for preallocation"); + goto out; + } + + current_offset += bytes; + } + + ret = rbd_flush(image); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to flush the file"); + goto out; + } + + break; + } + case PREALLOC_MODE_OFF: + ret = rbd_resize(image, offset); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to resize file"); + goto out; + } + break; + default: + error_setg(errp, "Unsupported preallocation mode: %s", + PreallocMode_str(prealloc)); + ret = -ENOTSUP; + goto out; + } + + ret = 0; + +out: + g_free(buf); + return ret; +} + static QemuOptsList runtime_opts = { .name = "rbd", .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), @@ -376,6 +481,7 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options, BlockdevCreateOptionsRbd *opts = &options->u.rbd; rados_t cluster; rados_ioctx_t io_ctx; + rbd_image_t image; int obj_order = 0; int ret; @@ -404,13 +510,21 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options, return ret; } - ret = rbd_create(io_ctx, opts->location->image, opts->size, &obj_order); + ret = rbd_create(io_ctx, opts->location->image, 0, &obj_order); if (ret < 0) { error_setg_errno(errp, -ret, "error rbd create"); goto out; } - ret = 0; + ret = rbd_open(io_ctx, opts->location->image, &image, NULL); + if (ret < 0) { + error_setg_errno(errp, -ret, "error rbd open"); + goto out; + } + + ret = qemu_rbd_do_truncate(image, opts->size, opts->preallocation, errp); + + rbd_close(image); out: rados_ioctx_destroy(io_ctx); rados_shutdown(cluster); @@ -431,6 +545,7 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, BlockdevOptionsRbd *loc; Error *local_err = NULL; const char *keypairs, *password_secret; + char *prealloc; QDict *options = NULL; int ret = 0; @@ -449,6 +564,16 @@ static int coroutine_fn qemu_rbd_co_create_opts(const char *filename, BLOCK_OPT_CLUSTER_SIZE, 0); rbd_opts->has_cluster_size = (rbd_opts->cluster_size != 0); + prealloc = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + rbd_opts->preallocation = qapi_enum_parse(&PreallocMode_lookup, prealloc, + PREALLOC_MODE_OFF, &local_err); + g_free(prealloc); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto exit; + } + options = qdict_new(); qemu_rbd_parse_filename(filename, options, &local_err); if (local_err) { @@ -1052,21 +1177,8 @@ static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, Error **errp) { BDRVRBDState *s = bs->opaque; - int r; - - if (prealloc != PREALLOC_MODE_OFF) { - error_setg(errp, "Unsupported preallocation mode '%s'", - PreallocMode_str(prealloc)); - return -ENOTSUP; - } - - r = rbd_resize(s->image, offset); - if (r < 0) { - error_setg_errno(errp, -r, "Failed to resize file"); - return r; - } - return 0; + return qemu_rbd_do_truncate(s->image, offset, prealloc, errp); } static int qemu_rbd_snap_create(BlockDriverState *bs, @@ -1219,6 +1331,11 @@ static QemuOptsList qemu_rbd_create_opts = { .type = QEMU_OPT_SIZE, .help = "RBD object size" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, { .name = "password-secret", .type = QEMU_OPT_STRING, diff --git a/qapi/block-core.json b/qapi/block-core.json index 7ccbfff9d0..db25a4065b 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4277,13 +4277,15 @@ # point to a snapshot. # @size Size of the virtual disk in bytes # @cluster-size RBD object size +# @preallocation Preallocation mode (allowed values: off, full) # # Since: 2.12 ## { 'struct': 'BlockdevCreateOptionsRbd', 'data': { 'location': 'BlockdevOptionsRbd', 'size': 'size', - '*cluster-size' : 'size' } } + '*cluster-size' : 'size', + '*preallocation': 'PreallocMode' } } ## # @BlockdevVmdkSubformat:
This patch adds the support of preallocation (off/full) for the RBD block driver. If available, we use rbd_writesame() to quickly fill the image when full preallocation is required. Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> --- block/rbd.c | 149 ++++++++++++++++++++++++++++++++++++++----- qapi/block-core.json | 4 +- 2 files changed, 136 insertions(+), 17 deletions(-)