diff mbox series

[V2] block/rbd: implement bdrv_co_block_status

Message ID 20210810134124.18523-1-pl@kamp.de (mailing list archive)
State New, archived
Headers show
Series [V2] block/rbd: implement bdrv_co_block_status | expand

Commit Message

Peter Lieven Aug. 10, 2021, 1:41 p.m. UTC
the qemu rbd driver currently lacks support for bdrv_co_block_status.
This results mainly in incorrect progress during block operations (e.g.
qemu-img convert with an rbd image as source).

This patch utilizes the rbd_diff_iterate2 call from librbd to detect
allocated and unallocated (all zero areas).

To avoid querying the ceph OSDs for the answer this is only done if
the image has the fast-diff features which depends on the object-map
and exclusive-lock. In this case it is guaranteed that the information
is present in memory in the librbd client and thus very fast.

If fast-diff is not available all areas are reported to be allocated
which is the current behaviour if bdrv_co_block_status is not implemented.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
V1->V2:
- add commit comment [Stefano]
- use failed_post_open [Stefano]
- remove redundant assert [Stefano]
- add macro+comment for the magic -9000 value [Stefano]
- always set *file if its non NULL [Stefano]

 block/rbd.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

Comments

Ilya Dryomov Aug. 22, 2021, 9:02 p.m. UTC | #1
On Tue, Aug 10, 2021 at 3:41 PM Peter Lieven <pl@kamp.de> wrote:
>
> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> This results mainly in incorrect progress during block operations (e.g.
> qemu-img convert with an rbd image as source).
>
> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> allocated and unallocated (all zero areas).
>
> To avoid querying the ceph OSDs for the answer this is only done if
> the image has the fast-diff features which depends on the object-map

Hi Peter,

Nit: "has the fast-diff feature which depends on the object-map and
exclusive-lock features"

> and exclusive-lock. In this case it is guaranteed that the information
> is present in memory in the librbd client and thus very fast.
>
> If fast-diff is not available all areas are reported to be allocated
> which is the current behaviour if bdrv_co_block_status is not implemented.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
> V1->V2:
> - add commit comment [Stefano]
> - use failed_post_open [Stefano]
> - remove redundant assert [Stefano]
> - add macro+comment for the magic -9000 value [Stefano]
> - always set *file if its non NULL [Stefano]
>
>  block/rbd.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 125 insertions(+)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index dcf82b15b8..8692e76f40 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -88,6 +88,7 @@ typedef struct BDRVRBDState {
>      char *namespace;
>      uint64_t image_size;
>      uint64_t object_size;
> +    uint64_t features;
>  } BDRVRBDState;
>
>  typedef struct RBDTask {
> @@ -983,6 +984,13 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>      s->image_size = info.size;
>      s->object_size = info.obj_size;
>
> +    r = rbd_get_features(s->image, &s->features);
> +    if (r < 0) {
> +        error_setg_errno(errp, -r, "error getting image features from %s",
> +                         s->image_name);
> +        goto failed_post_open;
> +    }

The object-map and fast-diff features can be enabled/disabled while the
image is open so this should probably go to qemu_rbd_co_block_status().

> +
>      /* If we are using an rbd snapshot, we must be r/o, otherwise
>       * leave as-is */
>      if (s->snap != NULL) {
> @@ -1259,6 +1267,122 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>      return spec_info;
>  }
>
> +typedef struct rbd_diff_req {
> +    uint64_t offs;
> +    uint64_t bytes;
> +    int exists;
> +} rbd_diff_req;
> +
> +/*
> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> + * value in the callback routine. Choose a value that does not conflict with
> + * an existing exitcode and return it if we want to prematurely stop the
> + * execution because we detected a change in the allocation status.
> + */
> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> +
> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> +                                       int exists, void *opaque)
> +{
> +    struct rbd_diff_req *req = opaque;
> +
> +    assert(req->offs + req->bytes <= offs);
> +
> +    if (req->exists && offs > req->offs + req->bytes) {
> +        /*
> +         * we started in an allocated area and jumped over an unallocated area,
> +         * req->bytes contains the length of the allocated area before the
> +         * unallocated area. stop further processing.
> +         */
> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> +    }
> +    if (req->exists && !exists) {
> +        /*
> +         * we started in an allocated area and reached a hole. req->bytes
> +         * contains the length of the allocated area before the hole.
> +         * stop further processing.
> +         */
> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> +    }
> +    if (!req->exists && exists && offs > req->offs) {
> +        /*
> +         * we started in an unallocated area and hit the first allocated
> +         * block. req->bytes must be set to the length of the unallocated area
> +         * before the allocated area. stop further processing.
> +         */
> +        req->bytes = offs - req->offs;
> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> +    }
> +
> +    /*
> +     * assert that we catched all cases above and allocation state has not

catched -> caught

> +     * changed during callbacks.
> +     */
> +    assert(exists == req->exists || !req->bytes);
> +    req->exists = exists;
> +
> +    /*
> +     * assert that we either return an unallocated block or have got callbacks
> +     * for all allocated blocks present.
> +     */
> +    assert(!req->exists || offs == req->offs + req->bytes);
> +    req->bytes = offs + len - req->offs;
> +
> +    return 0;
> +}
> +
> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> +                                                 bool want_zero, int64_t offset,
> +                                                 int64_t bytes, int64_t *pnum,
> +                                                 int64_t *map,
> +                                                 BlockDriverState **file)
> +{
> +    BDRVRBDState *s = bs->opaque;
> +    int ret, r;
> +    struct rbd_diff_req req = { .offs = offset };
> +
> +    assert(offset + bytes <= s->image_size);
> +
> +    /* default to all sectors allocated */
> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;

I'm a little confused by the meaning of these flags (but I haven't
looked at the other drivers yet).  Looks like this patch always sets
BDRV_BLOCK_OFFSET_VALID (makes sense since the "host" offset is always
known for rbd) and returns either BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO.

DATA ZERO OFFSET_VALID
 t    t        t       sectors read as zero, returned file is zero at offset
 t    f        t       sectors read as valid from file at offset
 f    t        t       sectors preallocated, read as zero, returned file not
                       necessarily zero at offset
 f    f        t       sectors preallocated but read from backing_hd,
                       returned file contains garbage at offset

What about the first case (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)?
What is the practical difference to just BDRV_BLOCK_ZERO?

> +    if (map) {
> +        *map = offset;
> +    }
> +    if (file) {
> +        *file = bs;
> +    }

A comment in block_int.h says that map and file are guaranteed to be
non-NULL so these tests seem redundant?

> +    *pnum = bytes;
> +
> +    /* RBD image does not support fast-diff */
> +    if (!(s->features & RBD_FEATURE_FAST_DIFF)) {
> +        goto out;
> +    }

Need to make sure that fast-diff is valid here: call rbd_get_flags()
on the image and test for !RBD_FLAG_FAST_DIFF_INVALID.

> +
> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
> +                          qemu_rbd_co_block_status_cb, &req);
> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
> +        goto out;
> +    }
> +    assert(req.bytes <= bytes);
> +    if (!req.exists) {
> +        if (r == 0 && !req.bytes) {
> +            /*
> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
> +             * except for the case where an overlay has a hole where the parent
> +             * has not. This here catches the case where no callback was
> +             * invoked at all.
> +             */
> +            req.bytes = bytes;
> +        }
> +        ret &= ~BDRV_BLOCK_DATA;
> +        ret |= BDRV_BLOCK_ZERO;

Is this equivalent to ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID?
If so, that would be clearer IMO.

Thanks,

                Ilya
Peter Lieven Aug. 23, 2021, 9:38 a.m. UTC | #2
Am 22.08.21 um 23:02 schrieb Ilya Dryomov:
> On Tue, Aug 10, 2021 at 3:41 PM Peter Lieven <pl@kamp.de> wrote:
>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>> This results mainly in incorrect progress during block operations (e.g.
>> qemu-img convert with an rbd image as source).
>>
>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>> allocated and unallocated (all zero areas).
>>
>> To avoid querying the ceph OSDs for the answer this is only done if
>> the image has the fast-diff features which depends on the object-map
> Hi Peter,
>
> Nit: "has the fast-diff feature which depends on the object-map and
> exclusive-lock features"


will reword in V3.


>
>> and exclusive-lock. In this case it is guaranteed that the information
>> is present in memory in the librbd client and thus very fast.
>>
>> If fast-diff is not available all areas are reported to be allocated
>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>> V1->V2:
>> - add commit comment [Stefano]
>> - use failed_post_open [Stefano]
>> - remove redundant assert [Stefano]
>> - add macro+comment for the magic -9000 value [Stefano]
>> - always set *file if its non NULL [Stefano]
>>
>>   block/rbd.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 125 insertions(+)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index dcf82b15b8..8692e76f40 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -88,6 +88,7 @@ typedef struct BDRVRBDState {
>>       char *namespace;
>>       uint64_t image_size;
>>       uint64_t object_size;
>> +    uint64_t features;
>>   } BDRVRBDState;
>>
>>   typedef struct RBDTask {
>> @@ -983,6 +984,13 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>>       s->image_size = info.size;
>>       s->object_size = info.obj_size;
>>
>> +    r = rbd_get_features(s->image, &s->features);
>> +    if (r < 0) {
>> +        error_setg_errno(errp, -r, "error getting image features from %s",
>> +                         s->image_name);
>> +        goto failed_post_open;
>> +    }
> The object-map and fast-diff features can be enabled/disabled while the
> image is open so this should probably go to qemu_rbd_co_block_status().
>
>> +
>>       /* If we are using an rbd snapshot, we must be r/o, otherwise
>>        * leave as-is */
>>       if (s->snap != NULL) {
>> @@ -1259,6 +1267,122 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>       return spec_info;
>>   }
>>
>> +typedef struct rbd_diff_req {
>> +    uint64_t offs;
>> +    uint64_t bytes;
>> +    int exists;
>> +} rbd_diff_req;
>> +
>> +/*
>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>> + * value in the callback routine. Choose a value that does not conflict with
>> + * an existing exitcode and return it if we want to prematurely stop the
>> + * execution because we detected a change in the allocation status.
>> + */
>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>> +
>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>> +                                       int exists, void *opaque)
>> +{
>> +    struct rbd_diff_req *req = opaque;
>> +
>> +    assert(req->offs + req->bytes <= offs);
>> +
>> +    if (req->exists && offs > req->offs + req->bytes) {
>> +        /*
>> +         * we started in an allocated area and jumped over an unallocated area,
>> +         * req->bytes contains the length of the allocated area before the
>> +         * unallocated area. stop further processing.
>> +         */
>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>> +    }
>> +    if (req->exists && !exists) {
>> +        /*
>> +         * we started in an allocated area and reached a hole. req->bytes
>> +         * contains the length of the allocated area before the hole.
>> +         * stop further processing.
>> +         */
>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>> +    }
>> +    if (!req->exists && exists && offs > req->offs) {
>> +        /*
>> +         * we started in an unallocated area and hit the first allocated
>> +         * block. req->bytes must be set to the length of the unallocated area
>> +         * before the allocated area. stop further processing.
>> +         */
>> +        req->bytes = offs - req->offs;
>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>> +    }
>> +
>> +    /*
>> +     * assert that we catched all cases above and allocation state has not
> catched -> caught
>
>> +     * changed during callbacks.
>> +     */
>> +    assert(exists == req->exists || !req->bytes);
>> +    req->exists = exists;
>> +
>> +    /*
>> +     * assert that we either return an unallocated block or have got callbacks
>> +     * for all allocated blocks present.
>> +     */
>> +    assert(!req->exists || offs == req->offs + req->bytes);
>> +    req->bytes = offs + len - req->offs;
>> +
>> +    return 0;
>> +}
>> +
>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>> +                                                 bool want_zero, int64_t offset,
>> +                                                 int64_t bytes, int64_t *pnum,
>> +                                                 int64_t *map,
>> +                                                 BlockDriverState **file)
>> +{
>> +    BDRVRBDState *s = bs->opaque;
>> +    int ret, r;
>> +    struct rbd_diff_req req = { .offs = offset };
>> +
>> +    assert(offset + bytes <= s->image_size);
>> +
>> +    /* default to all sectors allocated */
>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> I'm a little confused by the meaning of these flags (but I haven't
> looked at the other drivers yet).  Looks like this patch always sets
> BDRV_BLOCK_OFFSET_VALID (makes sense since the "host" offset is always
> known for rbd) and returns either BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO.
>
> DATA ZERO OFFSET_VALID
>   t    t        t       sectors read as zero, returned file is zero at offset
>   t    f        t       sectors read as valid from file at offset
>   f    t        t       sectors preallocated, read as zero, returned file not
>                         necessarily zero at offset
>   f    f        t       sectors preallocated but read from backing_hd,
>                         returned file contains garbage at offset
>
> What about the first case (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)?
> What is the practical difference to just BDRV_BLOCK_ZERO?


Actually I don't know, I adapted the flags from other drivers. Looking at your

table it seems that DATA + ZERO would be more appropriate for rbd, right?

We do not preallocate in qem/rbd yet.


>
>> +    if (map) {
>> +        *map = offset;
>> +    }
>> +    if (file) {
>> +        *file = bs;
>> +    }
> A comment in block_int.h says that map and file are guaranteed to be
> non-NULL so these tests seem redundant?


This is also Copy&Paste from other drivers, will change it (and maybe

in other drivers as well).



>
>> +    *pnum = bytes;
>> +
>> +    /* RBD image does not support fast-diff */
>> +    if (!(s->features & RBD_FEATURE_FAST_DIFF)) {
>> +        goto out;
>> +    }
> Need to make sure that fast-diff is valid here: call rbd_get_flags()
> on the image and test for !RBD_FLAG_FAST_DIFF_INVALID.


Do I have to check for feature FAST_DIFF + flag !FAST_DIFF_INVALID or

is the second enough? Is this call fast?


>
>> +
>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>> +                          qemu_rbd_co_block_status_cb, &req);
>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>> +        goto out;
>> +    }
>> +    assert(req.bytes <= bytes);
>> +    if (!req.exists) {
>> +        if (r == 0 && !req.bytes) {
>> +            /*
>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>> +             * except for the case where an overlay has a hole where the parent
>> +             * has not. This here catches the case where no callback was
>> +             * invoked at all.
>> +             */
>> +            req.bytes = bytes;
>> +        }
>> +        ret &= ~BDRV_BLOCK_DATA;
>> +        ret |= BDRV_BLOCK_ZERO;
> Is this equivalent to ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID?
> If so, that would be clearer IMO.


Right. I will change this.


Peter
Ilya Dryomov Aug. 24, 2021, 8:39 p.m. UTC | #3
On Mon, Aug 23, 2021 at 11:38 AM Peter Lieven <pl@kamp.de> wrote:
>
> Am 22.08.21 um 23:02 schrieb Ilya Dryomov:
> > On Tue, Aug 10, 2021 at 3:41 PM Peter Lieven <pl@kamp.de> wrote:
> >> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> >> This results mainly in incorrect progress during block operations (e.g.
> >> qemu-img convert with an rbd image as source).
> >>
> >> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> >> allocated and unallocated (all zero areas).
> >>
> >> To avoid querying the ceph OSDs for the answer this is only done if
> >> the image has the fast-diff features which depends on the object-map
> > Hi Peter,
> >
> > Nit: "has the fast-diff feature which depends on the object-map and
> > exclusive-lock features"
>
>
> will reword in V3.
>
>
> >
> >> and exclusive-lock. In this case it is guaranteed that the information
> >> is present in memory in the librbd client and thus very fast.
> >>
> >> If fast-diff is not available all areas are reported to be allocated
> >> which is the current behaviour if bdrv_co_block_status is not implemented.
> >>
> >> Signed-off-by: Peter Lieven <pl@kamp.de>
> >> ---
> >> V1->V2:
> >> - add commit comment [Stefano]
> >> - use failed_post_open [Stefano]
> >> - remove redundant assert [Stefano]
> >> - add macro+comment for the magic -9000 value [Stefano]
> >> - always set *file if its non NULL [Stefano]
> >>
> >>   block/rbd.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 125 insertions(+)
> >>
> >> diff --git a/block/rbd.c b/block/rbd.c
> >> index dcf82b15b8..8692e76f40 100644
> >> --- a/block/rbd.c
> >> +++ b/block/rbd.c
> >> @@ -88,6 +88,7 @@ typedef struct BDRVRBDState {
> >>       char *namespace;
> >>       uint64_t image_size;
> >>       uint64_t object_size;
> >> +    uint64_t features;
> >>   } BDRVRBDState;
> >>
> >>   typedef struct RBDTask {
> >> @@ -983,6 +984,13 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
> >>       s->image_size = info.size;
> >>       s->object_size = info.obj_size;
> >>
> >> +    r = rbd_get_features(s->image, &s->features);
> >> +    if (r < 0) {
> >> +        error_setg_errno(errp, -r, "error getting image features from %s",
> >> +                         s->image_name);
> >> +        goto failed_post_open;
> >> +    }
> > The object-map and fast-diff features can be enabled/disabled while the
> > image is open so this should probably go to qemu_rbd_co_block_status().
> >
> >> +
> >>       /* If we are using an rbd snapshot, we must be r/o, otherwise
> >>        * leave as-is */
> >>       if (s->snap != NULL) {
> >> @@ -1259,6 +1267,122 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
> >>       return spec_info;
> >>   }
> >>
> >> +typedef struct rbd_diff_req {
> >> +    uint64_t offs;
> >> +    uint64_t bytes;
> >> +    int exists;
> >> +} rbd_diff_req;
> >> +
> >> +/*
> >> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> >> + * value in the callback routine. Choose a value that does not conflict with
> >> + * an existing exitcode and return it if we want to prematurely stop the
> >> + * execution because we detected a change in the allocation status.
> >> + */
> >> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> >> +
> >> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> >> +                                       int exists, void *opaque)
> >> +{
> >> +    struct rbd_diff_req *req = opaque;
> >> +
> >> +    assert(req->offs + req->bytes <= offs);
> >> +
> >> +    if (req->exists && offs > req->offs + req->bytes) {
> >> +        /*
> >> +         * we started in an allocated area and jumped over an unallocated area,
> >> +         * req->bytes contains the length of the allocated area before the
> >> +         * unallocated area. stop further processing.
> >> +         */
> >> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >> +    }
> >> +    if (req->exists && !exists) {
> >> +        /*
> >> +         * we started in an allocated area and reached a hole. req->bytes
> >> +         * contains the length of the allocated area before the hole.
> >> +         * stop further processing.
> >> +         */
> >> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >> +    }
> >> +    if (!req->exists && exists && offs > req->offs) {
> >> +        /*
> >> +         * we started in an unallocated area and hit the first allocated
> >> +         * block. req->bytes must be set to the length of the unallocated area
> >> +         * before the allocated area. stop further processing.
> >> +         */
> >> +        req->bytes = offs - req->offs;
> >> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >> +    }
> >> +
> >> +    /*
> >> +     * assert that we catched all cases above and allocation state has not
> > catched -> caught
> >
> >> +     * changed during callbacks.
> >> +     */
> >> +    assert(exists == req->exists || !req->bytes);
> >> +    req->exists = exists;
> >> +
> >> +    /*
> >> +     * assert that we either return an unallocated block or have got callbacks
> >> +     * for all allocated blocks present.
> >> +     */
> >> +    assert(!req->exists || offs == req->offs + req->bytes);
> >> +    req->bytes = offs + len - req->offs;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> >> +                                                 bool want_zero, int64_t offset,
> >> +                                                 int64_t bytes, int64_t *pnum,
> >> +                                                 int64_t *map,
> >> +                                                 BlockDriverState **file)
> >> +{
> >> +    BDRVRBDState *s = bs->opaque;
> >> +    int ret, r;
> >> +    struct rbd_diff_req req = { .offs = offset };
> >> +
> >> +    assert(offset + bytes <= s->image_size);
> >> +
> >> +    /* default to all sectors allocated */
> >> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> > I'm a little confused by the meaning of these flags (but I haven't
> > looked at the other drivers yet).  Looks like this patch always sets
> > BDRV_BLOCK_OFFSET_VALID (makes sense since the "host" offset is always
> > known for rbd) and returns either BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO.
> >
> > DATA ZERO OFFSET_VALID
> >   t    t        t       sectors read as zero, returned file is zero at offset
> >   t    f        t       sectors read as valid from file at offset
> >   f    t        t       sectors preallocated, read as zero, returned file not
> >                         necessarily zero at offset
> >   f    f        t       sectors preallocated but read from backing_hd,
> >                         returned file contains garbage at offset
> >
> > What about the first case (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)?
> > What is the practical difference to just BDRV_BLOCK_ZERO?
>
>
> Actually I don't know, I adapted the flags from other drivers. Looking at your
> table it seems that DATA + ZERO would be more appropriate for rbd, right?

It would seem that way but I haven't had a chance to look at how qemu
block layer interprets these yet.

>
> We do not preallocate in qem/rbd yet.

I guess it depends on what preallocated means here, if it talks about
the logical address space or the physical sectors.  If the former, rbd
can be thought of as always preallocated.

Separately, rbd_read() at an offset that corresponds to an unallocated
area (including in parent image(s)) would always return zeroes.  I wonder
if that is the same as "returned file is zero at offset".

>
>
> >
> >> +    if (map) {
> >> +        *map = offset;
> >> +    }
> >> +    if (file) {
> >> +        *file = bs;
> >> +    }
> > A comment in block_int.h says that map and file are guaranteed to be
> > non-NULL so these tests seem redundant?
>
>
> This is also Copy&Paste from other drivers, will change it (and maybe
>
> in other drivers as well).
>
>
>
> >
> >> +    *pnum = bytes;
> >> +
> >> +    /* RBD image does not support fast-diff */
> >> +    if (!(s->features & RBD_FEATURE_FAST_DIFF)) {
> >> +        goto out;
> >> +    }
> > Need to make sure that fast-diff is valid here: call rbd_get_flags()
> > on the image and test for !RBD_FLAG_FAST_DIFF_INVALID.
>
>
> Do I have to check for feature FAST_DIFF + flag !FAST_DIFF_INVALID or
>
> is the second enough? Is this call fast?

For both.  The call should be as fast as rbd_get_features().

Thanks,

                Ilya
Peter Lieven Sept. 2, 2021, 2:12 p.m. UTC | #4
Am 24.08.21 um 22:39 schrieb Ilya Dryomov:
> On Mon, Aug 23, 2021 at 11:38 AM Peter Lieven <pl@kamp.de> wrote:
>> Am 22.08.21 um 23:02 schrieb Ilya Dryomov:
>>> On Tue, Aug 10, 2021 at 3:41 PM Peter Lieven <pl@kamp.de> wrote:
>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>> This results mainly in incorrect progress during block operations (e.g.
>>>> qemu-img convert with an rbd image as source).
>>>>
>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>> allocated and unallocated (all zero areas).
>>>>
>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>> the image has the fast-diff features which depends on the object-map
>>> Hi Peter,
>>>
>>> Nit: "has the fast-diff feature which depends on the object-map and
>>> exclusive-lock features"
>>
>> will reword in V3.
>>
>>
>>>> and exclusive-lock. In this case it is guaranteed that the information
>>>> is present in memory in the librbd client and thus very fast.
>>>>
>>>> If fast-diff is not available all areas are reported to be allocated
>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>
>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>> ---
>>>> V1->V2:
>>>> - add commit comment [Stefano]
>>>> - use failed_post_open [Stefano]
>>>> - remove redundant assert [Stefano]
>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>> - always set *file if its non NULL [Stefano]
>>>>
>>>>    block/rbd.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 125 insertions(+)
>>>>
>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>> index dcf82b15b8..8692e76f40 100644
>>>> --- a/block/rbd.c
>>>> +++ b/block/rbd.c
>>>> @@ -88,6 +88,7 @@ typedef struct BDRVRBDState {
>>>>        char *namespace;
>>>>        uint64_t image_size;
>>>>        uint64_t object_size;
>>>> +    uint64_t features;
>>>>    } BDRVRBDState;
>>>>
>>>>    typedef struct RBDTask {
>>>> @@ -983,6 +984,13 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>>>>        s->image_size = info.size;
>>>>        s->object_size = info.obj_size;
>>>>
>>>> +    r = rbd_get_features(s->image, &s->features);
>>>> +    if (r < 0) {
>>>> +        error_setg_errno(errp, -r, "error getting image features from %s",
>>>> +                         s->image_name);
>>>> +        goto failed_post_open;
>>>> +    }
>>> The object-map and fast-diff features can be enabled/disabled while the
>>> image is open so this should probably go to qemu_rbd_co_block_status().
>>>
>>>> +
>>>>        /* If we are using an rbd snapshot, we must be r/o, otherwise
>>>>         * leave as-is */
>>>>        if (s->snap != NULL) {
>>>> @@ -1259,6 +1267,122 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>        return spec_info;
>>>>    }
>>>>
>>>> +typedef struct rbd_diff_req {
>>>> +    uint64_t offs;
>>>> +    uint64_t bytes;
>>>> +    int exists;
>>>> +} rbd_diff_req;
>>>> +
>>>> +/*
>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>> + * execution because we detected a change in the allocation status.
>>>> + */
>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>> +
>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>> +                                       int exists, void *opaque)
>>>> +{
>>>> +    struct rbd_diff_req *req = opaque;
>>>> +
>>>> +    assert(req->offs + req->bytes <= offs);
>>>> +
>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>> +        /*
>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>> +         * req->bytes contains the length of the allocated area before the
>>>> +         * unallocated area. stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +    if (req->exists && !exists) {
>>>> +        /*
>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>> +         * contains the length of the allocated area before the hole.
>>>> +         * stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>> +        /*
>>>> +         * we started in an unallocated area and hit the first allocated
>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>> +         * before the allocated area. stop further processing.
>>>> +         */
>>>> +        req->bytes = offs - req->offs;
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * assert that we catched all cases above and allocation state has not
>>> catched -> caught
>>>
>>>> +     * changed during callbacks.
>>>> +     */
>>>> +    assert(exists == req->exists || !req->bytes);
>>>> +    req->exists = exists;
>>>> +
>>>> +    /*
>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>> +     * for all allocated blocks present.
>>>> +     */
>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>> +    req->bytes = offs + len - req->offs;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>> +                                                 bool want_zero, int64_t offset,
>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>> +                                                 int64_t *map,
>>>> +                                                 BlockDriverState **file)
>>>> +{
>>>> +    BDRVRBDState *s = bs->opaque;
>>>> +    int ret, r;
>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>> +
>>>> +    assert(offset + bytes <= s->image_size);
>>>> +
>>>> +    /* default to all sectors allocated */
>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>> I'm a little confused by the meaning of these flags (but I haven't
>>> looked at the other drivers yet).  Looks like this patch always sets
>>> BDRV_BLOCK_OFFSET_VALID (makes sense since the "host" offset is always
>>> known for rbd) and returns either BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO.
>>>
>>> DATA ZERO OFFSET_VALID
>>>    t    t        t       sectors read as zero, returned file is zero at offset
>>>    t    f        t       sectors read as valid from file at offset
>>>    f    t        t       sectors preallocated, read as zero, returned file not
>>>                          necessarily zero at offset
>>>    f    f        t       sectors preallocated but read from backing_hd,
>>>                          returned file contains garbage at offset
>>>
>>> What about the first case (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)?
>>> What is the practical difference to just BDRV_BLOCK_ZERO?
>>
>> Actually I don't know, I adapted the flags from other drivers. Looking at your
>> table it seems that DATA + ZERO would be more appropriate for rbd, right?
> It would seem that way but I haven't had a chance to look at how qemu
> block layer interprets these yet.
>
>> We do not preallocate in qem/rbd yet.
> I guess it depends on what preallocated means here, if it talks about
> the logical address space or the physical sectors.  If the former, rbd
> can be thought of as always preallocated.
>
> Separately, rbd_read() at an offset that corresponds to an unallocated
> area (including in parent image(s)) would always return zeroes.  I wonder
> if that is the same as "returned file is zero at offset".


It seems that all other drivers either set DATA or ZERO. I would leave it

the same for RBD.


>
>>
>>>> +    if (map) {
>>>> +        *map = offset;
>>>> +    }
>>>> +    if (file) {
>>>> +        *file = bs;
>>>> +    }
>>> A comment in block_int.h says that map and file are guaranteed to be
>>> non-NULL so these tests seem redundant?
>>
>> This is also Copy&Paste from other drivers, will change it (and maybe
>>
>> in other drivers as well).
>>
>>
>>
>>>> +    *pnum = bytes;
>>>> +
>>>> +    /* RBD image does not support fast-diff */
>>>> +    if (!(s->features & RBD_FEATURE_FAST_DIFF)) {
>>>> +        goto out;
>>>> +    }
>>> Need to make sure that fast-diff is valid here: call rbd_get_flags()
>>> on the image and test for !RBD_FLAG_FAST_DIFF_INVALID.
>>
>> Do I have to check for feature FAST_DIFF + flag !FAST_DIFF_INVALID or
>>
>> is the second enough? Is this call fast?
> For both.  The call should be as fast as rbd_get_features().


The question was if this is handled solely in the client?


Peter
Ilya Dryomov Sept. 5, 2021, 7:04 p.m. UTC | #5
On Thu, Sep 2, 2021 at 4:12 PM Peter Lieven <pl@kamp.de> wrote:
>
> Am 24.08.21 um 22:39 schrieb Ilya Dryomov:
> > On Mon, Aug 23, 2021 at 11:38 AM Peter Lieven <pl@kamp.de> wrote:
> >> Am 22.08.21 um 23:02 schrieb Ilya Dryomov:
> >>> On Tue, Aug 10, 2021 at 3:41 PM Peter Lieven <pl@kamp.de> wrote:
> >>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> >>>> This results mainly in incorrect progress during block operations (e.g.
> >>>> qemu-img convert with an rbd image as source).
> >>>>
> >>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> >>>> allocated and unallocated (all zero areas).
> >>>>
> >>>> To avoid querying the ceph OSDs for the answer this is only done if
> >>>> the image has the fast-diff features which depends on the object-map
> >>> Hi Peter,
> >>>
> >>> Nit: "has the fast-diff feature which depends on the object-map and
> >>> exclusive-lock features"
> >>
> >> will reword in V3.
> >>
> >>
> >>>> and exclusive-lock. In this case it is guaranteed that the information
> >>>> is present in memory in the librbd client and thus very fast.
> >>>>
> >>>> If fast-diff is not available all areas are reported to be allocated
> >>>> which is the current behaviour if bdrv_co_block_status is not implemented.
> >>>>
> >>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>> ---
> >>>> V1->V2:
> >>>> - add commit comment [Stefano]
> >>>> - use failed_post_open [Stefano]
> >>>> - remove redundant assert [Stefano]
> >>>> - add macro+comment for the magic -9000 value [Stefano]
> >>>> - always set *file if its non NULL [Stefano]
> >>>>
> >>>>    block/rbd.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>    1 file changed, 125 insertions(+)
> >>>>
> >>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>> index dcf82b15b8..8692e76f40 100644
> >>>> --- a/block/rbd.c
> >>>> +++ b/block/rbd.c
> >>>> @@ -88,6 +88,7 @@ typedef struct BDRVRBDState {
> >>>>        char *namespace;
> >>>>        uint64_t image_size;
> >>>>        uint64_t object_size;
> >>>> +    uint64_t features;
> >>>>    } BDRVRBDState;
> >>>>
> >>>>    typedef struct RBDTask {
> >>>> @@ -983,6 +984,13 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
> >>>>        s->image_size = info.size;
> >>>>        s->object_size = info.obj_size;
> >>>>
> >>>> +    r = rbd_get_features(s->image, &s->features);
> >>>> +    if (r < 0) {
> >>>> +        error_setg_errno(errp, -r, "error getting image features from %s",
> >>>> +                         s->image_name);
> >>>> +        goto failed_post_open;
> >>>> +    }
> >>> The object-map and fast-diff features can be enabled/disabled while the
> >>> image is open so this should probably go to qemu_rbd_co_block_status().
> >>>
> >>>> +
> >>>>        /* If we are using an rbd snapshot, we must be r/o, otherwise
> >>>>         * leave as-is */
> >>>>        if (s->snap != NULL) {
> >>>> @@ -1259,6 +1267,122 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
> >>>>        return spec_info;
> >>>>    }
> >>>>
> >>>> +typedef struct rbd_diff_req {
> >>>> +    uint64_t offs;
> >>>> +    uint64_t bytes;
> >>>> +    int exists;
> >>>> +} rbd_diff_req;
> >>>> +
> >>>> +/*
> >>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> >>>> + * value in the callback routine. Choose a value that does not conflict with
> >>>> + * an existing exitcode and return it if we want to prematurely stop the
> >>>> + * execution because we detected a change in the allocation status.
> >>>> + */
> >>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> >>>> +
> >>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> >>>> +                                       int exists, void *opaque)
> >>>> +{
> >>>> +    struct rbd_diff_req *req = opaque;
> >>>> +
> >>>> +    assert(req->offs + req->bytes <= offs);
> >>>> +
> >>>> +    if (req->exists && offs > req->offs + req->bytes) {
> >>>> +        /*
> >>>> +         * we started in an allocated area and jumped over an unallocated area,
> >>>> +         * req->bytes contains the length of the allocated area before the
> >>>> +         * unallocated area. stop further processing.
> >>>> +         */
> >>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>> +    }
> >>>> +    if (req->exists && !exists) {
> >>>> +        /*
> >>>> +         * we started in an allocated area and reached a hole. req->bytes
> >>>> +         * contains the length of the allocated area before the hole.
> >>>> +         * stop further processing.
> >>>> +         */
> >>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>> +    }
> >>>> +    if (!req->exists && exists && offs > req->offs) {
> >>>> +        /*
> >>>> +         * we started in an unallocated area and hit the first allocated
> >>>> +         * block. req->bytes must be set to the length of the unallocated area
> >>>> +         * before the allocated area. stop further processing.
> >>>> +         */
> >>>> +        req->bytes = offs - req->offs;
> >>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>> +    }
> >>>> +
> >>>> +    /*
> >>>> +     * assert that we catched all cases above and allocation state has not
> >>> catched -> caught
> >>>
> >>>> +     * changed during callbacks.
> >>>> +     */
> >>>> +    assert(exists == req->exists || !req->bytes);
> >>>> +    req->exists = exists;
> >>>> +
> >>>> +    /*
> >>>> +     * assert that we either return an unallocated block or have got callbacks
> >>>> +     * for all allocated blocks present.
> >>>> +     */
> >>>> +    assert(!req->exists || offs == req->offs + req->bytes);
> >>>> +    req->bytes = offs + len - req->offs;
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> >>>> +                                                 bool want_zero, int64_t offset,
> >>>> +                                                 int64_t bytes, int64_t *pnum,
> >>>> +                                                 int64_t *map,
> >>>> +                                                 BlockDriverState **file)
> >>>> +{
> >>>> +    BDRVRBDState *s = bs->opaque;
> >>>> +    int ret, r;
> >>>> +    struct rbd_diff_req req = { .offs = offset };
> >>>> +
> >>>> +    assert(offset + bytes <= s->image_size);
> >>>> +
> >>>> +    /* default to all sectors allocated */
> >>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> >>> I'm a little confused by the meaning of these flags (but I haven't
> >>> looked at the other drivers yet).  Looks like this patch always sets
> >>> BDRV_BLOCK_OFFSET_VALID (makes sense since the "host" offset is always
> >>> known for rbd) and returns either BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO.
> >>>
> >>> DATA ZERO OFFSET_VALID
> >>>    t    t        t       sectors read as zero, returned file is zero at offset
> >>>    t    f        t       sectors read as valid from file at offset
> >>>    f    t        t       sectors preallocated, read as zero, returned file not
> >>>                          necessarily zero at offset
> >>>    f    f        t       sectors preallocated but read from backing_hd,
> >>>                          returned file contains garbage at offset
> >>>
> >>> What about the first case (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)?
> >>> What is the practical difference to just BDRV_BLOCK_ZERO?
> >>
> >> Actually I don't know, I adapted the flags from other drivers. Looking at your
> >> table it seems that DATA + ZERO would be more appropriate for rbd, right?
> > It would seem that way but I haven't had a chance to look at how qemu
> > block layer interprets these yet.
> >
> >> We do not preallocate in qem/rbd yet.
> > I guess it depends on what preallocated means here, if it talks about
> > the logical address space or the physical sectors.  If the former, rbd
> > can be thought of as always preallocated.
> >
> > Separately, rbd_read() at an offset that corresponds to an unallocated
> > area (including in parent image(s)) would always return zeroes.  I wonder
> > if that is the same as "returned file is zero at offset".
>
>
> It seems that all other drivers either set DATA or ZERO. I would leave it
>
> the same for RBD.

Agreed.  It looks like it has to do with layering and backing files and
since rbd layering isn't exposed to qemu let's not over complicate this.

>
>
> >
> >>
> >>>> +    if (map) {
> >>>> +        *map = offset;
> >>>> +    }
> >>>> +    if (file) {
> >>>> +        *file = bs;
> >>>> +    }
> >>> A comment in block_int.h says that map and file are guaranteed to be
> >>> non-NULL so these tests seem redundant?
> >>
> >> This is also Copy&Paste from other drivers, will change it (and maybe
> >>
> >> in other drivers as well).
> >>
> >>
> >>
> >>>> +    *pnum = bytes;
> >>>> +
> >>>> +    /* RBD image does not support fast-diff */
> >>>> +    if (!(s->features & RBD_FEATURE_FAST_DIFF)) {
> >>>> +        goto out;
> >>>> +    }
> >>> Need to make sure that fast-diff is valid here: call rbd_get_flags()
> >>> on the image and test for !RBD_FLAG_FAST_DIFF_INVALID.
> >>
> >> Do I have to check for feature FAST_DIFF + flag !FAST_DIFF_INVALID or
> >>
> >> is the second enough? Is this call fast?
> > For both.  The call should be as fast as rbd_get_features().
>
>
> The question was if this is handled solely in the client?

Yes, most of the time.  Both image features and flags are stored in
the header object and the in-memory copy of the header may need to be
refreshed if the client gets a header update notification in the
interim.

Thanks,

                Ilya
diff mbox series

Patch

diff --git a/block/rbd.c b/block/rbd.c
index dcf82b15b8..8692e76f40 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -88,6 +88,7 @@  typedef struct BDRVRBDState {
     char *namespace;
     uint64_t image_size;
     uint64_t object_size;
+    uint64_t features;
 } BDRVRBDState;
 
 typedef struct RBDTask {
@@ -983,6 +984,13 @@  static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
     s->image_size = info.size;
     s->object_size = info.obj_size;
 
+    r = rbd_get_features(s->image, &s->features);
+    if (r < 0) {
+        error_setg_errno(errp, -r, "error getting image features from %s",
+                         s->image_name);
+        goto failed_post_open;
+    }
+
     /* If we are using an rbd snapshot, we must be r/o, otherwise
      * leave as-is */
     if (s->snap != NULL) {
@@ -1259,6 +1267,122 @@  static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
     return spec_info;
 }
 
+typedef struct rbd_diff_req {
+    uint64_t offs;
+    uint64_t bytes;
+    int exists;
+} rbd_diff_req;
+
+/*
+ * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
+ * value in the callback routine. Choose a value that does not conflict with
+ * an existing exitcode and return it if we want to prematurely stop the
+ * execution because we detected a change in the allocation status.
+ */
+#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+
+static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
+                                       int exists, void *opaque)
+{
+    struct rbd_diff_req *req = opaque;
+
+    assert(req->offs + req->bytes <= offs);
+
+    if (req->exists && offs > req->offs + req->bytes) {
+        /*
+         * we started in an allocated area and jumped over an unallocated area,
+         * req->bytes contains the length of the allocated area before the
+         * unallocated area. stop further processing.
+         */
+        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+    }
+    if (req->exists && !exists) {
+        /*
+         * we started in an allocated area and reached a hole. req->bytes
+         * contains the length of the allocated area before the hole.
+         * stop further processing.
+         */
+        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+    }
+    if (!req->exists && exists && offs > req->offs) {
+        /*
+         * we started in an unallocated area and hit the first allocated
+         * block. req->bytes must be set to the length of the unallocated area
+         * before the allocated area. stop further processing.
+         */
+        req->bytes = offs - req->offs;
+        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+    }
+
+    /*
+     * assert that we catched all cases above and allocation state has not
+     * changed during callbacks.
+     */
+    assert(exists == req->exists || !req->bytes);
+    req->exists = exists;
+
+    /*
+     * assert that we either return an unallocated block or have got callbacks
+     * for all allocated blocks present.
+     */
+    assert(!req->exists || offs == req->offs + req->bytes);
+    req->bytes = offs + len - req->offs;
+
+    return 0;
+}
+
+static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+                                                 bool want_zero, int64_t offset,
+                                                 int64_t bytes, int64_t *pnum,
+                                                 int64_t *map,
+                                                 BlockDriverState **file)
+{
+    BDRVRBDState *s = bs->opaque;
+    int ret, r;
+    struct rbd_diff_req req = { .offs = offset };
+
+    assert(offset + bytes <= s->image_size);
+
+    /* default to all sectors allocated */
+    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    if (map) {
+        *map = offset;
+    }
+    if (file) {
+        *file = bs;
+    }
+    *pnum = bytes;
+
+    /* RBD image does not support fast-diff */
+    if (!(s->features & RBD_FEATURE_FAST_DIFF)) {
+        goto out;
+    }
+
+    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+                          qemu_rbd_co_block_status_cb, &req);
+    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+        goto out;
+    }
+    assert(req.bytes <= bytes);
+    if (!req.exists) {
+        if (r == 0 && !req.bytes) {
+            /*
+             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
+             * except for the case where an overlay has a hole where the parent
+             * has not. This here catches the case where no callback was
+             * invoked at all.
+             */
+            req.bytes = bytes;
+        }
+        ret &= ~BDRV_BLOCK_DATA;
+        ret |= BDRV_BLOCK_ZERO;
+    }
+    *pnum = req.bytes;
+
+out:
+    return ret;
+}
+
 static int64_t qemu_rbd_getlength(BlockDriverState *bs)
 {
     BDRVRBDState *s = bs->opaque;
@@ -1494,6 +1618,7 @@  static BlockDriver bdrv_rbd = {
 #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
     .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
 #endif
+    .bdrv_co_block_status   = qemu_rbd_co_block_status,
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,