diff mbox series

[V3] block/rbd: implement bdrv_co_block_status

Message ID 20210916122116.802-1-pl@kamp.de (mailing list archive)
State New, archived
Headers show
Series [V3] block/rbd: implement bdrv_co_block_status | expand

Commit Message

Peter Lieven Sept. 16, 2021, 12:21 p.m. UTC
the qemu rbd driver currently lacks support for bdrv_co_block_status.
This results mainly in incorrect progress during block operations (e.g.
qemu-img convert with an rbd image as source).

This patch utilizes the rbd_diff_iterate2 call from librbd to detect
allocated and unallocated (all zero areas).

To avoid querying the ceph OSDs for the answer this is only done if
the image has the fast-diff feature which depends on the object-map and
exclusive-lock features. In this case it is guaranteed that the information
is present in memory in the librbd client and thus very fast.

If fast-diff is not available all areas are reported to be allocated
which is the current behaviour if bdrv_co_block_status is not implemented.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
V2->V3:
- check rbd_flags every time (they can change during runtime) [Ilya]
- also check for fast-diff invalid flag [Ilya]
- *map and *file cant be NULL [Ilya]
- set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
  unallocated area [Ilya]
- typo: catched -> caught [Ilya]
- changed wording about fast-diff, object-map and exclusive lock in
  commit msg [Ilya]

V1->V2:
- add commit comment [Stefano]
- use failed_post_open [Stefano]
- remove redundant assert [Stefano]
- add macro+comment for the magic -9000 value [Stefano]
- always set *file if its non NULL [Stefano]

 block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

Comments

Ilya Dryomov Oct. 5, 2021, 7:54 a.m. UTC | #1
On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>
> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> This results mainly in incorrect progress during block operations (e.g.
> qemu-img convert with an rbd image as source).
>
> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> allocated and unallocated (all zero areas).
>
> To avoid querying the ceph OSDs for the answer this is only done if
> the image has the fast-diff feature which depends on the object-map and
> exclusive-lock features. In this case it is guaranteed that the information
> is present in memory in the librbd client and thus very fast.
>
> If fast-diff is not available all areas are reported to be allocated
> which is the current behaviour if bdrv_co_block_status is not implemented.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
> V2->V3:
> - check rbd_flags every time (they can change during runtime) [Ilya]
> - also check for fast-diff invalid flag [Ilya]
> - *map and *file cant be NULL [Ilya]
> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>   unallocated area [Ilya]
> - typo: catched -> caught [Ilya]
> - changed wording about fast-diff, object-map and exclusive lock in
>   commit msg [Ilya]
>
> V1->V2:
> - add commit comment [Stefano]
> - use failed_post_open [Stefano]
> - remove redundant assert [Stefano]
> - add macro+comment for the magic -9000 value [Stefano]
> - always set *file if its non NULL [Stefano]
>
>  block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 126 insertions(+)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index dcf82b15b8..3cb24f9981 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>      return spec_info;
>  }
>
> +typedef struct rbd_diff_req {
> +    uint64_t offs;
> +    uint64_t bytes;
> +    int exists;

Hi Peter,

Nit: make exists a bool.  The one in the callback has to be an int
because of the callback signature but let's not spread that.

> +} rbd_diff_req;
> +
> +/*
> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> + * value in the callback routine. Choose a value that does not conflict with
> + * an existing exitcode and return it if we want to prematurely stop the
> + * execution because we detected a change in the allocation status.
> + */
> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> +
> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> +                                       int exists, void *opaque)
> +{
> +    struct rbd_diff_req *req = opaque;
> +
> +    assert(req->offs + req->bytes <= offs);
> +
> +    if (req->exists && offs > req->offs + req->bytes) {
> +        /*
> +         * we started in an allocated area and jumped over an unallocated area,
> +         * req->bytes contains the length of the allocated area before the
> +         * unallocated area. stop further processing.
> +         */
> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> +    }
> +    if (req->exists && !exists) {
> +        /*
> +         * we started in an allocated area and reached a hole. req->bytes
> +         * contains the length of the allocated area before the hole.
> +         * stop further processing.
> +         */
> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;

Do you have a test case for when this branch is taken?

> +    }
> +    if (!req->exists && exists && offs > req->offs) {
> +        /*
> +         * we started in an unallocated area and hit the first allocated
> +         * block. req->bytes must be set to the length of the unallocated area
> +         * before the allocated area. stop further processing.
> +         */
> +        req->bytes = offs - req->offs;
> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> +    }
> +
> +    /*
> +     * assert that we caught all cases above and allocation state has not
> +     * changed during callbacks.
> +     */
> +    assert(exists == req->exists || !req->bytes);
> +    req->exists = exists;
> +
> +    /*
> +     * assert that we either return an unallocated block or have got callbacks
> +     * for all allocated blocks present.
> +     */
> +    assert(!req->exists || offs == req->offs + req->bytes);
> +    req->bytes = offs + len - req->offs;
> +
> +    return 0;
> +}
> +
> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> +                                                 bool want_zero, int64_t offset,
> +                                                 int64_t bytes, int64_t *pnum,
> +                                                 int64_t *map,
> +                                                 BlockDriverState **file)
> +{
> +    BDRVRBDState *s = bs->opaque;
> +    int ret, r;

Nit: I would rename ret to status or something like that to make
it clear(er) that it is an actual value and never an error.  Or,
even better, drop it entirely and return one of the two bitmasks
directly.

> +    struct rbd_diff_req req = { .offs = offset };
> +    uint64_t features, flags;
> +
> +    assert(offset + bytes <= s->image_size);
> +
> +    /* default to all sectors allocated */
> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> +    *map = offset;
> +    *file = bs;
> +    *pnum = bytes;
> +
> +    /* check if RBD image supports fast-diff */
> +    r = rbd_get_features(s->image, &features);
> +    if (r < 0) {
> +        goto out;
> +    }
> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
> +        goto out;
> +    }
> +
> +    /* check if RBD fast-diff result is valid */
> +    r = rbd_get_flags(s->image, &flags);
> +    if (r < 0) {
> +        goto out;
> +    }
> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
> +        goto out;

Nit: I'm not a fan of labels that cover just the return statement.
Feel free to ignore this one but I would get rid of it and replace
these gotos with returns.

> +    }
> +
> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
> +                          qemu_rbd_co_block_status_cb, &req);
> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
> +        goto out;
> +    }
> +    assert(req.bytes <= bytes);
> +    if (!req.exists) {
> +        if (r == 0 && !req.bytes) {
> +            /*
> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
> +             * except for the case where an overlay has a hole where the parent
> +             * has not. This here catches the case where no callback was
> +             * invoked at all.
> +             */

The above is true in the case of diffing against a snapshot, i.e. when
the "from" snapshot has some data where the "to" revision (whether HEAD
or another snapshot) has a hole.  I don't think it is true for child vs
parent (but it has been a while since I looked at the diff code).  As
long as NULL is passed for fromsnapname, I would expect the callback to
be invoked only for allocated areas.  If I'm right, we could simplify
qemu_rbd_co_block_status_cb() considerably.

Thanks,

                Ilya
Peter Lieven Oct. 5, 2021, 8:19 a.m. UTC | #2
Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>> This results mainly in incorrect progress during block operations (e.g.
>> qemu-img convert with an rbd image as source).
>>
>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>> allocated and unallocated (all zero areas).
>>
>> To avoid querying the ceph OSDs for the answer this is only done if
>> the image has the fast-diff feature which depends on the object-map and
>> exclusive-lock features. In this case it is guaranteed that the information
>> is present in memory in the librbd client and thus very fast.
>>
>> If fast-diff is not available all areas are reported to be allocated
>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>> V2->V3:
>> - check rbd_flags every time (they can change during runtime) [Ilya]
>> - also check for fast-diff invalid flag [Ilya]
>> - *map and *file cant be NULL [Ilya]
>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>    unallocated area [Ilya]
>> - typo: catched -> caught [Ilya]
>> - changed wording about fast-diff, object-map and exclusive lock in
>>    commit msg [Ilya]
>>
>> V1->V2:
>> - add commit comment [Stefano]
>> - use failed_post_open [Stefano]
>> - remove redundant assert [Stefano]
>> - add macro+comment for the magic -9000 value [Stefano]
>> - always set *file if its non NULL [Stefano]
>>
>>   block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 126 insertions(+)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index dcf82b15b8..3cb24f9981 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>       return spec_info;
>>   }
>>
>> +typedef struct rbd_diff_req {
>> +    uint64_t offs;
>> +    uint64_t bytes;
>> +    int exists;
> Hi Peter,
>
> Nit: make exists a bool.  The one in the callback has to be an int
> because of the callback signature but let's not spread that.
>
>> +} rbd_diff_req;
>> +
>> +/*
>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>> + * value in the callback routine. Choose a value that does not conflict with
>> + * an existing exitcode and return it if we want to prematurely stop the
>> + * execution because we detected a change in the allocation status.
>> + */
>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>> +
>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>> +                                       int exists, void *opaque)
>> +{
>> +    struct rbd_diff_req *req = opaque;
>> +
>> +    assert(req->offs + req->bytes <= offs);
>> +
>> +    if (req->exists && offs > req->offs + req->bytes) {
>> +        /*
>> +         * we started in an allocated area and jumped over an unallocated area,
>> +         * req->bytes contains the length of the allocated area before the
>> +         * unallocated area. stop further processing.
>> +         */
>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>> +    }
>> +    if (req->exists && !exists) {
>> +        /*
>> +         * we started in an allocated area and reached a hole. req->bytes
>> +         * contains the length of the allocated area before the hole.
>> +         * stop further processing.
>> +         */
>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> Do you have a test case for when this branch is taken?


That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?


>
>> +    }
>> +    if (!req->exists && exists && offs > req->offs) {
>> +        /*
>> +         * we started in an unallocated area and hit the first allocated
>> +         * block. req->bytes must be set to the length of the unallocated area
>> +         * before the allocated area. stop further processing.
>> +         */
>> +        req->bytes = offs - req->offs;
>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>> +    }
>> +
>> +    /*
>> +     * assert that we caught all cases above and allocation state has not
>> +     * changed during callbacks.
>> +     */
>> +    assert(exists == req->exists || !req->bytes);
>> +    req->exists = exists;
>> +
>> +    /*
>> +     * assert that we either return an unallocated block or have got callbacks
>> +     * for all allocated blocks present.
>> +     */
>> +    assert(!req->exists || offs == req->offs + req->bytes);
>> +    req->bytes = offs + len - req->offs;
>> +
>> +    return 0;
>> +}
>> +
>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>> +                                                 bool want_zero, int64_t offset,
>> +                                                 int64_t bytes, int64_t *pnum,
>> +                                                 int64_t *map,
>> +                                                 BlockDriverState **file)
>> +{
>> +    BDRVRBDState *s = bs->opaque;
>> +    int ret, r;
> Nit: I would rename ret to status or something like that to make
> it clear(er) that it is an actual value and never an error.  Or,
> even better, drop it entirely and return one of the two bitmasks
> directly.
>
>> +    struct rbd_diff_req req = { .offs = offset };
>> +    uint64_t features, flags;
>> +
>> +    assert(offset + bytes <= s->image_size);
>> +
>> +    /* default to all sectors allocated */
>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>> +    *map = offset;
>> +    *file = bs;
>> +    *pnum = bytes;
>> +
>> +    /* check if RBD image supports fast-diff */
>> +    r = rbd_get_features(s->image, &features);
>> +    if (r < 0) {
>> +        goto out;
>> +    }
>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>> +        goto out;
>> +    }
>> +
>> +    /* check if RBD fast-diff result is valid */
>> +    r = rbd_get_flags(s->image, &flags);
>> +    if (r < 0) {
>> +        goto out;
>> +    }
>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>> +        goto out;
> Nit: I'm not a fan of labels that cover just the return statement.
> Feel free to ignore this one but I would get rid of it and replace
> these gotos with returns.


That would be return with the bitmask directly coded in if I also

drop the ret variable. I can change that, no problem.


>
>> +    }
>> +
>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>> +                          qemu_rbd_co_block_status_cb, &req);
>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>> +        goto out;
>> +    }
>> +    assert(req.bytes <= bytes);
>> +    if (!req.exists) {
>> +        if (r == 0 && !req.bytes) {
>> +            /*
>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>> +             * except for the case where an overlay has a hole where the parent
>> +             * has not. This here catches the case where no callback was
>> +             * invoked at all.
>> +             */
> The above is true in the case of diffing against a snapshot, i.e. when
> the "from" snapshot has some data where the "to" revision (whether HEAD
> or another snapshot) has a hole.  I don't think it is true for child vs
> parent (but it has been a while since I looked at the diff code).  As
> long as NULL is passed for fromsnapname, I would expect the callback to
> be invoked only for allocated areas.  If I'm right, we could simplify
> qemu_rbd_co_block_status_cb() considerably.


See my comment in the callback. Can you look at the diff code or give

me at least a pointer where I can find it. I am not that familiar with

the librbd code yet.


Thanks,

Peter
Ilya Dryomov Oct. 5, 2021, 8:36 a.m. UTC | #3
On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>
> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
> > On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
> >> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> >> This results mainly in incorrect progress during block operations (e.g.
> >> qemu-img convert with an rbd image as source).
> >>
> >> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> >> allocated and unallocated (all zero areas).
> >>
> >> To avoid querying the ceph OSDs for the answer this is only done if
> >> the image has the fast-diff feature which depends on the object-map and
> >> exclusive-lock features. In this case it is guaranteed that the information
> >> is present in memory in the librbd client and thus very fast.
> >>
> >> If fast-diff is not available all areas are reported to be allocated
> >> which is the current behaviour if bdrv_co_block_status is not implemented.
> >>
> >> Signed-off-by: Peter Lieven <pl@kamp.de>
> >> ---
> >> V2->V3:
> >> - check rbd_flags every time (they can change during runtime) [Ilya]
> >> - also check for fast-diff invalid flag [Ilya]
> >> - *map and *file cant be NULL [Ilya]
> >> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
> >>    unallocated area [Ilya]
> >> - typo: catched -> caught [Ilya]
> >> - changed wording about fast-diff, object-map and exclusive lock in
> >>    commit msg [Ilya]
> >>
> >> V1->V2:
> >> - add commit comment [Stefano]
> >> - use failed_post_open [Stefano]
> >> - remove redundant assert [Stefano]
> >> - add macro+comment for the magic -9000 value [Stefano]
> >> - always set *file if its non NULL [Stefano]
> >>
> >>   block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>   1 file changed, 126 insertions(+)
> >>
> >> diff --git a/block/rbd.c b/block/rbd.c
> >> index dcf82b15b8..3cb24f9981 100644
> >> --- a/block/rbd.c
> >> +++ b/block/rbd.c
> >> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
> >>       return spec_info;
> >>   }
> >>
> >> +typedef struct rbd_diff_req {
> >> +    uint64_t offs;
> >> +    uint64_t bytes;
> >> +    int exists;
> > Hi Peter,
> >
> > Nit: make exists a bool.  The one in the callback has to be an int
> > because of the callback signature but let's not spread that.
> >
> >> +} rbd_diff_req;
> >> +
> >> +/*
> >> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> >> + * value in the callback routine. Choose a value that does not conflict with
> >> + * an existing exitcode and return it if we want to prematurely stop the
> >> + * execution because we detected a change in the allocation status.
> >> + */
> >> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> >> +
> >> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> >> +                                       int exists, void *opaque)
> >> +{
> >> +    struct rbd_diff_req *req = opaque;
> >> +
> >> +    assert(req->offs + req->bytes <= offs);
> >> +
> >> +    if (req->exists && offs > req->offs + req->bytes) {
> >> +        /*
> >> +         * we started in an allocated area and jumped over an unallocated area,
> >> +         * req->bytes contains the length of the allocated area before the
> >> +         * unallocated area. stop further processing.
> >> +         */
> >> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >> +    }
> >> +    if (req->exists && !exists) {
> >> +        /*
> >> +         * we started in an allocated area and reached a hole. req->bytes
> >> +         * contains the length of the allocated area before the hole.
> >> +         * stop further processing.
> >> +         */
> >> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> > Do you have a test case for when this branch is taken?
>
>
> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>
>
> >
> >> +    }
> >> +    if (!req->exists && exists && offs > req->offs) {
> >> +        /*
> >> +         * we started in an unallocated area and hit the first allocated
> >> +         * block. req->bytes must be set to the length of the unallocated area
> >> +         * before the allocated area. stop further processing.
> >> +         */
> >> +        req->bytes = offs - req->offs;
> >> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >> +    }
> >> +
> >> +    /*
> >> +     * assert that we caught all cases above and allocation state has not
> >> +     * changed during callbacks.
> >> +     */
> >> +    assert(exists == req->exists || !req->bytes);
> >> +    req->exists = exists;
> >> +
> >> +    /*
> >> +     * assert that we either return an unallocated block or have got callbacks
> >> +     * for all allocated blocks present.
> >> +     */
> >> +    assert(!req->exists || offs == req->offs + req->bytes);
> >> +    req->bytes = offs + len - req->offs;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> >> +                                                 bool want_zero, int64_t offset,
> >> +                                                 int64_t bytes, int64_t *pnum,
> >> +                                                 int64_t *map,
> >> +                                                 BlockDriverState **file)
> >> +{
> >> +    BDRVRBDState *s = bs->opaque;
> >> +    int ret, r;
> > Nit: I would rename ret to status or something like that to make
> > it clear(er) that it is an actual value and never an error.  Or,
> > even better, drop it entirely and return one of the two bitmasks
> > directly.
> >
> >> +    struct rbd_diff_req req = { .offs = offset };
> >> +    uint64_t features, flags;
> >> +
> >> +    assert(offset + bytes <= s->image_size);
> >> +
> >> +    /* default to all sectors allocated */
> >> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> >> +    *map = offset;
> >> +    *file = bs;
> >> +    *pnum = bytes;
> >> +
> >> +    /* check if RBD image supports fast-diff */
> >> +    r = rbd_get_features(s->image, &features);
> >> +    if (r < 0) {
> >> +        goto out;
> >> +    }
> >> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
> >> +        goto out;
> >> +    }
> >> +
> >> +    /* check if RBD fast-diff result is valid */
> >> +    r = rbd_get_flags(s->image, &flags);
> >> +    if (r < 0) {
> >> +        goto out;
> >> +    }
> >> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
> >> +        goto out;
> > Nit: I'm not a fan of labels that cover just the return statement.
> > Feel free to ignore this one but I would get rid of it and replace
> > these gotos with returns.
>
>
> That would be return with the bitmask directly coded in if I also
>
> drop the ret variable. I can change that, no problem.
>
>
> >
> >> +    }
> >> +
> >> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
> >> +                          qemu_rbd_co_block_status_cb, &req);
> >> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
> >> +        goto out;
> >> +    }
> >> +    assert(req.bytes <= bytes);
> >> +    if (!req.exists) {
> >> +        if (r == 0 && !req.bytes) {
> >> +            /*
> >> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
> >> +             * except for the case where an overlay has a hole where the parent
> >> +             * has not. This here catches the case where no callback was
> >> +             * invoked at all.
> >> +             */
> > The above is true in the case of diffing against a snapshot, i.e. when
> > the "from" snapshot has some data where the "to" revision (whether HEAD
> > or another snapshot) has a hole.  I don't think it is true for child vs
> > parent (but it has been a while since I looked at the diff code).  As
> > long as NULL is passed for fromsnapname, I would expect the callback to
> > be invoked only for allocated areas.  If I'm right, we could simplify
> > qemu_rbd_co_block_status_cb() considerably.
>
> See my comment in the callback. Can you look at the diff code or give
> me at least a pointer where I can find it. I am not that familiar with
> the librbd code yet.

I assumed that you added !exists handling because it came up in your
testing.  If you don't have a test case then let's proceed under the
assumption that it doesn't happen for clones.

Thanks,

                Ilya
Peter Lieven Oct. 5, 2021, 8:38 a.m. UTC | #4
Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>> This results mainly in incorrect progress during block operations (e.g.
>>>> qemu-img convert with an rbd image as source).
>>>>
>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>> allocated and unallocated (all zero areas).
>>>>
>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>> the image has the fast-diff feature which depends on the object-map and
>>>> exclusive-lock features. In this case it is guaranteed that the information
>>>> is present in memory in the librbd client and thus very fast.
>>>>
>>>> If fast-diff is not available all areas are reported to be allocated
>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>
>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>> ---
>>>> V2->V3:
>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
>>>> - also check for fast-diff invalid flag [Ilya]
>>>> - *map and *file cant be NULL [Ilya]
>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>>>     unallocated area [Ilya]
>>>> - typo: catched -> caught [Ilya]
>>>> - changed wording about fast-diff, object-map and exclusive lock in
>>>>     commit msg [Ilya]
>>>>
>>>> V1->V2:
>>>> - add commit comment [Stefano]
>>>> - use failed_post_open [Stefano]
>>>> - remove redundant assert [Stefano]
>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>> - always set *file if its non NULL [Stefano]
>>>>
>>>>    block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 126 insertions(+)
>>>>
>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>> index dcf82b15b8..3cb24f9981 100644
>>>> --- a/block/rbd.c
>>>> +++ b/block/rbd.c
>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>        return spec_info;
>>>>    }
>>>>
>>>> +typedef struct rbd_diff_req {
>>>> +    uint64_t offs;
>>>> +    uint64_t bytes;
>>>> +    int exists;
>>> Hi Peter,
>>>
>>> Nit: make exists a bool.  The one in the callback has to be an int
>>> because of the callback signature but let's not spread that.
>>>
>>>> +} rbd_diff_req;
>>>> +
>>>> +/*
>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>> + * execution because we detected a change in the allocation status.
>>>> + */
>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>> +
>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>> +                                       int exists, void *opaque)
>>>> +{
>>>> +    struct rbd_diff_req *req = opaque;
>>>> +
>>>> +    assert(req->offs + req->bytes <= offs);
>>>> +
>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>> +        /*
>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>> +         * req->bytes contains the length of the allocated area before the
>>>> +         * unallocated area. stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +    if (req->exists && !exists) {
>>>> +        /*
>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>> +         * contains the length of the allocated area before the hole.
>>>> +         * stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>> Do you have a test case for when this branch is taken?
>>
>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>>
>>
>>>> +    }
>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>> +        /*
>>>> +         * we started in an unallocated area and hit the first allocated
>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>> +         * before the allocated area. stop further processing.
>>>> +         */
>>>> +        req->bytes = offs - req->offs;
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * assert that we caught all cases above and allocation state has not
>>>> +     * changed during callbacks.
>>>> +     */
>>>> +    assert(exists == req->exists || !req->bytes);
>>>> +    req->exists = exists;
>>>> +
>>>> +    /*
>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>> +     * for all allocated blocks present.
>>>> +     */
>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>> +    req->bytes = offs + len - req->offs;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>> +                                                 bool want_zero, int64_t offset,
>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>> +                                                 int64_t *map,
>>>> +                                                 BlockDriverState **file)
>>>> +{
>>>> +    BDRVRBDState *s = bs->opaque;
>>>> +    int ret, r;
>>> Nit: I would rename ret to status or something like that to make
>>> it clear(er) that it is an actual value and never an error.  Or,
>>> even better, drop it entirely and return one of the two bitmasks
>>> directly.
>>>
>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>> +    uint64_t features, flags;
>>>> +
>>>> +    assert(offset + bytes <= s->image_size);
>>>> +
>>>> +    /* default to all sectors allocated */
>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>> +    *map = offset;
>>>> +    *file = bs;
>>>> +    *pnum = bytes;
>>>> +
>>>> +    /* check if RBD image supports fast-diff */
>>>> +    r = rbd_get_features(s->image, &features);
>>>> +    if (r < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* check if RBD fast-diff result is valid */
>>>> +    r = rbd_get_flags(s->image, &flags);
>>>> +    if (r < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>>>> +        goto out;
>>> Nit: I'm not a fan of labels that cover just the return statement.
>>> Feel free to ignore this one but I would get rid of it and replace
>>> these gotos with returns.
>>
>> That would be return with the bitmask directly coded in if I also
>>
>> drop the ret variable. I can change that, no problem.
>>
>>
>>>> +    }
>>>> +
>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>>>> +                          qemu_rbd_co_block_status_cb, &req);
>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>>>> +        goto out;
>>>> +    }
>>>> +    assert(req.bytes <= bytes);
>>>> +    if (!req.exists) {
>>>> +        if (r == 0 && !req.bytes) {
>>>> +            /*
>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>>>> +             * except for the case where an overlay has a hole where the parent
>>>> +             * has not. This here catches the case where no callback was
>>>> +             * invoked at all.
>>>> +             */
>>> The above is true in the case of diffing against a snapshot, i.e. when
>>> the "from" snapshot has some data where the "to" revision (whether HEAD
>>> or another snapshot) has a hole.  I don't think it is true for child vs
>>> parent (but it has been a while since I looked at the diff code).  As
>>> long as NULL is passed for fromsnapname, I would expect the callback to
>>> be invoked only for allocated areas.  If I'm right, we could simplify
>>> qemu_rbd_co_block_status_cb() considerably.
>> See my comment in the callback. Can you look at the diff code or give
>> me at least a pointer where I can find it. I am not that familiar with
>> the librbd code yet.
> I assumed that you added !exists handling because it came up in your
> testing.  If you don't have a test case then let's proceed under the
> assumption that it doesn't happen for clones.


Okay, but I will run some futher tests and anyway add an assertion in the callback.


Thanks,

Peter
Peter Lieven Oct. 7, 2021, 4:30 p.m. UTC | #5
Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>> This results mainly in incorrect progress during block operations (e.g.
>>>> qemu-img convert with an rbd image as source).
>>>>
>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>> allocated and unallocated (all zero areas).
>>>>
>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>> the image has the fast-diff feature which depends on the object-map and
>>>> exclusive-lock features. In this case it is guaranteed that the information
>>>> is present in memory in the librbd client and thus very fast.
>>>>
>>>> If fast-diff is not available all areas are reported to be allocated
>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>
>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>> ---
>>>> V2->V3:
>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
>>>> - also check for fast-diff invalid flag [Ilya]
>>>> - *map and *file cant be NULL [Ilya]
>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>>>     unallocated area [Ilya]
>>>> - typo: catched -> caught [Ilya]
>>>> - changed wording about fast-diff, object-map and exclusive lock in
>>>>     commit msg [Ilya]
>>>>
>>>> V1->V2:
>>>> - add commit comment [Stefano]
>>>> - use failed_post_open [Stefano]
>>>> - remove redundant assert [Stefano]
>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>> - always set *file if its non NULL [Stefano]
>>>>
>>>>    block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 126 insertions(+)
>>>>
>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>> index dcf82b15b8..3cb24f9981 100644
>>>> --- a/block/rbd.c
>>>> +++ b/block/rbd.c
>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>        return spec_info;
>>>>    }
>>>>
>>>> +typedef struct rbd_diff_req {
>>>> +    uint64_t offs;
>>>> +    uint64_t bytes;
>>>> +    int exists;
>>> Hi Peter,
>>>
>>> Nit: make exists a bool.  The one in the callback has to be an int
>>> because of the callback signature but let's not spread that.
>>>
>>>> +} rbd_diff_req;
>>>> +
>>>> +/*
>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>> + * execution because we detected a change in the allocation status.
>>>> + */
>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>> +
>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>> +                                       int exists, void *opaque)
>>>> +{
>>>> +    struct rbd_diff_req *req = opaque;
>>>> +
>>>> +    assert(req->offs + req->bytes <= offs);
>>>> +
>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>> +        /*
>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>> +         * req->bytes contains the length of the allocated area before the
>>>> +         * unallocated area. stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +    if (req->exists && !exists) {
>>>> +        /*
>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>> +         * contains the length of the allocated area before the hole.
>>>> +         * stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>> Do you have a test case for when this branch is taken?
>>
>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>>
>>
>>>> +    }
>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>> +        /*
>>>> +         * we started in an unallocated area and hit the first allocated
>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>> +         * before the allocated area. stop further processing.
>>>> +         */
>>>> +        req->bytes = offs - req->offs;
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * assert that we caught all cases above and allocation state has not
>>>> +     * changed during callbacks.
>>>> +     */
>>>> +    assert(exists == req->exists || !req->bytes);
>>>> +    req->exists = exists;
>>>> +
>>>> +    /*
>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>> +     * for all allocated blocks present.
>>>> +     */
>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>> +    req->bytes = offs + len - req->offs;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>> +                                                 bool want_zero, int64_t offset,
>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>> +                                                 int64_t *map,
>>>> +                                                 BlockDriverState **file)
>>>> +{
>>>> +    BDRVRBDState *s = bs->opaque;
>>>> +    int ret, r;
>>> Nit: I would rename ret to status or something like that to make
>>> it clear(er) that it is an actual value and never an error.  Or,
>>> even better, drop it entirely and return one of the two bitmasks
>>> directly.
>>>
>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>> +    uint64_t features, flags;
>>>> +
>>>> +    assert(offset + bytes <= s->image_size);
>>>> +
>>>> +    /* default to all sectors allocated */
>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>> +    *map = offset;
>>>> +    *file = bs;
>>>> +    *pnum = bytes;
>>>> +
>>>> +    /* check if RBD image supports fast-diff */
>>>> +    r = rbd_get_features(s->image, &features);
>>>> +    if (r < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* check if RBD fast-diff result is valid */
>>>> +    r = rbd_get_flags(s->image, &flags);
>>>> +    if (r < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>>>> +        goto out;
>>> Nit: I'm not a fan of labels that cover just the return statement.
>>> Feel free to ignore this one but I would get rid of it and replace
>>> these gotos with returns.
>>
>> That would be return with the bitmask directly coded in if I also
>>
>> drop the ret variable. I can change that, no problem.
>>
>>
>>>> +    }
>>>> +
>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>>>> +                          qemu_rbd_co_block_status_cb, &req);
>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>>>> +        goto out;
>>>> +    }
>>>> +    assert(req.bytes <= bytes);
>>>> +    if (!req.exists) {
>>>> +        if (r == 0 && !req.bytes) {
>>>> +            /*
>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>>>> +             * except for the case where an overlay has a hole where the parent
>>>> +             * has not. This here catches the case where no callback was
>>>> +             * invoked at all.
>>>> +             */
>>> The above is true in the case of diffing against a snapshot, i.e. when
>>> the "from" snapshot has some data where the "to" revision (whether HEAD
>>> or another snapshot) has a hole.  I don't think it is true for child vs
>>> parent (but it has been a while since I looked at the diff code).  As
>>> long as NULL is passed for fromsnapname, I would expect the callback to
>>> be invoked only for allocated areas.  If I'm right, we could simplify
>>> qemu_rbd_co_block_status_cb() considerably.
>> See my comment in the callback. Can you look at the diff code or give
>> me at least a pointer where I can find it. I am not that familiar with
>> the librbd code yet.
> I assumed that you added !exists handling because it came up in your
> testing.  If you don't have a test case then let's proceed under the
> assumption that it doesn't happen for clones.
>
> Thanks,
>
>                  Ilya


I just have sent V4. I tested also with thin-clones from a protected snapshot

and do not receive exists == 0 in the callback even after removing big files

from the image and running fstrim or rbd sparsify afterwards.


Peter
Peter Lieven Jan. 6, 2022, 3:27 p.m. UTC | #6
Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>> This results mainly in incorrect progress during block operations (e.g.
>>>> qemu-img convert with an rbd image as source).
>>>>
>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>> allocated and unallocated (all zero areas).
>>>>
>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>> the image has the fast-diff feature which depends on the object-map and
>>>> exclusive-lock features. In this case it is guaranteed that the information
>>>> is present in memory in the librbd client and thus very fast.
>>>>
>>>> If fast-diff is not available all areas are reported to be allocated
>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>
>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>> ---
>>>> V2->V3:
>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
>>>> - also check for fast-diff invalid flag [Ilya]
>>>> - *map and *file cant be NULL [Ilya]
>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>>>     unallocated area [Ilya]
>>>> - typo: catched -> caught [Ilya]
>>>> - changed wording about fast-diff, object-map and exclusive lock in
>>>>     commit msg [Ilya]
>>>>
>>>> V1->V2:
>>>> - add commit comment [Stefano]
>>>> - use failed_post_open [Stefano]
>>>> - remove redundant assert [Stefano]
>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>> - always set *file if its non NULL [Stefano]
>>>>
>>>>    block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 126 insertions(+)
>>>>
>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>> index dcf82b15b8..3cb24f9981 100644
>>>> --- a/block/rbd.c
>>>> +++ b/block/rbd.c
>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>        return spec_info;
>>>>    }
>>>>
>>>> +typedef struct rbd_diff_req {
>>>> +    uint64_t offs;
>>>> +    uint64_t bytes;
>>>> +    int exists;
>>> Hi Peter,
>>>
>>> Nit: make exists a bool.  The one in the callback has to be an int
>>> because of the callback signature but let's not spread that.
>>>
>>>> +} rbd_diff_req;
>>>> +
>>>> +/*
>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>> + * execution because we detected a change in the allocation status.
>>>> + */
>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>> +
>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>> +                                       int exists, void *opaque)
>>>> +{
>>>> +    struct rbd_diff_req *req = opaque;
>>>> +
>>>> +    assert(req->offs + req->bytes <= offs);
>>>> +
>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>> +        /*
>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>> +         * req->bytes contains the length of the allocated area before the
>>>> +         * unallocated area. stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +    if (req->exists && !exists) {
>>>> +        /*
>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>> +         * contains the length of the allocated area before the hole.
>>>> +         * stop further processing.
>>>> +         */
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>> Do you have a test case for when this branch is taken?
>>
>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>>
>>
>>>> +    }
>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>> +        /*
>>>> +         * we started in an unallocated area and hit the first allocated
>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>> +         * before the allocated area. stop further processing.
>>>> +         */
>>>> +        req->bytes = offs - req->offs;
>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * assert that we caught all cases above and allocation state has not
>>>> +     * changed during callbacks.
>>>> +     */
>>>> +    assert(exists == req->exists || !req->bytes);
>>>> +    req->exists = exists;
>>>> +
>>>> +    /*
>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>> +     * for all allocated blocks present.
>>>> +     */
>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>> +    req->bytes = offs + len - req->offs;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>> +                                                 bool want_zero, int64_t offset,
>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>> +                                                 int64_t *map,
>>>> +                                                 BlockDriverState **file)
>>>> +{
>>>> +    BDRVRBDState *s = bs->opaque;
>>>> +    int ret, r;
>>> Nit: I would rename ret to status or something like that to make
>>> it clear(er) that it is an actual value and never an error.  Or,
>>> even better, drop it entirely and return one of the two bitmasks
>>> directly.
>>>
>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>> +    uint64_t features, flags;
>>>> +
>>>> +    assert(offset + bytes <= s->image_size);
>>>> +
>>>> +    /* default to all sectors allocated */
>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>> +    *map = offset;
>>>> +    *file = bs;
>>>> +    *pnum = bytes;
>>>> +
>>>> +    /* check if RBD image supports fast-diff */
>>>> +    r = rbd_get_features(s->image, &features);
>>>> +    if (r < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    /* check if RBD fast-diff result is valid */
>>>> +    r = rbd_get_flags(s->image, &flags);
>>>> +    if (r < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>>>> +        goto out;
>>> Nit: I'm not a fan of labels that cover just the return statement.
>>> Feel free to ignore this one but I would get rid of it and replace
>>> these gotos with returns.
>>
>> That would be return with the bitmask directly coded in if I also
>>
>> drop the ret variable. I can change that, no problem.
>>
>>
>>>> +    }
>>>> +
>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>>>> +                          qemu_rbd_co_block_status_cb, &req);
>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>>>> +        goto out;
>>>> +    }
>>>> +    assert(req.bytes <= bytes);
>>>> +    if (!req.exists) {
>>>> +        if (r == 0 && !req.bytes) {
>>>> +            /*
>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>>>> +             * except for the case where an overlay has a hole where the parent
>>>> +             * has not. This here catches the case where no callback was
>>>> +             * invoked at all.
>>>> +             */
>>> The above is true in the case of diffing against a snapshot, i.e. when
>>> the "from" snapshot has some data where the "to" revision (whether HEAD
>>> or another snapshot) has a hole.  I don't think it is true for child vs
>>> parent (but it has been a while since I looked at the diff code).  As
>>> long as NULL is passed for fromsnapname, I would expect the callback to
>>> be invoked only for allocated areas.  If I'm right, we could simplify
>>> qemu_rbd_co_block_status_cb() considerably.
>> See my comment in the callback. Can you look at the diff code or give
>> me at least a pointer where I can find it. I am not that familiar with
>> the librbd code yet.
> I assumed that you added !exists handling because it came up in your
> testing.  If you don't have a test case then let's proceed under the
> assumption that it doesn't happen for clones.


Hi Ilya,


it seems that our assumption was false. I have an image which shows holes without diffing against

a snapshot. Do you have an idea why?


$ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven info dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
rbd image 'c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw':
     size 20 GiB in 20000 objects
     order 20 (1 MiB objects)
     snapshot_count: 2
     id: 3d6daa102e4d9f
     block_name_prefix: rbd_data.3d6daa102e4d9f
     format: 2
     features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
     op_features:
     flags:
     create_timestamp: Tue Sep 21 14:16:56 2021
     access_timestamp: Thu Jan  6 15:24:46 2022
     modify_timestamp: Thu Jan  6 15:45:42 2022


$ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven snap ls dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
SNAPID  NAME                 SIZE    PROTECTED TIMESTAMP
  12297  dlp-20210921-144509  20 GiB             Tue Sep 21 14:45:09 2021
  17745  dlp-20220106-040000  20 GiB             Thu Jan  6 04:00:01 2022


$ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven diff --whole-object dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw | grep zero
204472320    1048576  zero
1114636288   1048576  zero
1115684864   1048576  zero
1116733440   1048576  zero
1117782016   1048576  zero
1218445312   1048576  zero
1219493888   1048576  zero
1220542464   1048576  zero

....


Best,

Peter
Ilya Dryomov Jan. 6, 2022, 4:01 p.m. UTC | #7
On Thu, Jan 6, 2022 at 4:27 PM Peter Lieven <pl@kamp.de> wrote:
>
> Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
> > On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
> >> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
> >>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
> >>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> >>>> This results mainly in incorrect progress during block operations (e.g.
> >>>> qemu-img convert with an rbd image as source).
> >>>>
> >>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> >>>> allocated and unallocated (all zero areas).
> >>>>
> >>>> To avoid querying the ceph OSDs for the answer this is only done if
> >>>> the image has the fast-diff feature which depends on the object-map and
> >>>> exclusive-lock features. In this case it is guaranteed that the information
> >>>> is present in memory in the librbd client and thus very fast.
> >>>>
> >>>> If fast-diff is not available all areas are reported to be allocated
> >>>> which is the current behaviour if bdrv_co_block_status is not implemented.
> >>>>
> >>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>> ---
> >>>> V2->V3:
> >>>> - check rbd_flags every time (they can change during runtime) [Ilya]
> >>>> - also check for fast-diff invalid flag [Ilya]
> >>>> - *map and *file cant be NULL [Ilya]
> >>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
> >>>>     unallocated area [Ilya]
> >>>> - typo: catched -> caught [Ilya]
> >>>> - changed wording about fast-diff, object-map and exclusive lock in
> >>>>     commit msg [Ilya]
> >>>>
> >>>> V1->V2:
> >>>> - add commit comment [Stefano]
> >>>> - use failed_post_open [Stefano]
> >>>> - remove redundant assert [Stefano]
> >>>> - add macro+comment for the magic -9000 value [Stefano]
> >>>> - always set *file if its non NULL [Stefano]
> >>>>
> >>>>    block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>    1 file changed, 126 insertions(+)
> >>>>
> >>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>> index dcf82b15b8..3cb24f9981 100644
> >>>> --- a/block/rbd.c
> >>>> +++ b/block/rbd.c
> >>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
> >>>>        return spec_info;
> >>>>    }
> >>>>
> >>>> +typedef struct rbd_diff_req {
> >>>> +    uint64_t offs;
> >>>> +    uint64_t bytes;
> >>>> +    int exists;
> >>> Hi Peter,
> >>>
> >>> Nit: make exists a bool.  The one in the callback has to be an int
> >>> because of the callback signature but let's not spread that.
> >>>
> >>>> +} rbd_diff_req;
> >>>> +
> >>>> +/*
> >>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> >>>> + * value in the callback routine. Choose a value that does not conflict with
> >>>> + * an existing exitcode and return it if we want to prematurely stop the
> >>>> + * execution because we detected a change in the allocation status.
> >>>> + */
> >>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> >>>> +
> >>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> >>>> +                                       int exists, void *opaque)
> >>>> +{
> >>>> +    struct rbd_diff_req *req = opaque;
> >>>> +
> >>>> +    assert(req->offs + req->bytes <= offs);
> >>>> +
> >>>> +    if (req->exists && offs > req->offs + req->bytes) {
> >>>> +        /*
> >>>> +         * we started in an allocated area and jumped over an unallocated area,
> >>>> +         * req->bytes contains the length of the allocated area before the
> >>>> +         * unallocated area. stop further processing.
> >>>> +         */
> >>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>> +    }
> >>>> +    if (req->exists && !exists) {
> >>>> +        /*
> >>>> +         * we started in an allocated area and reached a hole. req->bytes
> >>>> +         * contains the length of the allocated area before the hole.
> >>>> +         * stop further processing.
> >>>> +         */
> >>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>> Do you have a test case for when this branch is taken?
> >>
> >> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
> >>
> >>
> >>>> +    }
> >>>> +    if (!req->exists && exists && offs > req->offs) {
> >>>> +        /*
> >>>> +         * we started in an unallocated area and hit the first allocated
> >>>> +         * block. req->bytes must be set to the length of the unallocated area
> >>>> +         * before the allocated area. stop further processing.
> >>>> +         */
> >>>> +        req->bytes = offs - req->offs;
> >>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>> +    }
> >>>> +
> >>>> +    /*
> >>>> +     * assert that we caught all cases above and allocation state has not
> >>>> +     * changed during callbacks.
> >>>> +     */
> >>>> +    assert(exists == req->exists || !req->bytes);
> >>>> +    req->exists = exists;
> >>>> +
> >>>> +    /*
> >>>> +     * assert that we either return an unallocated block or have got callbacks
> >>>> +     * for all allocated blocks present.
> >>>> +     */
> >>>> +    assert(!req->exists || offs == req->offs + req->bytes);
> >>>> +    req->bytes = offs + len - req->offs;
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> >>>> +                                                 bool want_zero, int64_t offset,
> >>>> +                                                 int64_t bytes, int64_t *pnum,
> >>>> +                                                 int64_t *map,
> >>>> +                                                 BlockDriverState **file)
> >>>> +{
> >>>> +    BDRVRBDState *s = bs->opaque;
> >>>> +    int ret, r;
> >>> Nit: I would rename ret to status or something like that to make
> >>> it clear(er) that it is an actual value and never an error.  Or,
> >>> even better, drop it entirely and return one of the two bitmasks
> >>> directly.
> >>>
> >>>> +    struct rbd_diff_req req = { .offs = offset };
> >>>> +    uint64_t features, flags;
> >>>> +
> >>>> +    assert(offset + bytes <= s->image_size);
> >>>> +
> >>>> +    /* default to all sectors allocated */
> >>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> >>>> +    *map = offset;
> >>>> +    *file = bs;
> >>>> +    *pnum = bytes;
> >>>> +
> >>>> +    /* check if RBD image supports fast-diff */
> >>>> +    r = rbd_get_features(s->image, &features);
> >>>> +    if (r < 0) {
> >>>> +        goto out;
> >>>> +    }
> >>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
> >>>> +        goto out;
> >>>> +    }
> >>>> +
> >>>> +    /* check if RBD fast-diff result is valid */
> >>>> +    r = rbd_get_flags(s->image, &flags);
> >>>> +    if (r < 0) {
> >>>> +        goto out;
> >>>> +    }
> >>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
> >>>> +        goto out;
> >>> Nit: I'm not a fan of labels that cover just the return statement.
> >>> Feel free to ignore this one but I would get rid of it and replace
> >>> these gotos with returns.
> >>
> >> That would be return with the bitmask directly coded in if I also
> >>
> >> drop the ret variable. I can change that, no problem.
> >>
> >>
> >>>> +    }
> >>>> +
> >>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
> >>>> +                          qemu_rbd_co_block_status_cb, &req);
> >>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
> >>>> +        goto out;
> >>>> +    }
> >>>> +    assert(req.bytes <= bytes);
> >>>> +    if (!req.exists) {
> >>>> +        if (r == 0 && !req.bytes) {
> >>>> +            /*
> >>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
> >>>> +             * except for the case where an overlay has a hole where the parent
> >>>> +             * has not. This here catches the case where no callback was
> >>>> +             * invoked at all.
> >>>> +             */
> >>> The above is true in the case of diffing against a snapshot, i.e. when
> >>> the "from" snapshot has some data where the "to" revision (whether HEAD
> >>> or another snapshot) has a hole.  I don't think it is true for child vs
> >>> parent (but it has been a while since I looked at the diff code).  As
> >>> long as NULL is passed for fromsnapname, I would expect the callback to
> >>> be invoked only for allocated areas.  If I'm right, we could simplify
> >>> qemu_rbd_co_block_status_cb() considerably.
> >> See my comment in the callback. Can you look at the diff code or give
> >> me at least a pointer where I can find it. I am not that familiar with
> >> the librbd code yet.
> > I assumed that you added !exists handling because it came up in your
> > testing.  If you don't have a test case then let's proceed under the
> > assumption that it doesn't happen for clones.
>
>
> Hi Ilya,
>
>
> it seems that our assumption was false. I have an image which shows holes without diffing against
>
> a snapshot. Do you have an idea why?
>
>
> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven info dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
> rbd image 'c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw':
>      size 20 GiB in 20000 objects
>      order 20 (1 MiB objects)
>      snapshot_count: 2
>      id: 3d6daa102e4d9f
>      block_name_prefix: rbd_data.3d6daa102e4d9f
>      format: 2
>      features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
>      op_features:
>      flags:
>      create_timestamp: Tue Sep 21 14:16:56 2021
>      access_timestamp: Thu Jan  6 15:24:46 2022
>      modify_timestamp: Thu Jan  6 15:45:42 2022
>
>
> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven snap ls dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
> SNAPID  NAME                 SIZE    PROTECTED TIMESTAMP
>   12297  dlp-20210921-144509  20 GiB             Tue Sep 21 14:45:09 2021
>   17745  dlp-20220106-040000  20 GiB             Thu Jan  6 04:00:01 2022
>
>
> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven diff --whole-object dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw | grep zero
> 204472320    1048576  zero
> 1114636288   1048576  zero
> 1115684864   1048576  zero
> 1116733440   1048576  zero
> 1117782016   1048576  zero
> 1218445312   1048576  zero
> 1219493888   1048576  zero
> 1220542464   1048576  zero

Hi Peter,

Yes, I stumbled upon this just yesterday while looking into another
librbd issue surfaced by this patch [1] and was going to email you and
the list after wrapping my head around this behavior.  I see where it
is coming from but I'm not sure what the right fix is.  I would prefer
to patch librbd but that may turn out to be not feasible.  Let me get
back on this next week.

[1] https://tracker.ceph.com/issues/53784

Thanks,

                Ilya
Peter Lieven Jan. 6, 2022, 4:33 p.m. UTC | #8
Am 06.01.22 um 17:01 schrieb Ilya Dryomov:
> On Thu, Jan 6, 2022 at 4:27 PM Peter Lieven <pl@kamp.de> wrote:
>> Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
>>> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>>>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
>>>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>>>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>>>> This results mainly in incorrect progress during block operations (e.g.
>>>>>> qemu-img convert with an rbd image as source).
>>>>>>
>>>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>>>> allocated and unallocated (all zero areas).
>>>>>>
>>>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>>>> the image has the fast-diff feature which depends on the object-map and
>>>>>> exclusive-lock features. In this case it is guaranteed that the information
>>>>>> is present in memory in the librbd client and thus very fast.
>>>>>>
>>>>>> If fast-diff is not available all areas are reported to be allocated
>>>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>>>
>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>>>> ---
>>>>>> V2->V3:
>>>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
>>>>>> - also check for fast-diff invalid flag [Ilya]
>>>>>> - *map and *file cant be NULL [Ilya]
>>>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>>>>>      unallocated area [Ilya]
>>>>>> - typo: catched -> caught [Ilya]
>>>>>> - changed wording about fast-diff, object-map and exclusive lock in
>>>>>>      commit msg [Ilya]
>>>>>>
>>>>>> V1->V2:
>>>>>> - add commit comment [Stefano]
>>>>>> - use failed_post_open [Stefano]
>>>>>> - remove redundant assert [Stefano]
>>>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>>>> - always set *file if its non NULL [Stefano]
>>>>>>
>>>>>>     block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>     1 file changed, 126 insertions(+)
>>>>>>
>>>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>>>> index dcf82b15b8..3cb24f9981 100644
>>>>>> --- a/block/rbd.c
>>>>>> +++ b/block/rbd.c
>>>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>>>         return spec_info;
>>>>>>     }
>>>>>>
>>>>>> +typedef struct rbd_diff_req {
>>>>>> +    uint64_t offs;
>>>>>> +    uint64_t bytes;
>>>>>> +    int exists;
>>>>> Hi Peter,
>>>>>
>>>>> Nit: make exists a bool.  The one in the callback has to be an int
>>>>> because of the callback signature but let's not spread that.
>>>>>
>>>>>> +} rbd_diff_req;
>>>>>> +
>>>>>> +/*
>>>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>>>> + * execution because we detected a change in the allocation status.
>>>>>> + */
>>>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>>>> +
>>>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>>>> +                                       int exists, void *opaque)
>>>>>> +{
>>>>>> +    struct rbd_diff_req *req = opaque;
>>>>>> +
>>>>>> +    assert(req->offs + req->bytes <= offs);
>>>>>> +
>>>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>>>> +        /*
>>>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>>>> +         * req->bytes contains the length of the allocated area before the
>>>>>> +         * unallocated area. stop further processing.
>>>>>> +         */
>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>> +    }
>>>>>> +    if (req->exists && !exists) {
>>>>>> +        /*
>>>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>>>> +         * contains the length of the allocated area before the hole.
>>>>>> +         * stop further processing.
>>>>>> +         */
>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>> Do you have a test case for when this branch is taken?
>>>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>>>>
>>>>
>>>>>> +    }
>>>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>>>> +        /*
>>>>>> +         * we started in an unallocated area and hit the first allocated
>>>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>>>> +         * before the allocated area. stop further processing.
>>>>>> +         */
>>>>>> +        req->bytes = offs - req->offs;
>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>> +    }
>>>>>> +
>>>>>> +    /*
>>>>>> +     * assert that we caught all cases above and allocation state has not
>>>>>> +     * changed during callbacks.
>>>>>> +     */
>>>>>> +    assert(exists == req->exists || !req->bytes);
>>>>>> +    req->exists = exists;
>>>>>> +
>>>>>> +    /*
>>>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>>>> +     * for all allocated blocks present.
>>>>>> +     */
>>>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>>>> +    req->bytes = offs + len - req->offs;
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>>>> +                                                 bool want_zero, int64_t offset,
>>>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>>>> +                                                 int64_t *map,
>>>>>> +                                                 BlockDriverState **file)
>>>>>> +{
>>>>>> +    BDRVRBDState *s = bs->opaque;
>>>>>> +    int ret, r;
>>>>> Nit: I would rename ret to status or something like that to make
>>>>> it clear(er) that it is an actual value and never an error.  Or,
>>>>> even better, drop it entirely and return one of the two bitmasks
>>>>> directly.
>>>>>
>>>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>>>> +    uint64_t features, flags;
>>>>>> +
>>>>>> +    assert(offset + bytes <= s->image_size);
>>>>>> +
>>>>>> +    /* default to all sectors allocated */
>>>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>>>> +    *map = offset;
>>>>>> +    *file = bs;
>>>>>> +    *pnum = bytes;
>>>>>> +
>>>>>> +    /* check if RBD image supports fast-diff */
>>>>>> +    r = rbd_get_features(s->image, &features);
>>>>>> +    if (r < 0) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* check if RBD fast-diff result is valid */
>>>>>> +    r = rbd_get_flags(s->image, &flags);
>>>>>> +    if (r < 0) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>>>>>> +        goto out;
>>>>> Nit: I'm not a fan of labels that cover just the return statement.
>>>>> Feel free to ignore this one but I would get rid of it and replace
>>>>> these gotos with returns.
>>>> That would be return with the bitmask directly coded in if I also
>>>>
>>>> drop the ret variable. I can change that, no problem.
>>>>
>>>>
>>>>>> +    }
>>>>>> +
>>>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>>>>>> +                          qemu_rbd_co_block_status_cb, &req);
>>>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +    assert(req.bytes <= bytes);
>>>>>> +    if (!req.exists) {
>>>>>> +        if (r == 0 && !req.bytes) {
>>>>>> +            /*
>>>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>>>>>> +             * except for the case where an overlay has a hole where the parent
>>>>>> +             * has not. This here catches the case where no callback was
>>>>>> +             * invoked at all.
>>>>>> +             */
>>>>> The above is true in the case of diffing against a snapshot, i.e. when
>>>>> the "from" snapshot has some data where the "to" revision (whether HEAD
>>>>> or another snapshot) has a hole.  I don't think it is true for child vs
>>>>> parent (but it has been a while since I looked at the diff code).  As
>>>>> long as NULL is passed for fromsnapname, I would expect the callback to
>>>>> be invoked only for allocated areas.  If I'm right, we could simplify
>>>>> qemu_rbd_co_block_status_cb() considerably.
>>>> See my comment in the callback. Can you look at the diff code or give
>>>> me at least a pointer where I can find it. I am not that familiar with
>>>> the librbd code yet.
>>> I assumed that you added !exists handling because it came up in your
>>> testing.  If you don't have a test case then let's proceed under the
>>> assumption that it doesn't happen for clones.
>>
>> Hi Ilya,
>>
>>
>> it seems that our assumption was false. I have an image which shows holes without diffing against
>>
>> a snapshot. Do you have an idea why?
>>
>>
>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven info dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
>> rbd image 'c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw':
>>       size 20 GiB in 20000 objects
>>       order 20 (1 MiB objects)
>>       snapshot_count: 2
>>       id: 3d6daa102e4d9f
>>       block_name_prefix: rbd_data.3d6daa102e4d9f
>>       format: 2
>>       features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
>>       op_features:
>>       flags:
>>       create_timestamp: Tue Sep 21 14:16:56 2021
>>       access_timestamp: Thu Jan  6 15:24:46 2022
>>       modify_timestamp: Thu Jan  6 15:45:42 2022
>>
>>
>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven snap ls dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
>> SNAPID  NAME                 SIZE    PROTECTED TIMESTAMP
>>    12297  dlp-20210921-144509  20 GiB             Tue Sep 21 14:45:09 2021
>>    17745  dlp-20220106-040000  20 GiB             Thu Jan  6 04:00:01 2022
>>
>>
>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven diff --whole-object dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw | grep zero
>> 204472320    1048576  zero
>> 1114636288   1048576  zero
>> 1115684864   1048576  zero
>> 1116733440   1048576  zero
>> 1117782016   1048576  zero
>> 1218445312   1048576  zero
>> 1219493888   1048576  zero
>> 1220542464   1048576  zero
> Hi Peter,
>
> Yes, I stumbled upon this just yesterday while looking into another
> librbd issue surfaced by this patch [1] and was going to email you and
> the list after wrapping my head around this behavior.  I see where it
> is coming from but I'm not sure what the right fix is.  I would prefer
> to patch librbd but that may turn out to be not feasible.  Let me get
> back on this next week.
>
> [1] https://tracker.ceph.com/issues/53784


Looks like it was a good idea to add all these asserts...


The issue I see does occur with a simple qemu-img convert on the given image as source. It triggers the assert(exists) in the cb.

My dev cluster is running on Nautilus 14.2.22.


Best,

Peter
Ilya Dryomov Jan. 6, 2022, 5:47 p.m. UTC | #9
On Thu, Jan 6, 2022 at 5:33 PM Peter Lieven <pl@kamp.de> wrote:
>
> Am 06.01.22 um 17:01 schrieb Ilya Dryomov:
> > On Thu, Jan 6, 2022 at 4:27 PM Peter Lieven <pl@kamp.de> wrote:
> >> Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
> >>> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
> >>>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
> >>>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
> >>>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
> >>>>>> This results mainly in incorrect progress during block operations (e.g.
> >>>>>> qemu-img convert with an rbd image as source).
> >>>>>>
> >>>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
> >>>>>> allocated and unallocated (all zero areas).
> >>>>>>
> >>>>>> To avoid querying the ceph OSDs for the answer this is only done if
> >>>>>> the image has the fast-diff feature which depends on the object-map and
> >>>>>> exclusive-lock features. In this case it is guaranteed that the information
> >>>>>> is present in memory in the librbd client and thus very fast.
> >>>>>>
> >>>>>> If fast-diff is not available all areas are reported to be allocated
> >>>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
> >>>>>>
> >>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>>>> ---
> >>>>>> V2->V3:
> >>>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
> >>>>>> - also check for fast-diff invalid flag [Ilya]
> >>>>>> - *map and *file cant be NULL [Ilya]
> >>>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
> >>>>>>      unallocated area [Ilya]
> >>>>>> - typo: catched -> caught [Ilya]
> >>>>>> - changed wording about fast-diff, object-map and exclusive lock in
> >>>>>>      commit msg [Ilya]
> >>>>>>
> >>>>>> V1->V2:
> >>>>>> - add commit comment [Stefano]
> >>>>>> - use failed_post_open [Stefano]
> >>>>>> - remove redundant assert [Stefano]
> >>>>>> - add macro+comment for the magic -9000 value [Stefano]
> >>>>>> - always set *file if its non NULL [Stefano]
> >>>>>>
> >>>>>>     block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>     1 file changed, 126 insertions(+)
> >>>>>>
> >>>>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>>>> index dcf82b15b8..3cb24f9981 100644
> >>>>>> --- a/block/rbd.c
> >>>>>> +++ b/block/rbd.c
> >>>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
> >>>>>>         return spec_info;
> >>>>>>     }
> >>>>>>
> >>>>>> +typedef struct rbd_diff_req {
> >>>>>> +    uint64_t offs;
> >>>>>> +    uint64_t bytes;
> >>>>>> +    int exists;
> >>>>> Hi Peter,
> >>>>>
> >>>>> Nit: make exists a bool.  The one in the callback has to be an int
> >>>>> because of the callback signature but let's not spread that.
> >>>>>
> >>>>>> +} rbd_diff_req;
> >>>>>> +
> >>>>>> +/*
> >>>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
> >>>>>> + * value in the callback routine. Choose a value that does not conflict with
> >>>>>> + * an existing exitcode and return it if we want to prematurely stop the
> >>>>>> + * execution because we detected a change in the allocation status.
> >>>>>> + */
> >>>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
> >>>>>> +
> >>>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
> >>>>>> +                                       int exists, void *opaque)
> >>>>>> +{
> >>>>>> +    struct rbd_diff_req *req = opaque;
> >>>>>> +
> >>>>>> +    assert(req->offs + req->bytes <= offs);
> >>>>>> +
> >>>>>> +    if (req->exists && offs > req->offs + req->bytes) {
> >>>>>> +        /*
> >>>>>> +         * we started in an allocated area and jumped over an unallocated area,
> >>>>>> +         * req->bytes contains the length of the allocated area before the
> >>>>>> +         * unallocated area. stop further processing.
> >>>>>> +         */
> >>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>>>> +    }
> >>>>>> +    if (req->exists && !exists) {
> >>>>>> +        /*
> >>>>>> +         * we started in an allocated area and reached a hole. req->bytes
> >>>>>> +         * contains the length of the allocated area before the hole.
> >>>>>> +         * stop further processing.
> >>>>>> +         */
> >>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>>> Do you have a test case for when this branch is taken?
> >>>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
> >>>>
> >>>>
> >>>>>> +    }
> >>>>>> +    if (!req->exists && exists && offs > req->offs) {
> >>>>>> +        /*
> >>>>>> +         * we started in an unallocated area and hit the first allocated
> >>>>>> +         * block. req->bytes must be set to the length of the unallocated area
> >>>>>> +         * before the allocated area. stop further processing.
> >>>>>> +         */
> >>>>>> +        req->bytes = offs - req->offs;
> >>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    /*
> >>>>>> +     * assert that we caught all cases above and allocation state has not
> >>>>>> +     * changed during callbacks.
> >>>>>> +     */
> >>>>>> +    assert(exists == req->exists || !req->bytes);
> >>>>>> +    req->exists = exists;
> >>>>>> +
> >>>>>> +    /*
> >>>>>> +     * assert that we either return an unallocated block or have got callbacks
> >>>>>> +     * for all allocated blocks present.
> >>>>>> +     */
> >>>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
> >>>>>> +    req->bytes = offs + len - req->offs;
> >>>>>> +
> >>>>>> +    return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
> >>>>>> +                                                 bool want_zero, int64_t offset,
> >>>>>> +                                                 int64_t bytes, int64_t *pnum,
> >>>>>> +                                                 int64_t *map,
> >>>>>> +                                                 BlockDriverState **file)
> >>>>>> +{
> >>>>>> +    BDRVRBDState *s = bs->opaque;
> >>>>>> +    int ret, r;
> >>>>> Nit: I would rename ret to status or something like that to make
> >>>>> it clear(er) that it is an actual value and never an error.  Or,
> >>>>> even better, drop it entirely and return one of the two bitmasks
> >>>>> directly.
> >>>>>
> >>>>>> +    struct rbd_diff_req req = { .offs = offset };
> >>>>>> +    uint64_t features, flags;
> >>>>>> +
> >>>>>> +    assert(offset + bytes <= s->image_size);
> >>>>>> +
> >>>>>> +    /* default to all sectors allocated */
> >>>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> >>>>>> +    *map = offset;
> >>>>>> +    *file = bs;
> >>>>>> +    *pnum = bytes;
> >>>>>> +
> >>>>>> +    /* check if RBD image supports fast-diff */
> >>>>>> +    r = rbd_get_features(s->image, &features);
> >>>>>> +    if (r < 0) {
> >>>>>> +        goto out;
> >>>>>> +    }
> >>>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
> >>>>>> +        goto out;
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    /* check if RBD fast-diff result is valid */
> >>>>>> +    r = rbd_get_flags(s->image, &flags);
> >>>>>> +    if (r < 0) {
> >>>>>> +        goto out;
> >>>>>> +    }
> >>>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
> >>>>>> +        goto out;
> >>>>> Nit: I'm not a fan of labels that cover just the return statement.
> >>>>> Feel free to ignore this one but I would get rid of it and replace
> >>>>> these gotos with returns.
> >>>> That would be return with the bitmask directly coded in if I also
> >>>>
> >>>> drop the ret variable. I can change that, no problem.
> >>>>
> >>>>
> >>>>>> +    }
> >>>>>> +
> >>>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
> >>>>>> +                          qemu_rbd_co_block_status_cb, &req);
> >>>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
> >>>>>> +        goto out;
> >>>>>> +    }
> >>>>>> +    assert(req.bytes <= bytes);
> >>>>>> +    if (!req.exists) {
> >>>>>> +        if (r == 0 && !req.bytes) {
> >>>>>> +            /*
> >>>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
> >>>>>> +             * except for the case where an overlay has a hole where the parent
> >>>>>> +             * has not. This here catches the case where no callback was
> >>>>>> +             * invoked at all.
> >>>>>> +             */
> >>>>> The above is true in the case of diffing against a snapshot, i.e. when
> >>>>> the "from" snapshot has some data where the "to" revision (whether HEAD
> >>>>> or another snapshot) has a hole.  I don't think it is true for child vs
> >>>>> parent (but it has been a while since I looked at the diff code).  As
> >>>>> long as NULL is passed for fromsnapname, I would expect the callback to
> >>>>> be invoked only for allocated areas.  If I'm right, we could simplify
> >>>>> qemu_rbd_co_block_status_cb() considerably.
> >>>> See my comment in the callback. Can you look at the diff code or give
> >>>> me at least a pointer where I can find it. I am not that familiar with
> >>>> the librbd code yet.
> >>> I assumed that you added !exists handling because it came up in your
> >>> testing.  If you don't have a test case then let's proceed under the
> >>> assumption that it doesn't happen for clones.
> >>
> >> Hi Ilya,
> >>
> >>
> >> it seems that our assumption was false. I have an image which shows holes without diffing against
> >>
> >> a snapshot. Do you have an idea why?
> >>
> >>
> >> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven info dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
> >> rbd image 'c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw':
> >>       size 20 GiB in 20000 objects
> >>       order 20 (1 MiB objects)
> >>       snapshot_count: 2
> >>       id: 3d6daa102e4d9f
> >>       block_name_prefix: rbd_data.3d6daa102e4d9f
> >>       format: 2
> >>       features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
> >>       op_features:
> >>       flags:
> >>       create_timestamp: Tue Sep 21 14:16:56 2021
> >>       access_timestamp: Thu Jan  6 15:24:46 2022
> >>       modify_timestamp: Thu Jan  6 15:45:42 2022
> >>
> >>
> >> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven snap ls dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
> >> SNAPID  NAME                 SIZE    PROTECTED TIMESTAMP
> >>    12297  dlp-20210921-144509  20 GiB             Tue Sep 21 14:45:09 2021
> >>    17745  dlp-20220106-040000  20 GiB             Thu Jan  6 04:00:01 2022
> >>
> >>
> >> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven diff --whole-object dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw | grep zero
> >> 204472320    1048576  zero
> >> 1114636288   1048576  zero
> >> 1115684864   1048576  zero
> >> 1116733440   1048576  zero
> >> 1117782016   1048576  zero
> >> 1218445312   1048576  zero
> >> 1219493888   1048576  zero
> >> 1220542464   1048576  zero
> > Hi Peter,
> >
> > Yes, I stumbled upon this just yesterday while looking into another
> > librbd issue surfaced by this patch [1] and was going to email you and
> > the list after wrapping my head around this behavior.  I see where it
> > is coming from but I'm not sure what the right fix is.  I would prefer
> > to patch librbd but that may turn out to be not feasible.  Let me get
> > back on this next week.
> >
> > [1] https://tracker.ceph.com/issues/53784
>
>
> Looks like it was a good idea to add all these asserts...
>
>
> The issue I see does occur with a simple qemu-img convert on the given image as source. It triggers the assert(exists) in the cb.

Right, "qemu-img convert" issues a block status request.  Does your
image have snapshots?  This behavior should be caused by a snapshot
that contains a discarded object:

$ rbd create --size 16M foo
$ sudo rbd map foo
/dev/rbd0
$ sudo xfs_io -d -c 'pwrite -b 4M 0 16M' /dev/rbd0
wrote 16777216/16777216 bytes at offset 0
16 MiB, 4 ops; 0.4607 sec (34.726 MiB/sec and 8.6814 ops/sec)
$ rbd snap create foo@snap
Creating snap: 100% complete...done.
$ rbd diff foo
Offset    Length   Type
0         4194304  data
4194304   4194304  data
8388608   4194304  data
12582912  4194304  data

$ blkdiscard -o 4M -l 8M /dev/rbd0
$ rbd diff foo
Offset    Length   Type
0         4194304  data
4194304   4194304  zero
8388608   4194304  zero
12582912  4194304  data

If you remove that snapshot, these "holes" should disappear from the
report:

$ rbd snap rm foo@snap
Removing snap: 100% complete...done.
$ rbd diff foo
Offset    Length   Type
0         4194304  data
12582912  4194304  data

Thanks,

                Ilya
Peter Lieven Jan. 6, 2022, 7:39 p.m. UTC | #10
Am 06.01.22 um 18:47 schrieb Ilya Dryomov:
> On Thu, Jan 6, 2022 at 5:33 PM Peter Lieven <pl@kamp.de> wrote:
>> Am 06.01.22 um 17:01 schrieb Ilya Dryomov:
>>> On Thu, Jan 6, 2022 at 4:27 PM Peter Lieven <pl@kamp.de> wrote:
>>>> Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
>>>>> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>>>>>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
>>>>>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>>>>>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>>>>>> This results mainly in incorrect progress during block operations (e.g.
>>>>>>>> qemu-img convert with an rbd image as source).
>>>>>>>>
>>>>>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>>>>>> allocated and unallocated (all zero areas).
>>>>>>>>
>>>>>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>>>>>> the image has the fast-diff feature which depends on the object-map and
>>>>>>>> exclusive-lock features. In this case it is guaranteed that the information
>>>>>>>> is present in memory in the librbd client and thus very fast.
>>>>>>>>
>>>>>>>> If fast-diff is not available all areas are reported to be allocated
>>>>>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>>>>>
>>>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>>>>>> ---
>>>>>>>> V2->V3:
>>>>>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
>>>>>>>> - also check for fast-diff invalid flag [Ilya]
>>>>>>>> - *map and *file cant be NULL [Ilya]
>>>>>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>>>>>>>      unallocated area [Ilya]
>>>>>>>> - typo: catched -> caught [Ilya]
>>>>>>>> - changed wording about fast-diff, object-map and exclusive lock in
>>>>>>>>      commit msg [Ilya]
>>>>>>>>
>>>>>>>> V1->V2:
>>>>>>>> - add commit comment [Stefano]
>>>>>>>> - use failed_post_open [Stefano]
>>>>>>>> - remove redundant assert [Stefano]
>>>>>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>>>>>> - always set *file if its non NULL [Stefano]
>>>>>>>>
>>>>>>>>     block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>     1 file changed, 126 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>>>>>> index dcf82b15b8..3cb24f9981 100644
>>>>>>>> --- a/block/rbd.c
>>>>>>>> +++ b/block/rbd.c
>>>>>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>>>>>         return spec_info;
>>>>>>>>     }
>>>>>>>>
>>>>>>>> +typedef struct rbd_diff_req {
>>>>>>>> +    uint64_t offs;
>>>>>>>> +    uint64_t bytes;
>>>>>>>> +    int exists;
>>>>>>> Hi Peter,
>>>>>>>
>>>>>>> Nit: make exists a bool.  The one in the callback has to be an int
>>>>>>> because of the callback signature but let's not spread that.
>>>>>>>
>>>>>>>> +} rbd_diff_req;
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>>>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>>>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>>>>>> + * execution because we detected a change in the allocation status.
>>>>>>>> + */
>>>>>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>>>>>> +
>>>>>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>>>>>> +                                       int exists, void *opaque)
>>>>>>>> +{
>>>>>>>> +    struct rbd_diff_req *req = opaque;
>>>>>>>> +
>>>>>>>> +    assert(req->offs + req->bytes <= offs);
>>>>>>>> +
>>>>>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>>>>>> +        /*
>>>>>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>>>>>> +         * req->bytes contains the length of the allocated area before the
>>>>>>>> +         * unallocated area. stop further processing.
>>>>>>>> +         */
>>>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>>>> +    }
>>>>>>>> +    if (req->exists && !exists) {
>>>>>>>> +        /*
>>>>>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>>>>>> +         * contains the length of the allocated area before the hole.
>>>>>>>> +         * stop further processing.
>>>>>>>> +         */
>>>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>>> Do you have a test case for when this branch is taken?
>>>>>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>>>>>>
>>>>>>
>>>>>>>> +    }
>>>>>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>>>>>> +        /*
>>>>>>>> +         * we started in an unallocated area and hit the first allocated
>>>>>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>>>>>> +         * before the allocated area. stop further processing.
>>>>>>>> +         */
>>>>>>>> +        req->bytes = offs - req->offs;
>>>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /*
>>>>>>>> +     * assert that we caught all cases above and allocation state has not
>>>>>>>> +     * changed during callbacks.
>>>>>>>> +     */
>>>>>>>> +    assert(exists == req->exists || !req->bytes);
>>>>>>>> +    req->exists = exists;
>>>>>>>> +
>>>>>>>> +    /*
>>>>>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>>>>>> +     * for all allocated blocks present.
>>>>>>>> +     */
>>>>>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>>>>>> +    req->bytes = offs + len - req->offs;
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>>>>>> +                                                 bool want_zero, int64_t offset,
>>>>>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>>>>>> +                                                 int64_t *map,
>>>>>>>> +                                                 BlockDriverState **file)
>>>>>>>> +{
>>>>>>>> +    BDRVRBDState *s = bs->opaque;
>>>>>>>> +    int ret, r;
>>>>>>> Nit: I would rename ret to status or something like that to make
>>>>>>> it clear(er) that it is an actual value and never an error.  Or,
>>>>>>> even better, drop it entirely and return one of the two bitmasks
>>>>>>> directly.
>>>>>>>
>>>>>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>>>>>> +    uint64_t features, flags;
>>>>>>>> +
>>>>>>>> +    assert(offset + bytes <= s->image_size);
>>>>>>>> +
>>>>>>>> +    /* default to all sectors allocated */
>>>>>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>>>>>> +    *map = offset;
>>>>>>>> +    *file = bs;
>>>>>>>> +    *pnum = bytes;
>>>>>>>> +
>>>>>>>> +    /* check if RBD image supports fast-diff */
>>>>>>>> +    r = rbd_get_features(s->image, &features);
>>>>>>>> +    if (r < 0) {
>>>>>>>> +        goto out;
>>>>>>>> +    }
>>>>>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>>>>>>>> +        goto out;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /* check if RBD fast-diff result is valid */
>>>>>>>> +    r = rbd_get_flags(s->image, &flags);
>>>>>>>> +    if (r < 0) {
>>>>>>>> +        goto out;
>>>>>>>> +    }
>>>>>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>>>>>>>> +        goto out;
>>>>>>> Nit: I'm not a fan of labels that cover just the return statement.
>>>>>>> Feel free to ignore this one but I would get rid of it and replace
>>>>>>> these gotos with returns.
>>>>>> That would be return with the bitmask directly coded in if I also
>>>>>>
>>>>>> drop the ret variable. I can change that, no problem.
>>>>>>
>>>>>>
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>>>>>>>> +                          qemu_rbd_co_block_status_cb, &req);
>>>>>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>>>>>>>> +        goto out;
>>>>>>>> +    }
>>>>>>>> +    assert(req.bytes <= bytes);
>>>>>>>> +    if (!req.exists) {
>>>>>>>> +        if (r == 0 && !req.bytes) {
>>>>>>>> +            /*
>>>>>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>>>>>>>> +             * except for the case where an overlay has a hole where the parent
>>>>>>>> +             * has not. This here catches the case where no callback was
>>>>>>>> +             * invoked at all.
>>>>>>>> +             */
>>>>>>> The above is true in the case of diffing against a snapshot, i.e. when
>>>>>>> the "from" snapshot has some data where the "to" revision (whether HEAD
>>>>>>> or another snapshot) has a hole.  I don't think it is true for child vs
>>>>>>> parent (but it has been a while since I looked at the diff code).  As
>>>>>>> long as NULL is passed for fromsnapname, I would expect the callback to
>>>>>>> be invoked only for allocated areas.  If I'm right, we could simplify
>>>>>>> qemu_rbd_co_block_status_cb() considerably.
>>>>>> See my comment in the callback. Can you look at the diff code or give
>>>>>> me at least a pointer where I can find it. I am not that familiar with
>>>>>> the librbd code yet.
>>>>> I assumed that you added !exists handling because it came up in your
>>>>> testing.  If you don't have a test case then let's proceed under the
>>>>> assumption that it doesn't happen for clones.
>>>> Hi Ilya,
>>>>
>>>>
>>>> it seems that our assumption was false. I have an image which shows holes without diffing against
>>>>
>>>> a snapshot. Do you have an idea why?
>>>>
>>>>
>>>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven info dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
>>>> rbd image 'c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw':
>>>>       size 20 GiB in 20000 objects
>>>>       order 20 (1 MiB objects)
>>>>       snapshot_count: 2
>>>>       id: 3d6daa102e4d9f
>>>>       block_name_prefix: rbd_data.3d6daa102e4d9f
>>>>       format: 2
>>>>       features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
>>>>       op_features:
>>>>       flags:
>>>>       create_timestamp: Tue Sep 21 14:16:56 2021
>>>>       access_timestamp: Thu Jan  6 15:24:46 2022
>>>>       modify_timestamp: Thu Jan  6 15:45:42 2022
>>>>
>>>>
>>>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven snap ls dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
>>>> SNAPID  NAME                 SIZE    PROTECTED TIMESTAMP
>>>>    12297  dlp-20210921-144509  20 GiB             Tue Sep 21 14:45:09 2021
>>>>    17745  dlp-20220106-040000  20 GiB             Thu Jan  6 04:00:01 2022
>>>>
>>>>
>>>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven diff --whole-object dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw | grep zero
>>>> 204472320    1048576  zero
>>>> 1114636288   1048576  zero
>>>> 1115684864   1048576  zero
>>>> 1116733440   1048576  zero
>>>> 1117782016   1048576  zero
>>>> 1218445312   1048576  zero
>>>> 1219493888   1048576  zero
>>>> 1220542464   1048576  zero
>>> Hi Peter,
>>>
>>> Yes, I stumbled upon this just yesterday while looking into another
>>> librbd issue surfaced by this patch [1] and was going to email you and
>>> the list after wrapping my head around this behavior.  I see where it
>>> is coming from but I'm not sure what the right fix is.  I would prefer
>>> to patch librbd but that may turn out to be not feasible.  Let me get
>>> back on this next week.
>>>
>>> [1] https://tracker.ceph.com/issues/53784
>>
>> Looks like it was a good idea to add all these asserts...
>>
>>
>> The issue I see does occur with a simple qemu-img convert on the given image as source. It triggers the assert(exists) in the cb.
> Right, "qemu-img convert" issues a block status request.  Does your
> image have snapshots?  This behavior should be caused by a snapshot
> that contains a discarded object:
>
> $ rbd create --size 16M foo
> $ sudo rbd map foo
> /dev/rbd0
> $ sudo xfs_io -d -c 'pwrite -b 4M 0 16M' /dev/rbd0
> wrote 16777216/16777216 bytes at offset 0
> 16 MiB, 4 ops; 0.4607 sec (34.726 MiB/sec and 8.6814 ops/sec)
> $ rbd snap create foo@snap
> Creating snap: 100% complete...done.
> $ rbd diff foo
> Offset    Length   Type
> 0         4194304  data
> 4194304   4194304  data
> 8388608   4194304  data
> 12582912  4194304  data
>
> $ blkdiscard -o 4M -l 8M /dev/rbd0
> $ rbd diff foo
> Offset    Length   Type
> 0         4194304  data
> 4194304   4194304  zero
> 8388608   4194304  zero
> 12582912  4194304  data
>
> If you remove that snapshot, these "holes" should disappear from the
> report:
>
> $ rbd snap rm foo@snap
> Removing snap: 100% complete...done.
> $ rbd diff foo
> Offset    Length   Type
> 0         4194304  data
> 12582912  4194304  data
>
> Thanks,
>
>                 Ilya


So we have 2 different issues:


1) For the holes that we did not expect we should use the original version of my patch which was able to handle holes properly.

2) For issue #53784 the right would be to fix the issue in librbd. We could discuss if we try to "detect" this issue, report a warning once and

then disable the get_block_status functionality for the session. The question is, is it possible to "detect" the issue? If thats not possible

should we patch qemu to report unaligned requests as allocated for affected librbd versions?


Best,

Peter
Peter Lieven Jan. 8, 2022, 5:42 p.m. UTC | #11
Am 06.01.22 um 17:01 schrieb Ilya Dryomov:
> On Thu, Jan 6, 2022 at 4:27 PM Peter Lieven <pl@kamp.de> wrote:
>> Am 05.10.21 um 10:36 schrieb Ilya Dryomov:
>>> On Tue, Oct 5, 2021 at 10:19 AM Peter Lieven <pl@kamp.de> wrote:
>>>> Am 05.10.21 um 09:54 schrieb Ilya Dryomov:
>>>>> On Thu, Sep 16, 2021 at 2:21 PM Peter Lieven <pl@kamp.de> wrote:
>>>>>> the qemu rbd driver currently lacks support for bdrv_co_block_status.
>>>>>> This results mainly in incorrect progress during block operations (e.g.
>>>>>> qemu-img convert with an rbd image as source).
>>>>>>
>>>>>> This patch utilizes the rbd_diff_iterate2 call from librbd to detect
>>>>>> allocated and unallocated (all zero areas).
>>>>>>
>>>>>> To avoid querying the ceph OSDs for the answer this is only done if
>>>>>> the image has the fast-diff feature which depends on the object-map and
>>>>>> exclusive-lock features. In this case it is guaranteed that the information
>>>>>> is present in memory in the librbd client and thus very fast.
>>>>>>
>>>>>> If fast-diff is not available all areas are reported to be allocated
>>>>>> which is the current behaviour if bdrv_co_block_status is not implemented.
>>>>>>
>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>>>> ---
>>>>>> V2->V3:
>>>>>> - check rbd_flags every time (they can change during runtime) [Ilya]
>>>>>> - also check for fast-diff invalid flag [Ilya]
>>>>>> - *map and *file cant be NULL [Ilya]
>>>>>> - set ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID in case of an
>>>>>>     unallocated area [Ilya]
>>>>>> - typo: catched -> caught [Ilya]
>>>>>> - changed wording about fast-diff, object-map and exclusive lock in
>>>>>>     commit msg [Ilya]
>>>>>>
>>>>>> V1->V2:
>>>>>> - add commit comment [Stefano]
>>>>>> - use failed_post_open [Stefano]
>>>>>> - remove redundant assert [Stefano]
>>>>>> - add macro+comment for the magic -9000 value [Stefano]
>>>>>> - always set *file if its non NULL [Stefano]
>>>>>>
>>>>>>    block/rbd.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>    1 file changed, 126 insertions(+)
>>>>>>
>>>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>>>> index dcf82b15b8..3cb24f9981 100644
>>>>>> --- a/block/rbd.c
>>>>>> +++ b/block/rbd.c
>>>>>> @@ -1259,6 +1259,131 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
>>>>>>        return spec_info;
>>>>>>    }
>>>>>>
>>>>>> +typedef struct rbd_diff_req {
>>>>>> +    uint64_t offs;
>>>>>> +    uint64_t bytes;
>>>>>> +    int exists;
>>>>> Hi Peter,
>>>>>
>>>>> Nit: make exists a bool.  The one in the callback has to be an int
>>>>> because of the callback signature but let's not spread that.
>>>>>
>>>>>> +} rbd_diff_req;
>>>>>> +
>>>>>> +/*
>>>>>> + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
>>>>>> + * value in the callback routine. Choose a value that does not conflict with
>>>>>> + * an existing exitcode and return it if we want to prematurely stop the
>>>>>> + * execution because we detected a change in the allocation status.
>>>>>> + */
>>>>>> +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
>>>>>> +
>>>>>> +static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
>>>>>> +                                       int exists, void *opaque)
>>>>>> +{
>>>>>> +    struct rbd_diff_req *req = opaque;
>>>>>> +
>>>>>> +    assert(req->offs + req->bytes <= offs);
>>>>>> +
>>>>>> +    if (req->exists && offs > req->offs + req->bytes) {
>>>>>> +        /*
>>>>>> +         * we started in an allocated area and jumped over an unallocated area,
>>>>>> +         * req->bytes contains the length of the allocated area before the
>>>>>> +         * unallocated area. stop further processing.
>>>>>> +         */
>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>> +    }
>>>>>> +    if (req->exists && !exists) {
>>>>>> +        /*
>>>>>> +         * we started in an allocated area and reached a hole. req->bytes
>>>>>> +         * contains the length of the allocated area before the hole.
>>>>>> +         * stop further processing.
>>>>>> +         */
>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>> Do you have a test case for when this branch is taken?
>>>> That would happen if you diff from a snapshot, the question is if it can also happen if the image is a clone from a snapshot?
>>>>
>>>>
>>>>>> +    }
>>>>>> +    if (!req->exists && exists && offs > req->offs) {
>>>>>> +        /*
>>>>>> +         * we started in an unallocated area and hit the first allocated
>>>>>> +         * block. req->bytes must be set to the length of the unallocated area
>>>>>> +         * before the allocated area. stop further processing.
>>>>>> +         */
>>>>>> +        req->bytes = offs - req->offs;
>>>>>> +        return QEMU_RBD_EXIT_DIFF_ITERATE2;
>>>>>> +    }
>>>>>> +
>>>>>> +    /*
>>>>>> +     * assert that we caught all cases above and allocation state has not
>>>>>> +     * changed during callbacks.
>>>>>> +     */
>>>>>> +    assert(exists == req->exists || !req->bytes);
>>>>>> +    req->exists = exists;
>>>>>> +
>>>>>> +    /*
>>>>>> +     * assert that we either return an unallocated block or have got callbacks
>>>>>> +     * for all allocated blocks present.
>>>>>> +     */
>>>>>> +    assert(!req->exists || offs == req->offs + req->bytes);
>>>>>> +    req->bytes = offs + len - req->offs;
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
>>>>>> +                                                 bool want_zero, int64_t offset,
>>>>>> +                                                 int64_t bytes, int64_t *pnum,
>>>>>> +                                                 int64_t *map,
>>>>>> +                                                 BlockDriverState **file)
>>>>>> +{
>>>>>> +    BDRVRBDState *s = bs->opaque;
>>>>>> +    int ret, r;
>>>>> Nit: I would rename ret to status or something like that to make
>>>>> it clear(er) that it is an actual value and never an error.  Or,
>>>>> even better, drop it entirely and return one of the two bitmasks
>>>>> directly.
>>>>>
>>>>>> +    struct rbd_diff_req req = { .offs = offset };
>>>>>> +    uint64_t features, flags;
>>>>>> +
>>>>>> +    assert(offset + bytes <= s->image_size);
>>>>>> +
>>>>>> +    /* default to all sectors allocated */
>>>>>> +    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>>>> +    *map = offset;
>>>>>> +    *file = bs;
>>>>>> +    *pnum = bytes;
>>>>>> +
>>>>>> +    /* check if RBD image supports fast-diff */
>>>>>> +    r = rbd_get_features(s->image, &features);
>>>>>> +    if (r < 0) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +    if (!(features & RBD_FEATURE_FAST_DIFF)) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    /* check if RBD fast-diff result is valid */
>>>>>> +    r = rbd_get_flags(s->image, &flags);
>>>>>> +    if (r < 0) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
>>>>>> +        goto out;
>>>>> Nit: I'm not a fan of labels that cover just the return statement.
>>>>> Feel free to ignore this one but I would get rid of it and replace
>>>>> these gotos with returns.
>>>> That would be return with the bitmask directly coded in if I also
>>>>
>>>> drop the ret variable. I can change that, no problem.
>>>>
>>>>
>>>>>> +    }
>>>>>> +
>>>>>> +    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
>>>>>> +                          qemu_rbd_co_block_status_cb, &req);
>>>>>> +    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
>>>>>> +        goto out;
>>>>>> +    }
>>>>>> +    assert(req.bytes <= bytes);
>>>>>> +    if (!req.exists) {
>>>>>> +        if (r == 0 && !req.bytes) {
>>>>>> +            /*
>>>>>> +             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
>>>>>> +             * except for the case where an overlay has a hole where the parent
>>>>>> +             * has not. This here catches the case where no callback was
>>>>>> +             * invoked at all.
>>>>>> +             */
>>>>> The above is true in the case of diffing against a snapshot, i.e. when
>>>>> the "from" snapshot has some data where the "to" revision (whether HEAD
>>>>> or another snapshot) has a hole.  I don't think it is true for child vs
>>>>> parent (but it has been a while since I looked at the diff code).  As
>>>>> long as NULL is passed for fromsnapname, I would expect the callback to
>>>>> be invoked only for allocated areas.  If I'm right, we could simplify
>>>>> qemu_rbd_co_block_status_cb() considerably.
>>>> See my comment in the callback. Can you look at the diff code or give
>>>> me at least a pointer where I can find it. I am not that familiar with
>>>> the librbd code yet.
>>> I assumed that you added !exists handling because it came up in your
>>> testing.  If you don't have a test case then let's proceed under the
>>> assumption that it doesn't happen for clones.
>>
>> Hi Ilya,
>>
>>
>> it seems that our assumption was false. I have an image which shows holes without diffing against
>>
>> a snapshot. Do you have an idea why?
>>
>>
>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven info dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
>> rbd image 'c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw':
>>      size 20 GiB in 20000 objects
>>      order 20 (1 MiB objects)
>>      snapshot_count: 2
>>      id: 3d6daa102e4d9f
>>      block_name_prefix: rbd_data.3d6daa102e4d9f
>>      format: 2
>>      features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
>>      op_features:
>>      flags:
>>      create_timestamp: Tue Sep 21 14:16:56 2021
>>      access_timestamp: Thu Jan  6 15:24:46 2022
>>      modify_timestamp: Thu Jan  6 15:45:42 2022
>>
>>
>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven snap ls dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw
>> SNAPID  NAME                 SIZE    PROTECTED TIMESTAMP
>>   12297  dlp-20210921-144509  20 GiB             Tue Sep 21 14:45:09 2021
>>   17745  dlp-20220106-040000  20 GiB             Thu Jan  6 04:00:01 2022
>>
>>
>> $ rbd --conf /etc/ceph/ceph-dev01.conf --id lieven diff --whole-object dhp-standard/c4ca7ee9-36ce-4fc9-9d3b-ece8a4f8b83e/c1ad11d0-4f6a-4cc1-8aa3-ff3c413c1471.raw | grep zero
>> 204472320    1048576  zero
>> 1114636288   1048576  zero
>> 1115684864   1048576  zero
>> 1116733440   1048576  zero
>> 1117782016   1048576  zero
>> 1218445312   1048576  zero
>> 1219493888   1048576  zero
>> 1220542464   1048576  zero
> Hi Peter,
>
> Yes, I stumbled upon this just yesterday while looking into another
> librbd issue surfaced by this patch [1] and was going to email you and
> the list after wrapping my head around this behavior.  I see where it
> is coming from but I'm not sure what the right fix is.  I would prefer
> to patch librbd but that may turn out to be not feasible.  Let me get
> back on this next week.
>
> [1] https://tracker.ceph.com/issues/53784


Can you please verify and check if this would be a valid workaround for #53784


diff --git a/block/rbd.c b/block/rbd.c
index def96292e0..19c01bb89f 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1320,6 +1320,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
     int status, r;
     RBDDiffIterateReq req = { .offs = offset };
     uint64_t features, flags;
+    int64_t head;
 
     assert(offset + bytes <= s->image_size);
 
@@ -1347,6 +1348,11 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
         return status;
     }
 
+    head = offset & (s->object_size - 1);
+    offset -= head;
+    req.offs -= head;
+    bytes += head;
+
     r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
                           qemu_rbd_diff_iterate_cb, &req);
     if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
@@ -1366,7 +1372,8 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
         status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
     }
 
-    *pnum = req.bytes;
+    assert(req.bytes > head);
+    *pnum = req.bytes - head;
     return status;
 }


I would then prepare a series with two patches. One for #53784 and a seconds to fix the exists asssertion.


Best,

Peter
diff mbox series

Patch

diff --git a/block/rbd.c b/block/rbd.c
index dcf82b15b8..3cb24f9981 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1259,6 +1259,131 @@  static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
     return spec_info;
 }
 
+typedef struct rbd_diff_req {
+    uint64_t offs;
+    uint64_t bytes;
+    int exists;
+} rbd_diff_req;
+
+/*
+ * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
+ * value in the callback routine. Choose a value that does not conflict with
+ * an existing exitcode and return it if we want to prematurely stop the
+ * execution because we detected a change in the allocation status.
+ */
+#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+
+static int qemu_rbd_co_block_status_cb(uint64_t offs, size_t len,
+                                       int exists, void *opaque)
+{
+    struct rbd_diff_req *req = opaque;
+
+    assert(req->offs + req->bytes <= offs);
+
+    if (req->exists && offs > req->offs + req->bytes) {
+        /*
+         * we started in an allocated area and jumped over an unallocated area,
+         * req->bytes contains the length of the allocated area before the
+         * unallocated area. stop further processing.
+         */
+        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+    }
+    if (req->exists && !exists) {
+        /*
+         * we started in an allocated area and reached a hole. req->bytes
+         * contains the length of the allocated area before the hole.
+         * stop further processing.
+         */
+        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+    }
+    if (!req->exists && exists && offs > req->offs) {
+        /*
+         * we started in an unallocated area and hit the first allocated
+         * block. req->bytes must be set to the length of the unallocated area
+         * before the allocated area. stop further processing.
+         */
+        req->bytes = offs - req->offs;
+        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+    }
+
+    /*
+     * assert that we caught all cases above and allocation state has not
+     * changed during callbacks.
+     */
+    assert(exists == req->exists || !req->bytes);
+    req->exists = exists;
+
+    /*
+     * assert that we either return an unallocated block or have got callbacks
+     * for all allocated blocks present.
+     */
+    assert(!req->exists || offs == req->offs + req->bytes);
+    req->bytes = offs + len - req->offs;
+
+    return 0;
+}
+
+static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+                                                 bool want_zero, int64_t offset,
+                                                 int64_t bytes, int64_t *pnum,
+                                                 int64_t *map,
+                                                 BlockDriverState **file)
+{
+    BDRVRBDState *s = bs->opaque;
+    int ret, r;
+    struct rbd_diff_req req = { .offs = offset };
+    uint64_t features, flags;
+
+    assert(offset + bytes <= s->image_size);
+
+    /* default to all sectors allocated */
+    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    *map = offset;
+    *file = bs;
+    *pnum = bytes;
+
+    /* check if RBD image supports fast-diff */
+    r = rbd_get_features(s->image, &features);
+    if (r < 0) {
+        goto out;
+    }
+    if (!(features & RBD_FEATURE_FAST_DIFF)) {
+        goto out;
+    }
+
+    /* check if RBD fast-diff result is valid */
+    r = rbd_get_flags(s->image, &flags);
+    if (r < 0) {
+        goto out;
+    }
+    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+        goto out;
+    }
+
+    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+                          qemu_rbd_co_block_status_cb, &req);
+    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+        goto out;
+    }
+    assert(req.bytes <= bytes);
+    if (!req.exists) {
+        if (r == 0 && !req.bytes) {
+            /*
+             * rbd_diff_iterate2 does not invoke callbacks for unallocated areas
+             * except for the case where an overlay has a hole where the parent
+             * has not. This here catches the case where no callback was
+             * invoked at all.
+             */
+            req.bytes = bytes;
+        }
+        ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+    }
+    *pnum = req.bytes;
+
+out:
+    return ret;
+}
+
 static int64_t qemu_rbd_getlength(BlockDriverState *bs)
 {
     BDRVRBDState *s = bs->opaque;
@@ -1494,6 +1619,7 @@  static BlockDriver bdrv_rbd = {
 #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
     .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
 #endif
+    .bdrv_co_block_status   = qemu_rbd_co_block_status,
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,