[v2,5/8] qemu-img: rebase: avoid unnecessary COW operations

Message ID 20230915162016.141771-6-andrey.drobyshev@virtuozzo.com (mailing list archive)
State New, archived
Series qemu-img: rebase: add compression support

Commit Message

Andrey Drobyshev Sept. 15, 2023, 4:20 p.m. UTC
When rebasing an image from one backing file to another, we need to
compare data from old and new backings.  If the diff between that data
happens to be unaligned to the target cluster size, we might end up
doing partial writes, which would lead to copy-on-write and additional IO.

Consider the following simple case (virtual_size == cluster_size == 64K):

base <-- inc1 <-- inc2

qemu-io -c "write -P 0xaa 0 32K" base.qcow2
qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2

While doing the rebase, we'll write half of the cluster to inc2, and the
block layer will have to read the 2nd half of the same cluster from the
backing image inc1 while doing this write operation, although the whole
cluster was already read earlier to perform the data comparison.

In order to avoid these unnecessary IO cycles, let's make sure every
write request is aligned to the overlay's subcluster boundaries.  Using
the subcluster size is universal, as for images which don't have
subclusters this size equals the cluster size, so in any case we end up
aligning to the smallest unit of allocation.

Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 qemu-img.c | 76 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 20 deletions(-)
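
To illustrate the rounding the patch performs, here is a worked example
(not part of the patch; assume write_align == 64K and an unallocated
region found at offset 96K with length 16K):

    n      += offset - QEMU_ALIGN_DOWN(offset, write_align);   /* n: 16K -> 48K      */
    offset  = QEMU_ALIGN_DOWN(offset, write_align);            /* offset: 96K -> 64K */
    n      += QEMU_ALIGN_UP(offset + n, write_align)
              - (offset + n);                                  /* n: 48K -> 64K      */

The resulting write [64K, 128K) covers exactly one subcluster, so the
subsequent blk_pwrite() triggers no copy-on-write.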

Comments

Eric Blake Sept. 15, 2023, 9:52 p.m. UTC | #1
On Fri, Sep 15, 2023 at 07:20:13PM +0300, Andrey Drobyshev wrote:
> When rebasing an image from one backing file to another, we need to
> compare data from old and new backings.  If the diff between that data
> happens to be unaligned to the target cluster size, we might end up
> doing partial writes, which would lead to copy-on-write and additional IO.
> 
> Consider the following simple case (virtual_size == cluster_size == 64K):
> 
> base <-- inc1 <-- inc2
> 
> qemu-io -c "write -P 0xaa 0 32K" base.qcow2
> qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
> qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
> qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
> qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2
> 
> While doing the rebase, we'll write half of the cluster to inc2, and the
> block layer will have to read the 2nd half of the same cluster from the
> backing image inc1 while doing this write operation, although the whole
> cluster was already read earlier to perform the data comparison.
> 
> In order to avoid these unnecessary IO cycles, let's make sure every
> write request is aligned to the overlay's subcluster boundaries.  Using
> the subcluster size is universal, as for images which don't have
> subclusters this size equals the cluster size, so in any case we end up
> aligning to the smallest unit of allocation.
> 
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>  qemu-img.c | 76 ++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 56 insertions(+), 20 deletions(-)
> 
> diff --git a/qemu-img.c b/qemu-img.c
> index fcd31d7b5b..83950af42b 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -3523,6 +3523,7 @@ static int img_rebase(int argc, char **argv)
>      uint8_t *buf_new = NULL;
>      BlockDriverState *bs = NULL, *prefix_chain_bs = NULL;
>      BlockDriverState *unfiltered_bs;
> +    BlockDriverInfo bdi = {0};
>      char *filename;
>      const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
>      int c, flags, src_flags, ret;
> @@ -3533,6 +3534,7 @@ static int img_rebase(int argc, char **argv)
>      bool quiet = false;
>      Error *local_err = NULL;
>      bool image_opts = false;
> +    int64_t write_align;
>  
>      /* Parse commandline parameters */
>      fmt = NULL;
> @@ -3656,6 +3658,20 @@ static int img_rebase(int argc, char **argv)
>          }
>      }
>  
> +    /*
> +     * We need overlay subcluster size to make sure write requests are
> +     * aligned.
> +     */
> +    ret = bdrv_get_info(unfiltered_bs, &bdi);
> +    if (ret < 0) {
> +        error_report("could not get block driver info");
> +        goto out;
> +    } else if (bdi.subcluster_size == 0) {
> +        bdi.subcluster_size = 1;
> +    }
> +
> +    write_align = bdi.subcluster_size;
> +
>      /* For safe rebasing we need to compare old and new backing file */
>      if (!unsafe) {
>          QDict *options = NULL;
> @@ -3753,7 +3769,7 @@ static int img_rebase(int argc, char **argv)
>          int64_t old_backing_size = 0;
>          int64_t new_backing_size = 0;
>          uint64_t offset;
> -        int64_t n;
> +        int64_t n, n_old = 0, n_new = 0;
>          float local_progress = 0;
>  
>          if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk)) >
> @@ -3799,7 +3815,8 @@ static int img_rebase(int argc, char **argv)
>          }
>  
>          for (offset = 0; offset < size; offset += n) {
> -            bool buf_old_is_zero = false;
> +            bool old_backing_eof = false;
> +            int64_t n_alloc;
>  
>              /* How many bytes can we handle with the next read? */
>              n = MIN(IO_BUF_SIZE, size - offset);
> @@ -3844,33 +3861,48 @@ static int img_rebase(int argc, char **argv)
>                  }
>              }
>  
> +            /*
> +             * At this point we know that the region [offset; offset + n)
> +             * is unallocated within the target image.  This region might be
> +             * unaligned to the target image's (sub)cluster boundaries, as
> +             * old backing may have smaller clusters (or have subclusters).
> +             * We extend it to the aligned boundaries to avoid CoW on
> +             * partial writes in blk_pwrite().
> +             */
> +            n += offset - QEMU_ALIGN_DOWN(offset, write_align);
> +            offset = QEMU_ALIGN_DOWN(offset, write_align);

If we are always aligning to write_align on each iteration of this
loop, won't this round down always be a no-op?

> +            n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
> +            n = MIN(n, size - offset);

However, I can see how this round up can matter.

> +            assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) &&
> +                   n_alloc == n);

This assertion feels a bit heavyweight.  I see what you're trying to
say: if we found a (partial) unallocated region in the destination,
then since write_align is the minimum alignment of such allocation,
our rounding up to alignment boundaries should not change the fact
that we still have an unallocated region in the destination.  But we
already checked the data from the original offset through the original
n, and I argue that the original offset was unchanged; so really, the
most you'd need to assert (if the assertion is even necessary) would be
something like

assert(new_offset == orig_offset);
tail = new_n - orig_n;
assert(!bdrv_is_allocated(unfiltered_bs, orig_offset+orig_n, tail, &n_alloc) && n_alloc == tail);

> +
> +            /*
> +             * Much like the with the target image, we'll try to read as much
> +             * of the old and new backings as we can.
> +             */
> +            n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
> +            if (blk_new_backing) {
> +                n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));
> +            }
> +
>              /*
>               * Read old and new backing file and take into consideration that
>               * backing files may be smaller than the COW image.
>               */
> -            if (offset >= old_backing_size) {
> -                memset(buf_old, 0, n);
> -                buf_old_is_zero = true;
> +            memset(buf_old + n_old, 0, n - n_old);
> +            if (!n_old) {
> +                old_backing_eof = true;
>              } else {
> -                if (offset + n > old_backing_size) {
> -                    n = old_backing_size - offset;
> -                }
> -
> -                ret = blk_pread(blk_old_backing, offset, n, buf_old, 0);
> +                ret = blk_pread(blk_old_backing, offset, n_old, buf_old, 0);

Here's a more fundamental question.  Why are we reading from the old
backing file?  At this point in time, isn't unfiltered_bs (the target
image) still chained to the old backing file?  Why not just do a
blk_pread() from the destination?  It will cause the block layer to
read through the backing layers on our behalf, but the block layer
will then take care of any needed zeroing without us having to do a
memset here.

Then, once we have the contents of the disk (as seen through the
destination backed by the old image), we can compare to what the new
image would read, to see if we still need to write into the
destination or can just let the destination rely on backing from the
new image.

>                  if (ret < 0) {
>                      error_report("error while reading from old backing file");
>                      goto out;
>                  }
>              }
>  
> -            if (offset >= new_backing_size || !blk_new_backing) {
> -                memset(buf_new, 0, n);
> -            } else {
> -                if (offset + n > new_backing_size) {
> -                    n = new_backing_size - offset;
> -                }
> -
> -                ret = blk_pread(blk_new_backing, offset, n, buf_new, 0);
> +            memset(buf_new + n_new, 0, n - n_new);
> +            if (blk_new_backing && n_new) {
> +                ret = blk_pread(blk_new_backing, offset, n_new, buf_new, 0);
>                  if (ret < 0) {
>                      error_report("error while reading from new backing file");
>                      goto out;
> @@ -3884,11 +3916,12 @@ static int img_rebase(int argc, char **argv)
>                  int64_t pnum;
>  
>                  if (compare_buffers(buf_old + written, buf_new + written,
> -                                    n - written, 0, &pnum))
> +                                    n - written, write_align, &pnum))
>                  {
> -                    if (buf_old_is_zero) {
> +                    if (old_backing_eof) {
>                          ret = blk_pwrite_zeroes(blk, offset + written, pnum, 0);

Deciding whether to write zeroes (which can be more efficient) is
possible for more than just when the old backing file has already
reached EOF.

>                      } else {
> +                        assert(written + pnum <= IO_BUF_SIZE);
>                          ret = blk_pwrite(blk, offset + written, pnum,
>                                           buf_old + written, 0);
>                      }
> @@ -3900,6 +3933,9 @@ static int img_rebase(int argc, char **argv)
>                  }
>  
>                  written += pnum;
> +                if (offset + written >= old_backing_size) {
> +                    old_backing_eof = true;
> +                }
>              }
>              qemu_progress_print(local_progress, 100);
>          }
> -- 
> 2.39.3
>

The idea behind this patch makes sense, but the function is already so
long and complicated, and you are adding more complexity.  I'll
continue reviewing the rest of the series, but I'll be interested in
seeing whether any other block maintainers have an opinion on this patch.

Andrey Drobyshev Sept. 16, 2023, 5:45 p.m. UTC | #2
On 9/16/23 00:52, Eric Blake wrote:
> On Fri, Sep 15, 2023 at 07:20:13PM +0300, Andrey Drobyshev wrote:
>> When rebasing an image from one backing file to another, we need to
>> compare data from old and new backings.  If the diff between that data
>> happens to be unaligned to the target cluster size, we might end up
>> doing partial writes, which would lead to copy-on-write and additional IO.
>>
>> Consider the following simple case (virtual_size == cluster_size == 64K):
>>
>> base <-- inc1 <-- inc2
>>
>> qemu-io -c "write -P 0xaa 0 32K" base.qcow2
>> qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
>> qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
>> qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
>> qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2
>>
>> While doing the rebase, we'll write half of the cluster to inc2, and the
>> block layer will have to read the 2nd half of the same cluster from the
>> backing image inc1 while doing this write operation, although the whole
>> cluster was already read earlier to perform the data comparison.
>>
>> In order to avoid these unnecessary IO cycles, let's make sure every
>> write request is aligned to the overlay's subcluster boundaries.  Using
>> the subcluster size is universal, as for images which don't have
>> subclusters this size equals the cluster size, so in any case we end up
>> aligning to the smallest unit of allocation.
>>
>> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
>> ---
>>  qemu-img.c | 76 ++++++++++++++++++++++++++++++++++++++++--------------
>>  1 file changed, 56 insertions(+), 20 deletions(-)
>>
>> diff --git a/qemu-img.c b/qemu-img.c
>> index fcd31d7b5b..83950af42b 100644
>> --- a/qemu-img.c
>> +++ b/qemu-img.c
>> @@ -3523,6 +3523,7 @@ static int img_rebase(int argc, char **argv)
>>      uint8_t *buf_new = NULL;
>>      BlockDriverState *bs = NULL, *prefix_chain_bs = NULL;
>>      BlockDriverState *unfiltered_bs;
>> +    BlockDriverInfo bdi = {0};
>>      char *filename;
>>      const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
>>      int c, flags, src_flags, ret;
>> @@ -3533,6 +3534,7 @@ static int img_rebase(int argc, char **argv)
>>      bool quiet = false;
>>      Error *local_err = NULL;
>>      bool image_opts = false;
>> +    int64_t write_align;
>>  
>>      /* Parse commandline parameters */
>>      fmt = NULL;
>> @@ -3656,6 +3658,20 @@ static int img_rebase(int argc, char **argv)
>>          }
>>      }
>>  
>> +    /*
>> +     * We need overlay subcluster size to make sure write requests are
>> +     * aligned.
>> +     */
>> +    ret = bdrv_get_info(unfiltered_bs, &bdi);
>> +    if (ret < 0) {
>> +        error_report("could not get block driver info");
>> +        goto out;
>> +    } else if (bdi.subcluster_size == 0) {
>> +        bdi.subcluster_size = 1;
>> +    }
>> +
>> +    write_align = bdi.subcluster_size;
>> +
>>      /* For safe rebasing we need to compare old and new backing file */
>>      if (!unsafe) {
>>          QDict *options = NULL;
>> @@ -3753,7 +3769,7 @@ static int img_rebase(int argc, char **argv)
>>          int64_t old_backing_size = 0;
>>          int64_t new_backing_size = 0;
>>          uint64_t offset;
>> -        int64_t n;
>> +        int64_t n, n_old = 0, n_new = 0;
>>          float local_progress = 0;
>>  
>>          if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk)) >
>> @@ -3799,7 +3815,8 @@ static int img_rebase(int argc, char **argv)
>>          }
>>  
>>          for (offset = 0; offset < size; offset += n) {
>> -            bool buf_old_is_zero = false;
>> +            bool old_backing_eof = false;
>> +            int64_t n_alloc;
>>  
>>              /* How many bytes can we handle with the next read? */
>>              n = MIN(IO_BUF_SIZE, size - offset);
>> @@ -3844,33 +3861,48 @@ static int img_rebase(int argc, char **argv)
>>                  }
>>              }
>>  
>> +            /*
>> +             * At this point we know that the region [offset; offset + n)
>> +             * is unallocated within the target image.  This region might be
>> +             * unaligned to the target image's (sub)cluster boundaries, as
>> +             * old backing may have smaller clusters (or have subclusters).
>> +             * We extend it to the aligned boundaries to avoid CoW on
>> +             * partial writes in blk_pwrite().
>> +             */
>> +            n += offset - QEMU_ALIGN_DOWN(offset, write_align);
>> +            offset = QEMU_ALIGN_DOWN(offset, write_align);
> 
> If we are always aligning to write_align on each iteration of this
> loop, won't this round down always be a no-op?
> 

No, it's not always the case.  Bear in mind that while the call to
bdrv_is_allocated() operates on the target's granularity, the call to
bdrv_is_allocated_above() makes n aligned to the old backing's
granularity.  Consider the following case (it was brought up by Hanna
under v1 and is now covered in tests by the commit "iotests/{024, 271}:
add testcases for qemu-img rebase"):

qemu-img create -f qcow2 base.qcow2 1M
qemu-img create -f qcow2 -b base.qcow2 -F qcow2 mid.qcow2 1M
qemu-img create -f qcow2 -o cluster_size=1M -b mid.qcow2 -F qcow2 top.qcow2 1M

qemu-io -c 'write 64k 64k' mid.qcow2
qemu-img rebase -b base.qcow2 -F qcow2 top.qcow2


Base: |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|    64K
Mid:  |-|*|-|-|-|-|-|-|-|-|-|-|-|-|-|-|    64K
Top:  |- - - - - - - - - - - - - - - -|    1M


Then we'll have (without rounding down):

offset == 0, n == 1M

1st iteration:
bdrv_is_allocated() == 0 --> n == 1M
bdrv_is_allocated_above() == 0 --> n == 64K

offset == 64K, n == 960K

2nd iteration:
bdrv_is_allocated() == 0 --> n == 960K
bdrv_is_allocated_above() == 1 --> n == 64K

And then we'll proceed writing 64K w/o rounding up or 960K w/ rounding up.

So the whole point of expanding the unaligned region to its natural
boundaries is to write (sub)cluster-aligned chunks.  I agree that in this
particular case we don't win anything in terms of CoW, since in either
case we'll end up writing exactly 1M.  However, respecting cluster
boundaries will matter once we do compressed writes.

If you want to make things more orderly, I can move the rounding-down
part to the commit ("qemu-img: add compression option to rebase
subcommand").


>> +            n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
>> +            n = MIN(n, size - offset);
> 
> However, I can see how this round up can matter.
> 
>> +            assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) &&
>> +                   n_alloc == n);
> 
> This assertion feels a bit heavyweight.  I see what you're trying to
> say: if we found a (partial) unallocated region in the destination,
> then since write_align is the minimum alignment of such allocation,
> our rounding up to alignment boundaries should not change the fact
> that we still have an unallocated region in the destination.  But we
> already checked the data from the original offset through the original
> n, and I argue that the original offset was unchanged; so really, the
> most you'd need to assert (if the assertion is even necessary) would
> be something like
> 
> assert(new_offset == orig_offset);
> tail = new_n - orig_n;
> assert(!bdrv_is_allocated(unfiltered_bs, orig_offset+orig_n, tail, &n_alloc) && n_alloc == tail);
>

Since the original offset may in fact change, as we've established above,
I still think that checking the whole expanded region for being
unallocated makes more sense than checking just the tail.

>> +
>> +            /*
>> +             * Much like the with the target image, we'll try to read as much
>> +             * of the old and new backings as we can.
>> +             */
>> +            n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
>> +            if (blk_new_backing) {
>> +                n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));
>> +            }
>> +
>>              /*
>>               * Read old and new backing file and take into consideration that
>>               * backing files may be smaller than the COW image.
>>               */
>> -            if (offset >= old_backing_size) {
>> -                memset(buf_old, 0, n);
>> -                buf_old_is_zero = true;
>> +            memset(buf_old + n_old, 0, n - n_old);
>> +            if (!n_old) {
>> +                old_backing_eof = true;
>>              } else {
>> -                if (offset + n > old_backing_size) {
>> -                    n = old_backing_size - offset;
>> -                }
>> -
>> -                ret = blk_pread(blk_old_backing, offset, n, buf_old, 0);
>> +                ret = blk_pread(blk_old_backing, offset, n_old, buf_old, 0);
> 
> Here's a more fundamental question.  Why are we reading from the old
> backing file?  At this point in time, isn't unfiltered_bs (the target
> image) still chained to the old backing file?  Why not just do a
> blk_pread() from the destination?  It will cause the block layer to
> read through the backing layers on our behalf, but the block layer
> will then take care of any needed zeroing without us having to do a
> memset here.
> 
> Then, once we have the contents of the disk (as seen through the
> destination backed by the old image), we can compare to what the new
> image would read, to see if we still need to write into the
> destination or can just let the destination rely on backing from the
> new image.
> 

Hmm, that's actually a great question.  At first glance there are indeed
no obstacles to getting rid of blk_old_backing entirely.  The only
drawback in your scheme above is that we wouldn't keep track of the old
backing size, thus losing the possibility of calling blk_pwrite_zeroes()
directly right after we've reached the end of the old backing.  But I
suppose we can still keep that size in mind while reading from the
target as you've described.

This optimization hadn't crossed my mind, as I was mainly focused on the
alignment here, but I suppose we can try it in one of the upcoming
series (unless someone points out other drawbacks of reading directly
from the target).
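
For the record, a rough sketch of that alternative (hypothetical, not
what this patch does; blk is the target, which at this point is still
chained to the old backing):

    /*
     * Let the block layer read through the target's current backing
     * chain: unallocated parts of the target are filled from the old
     * backing and zero-padded past its EOF, so no manual memset of
     * buf_old would be needed.
     */
    ret = blk_pread(blk, offset, n, buf_old, 0);
    if (ret < 0) {
        error_report("error while reading through the old backing chain");
        goto out;
    }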

>>                  if (ret < 0) {
>>                      error_report("error while reading from old backing file");
>>                      goto out;
>>                  }
>>              }
>>  
>> -            if (offset >= new_backing_size || !blk_new_backing) {
>> -                memset(buf_new, 0, n);
>> -            } else {
>> -                if (offset + n > new_backing_size) {
>> -                    n = new_backing_size - offset;
>> -                }
>> -
>> -                ret = blk_pread(blk_new_backing, offset, n, buf_new, 0);
>> +            memset(buf_new + n_new, 0, n - n_new);
>> +            if (blk_new_backing && n_new) {
>> +                ret = blk_pread(blk_new_backing, offset, n_new, buf_new, 0);
>>                  if (ret < 0) {
>>                      error_report("error while reading from new backing file");
>>                      goto out;
>> @@ -3884,11 +3916,12 @@ static int img_rebase(int argc, char **argv)
>>                  int64_t pnum;
>>  
>>                  if (compare_buffers(buf_old + written, buf_new + written,
>> -                                    n - written, 0, &pnum))
>> +                                    n - written, write_align, &pnum))
>>                  {
>> -                    if (buf_old_is_zero) {
>> +                    if (old_backing_eof) {
>>                          ret = blk_pwrite_zeroes(blk, offset + written, pnum, 0);
> 
> Deciding whether to write zeroes (which can be more efficient) is
> possible for more than just when the old backing file has already
> reached EOF.
> 

Sure, as you can see I haven't changed the logic here, simply renamed
the flag to indicate the condition more accurately.  But if we end up
reading directly from the target as you've suggested, the only way I can
think of to know for sure whether we should write zeroes is to
iteratively compare portions of the read buffer to a predefined zero
buffer.  That's because at this point we know for sure that the region
is unallocated in the target, and we can't query bdrv_block_status() to
learn that it reads as zeroes.
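
Such a check might look roughly like this (a sketch; it assumes QEMU's
buffer_is_zero() helper from qemu/cutils.h):

    if (buffer_is_zero(buf_old + written, pnum)) {
        /* Data reads as zeroes: a zero write is cheaper than a data write. */
        ret = blk_pwrite_zeroes(blk, offset + written, pnum, 0);
    } else {
        ret = blk_pwrite(blk, offset + written, pnum,
                         buf_old + written, 0);
    }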

>>                      } else {
>> +                        assert(written + pnum <= IO_BUF_SIZE);
>>                          ret = blk_pwrite(blk, offset + written, pnum,
>>                                           buf_old + written, 0);
>>                      }
>> @@ -3900,6 +3933,9 @@ static int img_rebase(int argc, char **argv)
>>                  }
>>  
>>                  written += pnum;
>> +                if (offset + written >= old_backing_size) {
>> +                    old_backing_eof = true;
>> +                }
>>              }
>>              qemu_progress_print(local_progress, 100);
>>          }
>> -- 
>> 2.39.3
>>
> 
> The idea behind this patch makes sense, but the function is already so
> long and complicated, and you are adding more complexity.  I'll
> continue reviewing the rest of the series, but I'll be interested in
> seeing whether any other block maintainers have an opinion on this patch.
> 

Sure, with the optimization comes complexity.  Since we send fewer read
requests and avoid redundant CoW during rebase, the speed improves
significantly.  E.g. these are the numbers I get when rebasing the
topmost image in the backup chain of a VM I have at hand:

base.qcow2 <-- delta.qcow2

# qemu-img info delta.qcow2 | grep size
virtual size: 64 GiB (68719476736 bytes)
disk size: 1.6 GiB
cluster_size: 65536

# qemu-img check delta.qcow2
No errors were found on the image.
34266/1048576 = 3.27% allocated, 99.76% fragmented, 99.47% compressed clusters
Image end offset: 1722548224

# qemu-img info base.qcow2 | grep size
virtual size: 64 GiB (68719476736 bytes)
disk size: 4.06 GiB
cluster_size: 65536

# qemu-img check base.qcow2
No errors were found on the image.
133839/1048576 = 12.76% allocated, 87.92% fragmented, 76.32% compressed clusters
Image end offset: 4391895040

vanilla:

# time qemu-img rebase -f qcow2 -b '' delta.qcow2.vanilla

real    0m49.192s
user    0m27.253s
sys     0m11.894s

patched:

# time qemu-img rebase -f qcow2 -b '' delta.qcow2.patched

real    0m33.071s
user    0m15.475s
sys     0m8.446s

#

# diff <(qemu-img map --output=json delta.qcow2.patched) <(qemu-img map --output=json delta.qcow2.vanilla) && echo $?
0

# du --block-size=1 delta.qcow2.patched delta.qcow2.vanilla
10409897984     delta.qcow2.patched
10409897984     delta.qcow2.vanilla

# qemu-img compare delta.qcow2.vanilla delta.qcow2.patched
Images are identical.

Hanna Czenczek Sept. 19, 2023, 10:46 a.m. UTC | #3
On 15.09.23 18:20, Andrey Drobyshev wrote:
> When rebasing an image from one backing file to another, we need to
> compare data from old and new backings.  If the diff between that data
> happens to be unaligned to the target cluster size, we might end up
> doing partial writes, which would lead to copy-on-write and additional IO.
>
> Consider the following simple case (virtual_size == cluster_size == 64K):
>
> base <-- inc1 <-- inc2
>
> qemu-io -c "write -P 0xaa 0 32K" base.qcow2
> qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
> qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
> qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
> qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2
>
> While doing rebase, we'll write a half of the cluster to inc2, and block
> layer will have to read the 2nd half of the same cluster from the base image
> inc1 while doing this write operation, although the whole cluster is already
> read earlier to perform data comparison.
>
> In order to avoid these unnecessary IO cycles, let's make sure every
> write request is aligned to the overlay subcluster boundaries.  Using
> subcluster size is universal as for the images which don't have them
> this size equals to the cluster size, so in any case we end up aligning
> to the smallest unit of allocation.
>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>   qemu-img.c | 76 ++++++++++++++++++++++++++++++++++++++++--------------
>   1 file changed, 56 insertions(+), 20 deletions(-)

Looks good, I like the changes from v1!  Two minor things:

> diff --git a/qemu-img.c b/qemu-img.c
> index fcd31d7b5b..83950af42b 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c

[...]

> @@ -3844,33 +3861,48 @@ static int img_rebase(int argc, char **argv)
>                   }
>               }
>   
> +            /*
> +             * At this point we know that the region [offset; offset + n)
> +             * is unallocated within the target image.  This region might be
> +             * unaligned to the target image's (sub)cluster boundaries, as
> +             * old backing may have smaller clusters (or have subclusters).
> +             * We extend it to the aligned boundaries to avoid CoW on
> +             * partial writes in blk_pwrite().
> +             */
> +            n += offset - QEMU_ALIGN_DOWN(offset, write_align);
> +            offset = QEMU_ALIGN_DOWN(offset, write_align);
> +            n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
> +            n = MIN(n, size - offset);
> +            assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) &&
> +                   n_alloc == n);
> +
> +            /*
> +             * Much like the with the target image, we'll try to read as much

s/the with the/with the/

> +             * of the old and new backings as we can.
> +             */
> +            n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
> +            if (blk_new_backing) {
> +                n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));
> +            }

If we don’t have a check for blk_old_backing (old_backing_size is 0 if 
blk_old_backing is NULL), why do we have a check for blk_new_backing 
(new_backing_size is 0 if blk_new_backing is NULL)?

(Perhaps because the previous check was `offset >= new_backing_size || 
!blk_new_backing`, i.e. included exactly such a check – but I don’t 
think it’s necessary, new_backing_size will be 0 if blk_new_backing is 
NULL.)
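
I.e. both computations could then be written symmetrically (a sketch of
the suggested simplification):

    n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
    n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));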

> +
>               /*
>                * Read old and new backing file and take into consideration that
>                * backing files may be smaller than the COW image.
>                */
> -            if (offset >= old_backing_size) {
> -                memset(buf_old, 0, n);
> -                buf_old_is_zero = true;
> +            memset(buf_old + n_old, 0, n - n_old);
> +            if (!n_old) {
> +                old_backing_eof = true;
>               } else {
> -                if (offset + n > old_backing_size) {
> -                    n = old_backing_size - offset;
> -                }
> -
> -                ret = blk_pread(blk_old_backing, offset, n, buf_old, 0);
> +                ret = blk_pread(blk_old_backing, offset, n_old, buf_old, 0);
>                   if (ret < 0) {
>                       error_report("error while reading from old backing file");
>                       goto out;
>                   }
>               }
>   
> -            if (offset >= new_backing_size || !blk_new_backing) {
> -                memset(buf_new, 0, n);
> -            } else {
> -                if (offset + n > new_backing_size) {
> -                    n = new_backing_size - offset;
> -                }
> -
> -                ret = blk_pread(blk_new_backing, offset, n, buf_new, 0);
> +            memset(buf_new + n_new, 0, n - n_new);
> +            if (blk_new_backing && n_new) {

Same as above, I think we can drop the blk_new_backing check, just so 
that both blocks (for old and new) look the same.

(Also, the memset() already has to trust that n_new is 0 if 
blk_new_backing is NULL, so the check doesn’t make much sense from that 
perspective either, and makes the memset() look wrong.)

> +                ret = blk_pread(blk_new_backing, offset, n_new, buf_new, 0);
>                   if (ret < 0) {
>                       error_report("error while reading from new backing file");
>                       goto out;
Andrey Drobyshev Sept. 19, 2023, 4:21 p.m. UTC | #4
On 9/19/23 13:46, Hanna Czenczek wrote:
> On 15.09.23 18:20, Andrey Drobyshev wrote:
>> When rebasing an image from one backing file to another, we need to
>> compare data from old and new backings.  If the diff between that data
>> happens to be unaligned to the target cluster size, we might end up
>> doing partial writes, which would lead to copy-on-write and additional
>> IO.
>>
>> Consider the following simple case (virtual_size == cluster_size == 64K):
>>
>> base <-- inc1 <-- inc2
>>
>> qemu-io -c "write -P 0xaa 0 32K" base.qcow2
>> qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
>> qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
>> qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
>> qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2
>>
>> While doing the rebase, we'll write half of the cluster to inc2, and the
>> block layer will have to read the 2nd half of the same cluster from the
>> backing image inc1 while doing this write operation, although the whole
>> cluster was already read earlier to perform the data comparison.
>>
>> In order to avoid these unnecessary IO cycles, let's make sure every
>> write request is aligned to the overlay's subcluster boundaries.  Using
>> the subcluster size is universal, as for images which don't have
>> subclusters this size equals the cluster size, so in any case we end up
>> aligning to the smallest unit of allocation.
>>
>> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
>> ---
>>   qemu-img.c | 76 ++++++++++++++++++++++++++++++++++++++++--------------
>>   1 file changed, 56 insertions(+), 20 deletions(-)
> 
> Looks good, I like the changes from v1!  Two minor things:
> 
>> diff --git a/qemu-img.c b/qemu-img.c
>> index fcd31d7b5b..83950af42b 100644
>> --- a/qemu-img.c
>> +++ b/qemu-img.c
> 
> [...]
> 
>> @@ -3844,33 +3861,48 @@ static int img_rebase(int argc, char **argv)
>>                   }
>>               }
>>   +            /*
>> +             * At this point we know that the region [offset; offset
>> + n)
>> +             * is unallocated within the target image.  This region
>> might be
>> +             * unaligned to the target image's (sub)cluster
>> boundaries, as
>> +             * old backing may have smaller clusters (or have
>> subclusters).
>> +             * We extend it to the aligned boundaries to avoid CoW on
>> +             * partial writes in blk_pwrite().
>> +             */
>> +            n += offset - QEMU_ALIGN_DOWN(offset, write_align);
>> +            offset = QEMU_ALIGN_DOWN(offset, write_align);
>> +            n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
>> +            n = MIN(n, size - offset);
>> +            assert(!bdrv_is_allocated(unfiltered_bs, offset, n,
>> &n_alloc) &&
>> +                   n_alloc == n);
>> +
>> +            /*
>> +             * Much like the with the target image, we'll try to read
>> as much
> 
> s/the with the/with the/
>

Noted.

>> +             * of the old and new backings as we can.
>> +             */
>> +            n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
>> +            if (blk_new_backing) {
>> +                n_new = MIN(n, MAX(0, new_backing_size - (int64_t)
>> offset));
>> +            }
> 
> If we don’t have a check for blk_old_backing (old_backing_size is 0 if
> blk_old_backing is NULL), why do we have a check for blk_new_backing
> (new_backing_size is 0 if blk_new_backing is NULL)?
> 
> (Perhaps because the previous check was `offset >= new_backing_size ||
> !blk_new_backing`, i.e. included exactly such a check – but I don’t
> think it’s necessary, new_backing_size will be 0 if blk_new_backing is
> NULL.)
> 
>> +
>>               /*
>>                * Read old and new backing file and take into
>> consideration that
>>                * backing files may be smaller than the COW image.
>>                */
>> -            if (offset >= old_backing_size) {
>> -                memset(buf_old, 0, n);
>> -                buf_old_is_zero = true;
>> +            memset(buf_old + n_old, 0, n - n_old);
>> +            if (!n_old) {
>> +                old_backing_eof = true;
>>               } else {
>> -                if (offset + n > old_backing_size) {
>> -                    n = old_backing_size - offset;
>> -                }
>> -
>> -                ret = blk_pread(blk_old_backing, offset, n, buf_old, 0);
>> +                ret = blk_pread(blk_old_backing, offset, n_old,
>> buf_old, 0);
>>                   if (ret < 0) {
>>                       error_report("error while reading from old
>> backing file");
>>                       goto out;
>>                   }
>>               }
>>   -            if (offset >= new_backing_size || !blk_new_backing) {
>> -                memset(buf_new, 0, n);
>> -            } else {
>> -                if (offset + n > new_backing_size) {
>> -                    n = new_backing_size - offset;
>> -                }
>> -
>> -                ret = blk_pread(blk_new_backing, offset, n, buf_new, 0);
>> +            memset(buf_new + n_new, 0, n - n_new);
>> +            if (blk_new_backing && n_new) {
> 
> Same as above, I think we can drop the blk_new_backing check, just so
> that both blocks (for old and new) look the same.
> 
> (Also, the memset() already has to trust that n_new is 0 if
> blk_new_backing is NULL, so the check doesn’t make much sense from that
> perspective either, and makes the memset() look wrong.)
>

Good point, thank you.  Looks like we indeed can safely remove these
extra checks.

>> +                ret = blk_pread(blk_new_backing, offset, n_new,
>> buf_new, 0);
>>                   if (ret < 0) {
>>                       error_report("error while reading from new
>> backing file");
>>                       goto out;
>

Patch

diff --git a/qemu-img.c b/qemu-img.c
index fcd31d7b5b..83950af42b 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3523,6 +3523,7 @@  static int img_rebase(int argc, char **argv)
     uint8_t *buf_new = NULL;
     BlockDriverState *bs = NULL, *prefix_chain_bs = NULL;
     BlockDriverState *unfiltered_bs;
+    BlockDriverInfo bdi = {0};
     char *filename;
     const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
     int c, flags, src_flags, ret;
@@ -3533,6 +3534,7 @@  static int img_rebase(int argc, char **argv)
     bool quiet = false;
     Error *local_err = NULL;
     bool image_opts = false;
+    int64_t write_align;
 
     /* Parse commandline parameters */
     fmt = NULL;
@@ -3656,6 +3658,20 @@  static int img_rebase(int argc, char **argv)
         }
     }
 
+    /*
+     * We need overlay subcluster size to make sure write requests are
+     * aligned.
+     */
+    ret = bdrv_get_info(unfiltered_bs, &bdi);
+    if (ret < 0) {
+        error_report("could not get block driver info");
+        goto out;
+    } else if (bdi.subcluster_size == 0) {
+        bdi.subcluster_size = 1;
+    }
+
+    write_align = bdi.subcluster_size;
+
     /* For safe rebasing we need to compare old and new backing file */
     if (!unsafe) {
         QDict *options = NULL;
@@ -3753,7 +3769,7 @@  static int img_rebase(int argc, char **argv)
         int64_t old_backing_size = 0;
         int64_t new_backing_size = 0;
         uint64_t offset;
-        int64_t n;
+        int64_t n, n_old = 0, n_new = 0;
         float local_progress = 0;
 
         if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk)) >
@@ -3799,7 +3815,8 @@  static int img_rebase(int argc, char **argv)
         }
 
         for (offset = 0; offset < size; offset += n) {
-            bool buf_old_is_zero = false;
+            bool old_backing_eof = false;
+            int64_t n_alloc;
 
             /* How many bytes can we handle with the next read? */
             n = MIN(IO_BUF_SIZE, size - offset);
@@ -3844,33 +3861,48 @@  static int img_rebase(int argc, char **argv)
                 }
             }
 
+            /*
+             * At this point we know that the region [offset; offset + n)
+             * is unallocated within the target image.  This region might be
+             * unaligned to the target image's (sub)cluster boundaries, as
+             * old backing may have smaller clusters (or have subclusters).
+             * We extend it to the aligned boundaries to avoid CoW on
+             * partial writes in blk_pwrite().
+             */
+            n += offset - QEMU_ALIGN_DOWN(offset, write_align);
+            offset = QEMU_ALIGN_DOWN(offset, write_align);
+            n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
+            n = MIN(n, size - offset);
+            assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) &&
+                   n_alloc == n);
+
+            /*
+             * Much like the with the target image, we'll try to read as much
+             * of the old and new backings as we can.
+             */
+            n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
+            if (blk_new_backing) {
+                n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));
+            }
+
             /*
              * Read old and new backing file and take into consideration that
              * backing files may be smaller than the COW image.
              */
-            if (offset >= old_backing_size) {
-                memset(buf_old, 0, n);
-                buf_old_is_zero = true;
+            memset(buf_old + n_old, 0, n - n_old);
+            if (!n_old) {
+                old_backing_eof = true;
             } else {
-                if (offset + n > old_backing_size) {
-                    n = old_backing_size - offset;
-                }
-
-                ret = blk_pread(blk_old_backing, offset, n, buf_old, 0);
+                ret = blk_pread(blk_old_backing, offset, n_old, buf_old, 0);
                 if (ret < 0) {
                     error_report("error while reading from old backing file");
                     goto out;
                 }
             }
 
-            if (offset >= new_backing_size || !blk_new_backing) {
-                memset(buf_new, 0, n);
-            } else {
-                if (offset + n > new_backing_size) {
-                    n = new_backing_size - offset;
-                }
-
-                ret = blk_pread(blk_new_backing, offset, n, buf_new, 0);
+            memset(buf_new + n_new, 0, n - n_new);
+            if (blk_new_backing && n_new) {
+                ret = blk_pread(blk_new_backing, offset, n_new, buf_new, 0);
                 if (ret < 0) {
                     error_report("error while reading from new backing file");
                     goto out;
@@ -3884,11 +3916,12 @@  static int img_rebase(int argc, char **argv)
                 int64_t pnum;
 
                 if (compare_buffers(buf_old + written, buf_new + written,
-                                    n - written, 0, &pnum))
+                                    n - written, write_align, &pnum))
                 {
-                    if (buf_old_is_zero) {
+                    if (old_backing_eof) {
                         ret = blk_pwrite_zeroes(blk, offset + written, pnum, 0);
                     } else {
+                        assert(written + pnum <= IO_BUF_SIZE);
                         ret = blk_pwrite(blk, offset + written, pnum,
                                          buf_old + written, 0);
                     }
@@ -3900,6 +3933,9 @@  static int img_rebase(int argc, char **argv)
                 }
 
                 written += pnum;
+                if (offset + written >= old_backing_size) {
+                    old_backing_eof = true;
+                }
             }
             qemu_progress_print(local_progress, 100);
         }