diff mbox

[2/5] qcow2: multiple clusters write compressed

Message ID 1510654613-47868-3-git-send-email-anton.nefedov@virtuozzo.com (mailing list archive)
State New, archived
Headers show

Commit Message

Anton Nefedov Nov. 14, 2017, 10:16 a.m. UTC
From: Pavel Butsykin <pbutsykin@virtuozzo.com>

At the moment, qcow2_co_pwritev_compressed can only process requests whose
size is less than or equal to one cluster. This patch adds the ability to
write compressed data spanning more than one cluster to a QCOW2 image. The
implementation is simple: we just split large requests into separate clusters
and write them using the existing functionality. For unaligned requests we
use a workaround and write the data without compression.

Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
---
 block/qcow2.c | 77 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 21 deletions(-)

Comments

Max Reitz Nov. 15, 2017, 3:11 p.m. UTC | #1
On 2017-11-14 11:16, Anton Nefedov wrote:
> From: Pavel Butsykin <pbutsykin@virtuozzo.com>
> 
> At the moment, qcow2_co_pwritev_compressed can process the requests size
> less than or equal to one cluster. This patch added possibility to write
> compressed data in the QCOW2 more than one cluster. The implementation
> is simple, we just split large requests into separate clusters and write
> using existing functionality. For unaligned requests we use a workaround
> and do write data without compression.
> 
> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
> ---
>  block/qcow2.c | 77 +++++++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 56 insertions(+), 21 deletions(-)

On one hand, it might be better to do this centrally somewhere in
block/io.c.  On the other, that would require more work because it would
probably mean having to introduce another field in BlockLimits, and it
wouldn't do much -- because qcow (v1) is, well, qcow v1...  And vmdk
seems to completely not care about this single cluster limitation.  So
for now we probably wouldn't even gain anything by doing this in block/io.c.

So long story short, it's OK to do this locally in qcow2, yes.

> diff --git a/block/qcow2.c b/block/qcow2.c
> index 45c5651..3d5f17d 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -3325,11 +3325,9 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
>      return 0;
>  }
>  
> -/* XXX: put compressed sectors first, then all the cluster aligned
> -   tables to avoid losing bytes in alignment */
>  static coroutine_fn int
> -qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
> -                            uint64_t bytes, QEMUIOVector *qiov)
> +qcow2_co_pwritev_cluster_compressed(BlockDriverState *bs, uint64_t offset,
> +                                    uint64_t bytes, QEMUIOVector *qiov)
>  {
>      BDRVQcow2State *s = bs->opaque;
>      QEMUIOVector hd_qiov;
> @@ -3339,25 +3337,12 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
>      uint8_t *buf, *out_buf;
>      int64_t cluster_offset;
>  
> -    if (bytes == 0) {
> -        /* align end of file to a sector boundary to ease reading with
> -           sector based I/Os */
> -        cluster_offset = bdrv_getlength(bs->file->bs);
> -        if (cluster_offset < 0) {
> -            return cluster_offset;
> -        }
> -        return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL);
> -    }
> -
> -    if (offset_into_cluster(s, offset)) {
> -        return -EINVAL;
> -    }
> +    assert(bytes <= s->cluster_size);
> +    assert(!offset_into_cluster(s, offset));
>  
>      buf = qemu_blockalign(bs, s->cluster_size);
> -    if (bytes != s->cluster_size) {
> -        if (bytes > s->cluster_size ||
> -            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
> -        {
> +    if (bytes < s->cluster_size) {
> +        if (offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS) {
>              qemu_vfree(buf);
>              return -EINVAL;
>          }
> @@ -3437,6 +3422,56 @@ fail:
>      return ret;
>  }
>  
> +/* XXX: put compressed sectors first, then all the cluster aligned
> +   tables to avoid losing bytes in alignment */
> +static coroutine_fn int
> +qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
> +                            uint64_t bytes, QEMUIOVector *qiov)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    QEMUIOVector hd_qiov;
> +    uint64_t curr_off = 0;
> +    int ret;
> +
> +    if (bytes == 0) {
> +        /* align end of file to a sector boundary to ease reading with
> +           sector based I/Os */
> +        int64_t cluster_offset = bdrv_getlength(bs->file->bs);
> +        if (cluster_offset < 0) {
> +            return cluster_offset;
> +        }
> +        return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL);
> +    }
> +
> +    if (offset_into_cluster(s, offset)) {
> +        return -EINVAL;
> +    }
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    do {
> +        uint32_t chunk_size;
> +
> +        qemu_iovec_reset(&hd_qiov);
> +        chunk_size = MIN(bytes, s->cluster_size);
> +        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
> +
> +        ret = qcow2_co_pwritev_cluster_compressed(bs, offset + curr_off,
> +                                                  chunk_size, &hd_qiov);
> +        if (ret == -ENOTSUP) {

Why this?  I mean, I can see the appeal, but then we should probably
actually return -ENOTSUP somewhere (e.g. for unaligned clusters and the
like) -- and we should not abort if offset_into_cluster(s, offset) is
true, but we should write the header uncompressed and compress the main
bulk.

Max

> +            ret = qcow2_co_pwritev(bs, offset + curr_off, chunk_size,
> +                                   &hd_qiov, 0);
> +        }
> +        if (ret < 0) {
> +            break;
> +        }
> +        curr_off += chunk_size;
> +        bytes -= chunk_size;
> +    } while (bytes);
> +    qemu_iovec_destroy(&hd_qiov);
> +
> +    return ret;
> +}
> +
>  static int make_completely_empty(BlockDriverState *bs)
>  {
>      BDRVQcow2State *s = bs->opaque;
>
Anton Nefedov Nov. 15, 2017, 4:28 p.m. UTC | #2
On 15/11/2017 6:11 PM, Max Reitz wrote:
> On 2017-11-14 11:16, Anton Nefedov wrote:
>> From: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>
>> At the moment, qcow2_co_pwritev_compressed can process the requests size
>> less than or equal to one cluster. This patch added possibility to write
>> compressed data in the QCOW2 more than one cluster. The implementation
>> is simple, we just split large requests into separate clusters and write
>> using existing functionality. For unaligned requests we use a workaround
>> and do write data without compression.
>>
>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>> ---
>>   block/qcow2.c | 77 +++++++++++++++++++++++++++++++++++++++++++----------------
>>   1 file changed, 56 insertions(+), 21 deletions(-)
> 
> On one hand, it might be better to do this centrally somewhere in
> block/io.c.  On the other, that would require more work because it would
> probably mean having to introduce another field in BlockLimits, and it
> wouldn't do much -- because qcow (v1) is, well, qcow v1...  And vmdk
> seems to completely not care about this single cluster limitation.  So
> for now we probably wouldn't even gain anything by doing this in block/io.c.
> 
> So long story short, it's OK to do this locally in qcow2, yes.
> 

[..]

>> +        qemu_iovec_reset(&hd_qiov);
>> +        chunk_size = MIN(bytes, s->cluster_size);
>> +        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
>> +
>> +        ret = qcow2_co_pwritev_cluster_compressed(bs, offset + curr_off,
>> +                                                  chunk_size, &hd_qiov);
>> +        if (ret == -ENOTSUP) {
> 
> Why this?  I mean, I can see the appeal, but then we should probably
> actually return -ENOTSUP somewhere (e.g. for unaligned clusters and the
> like) -- and we should not abort if offset_into_cluster(s, cluster) is
> true, but we should write the header uncompressed and compress the main
> bulk.
> 
> Max
> 

Right, sorry, missed this part when porting the patch.

I think this needs to be removed (and the commit message fixed
accordingly).
Returning an error, rather than silently falling back to uncompressed
seems preferable to me. "Compressing writers" (backup, img convert and
now stream) are aware that they have to cluster-align, and if they fail
to do so that means there is an error somewhere.
If it won't return an error anymore, then the unaligned tail shouldn't
either. So we can end up 'letting' the callers send small unaligned
requests which will never get compressed.

/Anton

>> +            ret = qcow2_co_pwritev(bs, offset + curr_off, chunk_size,
>> +                                   &hd_qiov, 0);
Max Reitz Nov. 15, 2017, 4:30 p.m. UTC | #3
On 2017-11-15 17:28, Anton Nefedov wrote:
> On 15/11/2017 6:11 PM, Max Reitz wrote:
>> On 2017-11-14 11:16, Anton Nefedov wrote:
>>> From: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>>
>>> At the moment, qcow2_co_pwritev_compressed can process the requests size
>>> less than or equal to one cluster. This patch added possibility to write
>>> compressed data in the QCOW2 more than one cluster. The implementation
>>> is simple, we just split large requests into separate clusters and write
>>> using existing functionality. For unaligned requests we use a workaround
>>> and do write data without compression.
>>>
>>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>>> ---
>>>   block/qcow2.c | 77
>>> +++++++++++++++++++++++++++++++++++++++++++----------------
>>>   1 file changed, 56 insertions(+), 21 deletions(-)
>>
>> On one hand, it might be better to do this centrally somewhere in
>> block/io.c.  On the other, that would require more work because it would
>> probably mean having to introduce another field in BlockLimits, and it
>> wouldn't do much -- because qcow (v1) is, well, qcow v1...  And vmdk
>> seems to completely not care about this single cluster limitation.  So
>> for now we probably wouldn't even gain anything by doing this in
>> block/io.c.
>>
>> So long story short, it's OK to do this locally in qcow2, yes.
>>
> 
> [..]
> 
>>> +        qemu_iovec_reset(&hd_qiov);
>>> +        chunk_size = MIN(bytes, s->cluster_size);
>>> +        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
>>> +
>>> +        ret = qcow2_co_pwritev_cluster_compressed(bs, offset +
>>> curr_off,
>>> +                                                  chunk_size,
>>> &hd_qiov);
>>> +        if (ret == -ENOTSUP) {
>>
>> Why this?  I mean, I can see the appeal, but then we should probably
>> actually return -ENOTSUP somewhere (e.g. for unaligned clusters and the
>> like) -- and we should not abort if offset_into_cluster(s, cluster) is
>> true, but we should write the header uncompressed and compress the main
>> bulk.
>>
>> Max
>>
> 
> Right, sorry, missed this part when porting the patch.
> 
> I think this needs to be removed (and the commit message fixed
> accordingly).
> Returning an error, rather than silently falling back to uncompressed
> seems preferable to me. "Compressing writers" (backup, img convert and
> now stream) are aware that they have to cluster-align, and if they fail
> to do so that means there is an error somewhere.

OK for me.

> If it won't return an error anymore, then the unaligned tail shouldn't
> either. So we can end up 'letting' the callers send small unaligned
> requests which will never get compressed.

Either way is fine.  It just looks to me like vmdk falls back to
uncompressed writes, so it's not like it would be completely new behavior...

(But I won't judge whether what vmdk does is a good idea.)

Max
Kevin Wolf Nov. 21, 2017, 5:42 p.m. UTC | #4
Am 15.11.2017 um 17:30 hat Max Reitz geschrieben:
> On 2017-11-15 17:28, Anton Nefedov wrote:
> > On 15/11/2017 6:11 PM, Max Reitz wrote:
> >> On 2017-11-14 11:16, Anton Nefedov wrote:
> >>> From: Pavel Butsykin <pbutsykin@virtuozzo.com>
> >>>
> >>> At the moment, qcow2_co_pwritev_compressed can process the requests size
> >>> less than or equal to one cluster. This patch added possibility to write
> >>> compressed data in the QCOW2 more than one cluster. The implementation
> >>> is simple, we just split large requests into separate clusters and write
> >>> using existing functionality. For unaligned requests we use a workaround
> >>> and do write data without compression.
> >>>
> >>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
> >>> ---
> >>>   block/qcow2.c | 77
> >>> +++++++++++++++++++++++++++++++++++++++++++----------------
> >>>   1 file changed, 56 insertions(+), 21 deletions(-)
> >>
> >> On one hand, it might be better to do this centrally somewhere in
> >> block/io.c.  On the other, that would require more work because it would
> >> probably mean having to introduce another field in BlockLimits, and it
> >> wouldn't do much -- because qcow (v1) is, well, qcow v1...  And vmdk
> >> seems to completely not care about this single cluster limitation.  So
> >> for now we probably wouldn't even gain anything by doing this in
> >> block/io.c.
> >>
> >> So long story short, it's OK to do this locally in qcow2, yes.
> >>
> > 
> > [..]
> > 
> >>> +        qemu_iovec_reset(&hd_qiov);
> >>> +        chunk_size = MIN(bytes, s->cluster_size);
> >>> +        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
> >>> +
> >>> +        ret = qcow2_co_pwritev_cluster_compressed(bs, offset +
> >>> curr_off,
> >>> +                                                  chunk_size,
> >>> &hd_qiov);
> >>> +        if (ret == -ENOTSUP) {
> >>
> >> Why this?  I mean, I can see the appeal, but then we should probably
> >> actually return -ENOTSUP somewhere (e.g. for unaligned clusters and the
> >> like) -- and we should not abort if offset_into_cluster(s, cluster) is
> >> true, but we should write the header uncompressed and compress the main
> >> bulk.
> >>
> >> Max
> >>
> > 
> > Right, sorry, missed this part when porting the patch.
> > 
> > I think this needs to be removed (and the commit message fixed
> > accordingly).
> > Returning an error, rather than silently falling back to uncompressed
> > seems preferable to me. "Compressing writers" (backup, img convert and
> > now stream) are aware that they have to cluster-align, and if they fail
> > to do so that means there is an error somewhere.
> 
> OK for me.
> 
> > If it won't return an error anymore, then the unaligned tail shouldn't
> > either. So we can end up 'letting' the callers send small unaligned
> > requests which will never get compressed.
> 
> Either way is fine.  It just looks to me like vmdk falls back to
> uncompressed writes, so it's not like it would be completely new behavior...
> 
> (But I won't judge whether what vmdk does is a good idea.)

Probably not.

If we let io.c know about the cluster-size alignment requirement for
compressed writes, the usual RMW code path could kick in. Wouldn't this
actually be a better solution than uncompressed writes or erroring out?

In fact, with this, we might even be very close to an option that turns
every write into a compressed write, so you get images that stay
compressed even while they are in use.

Kevin
Anton Nefedov Nov. 23, 2017, 9:04 a.m. UTC | #5
On 21/11/2017 8:42 PM, Kevin Wolf wrote:
> Am 15.11.2017 um 17:30 hat Max Reitz geschrieben:
>> On 2017-11-15 17:28, Anton Nefedov wrote:
>>> On 15/11/2017 6:11 PM, Max Reitz wrote:
>>>> On 2017-11-14 11:16, Anton Nefedov wrote:
>>>>> From: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>>>>
>>>>> At the moment, qcow2_co_pwritev_compressed can process the requests size
>>>>> less than or equal to one cluster. This patch added possibility to write
>>>>> compressed data in the QCOW2 more than one cluster. The implementation
>>>>> is simple, we just split large requests into separate clusters and write
>>>>> using existing functionality. For unaligned requests we use a workaround
>>>>> and do write data without compression.
>>>>>
>>>>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>>>>> ---
>>>>>    block/qcow2.c | 77
>>>>> +++++++++++++++++++++++++++++++++++++++++++----------------
>>>>>    1 file changed, 56 insertions(+), 21 deletions(-)
>>>>
>>>> On one hand, it might be better to do this centrally somewhere in
>>>> block/io.c.  On the other, that would require more work because it would
>>>> probably mean having to introduce another field in BlockLimits, and it
>>>> wouldn't do much -- because qcow (v1) is, well, qcow v1...  And vmdk
>>>> seems to completely not care about this single cluster limitation.  So
>>>> for now we probably wouldn't even gain anything by doing this in
>>>> block/io.c.
>>>>
>>>> So long story short, it's OK to do this locally in qcow2, yes.
>>>>
>>>
>>> [..]
>>>
>>>>> +        qemu_iovec_reset(&hd_qiov);
>>>>> +        chunk_size = MIN(bytes, s->cluster_size);
>>>>> +        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
>>>>> +
>>>>> +        ret = qcow2_co_pwritev_cluster_compressed(bs, offset +
>>>>> curr_off,
>>>>> +                                                  chunk_size,
>>>>> &hd_qiov);
>>>>> +        if (ret == -ENOTSUP) {
>>>>
>>>> Why this?  I mean, I can see the appeal, but then we should probably
>>>> actually return -ENOTSUP somewhere (e.g. for unaligned clusters and the
>>>> like) -- and we should not abort if offset_into_cluster(s, cluster) is
>>>> true, but we should write the header uncompressed and compress the main
>>>> bulk.
>>>>
>>>> Max
>>>>
>>>
>>> Right, sorry, missed this part when porting the patch.
>>>
>>> I think this needs to be removed (and the commit message fixed
>>> accordingly).
>>> Returning an error, rather than silently falling back to uncompressed
>>> seems preferable to me. "Compressing writers" (backup, img convert and
>>> now stream) are aware that they have to cluster-align, and if they fail
>>> to do so that means there is an error somewhere.
>>
>> OK for me.
>>
>>> If it won't return an error anymore, then the unaligned tail shouldn't
>>> either. So we can end up 'letting' the callers send small unaligned
>>> requests which will never get compressed.
>>
>> Either way is fine.  It just looks to me like vmdk falls back to
>> uncompressed writes, so it's not like it would be completely new behavior...
>>
>> (But I won't judge whether what vmdk does is a good idea.)
> 
> Probably not.
> 
> If we let io.c know about the cluster-size alignment requirement for
> compressed writes, the usual RMW code path could kick in. Wouldn't this
> actually be a better solution than uncompressed writes or erroring out?
> 
> In fact, with this, we might even be very close to an option that turns
> every write into a compressed write, so you get images that stay
> compressed even while they are in use.
> 
> Kevin
> 

That's alignment, and indeed it would be nice to have that block limit
and I think this patch does not contradict that (now that in v2 it
doesn't fall back to uncompressed but returns EINVAL).
Unless we also want a max request limit so io.c does the request split?

/Anton
Kevin Wolf Nov. 23, 2017, 9:44 a.m. UTC | #6
Am 23.11.2017 um 10:04 hat Anton Nefedov geschrieben:
> 
> 
> On 21/11/2017 8:42 PM, Kevin Wolf wrote:
> > Am 15.11.2017 um 17:30 hat Max Reitz geschrieben:
> > > On 2017-11-15 17:28, Anton Nefedov wrote:
> > > > On 15/11/2017 6:11 PM, Max Reitz wrote:
> > > > > On 2017-11-14 11:16, Anton Nefedov wrote:
> > > > > > From: Pavel Butsykin <pbutsykin@virtuozzo.com>
> > > > > > 
> > > > > > At the moment, qcow2_co_pwritev_compressed can process the requests size
> > > > > > less than or equal to one cluster. This patch added possibility to write
> > > > > > compressed data in the QCOW2 more than one cluster. The implementation
> > > > > > is simple, we just split large requests into separate clusters and write
> > > > > > using existing functionality. For unaligned requests we use a workaround
> > > > > > and do write data without compression.
> > > > > > 
> > > > > > Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
> > > > > > ---
> > > > > >    block/qcow2.c | 77
> > > > > > +++++++++++++++++++++++++++++++++++++++++++----------------
> > > > > >    1 file changed, 56 insertions(+), 21 deletions(-)
> > > > > 
> > > > > On one hand, it might be better to do this centrally somewhere in
> > > > > block/io.c.  On the other, that would require more work because it would
> > > > > probably mean having to introduce another field in BlockLimits, and it
> > > > > wouldn't do much -- because qcow (v1) is, well, qcow v1...  And vmdk
> > > > > seems to completely not care about this single cluster limitation.  So
> > > > > for now we probably wouldn't even gain anything by doing this in
> > > > > block/io.c.
> > > > > 
> > > > > So long story short, it's OK to do this locally in qcow2, yes.
> > > > > 
> > > > 
> > > > [..]
> > > > 
> > > > > > +        qemu_iovec_reset(&hd_qiov);
> > > > > > +        chunk_size = MIN(bytes, s->cluster_size);
> > > > > > +        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
> > > > > > +
> > > > > > +        ret = qcow2_co_pwritev_cluster_compressed(bs, offset +
> > > > > > curr_off,
> > > > > > +                                                  chunk_size,
> > > > > > &hd_qiov);
> > > > > > +        if (ret == -ENOTSUP) {
> > > > > 
> > > > > Why this?  I mean, I can see the appeal, but then we should probably
> > > > > actually return -ENOTSUP somewhere (e.g. for unaligned clusters and the
> > > > > like) -- and we should not abort if offset_into_cluster(s, cluster) is
> > > > > true, but we should write the header uncompressed and compress the main
> > > > > bulk.
> > > > > 
> > > > > Max
> > > > > 
> > > > 
> > > > Right, sorry, missed this part when porting the patch.
> > > > 
> > > > I think this needs to be removed (and the commit message fixed
> > > > accordingly).
> > > > Returning an error, rather than silently falling back to uncompressed
> > > > seems preferable to me. "Compressing writers" (backup, img convert and
> > > > now stream) are aware that they have to cluster-align, and if they fail
> > > > to do so that means there is an error somewhere.
> > > 
> > > OK for me.
> > > 
> > > > If it won't return an error anymore, then the unaligned tail shouldn't
> > > > either. So we can end up 'letting' the callers send small unaligned
> > > > requests which will never get compressed.
> > > 
> > > Either way is fine.  It just looks to me like vmdk falls back to
> > > uncompressed writes, so it's not like it would be completely new behavior...
> > > 
> > > (But I won't judge whether what vmdk does is a good idea.)
> > 
> > Probably not.
> > 
> > If we let io.c know about the cluster-size alignment requirement for
> > compressed writes, the usual RMW code path could kick in. Wouldn't this
> > actually be a better solution than uncompressed writes or erroring out?
> > 
> > In fact, with this, we might even be very close to an option that turns
> > every write into a compressed write, so you get images that stay
> > compressed even while they are in use.
> > 
> > Kevin
> 
> That's alignment, and indeed it would be nice to have that block limit
> and I think this patch does not contradict with that (now that in v2 it
> doesn't fall back to uncompressed but returns EINVAL).

Yes, I agree.

> Unless we also want a max request limit so io.c does the request split?

I'm not sure about this one. We might want to change the qcow2 code
later so that we can actually write multiple compressed clusters at once
as a performance optimisation, and then we would give up the splitting
in io.c again. So maybe it's not really worth it.

Kevin
diff mbox

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index 45c5651..3d5f17d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3325,11 +3325,9 @@  static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
-/* XXX: put compressed sectors first, then all the cluster aligned
-   tables to avoid losing bytes in alignment */
 static coroutine_fn int
-qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
-                            uint64_t bytes, QEMUIOVector *qiov)
+qcow2_co_pwritev_cluster_compressed(BlockDriverState *bs, uint64_t offset,
+                                    uint64_t bytes, QEMUIOVector *qiov)
 {
     BDRVQcow2State *s = bs->opaque;
     QEMUIOVector hd_qiov;
@@ -3339,25 +3337,12 @@  qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
     uint8_t *buf, *out_buf;
     int64_t cluster_offset;
 
-    if (bytes == 0) {
-        /* align end of file to a sector boundary to ease reading with
-           sector based I/Os */
-        cluster_offset = bdrv_getlength(bs->file->bs);
-        if (cluster_offset < 0) {
-            return cluster_offset;
-        }
-        return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL);
-    }
-
-    if (offset_into_cluster(s, offset)) {
-        return -EINVAL;
-    }
+    assert(bytes <= s->cluster_size);
+    assert(!offset_into_cluster(s, offset));
 
     buf = qemu_blockalign(bs, s->cluster_size);
-    if (bytes != s->cluster_size) {
-        if (bytes > s->cluster_size ||
-            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
-        {
+    if (bytes < s->cluster_size) {
+        if (offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS) {
             qemu_vfree(buf);
             return -EINVAL;
         }
@@ -3437,6 +3422,56 @@  fail:
     return ret;
 }
 
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+static coroutine_fn int
+qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
+                            uint64_t bytes, QEMUIOVector *qiov)
+{
+    BDRVQcow2State *s = bs->opaque;
+    QEMUIOVector hd_qiov;
+    uint64_t curr_off = 0;
+    int ret;
+
+    if (bytes == 0) {
+        /* align end of file to a sector boundary to ease reading with
+           sector based I/Os */
+        int64_t cluster_offset = bdrv_getlength(bs->file->bs);
+        if (cluster_offset < 0) {
+            return cluster_offset;
+        }
+        return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL);
+    }
+
+    if (offset_into_cluster(s, offset)) {
+        return -EINVAL;
+    }
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    do {
+        uint32_t chunk_size;
+
+        qemu_iovec_reset(&hd_qiov);
+        chunk_size = MIN(bytes, s->cluster_size);
+        qemu_iovec_concat(&hd_qiov, qiov, curr_off, chunk_size);
+
+        ret = qcow2_co_pwritev_cluster_compressed(bs, offset + curr_off,
+                                                  chunk_size, &hd_qiov);
+        if (ret == -ENOTSUP) {
+            ret = qcow2_co_pwritev(bs, offset + curr_off, chunk_size,
+                                   &hd_qiov, 0);
+        }
+        if (ret < 0) {
+            break;
+        }
+        curr_off += chunk_size;
+        bytes -= chunk_size;
+    } while (bytes);
+    qemu_iovec_destroy(&hd_qiov);
+
+    return ret;
+}
+
 static int make_completely_empty(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;