diff mbox series

[v11,09/10] qcow2: skip writing zero buffers to empty COW areas

Message ID 20181218075707.12006-10-anton.nefedov@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series qcow2: cluster space preallocation | expand

Commit Message

Anton Nefedov Dec. 18, 2018, 7:57 a.m. UTC
If COW areas of the newly allocated clusters are zeroes on the backing image,
efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
cluster instead of writing explicit zero buffers later in perform_cow().

iotest 060:
write to the discarded cluster does not trigger COW anymore.
Use a backing image instead.

Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
---
 qapi/block-core.json       |  4 +-
 block/qcow2.h              |  6 +++
 block/qcow2-cluster.c      |  2 +-
 block/qcow2.c              | 89 +++++++++++++++++++++++++++++++++++++-
 block/trace-events         |  1 +
 tests/qemu-iotests/060     |  7 ++-
 tests/qemu-iotests/060.out |  5 ++-
 7 files changed, 108 insertions(+), 6 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy Dec. 21, 2018, 4:16 p.m. UTC | #1
18.12.2018 10:57, Anton Nefedov wrote:
> If COW areas of the newly allocated clusters are zeroes on the backing image,
> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
> cluster instead of writing explicit zero buffers later in perform_cow().
> 
> iotest 060:
> write to the discarded cluster does not trigger COW anymore.
> Use a backing image instead.
> 
> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
> ---
>   qapi/block-core.json       |  4 +-
>   block/qcow2.h              |  6 +++
>   block/qcow2-cluster.c      |  2 +-
>   block/qcow2.c              | 89 +++++++++++++++++++++++++++++++++++++-
>   block/trace-events         |  1 +
>   tests/qemu-iotests/060     |  7 ++-
>   tests/qemu-iotests/060.out |  5 ++-
>   7 files changed, 108 insertions(+), 6 deletions(-)
> 
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 762000f31f..204528b3f6 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -3009,6 +3009,8 @@
>   #
>   # @cor_write: a write due to copy-on-read (since 2.11)
>   #
> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
> +#
>   # Since: 2.9
>   ##
>   { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
> @@ -3027,7 +3029,7 @@
>               'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>               'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>               'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
> -            'cor_write'] }
> +            'cor_write', 'cluster_alloc_space'] }
>   
>   ##
>   # @BlkdebugInjectErrorOptions:
> diff --git a/block/qcow2.h b/block/qcow2.h
> index a98d24500b..32d2c04bfa 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -386,6 +386,12 @@ typedef struct QCowL2Meta
>        */
>       Qcow2COWRegion cow_end;
>   
> +    /*
> +     * Indicates that COW regions are already handled and do not require
> +     * any more processing.
> +     */
> +    bool skip_cow;
> +
>       /**

hmm, around it, all comments starts from '/**', so, I think, yours should too.
(this note doesn't touch your other comments)

>        * The I/O vector with the data from the actual guest write request.
>        * If non-NULL, this is meant to be merged together with the data
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index e2737429f5..23e0702027 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -806,7 +806,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
>       assert(start->offset + start->nb_bytes <= end->offset);
>       assert(!m->data_qiov || m->data_qiov->size == data_bytes);
>   
> -    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
> +    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
>           return 0;
>       }
>   
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 4897abae5e..161b935962 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2021,6 +2021,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>               continue;
>           }
>   
> +        /* If COW regions are handled already, skip this too */
> +        if (m->skip_cow) {
> +            continue;
> +        }
> +
>           /* The data (middle) region must be immediately after the
>            * start region */
>           if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
> @@ -2046,6 +2051,77 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>       return false;
>   }
>   
> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
> +{
> +    int64_t nr;
> +    return !bytes ||
> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
> +}
> +
> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
> +{
> +    /*
> +     * This check is designed for optimization shortcut so it must be
> +     * efficient.
> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
> +     * as accurate and can result in false negatives).
> +     */
> +    return is_unallocated(bs, m->offset + m->cow_start.offset,

here you add m->cow_start.offset ...[1]

> +                          m->cow_start.nb_bytes) &&
> +           is_unallocated(bs, m->offset + m->cow_end.offset,
> +                          m->cow_end.nb_bytes);
> +}
> +
> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    QCowL2Meta *m;
> +
> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
> +        return 0;
> +    }
> +
> +    if (bs->encrypted) {
> +        return 0;
> +    }
> +
> +    for (m = l2meta; m != NULL; m = m->next) {
> +        int ret;
> +
> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
> +            continue;
> +        }
> +
> +        if (!is_zero_cow(bs, m)) {
> +            continue;
> +        }
> +
> +        /*
> +         * Conventional place for qcow2_pre_write_overlap_check() but in this
> +         * case it is already done for these clusters
> +         */

to be honest, check is done for original request not, not for cow_regions, and,
on the other hand,
do_perform_cow_write _does_ the check.

So, I see two consistent ways:
1. you just add check here, like in do_perform_cow_write
2. improve check in qcow2_co_pwritev to cover the whole area, and drop checks from
    perform_cow.

I prefer 1, as it more helpful in case of bugs in the code (just check exactly what
you are going to write in the next write).

> +
> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
> +        /*
> +         * instead of writing zero COW buffers,
> +         * efficiently zero out the whole clusters
> +         */
> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,

[1]... and here - not.
And perform_cow uses start too.
hmm..

On the other hand, here we just want to zero all allocated clusters, it is not related
to cow regions. So, ok.

> +                                    m->nb_clusters * s->cluster_size,
> +                                    BDRV_REQ_ALLOCATE);
> +        if (ret < 0) {
> +            if (ret != -ENOTSUP && ret != -EAGAIN) {
> +                return ret;
> +            }
> +            continue;
> +        }
> +
> +        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
> +        m->skip_cow = true;
> +    }
> +    return 0;
> +}
> +

[...]


with pre-write-check added:
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Anton Nefedov Dec. 24, 2018, 8:21 a.m. UTC | #2
On 21/12/2018 7:16 PM, Vladimir Sementsov-Ogievskiy wrote:
> 18.12.2018 10:57, Anton Nefedov wrote:
>> If COW areas of the newly allocated clusters are zeroes on the backing image,
>> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
>> cluster instead of writing explicit zero buffers later in perform_cow().
>>
>> iotest 060:
>> write to the discarded cluster does not trigger COW anymore.
>> Use a backing image instead.
>>
>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>> ---
>>    qapi/block-core.json       |  4 +-
>>    block/qcow2.h              |  6 +++
>>    block/qcow2-cluster.c      |  2 +-
>>    block/qcow2.c              | 89 +++++++++++++++++++++++++++++++++++++-
>>    block/trace-events         |  1 +
>>    tests/qemu-iotests/060     |  7 ++-
>>    tests/qemu-iotests/060.out |  5 ++-
>>    7 files changed, 108 insertions(+), 6 deletions(-)
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 762000f31f..204528b3f6 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -3009,6 +3009,8 @@
>>    #
>>    # @cor_write: a write due to copy-on-read (since 2.11)
>>    #
>> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
>> +#
>>    # Since: 2.9
>>    ##
>>    { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
>> @@ -3027,7 +3029,7 @@
>>                'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>>                'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>>                'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
>> -            'cor_write'] }
>> +            'cor_write', 'cluster_alloc_space'] }
>>    
>>    ##
>>    # @BlkdebugInjectErrorOptions:
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index a98d24500b..32d2c04bfa 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -386,6 +386,12 @@ typedef struct QCowL2Meta
>>         */
>>        Qcow2COWRegion cow_end;
>>    
>> +    /*
>> +     * Indicates that COW regions are already handled and do not require
>> +     * any more processing.
>> +     */
>> +    bool skip_cow;
>> +
>>        /**
> 
> hmm, around it, all comments starts from '/**', so, I think, yours should too.
> (this note doesn't touch your other comments)
> 

that triggers a warning in checkpatch.pl

   WARNING: Block comments use a leading /* on a separate line

>>         * The I/O vector with the data from the actual guest write request.
>>         * If non-NULL, this is meant to be merged together with the data
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index e2737429f5..23e0702027 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -806,7 +806,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
>>        assert(start->offset + start->nb_bytes <= end->offset);
>>        assert(!m->data_qiov || m->data_qiov->size == data_bytes);
>>    
>> -    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
>> +    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
>>            return 0;
>>        }
>>    
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 4897abae5e..161b935962 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -2021,6 +2021,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>                continue;
>>            }
>>    
>> +        /* If COW regions are handled already, skip this too */
>> +        if (m->skip_cow) {
>> +            continue;
>> +        }
>> +
>>            /* The data (middle) region must be immediately after the
>>             * start region */
>>            if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
>> @@ -2046,6 +2051,77 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>        return false;
>>    }
>>    
>> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
>> +{
>> +    int64_t nr;
>> +    return !bytes ||
>> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
>> +}
>> +
>> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
>> +{
>> +    /*
>> +     * This check is designed for optimization shortcut so it must be
>> +     * efficient.
>> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
>> +     * as accurate and can result in false negatives).
>> +     */
>> +    return is_unallocated(bs, m->offset + m->cow_start.offset,
> 
> here you add m->cow_start.offset ...[1]
> 
>> +                          m->cow_start.nb_bytes) &&
>> +           is_unallocated(bs, m->offset + m->cow_end.offset,
>> +                          m->cow_end.nb_bytes);
>> +}
>> +
>> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    QCowL2Meta *m;
>> +
>> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
>> +        return 0;
>> +    }
>> +
>> +    if (bs->encrypted) {
>> +        return 0;
>> +    }
>> +
>> +    for (m = l2meta; m != NULL; m = m->next) {
>> +        int ret;
>> +
>> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
>> +            continue;
>> +        }
>> +
>> +        if (!is_zero_cow(bs, m)) {
>> +            continue;
>> +        }
>> +
>> +        /*
>> +         * Conventional place for qcow2_pre_write_overlap_check() but in this
>> +         * case it is already done for these clusters
>> +         */
> 
> to be honest, check is done for original request not, not for cow_regions, and,
> on the other hand,
> do_perform_cow_write _does_ the check.
> 
> So, I see two consistent ways:
> 1. you just add check here, like in do_perform_cow_write
> 2. improve check in qcow2_co_pwritev to cover the whole area, and drop checks from
>      perform_cow.
> 
> I prefer 1, as it more helpful in case of bugs in the code (just check exactly what
> you are going to write in the next write).
> 

ok

diff --git a/block/qcow2.c b/block/qcow2.c
index 161b935962..05a7cbebbd 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2096,16 +2096,18 @@ static int handle_alloc_space(BlockDriverState 
*bs, QCowL2Meta *l2meta)
              continue;
          }

-        /*
-         * Conventional place for qcow2_pre_write_overlap_check() but 
in this
-         * case it is already done for these clusters
-         */
-
-        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
          /*
           * instead of writing zero COW buffers,
           * efficiently zero out the whole clusters
           */
+
+        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
+                                            m->nb_clusters * 
s->cluster_size);
+        if (ret < 0) {
+            return ret;
+        }
+
+        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
          ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
                                      m->nb_clusters * s->cluster_size,
                                      BDRV_REQ_ALLOCATE);

>> +
>> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>> +        /*
>> +         * instead of writing zero COW buffers,
>> +         * efficiently zero out the whole clusters
>> +         */
>> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
> 
> [1]... and here - not.
> And perform_cow uses start too.
> hmm..
> 
> On the other hand, here we just want to zero all allocated clusters, it is not related
> to cow regions. So, ok.
> 
>> +                                    m->nb_clusters * s->cluster_size,
>> +                                    BDRV_REQ_ALLOCATE);
>> +        if (ret < 0) {
>> +            if (ret != -ENOTSUP && ret != -EAGAIN) {
>> +                return ret;
>> +            }
>> +            continue;
>> +        }
>> +
>> +        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
>> +        m->skip_cow = true;
>> +    }
>> +    return 0;
>> +}
>> +
> 
> [...]
> 
> 
> with pre-write-check added:
> Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> 
> 
>
Vladimir Sementsov-Ogievskiy Dec. 24, 2018, 8:33 a.m. UTC | #3
24.12.2018 11:21, Anton Nefedov wrote:
> On 21/12/2018 7:16 PM, Vladimir Sementsov-Ogievskiy wrote:
>> 18.12.2018 10:57, Anton Nefedov wrote:
>>> If COW areas of the newly allocated clusters are zeroes on the backing image,
>>> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
>>> cluster instead of writing explicit zero buffers later in perform_cow().
>>>
>>> iotest 060:
>>> write to the discarded cluster does not trigger COW anymore.
>>> Use a backing image instead.
>>>
>>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>>> ---
>>>     qapi/block-core.json       |  4 +-
>>>     block/qcow2.h              |  6 +++
>>>     block/qcow2-cluster.c      |  2 +-
>>>     block/qcow2.c              | 89 +++++++++++++++++++++++++++++++++++++-
>>>     block/trace-events         |  1 +
>>>     tests/qemu-iotests/060     |  7 ++-
>>>     tests/qemu-iotests/060.out |  5 ++-
>>>     7 files changed, 108 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index 762000f31f..204528b3f6 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -3009,6 +3009,8 @@
>>>     #
>>>     # @cor_write: a write due to copy-on-read (since 2.11)
>>>     #
>>> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
>>> +#
>>>     # Since: 2.9
>>>     ##
>>>     { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
>>> @@ -3027,7 +3029,7 @@
>>>                 'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>>>                 'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>>>                 'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
>>> -            'cor_write'] }
>>> +            'cor_write', 'cluster_alloc_space'] }
>>>     
>>>     ##
>>>     # @BlkdebugInjectErrorOptions:
>>> diff --git a/block/qcow2.h b/block/qcow2.h
>>> index a98d24500b..32d2c04bfa 100644
>>> --- a/block/qcow2.h
>>> +++ b/block/qcow2.h
>>> @@ -386,6 +386,12 @@ typedef struct QCowL2Meta
>>>          */
>>>         Qcow2COWRegion cow_end;
>>>     
>>> +    /*
>>> +     * Indicates that COW regions are already handled and do not require
>>> +     * any more processing.
>>> +     */
>>> +    bool skip_cow;
>>> +
>>>         /**
>>
>> hmm, around it, all comments starts from '/**', so, I think, yours should too.
>> (this note doesn't touch your other comments)
>>
> 
> that triggers a warning in checkpatch.pl
> 
>     WARNING: Block comments use a leading /* on a separate line

I thing, it is the case, when we should ignore this warning.

> 
>>>          * The I/O vector with the data from the actual guest write request.
>>>          * If non-NULL, this is meant to be merged together with the data
>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>> index e2737429f5..23e0702027 100644
>>> --- a/block/qcow2-cluster.c
>>> +++ b/block/qcow2-cluster.c
>>> @@ -806,7 +806,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
>>>         assert(start->offset + start->nb_bytes <= end->offset);
>>>         assert(!m->data_qiov || m->data_qiov->size == data_bytes);
>>>     
>>> -    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
>>> +    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
>>>             return 0;
>>>         }
>>>     
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index 4897abae5e..161b935962 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>>> @@ -2021,6 +2021,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>>                 continue;
>>>             }
>>>     
>>> +        /* If COW regions are handled already, skip this too */
>>> +        if (m->skip_cow) {
>>> +            continue;
>>> +        }
>>> +
>>>             /* The data (middle) region must be immediately after the
>>>              * start region */
>>>             if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
>>> @@ -2046,6 +2051,77 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>>         return false;
>>>     }
>>>     
>>> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
>>> +{
>>> +    int64_t nr;
>>> +    return !bytes ||
>>> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
>>> +}
>>> +
>>> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
>>> +{
>>> +    /*
>>> +     * This check is designed for optimization shortcut so it must be
>>> +     * efficient.
>>> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
>>> +     * as accurate and can result in false negatives).
>>> +     */
>>> +    return is_unallocated(bs, m->offset + m->cow_start.offset,
>>
>> here you add m->cow_start.offset ...[1]
>>
>>> +                          m->cow_start.nb_bytes) &&
>>> +           is_unallocated(bs, m->offset + m->cow_end.offset,
>>> +                          m->cow_end.nb_bytes);
>>> +}
>>> +
>>> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>>> +{
>>> +    BDRVQcow2State *s = bs->opaque;
>>> +    QCowL2Meta *m;
>>> +
>>> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (bs->encrypted) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    for (m = l2meta; m != NULL; m = m->next) {
>>> +        int ret;
>>> +
>>> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
>>> +            continue;
>>> +        }
>>> +
>>> +        if (!is_zero_cow(bs, m)) {
>>> +            continue;
>>> +        }
>>> +
>>> +        /*
>>> +         * Conventional place for qcow2_pre_write_overlap_check() but in this
>>> +         * case it is already done for these clusters
>>> +         */
>>
>> to be honest, check is done for original request not, not for cow_regions, and,
>> on the other hand,
>> do_perform_cow_write _does_ the check.
>>
>> So, I see two consistent ways:
>> 1. you just add check here, like in do_perform_cow_write
>> 2. improve check in qcow2_co_pwritev to cover the whole area, and drop checks from
>>       perform_cow.
>>
>> I prefer 1, as it more helpful in case of bugs in the code (just check exactly what
>> you are going to write in the next write).
>>
> 
> ok
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 161b935962..05a7cbebbd 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2096,16 +2096,18 @@ static int handle_alloc_space(BlockDriverState
> *bs, QCowL2Meta *l2meta)
>                continue;
>            }
> 
> -        /*
> -         * Conventional place for qcow2_pre_write_overlap_check() but
> in this
> -         * case it is already done for these clusters
> -         */
> -
> -        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>            /*
>             * instead of writing zero COW buffers,
>             * efficiently zero out the whole clusters
>             */
> +
> +        ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset,
> +                                            m->nb_clusters *
> s->cluster_size);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);

ok

>            ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
>                                        m->nb_clusters * s->cluster_size,
>                                        BDRV_REQ_ALLOCATE);
> 
>>> +
>>> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>>> +        /*
>>> +         * instead of writing zero COW buffers,
>>> +         * efficiently zero out the whole clusters
>>> +         */
>>> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
>>
>> [1]... and here - not.
>> And perform_cow uses start too.
>> hmm..
>>
>> On the other hand, here we just want to zero all allocated clusters, it is not related
>> to cow regions. So, ok.
>>
>>> +                                    m->nb_clusters * s->cluster_size,
>>> +                                    BDRV_REQ_ALLOCATE);
>>> +        if (ret < 0) {
>>> +            if (ret != -ENOTSUP && ret != -EAGAIN) {
>>> +                return ret;
>>> +            }
>>> +            continue;
>>> +        }
>>> +
>>> +        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
>>> +        m->skip_cow = true;
>>> +    }
>>> +    return 0;
>>> +}
>>> +
>>
>> [...]
>>
>>
>> with pre-write-check added:
>> Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>
>>
>>
Alberto Garcia Jan. 8, 2019, 1:40 p.m. UTC | #4
On Tue 18 Dec 2018 08:57:45 AM CET, Anton Nefedov wrote:
> @@ -2126,24 +2202,33 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
>              goto fail;
>          }
>  
> +        qemu_co_mutex_unlock(&s->lock);
> +
> +        ret = handle_alloc_space(bs, l2meta);
> +        if (ret < 0) {
> +            qemu_co_mutex_lock(&s->lock);
> +            goto fail;
> +        }

I think you could add a brief comment before the handle_alloc_space()
call to indicate what it does.

The patch looks good to me else, but since it seems you're changing it
in the next revision I'll wait for it.

Berto
diff mbox series

Patch

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 762000f31f..204528b3f6 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3009,6 +3009,8 @@ 
 #
 # @cor_write: a write due to copy-on-read (since 2.11)
 #
+# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
+#
 # Since: 2.9
 ##
 { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
@@ -3027,7 +3029,7 @@ 
             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
             'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
             'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
-            'cor_write'] }
+            'cor_write', 'cluster_alloc_space'] }
 
 ##
 # @BlkdebugInjectErrorOptions:
diff --git a/block/qcow2.h b/block/qcow2.h
index a98d24500b..32d2c04bfa 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -386,6 +386,12 @@  typedef struct QCowL2Meta
      */
     Qcow2COWRegion cow_end;
 
+    /*
+     * Indicates that COW regions are already handled and do not require
+     * any more processing.
+     */
+    bool skip_cow;
+
     /**
      * The I/O vector with the data from the actual guest write request.
      * If non-NULL, this is meant to be merged together with the data
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e2737429f5..23e0702027 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -806,7 +806,7 @@  static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
     assert(start->offset + start->nb_bytes <= end->offset);
     assert(!m->data_qiov || m->data_qiov->size == data_bytes);
 
-    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
+    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
         return 0;
     }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 4897abae5e..161b935962 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2021,6 +2021,11 @@  static bool merge_cow(uint64_t offset, unsigned bytes,
             continue;
         }
 
+        /* If COW regions are handled already, skip this too */
+        if (m->skip_cow) {
+            continue;
+        }
+
         /* The data (middle) region must be immediately after the
          * start region */
         if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
@@ -2046,6 +2051,77 @@  static bool merge_cow(uint64_t offset, unsigned bytes,
     return false;
 }
 
+static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+    int64_t nr;
+    return !bytes ||
+        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
+}
+
+static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
+{
+    /*
+     * This check is designed for optimization shortcut so it must be
+     * efficient.
+     * Instead of is_zero(), use is_unallocated() as it is faster (but not
+     * as accurate and can result in false negatives).
+     */
+    return is_unallocated(bs, m->offset + m->cow_start.offset,
+                          m->cow_start.nb_bytes) &&
+           is_unallocated(bs, m->offset + m->cow_end.offset,
+                          m->cow_end.nb_bytes);
+}
+
+static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
+{
+    BDRVQcow2State *s = bs->opaque;
+    QCowL2Meta *m;
+
+    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
+        return 0;
+    }
+
+    if (bs->encrypted) {
+        return 0;
+    }
+
+    for (m = l2meta; m != NULL; m = m->next) {
+        int ret;
+
+        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
+            continue;
+        }
+
+        if (!is_zero_cow(bs, m)) {
+            continue;
+        }
+
+        /*
+         * Conventional place for qcow2_pre_write_overlap_check() but in this
+         * case it is already done for these clusters
+         */
+
+        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
+        /*
+         * instead of writing zero COW buffers,
+         * efficiently zero out the whole clusters
+         */
+        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
+                                    m->nb_clusters * s->cluster_size,
+                                    BDRV_REQ_ALLOCATE);
+        if (ret < 0) {
+            if (ret != -ENOTSUP && ret != -EAGAIN) {
+                return ret;
+            }
+            continue;
+        }
+
+        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
+        m->skip_cow = true;
+    }
+    return 0;
+}
+
 static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                          uint64_t bytes, QEMUIOVector *qiov,
                                          int flags)
@@ -2126,24 +2202,33 @@  static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
             goto fail;
         }
 
+        qemu_co_mutex_unlock(&s->lock);
+
+        ret = handle_alloc_space(bs, l2meta);
+        if (ret < 0) {
+            qemu_co_mutex_lock(&s->lock);
+            goto fail;
+        }
+
         /* If we need to do COW, check if it's possible to merge the
          * writing of the guest data together with that of the COW regions.
          * If it's not possible (or not necessary) then write the
          * guest data now. */
         if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
-            qemu_co_mutex_unlock(&s->lock);
             BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
             trace_qcow2_writev_data(qemu_coroutine_self(),
                                     cluster_offset + offset_in_cluster);
             ret = bdrv_co_pwritev(bs->file,
                                   cluster_offset + offset_in_cluster,
                                   cur_bytes, &hd_qiov, 0);
-            qemu_co_mutex_lock(&s->lock);
             if (ret < 0) {
+                qemu_co_mutex_lock(&s->lock);
                 goto fail;
             }
         }
 
+        qemu_co_mutex_lock(&s->lock);
+
         ret = qcow2_handle_l2meta(bs, &l2meta, true);
         if (ret) {
             goto fail;
diff --git a/block/trace-events b/block/trace-events
index 3e8c47bb24..f3f6d66dcc 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -69,6 +69,7 @@  qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64
 qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d"
 qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d"
+qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d"
 
 # block/qcow2-cluster.c
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060
index af0588ae9a..163fb075ea 100755
--- a/tests/qemu-iotests/060
+++ b/tests/qemu-iotests/060
@@ -150,10 +150,15 @@  $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | _filter_qemu_io
 echo
 echo "=== Testing overlap while COW is in flight ==="
 echo
+BACKING_IMG=$TEST_IMG.base
+TEST_IMG=$BACKING_IMG _make_test_img 1G
+
+$QEMU_IO -c 'write 0k 64k' "$BACKING_IMG" | _filter_qemu_io
+
 # compat=0.10 is required in order to make the following discard actually
 # unallocate the sector rather than make it a zero sector - we want COW, after
 # all.
-IMGOPTS='compat=0.10' _make_test_img 1G
+IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G
 # Write two clusters, the second one enforces creation of an L2 table after
 # the first data cluster.
 $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io
diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
index d67c6234a4..1d582d4b36 100644
--- a/tests/qemu-iotests/060.out
+++ b/tests/qemu-iotests/060.out
@@ -97,7 +97,10 @@  read 512/512 bytes at offset 0
 
 === Testing overlap while COW is in flight ===
 
-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1073741824
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 backing_file=TEST_DIR/t.IMGFMT.base
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 65536/65536 bytes at offset 536870912