diff mbox

[v2,01/15] block: introduce BDRV_REQ_ALLOCATE flag

Message ID 1496330073-51338-2-git-send-email-anton.nefedov@virtuozzo.com (mailing list archive)
State New, archived
Headers show

Commit Message

Anton Nefedov June 1, 2017, 3:14 p.m. UTC
The flag indicates that the region of the disk image has to be
allocated in such a way that it reads as zeroes. A call with the flag
set has to return -ENOTSUP if the allocation cannot be done efficiently
(i.e. without falling back to writing actual buffers).

Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
---
 block/io.c            | 19 ++++++++++++++++---
 block/trace-events    |  1 +
 include/block/block.h |  6 +++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

Comments

Eric Blake June 1, 2017, 7:07 p.m. UTC | #1
On 06/01/2017 10:14 AM, Anton Nefedov wrote:
> The flag is supposed to indicate that the region of the disk image has
> to be sufficiently allocated so it reads as zeroes. The call with the flag
> set has to return -ENOTSUP if allocation cannot be done efficiently
> (i.e. without falling back to writing actual buffers)
> 
> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
> ---
>  block/io.c            | 19 ++++++++++++++++---
>  block/trace-events    |  1 +
>  include/block/block.h |  6 +++++-
>  3 files changed, 22 insertions(+), 4 deletions(-)

You may want to 'git config diff.orderFile /path/to/file' (with a
suitably populated file) so that .h files come first in your diffs, as
that can aid reviewers.  At one point, there was a thread about adding
such a file to qemu.git proper for everyone to share, although it seems
to have stalled.

> 
> diff --git a/block/io.c b/block/io.c
> index ed31810..d47efa9 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1272,7 +1272,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
>              assert(!bs->supported_zero_flags);
>          }
>  
> -        if (ret == -ENOTSUP) {
> +        if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) {

I'd feel MUCH better if you first fixed the conditional just above this
point to ensure that, if the caller requests BDRV_REQ_ALLOCATE, we do
not call drv->bdrv_co_pwrite_zeroes() unless bs->supported_zero_flags
also mentions this bit.

Remember, the existing semantics of .bdrv_co_pwrite_zeroes() merely
state that we must return -ENOTSUP unless we can guarantee that the
region reads back as zeroes, but put no timing constraints on it.  A
driver that has not been retrofitted to understand the BDRV_REQ_ALLOCATE
flag will therefore risk taking too long.  Using bs->supported_zero_flags
as your gate is what will let you avoid calling into a driver that has
not been audited for fitting the new contract.

>              /* Fall back to bounce buffer if write zeroes is unsupported */
>              BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
>  
> @@ -1355,8 +1355,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
>      ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
>  
>      if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
> -        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
> -        qemu_iovec_is_zero(qiov)) {
> +        !(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) &&
> +        drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) {
>          flags |= BDRV_REQ_ZERO_WRITE;
>          if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
>              flags |= BDRV_REQ_MAY_UNMAP;
> @@ -1436,6 +1436,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
>  
>      assert(flags & BDRV_REQ_ZERO_WRITE);
>      if (head_padding_bytes || tail_padding_bytes) {
> +        if (flags & BDRV_REQ_ALLOCATE) {
> +            return -ENOTSUP;
> +        }

Can we assert that BDRV_REQ_ALLOCATE will only be supplied by a caller
that is already using aligned values?  Or is that too strict?

>          buf = qemu_blockalign(bs, align);
>          iov = (struct iovec) {
>              .iov_base   = buf,
> @@ -1534,6 +1537,11 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
>          return ret;
>      }
>  
> +    if (qiov && flags & BDRV_REQ_ALLOCATE) {
> +        /* allocation request with qiov provided doesn't make much sense */
> +        return -ENOTSUP;

Should this be an assertion (bug in the program for mixing things that
don't make sense) rather than just a runtime error return?

> +    }
> +
>      bdrv_inc_in_flight(bs);
>      /*
>       * Align write if necessary by performing a read-modify-write cycle.
> @@ -1665,6 +1673,11 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
>  {
>      trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
>  
> +    if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) {
> +        /* nonsense */
> +        return -ENOTSUP;
> +    }

Ditto.

> +
>      if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
>          flags &= ~BDRV_REQ_MAY_UNMAP;
>      }
> diff --git a/block/trace-events b/block/trace-events
> index 9a71c7f..a15c2cc 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -15,6 +15,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs
>  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
> +bdrv_co_allocate(void *bs, int64_t offset, int count) "bs %p offset %"PRId64" count %d"
>  bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
>  
>  # block/stream.c
> diff --git a/include/block/block.h b/include/block/block.h
> index 9b355e9..53a357c 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -65,9 +65,13 @@ typedef enum {
>      BDRV_REQ_NO_SERIALISING     = 0x8,
>      BDRV_REQ_FUA                = 0x10,
>      BDRV_REQ_WRITE_COMPRESSED   = 0x20,
> +    /* BDRV_REQ_ALLOCATE is used to indicate that the driver is to
> +     * efficiently allocate the space so it reads as zeroes or return an error
> +     */
> +    BDRV_REQ_ALLOCATE           = 0x40,

Doesn't match how the other flags are documented, but any documentation
is better than none.

Missing mention of the new flag in the documentation for
supported_zero_flags.

>  
>      /* Mask of valid flags */
> -    BDRV_REQ_MASK               = 0x3f,
> +    BDRV_REQ_MASK               = 0x7f,
>  } BdrvRequestFlags;
>  
>  typedef struct BlockSizes {
>
Anton Nefedov June 2, 2017, 1:08 p.m. UTC | #2
On 06/01/2017 10:07 PM, Eric Blake wrote:
> On 06/01/2017 10:14 AM, Anton Nefedov wrote:
>> The flag is supposed to indicate that the region of the disk image has
>> to be sufficiently allocated so it reads as zeroes. The call with the flag
>> set has to return -ENOTSUP if allocation cannot be done efficiently
>> (i.e. without falling back to writing actual buffers)
>>
>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>> ---
>>   block/io.c            | 19 ++++++++++++++++---
>>   block/trace-events    |  1 +
>>   include/block/block.h |  6 +++++-
>>   3 files changed, 22 insertions(+), 4 deletions(-)
> 
> You may want to 'git config diff.orderFile /path/to/file' (with a
> suitably populated file) so that .h files come first in your diffs, as
> that can aid reviewers.  At one point, there was a thread about adding
> such a file to qemu.git proper for everyone to share, although it seems
> to have stalled.
> 

Thanks, will do

>>
>> diff --git a/block/io.c b/block/io.c
>> index ed31810..d47efa9 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -1272,7 +1272,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
>>               assert(!bs->supported_zero_flags);
>>           }
>>   
>> -        if (ret == -ENOTSUP) {
>> +        if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) {
> 
> I'd feel MUCH better if you first fixed the conditional just above this
> point to ensure that if the caller requests BDRV_REQ_ALLOCATE that we do
> not call bdrv->bdrv_co_pwrite_zeroes() unless bs->supported_zero_flags
> also mentions this bit.
> 
> Remember, the existing semantics of .bdrv_co_pwrite_zeroes() merely
> state that we must return -ENOTSUP unless we can guarantee that we read
> back as zeroes, but puts no timing constraints on it.  A driver that has
> not been retrofitted to understand the BDRV_REQ_ALLOCATE flag will
> therefore risk taking too long.  Using bs->supported_zero_flags as your
> gate is what will let you avoid calling into a driver that has not been
> audited for fitting the new contract.
> 

Absolutely; I had even added that check but must have lost it at some
point.
I meant to add it much earlier though, in bdrv_co_pwrite_zeroes().

>>               /* Fall back to bounce buffer if write zeroes is unsupported */
>>               BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
>>   
>> @@ -1355,8 +1355,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
>>       ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
>>   
>>       if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
>> -        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
>> -        qemu_iovec_is_zero(qiov)) {
>> +        !(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) &&
>> +        drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) {
>>           flags |= BDRV_REQ_ZERO_WRITE;
>>           if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
>>               flags |= BDRV_REQ_MAY_UNMAP;
>> @@ -1436,6 +1436,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
>>   
>>       assert(flags & BDRV_REQ_ZERO_WRITE);
>>       if (head_padding_bytes || tail_padding_bytes) {
>> +        if (flags & BDRV_REQ_ALLOCATE) {
>> +            return -ENOTSUP;
>> +        }
> 
> Can we assert that BDRV_REQ_ALLOCATE will only be supplied by a caller
> that is already using aligned values?  Or is that too strict?
> 

As I understand it, the top driver should not care much about the child
driver's alignment preferences? That's what the common bdrv_* interface
is there for.

>>           buf = qemu_blockalign(bs, align);
>>           iov = (struct iovec) {
>>               .iov_base   = buf,
>> @@ -1534,6 +1537,11 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
>>           return ret;
>>       }
>>   
>> +    if (qiov && flags & BDRV_REQ_ALLOCATE) {
>> +        /* allocation request with qiov provided doesn't make much sense */
>> +        return -ENOTSUP;
> 
> Should this be an assertion (bug in the program for mixing things that
> don't make sense) rather than just a runtime error return?
> 

Inclined to agree here.

>> +    }
>> +
>>       bdrv_inc_in_flight(bs);
>>       /*
>>        * Align write if necessary by performing a read-modify-write cycle.
>> @@ -1665,6 +1673,11 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
>>   {
>>       trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
>>   
>> +    if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) {
>> +        /* nonsense */
>> +        return -ENOTSUP;
>> +    }
> 
> Ditto.
> 

yep

>> +
>>       if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
>>           flags &= ~BDRV_REQ_MAY_UNMAP;
>>       }
>> diff --git a/block/trace-events b/block/trace-events
>> index 9a71c7f..a15c2cc 100644
>> --- a/block/trace-events
>> +++ b/block/trace-events
>> @@ -15,6 +15,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs
>>   bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>>   bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>>   bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
>> +bdrv_co_allocate(void *bs, int64_t offset, int count) "bs %p offset %"PRId64" count %d"
>>   bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
>>   
>>   # block/stream.c
>> diff --git a/include/block/block.h b/include/block/block.h
>> index 9b355e9..53a357c 100644
>> --- a/include/block/block.h
>> +++ b/include/block/block.h
>> @@ -65,9 +65,13 @@ typedef enum {
>>       BDRV_REQ_NO_SERIALISING     = 0x8,
>>       BDRV_REQ_FUA                = 0x10,
>>       BDRV_REQ_WRITE_COMPRESSED   = 0x20,
>> +    /* BDRV_REQ_ALLOCATE is used to indicate that the driver is to
>> +     * efficiently allocate the space so it reads as zeroes or return an error
>> +     */
>> +    BDRV_REQ_ALLOCATE           = 0x40,
> 
> Doesn't match how the other flags are documented, but any documentation
> is better than none.
> 

Will fix

> Missing mention of the new flag in the documentation for
> supported_zero_flags.
> 

Done.

>>   
>>       /* Mask of valid flags */
>> -    BDRV_REQ_MASK               = 0x3f,
>> +    BDRV_REQ_MASK               = 0x7f,
>>   } BdrvRequestFlags;
>>   
>>   typedef struct BlockSizes {
>>
> 
/Anton
diff mbox

Patch

diff --git a/block/io.c b/block/io.c
index ed31810..d47efa9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1272,7 +1272,7 @@  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
             assert(!bs->supported_zero_flags);
         }
 
-        if (ret == -ENOTSUP) {
+        if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) {
             /* Fall back to bounce buffer if write zeroes is unsupported */
             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
 
@@ -1355,8 +1355,8 @@  static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
-        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
-        qemu_iovec_is_zero(qiov)) {
+        !(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) &&
+        drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) {
         flags |= BDRV_REQ_ZERO_WRITE;
         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
             flags |= BDRV_REQ_MAY_UNMAP;
@@ -1436,6 +1436,9 @@  static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
 
     assert(flags & BDRV_REQ_ZERO_WRITE);
     if (head_padding_bytes || tail_padding_bytes) {
+        if (flags & BDRV_REQ_ALLOCATE) {
+            return -ENOTSUP;
+        }
         buf = qemu_blockalign(bs, align);
         iov = (struct iovec) {
             .iov_base   = buf,
@@ -1534,6 +1537,11 @@  int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
         return ret;
     }
 
+    if (qiov && flags & BDRV_REQ_ALLOCATE) {
+        /* allocation request with qiov provided doesn't make much sense */
+        return -ENOTSUP;
+    }
+
     bdrv_inc_in_flight(bs);
     /*
      * Align write if necessary by performing a read-modify-write cycle.
@@ -1665,6 +1673,11 @@  int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
 {
     trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
 
+    if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) {
+        /* nonsense */
+        return -ENOTSUP;
+    }
+
     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
         flags &= ~BDRV_REQ_MAY_UNMAP;
     }
diff --git a/block/trace-events b/block/trace-events
index 9a71c7f..a15c2cc 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -15,6 +15,7 @@  bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x"
+bdrv_co_allocate(void *bs, int64_t offset, int count) "bs %p offset %"PRId64" count %d"
 bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u"
 
 # block/stream.c
diff --git a/include/block/block.h b/include/block/block.h
index 9b355e9..53a357c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -65,9 +65,13 @@  typedef enum {
     BDRV_REQ_NO_SERIALISING     = 0x8,
     BDRV_REQ_FUA                = 0x10,
     BDRV_REQ_WRITE_COMPRESSED   = 0x20,
+    /* BDRV_REQ_ALLOCATE is used to indicate that the driver is to
+     * efficiently allocate the space so it reads as zeroes or return an error
+     */
+    BDRV_REQ_ALLOCATE           = 0x40,
 
     /* Mask of valid flags */
-    BDRV_REQ_MASK               = 0x3f,
+    BDRV_REQ_MASK               = 0x7f,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {