diff mbox series

[v3,13/20] block/export: rewrite vduse-blk drain code

Message ID 20230420113732.336620-14-stefanha@redhat.com (mailing list archive)
State Superseded
Headers show
Series block: remove aio_disable_external() API | expand

Commit Message

Stefan Hajnoczi April 20, 2023, 11:37 a.m. UTC
vduse_blk_detach_ctx() waits for in-flight requests using
AIO_WAIT_WHILE(). This is not allowed according to a comment in
bdrv_set_aio_context_commit():

  /*
   * Take the old AioContex when detaching it from bs.
   * At this point, new_context lock is already acquired, and we are now
   * also taking old_context. This is safe as long as bdrv_detach_aio_context
   * does not call AIO_POLL_WHILE().
   */

Use this opportunity to rewrite the drain code in vduse-blk:

- Use the BlockExport refcount so that vduse_blk_exp_delete() is only
  called when there are no more requests in flight.

- Implement .drained_poll() so in-flight request coroutines are stopped
  by the time .bdrv_detach_aio_context() is called.

- Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
  .bdrv_detach_aio_context() constraint violation. It's no longer
  needed due to the previous changes.

- Always handle the VDUSE file descriptor, even in drained sections. The
  VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
  drained sections. This ensures that the VDUSE kernel code gets a fast
  response.

- Suspend virtqueue fd handlers in .drained_begin() and resume them in
  .drained_end(). This eliminates the need for the
  aio_set_fd_handler(is_external=true) flag, which is being removed from
  QEMU.

This is a long list but splitting it into individual commits would
probably lead to git bisect failures - the changes are all related.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
 1 file changed, 93 insertions(+), 39 deletions(-)

Comments

Yongji Xie April 21, 2023, 3:36 a.m. UTC | #1
Hi Stefan,

On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> vduse_blk_detach_ctx() waits for in-flight requests using
> AIO_WAIT_WHILE(). This is not allowed according to a comment in
> bdrv_set_aio_context_commit():
>
>   /*
>    * Take the old AioContex when detaching it from bs.
>    * At this point, new_context lock is already acquired, and we are now
>    * also taking old_context. This is safe as long as bdrv_detach_aio_context
>    * does not call AIO_POLL_WHILE().
>    */
>
> Use this opportunity to rewrite the drain code in vduse-blk:
>
> - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
>   called when there are no more requests in flight.
>
> - Implement .drained_poll() so in-flight request coroutines are stopped
>   by the time .bdrv_detach_aio_context() is called.
>
> - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
>   .bdrv_detach_aio_context() constraint violation. It's no longer
>   needed due to the previous changes.
>
> - Always handle the VDUSE file descriptor, even in drained sections. The
>   VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
>   drained sections. This ensures that the VDUSE kernel code gets a fast
>   response.
>
> - Suspend virtqueue fd handlers in .drained_begin() and resume them in
>   .drained_end(). This eliminates the need for the
>   aio_set_fd_handler(is_external=true) flag, which is being removed from
>   QEMU.
>
> This is a long list but splitting it into individual commits would
> probably lead to git bisect failures - the changes are all related.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
>  1 file changed, 93 insertions(+), 39 deletions(-)
>
> diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> index f7ae44e3ce..35dc8fcf45 100644
> --- a/block/export/vduse-blk.c
> +++ b/block/export/vduse-blk.c
> @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
>      VduseDev *dev;
>      uint16_t num_queues;
>      char *recon_file;
> -    unsigned int inflight;
> +    unsigned int inflight; /* atomic */
> +    bool vqs_started;
>  } VduseBlkExport;
>
>  typedef struct VduseBlkReq {
> @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
>
>  static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
>  {
> -    vblk_exp->inflight++;
> +    if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {

I wonder why we need to use atomic operations here.

> +        /* Prevent export from being deleted */
> +        aio_context_acquire(vblk_exp->export.ctx);
> +        blk_exp_ref(&vblk_exp->export);
> +        aio_context_release(vblk_exp->export.ctx);
> +    }
>  }
>
>  static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
>  {
> -    if (--vblk_exp->inflight == 0) {
> +    if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
> +        /* Wake AIO_WAIT_WHILE() */
>          aio_wait_kick();
> +
> +        /* Now the export can be deleted */
> +        aio_context_acquire(vblk_exp->export.ctx);
> +        blk_exp_unref(&vblk_exp->export);
> +        aio_context_release(vblk_exp->export.ctx);
>      }
>  }
>
> @@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
>  {
>      VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
>
> +    if (!vblk_exp->vqs_started) {
> +        return; /* vduse_blk_drained_end() will start vqs later */
> +    }
> +
>      aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> -                       true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
> +                       false, on_vduse_vq_kick, NULL, NULL, NULL, vq);
>      /* Make sure we don't miss any kick afer reconnecting */
>      eventfd_write(vduse_queue_get_fd(vq), 1);
>  }
> @@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
>  static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
>  {
>      VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
> +    int fd = vduse_queue_get_fd(vq);
>
> -    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> -                       true, NULL, NULL, NULL, NULL, NULL);
> +    if (fd < 0) {
> +        return;
> +    }
> +
> +    aio_set_fd_handler(vblk_exp->export.ctx, fd, false,
> +                       NULL, NULL, NULL, NULL, NULL);
>  }
>
>  static const VduseOps vduse_blk_ops = {
> @@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque)
>
>  static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
>  {
> -    int i;
> -
>      aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
> -                       true, on_vduse_dev_kick, NULL, NULL, NULL,
> +                       false, on_vduse_dev_kick, NULL, NULL, NULL,
>                         vblk_exp->dev);
>
> -    for (i = 0; i < vblk_exp->num_queues; i++) {
> -        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> -        int fd = vduse_queue_get_fd(vq);
> -
> -        if (fd < 0) {
> -            continue;
> -        }
> -        aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
> -                           on_vduse_vq_kick, NULL, NULL, NULL, vq);
> -    }
> +    /* Virtqueues are handled by vduse_blk_drained_end() */
>  }
>
>  static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
>  {
> -    int i;
> -
> -    for (i = 0; i < vblk_exp->num_queues; i++) {
> -        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> -        int fd = vduse_queue_get_fd(vq);
> -
> -        if (fd < 0) {
> -            continue;
> -        }
> -        aio_set_fd_handler(vblk_exp->export.ctx, fd,
> -                           true, NULL, NULL, NULL, NULL, NULL);
> -    }
>      aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
> -                       true, NULL, NULL, NULL, NULL, NULL);
> +                       false, NULL, NULL, NULL, NULL, NULL);
>
> -    AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
> +    /* Virtqueues are handled by vduse_blk_drained_begin() */
>  }
>
>
> @@ -220,8 +218,55 @@ static void vduse_blk_resize(void *opaque)
>                              (char *)&config.capacity);
>  }
>
> +static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
> +{
> +    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
> +        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> +        vduse_blk_disable_queue(vblk_exp->dev, vq);
> +    }
> +
> +    vblk_exp->vqs_started = false;
> +}
> +
> +static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
> +{
> +    vblk_exp->vqs_started = true;
> +
> +    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
> +        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> +        vduse_blk_enable_queue(vblk_exp->dev, vq);
> +    }
> +}
> +
> +static void vduse_blk_drained_begin(void *opaque)
> +{
> +    BlockExport *exp = opaque;
> +    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> +    vduse_blk_stop_virtqueues(vblk_exp);
> +}
> +
> +static void vduse_blk_drained_end(void *opaque)
> +{
> +    BlockExport *exp = opaque;
> +    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> +    vduse_blk_start_virtqueues(vblk_exp);
> +}
> +
> +static bool vduse_blk_drained_poll(void *opaque)
> +{
> +    BlockExport *exp = opaque;
> +    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> +    return qatomic_read(&vblk_exp->inflight) > 0;
> +}
> +
>  static const BlockDevOps vduse_block_ops = {
> -    .resize_cb = vduse_blk_resize,
> +    .resize_cb     = vduse_blk_resize,
> +    .drained_begin = vduse_blk_drained_begin,
> +    .drained_end   = vduse_blk_drained_end,
> +    .drained_poll  = vduse_blk_drained_poll,
>  };
>
>  static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
> @@ -268,6 +313,7 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
>      vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
>      vblk_exp->handler.logical_block_size = logical_block_size;
>      vblk_exp->handler.writable = opts->writable;
> +    vblk_exp->vqs_started = true;
>
>      config.capacity =
>              cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
> @@ -322,14 +368,20 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
>          vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
>      }
>
> -    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
> +    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false,
>                         on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
>
>      blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
>                                   vblk_exp);
> -
>      blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
>
> +    /*
> +     * We handle draining ourselves using an in-flight counter and by disabling
> +     * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
> +     * complete so the in-flight counter reaches zero.
> +     */
> +    blk_set_disable_request_queuing(exp->blk, true);
> +
>      return 0;
>  err:
>      vduse_dev_destroy(vblk_exp->dev);
> @@ -344,6 +396,9 @@ static void vduse_blk_exp_delete(BlockExport *exp)
>      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>      int ret;
>
> +    assert(qatomic_read(&vblk_exp->inflight) == 0);
> +
> +    vduse_blk_detach_ctx(vblk_exp);
>      blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
>                                      vblk_exp);
>      blk_set_dev_ops(exp->blk, NULL, NULL);
> @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
>      g_free(vblk_exp->handler.serial);
>  }
>
> +/* Called with exp->ctx acquired */
>  static void vduse_blk_exp_request_shutdown(BlockExport *exp)
>  {
>      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>
> -    aio_context_acquire(vblk_exp->export.ctx);
> -    vduse_blk_detach_ctx(vblk_exp);
> -    aio_context_acquire(vblk_exp->export.ctx);
> +    vduse_blk_stop_virtqueues(vblk_exp);

Can we add a AIO_WAIT_WHILE() here? Then we don't need to
increase/decrease the BlockExport refcount during I/O processing.

Thanks,
Yongji
Stefan Hajnoczi April 25, 2023, 4:42 p.m. UTC | #2
On Fri, Apr 21, 2023 at 11:36:02AM +0800, Yongji Xie wrote:
> Hi Stefan,
> 
> On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > vduse_blk_detach_ctx() waits for in-flight requests using
> > AIO_WAIT_WHILE(). This is not allowed according to a comment in
> > bdrv_set_aio_context_commit():
> >
> >   /*
> >    * Take the old AioContex when detaching it from bs.
> >    * At this point, new_context lock is already acquired, and we are now
> >    * also taking old_context. This is safe as long as bdrv_detach_aio_context
> >    * does not call AIO_POLL_WHILE().
> >    */
> >
> > Use this opportunity to rewrite the drain code in vduse-blk:
> >
> > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
> >   called when there are no more requests in flight.
> >
> > - Implement .drained_poll() so in-flight request coroutines are stopped
> >   by the time .bdrv_detach_aio_context() is called.
> >
> > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
> >   .bdrv_detach_aio_context() constraint violation. It's no longer
> >   needed due to the previous changes.
> >
> > - Always handle the VDUSE file descriptor, even in drained sections. The
> >   VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
> >   drained sections. This ensures that the VDUSE kernel code gets a fast
> >   response.
> >
> > - Suspend virtqueue fd handlers in .drained_begin() and resume them in
> >   .drained_end(). This eliminates the need for the
> >   aio_set_fd_handler(is_external=true) flag, which is being removed from
> >   QEMU.
> >
> > This is a long list but splitting it into individual commits would
> > probably lead to git bisect failures - the changes are all related.
> >
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >  block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
> >  1 file changed, 93 insertions(+), 39 deletions(-)
> >
> > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> > index f7ae44e3ce..35dc8fcf45 100644
> > --- a/block/export/vduse-blk.c
> > +++ b/block/export/vduse-blk.c
> > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
> >      VduseDev *dev;
> >      uint16_t num_queues;
> >      char *recon_file;
> > -    unsigned int inflight;
> > +    unsigned int inflight; /* atomic */
> > +    bool vqs_started;
> >  } VduseBlkExport;
> >
> >  typedef struct VduseBlkReq {
> > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
> >
> >  static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
> >  {
> > -    vblk_exp->inflight++;
> > +    if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
> 
> I wonder why we need to use atomic operations here.

The inflight counter is only modified by the vhost-user export thread,
but it may be read by another thread here:

  static bool vduse_blk_drained_poll(void *opaque)
  {
      BlockExport *exp = opaque;
      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);

      return qatomic_read(&vblk_exp->inflight) > 0;

BlockDevOps->drained_poll() calls are invoked when BlockDriverStates are
drained (e.g. blk_drain_all() and related APIs).

> > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
> >      g_free(vblk_exp->handler.serial);
> >  }
> >
> > +/* Called with exp->ctx acquired */
> >  static void vduse_blk_exp_request_shutdown(BlockExport *exp)
> >  {
> >      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> >
> > -    aio_context_acquire(vblk_exp->export.ctx);
> > -    vduse_blk_detach_ctx(vblk_exp);
> > -    aio_context_acquire(vblk_exp->export.ctx);
> > +    vduse_blk_stop_virtqueues(vblk_exp);
> 
> Can we add a AIO_WAIT_WHILE() here? Then we don't need to
> increase/decrease the BlockExport refcount during I/O processing.

I don't think so because vduse_blk_exp_request_shutdown() is not the
only place where we wait for requests to complete. There would still
need to be away to wait for requests to finish (without calling
AIO_WAIT_WHILE()) in vduse_blk_drained_poll().

Stefan
Yongji Xie April 26, 2023, 2:23 a.m. UTC | #3
On Wed, Apr 26, 2023 at 12:43 AM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Fri, Apr 21, 2023 at 11:36:02AM +0800, Yongji Xie wrote:
> > Hi Stefan,
> >
> > On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >
> > > vduse_blk_detach_ctx() waits for in-flight requests using
> > > AIO_WAIT_WHILE(). This is not allowed according to a comment in
> > > bdrv_set_aio_context_commit():
> > >
> > >   /*
> > >    * Take the old AioContex when detaching it from bs.
> > >    * At this point, new_context lock is already acquired, and we are now
> > >    * also taking old_context. This is safe as long as bdrv_detach_aio_context
> > >    * does not call AIO_POLL_WHILE().
> > >    */
> > >
> > > Use this opportunity to rewrite the drain code in vduse-blk:
> > >
> > > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
> > >   called when there are no more requests in flight.
> > >
> > > - Implement .drained_poll() so in-flight request coroutines are stopped
> > >   by the time .bdrv_detach_aio_context() is called.
> > >
> > > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
> > >   .bdrv_detach_aio_context() constraint violation. It's no longer
> > >   needed due to the previous changes.
> > >
> > > - Always handle the VDUSE file descriptor, even in drained sections. The
> > >   VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
> > >   drained sections. This ensures that the VDUSE kernel code gets a fast
> > >   response.
> > >
> > > - Suspend virtqueue fd handlers in .drained_begin() and resume them in
> > >   .drained_end(). This eliminates the need for the
> > >   aio_set_fd_handler(is_external=true) flag, which is being removed from
> > >   QEMU.
> > >
> > > This is a long list but splitting it into individual commits would
> > > probably lead to git bisect failures - the changes are all related.
> > >
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > ---
> > >  block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
> > >  1 file changed, 93 insertions(+), 39 deletions(-)
> > >
> > > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> > > index f7ae44e3ce..35dc8fcf45 100644
> > > --- a/block/export/vduse-blk.c
> > > +++ b/block/export/vduse-blk.c
> > > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
> > >      VduseDev *dev;
> > >      uint16_t num_queues;
> > >      char *recon_file;
> > > -    unsigned int inflight;
> > > +    unsigned int inflight; /* atomic */
> > > +    bool vqs_started;
> > >  } VduseBlkExport;
> > >
> > >  typedef struct VduseBlkReq {
> > > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
> > >
> > >  static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
> > >  {
> > > -    vblk_exp->inflight++;
> > > +    if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
> >
> > I wonder why we need to use atomic operations here.
>
> The inflight counter is only modified by the vhost-user export thread,
> but it may be read by another thread here:
>

I see. I mean is it enough to just use volatile keywords here, since
the writers would not access the variable concurrently.

>   static bool vduse_blk_drained_poll(void *opaque)
>   {
>       BlockExport *exp = opaque;
>       VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>
>       return qatomic_read(&vblk_exp->inflight) > 0;
>
> BlockDevOps->drained_poll() calls are invoked when BlockDriverStates are
> drained (e.g. blk_drain_all() and related APIs).
>
> > > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
> > >      g_free(vblk_exp->handler.serial);
> > >  }
> > >
> > > +/* Called with exp->ctx acquired */
> > >  static void vduse_blk_exp_request_shutdown(BlockExport *exp)
> > >  {
> > >      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> > >
> > > -    aio_context_acquire(vblk_exp->export.ctx);
> > > -    vduse_blk_detach_ctx(vblk_exp);
> > > -    aio_context_acquire(vblk_exp->export.ctx);
> > > +    vduse_blk_stop_virtqueues(vblk_exp);
> >
> > Can we add a AIO_WAIT_WHILE() here? Then we don't need to
> > increase/decrease the BlockExport refcount during I/O processing.
>
> I don't think so because vduse_blk_exp_request_shutdown() is not the
> only place where we wait for requests to complete. There would still
> need to be away to wait for requests to finish (without calling
> AIO_WAIT_WHILE()) in vduse_blk_drained_poll().
>

But the BlockExport would not be freed until we call
vduse_blk_exp_request_shutdown(). If we can ensure that there will be
no inflight I/O after we call vduse_blk_exp_request_shutdown(), the
BlockExport can be freed safely without increasing/decreasing the
BlockExport refcount during I/O processing.

Thanks,
Yongji
diff mbox series

Patch

diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index f7ae44e3ce..35dc8fcf45 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -31,7 +31,8 @@  typedef struct VduseBlkExport {
     VduseDev *dev;
     uint16_t num_queues;
     char *recon_file;
-    unsigned int inflight;
+    unsigned int inflight; /* atomic */
+    bool vqs_started;
 } VduseBlkExport;
 
 typedef struct VduseBlkReq {
@@ -41,13 +42,24 @@  typedef struct VduseBlkReq {
 
 static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
 {
-    vblk_exp->inflight++;
+    if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
+        /* Prevent export from being deleted */
+        aio_context_acquire(vblk_exp->export.ctx);
+        blk_exp_ref(&vblk_exp->export);
+        aio_context_release(vblk_exp->export.ctx);
+    }
 }
 
 static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
 {
-    if (--vblk_exp->inflight == 0) {
+    if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
+        /* Wake AIO_WAIT_WHILE() */
         aio_wait_kick();
+
+        /* Now the export can be deleted */
+        aio_context_acquire(vblk_exp->export.ctx);
+        blk_exp_unref(&vblk_exp->export);
+        aio_context_release(vblk_exp->export.ctx);
     }
 }
 
@@ -124,8 +136,12 @@  static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
 {
     VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
 
+    if (!vblk_exp->vqs_started) {
+        return; /* vduse_blk_drained_end() will start vqs later */
+    }
+
     aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
-                       true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
+                       false, on_vduse_vq_kick, NULL, NULL, NULL, vq);
     /* Make sure we don't miss any kick afer reconnecting */
     eventfd_write(vduse_queue_get_fd(vq), 1);
 }
@@ -133,9 +149,14 @@  static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
 static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
 {
     VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+    int fd = vduse_queue_get_fd(vq);
 
-    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
-                       true, NULL, NULL, NULL, NULL, NULL);
+    if (fd < 0) {
+        return;
+    }
+
+    aio_set_fd_handler(vblk_exp->export.ctx, fd, false,
+                       NULL, NULL, NULL, NULL, NULL);
 }
 
 static const VduseOps vduse_blk_ops = {
@@ -152,42 +173,19 @@  static void on_vduse_dev_kick(void *opaque)
 
 static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
 {
-    int i;
-
     aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
-                       true, on_vduse_dev_kick, NULL, NULL, NULL,
+                       false, on_vduse_dev_kick, NULL, NULL, NULL,
                        vblk_exp->dev);
 
-    for (i = 0; i < vblk_exp->num_queues; i++) {
-        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
-        int fd = vduse_queue_get_fd(vq);
-
-        if (fd < 0) {
-            continue;
-        }
-        aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
-                           on_vduse_vq_kick, NULL, NULL, NULL, vq);
-    }
+    /* Virtqueues are handled by vduse_blk_drained_end() */
 }
 
 static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
 {
-    int i;
-
-    for (i = 0; i < vblk_exp->num_queues; i++) {
-        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
-        int fd = vduse_queue_get_fd(vq);
-
-        if (fd < 0) {
-            continue;
-        }
-        aio_set_fd_handler(vblk_exp->export.ctx, fd,
-                           true, NULL, NULL, NULL, NULL, NULL);
-    }
     aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
-                       true, NULL, NULL, NULL, NULL, NULL);
+                       false, NULL, NULL, NULL, NULL, NULL);
 
-    AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
+    /* Virtqueues are handled by vduse_blk_drained_begin() */
 }
 
 
@@ -220,8 +218,55 @@  static void vduse_blk_resize(void *opaque)
                             (char *)&config.capacity);
 }
 
+static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
+{
+    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
+        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+        vduse_blk_disable_queue(vblk_exp->dev, vq);
+    }
+
+    vblk_exp->vqs_started = false;
+}
+
+static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
+{
+    vblk_exp->vqs_started = true;
+
+    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
+        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+        vduse_blk_enable_queue(vblk_exp->dev, vq);
+    }
+}
+
+static void vduse_blk_drained_begin(void *opaque)
+{
+    BlockExport *exp = opaque;
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    vduse_blk_stop_virtqueues(vblk_exp);
+}
+
+static void vduse_blk_drained_end(void *opaque)
+{
+    BlockExport *exp = opaque;
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    vduse_blk_start_virtqueues(vblk_exp);
+}
+
+static bool vduse_blk_drained_poll(void *opaque)
+{
+    BlockExport *exp = opaque;
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    return qatomic_read(&vblk_exp->inflight) > 0;
+}
+
 static const BlockDevOps vduse_block_ops = {
-    .resize_cb = vduse_blk_resize,
+    .resize_cb     = vduse_blk_resize,
+    .drained_begin = vduse_blk_drained_begin,
+    .drained_end   = vduse_blk_drained_end,
+    .drained_poll  = vduse_blk_drained_poll,
 };
 
 static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
@@ -268,6 +313,7 @@  static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
     vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
     vblk_exp->handler.logical_block_size = logical_block_size;
     vblk_exp->handler.writable = opts->writable;
+    vblk_exp->vqs_started = true;
 
     config.capacity =
             cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
@@ -322,14 +368,20 @@  static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
         vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
     }
 
-    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
+    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false,
                        on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
 
     blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                  vblk_exp);
-
     blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
 
+    /*
+     * We handle draining ourselves using an in-flight counter and by disabling
+     * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
+     * complete so the in-flight counter reaches zero.
+     */
+    blk_set_disable_request_queuing(exp->blk, true);
+
     return 0;
 err:
     vduse_dev_destroy(vblk_exp->dev);
@@ -344,6 +396,9 @@  static void vduse_blk_exp_delete(BlockExport *exp)
     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
     int ret;
 
+    assert(qatomic_read(&vblk_exp->inflight) == 0);
+
+    vduse_blk_detach_ctx(vblk_exp);
     blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                     vblk_exp);
     blk_set_dev_ops(exp->blk, NULL, NULL);
@@ -355,13 +410,12 @@  static void vduse_blk_exp_delete(BlockExport *exp)
     g_free(vblk_exp->handler.serial);
 }
 
+/* Called with exp->ctx acquired */
 static void vduse_blk_exp_request_shutdown(BlockExport *exp)
 {
     VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
 
-    aio_context_acquire(vblk_exp->export.ctx);
-    vduse_blk_detach_ctx(vblk_exp);
-    aio_context_acquire(vblk_exp->export.ctx);
+    vduse_blk_stop_virtqueues(vblk_exp);
 }
 
 const BlockExportDriver blk_exp_vduse_blk = {