Message ID | 20230420113732.336620-14-stefanha@redhat.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | block: remove aio_disable_external() API | expand |
Hi Stefan, On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > vduse_blk_detach_ctx() waits for in-flight requests using > AIO_WAIT_WHILE(). This is not allowed according to a comment in > bdrv_set_aio_context_commit(): > > /* > * Take the old AioContex when detaching it from bs. > * At this point, new_context lock is already acquired, and we are now > * also taking old_context. This is safe as long as bdrv_detach_aio_context > * does not call AIO_POLL_WHILE(). > */ > > Use this opportunity to rewrite the drain code in vduse-blk: > > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only > called when there are no more requests in flight. > > - Implement .drained_poll() so in-flight request coroutines are stopped > by the time .bdrv_detach_aio_context() is called. > > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the > .bdrv_detach_aio_context() constraint violation. It's no longer > needed due to the previous changes. > > - Always handle the VDUSE file descriptor, even in drained sections. The > VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in > drained sections. This ensures that the VDUSE kernel code gets a fast > response. > > - Suspend virtqueue fd handlers in .drained_begin() and resume them in > .drained_end(). This eliminates the need for the > aio_set_fd_handler(is_external=true) flag, which is being removed from > QEMU. > > This is a long list but splitting it into individual commits would > probably lead to git bisect failures - the changes are all related. > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > --- > block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------ > 1 file changed, 93 insertions(+), 39 deletions(-) > > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c > index f7ae44e3ce..35dc8fcf45 100644 > --- a/block/export/vduse-blk.c > +++ b/block/export/vduse-blk.c > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport { > VduseDev *dev; > uint16_t num_queues; > char *recon_file; > - unsigned int inflight; > + unsigned int inflight; /* atomic */ > + bool vqs_started; > } VduseBlkExport; > > typedef struct VduseBlkReq { > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq { > > static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) > { > - vblk_exp->inflight++; > + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) { I wonder why we need to use atomic operations here. > + /* Prevent export from being deleted */ > + aio_context_acquire(vblk_exp->export.ctx); > + blk_exp_ref(&vblk_exp->export); > + aio_context_release(vblk_exp->export.ctx); > + } > } > > static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp) > { > - if (--vblk_exp->inflight == 0) { > + if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) { > + /* Wake AIO_WAIT_WHILE() */ > aio_wait_kick(); > + > + /* Now the export can be deleted */ > + aio_context_acquire(vblk_exp->export.ctx); > + blk_exp_unref(&vblk_exp->export); > + aio_context_release(vblk_exp->export.ctx); > } > } > > @@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) > { > VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); > > + if (!vblk_exp->vqs_started) { > + return; /* vduse_blk_drained_end() will start vqs later */ > + } > + > aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), > - true, on_vduse_vq_kick, NULL, NULL, NULL, vq); > + false, on_vduse_vq_kick, NULL, NULL, NULL, vq); > /* Make sure we don't miss any kick afer reconnecting */ > eventfd_write(vduse_queue_get_fd(vq), 1); > } > @@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) > static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq) > { > VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); > + int fd = vduse_queue_get_fd(vq); > > - aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), > - true, NULL, NULL, NULL, NULL, NULL); > + if (fd < 0) { > + return; > + } > + > + aio_set_fd_handler(vblk_exp->export.ctx, fd, false, > + NULL, NULL, NULL, NULL, NULL); > } > > static const VduseOps vduse_blk_ops = { > @@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque) > > static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx) > { > - int i; > - > aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), > - true, on_vduse_dev_kick, NULL, NULL, NULL, > + false, on_vduse_dev_kick, NULL, NULL, NULL, > vblk_exp->dev); > > - for (i = 0; i < vblk_exp->num_queues; i++) { > - VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); > - int fd = vduse_queue_get_fd(vq); > - > - if (fd < 0) { > - continue; > - } > - aio_set_fd_handler(vblk_exp->export.ctx, fd, true, > - on_vduse_vq_kick, NULL, NULL, NULL, vq); > - } > + /* Virtqueues are handled by vduse_blk_drained_end() */ > } > > static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp) > { > - int i; > - > - for (i = 0; i < vblk_exp->num_queues; i++) { > - VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); > - int fd = vduse_queue_get_fd(vq); > - > - if (fd < 0) { > - continue; > - } > - aio_set_fd_handler(vblk_exp->export.ctx, fd, > - true, NULL, NULL, NULL, NULL, NULL); > - } > aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), > - true, NULL, NULL, NULL, NULL, NULL); > + false, NULL, NULL, NULL, NULL, NULL); > > - AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0); > + /* Virtqueues are handled by vduse_blk_drained_begin() */ > } > > > @@ -220,8 +218,55 @@ static void vduse_blk_resize(void *opaque) > (char *)&config.capacity); > } > > +static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp) > +{ > + for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { > + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); > + vduse_blk_disable_queue(vblk_exp->dev, vq); > + } > + > + vblk_exp->vqs_started = false; > +} > + > +static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp) > +{ > + vblk_exp->vqs_started = true; > + > + for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { > + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); > + vduse_blk_enable_queue(vblk_exp->dev, vq); > + } > +} > + > +static void vduse_blk_drained_begin(void *opaque) > +{ > + BlockExport *exp = opaque; > + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > + > + vduse_blk_stop_virtqueues(vblk_exp); > +} > + > +static void vduse_blk_drained_end(void *opaque) > +{ > + BlockExport *exp = opaque; > + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > + > + vduse_blk_start_virtqueues(vblk_exp); > +} > + > +static bool vduse_blk_drained_poll(void *opaque) > +{ > + BlockExport *exp = opaque; > + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > + > + return qatomic_read(&vblk_exp->inflight) > 0; > +} > + > static const BlockDevOps vduse_block_ops = { > - .resize_cb = vduse_blk_resize, > + .resize_cb = vduse_blk_resize, > + .drained_begin = vduse_blk_drained_begin, > + .drained_end = vduse_blk_drained_end, > + .drained_poll = vduse_blk_drained_poll, > }; > > static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, > @@ -268,6 +313,7 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, > vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: ""); > vblk_exp->handler.logical_block_size = logical_block_size; > vblk_exp->handler.writable = opts->writable; > + vblk_exp->vqs_started = true; > > config.capacity = > cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); > @@ -322,14 +368,20 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, > vduse_dev_setup_queue(vblk_exp->dev, i, queue_size); > } > > - aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true, > + aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false, > on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev); > > blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, > vblk_exp); > - > blk_set_dev_ops(exp->blk, &vduse_block_ops, exp); > > + /* > + * We handle draining ourselves using an in-flight counter and by disabling > + * virtqueue fd handlers. Do not queue BlockBackend requests, they need to > + * complete so the in-flight counter reaches zero. > + */ > + blk_set_disable_request_queuing(exp->blk, true); > + > return 0; > err: > vduse_dev_destroy(vblk_exp->dev); > @@ -344,6 +396,9 @@ static void vduse_blk_exp_delete(BlockExport *exp) > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > int ret; > > + assert(qatomic_read(&vblk_exp->inflight) == 0); > + > + vduse_blk_detach_ctx(vblk_exp); > blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, > vblk_exp); > blk_set_dev_ops(exp->blk, NULL, NULL); > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp) > g_free(vblk_exp->handler.serial); > } > > +/* Called with exp->ctx acquired */ > static void vduse_blk_exp_request_shutdown(BlockExport *exp) > { > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > > - aio_context_acquire(vblk_exp->export.ctx); > - vduse_blk_detach_ctx(vblk_exp); > - aio_context_acquire(vblk_exp->export.ctx); > + vduse_blk_stop_virtqueues(vblk_exp); Can we add a AIO_WAIT_WHILE() here? Then we don't need to increase/decrease the BlockExport refcount during I/O processing. Thanks, Yongji
On Fri, Apr 21, 2023 at 11:36:02AM +0800, Yongji Xie wrote: > Hi Stefan, > > On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > vduse_blk_detach_ctx() waits for in-flight requests using > > AIO_WAIT_WHILE(). This is not allowed according to a comment in > > bdrv_set_aio_context_commit(): > > > > /* > > * Take the old AioContex when detaching it from bs. > > * At this point, new_context lock is already acquired, and we are now > > * also taking old_context. This is safe as long as bdrv_detach_aio_context > > * does not call AIO_POLL_WHILE(). > > */ > > > > Use this opportunity to rewrite the drain code in vduse-blk: > > > > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only > > called when there are no more requests in flight. > > > > - Implement .drained_poll() so in-flight request coroutines are stopped > > by the time .bdrv_detach_aio_context() is called. > > > > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the > > .bdrv_detach_aio_context() constraint violation. It's no longer > > needed due to the previous changes. > > > > - Always handle the VDUSE file descriptor, even in drained sections. The > > VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in > > drained sections. This ensures that the VDUSE kernel code gets a fast > > response. > > > > - Suspend virtqueue fd handlers in .drained_begin() and resume them in > > .drained_end(). This eliminates the need for the > > aio_set_fd_handler(is_external=true) flag, which is being removed from > > QEMU. > > > > This is a long list but splitting it into individual commits would > > probably lead to git bisect failures - the changes are all related. > > > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > > --- > > block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------ > > 1 file changed, 93 insertions(+), 39 deletions(-) > > > > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c > > index f7ae44e3ce..35dc8fcf45 100644 > > --- a/block/export/vduse-blk.c > > +++ b/block/export/vduse-blk.c > > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport { > > VduseDev *dev; > > uint16_t num_queues; > > char *recon_file; > > - unsigned int inflight; > > + unsigned int inflight; /* atomic */ > > + bool vqs_started; > > } VduseBlkExport; > > > > typedef struct VduseBlkReq { > > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq { > > > > static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) > > { > > - vblk_exp->inflight++; > > + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) { > > I wonder why we need to use atomic operations here. The inflight counter is only modified by the vhost-user export thread, but it may be read by another thread here: static bool vduse_blk_drained_poll(void *opaque) { BlockExport *exp = opaque; VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); return qatomic_read(&vblk_exp->inflight) > 0; BlockDevOps->drained_poll() calls are invoked when BlockDriverStates are drained (e.g. blk_drain_all() and related APIs). > > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp) > > g_free(vblk_exp->handler.serial); > > } > > > > +/* Called with exp->ctx acquired */ > > static void vduse_blk_exp_request_shutdown(BlockExport *exp) > > { > > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > > > > - aio_context_acquire(vblk_exp->export.ctx); > > - vduse_blk_detach_ctx(vblk_exp); > > - aio_context_acquire(vblk_exp->export.ctx); > > + vduse_blk_stop_virtqueues(vblk_exp); > > Can we add a AIO_WAIT_WHILE() here? Then we don't need to > increase/decrease the BlockExport refcount during I/O processing. I don't think so because vduse_blk_exp_request_shutdown() is not the only place where we wait for requests to complete. There would still need to be away to wait for requests to finish (without calling AIO_WAIT_WHILE()) in vduse_blk_drained_poll(). Stefan
On Wed, Apr 26, 2023 at 12:43 AM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > On Fri, Apr 21, 2023 at 11:36:02AM +0800, Yongji Xie wrote: > > Hi Stefan, > > > > On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > > > vduse_blk_detach_ctx() waits for in-flight requests using > > > AIO_WAIT_WHILE(). This is not allowed according to a comment in > > > bdrv_set_aio_context_commit(): > > > > > > /* > > > * Take the old AioContex when detaching it from bs. > > > * At this point, new_context lock is already acquired, and we are now > > > * also taking old_context. This is safe as long as bdrv_detach_aio_context > > > * does not call AIO_POLL_WHILE(). > > > */ > > > > > > Use this opportunity to rewrite the drain code in vduse-blk: > > > > > > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only > > > called when there are no more requests in flight. > > > > > > - Implement .drained_poll() so in-flight request coroutines are stopped > > > by the time .bdrv_detach_aio_context() is called. > > > > > > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the > > > .bdrv_detach_aio_context() constraint violation. It's no longer > > > needed due to the previous changes. > > > > > > - Always handle the VDUSE file descriptor, even in drained sections. The > > > VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in > > > drained sections. This ensures that the VDUSE kernel code gets a fast > > > response. > > > > > > - Suspend virtqueue fd handlers in .drained_begin() and resume them in > > > .drained_end(). This eliminates the need for the > > > aio_set_fd_handler(is_external=true) flag, which is being removed from > > > QEMU. > > > > > > This is a long list but splitting it into individual commits would > > > probably lead to git bisect failures - the changes are all related. > > > > > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > > > --- > > > block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------ > > > 1 file changed, 93 insertions(+), 39 deletions(-) > > > > > > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c > > > index f7ae44e3ce..35dc8fcf45 100644 > > > --- a/block/export/vduse-blk.c > > > +++ b/block/export/vduse-blk.c > > > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport { > > > VduseDev *dev; > > > uint16_t num_queues; > > > char *recon_file; > > > - unsigned int inflight; > > > + unsigned int inflight; /* atomic */ > > > + bool vqs_started; > > > } VduseBlkExport; > > > > > > typedef struct VduseBlkReq { > > > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq { > > > > > > static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) > > > { > > > - vblk_exp->inflight++; > > > + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) { > > > > I wonder why we need to use atomic operations here. > > The inflight counter is only modified by the vhost-user export thread, > but it may be read by another thread here: > I see. I mean is it enough to just use volatile keywords here, since the writers would not access the variable concurrently. > static bool vduse_blk_drained_poll(void *opaque) > { > BlockExport *exp = opaque; > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > > return qatomic_read(&vblk_exp->inflight) > 0; > > BlockDevOps->drained_poll() calls are invoked when BlockDriverStates are > drained (e.g. blk_drain_all() and related APIs). > > > > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp) > > > g_free(vblk_exp->handler.serial); > > > } > > > > > > +/* Called with exp->ctx acquired */ > > > static void vduse_blk_exp_request_shutdown(BlockExport *exp) > > > { > > > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); > > > > > > - aio_context_acquire(vblk_exp->export.ctx); > > > - vduse_blk_detach_ctx(vblk_exp); > > > - aio_context_acquire(vblk_exp->export.ctx); > > > + vduse_blk_stop_virtqueues(vblk_exp); > > > > Can we add a AIO_WAIT_WHILE() here? Then we don't need to > > increase/decrease the BlockExport refcount during I/O processing. > > I don't think so because vduse_blk_exp_request_shutdown() is not the > only place where we wait for requests to complete. There would still > need to be away to wait for requests to finish (without calling > AIO_WAIT_WHILE()) in vduse_blk_drained_poll(). > But the BlockExport would not be freed until we call vduse_blk_exp_request_shutdown(). If we can ensure that there will be no inflight I/O after we call vduse_blk_exp_request_shutdown(), the BlockExport can be freed safely without increasing/decreasing the BlockExport refcount during I/O processing. Thanks, Yongji
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c index f7ae44e3ce..35dc8fcf45 100644 --- a/block/export/vduse-blk.c +++ b/block/export/vduse-blk.c @@ -31,7 +31,8 @@ typedef struct VduseBlkExport { VduseDev *dev; uint16_t num_queues; char *recon_file; - unsigned int inflight; + unsigned int inflight; /* atomic */ + bool vqs_started; } VduseBlkExport; typedef struct VduseBlkReq { @@ -41,13 +42,24 @@ typedef struct VduseBlkReq { static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) { - vblk_exp->inflight++; + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) { + /* Prevent export from being deleted */ + aio_context_acquire(vblk_exp->export.ctx); + blk_exp_ref(&vblk_exp->export); + aio_context_release(vblk_exp->export.ctx); + } } static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp) { - if (--vblk_exp->inflight == 0) { + if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) { + /* Wake AIO_WAIT_WHILE() */ aio_wait_kick(); + + /* Now the export can be deleted */ + aio_context_acquire(vblk_exp->export.ctx); + blk_exp_unref(&vblk_exp->export); + aio_context_release(vblk_exp->export.ctx); } } @@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) { VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); + if (!vblk_exp->vqs_started) { + return; /* vduse_blk_drained_end() will start vqs later */ + } + aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), - true, on_vduse_vq_kick, NULL, NULL, NULL, vq); + false, on_vduse_vq_kick, NULL, NULL, NULL, vq); /* Make sure we don't miss any kick afer reconnecting */ eventfd_write(vduse_queue_get_fd(vq), 1); } @@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq) { VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); + int fd = vduse_queue_get_fd(vq); - aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), - true, NULL, NULL, NULL, NULL, NULL); + if (fd < 0) { + return; + } + + aio_set_fd_handler(vblk_exp->export.ctx, fd, false, + NULL, NULL, NULL, NULL, NULL); } static const VduseOps vduse_blk_ops = { @@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque) static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx) { - int i; - aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), - true, on_vduse_dev_kick, NULL, NULL, NULL, + false, on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev); - for (i = 0; i < vblk_exp->num_queues; i++) { - VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); - int fd = vduse_queue_get_fd(vq); - - if (fd < 0) { - continue; - } - aio_set_fd_handler(vblk_exp->export.ctx, fd, true, - on_vduse_vq_kick, NULL, NULL, NULL, vq); - } + /* Virtqueues are handled by vduse_blk_drained_end() */ } static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp) { - int i; - - for (i = 0; i < vblk_exp->num_queues; i++) { - VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); - int fd = vduse_queue_get_fd(vq); - - if (fd < 0) { - continue; - } - aio_set_fd_handler(vblk_exp->export.ctx, fd, - true, NULL, NULL, NULL, NULL, NULL); - } aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), - true, NULL, NULL, NULL, NULL, NULL); + false, NULL, NULL, NULL, NULL, NULL); - AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0); + /* Virtqueues are handled by vduse_blk_drained_begin() */ } @@ -220,8 +218,55 @@ static void vduse_blk_resize(void *opaque) (char *)&config.capacity); } +static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp) +{ + for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); + vduse_blk_disable_queue(vblk_exp->dev, vq); + } + + vblk_exp->vqs_started = false; +} + +static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp) +{ + vblk_exp->vqs_started = true; + + for (uint16_t i = 0; i < vblk_exp->num_queues; i++) { + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); + vduse_blk_enable_queue(vblk_exp->dev, vq); + } +} + +static void vduse_blk_drained_begin(void *opaque) +{ + BlockExport *exp = opaque; + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); + + vduse_blk_stop_virtqueues(vblk_exp); +} + +static void vduse_blk_drained_end(void *opaque) +{ + BlockExport *exp = opaque; + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); + + vduse_blk_start_virtqueues(vblk_exp); +} + +static bool vduse_blk_drained_poll(void *opaque) +{ + BlockExport *exp = opaque; + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); + + return qatomic_read(&vblk_exp->inflight) > 0; +} + static const BlockDevOps vduse_block_ops = { - .resize_cb = vduse_blk_resize, + .resize_cb = vduse_blk_resize, + .drained_begin = vduse_blk_drained_begin, + .drained_end = vduse_blk_drained_end, + .drained_poll = vduse_blk_drained_poll, }; static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, @@ -268,6 +313,7 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: ""); vblk_exp->handler.logical_block_size = logical_block_size; vblk_exp->handler.writable = opts->writable; + vblk_exp->vqs_started = true; config.capacity = cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); @@ -322,14 +368,20 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, vduse_dev_setup_queue(vblk_exp->dev, i, queue_size); } - aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true, + aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false, on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev); blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, vblk_exp); - blk_set_dev_ops(exp->blk, &vduse_block_ops, exp); + /* + * We handle draining ourselves using an in-flight counter and by disabling + * virtqueue fd handlers. Do not queue BlockBackend requests, they need to + * complete so the in-flight counter reaches zero. + */ + blk_set_disable_request_queuing(exp->blk, true); + return 0; err: vduse_dev_destroy(vblk_exp->dev); @@ -344,6 +396,9 @@ static void vduse_blk_exp_delete(BlockExport *exp) VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); int ret; + assert(qatomic_read(&vblk_exp->inflight) == 0); + + vduse_blk_detach_ctx(vblk_exp); blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, vblk_exp); blk_set_dev_ops(exp->blk, NULL, NULL); @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp) g_free(vblk_exp->handler.serial); } +/* Called with exp->ctx acquired */ static void vduse_blk_exp_request_shutdown(BlockExport *exp) { VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); - aio_context_acquire(vblk_exp->export.ctx); - vduse_blk_detach_ctx(vblk_exp); - aio_context_acquire(vblk_exp->export.ctx); + vduse_blk_stop_virtqueues(vblk_exp); } const BlockExportDriver blk_exp_vduse_blk = {
vduse_blk_detach_ctx() waits for in-flight requests using AIO_WAIT_WHILE(). This is not allowed according to a comment in bdrv_set_aio_context_commit(): /* * Take the old AioContex when detaching it from bs. * At this point, new_context lock is already acquired, and we are now * also taking old_context. This is safe as long as bdrv_detach_aio_context * does not call AIO_POLL_WHILE(). */ Use this opportunity to rewrite the drain code in vduse-blk: - Use the BlockExport refcount so that vduse_blk_exp_delete() is only called when there are no more requests in flight. - Implement .drained_poll() so in-flight request coroutines are stopped by the time .bdrv_detach_aio_context() is called. - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the .bdrv_detach_aio_context() constraint violation. It's no longer needed due to the previous changes. - Always handle the VDUSE file descriptor, even in drained sections. The VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in drained sections. This ensures that the VDUSE kernel code gets a fast response. - Suspend virtqueue fd handlers in .drained_begin() and resume them in .drained_end(). This eliminates the need for the aio_set_fd_handler(is_external=true) flag, which is being removed from QEMU. This is a long list but splitting it into individual commits would probably lead to git bisect failures - the changes are all related. Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> --- block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 39 deletions(-)