Message ID: 20170621125249.8805-4-paul.durrant@citrix.com (mailing list archive)
State:      New, archived
CC'ing Andreas Färber. Could you please give a quick look below at the
way the iothread object is instantiated and destroyed? I am no object
model expert and would appreciate a second opinion.

On Wed, 21 Jun 2017, Paul Durrant wrote:
> This patch allocates an IOThread object for each xen_disk instance and
> sets the AIO context appropriately on connect. This allows processing
> of I/O to proceed in parallel.
>
> The patch also adds tracepoints into xen_disk to make it possible to
> follow the state transitions of an instance in the log.
>
> Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
[snip]
> @@ -599,9 +605,12 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
>  static void qemu_aio_complete(void *opaque, int ret)
>  {
>      struct ioreq *ioreq = opaque;
> +    struct XenBlkDev *blkdev = ioreq->blkdev;
> +
> +    aio_context_acquire(blkdev->ctx);

I think that Paolo was right that we need an aio_context_acquire here;
however, the issue is that with the current code the chain

  blk_handle_requests -> ioreq_runio_qemu_aio -> qemu_aio_complete

leads to aio_context_acquire being called twice on the same lock, which
I don't think is allowed?

I think we need to get rid of the qemu_aio_complete call from
ioreq_runio_qemu_aio, but to do that we need to be careful with the
accounting of aio_inflight (today it's incremented unconditionally at
the beginning of ioreq_runio_qemu_aio; I think we would have to change
that to increment it only if presync).

[snip]
> +    obj = object_new(TYPE_IOTHREAD);
> +    name = g_strdup_printf("iothread-%s", xendev->name);
> +
> +    object_property_add_child(object_get_objects_root(), name, obj, &err);
> +    assert(!err);

Would it be enough to call object_ref?

> +    g_free(name);
> +
> +    user_creatable_complete(obj, &err);

Why do we need to call this?

[snip]
> @@ -1358,6 +1408,7 @@ static int blk_free(struct XenDevice *xendev)
>      g_free(blkdev->dev);
>      g_free(blkdev->devtype);
>      qemu_bh_delete(blkdev->bh);
> +    object_unparent(OBJECT(blkdev->iothread));

Shouldn't this be object_unref?
> -----Original Message-----
> From: Stefano Stabellini [mailto:sstabellini@kernel.org]
> Sent: 22 June 2017 23:15
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; qemu-devel@nongnu.org;
> qemu-block@nongnu.org; Stefano Stabellini <sstabellini@kernel.org>;
> Anthony Perard <anthony.perard@citrix.com>; Kevin Wolf
> <kwolf@redhat.com>; Max Reitz <mreitz@redhat.com>; afaerber@suse.de
> Subject: Re: [PATCH v2 3/3] xen-disk: use an IOThread per instance
>
> CC'ing Andreas Färber. Could you please give a quick look below at the
> way the iothread object is instantiated and destroyed? I am no object
> model expert and would appreciate a second opinion.

I have not seen any response so far.

[snip]
> I think that Paolo was right that we need an aio_context_acquire here;
> however, the issue is that with the current code the chain
>
>   blk_handle_requests -> ioreq_runio_qemu_aio -> qemu_aio_complete
>
> leads to aio_context_acquire being called twice on the same lock,
> which I don't think is allowed?

It resolves to a qemu_rec_mutex_lock(), which I believe is a recursive
lock, so I think that's ok.

> I think we need to get rid of the qemu_aio_complete call from
> ioreq_runio_qemu_aio, but to do that we need to be careful with the
> accounting of aio_inflight (today it's incremented unconditionally at
> the beginning of ioreq_runio_qemu_aio; I think we would have to change
> that to increment it only if presync).

If the lock is indeed recursive then I think we can avoid this
complication.

[snip]
> > +    object_property_add_child(object_get_objects_root(), name, obj, &err);
> > +    assert(!err);
>
> Would it be enough to call object_ref?

You mean to avoid the assert? I guess so, but I think any failure here
would be indicative of a larger problem.

> > +    user_creatable_complete(obj, &err);
>
> Why do we need to call this?

I'm not entirely sure, but looking around the object code it seemed to
be a necessary part of instantiation. Maybe it is not required for
iothread objects, but I could not figure that out from looking at the
code, and comments in the header suggest it is harmless if it is not
required.

[snip]
> > +    object_unparent(OBJECT(blkdev->iothread));
>
> Shouldn't this be object_unref?

I don't think so. I think this is required to undo what was done by
calling object_property_add_child() on the root object.

Looking at other code such as object_new_with_propv(), it looks like
the right thing to do is to call object_unref() after calling
object_property_add_child(), to drop the implicit ref taken by
object_new(), so I'd need to add that call in blk_alloc().

  Paul
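For reference, the nesting in question here can be checked with a
standalone program. This is a minimal sketch using plain pthreads
rather than QEMU's wrappers (on POSIX hosts qemu_rec_mutex_lock()
ultimately takes a pthread mutex initialized as recursive, as confirmed
below); the call-chain comments are illustrative only:

#include <pthread.h>
#include <stdio.h>

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_mutex_t lock;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&lock, &attr);

    /* A recursive mutex may be taken again by the thread that already
     * holds it; each lock must be balanced by an unlock. */
    pthread_mutex_lock(&lock);      /* outer: think blk_bh() */
    pthread_mutex_lock(&lock);      /* inner: think qemu_aio_complete()
                                     * reached via ioreq_runio_qemu_aio() */
    pthread_mutex_unlock(&lock);
    pthread_mutex_unlock(&lock);

    pthread_mutex_destroy(&lock);
    pthread_mutexattr_destroy(&attr);
    printf("nested lock/unlock completed without deadlock\n");
    return 0;
}

With a non-recursive mutex the second lock would deadlock (or fail with
EDEADLK for an error-checking mutex), which is exactly the failure mode
the question is about.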
On Fri, 7 Jul 2017, Paul Durrant wrote:
> > -----Original Message-----
> > From: Stefano Stabellini [mailto:sstabellini@kernel.org]
> > Sent: 22 June 2017 23:15
[snip]
> > I think that Paolo was right that we need an aio_context_acquire here;
> > however, the issue is that with the current code the chain
> >
> >   blk_handle_requests -> ioreq_runio_qemu_aio -> qemu_aio_complete
> >
> > leads to aio_context_acquire being called twice on the same lock,
> > which I don't think is allowed?
>
> It resolves to a qemu_rec_mutex_lock(), which I believe is a recursive
> lock, so I think that's ok.

On Linux it becomes pthread_mutex_lock. The lock is created by
qemu_rec_mutex_init, which specifies PTHREAD_MUTEX_RECURSIVE, so yes,
it should be recursive. Good.

> > I think we need to get rid of the qemu_aio_complete call from
> > ioreq_runio_qemu_aio, but to do that we need to be careful with the
> > accounting of aio_inflight.
>
> If the lock is indeed recursive then I think we can avoid this
> complication.

OK

[snip]
> > Would it be enough to call object_ref?
>
> You mean to avoid the assert? I guess so, but I think any failure here
> would be indicative of a larger problem.

No, I meant calling object_ref instead of object_property_add_child.

[snip]
> > > +    object_unparent(OBJECT(blkdev->iothread));
> >
> > Shouldn't this be object_unref?
>
> I don't think so. I think this is required to undo what was done by
> calling object_property_add_child() on the root object.

Right, so if object_property_add_child is not actually required, then
you might be able to turn object_unparent into object_unref.

Unfortunately I don't know enough about QOM to be able to tell which is
the right way of doing things, but looking at
hw/block/dataplane/virtio-blk.c, it would seem that only object_ref and
object_unref are required?

> Looking at other code such as object_new_with_propv(), it looks like
> the right thing to do is to call object_unref() after calling
> object_property_add_child(), to drop the implicit ref taken by
> object_new(), so I'd need to add that call in blk_alloc().
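The hw/block/dataplane/virtio-blk.c pattern referred to above never
creates the IOThread itself; it only pins an object the user supplied,
which is why no parenting is involved there. A rough sketch of that
shape (field and variable names abbreviated, not a verbatim quote of
the file):

    /* setup: take a reference on a user-created IOThread so it cannot
     * be finalized while the dataplane is using it */
    object_ref(OBJECT(conf->iothread));
    s->iothread = conf->iothread;
    s->ctx = iothread_get_aio_context(s->iothread);

    /* teardown: drop the reference taken above */
    object_unref(OBJECT(s->iothread));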
> -----Original Message-----
[snip]
> > > > +    object_unparent(OBJECT(blkdev->iothread));
> > >
> > > Shouldn't this be object_unref?
> >
> > I don't think so. I think this is required to undo what was done by
> > calling object_property_add_child() on the root object.
>
> Right, so if object_property_add_child is not actually required, then
> you might be able to turn object_unparent into object_unref.
>
> Unfortunately I don't know enough about QOM to be able to tell which is
> the right way of doing things, but looking at
> hw/block/dataplane/virtio-blk.c, it would seem that only object_ref and
> object_unref are required?

I guess I can give it a try. I was working on the assumption that all
objects were required to have a parent, but maybe that's not true. Can
someone more familiar with QOM comment?

  Cheers,

    Paul

> > Looking at other code such as object_new_with_propv(), it looks like
> > the right thing to do is to call object_unref() after calling
> > object_property_add_child(), to drop the implicit ref taken by
> > object_new(), so I'd need to add that call in blk_alloc().
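Putting the two halves of the discussion together, the create/destroy
pairing being converged on would look roughly as below. This is a
QEMU-internal sketch, not the final patch: the helper names are
invented for illustration, and the QOM calls are the ones blk_alloc()
already makes plus the object_unref() Paul proposes adding.

static IOThread *xen_disk_iothread_create(struct XenDevice *xendev)
{
    Object *obj = object_new(TYPE_IOTHREAD);   /* refcount == 1, held by us */
    char *name = g_strdup_printf("iothread-%s", xendev->name);
    Error *err = NULL;

    /* the child property takes a reference of its own... */
    object_property_add_child(object_get_objects_root(), name, obj, &err);
    assert(!err);
    g_free(name);

    user_creatable_complete(obj, &err);        /* spawns the thread */
    assert(!err);

    /* ...so drop the implicit reference from object_new(); the parent
     * link is now what keeps the object alive */
    object_unref(obj);

    return IOTHREAD(obj);
}

static void xen_disk_iothread_destroy(IOThread *iothread)
{
    /* removing the child property drops the parent's (last) reference,
     * finalizing the object and stopping its thread */
    object_unparent(OBJECT(iothread));
}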
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 65e83dc258..608b24ba66 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -10,3 +10,10 @@ virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t offset,
 # hw/block/hd-geometry.c
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
 hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
+
+# hw/block/xen_disk.c
+xen_disk_alloc(char *name) "%s"
+xen_disk_init(char *name) "%s"
+xen_disk_connect(char *name) "%s"
+xen_disk_disconnect(char *name) "%s"
+xen_disk_free(char *name) "%s"
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 0e6513708e..8548195195 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -27,10 +27,13 @@
 #include "hw/xen/xen_backend.h"
 #include "xen_blkif.h"
 #include "sysemu/blockdev.h"
+#include "sysemu/iothread.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
+#include "qom/object_interfaces.h"
+#include "trace.h"
 
 /* ------------------------------------------------------------- */
 
@@ -128,6 +131,9 @@ struct XenBlkDev {
     DriveInfo *dinfo;
     BlockBackend *blk;
     QEMUBH *bh;
+
+    IOThread *iothread;
+    AioContext *ctx;
 };
 
 /* ------------------------------------------------------------- */
@@ -599,9 +605,12 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
 static void qemu_aio_complete(void *opaque, int ret)
 {
     struct ioreq *ioreq = opaque;
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+
+    aio_context_acquire(blkdev->ctx);
 
     if (ret != 0) {
-        xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
+        xen_pv_printf(&blkdev->xendev, 0, "%s I/O error\n",
                       ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
         ioreq->aio_errors++;
     }
@@ -610,13 +619,13 @@ static void qemu_aio_complete(void *opaque, int ret)
     if (ioreq->presync) {
         ioreq->presync = 0;
         ioreq_runio_qemu_aio(ioreq);
-        return;
+        goto done;
     }
     if (ioreq->aio_inflight > 0) {
-        return;
+        goto done;
     }
 
-    if (ioreq->blkdev->feature_grant_copy) {
+    if (blkdev->feature_grant_copy) {
         switch (ioreq->req.operation) {
         case BLKIF_OP_READ:
             /* in case of failure ioreq->aio_errors is increased */
@@ -638,7 +647,7 @@ static void qemu_aio_complete(void *opaque, int ret)
     }
 
     ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
-    if (!ioreq->blkdev->feature_grant_copy) {
+    if (!blkdev->feature_grant_copy) {
         ioreq_unmap(ioreq);
     }
     ioreq_finish(ioreq);
@@ -650,16 +659,19 @@ static void qemu_aio_complete(void *opaque, int ret)
     }
     case BLKIF_OP_READ:
         if (ioreq->status == BLKIF_RSP_OKAY) {
-            block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
         } else {
-            block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
         }
         break;
     case BLKIF_OP_DISCARD:
     default:
         break;
     }
-    qemu_bh_schedule(ioreq->blkdev->bh);
+    qemu_bh_schedule(blkdev->bh);
+
+done:
+    aio_context_release(blkdev->ctx);
 }
 
 static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
@@ -917,17 +929,40 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
 static void blk_bh(void *opaque)
 {
     struct XenBlkDev *blkdev = opaque;
+
+    aio_context_acquire(blkdev->ctx);
     blk_handle_requests(blkdev);
+    aio_context_release(blkdev->ctx);
 }
 
 static void blk_alloc(struct XenDevice *xendev)
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+    Object *obj;
+    char *name;
+    Error *err = NULL;
+
+    trace_xen_disk_alloc(xendev->name);
 
     QLIST_INIT(&blkdev->inflight);
     QLIST_INIT(&blkdev->finished);
     QLIST_INIT(&blkdev->freelist);
-    blkdev->bh = qemu_bh_new(blk_bh, blkdev);
+
+    obj = object_new(TYPE_IOTHREAD);
+    name = g_strdup_printf("iothread-%s", xendev->name);
+
+    object_property_add_child(object_get_objects_root(), name, obj, &err);
+    assert(!err);
+
+    g_free(name);
+
+    user_creatable_complete(obj, &err);
+    assert(!err);
+
+    blkdev->iothread = (IOThread *)object_dynamic_cast(obj, TYPE_IOTHREAD);
+    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
+    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
+
     if (xen_mode != XEN_EMULATE) {
         batch_maps = 1;
     }
@@ -954,6 +989,8 @@ static int blk_init(struct XenDevice *xendev)
     int info = 0;
     char *directiosafe = NULL;
 
+    trace_xen_disk_init(xendev->name);
+
     /* read xenstore entries */
     if (blkdev->params == NULL) {
         char *h = NULL;
@@ -1069,6 +1106,8 @@ static int blk_connect(struct XenDevice *xendev)
     unsigned int i;
     uint32_t *domids;
 
+    trace_xen_disk_connect(xendev->name);
+
     /* read-only ? */
     if (blkdev->directiosafe) {
         qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
@@ -1288,6 +1327,8 @@ static int blk_connect(struct XenDevice *xendev)
         blkdev->persistent_gnt_count = 0;
     }
 
+    blk_set_aio_context(blkdev->blk, blkdev->ctx);
+
     xen_be_bind_evtchn(&blkdev->xendev);
 
     xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, nr-ring-ref %u, "
@@ -1301,13 +1342,20 @@ static void blk_disconnect(struct XenDevice *xendev)
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 
+    trace_xen_disk_disconnect(xendev->name);
+
+    aio_context_acquire(blkdev->ctx);
+
     if (blkdev->blk) {
+        blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
         blk_detach_dev(blkdev->blk, blkdev);
         blk_unref(blkdev->blk);
         blkdev->blk = NULL;
     }
     xen_pv_unbind_evtchn(&blkdev->xendev);
 
+    aio_context_release(blkdev->ctx);
+
     if (blkdev->sring) {
         xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring,
                         blkdev->nr_ring_ref);
@@ -1341,6 +1389,8 @@ static int blk_free(struct XenDevice *xendev)
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
     struct ioreq *ioreq;
 
+    trace_xen_disk_free(xendev->name);
+
     if (blkdev->blk || blkdev->sring) {
         blk_disconnect(xendev);
     }
@@ -1358,6 +1408,7 @@ static int blk_free(struct XenDevice *xendev)
     g_free(blkdev->dev);
     g_free(blkdev->devtype);
     qemu_bh_delete(blkdev->bh);
+    object_unparent(OBJECT(blkdev->iothread));
 
     return 0;
 }
This patch allocates an IOThread object for each xen_disk instance and
sets the AIO context appropriately on connect. This allows processing
of I/O to proceed in parallel.

The patch also adds tracepoints into xen_disk to make it possible to
follow the state transitions of an instance in the log.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
---
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Anthony Perard <anthony.perard@citrix.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Max Reitz <mreitz@redhat.com>

v2:
 - explicitly acquire and release AIO context in qemu_aio_complete() and
   blk_bh()
---
 hw/block/trace-events | 7 ++++++
 hw/block/xen_disk.c   | 69 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 67 insertions(+), 9 deletions(-)
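For comparison, IOThread objects are normally created explicitly by the
user and wired to a device that exposes an iothread property; xen_disk
has no such property, which is why this patch creates one internally
per instance. A typical user-visible equivalent for virtio-blk
(illustrative command line, not from this series):

qemu-system-x86_64 \
    -object iothread,id=iothread0 \
    -drive file=disk.img,if=none,id=drive0,format=raw \
    -device virtio-blk-pci,drive=drive0,iothread=iothread0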