Message ID | 20190218130702.32575-9-jean-philippe.brucker@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Disk fixes and AIO reset | expand |
On 18/02/2019 13:07, Jean-Philippe Brucker wrote: > Add a call into the disk layer to synchronize the AIO queue. Wait for all > pending requests to complete. This will be necessary when resetting a > virtqueue. > > The wait() operation isn't the same as flush(). A VIRTIO_BLK_T_FLUSH > request ensures that any write request *that completed before the FLUSH is > sent* is committed to permanent storage (e.g. written back from a write > cache). But it doesn't do anything for requests that are still pending > when the FLUSH is sent. > > Avoid introducing a mutex on the io_submit() and io_getevents() paths, > because it can lead to 30% throughput drop on heavy FIO jobs. Instead > manage an inflight counter using compare-and-swap operations, which is > simple enough as the caller doesn't submit new requests while it waits for > the AIO queue to drain. The __sync_fetch_and_* operations are a bit rough > since they use full barriers, but that didn't seem to introduce a > performance regression. > > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com> I'm going to send a v2 because patch 4/9 breaks build of lkvm-static. Any comment or concern with this patch? I think it's correct, but wouldn't bet on it. 
Thanks, Jean > --- > disk/aio.c | 82 ++++++++++++++++++++++++---------------- > disk/blk.c | 1 + > disk/core.c | 8 ++++ > disk/raw.c | 2 + > include/kvm/disk-image.h | 9 +++++ > 5 files changed, 70 insertions(+), 32 deletions(-) > > diff --git a/disk/aio.c b/disk/aio.c > index 277ddf7c9..a7418c8c2 100644 > --- a/disk/aio.c > +++ b/disk/aio.c > @@ -2,45 +2,31 @@ > #include <pthread.h> > #include <sys/eventfd.h> > > +#include "kvm/brlock.h" > #include "kvm/disk-image.h" > #include "kvm/kvm.h" > #include "linux/list.h" > > #define AIO_MAX 256 > > -static int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, > - const struct iovec *iov, int iovcnt, off_t offset, > - int ev, void *param) > +static int aio_submit(struct disk_image *disk, int nr, struct iocb **ios) > { > - struct iocb *ios[1] = { iocb }; > int ret; > > - io_prep_pwritev(iocb, fd, iov, iovcnt, offset); > - io_set_eventfd(iocb, ev); > - iocb->data = param; > - > + __sync_fetch_and_add(&disk->aio_inflight, nr); > + /* > + * A wmb() is needed here, to ensure disk_aio_thread() sees this > + * increase after receiving the events. It is included in the > + * __sync_fetch_and_add (as a full barrier). 
> + */ > restart: > - ret = io_submit(ctx, 1, ios); > + ret = io_submit(disk->ctx, nr, ios); > if (ret == -EAGAIN) > goto restart; > - return ret; > -} > - > -static int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, > - const struct iovec *iov, int iovcnt, off_t offset, > - int ev, void *param) > -{ > - struct iocb *ios[1] = { iocb }; > - int ret; > + else if (ret <= 0) > + /* disk_aio_thread() is never going to see those */ > + __sync_fetch_and_sub(&disk->aio_inflight, nr); > > - io_prep_preadv(iocb, fd, iov, iovcnt, offset); > - io_set_eventfd(iocb, ev); > - iocb->data = param; > - > -restart: > - ret = io_submit(ctx, 1, ios); > - if (ret == -EAGAIN) > - goto restart; > return ret; > } > > @@ -48,22 +34,49 @@ ssize_t raw_image__read_async(struct disk_image *disk, u64 sector, > const struct iovec *iov, int iovcount, > void *param) > { > - u64 offset = sector << SECTOR_SHIFT; > struct iocb iocb; > + u64 offset = sector << SECTOR_SHIFT; > + struct iocb *ios[1] = { &iocb }; > > - return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, > - offset, disk->evt, param); > + io_prep_preadv(&iocb, disk->fd, iov, iovcount, offset); > + io_set_eventfd(&iocb, disk->evt); > + iocb.data = param; > + > + return aio_submit(disk, 1, ios); > } > > ssize_t raw_image__write_async(struct disk_image *disk, u64 sector, > const struct iovec *iov, int iovcount, > void *param) > { > - u64 offset = sector << SECTOR_SHIFT; > struct iocb iocb; > + u64 offset = sector << SECTOR_SHIFT; > + struct iocb *ios[1] = { &iocb }; > + > + io_prep_pwritev(&iocb, disk->fd, iov, iovcount, offset); > + io_set_eventfd(&iocb, disk->evt); > + iocb.data = param; > + > + return aio_submit(disk, 1, ios); > +} > > - return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, > - offset, disk->evt, param); > +/* > + * When this function returns there are no in-flight I/O. Caller ensures that > + * io_submit() isn't called concurrently. 
> + * > + * Returns an inaccurate number of I/O that was in-flight when the function was > + * called. > + */ > +int raw_image__wait(struct disk_image *disk) > +{ > + u64 inflight = disk->aio_inflight; > + > + while (disk->aio_inflight) { > + usleep(100); > + barrier(); > + } > + > + return inflight; > } > > static int disk_aio_get_events(struct disk_image *disk) > @@ -76,6 +89,11 @@ static int disk_aio_get_events(struct disk_image *disk) > nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime); > for (i = 0; i < nr; i++) > disk->disk_req_cb(event[i].data, event[i].res); > + > + /* Pairs with wmb() in aio_submit() */ > + rmb(); > + __sync_fetch_and_sub(&disk->aio_inflight, nr); > + > } while (nr > 0); > > return 0; > diff --git a/disk/blk.c b/disk/blk.c > index 48922e028..b4c9fba3b 100644 > --- a/disk/blk.c > +++ b/disk/blk.c > @@ -9,6 +9,7 @@ > static struct disk_image_operations blk_dev_ops = { > .read = raw_image__read, > .write = raw_image__write, > + .wait = raw_image__wait, > .async = true, > }; > > diff --git a/disk/core.c b/disk/core.c > index 89880703e..8d95c98e2 100644 > --- a/disk/core.c > +++ b/disk/core.c > @@ -201,6 +201,14 @@ error: > return err; > } > > +int disk_image__wait(struct disk_image *disk) > +{ > + if (disk->ops->wait) > + return disk->ops->wait(disk); > + > + return 0; > +} > + > int disk_image__flush(struct disk_image *disk) > { > if (disk->ops->flush) > diff --git a/disk/raw.c > index e869d6cc2..54b4e7408 100644 > --- a/disk/raw.c > +++ b/disk/raw.c > @@ -67,6 +67,7 @@ int raw_image__close(struct disk_image *disk) > static struct disk_image_operations raw_image_regular_ops = { > .read = raw_image__read, > .write = raw_image__write, > + .wait = raw_image__wait, > .async = true, > }; > > @@ -78,6 +79,7 @@ struct disk_image_operations ro_ops = { > > struct disk_image_operations ro_ops_nowrite = { > .read = raw_image__read, > + .wait = raw_image__wait, > .async = true, > }; > > diff --git a/include/kvm/disk-image.h
b/include/kvm/disk-image.h > index 2275e2343..27d4f7da5 100644 > --- a/include/kvm/disk-image.h > +++ b/include/kvm/disk-image.h > @@ -41,6 +41,7 @@ struct disk_image_operations { > ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov, > int iovcount, void *param); > int (*flush)(struct disk_image *disk); > + int (*wait)(struct disk_image *disk); > int (*close)(struct disk_image *disk); > bool async; > }; > @@ -70,6 +71,7 @@ struct disk_image { > io_context_t ctx; > int evt; > pthread_t thread; > + u64 aio_inflight; > #endif /* CONFIG_HAS_AIO */ > const char *wwpn; > const char *tpgt; > @@ -81,6 +83,7 @@ int disk_image__init(struct kvm *kvm); > int disk_image__exit(struct kvm *kvm); > struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap); > int disk_image__flush(struct disk_image *disk); > +int disk_image__wait(struct disk_image *disk); > ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, > int iovcount, void *param); > ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, > @@ -108,6 +111,7 @@ ssize_t raw_image__read_async(struct disk_image *disk, u64 sector, > const struct iovec *iov, int iovcount, void *param); > ssize_t raw_image__write_async(struct disk_image *disk, u64 sector, > const struct iovec *iov, int iovcount, void *param); > +int raw_image__wait(struct disk_image *disk); > > #define raw_image__read raw_image__read_async > #define raw_image__write raw_image__write_async > @@ -121,6 +125,11 @@ static inline int disk_aio_setup(struct disk_image *disk) > static inline void disk_aio_destroy(struct disk_image *disk) > { > } > + > +static inline int raw_image__wait(struct disk_image *disk) > +{ > + return 0; > +} > #define raw_image__read raw_image__read_sync > #define raw_image__write raw_image__write_sync > #endif /* CONFIG_HAS_AIO */ >
diff --git a/disk/aio.c b/disk/aio.c index 277ddf7c9..a7418c8c2 100644 --- a/disk/aio.c +++ b/disk/aio.c @@ -2,45 +2,31 @@ #include <pthread.h> #include <sys/eventfd.h> +#include "kvm/brlock.h" #include "kvm/disk-image.h" #include "kvm/kvm.h" #include "linux/list.h" #define AIO_MAX 256 -static int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, - const struct iovec *iov, int iovcnt, off_t offset, - int ev, void *param) +static int aio_submit(struct disk_image *disk, int nr, struct iocb **ios) { - struct iocb *ios[1] = { iocb }; int ret; - io_prep_pwritev(iocb, fd, iov, iovcnt, offset); - io_set_eventfd(iocb, ev); - iocb->data = param; - + __sync_fetch_and_add(&disk->aio_inflight, nr); + /* + * A wmb() is needed here, to ensure disk_aio_thread() sees this + * increase after receiving the events. It is included in the + * __sync_fetch_and_add (as a full barrier). + */ restart: - ret = io_submit(ctx, 1, ios); + ret = io_submit(disk->ctx, nr, ios); if (ret == -EAGAIN) goto restart; - return ret; -} - -static int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, - const struct iovec *iov, int iovcnt, off_t offset, - int ev, void *param) -{ - struct iocb *ios[1] = { iocb }; - int ret; + else if (ret <= 0) + /* disk_aio_thread() is never going to see those */ + __sync_fetch_and_sub(&disk->aio_inflight, nr); - io_prep_preadv(iocb, fd, iov, iovcnt, offset); - io_set_eventfd(iocb, ev); - iocb->data = param; - -restart: - ret = io_submit(ctx, 1, ios); - if (ret == -EAGAIN) - goto restart; return ret; } @@ -48,22 +34,49 @@ ssize_t raw_image__read_async(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param) { - u64 offset = sector << SECTOR_SHIFT; struct iocb iocb; + u64 offset = sector << SECTOR_SHIFT; + struct iocb *ios[1] = { &iocb }; - return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, - offset, disk->evt, param); + io_prep_preadv(&iocb, disk->fd, iov, iovcount, offset); + io_set_eventfd(&iocb, disk->evt); + 
iocb.data = param; + + return aio_submit(disk, 1, ios); } ssize_t raw_image__write_async(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param) { - u64 offset = sector << SECTOR_SHIFT; struct iocb iocb; + u64 offset = sector << SECTOR_SHIFT; + struct iocb *ios[1] = { &iocb }; + + io_prep_pwritev(&iocb, disk->fd, iov, iovcount, offset); + io_set_eventfd(&iocb, disk->evt); + iocb.data = param; + + return aio_submit(disk, 1, ios); +} - return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, - offset, disk->evt, param); +/* + * When this function returns there are no in-flight I/O. Caller ensures that + * io_submit() isn't called concurrently. + * + * Returns an inaccurate number of I/O that was in-flight when the function was + * called. + */ +int raw_image__wait(struct disk_image *disk) +{ + u64 inflight = disk->aio_inflight; + + while (disk->aio_inflight) { + usleep(100); + barrier(); + } + + return inflight; } static int disk_aio_get_events(struct disk_image *disk) @@ -76,6 +89,11 @@ static int disk_aio_get_events(struct disk_image *disk) nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime); for (i = 0; i < nr; i++) disk->disk_req_cb(event[i].data, event[i].res); + + /* Pairs with wmb() in aio_submit() */ + rmb(); + __sync_fetch_and_sub(&disk->aio_inflight, nr); + } while (nr > 0); return 0; diff --git a/disk/blk.c b/disk/blk.c index 48922e028..b4c9fba3b 100644 --- a/disk/blk.c +++ b/disk/blk.c @@ -9,6 +9,7 @@ static struct disk_image_operations blk_dev_ops = { .read = raw_image__read, .write = raw_image__write, + .wait = raw_image__wait, .async = true, }; diff --git a/disk/core.c b/disk/core.c index 89880703e..8d95c98e2 100644 --- a/disk/core.c +++ b/disk/core.c @@ -201,6 +201,14 @@ error: return err; } +int disk_image__wait(struct disk_image *disk) +{ + if (disk->ops->wait) + return disk->ops->wait(disk); + + return 0; +} + int disk_image__flush(struct disk_image *disk) { if (disk->ops->flush) diff --git 
a/disk/raw.c b/disk/raw.c index e869d6cc2..54b4e7408 100644 --- a/disk/raw.c +++ b/disk/raw.c @@ -67,6 +67,7 @@ int raw_image__close(struct disk_image *disk) static struct disk_image_operations raw_image_regular_ops = { .read = raw_image__read, .write = raw_image__write, + .wait = raw_image__wait, .async = true, }; @@ -78,6 +79,7 @@ struct disk_image_operations ro_ops = { struct disk_image_operations ro_ops_nowrite = { .read = raw_image__read, + .wait = raw_image__wait, .async = true, }; diff --git a/include/kvm/disk-image.h b/include/kvm/disk-image.h index 2275e2343..27d4f7da5 100644 --- a/include/kvm/disk-image.h +++ b/include/kvm/disk-image.h @@ -41,6 +41,7 @@ struct disk_image_operations { ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param); int (*flush)(struct disk_image *disk); + int (*wait)(struct disk_image *disk); int (*close)(struct disk_image *disk); bool async; }; @@ -70,6 +71,7 @@ struct disk_image { io_context_t ctx; int evt; pthread_t thread; + u64 aio_inflight; #endif /* CONFIG_HAS_AIO */ const char *wwpn; const char *tpgt; @@ -81,6 +83,7 @@ int disk_image__init(struct kvm *kvm); int disk_image__exit(struct kvm *kvm); struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap); int disk_image__flush(struct disk_image *disk); +int disk_image__wait(struct disk_image *disk); ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param); ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, @@ -108,6 +111,7 @@ ssize_t raw_image__read_async(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param); ssize_t raw_image__write_async(struct disk_image *disk, u64 sector, const struct iovec *iov, int iovcount, void *param); +int raw_image__wait(struct disk_image *disk); #define raw_image__read raw_image__read_async #define raw_image__write 
raw_image__write_async @@ -121,6 +125,11 @@ static inline int disk_aio_setup(struct disk_image *disk) static inline void disk_aio_destroy(struct disk_image *disk) { } + +static inline int raw_image__wait(struct disk_image *disk) +{ + return 0; +} #define raw_image__read raw_image__read_sync #define raw_image__write raw_image__write_sync #endif /* CONFIG_HAS_AIO */
Add a call into the disk layer to synchronize the AIO queue. Wait for all pending requests to complete. This will be necessary when resetting a virtqueue. The wait() operation isn't the same as flush(). A VIRTIO_BLK_T_FLUSH request ensures that any write request *that completed before the FLUSH is sent* is committed to permanent storage (e.g. written back from a write cache). But it doesn't do anything for requests that are still pending when the FLUSH is sent. Avoid introducing a mutex on the io_submit() and io_getevents() paths, because it can lead to 30% throughput drop on heavy FIO jobs. Instead manage an inflight counter using compare-and-swap operations, which is simple enough as the caller doesn't submit new requests while it waits for the AIO queue to drain. The __sync_fetch_and_* operations are a bit rough since they use full barriers, but that didn't seem to introduce a performance regression. Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com> --- disk/aio.c | 82 ++++++++++++++++++++++++---------------- disk/blk.c | 1 + disk/core.c | 8 ++++ disk/raw.c | 2 + include/kvm/disk-image.h | 9 +++++ 5 files changed, 70 insertions(+), 32 deletions(-)