diff mbox series

[kvmtool,8/9] disk/aio: Add wait() disk operation

Message ID 20190218130702.32575-9-jean-philippe.brucker@arm.com (mailing list archive)
State New, archived
Headers show
Series Disk fixes and AIO reset | expand

Commit Message

Jean-Philippe Brucker Feb. 18, 2019, 1:07 p.m. UTC
Add a call into the disk layer to synchronize the AIO queue. Wait for all
pending requests to complete. This will be necessary when resetting a
virtqueue.

The wait() operation isn't the same as flush(). A VIRTIO_BLK_T_FLUSH
request ensures that any write request *that completed before the FLUSH is
sent* is committed to permanent storage (e.g. written back from a write
cache). But it doesn't do anything for requests that are still pending
when the FLUSH is sent.

Avoid introducing a mutex on the io_submit() and io_getevents() paths,
because it can lead to 30% throughput drop on heavy FIO jobs. Instead
manage an inflight counter using compare-and-swap operations, which is
simple enough as the caller doesn't submit new requests while it waits for
the AIO queue to drain. The __sync_fetch_and_* operations are a bit rough
since they use full barriers, but that didn't seem to introduce a
performance regression.

Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
---
 disk/aio.c               | 82 ++++++++++++++++++++++++----------------
 disk/blk.c               |  1 +
 disk/core.c              |  8 ++++
 disk/raw.c               |  2 +
 include/kvm/disk-image.h |  9 +++++
 5 files changed, 70 insertions(+), 32 deletions(-)

Comments

Jean-Philippe Brucker March 21, 2019, 12:41 p.m. UTC | #1
On 18/02/2019 13:07, Jean-Philippe Brucker wrote:
> Add a call into the disk layer to synchronize the AIO queue. Wait for all
> pending requests to complete. This will be necessary when resetting a
> virtqueue.
> 
> The wait() operation isn't the same as flush(). A VIRTIO_BLK_T_FLUSH
> request ensures that any write request *that completed before the FLUSH is
> sent* is committed to permanent storage (e.g. written back from a write
> cache). But it doesn't do anything for requests that are still pending
> when the FLUSH is sent.
> 
> Avoid introducing a mutex on the io_submit() and io_getevents() paths,
> because it can lead to 30% throughput drop on heavy FIO jobs. Instead
> manage an inflight counter using compare-and-swap operations, which is
> simple enough as the caller doesn't submit new requests while it waits for
> the AIO queue to drain. The __sync_fetch_and_* operations are a bit rough
> since they use full barriers, but that didn't seem to introduce a
> performance regression.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

I'm going to send a v2 because patch 4/9 breaks build of lkvm-static.
Any comment or concern with this patch? I think it's correct, but
wouldn't bet on it.

Thanks,
Jean

> ---
>  disk/aio.c               | 82 ++++++++++++++++++++++++----------------
>  disk/blk.c               |  1 +
>  disk/core.c              |  8 ++++
>  disk/raw.c               |  2 +
>  include/kvm/disk-image.h |  9 +++++
>  5 files changed, 70 insertions(+), 32 deletions(-)
> 
> diff --git a/disk/aio.c b/disk/aio.c
> index 277ddf7c9..a7418c8c2 100644
> --- a/disk/aio.c
> +++ b/disk/aio.c
> @@ -2,45 +2,31 @@
>  #include <pthread.h>
>  #include <sys/eventfd.h>
>  
> +#include "kvm/brlock.h"
>  #include "kvm/disk-image.h"
>  #include "kvm/kvm.h"
>  #include "linux/list.h"
>  
>  #define AIO_MAX 256
>  
> -static int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd,
> -		       const struct iovec *iov, int iovcnt, off_t offset,
> -		       int ev, void *param)
> +static int aio_submit(struct disk_image *disk, int nr, struct iocb **ios)
>  {
> -	struct iocb *ios[1] = { iocb };
>  	int ret;
>  
> -	io_prep_pwritev(iocb, fd, iov, iovcnt, offset);
> -	io_set_eventfd(iocb, ev);
> -	iocb->data = param;
> -
> +	__sync_fetch_and_add(&disk->aio_inflight, nr);
> +	/*
> +	 * A wmb() is needed here, to ensure disk_aio_thread() sees this
> +	 * increase after receiving the events. It is included in the
> +	 * __sync_fetch_and_add (as a full barrier).
> +	 */
>  restart:
> -	ret = io_submit(ctx, 1, ios);
> +	ret = io_submit(disk->ctx, nr, ios);
>  	if (ret == -EAGAIN)
>  		goto restart;
> -	return ret;
> -}
> -
> -static int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd,
> -		      const struct iovec *iov, int iovcnt, off_t offset,
> -		      int ev, void *param)
> -{
> -	struct iocb *ios[1] = { iocb };
> -	int ret;
> +	else if (ret <= 0)
> +		/* disk_aio_thread() is never going to see those */
> +		__sync_fetch_and_sub(&disk->aio_inflight, nr);
>  
> -	io_prep_preadv(iocb, fd, iov, iovcnt, offset);
> -	io_set_eventfd(iocb, ev);
> -	iocb->data = param;
> -
> -restart:
> -	ret = io_submit(ctx, 1, ios);
> -	if (ret == -EAGAIN)
> -		goto restart;
>  	return ret;
>  }
>  
> @@ -48,22 +34,49 @@ ssize_t raw_image__read_async(struct disk_image *disk, u64 sector,
>  			      const struct iovec *iov, int iovcount,
>  			      void *param)
>  {
> -	u64 offset = sector << SECTOR_SHIFT;
>  	struct iocb iocb;
> +	u64 offset = sector << SECTOR_SHIFT;
> +	struct iocb *ios[1] = { &iocb };
>  
> -	return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount,
> -			  offset, disk->evt, param);
> +	io_prep_preadv(&iocb, disk->fd, iov, iovcount, offset);
> +	io_set_eventfd(&iocb, disk->evt);
> +	iocb.data = param;
> +
> +	return aio_submit(disk, 1, ios);
>  }
>  
>  ssize_t raw_image__write_async(struct disk_image *disk, u64 sector,
>  			       const struct iovec *iov, int iovcount,
>  			       void *param)
>  {
> -	u64 offset = sector << SECTOR_SHIFT;
>  	struct iocb iocb;
> +	u64 offset = sector << SECTOR_SHIFT;
> +	struct iocb *ios[1] = { &iocb };
> +
> +	io_prep_pwritev(&iocb, disk->fd, iov, iovcount, offset);
> +	io_set_eventfd(&iocb, disk->evt);
> +	iocb.data = param;
> +
> +	return aio_submit(disk, 1, ios);
> +}
>  
> -	return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount,
> -			   offset, disk->evt, param);
> +/*
> + * When this function returns there are no in-flight I/O. Caller ensures that
> + * io_submit() isn't called concurrently.
> + *
> + * Returns an inaccurate number of I/O that was in-flight when the function was
> + * called.
> + */
> +int raw_image__wait(struct disk_image *disk)
> +{
> +	u64 inflight = disk->aio_inflight;
> +
> +	while (disk->aio_inflight) {
> +		usleep(100);
> +		barrier();
> +	}
> +
> +	return inflight;
>  }
>  
>  static int disk_aio_get_events(struct disk_image *disk)
> @@ -76,6 +89,11 @@ static int disk_aio_get_events(struct disk_image *disk)
>  		nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime);
>  		for (i = 0; i < nr; i++)
>  			disk->disk_req_cb(event[i].data, event[i].res);
> +
> +		/* Pairs with wmb() in aio_submit() */
> +		rmb();
> +		__sync_fetch_and_sub(&disk->aio_inflight, nr);
> +
>  	} while (nr > 0);
>  
>  	return 0;
> diff --git a/disk/blk.c b/disk/blk.c
> index 48922e028..b4c9fba3b 100644
> --- a/disk/blk.c
> +++ b/disk/blk.c
> @@ -9,6 +9,7 @@
>  static struct disk_image_operations blk_dev_ops = {
>  	.read	= raw_image__read,
>  	.write	= raw_image__write,
> +	.wait	= raw_image__wait,
>  	.async	= true,
>  };
>  
> diff --git a/disk/core.c b/disk/core.c
> index 89880703e..8d95c98e2 100644
> --- a/disk/core.c
> +++ b/disk/core.c
> @@ -201,6 +201,14 @@ error:
>  	return err;
>  }
>  
> +int disk_image__wait(struct disk_image *disk)
> +{
> +	if (disk->ops->wait)
> +		return disk->ops->wait(disk);
> +
> +	return 0;
> +}
> +
>  int disk_image__flush(struct disk_image *disk)
>  {
>  	if (disk->ops->flush)
> diff --git a/disk/raw.c b/disk/raw.c
> index e869d6cc2..54b4e7408 100644
> --- a/disk/raw.c
> +++ b/disk/raw.c
> @@ -67,6 +67,7 @@ int raw_image__close(struct disk_image *disk)
>  static struct disk_image_operations raw_image_regular_ops = {
>  	.read	= raw_image__read,
>  	.write	= raw_image__write,
> +	.wait	= raw_image__wait,
>  	.async	= true,
>  };
>  
> @@ -78,6 +79,7 @@ struct disk_image_operations ro_ops = {
>  
>  struct disk_image_operations ro_ops_nowrite = {
>  	.read	= raw_image__read,
> +	.wait	= raw_image__wait,
>  	.async	= true,
>  };
>  
> diff --git a/include/kvm/disk-image.h b/include/kvm/disk-image.h
> index 2275e2343..27d4f7da5 100644
> --- a/include/kvm/disk-image.h
> +++ b/include/kvm/disk-image.h
> @@ -41,6 +41,7 @@ struct disk_image_operations {
>  	ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov,
>  			int iovcount, void *param);
>  	int (*flush)(struct disk_image *disk);
> +	int (*wait)(struct disk_image *disk);
>  	int (*close)(struct disk_image *disk);
>  	bool async;
>  };
> @@ -70,6 +71,7 @@ struct disk_image {
>  	io_context_t			ctx;
>  	int				evt;
>  	pthread_t			thread;
> +	u64				aio_inflight;
>  #endif /* CONFIG_HAS_AIO */
>  	const char			*wwpn;
>  	const char			*tpgt;
> @@ -81,6 +83,7 @@ int disk_image__init(struct kvm *kvm);
>  int disk_image__exit(struct kvm *kvm);
>  struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
>  int disk_image__flush(struct disk_image *disk);
> +int disk_image__wait(struct disk_image *disk);
>  ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
>  				int iovcount, void *param);
>  ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
> @@ -108,6 +111,7 @@ ssize_t raw_image__read_async(struct disk_image *disk, u64 sector,
>  			      const struct iovec *iov, int iovcount, void *param);
>  ssize_t raw_image__write_async(struct disk_image *disk, u64 sector,
>  			       const struct iovec *iov, int iovcount, void *param);
> +int raw_image__wait(struct disk_image *disk);
>  
>  #define raw_image__read		raw_image__read_async
>  #define raw_image__write	raw_image__write_async
> @@ -121,6 +125,11 @@ static inline int disk_aio_setup(struct disk_image *disk)
>  static inline void disk_aio_destroy(struct disk_image *disk)
>  {
>  }
> +
> +static inline int raw_image__wait(struct disk_image *disk)
> +{
> +	return 0;
> +}
>  #define raw_image__read		raw_image__read_sync
>  #define raw_image__write	raw_image__write_sync
>  #endif /* CONFIG_HAS_AIO */
>
diff mbox series

Patch

diff --git a/disk/aio.c b/disk/aio.c
index 277ddf7c9..a7418c8c2 100644
--- a/disk/aio.c
+++ b/disk/aio.c
@@ -2,45 +2,31 @@ 
 #include <pthread.h>
 #include <sys/eventfd.h>
 
+#include "kvm/brlock.h"
 #include "kvm/disk-image.h"
 #include "kvm/kvm.h"
 #include "linux/list.h"
 
 #define AIO_MAX 256
 
-static int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd,
-		       const struct iovec *iov, int iovcnt, off_t offset,
-		       int ev, void *param)
+static int aio_submit(struct disk_image *disk, int nr, struct iocb **ios)
 {
-	struct iocb *ios[1] = { iocb };
 	int ret;
 
-	io_prep_pwritev(iocb, fd, iov, iovcnt, offset);
-	io_set_eventfd(iocb, ev);
-	iocb->data = param;
-
+	__sync_fetch_and_add(&disk->aio_inflight, nr);
+	/*
+	 * A wmb() is needed here, to ensure disk_aio_thread() sees this
+	 * increase after receiving the events. It is included in the
+	 * __sync_fetch_and_add (as a full barrier).
+	 */
 restart:
-	ret = io_submit(ctx, 1, ios);
+	ret = io_submit(disk->ctx, nr, ios);
 	if (ret == -EAGAIN)
 		goto restart;
-	return ret;
-}
-
-static int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd,
-		      const struct iovec *iov, int iovcnt, off_t offset,
-		      int ev, void *param)
-{
-	struct iocb *ios[1] = { iocb };
-	int ret;
+	else if (ret <= 0)
+		/* disk_aio_thread() is never going to see those */
+		__sync_fetch_and_sub(&disk->aio_inflight, nr);
 
-	io_prep_preadv(iocb, fd, iov, iovcnt, offset);
-	io_set_eventfd(iocb, ev);
-	iocb->data = param;
-
-restart:
-	ret = io_submit(ctx, 1, ios);
-	if (ret == -EAGAIN)
-		goto restart;
 	return ret;
 }
 
@@ -48,22 +34,49 @@  ssize_t raw_image__read_async(struct disk_image *disk, u64 sector,
 			      const struct iovec *iov, int iovcount,
 			      void *param)
 {
-	u64 offset = sector << SECTOR_SHIFT;
 	struct iocb iocb;
+	u64 offset = sector << SECTOR_SHIFT;
+	struct iocb *ios[1] = { &iocb };
 
-	return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount,
-			  offset, disk->evt, param);
+	io_prep_preadv(&iocb, disk->fd, iov, iovcount, offset);
+	io_set_eventfd(&iocb, disk->evt);
+	iocb.data = param;
+
+	return aio_submit(disk, 1, ios);
 }
 
 ssize_t raw_image__write_async(struct disk_image *disk, u64 sector,
 			       const struct iovec *iov, int iovcount,
 			       void *param)
 {
-	u64 offset = sector << SECTOR_SHIFT;
 	struct iocb iocb;
+	u64 offset = sector << SECTOR_SHIFT;
+	struct iocb *ios[1] = { &iocb };
+
+	io_prep_pwritev(&iocb, disk->fd, iov, iovcount, offset);
+	io_set_eventfd(&iocb, disk->evt);
+	iocb.data = param;
+
+	return aio_submit(disk, 1, ios);
+}
 
-	return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount,
-			   offset, disk->evt, param);
+/*
+ * When this function returns there are no in-flight I/O. Caller ensures that
+ * io_submit() isn't called concurrently.
+ *
+ * Returns an inaccurate number of I/O that was in-flight when the function was
+ * called.
+ */
+int raw_image__wait(struct disk_image *disk)
+{
+	u64 inflight = disk->aio_inflight;
+
+	while (disk->aio_inflight) {
+		usleep(100);
+		barrier();
+	}
+
+	return inflight;
 }
 
 static int disk_aio_get_events(struct disk_image *disk)
@@ -76,6 +89,11 @@  static int disk_aio_get_events(struct disk_image *disk)
 		nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime);
 		for (i = 0; i < nr; i++)
 			disk->disk_req_cb(event[i].data, event[i].res);
+
+		/* Pairs with wmb() in aio_submit() */
+		rmb();
+		__sync_fetch_and_sub(&disk->aio_inflight, nr);
+
 	} while (nr > 0);
 
 	return 0;
diff --git a/disk/blk.c b/disk/blk.c
index 48922e028..b4c9fba3b 100644
--- a/disk/blk.c
+++ b/disk/blk.c
@@ -9,6 +9,7 @@ 
 static struct disk_image_operations blk_dev_ops = {
 	.read	= raw_image__read,
 	.write	= raw_image__write,
+	.wait	= raw_image__wait,
 	.async	= true,
 };
 
diff --git a/disk/core.c b/disk/core.c
index 89880703e..8d95c98e2 100644
--- a/disk/core.c
+++ b/disk/core.c
@@ -201,6 +201,14 @@  error:
 	return err;
 }
 
+int disk_image__wait(struct disk_image *disk)
+{
+	if (disk->ops->wait)
+		return disk->ops->wait(disk);
+
+	return 0;
+}
+
 int disk_image__flush(struct disk_image *disk)
 {
 	if (disk->ops->flush)
diff --git a/disk/raw.c b/disk/raw.c
index e869d6cc2..54b4e7408 100644
--- a/disk/raw.c
+++ b/disk/raw.c
@@ -67,6 +67,7 @@  int raw_image__close(struct disk_image *disk)
 static struct disk_image_operations raw_image_regular_ops = {
 	.read	= raw_image__read,
 	.write	= raw_image__write,
+	.wait	= raw_image__wait,
 	.async	= true,
 };
 
@@ -78,6 +79,7 @@  struct disk_image_operations ro_ops = {
 
 struct disk_image_operations ro_ops_nowrite = {
 	.read	= raw_image__read,
+	.wait	= raw_image__wait,
 	.async	= true,
 };
 
diff --git a/include/kvm/disk-image.h b/include/kvm/disk-image.h
index 2275e2343..27d4f7da5 100644
--- a/include/kvm/disk-image.h
+++ b/include/kvm/disk-image.h
@@ -41,6 +41,7 @@  struct disk_image_operations {
 	ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov,
 			int iovcount, void *param);
 	int (*flush)(struct disk_image *disk);
+	int (*wait)(struct disk_image *disk);
 	int (*close)(struct disk_image *disk);
 	bool async;
 };
@@ -70,6 +71,7 @@  struct disk_image {
 	io_context_t			ctx;
 	int				evt;
 	pthread_t			thread;
+	u64				aio_inflight;
 #endif /* CONFIG_HAS_AIO */
 	const char			*wwpn;
 	const char			*tpgt;
@@ -81,6 +83,7 @@  int disk_image__init(struct kvm *kvm);
 int disk_image__exit(struct kvm *kvm);
 struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
 int disk_image__flush(struct disk_image *disk);
+int disk_image__wait(struct disk_image *disk);
 ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
 				int iovcount, void *param);
 ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
@@ -108,6 +111,7 @@  ssize_t raw_image__read_async(struct disk_image *disk, u64 sector,
 			      const struct iovec *iov, int iovcount, void *param);
 ssize_t raw_image__write_async(struct disk_image *disk, u64 sector,
 			       const struct iovec *iov, int iovcount, void *param);
+int raw_image__wait(struct disk_image *disk);
 
 #define raw_image__read		raw_image__read_async
 #define raw_image__write	raw_image__write_async
@@ -121,6 +125,11 @@  static inline int disk_aio_setup(struct disk_image *disk)
 static inline void disk_aio_destroy(struct disk_image *disk)
 {
 }
+
+static inline int raw_image__wait(struct disk_image *disk)
+{
+	return 0;
+}
 #define raw_image__read		raw_image__read_sync
 #define raw_image__write	raw_image__write_sync
 #endif /* CONFIG_HAS_AIO */