
[v5,1/2] virtio-blk: support polling I/O

Message ID 20220405150924.147021-2-suwan.kim027@gmail.com (mailing list archive)
State New, archived
Series virtio-blk: support polling I/O and mq_ops->queue_rqs()

Commit Message

Suwan Kim April 5, 2022, 3:09 p.m. UTC
This patch adds polling I/O support to the virtio-blk driver. Polling
is enabled by the module parameter "poll_queues", which sets up
dedicated polling queues for virtio-blk. This patch improves polling
I/O throughput and latency.

The virtio-blk driver currently has neither a poll function nor poll
queues, so it operates in an interrupt-driven manner even when the
polling function is called from the upper layer.

virtio-blk polling is implemented on top of the block layer's batched
completion. virtblk_poll() queues completed requests to
io_comp_batch->req_list, and later virtblk_complete_batch() calls the
unmap function and ends the requests in a batch.
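
The poll path boils down to the following (a condensed sketch of
virtblk_poll() and virtblk_complete_batch() from the patch at the
bottom of this page; declarations and locking are omitted here):

	/* virtblk_poll(): drain completed buffers from the poll virtqueue
	 * and try to add each request to the completion batch; fall back
	 * to a regular per-request completion if batching is not possible.
	 */
	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
		struct request *req = blk_mq_rq_from_pdu(vbr);

		found++;
		if (!blk_mq_add_to_batch(req, iob, vbr->status,
					 virtblk_complete_batch))
			blk_mq_complete_request(req);
	}

	/* virtblk_complete_batch(): unmap and clean up every batched
	 * request, then end them all in one call.
	 */
	rq_list_for_each(&iob->req_list, req) {
		virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
		virtblk_cleanup_cmd(req);
	}
	blk_mq_end_request_batch(iob);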

virtio-blk reads the number of poll queues from the module parameter
"poll_queues". If the VM sets the queue parameters as below
("num-queues=N" [QEMU property], "poll_queues=M" [module parameter]),
the driver allocates N virtqueues to virtio_blk->vqs[N] and uses
[0..(N-M-1)] as default queues and [(N-M)..(N-1)] as poll queues.
Unlike the default queues, the poll queues have no callback function.
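
A condensed sketch of that split, taken from the init_vq() changes in
the patch below (error handling and allocations omitted):

	num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);

	/* Queues [0..(N-M-1)]: default queues with an interrupt callback. */
	for (i = 0; i < num_vqs - num_poll_vqs; i++) {
		callbacks[i] = virtblk_done;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		names[i] = vblk->vqs[i].name;
	}

	/* Queues [(N-M)..(N-1)]: poll queues with no callback. */
	for (; i < num_vqs; i++) {
		callbacks[i] = NULL;
		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
		names[i] = vblk->vqs[i].name;
	}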

Regarding the HW-SW queue mapping, the default queues use the existing
method that considers the MSI irq vectors. The poll queues have no irq,
so they use the regular blk-mq CPU mapping instead.
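
The mapping itself is a short loop in virtblk_map_queues() (condensed
from the patch below):

	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = vblk->io_queues[i];
		map->queue_offset = qoff;
		qoff += map->nr_queues;

		if (map->nr_queues == 0)
			continue;

		/* Default/read maps keep the irq-vector based virtio
		 * mapping; the poll map has no irq and falls back to the
		 * generic blk-mq CPU mapping.
		 */
		if (i != HCTX_TYPE_POLL)
			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
		else
			blk_mq_map_queues(&set->map[i]);
	}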

To verify the improvement, I ran fio polled I/O performance tests with
the io_uring engine using the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64, numjobs=N)
The VM was configured with 4 vcpus and 4 virtio-blk queues - 2 default
queues and 2 poll queues.

As a result, IOPS and average latency improved by about 10%.

Test result:

- Fio io_uring poll without virtio-blk poll support
	-- numjobs=1 : IOPS = 339K, avg latency = 188.33us
	-- numjobs=2 : IOPS = 367K, avg latency = 347.33us
	-- numjobs=4 : IOPS = 383K, avg latency = 682.06us

- Fio io_uring poll with virtio-blk poll support
	-- numjobs=1 : IOPS = 385K, avg latency = 165.94us
	-- numjobs=2 : IOPS = 408K, avg latency = 313.28us
	-- numjobs=4 : IOPS = 424K, avg latency = 613.05us

Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
---
 drivers/block/virtio_blk.c | 115 ++++++++++++++++++++++++++++++++++---
 1 file changed, 108 insertions(+), 7 deletions(-)

Comments

Elliott, Robert (Servers) April 6, 2022, 1:43 a.m. UTC | #1
> -----Original Message-----
> From: Suwan Kim <suwan.kim027@gmail.com>
> Sent: Tuesday, April 5, 2022 10:09 AM
> Subject: [PATCH v5 1/2] virtio-blk: support polling I/O
> 
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> @@ -81,6 +85,7 @@ struct virtio_blk {
> 
> 	/* num of vqs */
> 	int num_vqs;
> +	int io_queues[HCTX_MAX_TYPES];
>  	struct virtio_blk_vq *vqs;
...
>  };
> @@ -565,6 +572,18 @@ static int init_vq(struct virtio_blk *vblk)
>  			min_not_zero(num_request_queues, nr_cpu_ids),
>  			num_vqs);
> 
> +	num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
> +
> +	memset(vblk->io_queues, 0, sizeof(int) * HCTX_MAX_TYPES);

Using
    sizeof(vblk->io_queues)
would automatically follow any changes in the definition of that field,
similar to the line below:

...
>  	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
Christoph Hellwig April 6, 2022, 5 a.m. UTC | #2
On Wed, Apr 06, 2022 at 12:09:23AM +0900, Suwan Kim wrote:
> +        for (i = 0; i < num_vqs - num_poll_vqs; i++) {
> +                callbacks[i] = virtblk_done;
> +                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
> +                names[i] = vblk->vqs[i].name;
> +        }
> +
> +        for (; i < num_vqs; i++) {
> +                callbacks[i] = NULL;
> +                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
> +                names[i] = vblk->vqs[i].name;
> +        }

This uses spaces for indentation.

> +		/*
> +		 * Regular queues have interrupts and hence CPU affinity is
> +		 * defined by the core virtio code, but polling queues have
> +		 * no interrupts so we let the block layer assign CPU affinity.
> +		 */
> +		if (i != HCTX_TYPE_POLL)
> +			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
> +		else
> +			blk_mq_map_queues(&set->map[i]);

Nit, but I would have just done a "positive" check here as that is a bit
easier to read:

		if (i == HCTX_TYPE_POLL)
			blk_mq_map_queues(&set->map[i]);
		else
			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Suwan Kim April 6, 2022, 1:36 p.m. UTC | #3
On Wed, Apr 06, 2022 at 01:43:55AM +0000, Elliott, Robert (Servers) wrote:
> 
> 
> > -----Original Message-----
> > From: Suwan Kim <suwan.kim027@gmail.com>
> > Sent: Tuesday, April 5, 2022 10:09 AM
> > Subject: [PATCH v5 1/2] virtio-blk: support polling I/O
> > 
> > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> > @@ -81,6 +85,7 @@ struct virtio_blk {
> > 
> > 	/* num of vqs */
> > 	int num_vqs;
> > +	int io_queues[HCTX_MAX_TYPES];
> >  	struct virtio_blk_vq *vqs;
> ...
> >  };
> > @@ -565,6 +572,18 @@ static int init_vq(struct virtio_blk *vblk)
> >  			min_not_zero(num_request_queues, nr_cpu_ids),
> >  			num_vqs);
> > 
> > +	num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
> > +
> > +	memset(vblk->io_queues, 0, sizeof(int) * HCTX_MAX_TYPES);
> 
> Using
>     sizeof(vblk->io_queues)
> would automatically follow any changes in the definition of that field,
> similar to the line below:

Thanks for the feedback. I think the memset is unnecessary because
all entries of vblk->io_queues[] are set explicitly.
I will remove it.
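
For reference, dropping the memset leaves only the explicit per-type
assignments that the patch already makes:

	vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
	vblk->io_queues[HCTX_TYPE_READ] = 0;
	vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;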

Regards,
Suwan Kim
Suwan Kim April 6, 2022, 1:41 p.m. UTC | #4
On Tue, Apr 05, 2022 at 10:00:29PM -0700, Christoph Hellwig wrote:
> On Wed, Apr 06, 2022 at 12:09:23AM +0900, Suwan Kim wrote:
> > +        for (i = 0; i < num_vqs - num_poll_vqs; i++) {
> > +                callbacks[i] = virtblk_done;
> > +                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
> > +                names[i] = vblk->vqs[i].name;
> > +        }
> > +
> > +        for (; i < num_vqs; i++) {
> > +                callbacks[i] = NULL;
> > +                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
> > +                names[i] = vblk->vqs[i].name;
> > +        }
> 
> This uses spaces for indentation.

Oh my mistake. I will fix it.

> > +		/*
> > +		 * Regular queues have interrupts and hence CPU affinity is
> > +		 * defined by the core virtio code, but polling queues have
> > +		 * no interrupts so we let the block layer assign CPU affinity.
> > +		 */
> > +		if (i != HCTX_TYPE_POLL)
> > +			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
> > +		else
> > +			blk_mq_map_queues(&set->map[i]);
> 
> Nit, but I would have just done a "positive" check here as that is a bit
> easier to read:
> 
> 		if (i == HCTX_TYPE_POLL)
> 			blk_mq_map_queues(&set->map[i]);
> 		else
> 			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);

I agree. I will fix it.
Thanks for the feedback!

Regards,
Suwan Kim

>
> Otherwise looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Max Gurtovoy April 6, 2022, 2:10 p.m. UTC | #5
On 4/6/2022 4:41 PM, Suwan Kim wrote:
> On Tue, Apr 05, 2022 at 10:00:29PM -0700, Christoph Hellwig wrote:
>> On Wed, Apr 06, 2022 at 12:09:23AM +0900, Suwan Kim wrote:
>>> +        for (i = 0; i < num_vqs - num_poll_vqs; i++) {
>>> +                callbacks[i] = virtblk_done;
>>> +                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
>>> +                names[i] = vblk->vqs[i].name;
>>> +        }
>>> +
>>> +        for (; i < num_vqs; i++) {
>>> +                callbacks[i] = NULL;
>>> +                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
>>> +                names[i] = vblk->vqs[i].name;
>>> +        }
>> This uses spaces for indentation.
> Oh my mistake. I will fix it.
>
>>> +		/*
>>> +		 * Regular queues have interrupts and hence CPU affinity is
>>> +		 * defined by the core virtio code, but polling queues have
>>> +		 * no interrupts so we let the block layer assign CPU affinity.
>>> +		 */
>>> +		if (i != HCTX_TYPE_POLL)
>>> +			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
>>> +		else
>>> +			blk_mq_map_queues(&set->map[i]);
>> Nit, but I would have just done a "positive" check here as that is a bit
>> easier to read:
>>
>> 		if (i == HCTX_TYPE_POLL)
>> 			blk_mq_map_queues(&set->map[i]);
>> 		else
>> 			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
> I agree. I will fix it.
> Thanks for the feedback!

Looks good with Christoph's and Robert's comments addressed (memset removal).

Please run checkpatch.pl on the new version to verify we don't have more
issues such as spaces vs. tabs.

Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>

>
> Regards,
> Suwan Kim
>
>> Otherwise looks good:
>>
>> Reviewed-by: Christoph Hellwig <hch@lst.de>

Patch

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8c415be86732..51eea2a49e11 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -37,6 +37,10 @@  MODULE_PARM_DESC(num_request_queues,
 		 "0 for no limit. "
 		 "Values > nr_cpu_ids truncated to nr_cpu_ids.");
 
+static unsigned int poll_queues;
+module_param(poll_queues, uint, 0644);
+MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/O");
+
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
@@ -81,6 +85,7 @@  struct virtio_blk {
 
 	/* num of vqs */
 	int num_vqs;
+	int io_queues[HCTX_MAX_TYPES];
 	struct virtio_blk_vq *vqs;
 };
 
@@ -548,6 +553,7 @@  static int init_vq(struct virtio_blk *vblk)
 	const char **names;
 	struct virtqueue **vqs;
 	unsigned short num_vqs;
+	unsigned int num_poll_vqs;
 	struct virtio_device *vdev = vblk->vdev;
 	struct irq_affinity desc = { 0, };
 
@@ -556,6 +562,7 @@  static int init_vq(struct virtio_blk *vblk)
 				   &num_vqs);
 	if (err)
 		num_vqs = 1;
+
 	if (!err && !num_vqs) {
 		dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
 		return -EINVAL;
@@ -565,6 +572,18 @@  static int init_vq(struct virtio_blk *vblk)
 			min_not_zero(num_request_queues, nr_cpu_ids),
 			num_vqs);
 
+	num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
+
+	memset(vblk->io_queues, 0, sizeof(int) * HCTX_MAX_TYPES);
+	vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
+	vblk->io_queues[HCTX_TYPE_READ] = 0;
+	vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;
+
+	dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
+				vblk->io_queues[HCTX_TYPE_DEFAULT],
+				vblk->io_queues[HCTX_TYPE_READ],
+				vblk->io_queues[HCTX_TYPE_POLL]);
+
 	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
 	if (!vblk->vqs)
 		return -ENOMEM;
@@ -577,11 +596,17 @@  static int init_vq(struct virtio_blk *vblk)
 		goto out;
 	}
 
-	for (i = 0; i < num_vqs; i++) {
-		callbacks[i] = virtblk_done;
-		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
-		names[i] = vblk->vqs[i].name;
-	}
+        for (i = 0; i < num_vqs - num_poll_vqs; i++) {
+                callbacks[i] = virtblk_done;
+                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+                names[i] = vblk->vqs[i].name;
+        }
+
+        for (; i < num_vqs; i++) {
+                callbacks[i] = NULL;
+                snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
+                names[i] = vblk->vqs[i].name;
+        }
 
 	/* Discover virtqueues and write information to configuration.  */
 	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
@@ -728,16 +753,89 @@  static const struct attribute_group *virtblk_attr_groups[] = {
 static int virtblk_map_queues(struct blk_mq_tag_set *set)
 {
 	struct virtio_blk *vblk = set->driver_data;
+	int i, qoff;
+
+	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		map->nr_queues = vblk->io_queues[i];
+		map->queue_offset = qoff;
+		qoff += map->nr_queues;
+
+		if (map->nr_queues == 0)
+			continue;
+
+		/*
+		 * Regular queues have interrupts and hence CPU affinity is
+		 * defined by the core virtio code, but polling queues have
+		 * no interrupts so we let the block layer assign CPU affinity.
+		 */
+		if (i != HCTX_TYPE_POLL)
+			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
+		else
+			blk_mq_map_queues(&set->map[i]);
+	}
+
+	return 0;
+}
+
+static void virtblk_complete_batch(struct io_comp_batch *iob)
+{
+	struct request *req;
 
-	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
-					vblk->vdev, 0);
+	rq_list_for_each(&iob->req_list, req) {
+		virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
+		virtblk_cleanup_cmd(req);
+	}
+	blk_mq_end_request_batch(iob);
+}
+
+static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtio_blk_vq *vq = hctx->driver_data;
+	struct virtblk_req *vbr;
+	unsigned long flags;
+	unsigned int len;
+	int found = 0;
+
+	spin_lock_irqsave(&vq->lock, flags);
+
+	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
+		struct request *req = blk_mq_rq_from_pdu(vbr);
+
+		found++;
+		if (!blk_mq_add_to_batch(req, iob, vbr->status,
+						virtblk_complete_batch))
+			blk_mq_complete_request(req);
+	}
+
+	if (found)
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+
+	spin_unlock_irqrestore(&vq->lock, flags);
+
+	return found;
+}
+
+static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int hctx_idx)
+{
+	struct virtio_blk *vblk = data;
+	struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx];
+
+	WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags);
+	hctx->driver_data = vq;
+	return 0;
 }
 
 static const struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
 	.commit_rqs	= virtio_commit_rqs,
+	.init_hctx	= virtblk_init_hctx,
 	.complete	= virtblk_request_done,
 	.map_queues	= virtblk_map_queues,
+	.poll		= virtblk_poll,
 };
 
 static unsigned int virtblk_queue_depth;
@@ -816,6 +914,9 @@  static int virtblk_probe(struct virtio_device *vdev)
 		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
 	vblk->tag_set.driver_data = vblk;
 	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
+	vblk->tag_set.nr_maps = 1;
+	if (vblk->io_queues[HCTX_TYPE_POLL])
+		vblk->tag_set.nr_maps = 3;
 
 	err = blk_mq_alloc_tag_set(&vblk->tag_set);
 	if (err)