
virtio-blk: support polling I/O

Message ID 20220311152832.17703-1-suwan.kim027@gmail.com (mailing list archive)
State New, archived
Series: virtio-blk: support polling I/O

Commit Message

Suwan Kim March 11, 2022, 3:28 p.m. UTC
This patch supports polling I/O via the virtio-blk driver. The polling
feature is enabled based on the "VIRTIO_BLK_F_MQ" feature, and the number
of polling queues can be set by the QEMU virtio-blk-pci property
"num-poll-queues=N". This patch improves polling I/O throughput
and latency.

The virtio-blk driver doesn't have a poll function or a poll
queue, and it has been operating in an interrupt-driven manner even
when the polling function is called from the upper layer.

virtio-blk polling is implemented on top of the block layer's 'batched
completion'. virtblk_poll() queues completed requests to
io_comp_batch->req_list and, later, virtblk_complete_batch() calls the
unmap function and ends the requests in a batch.
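
The core of that poll path, taken from the patch quoted further down, with comments added:

	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
		struct request *req = blk_mq_rq_from_pdu(vbr);

		found++;
		/* Collect the request into iob->req_list; fall back to
		 * per-request completion if it cannot be batched. */
		if (!blk_mq_add_to_batch(req, iob, virtblk_result(vbr),
						virtblk_complete_batch))
			blk_mq_complete_request(req);
	}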

virtio-blk reads the number of queues and poll queues from the QEMU
virtio-blk-pci properties ("num-queues=N", "num-poll-queues=M").
It allocates N virtqueues to virtio_blk->vqs[N] and uses [0..(N-M-1)]
as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
queues, the poll queues have no callback function.
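
The corresponding split in init_vq(), excerpted from the patch quoted further down, with comments added:

	for (i = 0; i < num_vqs; i++) {
		if (i < num_vqs - num_poll_vqs) {
			/* Default queue: completed via interrupt callback. */
			callbacks[i] = virtblk_done;
			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		} else {
			/* Poll queue: NULL callback, so no irq vector is assigned. */
			callbacks[i] = NULL;
			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
		}
		names[i] = vblk->vqs[i].name;
	}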

Regarding the HW-SW queue mapping, the default queue mapping uses the
existing method that considers the MSI irq vector. But the poll queues
don't have an irq, so they use the regular blk-mq cpu mapping.

To enable poll queues, the "num-poll-queues=N" property of virtio-blk-pci
needs to be added to the QEMU command line. For that, I temporarily
implemented the property in QEMU. Please refer to the git repository below.

	git : https://github.com/asfaca/qemu.git #on master branch commit

To verify the improvement, I ran an fio polling I/O performance test
with the io_uring engine using the options below.
(io_uring, hipri, randread, direct=1, bs=512, iodepth=64, numjobs=N)
I set up the VM with 4 vCPUs and 4 virtio-blk queues - 2 default queues
and 2 poll queues.
(-device virtio-blk-pci,num-queues=4,num-poll-queues=2)
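
For reference, an fio command line matching these options would look roughly like this (the target device and runtime are illustrative):

	fio --name=polltest --filename=/dev/vda --ioengine=io_uring --hipri \
	    --rw=randread --direct=1 --bs=512 --iodepth=64 --numjobs=4 \
	    --time_based --runtime=60 --group_reporting
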
As a result, IOPS and average latency improved by about 10%.

Test result:

- Fio io_uring poll without virtio-blk poll support
	-- numjobs=1 : IOPS = 297K, avg latency = 214.59us
	-- numjobs=2 : IOPS = 360K, avg latency = 363.88us
	-- numjobs=4 : IOPS = 289K, avg latency = 885.42us

- Fio io_uring poll with virtio-blk poll support
	-- numjobs=1 : IOPS = 332K, avg latency = 192.61us
	-- numjobs=2 : IOPS = 371K, avg latency = 348.31us
	-- numjobs=4 : IOPS = 321K, avg latency = 795.93us

Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
---
 drivers/block/virtio_blk.c      | 98 +++++++++++++++++++++++++++++++--
 include/uapi/linux/virtio_blk.h |  3 +-
 2 files changed, 96 insertions(+), 5 deletions(-)

Comments

Michael S. Tsirkin March 11, 2022, 3:38 p.m. UTC | #1
On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> index d888f013d9ff..3fcaf937afe1 100644
> --- a/include/uapi/linux/virtio_blk.h
> +++ b/include/uapi/linux/virtio_blk.h
> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>  	 * deallocation of one or more of the sectors.
>  	 */
>  	__u8 write_zeroes_may_unmap;
> +	__u8 unused1;
>  
> -	__u8 unused1[3];
> +	__virtio16 num_poll_queues;
>  } __attribute__((packed));

Same as any virtio UAPI change, this has to go through the virtio TC.
In particular I don't think gating a new config field on
an existing feature flag is a good idea.

>  /*
> -- 
> 2.26.3
Suwan Kim March 11, 2022, 4:07 p.m. UTC | #2
On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > index d888f013d9ff..3fcaf937afe1 100644
> > --- a/include/uapi/linux/virtio_blk.h
> > +++ b/include/uapi/linux/virtio_blk.h
> > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> >  	 * deallocation of one or more of the sectors.
> >  	 */
> >  	__u8 write_zeroes_may_unmap;
> > +	__u8 unused1;
> >  
> > -	__u8 unused1[3];
> > +	__virtio16 num_poll_queues;
> >  } __attribute__((packed));
> 
> Same as any virtio UAPI change, this has to go through the virtio TC.
> In particular I don't think gating a new config field on
> an existing feature flag is a good idea.

Did you mean that the polling should be based on a new feature like
"VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
and features[]? If so, I will add the new feature flag and resend it.
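
A minimal sketch of that change; the bit number is illustrative and would have to be assigned through the virtio TC:

	/* Hypothetical feature bit for dedicated poll queues. */
	#define VIRTIO_BLK_F_POLL	15

	static unsigned int features[] = {
		/* ...existing VIRTIO_BLK_F_* entries... */
		VIRTIO_BLK_F_MQ,
		VIRTIO_BLK_F_POLL,
	};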

Regards,
Suwan Kim
Michael S. Tsirkin March 11, 2022, 4:18 p.m. UTC | #3
On Sat, Mar 12, 2022 at 01:07:23AM +0900, Suwan Kim wrote:
> On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > index d888f013d9ff..3fcaf937afe1 100644
> > > --- a/include/uapi/linux/virtio_blk.h
> > > +++ b/include/uapi/linux/virtio_blk.h
> > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > >  	 * deallocation of one or more of the sectors.
> > >  	 */
> > >  	__u8 write_zeroes_may_unmap;
> > > +	__u8 unused1;
> > >  
> > > -	__u8 unused1[3];
> > > +	__virtio16 num_poll_queues;
> > >  } __attribute__((packed));
> > 
> > Same as any virtio UAPI change, this has to go through the virtio TC.


Notice this pls.  Remember to copy one of the TC mailing lists.

> > In particular I don't think gating a new config field on
> > an existing feature flag is a good idea.
> 
> Did you mean that the polling should be based on a new feature like
> "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> and features[]? If then, I will add the new feture flag and resend it.
> 
> Regards,
> Suwan Kim
Suwan Kim March 11, 2022, 4:38 p.m. UTC | #4
On Fri, Mar 11, 2022 at 11:18:54AM -0500, Michael S. Tsirkin wrote:
> On Sat, Mar 12, 2022 at 01:07:23AM +0900, Suwan Kim wrote:
> > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > --- a/include/uapi/linux/virtio_blk.h
> > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > >  	 * deallocation of one or more of the sectors.
> > > >  	 */
> > > >  	__u8 write_zeroes_may_unmap;
> > > > +	__u8 unused1;
> > > >  
> > > > -	__u8 unused1[3];
> > > > +	__virtio16 num_poll_queues;
> > > >  } __attribute__((packed));
> > > 
> > > Same as any virtio UAPI change, this has to go through the virtio TC.
> 
> 
> Notice this pls.  Remember to copy one of the TC mailing lists.

Okay. I will send v2 to the virtio TC mailing list also.

Regards,
Suwan Kim
Max Gurtovoy March 13, 2022, 10:37 a.m. UTC | #5
On 3/11/2022 6:07 PM, Suwan Kim wrote:
> On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
>> On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
>>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>>> index d888f013d9ff..3fcaf937afe1 100644
>>> --- a/include/uapi/linux/virtio_blk.h
>>> +++ b/include/uapi/linux/virtio_blk.h
>>> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>>>   	 * deallocation of one or more of the sectors.
>>>   	 */
>>>   	__u8 write_zeroes_may_unmap;
>>> +	__u8 unused1;
>>>   
>>> -	__u8 unused1[3];
>>> +	__virtio16 num_poll_queues;
>>>   } __attribute__((packed));
>> Same as any virtio UAPI change, this has to go through the virtio TC.
>> In particular I don't think gating a new config field on
>> an existing feature flag is a good idea.
> Did you mean that the polling should be based on a new feature like
> "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> and features[]? If then, I will add the new feture flag and resend it.

Isn't there a way in the SPEC today to create a queue without an
interrupt vector?


>
> Regards,
> Suwan Kim
Max Gurtovoy March 13, 2022, 10:42 a.m. UTC | #6
On 3/11/2022 5:28 PM, Suwan Kim wrote:
> This patch supports polling I/O via virtio-blk driver. Polling
> feature is enabled based on "VIRTIO_BLK_F_MQ" feature and the number
> of polling queues can be set by QEMU virtio-blk-pci property
> "num-poll-queues=N". This patch improves the polling I/O throughput
> and latency.
>
> The virtio-blk driver doesn't not have a poll function and a poll
> queue and it has been operating in interrupt driven method even if
> the polling function is called in the upper layer.
>
> virtio-blk polling is implemented upon 'batched completion' of block
> layer. virtblk_poll() queues completed request to io_comp_batch->req_list
> and later, virtblk_complete_batch() calls unmap function and ends
> the requests in batch.

Maybe we can do the batching in a separate commit?

For implementing batching in the right way you need to implement the
.queue_rqs callback.

See the NVMe PCI driver implementation.

While we're at it, I would really like to see an implementation of
the blk_mq_ops.timeout callback in virtio-blk.

I think it will take this driver to the next level.
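
For orientation, the rough shape of those two callbacks - a sketch only, with signatures from the blk-mq API of this kernel era and illustrative bodies:

	/* Batch submission: take a whole list of requests and kick each vq once. */
	static void virtio_queue_rqs(struct request **rqlist)
	{
		/* Iterate rqlist, map and queue each request, then notify. */
	}

	/* Hung-I/O handling: decide whether to recover or keep waiting. */
	static enum blk_eh_timer_return virtblk_timeout(struct request *req, bool reserved)
	{
		return BLK_EH_RESET_TIMER;
	}

These would be wired into virtio_mq_ops as .queue_rqs and .timeout respectively.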

>
> virtio-blk reads the number of queues and poll queues from QEMU
> virtio-blk-pci properties ("num-queues=N", "num-poll-queues=M").
> It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
> as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
> queues, the poll queues have no callback function.
>
> Regarding HW-SW queue mapping, the default queue mapping uses the
> existing method that condsiders MSI irq vector. But the poll queue
> doesn't have an irq, so it uses the regular blk-mq cpu mapping.
>
> To enable poll queues, "num-poll-queues=N" property of virtio-blk-pci
> needs to be added to QEMU command line. For that, I temporarily
> implemented the property on QEMU. Please refer to the git repository below.
>
> 	git : https://github.com/asfaca/qemu.git #on master branch commit
>
> For verifying the improvement, I did Fio polling I/O performance test
> with io_uring engine with the options below.
> (io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
> I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
> queues for VM.
> (-device virtio-blk-pci,num-queues=4,num-poll-queues=2)
> As a result, IOPS and average latency improved about 10%.
>
> Test result:
>
> - Fio io_uring poll without virtio-blk poll support
> 	-- numjobs=1 : IOPS = 297K, avg latency = 214.59us
> 	-- numjobs=2 : IOPS = 360K, avg latency = 363.88us
> 	-- numjobs=4 : IOPS = 289K, avg latency = 885.42us
>
> - Fio io_uring poll with virtio-blk poll support
> 	-- numjobs=1 : IOPS = 332K, avg latency = 192.61us
> 	-- numjobs=2 : IOPS = 371K, avg latency = 348.31us
> 	-- numjobs=4 : IOPS = 321K, avg latency = 795.93us
>
> Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
> ---
>   drivers/block/virtio_blk.c      | 98 +++++++++++++++++++++++++++++++--
>   include/uapi/linux/virtio_blk.h |  3 +-
>   2 files changed, 96 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 8c415be86732..bfde7d97d528 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -81,6 +81,7 @@ struct virtio_blk {
>   
>   	/* num of vqs */
>   	int num_vqs;
> +	int io_queues[HCTX_MAX_TYPES];
>   	struct virtio_blk_vq *vqs;
>   };
>   
> @@ -548,6 +549,7 @@ static int init_vq(struct virtio_blk *vblk)
>   	const char **names;
>   	struct virtqueue **vqs;
>   	unsigned short num_vqs;
> +	unsigned short num_poll_vqs;
>   	struct virtio_device *vdev = vblk->vdev;
>   	struct irq_affinity desc = { 0, };
>   
> @@ -556,6 +558,13 @@ static int init_vq(struct virtio_blk *vblk)
>   				   &num_vqs);
>   	if (err)
>   		num_vqs = 1;
> +
> +	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
> +				struct virtio_blk_config, num_poll_queues,
> +				&num_poll_vqs);
> +	if (err)
> +		num_poll_vqs = 0;
> +
>   	if (!err && !num_vqs) {
>   		dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
>   		return -EINVAL;
> @@ -565,6 +574,13 @@ static int init_vq(struct virtio_blk *vblk)
>   			min_not_zero(num_request_queues, nr_cpu_ids),
>   			num_vqs);
>   
> +	num_poll_vqs = min_t(unsigned int, num_poll_vqs, num_vqs - 1);
> +
> +	memset(vblk->io_queues, 0, sizeof(int) * HCTX_MAX_TYPES);
> +	vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
> +	vblk->io_queues[HCTX_TYPE_READ] = 0;
> +	vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;
> +
>   	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
>   	if (!vblk->vqs)
>   		return -ENOMEM;
> @@ -578,8 +594,13 @@ static int init_vq(struct virtio_blk *vblk)
>   	}
>   
>   	for (i = 0; i < num_vqs; i++) {
> -		callbacks[i] = virtblk_done;
> -		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
> +		if (i < num_vqs - num_poll_vqs) {
> +			callbacks[i] = virtblk_done;
> +			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
> +		} else {
> +			callbacks[i] = NULL;
> +			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
> +		}
>   		names[i] = vblk->vqs[i].name;
>   	}
>   
> @@ -728,16 +749,82 @@ static const struct attribute_group *virtblk_attr_groups[] = {
>   static int virtblk_map_queues(struct blk_mq_tag_set *set)
>   {
>   	struct virtio_blk *vblk = set->driver_data;
> +	int i, qoff;
> +
> +	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
> +		struct blk_mq_queue_map *map = &set->map[i];
> +
> +		map->nr_queues = vblk->io_queues[i];
> +		map->queue_offset = qoff;
> +		qoff += map->nr_queues;
> +
> +		if (map->nr_queues == 0)
> +			continue;
> +
> +		if (i == HCTX_TYPE_DEFAULT)
> +			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
> +		else
> +			blk_mq_map_queues(&set->map[i]);
> +	}
> +
> +	return 0;
> +}
> +
> +static void virtblk_complete_batch(struct io_comp_batch *iob)
> +{
> +	struct request *req;
> +	struct virtblk_req *vbr;
>   
> -	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
> -					vblk->vdev, 0);
> +	rq_list_for_each(&iob->req_list, req) {
> +		vbr = blk_mq_rq_to_pdu(req);
> +		virtblk_unmap_data(req, vbr);
> +		virtblk_cleanup_cmd(req);
> +	}
> +	blk_mq_end_request_batch(iob);
> +}
> +
> +static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
> +{
> +	struct virtio_blk_vq *vq = hctx->driver_data;
> +	struct virtblk_req *vbr;
> +	unsigned long flags;
> +	unsigned int len;
> +	int found = 0;
> +
> +	spin_lock_irqsave(&vq->lock, flags);
> +
> +	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
> +		struct request *req = blk_mq_rq_from_pdu(vbr);
> +
> +		found++;
> +		if (!blk_mq_add_to_batch(req, iob, virtblk_result(vbr),
> +						virtblk_complete_batch))
> +			blk_mq_complete_request(req);
> +	}
> +
> +	spin_unlock_irqrestore(&vq->lock, flags);
> +
> +	return found;
> +}
> +
> +static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> +			  unsigned int hctx_idx)
> +{
> +	struct virtio_blk *vblk = data;
> +	struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx];
> +
> +	WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags);
> +	hctx->driver_data = vq;
> +	return 0;
>   }
>   
>   static const struct blk_mq_ops virtio_mq_ops = {
>   	.queue_rq	= virtio_queue_rq,
>   	.commit_rqs	= virtio_commit_rqs,
> +	.init_hctx	= virtblk_init_hctx,
>   	.complete	= virtblk_request_done,
>   	.map_queues	= virtblk_map_queues,
> +	.poll		= virtblk_poll,
>   };
>   
>   static unsigned int virtblk_queue_depth;
> @@ -816,6 +903,9 @@ static int virtblk_probe(struct virtio_device *vdev)
>   		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
>   	vblk->tag_set.driver_data = vblk;
>   	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
> +	vblk->tag_set.nr_maps = 1;
> +	if (vblk->io_queues[HCTX_TYPE_POLL])
> +		vblk->tag_set.nr_maps = 3;
>   
>   	err = blk_mq_alloc_tag_set(&vblk->tag_set);
>   	if (err)
> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> index d888f013d9ff..3fcaf937afe1 100644
> --- a/include/uapi/linux/virtio_blk.h
> +++ b/include/uapi/linux/virtio_blk.h
> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>   	 * deallocation of one or more of the sectors.
>   	 */
>   	__u8 write_zeroes_may_unmap;
> +	__u8 unused1;
>   
> -	__u8 unused1[3];
> +	__virtio16 num_poll_queues;
>   } __attribute__((packed));
>   
>   /*
Jason Wang March 14, 2022, 6:14 a.m. UTC | #7
On 3/11/2022 11:28 PM, Suwan Kim wrote:
> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> index d888f013d9ff..3fcaf937afe1 100644
> --- a/include/uapi/linux/virtio_blk.h
> +++ b/include/uapi/linux/virtio_blk.h
> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>   	 * deallocation of one or more of the sectors.
>   	 */
>   	__u8 write_zeroes_may_unmap;
> +	__u8 unused1;
>   
> -	__u8 unused1[3];
> +	__virtio16 num_poll_queues;
>   } __attribute__((packed));


This looks like an implementation-specific (virtio-blk-pci) optimization;
how about other implementations like vhost-user-blk?

Thanks
Suwan Kim March 14, 2022, 9:43 a.m. UTC | #8
On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
> 
> On 3/11/2022 6:07 PM, Suwan Kim wrote:
> > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > --- a/include/uapi/linux/virtio_blk.h
> > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > >   	 * deallocation of one or more of the sectors.
> > > >   	 */
> > > >   	__u8 write_zeroes_may_unmap;
> > > > +	__u8 unused1;
> > > > -	__u8 unused1[3];
> > > > +	__virtio16 num_poll_queues;
> > > >   } __attribute__((packed));
> > > Same as any virtio UAPI change, this has to go through the virtio TC.
> > > In particular I don't think gating a new config field on
> > > an existing feature flag is a good idea.
> > Did you mean that the polling should be based on a new feature like
> > "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> > and features[]? If then, I will add the new feture flag and resend it.
> 
> Isn't there a way in the SPEC today to create a queue without interrupt
> vector ?

It seems that it is not possible to create a queue without an interrupt
vector. If it were possible, we could expect more polling improvement.

Regards,
Suwan Kim
Suwan Kim March 14, 2022, 9:55 a.m. UTC | #9
On Sun, Mar 13, 2022 at 12:42:58PM +0200, Max Gurtovoy wrote:
> 
> On 3/11/2022 5:28 PM, Suwan Kim wrote:
> > This patch supports polling I/O via virtio-blk driver. Polling
> > feature is enabled based on "VIRTIO_BLK_F_MQ" feature and the number
> > of polling queues can be set by QEMU virtio-blk-pci property
> > "num-poll-queues=N". This patch improves the polling I/O throughput
> > and latency.
> > 
> > The virtio-blk driver doesn't not have a poll function and a poll
> > queue and it has been operating in interrupt driven method even if
> > the polling function is called in the upper layer.
> > 
> > virtio-blk polling is implemented upon 'batched completion' of block
> > layer. virtblk_poll() queues completed request to io_comp_batch->req_list
> > and later, virtblk_complete_batch() calls unmap function and ends
> > the requests in batch.
> 
> Maybe we can do the batch in separate commit ?
> 
> For implementing batch in the right way you need to implement .queue_rqs
> call back.
> 
> See NVMe PCI driver implementation.
> 
> If we're here, I would really like to see an implementation of
> blk_mq_ops.timeout callback in virtio-blk.
> 
> I think it will take this driver to the next level.

Thanks for the feedback. I will implement .queue_rqs for virtio-blk
and make it a separate commit. Later, I will try to implement
blk_mq_ops.timeout as you mentioned.

I will send the patch series soon, as below:
[1] support .queue_rqs for batch submission
[2] support polling I/O

Regards,
Suwan Kim
Max Gurtovoy March 14, 2022, 10:25 a.m. UTC | #10
On 3/14/2022 11:43 AM, Suwan Kim wrote:
> On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
>> On 3/11/2022 6:07 PM, Suwan Kim wrote:
>>> On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
>>>> On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
>>>>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>>>>> index d888f013d9ff..3fcaf937afe1 100644
>>>>> --- a/include/uapi/linux/virtio_blk.h
>>>>> +++ b/include/uapi/linux/virtio_blk.h
>>>>> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>>>>>    	 * deallocation of one or more of the sectors.
>>>>>    	 */
>>>>>    	__u8 write_zeroes_may_unmap;
>>>>> +	__u8 unused1;
>>>>> -	__u8 unused1[3];
>>>>> +	__virtio16 num_poll_queues;
>>>>>    } __attribute__((packed));
>>>> Same as any virtio UAPI change, this has to go through the virtio TC.
>>>> In particular I don't think gating a new config field on
>>>> an existing feature flag is a good idea.
>>> Did you mean that the polling should be based on a new feature like
>>> "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
>>> and features[]? If then, I will add the new feture flag and resend it.
>> Isn't there a way in the SPEC today to create a queue without interrupt
>> vector ?
> It seems that it is not possible to create a queue without interrupt
> vector. If it is possible, we can expect more polling improvement.

MST/Jason/Stefan,

Can you confirm that, please?

What is VIRTQ_AVAIL_F_NO_INTERRUPT supposed to do?


> Regards,
> Suwan Kim
Max Gurtovoy March 14, 2022, 10:32 a.m. UTC | #11
On 3/14/2022 11:55 AM, Suwan Kim wrote:
> On Sun, Mar 13, 2022 at 12:42:58PM +0200, Max Gurtovoy wrote:
>> On 3/11/2022 5:28 PM, Suwan Kim wrote:
>>> This patch supports polling I/O via virtio-blk driver. Polling
>>> feature is enabled based on "VIRTIO_BLK_F_MQ" feature and the number
>>> of polling queues can be set by QEMU virtio-blk-pci property
>>> "num-poll-queues=N". This patch improves the polling I/O throughput
>>> and latency.
>>>
>>> The virtio-blk driver doesn't not have a poll function and a poll
>>> queue and it has been operating in interrupt driven method even if
>>> the polling function is called in the upper layer.
>>>
>>> virtio-blk polling is implemented upon 'batched completion' of block
>>> layer. virtblk_poll() queues completed request to io_comp_batch->req_list
>>> and later, virtblk_complete_batch() calls unmap function and ends
>>> the requests in batch.
>> Maybe we can do the batch in separate commit ?
>>
>> For implementing batch in the right way you need to implement .queue_rqs
>> call back.
>>
>> See NVMe PCI driver implementation.
>>
>> If we're here, I would really like to see an implementation of
>> blk_mq_ops.timeout callback in virtio-blk.
>>
>> I think it will take this driver to the next level.
> Thanks for the feedback. I will implement .queue_rqs for virtio-blk
> and make a seperate commit. Later, I will try to implement
> blk_mq_ops.timeout as you mentioned.

Thanks this will be great.


> I will send the patch series soon as below.
> [1] support .queue_rqs for batch submission
> [2] support polling I/O

I'm afraid it will be hard to implement polling if the virtio spec doesn't
support it.

So let's see what the maintainers say, and if there is no such
support we can add it to the spec.

So for now what we can do is implement .queue_rqs and .timeout (the two
are orthogonal to each other).

>
> Regards,
> Suwan Kim
Michael S. Tsirkin March 14, 2022, 11:15 a.m. UTC | #12
On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
> 
> On 3/14/2022 11:43 AM, Suwan Kim wrote:
> > On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
> > > On 3/11/2022 6:07 PM, Suwan Kim wrote:
> > > > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > >    	 * deallocation of one or more of the sectors.
> > > > > >    	 */
> > > > > >    	__u8 write_zeroes_may_unmap;
> > > > > > +	__u8 unused1;
> > > > > > -	__u8 unused1[3];
> > > > > > +	__virtio16 num_poll_queues;
> > > > > >    } __attribute__((packed));
> > > > > Same as any virtio UAPI change, this has to go through the virtio TC.
> > > > > In particular I don't think gating a new config field on
> > > > > an existing feature flag is a good idea.
> > > > Did you mean that the polling should be based on a new feature like
> > > > "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> > > > and features[]? If then, I will add the new feture flag and resend it.
> > > Isn't there a way in the SPEC today to create a queue without interrupt
> > > vector ?
> > It seems that it is not possible to create a queue without interrupt
> > vector. If it is possible, we can expect more polling improvement.

Yes, it's possible:

Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
\field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
by the configuration change/selected queue events respectively to
the corresponding MSI-X vector. To disable interrupts for an
event type, the driver unmaps this event by writing a special NO_VECTOR
value:

\begin{lstlisting}
/* Vector value used to disable MSI for queue */
#define VIRTIO_MSI_NO_VECTOR            0xffff
\end{lstlisting}



> MST/Jason/Stefan,
> 
> can you confirm that please ?
>
> what does VIRTQ_AVAIL_F_NO_INTERRUPT supposed to do ?

This is a hint to the device not to send interrupts.

> 
> > Regards,
> > Suwan Kim
Suwan Kim March 14, 2022, 12:33 p.m. UTC | #13
On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> 
> On 3/11/2022 11:28 PM, Suwan Kim wrote:
> > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > index d888f013d9ff..3fcaf937afe1 100644
> > --- a/include/uapi/linux/virtio_blk.h
> > +++ b/include/uapi/linux/virtio_blk.h
> > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> >   	 * deallocation of one or more of the sectors.
> >   	 */
> >   	__u8 write_zeroes_may_unmap;
> > +	__u8 unused1;
> > -	__u8 unused1[3];
> > +	__virtio16 num_poll_queues;
> >   } __attribute__((packed));
> 
> 
> This looks like a implementation specific (virtio-blk-pci) optimization, how
> about other implementation like vhost-user-blk?

I didn't consider vhost-user-blk yet. But does vhost-user-blk also
use virtio_blk_config as the kernel-QEMU interface?

Does vhost-user-blk need additional modification to support polling
on the kernel side?

Regards,
Suwan Kim
Suwan Kim March 14, 2022, 1:22 p.m. UTC | #14
On Mon, Mar 14, 2022 at 07:15:18AM -0400, Michael S. Tsirkin wrote:
> On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
> > 
> > On 3/14/2022 11:43 AM, Suwan Kim wrote:
> > > On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
> > > > On 3/11/2022 6:07 PM, Suwan Kim wrote:
> > > > > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > > > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > >    	 * deallocation of one or more of the sectors.
> > > > > > >    	 */
> > > > > > >    	__u8 write_zeroes_may_unmap;
> > > > > > > +	__u8 unused1;
> > > > > > > -	__u8 unused1[3];
> > > > > > > +	__virtio16 num_poll_queues;
> > > > > > >    } __attribute__((packed));
> > > > > > Same as any virtio UAPI change, this has to go through the virtio TC.
> > > > > > In particular I don't think gating a new config field on
> > > > > > an existing feature flag is a good idea.
> > > > > Did you mean that the polling should be based on a new feature like
> > > > > "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> > > > > and features[]? If then, I will add the new feture flag and resend it.
> > > > Isn't there a way in the SPEC today to create a queue without interrupt
> > > > vector ?
> > > It seems that it is not possible to create a queue without interrupt
> > > vector. If it is possible, we can expect more polling improvement.
> 
> Yes, it's possible:
> 
> Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
> \field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
> by the configuration change/selected queue events respectively to
> the corresponding MSI-X vector. To disable interrupts for an
> event type, the driver unmaps this event by writing a special NO_VECTOR
> value:
> 
> \begin{lstlisting}
> /* Vector value used to disable MSI for queue */
> #define VIRTIO_MSI_NO_VECTOR            0xffff
> \end{lstlisting}
 
Thanks for the information.

Then, in the function vp_find_vqs_msix() in virtio_pci_common.c,
VIRTIO_MSI_NO_VECTOR is set if virtqueue->callback is NULL, as in the code below.

static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
		struct virtqueue *vqs[], vq_callback_t *callbacks[],
...
		if (!callbacks[i])
			msix_vec = VIRTIO_MSI_NO_VECTOR;
...

In order to create poll queues in virtio-blk, I set a NULL callback for
the poll virtqueues, and that will create the queues without an irq.

Regards,
Suwan Kim
Max Gurtovoy March 14, 2022, 1:26 p.m. UTC | #15
On 3/14/2022 1:15 PM, Michael S. Tsirkin wrote:
> On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
>> On 3/14/2022 11:43 AM, Suwan Kim wrote:
>>> On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
>>>> On 3/11/2022 6:07 PM, Suwan Kim wrote:
>>>>> On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
>>>>>> On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
>>>>>>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>>>>>>> index d888f013d9ff..3fcaf937afe1 100644
>>>>>>> --- a/include/uapi/linux/virtio_blk.h
>>>>>>> +++ b/include/uapi/linux/virtio_blk.h
>>>>>>> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>>>>>>>     	 * deallocation of one or more of the sectors.
>>>>>>>     	 */
>>>>>>>     	__u8 write_zeroes_may_unmap;
>>>>>>> +	__u8 unused1;
>>>>>>> -	__u8 unused1[3];
>>>>>>> +	__virtio16 num_poll_queues;
>>>>>>>     } __attribute__((packed));
>>>>>> Same as any virtio UAPI change, this has to go through the virtio TC.
>>>>>> In particular I don't think gating a new config field on
>>>>>> an existing feature flag is a good idea.
>>>>> Did you mean that the polling should be based on a new feature like
>>>>> "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
>>>>> and features[]? If then, I will add the new feture flag and resend it.
>>>> Isn't there a way in the SPEC today to create a queue without interrupt
>>>> vector ?
>>> It seems that it is not possible to create a queue without interrupt
>>> vector. If it is possible, we can expect more polling improvement.
> Yes, it's possible:
>
> Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
> \field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
> by the configuration change/selected queue events respectively to
> the corresponding MSI-X vector. To disable interrupts for an
> event type, the driver unmaps this event by writing a special NO_VECTOR
> value:
>
> \begin{lstlisting}
> /* Vector value used to disable MSI for queue */
> #define VIRTIO_MSI_NO_VECTOR            0xffff
> \end{lstlisting}
>
>
>
>> MST/Jason/Stefan,
>>
>> can you confirm that please ?
>>
>> what does VIRTQ_AVAIL_F_NO_INTERRUPT supposed to do ?
> This is a hint to the device not to send interrupts.

Why do you need a hint if the driver already wrote 0xffff to disable MSI
for a virtqueue?


>
>>> Regards,
>>> Suwan Kim
Stefan Hajnoczi March 14, 2022, 2:48 p.m. UTC | #16
On Mon, Mar 14, 2022 at 09:33:08PM +0900, Suwan Kim wrote:
> On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > 
> > On 3/11/2022 11:28 PM, Suwan Kim wrote:
> > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > index d888f013d9ff..3fcaf937afe1 100644
> > > --- a/include/uapi/linux/virtio_blk.h
> > > +++ b/include/uapi/linux/virtio_blk.h
> > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > >   	 * deallocation of one or more of the sectors.
> > >   	 */
> > >   	__u8 write_zeroes_may_unmap;
> > > +	__u8 unused1;
> > > -	__u8 unused1[3];
> > > +	__virtio16 num_poll_queues;
> > >   } __attribute__((packed));
> > 
> > 
> > This looks like a implementation specific (virtio-blk-pci) optimization, how
> > about other implementation like vhost-user-blk?
> 
> I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> use vritio_blk_config as kernel-qemu interface?
> 
> Does vhost-user-blk need additional modification to support polling
> in kernel side?

I think QEMU's --device vhost-user-blk-pci will work with this patch
series because QEMU passes the struct virtio_blk_config from the
vhost-user-blk server to the guest. If a new vhost-user-blk server
supports num_poll_queues then the guest will see the config field.

However, the new feature bit that was discussed in another sub-thread
needs to be added to hw/block/vhost-user-blk.c:user_feature_bits[] and
QEMU needs to be recompiled.
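
A sketch of that QEMU-side change, using the hypothetical VIRTIO_BLK_F_POLL bit discussed in the other sub-thread:

	/* hw/block/vhost-user-blk.c */
	static const int user_feature_bits[] = {
	    /* ...existing VIRTIO_BLK_F_* entries... */
	    VIRTIO_BLK_F_MQ,
	    VIRTIO_BLK_F_POLL,          /* hypothetical poll-queue feature bit */
	    VHOST_INVALID_FEATURE_BIT
	};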

Stefan
Michael S. Tsirkin March 14, 2022, 3:12 p.m. UTC | #17
On Mon, Mar 14, 2022 at 10:22:01PM +0900, Suwan Kim wrote:
> On Mon, Mar 14, 2022 at 07:15:18AM -0400, Michael S. Tsirkin wrote:
> > On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
> > > 
> > > On 3/14/2022 11:43 AM, Suwan Kim wrote:
> > > > On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
> > > > > On 3/11/2022 6:07 PM, Suwan Kim wrote:
> > > > > > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > > > > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > > >    	 * deallocation of one or more of the sectors.
> > > > > > > >    	 */
> > > > > > > >    	__u8 write_zeroes_may_unmap;
> > > > > > > > +	__u8 unused1;
> > > > > > > > -	__u8 unused1[3];
> > > > > > > > +	__virtio16 num_poll_queues;
> > > > > > > >    } __attribute__((packed));
> > > > > > > Same as any virtio UAPI change, this has to go through the virtio TC.
> > > > > > > In particular I don't think gating a new config field on
> > > > > > > an existing feature flag is a good idea.
> > > > > > Did you mean that the polling should be based on a new feature like
> > > > > > "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> > > > > > and features[]? If then, I will add the new feture flag and resend it.
> > > > > Isn't there a way in the SPEC today to create a queue without interrupt
> > > > > vector ?
> > > > It seems that it is not possible to create a queue without interrupt
> > > > vector. If it is possible, we can expect more polling improvement.
> > 
> > Yes, it's possible:
> > 
> > Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
> > \field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
> > by the configuration change/selected queue events respectively to
> > the corresponding MSI-X vector. To disable interrupts for an
> > event type, the driver unmaps this event by writing a special NO_VECTOR
> > value:
> > 
> > \begin{lstlisting}
> > /* Vector value used to disable MSI for queue */
> > #define VIRTIO_MSI_NO_VECTOR            0xffff
> > \end{lstlisting}
>  
> Thanks for the information.
> 
> Then, in function vp_find_vqs_msix() at virtio_pci_common.c, it sets
> VIRTIO_MSI_NO_VECTOR if vritqueue->callback is NULL as below code.
> 
> static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
> 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
> ...
> 		if (!callbacks[i])
> 			msix_vec = VIRTIO_MSI_NO_VECTOR;
> ...
> 
> In oder to create poll queue in virtio-blk, I set NULL callback for
> poll virtqueues and it will create queue without irq.
> 
> Regards,
> Suwan Kim

Yes, it will.
Michael S. Tsirkin March 14, 2022, 3:15 p.m. UTC | #18
On Mon, Mar 14, 2022 at 03:26:13PM +0200, Max Gurtovoy wrote:
> 
> On 3/14/2022 1:15 PM, Michael S. Tsirkin wrote:
> > On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
> > > On 3/14/2022 11:43 AM, Suwan Kim wrote:
> > > > On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
> > > > > On 3/11/2022 6:07 PM, Suwan Kim wrote:
> > > > > > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > > > > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > > >     	 * deallocation of one or more of the sectors.
> > > > > > > >     	 */
> > > > > > > >     	__u8 write_zeroes_may_unmap;
> > > > > > > > +	__u8 unused1;
> > > > > > > > -	__u8 unused1[3];
> > > > > > > > +	__virtio16 num_poll_queues;
> > > > > > > >     } __attribute__((packed));
> > > > > > > Same as any virtio UAPI change, this has to go through the virtio TC.
> > > > > > > In particular I don't think gating a new config field on
> > > > > > > an existing feature flag is a good idea.
> > > > > > Did you mean that the polling should be based on a new feature like
> > > > > > "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> > > > > > and features[]? If then, I will add the new feture flag and resend it.
> > > > > Isn't there a way in the SPEC today to create a queue without interrupt
> > > > > vector ?
> > > > It seems that it is not possible to create a queue without interrupt
> > > > vector. If it is possible, we can expect more polling improvement.
> > Yes, it's possible:
> > 
> > Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
> > \field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
> > by the configuration change/selected queue events respectively to
> > the corresponding MSI-X vector. To disable interrupts for an
> > event type, the driver unmaps this event by writing a special NO_VECTOR
> > value:
> > 
> > \begin{lstlisting}
> > /* Vector value used to disable MSI for queue */
> > #define VIRTIO_MSI_NO_VECTOR            0xffff
> > \end{lstlisting}
> > 
> > 
> > 
> > > MST/Jason/Stefan,
> > > 
> > > can you confirm that please ?
> > > 
> > > what does VIRTQ_AVAIL_F_NO_INTERRUPT supposed to do ?
> > This is a hint to the device not to send interrupts.
> 
> Why do you need a hint if the driver implicitly wrote 0xffff to disable MSI
> for a virtqueue ?


VIRTIO_MSI_NO_VECTOR is an expensive write into config space, followed
by an even more expensive read. Reliable and appropriate if you turn
events on/off very rarely.

VIRTQ_AVAIL_F_NO_INTERRUPT is an in-memory write so it's much cheaper,
but it's less reliable. Appropriate if you need to turn events on/off a
lot.
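
For illustration, the two mechanisms as they appear on the Linux driver side (a sketch; mdev, index and vq are assumed to be in scope):

	/* Control path, expensive, done rarely (e.g. at vq setup):
	 * unmap the queue from MSI-X by writing the special NO_VECTOR value. */
	vp_modern_queue_vector(mdev, index, VIRTIO_MSI_NO_VECTOR);

	/* Data path, cheap in-memory hint, can be toggled frequently:
	 * sets the NO_INTERRUPT flag (or the event-idx equivalent). */
	virtqueue_disable_cb(vq);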



> 
> > 
> > > > Regards,
> > > > Suwan Kim
Stefan Hajnoczi March 14, 2022, 3:19 p.m. UTC | #19
On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> This patch supports polling I/O via virtio-blk driver. Polling
> feature is enabled based on "VIRTIO_BLK_F_MQ" feature and the number
> of polling queues can be set by QEMU virtio-blk-pci property
> "num-poll-queues=N". This patch improves the polling I/O throughput
> and latency.
> 
> The virtio-blk driver doesn't not have a poll function and a poll
> queue and it has been operating in interrupt driven method even if
> the polling function is called in the upper layer.
> 
> virtio-blk polling is implemented upon 'batched completion' of block
> layer. virtblk_poll() queues completed request to io_comp_batch->req_list
> and later, virtblk_complete_batch() calls unmap function and ends
> the requests in batch.
> 
> virtio-blk reads the number of queues and poll queues from QEMU
> virtio-blk-pci properties ("num-queues=N", "num-poll-queues=M").
> It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
> as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
> queues, the poll queues have no callback function.
> 
> Regarding HW-SW queue mapping, the default queue mapping uses the
> existing method that condsiders MSI irq vector. But the poll queue
> doesn't have an irq, so it uses the regular blk-mq cpu mapping.
> 
> To enable poll queues, "num-poll-queues=N" property of virtio-blk-pci
> needs to be added to QEMU command line. For that, I temporarily
> implemented the property on QEMU. Please refer to the git repository below.
> 
> 	git : https://github.com/asfaca/qemu.git #on master branch commit
> 
> For verifying the improvement, I did Fio polling I/O performance test
> with io_uring engine with the options below.
> (io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
> I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
> queues for VM.
> (-device virtio-blk-pci,num-queues=4,num-poll-queues=2)
> As a result, IOPS and average latency improved about 10%.
> 
> Test result:
> 
> - Fio io_uring poll without virtio-blk poll support
> 	-- numjobs=1 : IOPS = 297K, avg latency = 214.59us
> 	-- numjobs=2 : IOPS = 360K, avg latency = 363.88us
> 	-- numjobs=4 : IOPS = 289K, avg latency = 885.42us
> 
> - Fio io_uring poll with virtio-blk poll support
> 	-- numjobs=1 : IOPS = 332K, avg latency = 192.61us
> 	-- numjobs=2 : IOPS = 371K, avg latency = 348.31us
> 	-- numjobs=4 : IOPS = 321K, avg latency = 795.93us

Last year there was a patch series that switched regular queues into
polling queues when HIPRI requests were in flight:
https://lore.kernel.org/linux-block/20210520141305.355961-1-stefanha@redhat.com/T/

The advantage is that polling is possible without prior device
configuration, making it easier for users.

However, the dynamic approach is a bit more complex and bugs can result
in lost irqs (hung I/O). Christoph Hellwig asked for dedicated polling
queues, which your patch series now delivers.

I think your patch series is worth merging once the comments others have
already made have been addressed. I'll keep an eye out for the VIRTIO
spec change to extend the virtio-blk configuration space, which needs to
be accepted before the Linux side can be merged.

> @@ -728,16 +749,82 @@ static const struct attribute_group *virtblk_attr_groups[] = {
>  static int virtblk_map_queues(struct blk_mq_tag_set *set)
>  {
>  	struct virtio_blk *vblk = set->driver_data;
> +	int i, qoff;
> +
> +	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
> +		struct blk_mq_queue_map *map = &set->map[i];
> +
> +		map->nr_queues = vblk->io_queues[i];
> +		map->queue_offset = qoff;
> +		qoff += map->nr_queues;
> +
> +		if (map->nr_queues == 0)
> +			continue;
> +
> +		if (i == HCTX_TYPE_DEFAULT)
> +			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
> +		else
> +			blk_mq_map_queues(&set->map[i]);

A comment would be nice here to explain that regular queues have
interrupts and hence CPU affinity is defined by the core virtio code,
but polling queues have no interrupts so we let the block layer assign
CPU affinity.
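
A version of that hunk with such a comment might read:

	if (i == HCTX_TYPE_DEFAULT)
		/*
		 * Regular queues have interrupts, so CPU affinity is
		 * defined by the core virtio code.
		 */
		blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
	else
		/*
		 * Polling queues have no interrupts, so let the block
		 * layer assign CPU affinity.
		 */
		blk_mq_map_queues(&set->map[i]);
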
Max Gurtovoy March 14, 2022, 4:33 p.m. UTC | #20
On 3/14/2022 5:15 PM, Michael S. Tsirkin wrote:
> On Mon, Mar 14, 2022 at 03:26:13PM +0200, Max Gurtovoy wrote:
>> On 3/14/2022 1:15 PM, Michael S. Tsirkin wrote:
>>> On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
>>>> On 3/14/2022 11:43 AM, Suwan Kim wrote:
>>>>> On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
>>>>>> On 3/11/2022 6:07 PM, Suwan Kim wrote:
>>>>>>> On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
>>>>>>>> On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
>>>>>>>>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>>>>>>>>> index d888f013d9ff..3fcaf937afe1 100644
>>>>>>>>> --- a/include/uapi/linux/virtio_blk.h
>>>>>>>>> +++ b/include/uapi/linux/virtio_blk.h
>>>>>>>>> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>>>>>>>>>      	 * deallocation of one or more of the sectors.
>>>>>>>>>      	 */
>>>>>>>>>      	__u8 write_zeroes_may_unmap;
>>>>>>>>> +	__u8 unused1;
>>>>>>>>> -	__u8 unused1[3];
>>>>>>>>> +	__virtio16 num_poll_queues;
>>>>>>>>>      } __attribute__((packed));
>>>>>>>> Same as any virtio UAPI change, this has to go through the virtio TC.
>>>>>>>> In particular I don't think gating a new config field on
>>>>>>>> an existing feature flag is a good idea.
>>>>>>> Did you mean that the polling should be based on a new feature like
>>>>>>> "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
>>>>>>> and features[]? If then, I will add the new feture flag and resend it.
>>>>>> Isn't there a way in the SPEC today to create a queue without interrupt
>>>>>> vector ?
>>>>> It seems that it is not possible to create a queue without interrupt
>>>>> vector. If it is possible, we can expect more polling improvement.
>>> Yes, it's possible:
>>>
>>> Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
>>> \field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
>>> by the configuration change/selected queue events respectively to
>>> the corresponding MSI-X vector. To disable interrupts for an
>>> event type, the driver unmaps this event by writing a special NO_VECTOR
>>> value:
>>>
>>> \begin{lstlisting}
>>> /* Vector value used to disable MSI for queue */
>>> #define VIRTIO_MSI_NO_VECTOR            0xffff
>>> \end{lstlisting}
>>>
>>>
>>>
>>>> MST/Jason/Stefan,
>>>>
>>>> can you confirm that please ?
>>>>
>>>> what does VIRTQ_AVAIL_F_NO_INTERRUPT supposed to do ?
>>> This is a hint to the device not to send interrupts.
>> Why do you need a hint if the driver implicitly wrote 0xffff to disable MSI
>> for a virtqueue ?
>
> VIRTIO_MSI_NO_VECTOR is an expensive write into config space, followed
> by an even more expensive read. Reliable and appropriate if you turn
> events on/off very rarely.
>
> VIRTQ_AVAIL_F_NO_INTERRUPT is an in-memory write so it's much cheaper,
> but it's less reliable. Appropriate if you need to turn events on/off a
> lot.

An "expensive" operation in the ctrl path during vq creation is fine IMO.

I see that nobody has even used the VIRTQ_AVAIL_F_NO_INTERRUPT in-memory
write in Linux.

>
>
>>>>> Regards,
>>>>> Suwan Kim
Michael S. Tsirkin March 14, 2022, 10:22 p.m. UTC | #21
On Mon, Mar 14, 2022 at 06:33:06PM +0200, Max Gurtovoy wrote:
> 
> On 3/14/2022 5:15 PM, Michael S. Tsirkin wrote:
> > On Mon, Mar 14, 2022 at 03:26:13PM +0200, Max Gurtovoy wrote:
> > > On 3/14/2022 1:15 PM, Michael S. Tsirkin wrote:
> > > > On Mon, Mar 14, 2022 at 12:25:08PM +0200, Max Gurtovoy wrote:
> > > > > On 3/14/2022 11:43 AM, Suwan Kim wrote:
> > > > > > On Sun, Mar 13, 2022 at 12:37:21PM +0200, Max Gurtovoy wrote:
> > > > > > > On 3/11/2022 6:07 PM, Suwan Kim wrote:
> > > > > > > > On Fri, Mar 11, 2022 at 10:38:07AM -0500, Michael S. Tsirkin wrote:
> > > > > > > > > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > > > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > > > > >      	 * deallocation of one or more of the sectors.
> > > > > > > > > >      	 */
> > > > > > > > > >      	__u8 write_zeroes_may_unmap;
> > > > > > > > > > +	__u8 unused1;
> > > > > > > > > > -	__u8 unused1[3];
> > > > > > > > > > +	__virtio16 num_poll_queues;
> > > > > > > > > >      } __attribute__((packed));
> > > > > > > > > Same as any virtio UAPI change, this has to go through the virtio TC.
> > > > > > > > > In particular I don't think gating a new config field on
> > > > > > > > > an existing feature flag is a good idea.
> > > > > > > > Did you mean that the polling should be based on a new feature like
> > > > > > > > "VIRTIO_BLK_F_POLL" and be added at the end of features_legacy[]
> > > > > > > > and features[]? If then, I will add the new feture flag and resend it.
> > > > > > > Isn't there a way in the SPEC today to create a queue without interrupt
> > > > > > > vector ?
> > > > > > It seems that it is not possible to create a queue without interrupt
> > > > > > vector. If it is possible, we can expect more polling improvement.
> > > > Yes, it's possible:
> > > > 
> > > > Writing a valid MSI-X Table entry number, 0 to 0x7FF, to
> > > > \field{config_msix_vector}/\field{queue_msix_vector} maps interrupts triggered
> > > > by the configuration change/selected queue events respectively to
> > > > the corresponding MSI-X vector. To disable interrupts for an
> > > > event type, the driver unmaps this event by writing a special NO_VECTOR
> > > > value:
> > > > 
> > > > \begin{lstlisting}
> > > > /* Vector value used to disable MSI for queue */
> > > > #define VIRTIO_MSI_NO_VECTOR            0xffff
> > > > \end{lstlisting}
> > > > 
> > > > 
> > > > 
> > > > > MST/Jason/Stefan,
> > > > > 
> > > > > can you confirm that please ?
> > > > > 
> > > > > what does VIRTQ_AVAIL_F_NO_INTERRUPT supposed to do ?
> > > > This is a hint to the device not to send interrupts.
> > > Why do you need a hint if the driver implicitly wrote 0xffff to disable MSI
> > > for a virtqueue ?
> > 
> > VIRTIO_MSI_NO_VECTOR is an expensive write into config space, followed
> > by an even more expensive read. Reliable and appropriate if you turn
> > events on/off very rarely.
> > 
> > VIRTQ_AVAIL_F_NO_INTERRUPT is an in-memory write so it's much cheaper,
> > but it's less reliable. Appropriate if you need to turn events on/off a
> > lot.
> 
> An "expensive" operation in the ctrl path during vq creation is fine IMO.

Yes.

> I see that nobody even used VIRTQ_AVAIL_F_NO_INTERRUPT in-memory write in
> Linux.

Because it's called VRING_AVAIL_F_NO_INTERRUPT there.

> > 
> > 
> > > > > > Regards,
> > > > > > Suwan Kim
Suwan Kim March 15, 2022, 1:55 p.m. UTC | #22
On Mon, Mar 14, 2022 at 03:19:01PM +0000, Stefan Hajnoczi wrote:
> On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > This patch supports polling I/O via virtio-blk driver. Polling
> > feature is enabled based on "VIRTIO_BLK_F_MQ" feature and the number
> > of polling queues can be set by QEMU virtio-blk-pci property
> > "num-poll-queues=N". This patch improves the polling I/O throughput
> > and latency.
> > 
> > The virtio-blk driver doesn't not have a poll function and a poll
> > queue and it has been operating in interrupt driven method even if
> > the polling function is called in the upper layer.
> > 
> > virtio-blk polling is implemented upon 'batched completion' of block
> > layer. virtblk_poll() queues completed request to io_comp_batch->req_list
> > and later, virtblk_complete_batch() calls unmap function and ends
> > the requests in batch.
> > 
> > virtio-blk reads the number of queues and poll queues from QEMU
> > virtio-blk-pci properties ("num-queues=N", "num-poll-queues=M").
> > It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
> > as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
> > queues, the poll queues have no callback function.
> > 
> > Regarding HW-SW queue mapping, the default queue mapping uses the
> > existing method that condsiders MSI irq vector. But the poll queue
> > doesn't have an irq, so it uses the regular blk-mq cpu mapping.
> > 
> > To enable poll queues, "num-poll-queues=N" property of virtio-blk-pci
> > needs to be added to QEMU command line. For that, I temporarily
> > implemented the property on QEMU. Please refer to the git repository below.
> > 
> > 	git : https://github.com/asfaca/qemu.git #on master branch commit
> > 
> > For verifying the improvement, I did Fio polling I/O performance test
> > with io_uring engine with the options below.
> > (io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
> > I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
> > queues for VM.
> > (-device virtio-blk-pci,num-queues=4,num-poll-queues=2)
> > As a result, IOPS and average latency improved about 10%.
> > 
> > Test result:
> > 
> > - Fio io_uring poll without virtio-blk poll support
> > 	-- numjobs=1 : IOPS = 297K, avg latency = 214.59us
> > 	-- numjobs=2 : IOPS = 360K, avg latency = 363.88us
> > 	-- numjobs=4 : IOPS = 289K, avg latency = 885.42us
> > 
> > - Fio io_uring poll with virtio-blk poll support
> > 	-- numjobs=1 : IOPS = 332K, avg latency = 192.61us
> > 	-- numjobs=2 : IOPS = 371K, avg latency = 348.31us
> > 	-- numjobs=4 : IOPS = 321K, avg latency = 795.93us
> 
> Last year there was a patch series that switched regular queues into
> polling queues when HIPRI requests were in flight:
> https://lore.kernel.org/linux-block/20210520141305.355961-1-stefanha@redhat.com/T/
> 
> The advantage is that polling is possible without prior device
> configuration, making it easier for users.
> 
> However, the dynamic approach is a bit more complex and bugs can result
> in lost irqs (hung I/O). Christoph Hellwig asked for dedicated polling
> queues, which your patch series now delivers.
> 
> I think your patch series is worth merging once the comments others have
> already made have been addressed. I'll keep an eye out for the VIRTIO
> spec change to extend the virtio-blk configuration space, which needs to
> be accepted before the Linux can be merged.

Thanks for the feedback :)
There's a lot of history... I will try to improve the patch.

It might take some time because it needs more discussion about the QEMU
device property, and I work on this at night.

> > @@ -728,16 +749,82 @@ static const struct attribute_group *virtblk_attr_groups[] = {
> >  static int virtblk_map_queues(struct blk_mq_tag_set *set)
> >  {
> >  	struct virtio_blk *vblk = set->driver_data;
> > +	int i, qoff;
> > +
> > +	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
> > +		struct blk_mq_queue_map *map = &set->map[i];
> > +
> > +		map->nr_queues = vblk->io_queues[i];
> > +		map->queue_offset = qoff;
> > +		qoff += map->nr_queues;
> > +
> > +		if (map->nr_queues == 0)
> > +			continue;
> > +
> > +		if (i == HCTX_TYPE_DEFAULT)
> > +			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
> > +		else
> > +			blk_mq_map_queues(&set->map[i]);
> 
> A comment would be nice here to explain that regular queues have
> interrupts and hence CPU affinity is defined by the core virtio code,
> but polling queues have no interrupts so we let the block layer assign
> CPU affinity.

Okay. I will add the comment in v2.

Regards,
Suwan Kim
Suwan Kim March 15, 2022, 2:43 p.m. UTC | #23
On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> 
> > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > >
> > > On 3/11/2022 11:28 PM, Suwan Kim wrote:
> > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > --- a/include/uapi/linux/virtio_blk.h
> > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > >      * deallocation of one or more of the sectors.
> > > >      */
> > > >     __u8 write_zeroes_may_unmap;
> > > > +   __u8 unused1;
> > > > -   __u8 unused1[3];
> > > > +   __virtio16 num_poll_queues;
> > > >   } __attribute__((packed));
> > >
> > >
> > > This looks like a implementation specific (virtio-blk-pci) optimization, how
> > > about other implementation like vhost-user-blk?
> >
> > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > use vritio_blk_config as kernel-qemu interface?
> >
> 
> Yes, but see below.
> 
> 
> >
> > Does vhost-user-blk need additional modification to support polling
> > in kernel side?
> >
> 
> 
> No, but the issue is, things like polling looks not a good candidate for
> the attributes belonging to the device but the driver. So I have more
> questions:
> 
> 1) what does it really mean for hardware virtio block devices?
> 2) Does driver polling help for the qemu implementation without polling?
> 3) Using blk_config means we can only get the benefit from the new device

1) what does it really mean for hardware virtio block devices?
3) Using blk_config means we can only get the benefit from the new device

This patch adds a dedicated HW queue for polling purposes to the virtio
block device.

So I think it can be a new HW feature. And it can be a new device
that supports a HW poll queue.

BTW, I have another idea about it.

How about adding "num-poll-queues" as a driver parameter,
like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?

That way we don't need to modify virtio_blk_config,
and we can apply the polling feature only to virtio-blk-pci.
But can QEMU pass "num-poll-queues" to the virtio-blk driver parameter?
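A minimal sketch of what I mean, modeled on the NVMe driver (the
parameter name is tentative):

	/* 0 means no dedicated poll queues (polling disabled) */
	static unsigned int num_poll_queues;
	module_param(num_poll_queues, uint, 0444);
	MODULE_PARM_DESC(num_poll_queues,
			 "Number of virtqueues dedicated to polling I/O");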



2) Does driver polling help for the qemu implementation without polling?

Sorry, I didn't understand your question. Could you please explain more about it?

Regards,
Suwan Kim
Michael S. Tsirkin March 15, 2022, 2:53 p.m. UTC | #24
On Tue, Mar 15, 2022 at 11:43:18PM +0900, Suwan Kim wrote:
> On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > 
> > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > >
> > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > >      * deallocation of one or more of the sectors.
> > > > >      */
> > > > >     __u8 write_zeroes_may_unmap;
> > > > > +   __u8 unused1;
> > > > > -   __u8 unused1[3];
> > > > > +   __virtio16 num_poll_queues;
> > > > >   } __attribute__((packed));
> > > >
> > > >
> > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > how about other implementations like vhost-user-blk?
> > >
> > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > use virtio_blk_config as the kernel-qemu interface?
> > >
> > 
> > Yes, but see below.
> > 
> > 
> > >
> > > Does vhost-user-blk need additional modification to support polling
> > > on the kernel side?
> > >
> > 
> > 
> > No, but the issue is that something like polling looks like an attribute
> > of the driver rather than of the device. So I have more
> > questions:
> > 
> > 1) what does it really mean for hardware virtio block devices?
> > 2) Does driver polling help for the qemu implementation without polling?
> > 3) Using blk_config means we can only get the benefit from the new device
> 
> 1) what does it really mean for hardware virtio block devices?
> 3) Using blk_config means we can only get the benefit from the new device
> 
> This patch adds a dedicated HW queue for polling purposes to the virtio
> block device.
>
> So I think it can be a new HW feature. And it can be a new device
> that supports a HW poll queue.
> 
> BTW, I have another idea about it.
>
> How about adding "num-poll-queues" as a driver parameter,
> like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
>
> That way we don't need to modify virtio_blk_config,
> and we can apply the polling feature only to virtio-blk-pci.
> But can QEMU pass "num-poll-queues" to the virtio-blk driver parameter?

Same as any other driver parameter, pass it on the kernel command line.
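E.g., assuming the parameter ends up being called num_poll_queues:

	virtio_blk.num_poll_queues=2

on the guest kernel command line when the driver is built in, or
"modprobe virtio_blk num_poll_queues=2" when it is built as a module.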

> 
> 
> 2) Does driver polling help for the qemu implementation without polling?
> 
> Sorry, I didn't understand your question. Could you please explain more about it?
> 
> Regards,
> Suwan Kim
Jason Wang March 16, 2022, 2:02 a.m. UTC | #25
On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
>
> On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> >
> > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > >
> > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > >      * deallocation of one or more of the sectors.
> > > > >      */
> > > > >     __u8 write_zeroes_may_unmap;
> > > > > +   __u8 unused1;
> > > > > -   __u8 unused1[3];
> > > > > +   __virtio16 num_poll_queues;
> > > > >   } __attribute__((packed));
> > > >
> > > >
> > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > how about other implementations like vhost-user-blk?
> > >
> > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > use virtio_blk_config as the kernel-qemu interface?
> > >
> >
> > Yes, but see below.
> >
> >
> > >
> > > Does vhost-user-blk need additional modification to support polling
> > > on the kernel side?
> > >
> >
> >
> > No, but the issue is that something like polling looks like an attribute
> > of the driver rather than of the device. So I have more
> > questions:
> >
> > 1) what does it really mean for hardware virtio block devices?
> > 2) Does driver polling help for the qemu implementation without polling?
> > 3) Using blk_config means we can only get the benefit from the new device
>
> 1) what does it really mean for hardware virtio block devices?
> 3) Using blk_config means we can only get the benefit from the new device
>
> > This patch adds a dedicated HW queue for polling purposes to the virtio
> > block device.
> >
> > So I think it can be a new HW feature. And it can be a new device
> > that supports a HW poll queue.

One possible issue is that the "poll" looks more like a
software/driver concept than a device/hardware one.

>
> BTW, I have another idea about it.
>
> How about adding "num-poll-queues" as a driver parameter,
> like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?

It should be fine, but we need to listen to others.

>
> That way we don't need to modify virtio_blk_config,
> and we can apply the polling feature only to virtio-blk-pci.
> But can QEMU pass "num-poll-queues" to the virtio-blk driver parameter?

As Michael said, we can leave this to the guest kernel / administrator.

>
>
>
> 2) Does driver polling help for the qemu implementation without polling?
>
> Sorry, I didn't understand your question. Could you please explain more about it?

I mean, does the polling work for an ordinary qemu block device
without busy polling?

Thanks

>
> Regards,
> Suwan Kim
>
Max Gurtovoy March 16, 2022, 11:25 a.m. UTC | #26
On 3/16/2022 4:02 AM, Jason Wang wrote:
> On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
>> On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
>>> On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
>>>
>>>> On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
>>>>> On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
>>>>>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>>>>>> index d888f013d9ff..3fcaf937afe1 100644
>>>>>> --- a/include/uapi/linux/virtio_blk.h
>>>>>> +++ b/include/uapi/linux/virtio_blk.h
>>>>>> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>>>>>>       * deallocation of one or more of the sectors.
>>>>>>       */
>>>>>>      __u8 write_zeroes_may_unmap;
>>>>>> +   __u8 unused1;
>>>>>> -   __u8 unused1[3];
>>>>>> +   __virtio16 num_poll_queues;
>>>>>>    } __attribute__((packed));
>>>>>
>>>>> This looks like an implementation-specific (virtio-blk-pci) optimization;
>>>>> how about other implementations like vhost-user-blk?
>>>> I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
>>>> use virtio_blk_config as the kernel-qemu interface?
>>>>
>>> Yes, but see below.
>>>
>>>
>>>> Does vhost-user-blk need additional modification to support polling
>>>> on the kernel side?
>>>>
>>>
>>> No, but the issue is that something like polling looks like an attribute
>>> of the driver rather than of the device. So I have more
>>> questions:
>>>
>>> 1) what does it really mean for hardware virtio block devices?
>>> 2) Does driver polling help for the qemu implementation without polling?
>>> 3) Using blk_config means we can only get the benefit from the new device
>> 1) what does it really mean for hardware virtio block devices?
>> 3) Using blk_config means we can only get the benefit from the new device
>>
>> This patch adds a dedicated HW queue for polling purposes to the virtio
>> block device.
>>
>> So I think it can be a new HW feature. And it can be a new device
>> that supports a HW poll queue.
> One possible issue is that the "poll" looks more like a
> software/driver concept than a device/hardware one.

Agreed. The device/spec should give the possibility to create virtqueues
with or without IRQs, and it does.

The driver should use this capability.

I don't see any change in the virtio blk config space needed.
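The patch in fact already relies on this when it sets up the vqs: passing
a NULL callback to virtio_find_vqs() leaves that queue without an
interrupt:

	for (i = 0; i < num_vqs; i++) {
		if (i < num_vqs - num_poll_vqs) {
			callbacks[i] = virtblk_done;	/* irq-driven queue */
			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
		} else {
			callbacks[i] = NULL;		/* poll queue, no irq */
			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
		}
		names[i] = vblk->vqs[i].name;
	}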

>
>> BTW, I have another idea about it.
>>
>> How about adding "num-poll-queues" as a driver parameter,
>> like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
> It should be fine, but we need to listen to others.
>
>> That way we don't need to modify virtio_blk_config,
>> and we can apply the polling feature only to virtio-blk-pci.
>> But can QEMU pass "num-poll-queues" to the virtio-blk driver parameter?
> As Michael said, we can leave this to the guest kernel / administrator.
>
>>
>>
>> 2) Does driver polling help for the qemu implementation without polling?
>>
>> Sorry, I didn't understand your question. Could you please explain more about it?
> I mean, does the polling work for an ordinary qemu block device
> without busy polling?
>
> Thanks
>
>> Regards,
>> Suwan Kim
>>
Stefan Hajnoczi March 16, 2022, 12:20 p.m. UTC | #27
On Tue, Mar 15, 2022 at 10:55:04PM +0900, Suwan Kim wrote:
> On Mon, Mar 14, 2022 at 03:19:01PM +0000, Stefan Hajnoczi wrote:
> > On Sat, Mar 12, 2022 at 12:28:32AM +0900, Suwan Kim wrote:
> > > This patch supports polling I/O via virtio-blk driver. Polling
> > > feature is enabled based on "VIRTIO_BLK_F_MQ" feature and the number
> > > of polling queues can be set by QEMU virtio-blk-pci property
> > > "num-poll-queues=N". This patch improves the polling I/O throughput
> > > and latency.
> > > 
> > > The virtio-blk driver does not have a poll function or a poll
> > > queue, and it has been operating in an interrupt-driven manner even if
> > > the polling function is called in the upper layer.
> > > 
> > > virtio-blk polling is implemented upon 'batched completion' of block
> > > layer. virtblk_poll() queues completed request to io_comp_batch->req_list
> > > and later, virtblk_complete_batch() calls unmap function and ends
> > > the requests in batch.
> > > 
> > > virtio-blk reads the number of queues and poll queues from QEMU
> > > virtio-blk-pci properties ("num-queues=N", "num-poll-queues=M").
> > > It allocates N virtqueues to virtio_blk->vqs[N] and it uses [0..(N-M-1)]
> > > as default queues and [(N-M)..(N-1)] as poll queues. Unlike the default
> > > queues, the poll queues have no callback function.
> > > 
> > > Regarding HW-SW queue mapping, the default queue mapping uses the
> > > existing method that considers the MSI irq vector. But the poll queue
> > > doesn't have an irq, so it uses the regular blk-mq cpu mapping.
> > > 
> > > To enable poll queues, "num-poll-queues=N" property of virtio-blk-pci
> > > needs to be added to QEMU command line. For that, I temporarily
> > > implemented the property on QEMU. Please refer to the git repository below.
> > > 
> > > 	git : https://github.com/asfaca/qemu.git #on master branch commit
> > > 
> > > For verifying the improvement, I did Fio polling I/O performance test
> > > with io_uring engine with the options below.
> > > (io_uring, hipri, randread, direct=1, bs=512, iodepth=64 numjobs=N)
> > > I set 4 vcpu and 4 virtio-blk queues - 2 default queues and 2 poll
> > > queues for VM.
> > > (-device virtio-blk-pci,num-queues=4,num-poll-queues=2)
> > > As a result, IOPS and average latency improved about 10%.
> > > 
> > > Test result:
> > > 
> > > - Fio io_uring poll without virtio-blk poll support
> > > 	-- numjobs=1 : IOPS = 297K, avg latency = 214.59us
> > > 	-- numjobs=2 : IOPS = 360K, avg latency = 363.88us
> > > 	-- numjobs=4 : IOPS = 289K, avg latency = 885.42us
> > > 
> > > - Fio io_uring poll with virtio-blk poll support
> > > 	-- numjobs=1 : IOPS = 332K, avg latency = 192.61us
> > > 	-- numjobs=2 : IOPS = 371K, avg latency = 348.31us
> > > 	-- numjobs=4 : IOPS = 321K, avg latency = 795.93us
> > 
> > Last year there was a patch series that switched regular queues into
> > polling queues when HIPRI requests were in flight:
> > https://lore.kernel.org/linux-block/20210520141305.355961-1-stefanha@redhat.com/T/
> > 
> > The advantage is that polling is possible without prior device
> > configuration, making it easier for users.
> > 
> > However, the dynamic approach is a bit more complex and bugs can result
> > in lost irqs (hung I/O). Christoph Hellwig asked for dedicated polling
> > queues, which your patch series now delivers.
> > 
> > I think your patch series is worth merging once the comments others have
> > already made have been addressed. I'll keep an eye out for the VIRTIO
> > spec change to extend the virtio-blk configuration space, which needs to
> > be accepted before the Linux side can be merged.
> 
> Thanks for the feedback :)
> There's a lot of history... I will try to improve the patch.
> 
> It might take some time because it needs more discussion about the QEMU
> device property, and I do this in my free time at night.

I see, it's great that you're making this contribution. Don't worry
about the old patch series I linked. I think your approach is fine.

Stefan
Suwan Kim March 16, 2022, 1:32 p.m. UTC | #28
On Wed, Mar 16, 2022 at 10:02:13AM +0800, Jason Wang wrote:
> On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> >
> > On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > >
> > > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > > >
> > > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > >      * deallocation of one or more of the sectors.
> > > > > >      */
> > > > > >     __u8 write_zeroes_may_unmap;
> > > > > > +   __u8 unused1;
> > > > > > -   __u8 unused1[3];
> > > > > > +   __virtio16 num_poll_queues;
> > > > > >   } __attribute__((packed));
> > > > >
> > > > >
> > > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > > how about other implementations like vhost-user-blk?
> > > >
> > > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > > use virtio_blk_config as the kernel-qemu interface?
> > > >
> > >
> > > Yes, but see below.
> > >
> > >
> > > >
> > > > Does vhost-user-blk need additional modification to support polling
> > > > on the kernel side?
> > > >
> > >
> > >
> > > No, but the issue is that something like polling looks like an attribute
> > > of the driver rather than of the device. So I have more
> > > questions:
> > >
> > > 1) what does it really mean for hardware virtio block devices?
> > > 2) Does driver polling help for the qemu implementation without polling?
> > > 3) Using blk_config means we can only get the benefit from the new device
> >
> > 1) what does it really mean for hardware virtio block devices?
> > 3) Using blk_config means we can only get the benefit from the new device
> >
> > This patch adds a dedicated HW queue for polling purposes to the virtio
> > block device.
> >
> > So I think it can be a new HW feature. And it can be a new device
> > that supports a HW poll queue.
> 
> One possible issue is that the "poll" looks more like a
> software/driver concept than a device/hardware one.
> 
> >
> > BTW, I have another idea about it.
> >
> > How about adding "num-poll-queues" as a driver parameter,
> > like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
> 
> It should be fine, but we need to listen to others.
> 
> >
> > That way we don't need to modify virtio_blk_config,
> > and we can apply the polling feature only to virtio-blk-pci.
> > But can QEMU pass "num-poll-queues" to the virtio-blk driver parameter?
> 
> As Michael said, we can leave this to the guest kernel / administrator.
> 
> >
> >
> >
> > 2) Does driver polling help for the qemu implementation without polling?
> >
> > Sorry, I didn't understand your question. Could you please explain more about it?
> 
> I mean, does the polling work for an ordinary qemu block device
> without busy polling?
 

Yes. The test result in the patch description was obtained on normal
qemu without polling. QEMU just passes 'num-poll-queues' to the guest
without any additional settings.
It improves guest I/O performance.
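For reference, the guest-side fio command was along these lines (the
device path is just an example):

	fio --name=test --ioengine=io_uring --hipri --direct=1 \
	    --rw=randread --bs=512 --iodepth=64 --numjobs=2 \
	    --filename=/dev/vda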

Regards,
Suwan Kim
Suwan Kim March 16, 2022, 3:32 p.m. UTC | #29
On Wed, Mar 16, 2022 at 10:02:13AM +0800, Jason Wang wrote:
> On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> >
> > On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > >
> > > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > > >
> > > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > >      * deallocation of one or more of the sectors.
> > > > > >      */
> > > > > >     __u8 write_zeroes_may_unmap;
> > > > > > +   __u8 unused1;
> > > > > > -   __u8 unused1[3];
> > > > > > +   __virtio16 num_poll_queues;
> > > > > >   } __attribute__((packed));
> > > > >
> > > > >
> > > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > > how about other implementations like vhost-user-blk?
> > > >
> > > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > > use virtio_blk_config as the kernel-qemu interface?
> > > >
> > >
> > > Yes, but see below.
> > >
> > >
> > > >
> > > > Does vhost-user-blk need additional modification to support polling
> > > > on the kernel side?
> > > >
> > >
> > >
> > > No, but the issue is that something like polling looks like an attribute
> > > of the driver rather than of the device. So I have more
> > > questions:
> > >
> > > 1) what does it really mean for hardware virtio block devices?
> > > 2) Does driver polling help for the qemu implementation without polling?
> > > 3) Using blk_config means we can only get the benefit from the new device
> >
> > 1) what does it really mean for hardware virtio block devices?
> > 3) Using blk_config means we can only get the benefit from the new device
> >
> > This patch adds a dedicated HW queue for polling purposes to the virtio
> > block device.
> >
> > So I think it can be a new HW feature. And it can be a new device
> > that supports a HW poll queue.
> 
> One possible issue is that the "poll" looks more like a
> software/driver concept than a device/hardware one.
> 
> >
> > BTW, I have another idea about it.
> >
> > How about adding "num-poll-queues" as a driver parameter,
> > like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
> 
> It should be fine, but we need to listen to others.

To Michael, Stefan, Max

How about using a driver parameter instead of virtio_blk_config?

Regards,
Suwan Kim
Max Gurtovoy March 16, 2022, 3:36 p.m. UTC | #30
On 3/16/2022 5:32 PM, Suwan Kim wrote:
> On Wed, Mar 16, 2022 at 10:02:13AM +0800, Jason Wang wrote:
>> On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
>>> On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
>>>> On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
>>>>
>>>>> On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
>>>>>> On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
>>>>>>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>>>>>>> index d888f013d9ff..3fcaf937afe1 100644
>>>>>>> --- a/include/uapi/linux/virtio_blk.h
>>>>>>> +++ b/include/uapi/linux/virtio_blk.h
>>>>>>> @@ -119,8 +119,9 @@ struct virtio_blk_config {
>>>>>>>       * deallocation of one or more of the sectors.
>>>>>>>       */
>>>>>>>      __u8 write_zeroes_may_unmap;
>>>>>>> +   __u8 unused1;
>>>>>>> -   __u8 unused1[3];
>>>>>>> +   __virtio16 num_poll_queues;
>>>>>>>    } __attribute__((packed));
>>>>>>
>>>>>> This looks like an implementation-specific (virtio-blk-pci) optimization;
>>>>>> how about other implementations like vhost-user-blk?
>>>>> I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
>>>>> use virtio_blk_config as the kernel-qemu interface?
>>>>>
>>>> Yes, but see below.
>>>>
>>>>
>>>>> Does vhost-user-blk need additional modification to support polling
>>>>> on the kernel side?
>>>>>
>>>>
>>>> No, but the issue is, things like polling looks not a good candidate for
>>>> the attributes belonging to the device but the driver. So I have more
>>>> questions:
>>>>
>>>> 1) what does it really mean for hardware virtio block devices?
>>>> 2) Does driver polling help for the qemu implementation without polling?
>>>> 3) Using blk_config means we can only get the benefit from the new device
>>> 1) what does it really mean for hardware virtio block devices?
>>> 3) Using blk_config means we can only get the benefit from the new device
>>>
>>> This patch adds a dedicated HW queue for polling purposes to the virtio
>>> block device.
>>>
>>> So I think it can be a new HW feature. And it can be a new device
>>> that supports a HW poll queue.
>> One possible issue is that the "poll" looks more like a
>> software/driver concept than a device/hardware one.
>>
>>> BTW, I have another idea about it.
>>>
>>> How about adding "num-poll-queues" as a driver parameter,
>>> like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
>> It should be fine, but we need to listen to others.
> To Michael, Stefan, Max
>
> How about using a driver parameter instead of virtio_blk_config?

Yes. I mentioned that virtio_blk_config shouldn't change.

The spec allows creating a virtq with no interrupts.

As agreed: polling logic is orthogonal to batching logic (.queue_rqs).

>
> Regards,
> Suwan Kim
Stefan Hajnoczi March 16, 2022, 4 p.m. UTC | #31
On Wed, Mar 16, 2022 at 05:36:20PM +0200, Max Gurtovoy wrote:
> 
> On 3/16/2022 5:32 PM, Suwan Kim wrote:
> > On Wed, Mar 16, 2022 at 10:02:13AM +0800, Jason Wang wrote:
> > > On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > > > On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > > > > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > > > > 
> > > > > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > > > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > > >       * deallocation of one or more of the sectors.
> > > > > > > >       */
> > > > > > > >      __u8 write_zeroes_may_unmap;
> > > > > > > > +   __u8 unused1;
> > > > > > > > -   __u8 unused1[3];
> > > > > > > > +   __virtio16 num_poll_queues;
> > > > > > > >    } __attribute__((packed));
> > > > > > > 
> > > > > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > > > > how about other implementations like vhost-user-blk?
> > > > > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > > > > use virtio_blk_config as the kernel-qemu interface?
> > > > > > 
> > > > > Yes, but see below.
> > > > > 
> > > > > 
> > > > > > Does vhost-user-blk need additional modification to support polling
> > > > > > on the kernel side?
> > > > > > 
> > > > > 
> > > > > No, but the issue is that something like polling looks like an attribute
> > > > > of the driver rather than of the device. So I have more
> > > > > questions:
> > > > > 
> > > > > 1) what does it really mean for hardware virtio block devices?
> > > > > 2) Does driver polling help for the qemu implementation without polling?
> > > > > 3) Using blk_config means we can only get the benefit from the new device
> > > > 1) what does it really mean for hardware virtio block devices?
> > > > 3) Using blk_config means we can only get the benefit from the new device
> > > > 
> > > > This patch adds a dedicated HW queue for polling purposes to the virtio
> > > > block device.
> > > >
> > > > So I think it can be a new HW feature. And it can be a new device
> > > > that supports a HW poll queue.
> > > One possible issue is that the "poll" looks more like a
> > > software/driver concept than a device/hardware one.
> > > 
> > > > BTW, I have another idea about it.
> > > >
> > > > How about adding "num-poll-queues" as a driver parameter,
> > > > like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
> > > It should be fine, but we need to listen to others.
> > To Michael, Stefan, Max
> > 
> > How about using a driver parameter instead of virtio_blk_config?
> 
> Yes. I mentioned that virtio_blk_config shouldn't change.

There are pros and cons to module parameters and configuration space
fields. Both are valid but feel free to drop the configuration space
field.

One note about module parameters: if the guest has a virtio-blk device
with a root file system and another one for benchmarking then it's
unfortunate that the number of poll queues module parameter affects both
devices.

Is there a way to set a module parameter for a specific device instance?
I guess you'd need something else like a sysfs attribute instead but the
implementation is more complex.
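(Just to sketch it, with all names hypothetical: a read-only per-device
attribute would be simple enough,

	static ssize_t poll_queues_show(struct device *dev,
					struct device_attribute *attr, char *buf)
	{
		struct gendisk *disk = dev_to_disk(dev);
		struct virtio_blk *vblk = disk->private_data;

		return sysfs_emit(buf, "%d\n", vblk->io_queues[HCTX_TYPE_POLL]);
	}
	static DEVICE_ATTR_RO(poll_queues);

but making it writable before the tag set is allocated is where the
complexity comes in.)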

I'm fine with a module parameter.

Stefan
Jason Wang March 17, 2022, 2:20 a.m. UTC | #32
On Wed, Mar 16, 2022 at 11:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
>
> On Wed, Mar 16, 2022 at 10:02:13AM +0800, Jason Wang wrote:
> > On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > >
> > > On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > > > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > > >
> > > > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > > > >
> > > > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > >      * deallocation of one or more of the sectors.
> > > > > > >      */
> > > > > > >     __u8 write_zeroes_may_unmap;
> > > > > > > +   __u8 unused1;
> > > > > > > -   __u8 unused1[3];
> > > > > > > +   __virtio16 num_poll_queues;
> > > > > > >   } __attribute__((packed));
> > > > > >
> > > > > >
> > > > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > > > how about other implementations like vhost-user-blk?
> > > > >
> > > > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > > > use virtio_blk_config as the kernel-qemu interface?
> > > > >
> > > >
> > > > Yes, but see below.
> > > >
> > > >
> > > > >
> > > > > Does vhost-user-blk need additional modification to support polling
> > > > > on the kernel side?
> > > > >
> > > >
> > > >
> > > > No, but the issue is that something like polling looks like an attribute
> > > > of the driver rather than of the device. So I have more
> > > > questions:
> > > >
> > > > 1) what does it really mean for hardware virtio block devices?
> > > > 2) Does driver polling help for the qemu implementation without polling?
> > > > 3) Using blk_config means we can only get the benefit from the new device
> > >
> > > 1) what does it really mean for hardware virtio block devices?
> > > 3) Using blk_config means we can only get the benefit from the new device
> > >
> > > This patch adds a dedicated HW queue for polling purposes to the virtio
> > > block device.
> > >
> > > So I think it can be a new HW feature. And it can be a new device
> > > that supports a HW poll queue.
> >
> > One possible issue is that the "poll" looks more like a
> > software/driver concept than a device/hardware one.
> >
> > >
> > > BTW, I have another idea about it.
> > >
> > > How about adding "num-poll-queues" as a driver parameter,
> > > like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
> >
> > It should be fine, but we need to listen to others.
>
> To Michael, Stefan, Max
>
> How about using a driver parameter instead of virtio_blk_config?

I agree.

Thanks

>
> Regards,
> Suwan Kim
>
Suwan Kim March 17, 2022, 3:03 p.m. UTC | #33
On Thu, Mar 17, 2022 at 10:20:05AM +0800, Jason Wang wrote:
> On Wed, Mar 16, 2022 at 11:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> >
> > On Wed, Mar 16, 2022 at 10:02:13AM +0800, Jason Wang wrote:
> > > On Tue, Mar 15, 2022 at 10:43 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > > >
> > > > On Tue, Mar 15, 2022 at 04:59:23PM +0800, Jason Wang wrote:
> > > > > On Mon, Mar 14, 2022 at 8:33 PM Suwan Kim <suwan.kim027@gmail.com> wrote:
> > > > >
> > > > > > On Mon, Mar 14, 2022 at 02:14:53PM +0800, Jason Wang wrote:
> > > > > > >
> > > > > > > On 2022/3/11 at 11:28 PM, Suwan Kim wrote:
> > > > > > > > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
> > > > > > > > index d888f013d9ff..3fcaf937afe1 100644
> > > > > > > > --- a/include/uapi/linux/virtio_blk.h
> > > > > > > > +++ b/include/uapi/linux/virtio_blk.h
> > > > > > > > @@ -119,8 +119,9 @@ struct virtio_blk_config {
> > > > > > > >      * deallocation of one or more of the sectors.
> > > > > > > >      */
> > > > > > > >     __u8 write_zeroes_may_unmap;
> > > > > > > > +   __u8 unused1;
> > > > > > > > -   __u8 unused1[3];
> > > > > > > > +   __virtio16 num_poll_queues;
> > > > > > > >   } __attribute__((packed));
> > > > > > >
> > > > > > >
> > > > > > > This looks like an implementation-specific (virtio-blk-pci) optimization;
> > > > > > > how about other implementations like vhost-user-blk?
> > > > > >
> > > > > > I didn’t consider vhost-user-blk yet. But does vhost-user-blk also
> > > > > > use virtio_blk_config as the kernel-qemu interface?
> > > > > >
> > > > >
> > > > > Yes, but see below.
> > > > >
> > > > >
> > > > > >
> > > > > > Does vhost-user-blk need additional modification to support polling
> > > > > > on the kernel side?
> > > > > >
> > > > >
> > > > >
> > > > > No, but the issue is that something like polling looks like an attribute
> > > > > of the driver rather than of the device. So I have more
> > > > > questions:
> > > > >
> > > > > 1) what does it really mean for hardware virtio block devices?
> > > > > 2) Does driver polling help for the qemu implementation without polling?
> > > > > 3) Using blk_config means we can only get the benefit from the new device
> > > >
> > > > 1) what does it really mean for hardware virtio block devices?
> > > > 3) Using blk_config means we can only get the benefit from the new device
> > > >
> > > > This patch adds a dedicated HW queue for polling purposes to the virtio
> > > > block device.
> > > >
> > > > So I think it can be a new HW feature. And it can be a new device
> > > > that supports a HW poll queue.
> > >
> > > One possible issue is that the "poll" looks more like a
> > > software/driver concept than a device/hardware one.
> > >
> > > >
> > > > BTW, I have another idea about it.
> > > >
> > > > How about adding "num-poll-queues" as a driver parameter,
> > > > like the NVMe driver does, instead of as a QEMU virtio-blk-pci property?
> > >
> > > It should be fine, but we need to listen to others.
> >
> > To Michael, Stefan, Max
> >
> > How about using a driver parameter instead of virtio_blk_config?
> 
> I agree.
> 
> Thanks
 
Okay. Then I will send v2 with the .queue_rqs patch.
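The ops table would then grow roughly like this, with virtblk_queue_rqs
being the new batch submission callback (details to come with v2):

	static const struct blk_mq_ops virtio_mq_ops = {
		.queue_rq	= virtio_queue_rq,
		.queue_rqs	= virtblk_queue_rqs,	/* new in v2 */
		.commit_rqs	= virtio_commit_rqs,
		.init_hctx	= virtblk_init_hctx,
		.complete	= virtblk_request_done,
		.map_queues	= virtblk_map_queues,
		.poll		= virtblk_poll,
	};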

Regards,
Suwan Kim
diff mbox series

Patch

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8c415be86732..bfde7d97d528 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -81,6 +81,7 @@  struct virtio_blk {
 
 	/* num of vqs */
 	int num_vqs;
+	int io_queues[HCTX_MAX_TYPES];
 	struct virtio_blk_vq *vqs;
 };
 
@@ -548,6 +549,7 @@  static int init_vq(struct virtio_blk *vblk)
 	const char **names;
 	struct virtqueue **vqs;
 	unsigned short num_vqs;
+	unsigned short num_poll_vqs;
 	struct virtio_device *vdev = vblk->vdev;
 	struct irq_affinity desc = { 0, };
 
@@ -556,6 +558,13 @@  static int init_vq(struct virtio_blk *vblk)
 				   &num_vqs);
 	if (err)
 		num_vqs = 1;
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+				struct virtio_blk_config, num_poll_queues,
+				&num_poll_vqs);
+	if (err)
+		num_poll_vqs = 0;
+
 	if (!err && !num_vqs) {
 		dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
 		return -EINVAL;
@@ -565,6 +574,13 @@  static int init_vq(struct virtio_blk *vblk)
 			min_not_zero(num_request_queues, nr_cpu_ids),
 			num_vqs);
 
+	num_poll_vqs = min_t(unsigned int, num_poll_vqs, num_vqs - 1);
+
+	memset(vblk->io_queues, 0, sizeof(int) * HCTX_MAX_TYPES);
+	vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
+	vblk->io_queues[HCTX_TYPE_READ] = 0;
+	vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;
+
 	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
 	if (!vblk->vqs)
 		return -ENOMEM;
@@ -578,8 +594,13 @@  static int init_vq(struct virtio_blk *vblk)
 	}
 
 	for (i = 0; i < num_vqs; i++) {
-		callbacks[i] = virtblk_done;
-		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+		if (i < num_vqs - num_poll_vqs) {
+			callbacks[i] = virtblk_done;
+			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+		} else {
+			callbacks[i] = NULL;
+			snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
+		}
 		names[i] = vblk->vqs[i].name;
 	}
 
@@ -728,16 +749,82 @@  static const struct attribute_group *virtblk_attr_groups[] = {
 static int virtblk_map_queues(struct blk_mq_tag_set *set)
 {
 	struct virtio_blk *vblk = set->driver_data;
+	int i, qoff;
+
+	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		map->nr_queues = vblk->io_queues[i];
+		map->queue_offset = qoff;
+		qoff += map->nr_queues;
+
+		if (map->nr_queues == 0)
+			continue;
+
+		if (i == HCTX_TYPE_DEFAULT)
+			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
+		else
+			blk_mq_map_queues(&set->map[i]);
+	}
+
+	return 0;
+}
+
+static void virtblk_complete_batch(struct io_comp_batch *iob)
+{
+	struct request *req;
+	struct virtblk_req *vbr;
 
-	return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
-					vblk->vdev, 0);
+	rq_list_for_each(&iob->req_list, req) {
+		vbr = blk_mq_rq_to_pdu(req);
+		virtblk_unmap_data(req, vbr);
+		virtblk_cleanup_cmd(req);
+	}
+	blk_mq_end_request_batch(iob);
+}
+
+static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+	struct virtio_blk_vq *vq = hctx->driver_data;
+	struct virtblk_req *vbr;
+	unsigned long flags;
+	unsigned int len;
+	int found = 0;
+
+	spin_lock_irqsave(&vq->lock, flags);
+
+	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
+		struct request *req = blk_mq_rq_from_pdu(vbr);
+
+		found++;
+		if (!blk_mq_add_to_batch(req, iob, virtblk_result(vbr),
+						virtblk_complete_batch))
+			blk_mq_complete_request(req);
+	}
+
+	spin_unlock_irqrestore(&vq->lock, flags);
+
+	return found;
+}
+
+static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int hctx_idx)
+{
+	struct virtio_blk *vblk = data;
+	struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx];
+
+	WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags);
+	hctx->driver_data = vq;
+	return 0;
 }
 
 static const struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
 	.commit_rqs	= virtio_commit_rqs,
+	.init_hctx	= virtblk_init_hctx,
 	.complete	= virtblk_request_done,
 	.map_queues	= virtblk_map_queues,
+	.poll		= virtblk_poll,
 };
 
 static unsigned int virtblk_queue_depth;
@@ -816,6 +903,9 @@  static int virtblk_probe(struct virtio_device *vdev)
 		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
 	vblk->tag_set.driver_data = vblk;
 	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
+	vblk->tag_set.nr_maps = 1;
+	if (vblk->io_queues[HCTX_TYPE_POLL])
+		vblk->tag_set.nr_maps = 3;
 
 	err = blk_mq_alloc_tag_set(&vblk->tag_set);
 	if (err)
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index d888f013d9ff..3fcaf937afe1 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -119,8 +119,9 @@  struct virtio_blk_config {
 	 * deallocation of one or more of the sectors.
 	 */
 	__u8 write_zeroes_may_unmap;
+	__u8 unused1;
 
-	__u8 unused1[3];
+	__virtio16 num_poll_queues;
 } __attribute__((packed));
 
 /*