diff mbox series

[V4,4/6] vhost_vdpa: implement IRQ offloading in vhost_vdpa

Message ID 20200728042405.17579-5-lingshan.zhu@intel.com (mailing list archive)
State New, archived
Headers show
Series IRQ offloading for vDPA | expand

Commit Message

Zhu, Lingshan July 28, 2020, 4:24 a.m. UTC
This patch introduces a set of functions to setup/unsetup
and update irq offloading, respectively by registering/unregistering
and re-registering the irq_bypass_producer.

With these functions, this commit can setup/unsetup
irq offloading when DRIVER_OK/!DRIVER_OK is set, and
update irq offloading through SET_VRING_CALL.

Signed-off-by: Zhu Lingshan <lingshan.zhu@intel.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/Kconfig |  1 +
 drivers/vhost/vdpa.c  | 79 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 79 insertions(+), 1 deletion(-)
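
For context, the setup path in this patch relies on the get_vq_irq
config op added earlier in this series. A parent driver implementation
might look like the following sketch (modeled on the IFCVF patches in
this series; the vdpa_to_vf() helper and the field names are
assumptions):

	/* Sketch only; helper and field names are illustrative. */
	static int ifcvf_vdpa_get_vq_irq(struct vdpa_device *vdpa_dev, u16 qid)
	{
		struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);

		/* a negative errno means no usable irq for this vq */
		return vf->vring[qid].irq;
	}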

Comments

Jason Wang July 28, 2020, 7:53 a.m. UTC | #1
On 2020/7/28 12:24 PM, Zhu Lingshan wrote:
> This patch introduces a set of functions to setup/unsetup
> and update irq offloading, respectively by registering/unregistering
> and re-registering the irq_bypass_producer.
>
> With these functions, this commit can setup/unsetup
> irq offloading when DRIVER_OK/!DRIVER_OK is set, and
> update irq offloading through SET_VRING_CALL.
>
> Signed-off-by: Zhu Lingshan <lingshan.zhu@intel.com>
> Suggested-by: Jason Wang <jasowang@redhat.com>
> ---
>   drivers/vhost/Kconfig |  1 +
>   drivers/vhost/vdpa.c  | 79 ++++++++++++++++++++++++++++++++++++++++++-
>   2 files changed, 79 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index d3688c6afb87..587fbae06182 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -65,6 +65,7 @@ config VHOST_VDPA
>   	tristate "Vhost driver for vDPA-based backend"
>   	depends on EVENTFD
>   	select VHOST
> +	select IRQ_BYPASS_MANAGER
>   	depends on VDPA
>   	help
>   	  This kernel module can be loaded in host kernel to accelerate
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index df3cf386b0cd..1dccced321f8 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
> @@ -115,6 +115,55 @@ static irqreturn_t vhost_vdpa_config_cb(void *private)
>   	return IRQ_HANDLED;
>   }
>   
> +static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, int qid)
> +{
> +	struct vhost_virtqueue *vq = &v->vqs[qid];
> +	const struct vdpa_config_ops *ops = v->vdpa->config;
> +	struct vdpa_device *vdpa = v->vdpa;
> +	int ret, irq;
> +
> +	spin_lock(&vq->call_ctx.ctx_lock);
> +	irq = ops->get_vq_irq(vdpa, qid);
> +	if (!vq->call_ctx.ctx || irq == -EINVAL) {


It's better to check the returned irq with "irq < 0" to be more robust;
forcing a specific errno value is not good.
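
Something like this (a minimal sketch of the suggested check, against
the same function as in the diff above):

	irq = ops->get_vq_irq(vdpa, qid);
	if (!vq->call_ctx.ctx || irq < 0) {	/* any errno, not just -EINVAL */
		spin_unlock(&vq->call_ctx.ctx_lock);
		return;
	}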


> +		spin_unlock(&vq->call_ctx.ctx_lock);
> +		return;
> +	}
> +
> +	vq->call_ctx.producer.token = vq->call_ctx.ctx;
> +	vq->call_ctx.producer.irq = irq;
> +	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
> +	spin_unlock(&vq->call_ctx.ctx_lock);
> +}
> +
> +static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, int qid)
> +{
> +	struct vhost_virtqueue *vq = &v->vqs[qid];
> +
> +	spin_lock(&vq->call_ctx.ctx_lock);
> +	irq_bypass_unregister_producer(&vq->call_ctx.producer);
> +	spin_unlock(&vq->call_ctx.ctx_lock);
> +}
> +
> +static void vhost_vdpa_update_vq_irq(struct vhost_virtqueue *vq)
> +{
> +	spin_lock(&vq->call_ctx.ctx_lock);
> +	/*
> +	 * If it has a non-zero irq, it means there is a
> +	 * previously registered irq_bypass_producer;
> +	 * we should update it when ctx (its token)
> +	 * changes.
> +	 */
> +	if (!vq->call_ctx.producer.irq) {
> +		spin_unlock(&vq->call_ctx.ctx_lock);
> +		return;
> +	}
> +
> +	irq_bypass_unregister_producer(&vq->call_ctx.producer);
> +	vq->call_ctx.producer.token = vq->call_ctx.ctx;
> +	irq_bypass_register_producer(&vq->call_ctx.producer);
> +	spin_unlock(&vq->call_ctx.ctx_lock);
> +}
> +
>   static void vhost_vdpa_reset(struct vhost_vdpa *v)
>   {
>   	struct vdpa_device *vdpa = v->vdpa;
> @@ -155,11 +204,15 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
>   {
>   	struct vdpa_device *vdpa = v->vdpa;
>   	const struct vdpa_config_ops *ops = vdpa->config;
> -	u8 status;
> +	u8 status, status_old;
> +	int i, nvqs;
>   
>   	if (copy_from_user(&status, statusp, sizeof(status)))
>   		return -EFAULT;
>   
> +	status_old = ops->get_status(vdpa);
> +	nvqs = v->nvqs;
> +
>   	/*
>   	 * Userspace shouldn't remove status bits unless reset the
>   	 * status to 0.
> @@ -167,6 +220,15 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
>   	if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
>   		return -EINVAL;
>   
> +	/* vq irq is not expected to be changed once DRIVER_OK is set */


So this basically limits the usage of get_vq_irq() to the context of
vhost_vdpa_set_status() and other vDPA bus drivers' set_status(). If
this is true, there's no need to introduce any new config ops at all;
just let set_status() return the irqs used by the device. Or, if we
want this to be more generic, we need vDPA's own irq manager (which
would be similar to the irq bypass manager). That is:

- bus driver can register itself as consumer
- vDPA device driver can register itself as producer
- matching via queue index
- deal with registering/unregistering of consumer/producer

Then there's no need to care when or where the vDPA device driver
allocates the irq, and we don't need to care in which context the vDPA
bus driver can use the irq. Either side gets notified when the other
side goes away (though we probably only care about the producer going
away).
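
As a purely hypothetical sketch (none of these names exist in the
kernel), the registration interface of such a manager could look like:

	/* Hypothetical vDPA irq manager; all names are illustrative. */
	struct vdpa_vq_irq_producer {
		struct vdpa_device *vdev;
		u16 qid;	/* matched per vDPA device, by queue index */
		int irq;
	};

	struct vdpa_vq_irq_consumer {
		struct vdpa_device *vdev;
		u16 qid;
		/* called when a matching producer comes or goes */
		void (*producer_add)(struct vdpa_vq_irq_consumer *c, int irq);
		void (*producer_del)(struct vdpa_vq_irq_consumer *c);
	};

	int vdpa_vq_irq_register_producer(struct vdpa_vq_irq_producer *p);
	void vdpa_vq_irq_unregister_producer(struct vdpa_vq_irq_producer *p);
	int vdpa_vq_irq_register_consumer(struct vdpa_vq_irq_consumer *c);
	void vdpa_vq_irq_unregister_consumer(struct vdpa_vq_irq_consumer *c);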

Any thought on this?

Thanks


> +	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
> +		for (i = 0; i < nvqs; i++)
> +			vhost_vdpa_setup_vq_irq(v, i);
> +
> +	if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK))
> +		for (i = 0; i < nvqs; i++)
> +			vhost_vdpa_unsetup_vq_irq(v, i);
> +
>   	ops->set_status(vdpa, status);
>   
>   	return 0;
> @@ -332,6 +394,7 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
>   
>   	return 0;
>   }
> +
>   static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
>   				   void __user *argp)
>   {
> @@ -390,6 +453,7 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
>   			cb.private = NULL;
>   		}
>   		ops->set_vq_cb(vdpa, idx, &cb);
> +		vhost_vdpa_update_vq_irq(vq);
>   		break;
>   
>   	case VHOST_SET_VRING_NUM:
> @@ -765,6 +829,18 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
>   	return r;
>   }
>   
> +static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i;
> +
> +	for (i = 0; i < v->nvqs; i++) {
> +		vq = &v->vqs[i];
> +		if (vq->call_ctx.producer.irq)
> +			irq_bypass_unregister_producer(&vq->call_ctx.producer);
> +	}
> +}
> +
>   static int vhost_vdpa_release(struct inode *inode, struct file *filep)
>   {
>   	struct vhost_vdpa *v = filep->private_data;
> @@ -777,6 +853,7 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep)
>   	vhost_vdpa_iotlb_free(v);
>   	vhost_vdpa_free_domain(v);
>   	vhost_vdpa_config_put(v);
> +	vhost_vdpa_clean_irq(v);
>   	vhost_dev_cleanup(&v->vdev);
>   	kfree(v->vdev.vqs);
>   	mutex_unlock(&d->mutex);
Eli Cohen July 28, 2020, 9:04 a.m. UTC | #2
On Tue, Jul 28, 2020 at 12:24:03PM +0800, Zhu Lingshan wrote:
>  
> +static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, int qid)
> +{
> +	struct vhost_virtqueue *vq = &v->vqs[qid];
> +	const struct vdpa_config_ops *ops = v->vdpa->config;
> +	struct vdpa_device *vdpa = v->vdpa;
> +	int ret, irq;
> +
> +	spin_lock(&vq->call_ctx.ctx_lock);
> +	irq = ops->get_vq_irq(vdpa, qid);
> +	if (!vq->call_ctx.ctx || irq == -EINVAL) {
> +		spin_unlock(&vq->call_ctx.ctx_lock);
> +		return;
> +	}
> +

If I understand correctly, this will cause these IRQs to be forwarded
directly to the vCPU, i.e. handled by the guest/qemu.
Does this mean that the host will not handle this interrupt? How does it
work in the case of level-triggered interrupts?

In the case of ConnectX, I need to execute some code to acknowledge the
interrupt.

Can you explain how this should be done?
Jason Wang July 28, 2020, 10:29 a.m. UTC | #3
On 2020/7/28 5:18 PM, Zhu, Lingshan wrote:
>>>
>>>        * status to 0.
>>> @@ -167,6 +220,15 @@ static long vhost_vdpa_set_status(struct 
>>> vhost_vdpa *v, u8 __user *statusp)
>>>       if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
>>>           return -EINVAL;
>>>   +    /* vq irq is not expected to be changed once DRIVER_OK is set */
>>
>>
>> So this basically limits the usage of get_vq_irq() to the context of
>> vhost_vdpa_set_status() and other vDPA bus drivers' set_status(). If
>> this is true, there's no need to introduce any new config ops at all;
>> just let set_status() return the irqs used by the device. Or, if we
>> want this to be more generic, we need vDPA's own irq manager (which
>> would be similar to the irq bypass manager). That is:
> I think there is no need for a driver to free / re-request its irqs after DRIVER_OK, though it can do so. If a driver changes the irq of a vq after DRIVER_OK, the vq is still operational but will lose irq offloading, which is reasonable.
> If we want set_status() to return irqs, we need to record the irqs somewhere in vdpa_device,


Why? We can simply pass an array to the driver, I think:

void (*set_status)(struct vdpa_device *vdev, u8 status, int *irqs);
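
The bus driver could then consume the array roughly like this (an
illustrative sketch only; allocation failure handling is omitted):

	/* Sketch: register producers from irqs filled in by set_status(). */
	int *irqs = kmalloc_array(nvqs, sizeof(*irqs), GFP_KERNEL);

	ops->set_status(vdpa, status, irqs);
	for (i = 0; i < nvqs; i++) {
		struct vhost_virtqueue *vq = &v->vqs[i];

		vq->call_ctx.producer.token = vq->call_ctx.ctx;
		vq->call_ctx.producer.irq = irqs[i];
		irq_bypass_register_producer(&vq->call_ctx.producer);
	}
	kfree(irqs);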


> as we discussed in a previous thread, this may need initialization and cleanup work, so a new ops
> with proper comments (not saying the irq could never change, but highlighting that if the irq changes, irq offloading will not work until the next DRIVER_OK) could be more elegant.
> However, if we really need to change the irq after DRIVER_OK, I think we may still need vDPA vq irq allocate / free helpers; then the helpers cannot be used in probe() as we discussed before, which is a step back to the V3 series.


Still, it's not about whether the driver may change the irq after
DRIVER_OK but about the implication of that assumption. If one bus ops
must be called inside another ops, it's better to just implement them as
one ops.


>>
>> - bus driver can register itself as consumer
>> - vDPA device driver can register itself as producer
>> - matching via queue index
> IMHO, is it too heavy for this feature,


Do you mean LOCs? We can:

1) refactor the irq bypass manager
2) invent our own (a much simplified version compared to the bypass manager)
3) enforce it via the vDPA bus

None of the above should require a lot of coding. I think method 3 is
partially done in your previous series, but in an implicit manner:

- a bus driver that implements alloc_irq/free_irq could be implicitly
treated as consumer registration
- every vDPA device driver could be treated as a producer
- vdpa_devm_alloc_irq() could be treated as producer registration
- alloc_irq/free_irq are the add_producer/del_producer

We probably just lack some synchronization with driver probe/remove.


> and how can they match if two individual adapters both have vq idx = 1?


The matching is per vDPA device.

Thanks


Jason Wang July 29, 2020, 9:21 a.m. UTC | #4
On 2020/7/28 5:04 PM, Eli Cohen wrote:
> On Tue, Jul 28, 2020 at 12:24:03PM +0800, Zhu Lingshan wrote:
>>   
>> +static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, int qid)
>> +{
>> +	struct vhost_virtqueue *vq = &v->vqs[qid];
>> +	const struct vdpa_config_ops *ops = v->vdpa->config;
>> +	struct vdpa_device *vdpa = v->vdpa;
>> +	int ret, irq;
>> +
>> +	spin_lock(&vq->call_ctx.ctx_lock);
>> +	irq = ops->get_vq_irq(vdpa, qid);
>> +	if (!vq->call_ctx.ctx || irq == -EINVAL) {
>> +		spin_unlock(&vq->call_ctx.ctx_lock);
>> +		return;
>> +	}
>> +
> If I understand correctly, this will cause these IRQs to be forwarded
> directly to the VCPU, e.g. will be handled by the guest/qemu.


Yes, if it can be bypassed, the interrupt will be delivered to the vCPU directly.


> Does this mean that the host will not handle this interrupt? How does it
> work in case on level triggered interrupts?


There's no guarantee that the KVM arch code can make irq bypassing
work for every type of irq, so in that case the irq will still need to
be handled by the host first. This means we should keep the host
interrupt handler as a slowpath (fallback).
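
For reference, the eventfd relay that already exists in
drivers/vhost/vdpa.c (not part of this diff) acts as that slowpath when
bypassing is not in effect; roughly:

	static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
	{
		struct vhost_virtqueue *vq = private;
		struct eventfd_ctx *call_ctx = vq->call_ctx.ctx;

		/* relay the device irq to the guest via the call eventfd */
		if (call_ctx)
			eventfd_signal(call_ctx, 1);

		return IRQ_HANDLED;
	}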


>
> In the case of ConnectX, I need to execute some code to acknowledge the
> interrupt.


That makes it hard for irq bypassing to work. Is it because the irq is
shared, or what kind of ack do you need to do?

Thanks


Eli Cohen July 29, 2020, 9:55 a.m. UTC | #5
On Wed, Jul 29, 2020 at 05:21:53PM +0800, Jason Wang wrote:
> 
> On 2020/7/28 5:04 PM, Eli Cohen wrote:
> >On Tue, Jul 28, 2020 at 12:24:03PM +0800, Zhu Lingshan wrote:
> >>+static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, int qid)
> >>+{
> >>+	struct vhost_virtqueue *vq = &v->vqs[qid];
> >>+	const struct vdpa_config_ops *ops = v->vdpa->config;
> >>+	struct vdpa_device *vdpa = v->vdpa;
> >>+	int ret, irq;
> >>+
> >>+	spin_lock(&vq->call_ctx.ctx_lock);
> >>+	irq = ops->get_vq_irq(vdpa, qid);
> >>+	if (!vq->call_ctx.ctx || irq == -EINVAL) {
> >>+		spin_unlock(&vq->call_ctx.ctx_lock);
> >>+		return;
> >>+	}
> >>+
> >If I understand correctly, this will cause these IRQs to be forwarded
> >directly to the vCPU, i.e. handled by the guest/qemu.
> 
> 
> Yes, if it can be bypassed, the interrupt will be delivered to the vCPU directly.
> 

So, usually the network driver knows how to handle interrupts for its
devices. I assume the virtio_net driver in the guest has some default
processing, but what if the underlying hardware device (as in the case
of vDPA) needs to take some actions? Is there an option to bounce the
interrupt back to the vendor-specific driver on the host so it can take
these actions?

> 
> >Does this mean that the host will not handle this interrupt? How does it
> >work in the case of level-triggered interrupts?
> 
> 
> There's no guarantee that the KVM arch code can make irq bypassing
> work for every type of irq, so in that case the irq will still need
> to be handled by the host first. This means we should keep the host
> interrupt handler as a slowpath (fallback).
> 
> >
> >In the case of ConnectX, I need to execute some code to acknowledge the
> >interrupt.
> 
> 
> That makes it hard for irq bypassing to work. Is it because
> the irq is shared, or what kind of ack do you need to do?

I have an EQ, which is a queue for events coming from the hardware. This
EQ can be created so that it reports only completion events, but I still
need to execute code that roughly tells the device that I saw these
event records and then arms it again so it can report more interrupts
(e.g. if more packets are received or sent). This is device-specific
code.
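
In other words, a handler of roughly this shape (purely illustrative
pseudo-C, not actual ConnectX code; all names are made up) would have
to run on the host:

	/* Illustrative only: an EQ handler that must run on the host. */
	static irqreturn_t my_eq_isr(int irq, void *data)
	{
		struct my_eq *eq = data;	/* hypothetical driver state */

		while (my_poll_eqe(eq))		/* consume pending event records */
			;
		my_eq_ack_and_arm(eq);		/* ack what we saw, re-arm the EQ */

		return IRQ_HANDLED;
	}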

Jason Wang July 29, 2020, 10:19 a.m. UTC | #6
On 2020/7/29 5:55 PM, Eli Cohen wrote:
> On Wed, Jul 29, 2020 at 05:21:53PM +0800, Jason Wang wrote:
>> On 2020/7/28 5:04 PM, Eli Cohen wrote:
>>> On Tue, Jul 28, 2020 at 12:24:03PM +0800, Zhu Lingshan wrote:
>>>> +static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, int qid)
>>>> +{
>>>> +	struct vhost_virtqueue *vq = &v->vqs[qid];
>>>> +	const struct vdpa_config_ops *ops = v->vdpa->config;
>>>> +	struct vdpa_device *vdpa = v->vdpa;
>>>> +	int ret, irq;
>>>> +
>>>> +	spin_lock(&vq->call_ctx.ctx_lock);
>>>> +	irq = ops->get_vq_irq(vdpa, qid);
>>>> +	if (!vq->call_ctx.ctx || irq == -EINVAL) {
>>>> +		spin_unlock(&vq->call_ctx.ctx_lock);
>>>> +		return;
>>>> +	}
>>>> +
>>> If I understand correctly, this will cause these IRQs to be forwarded
>>> directly to the vCPU, i.e. handled by the guest/qemu.
>>
>> Yes, if it can be bypassed, the interrupt will be delivered to the vCPU directly.
>>
> So, usually the network driver knows how to handle interrupts for its
> devices. I assume the virtio_net driver in the guest has some default
> processing, but what if the underlying hardware device (as in the case
> of vDPA) needs to take some actions?


Virtio splits the bus operations out of the device operations, and so
does the driver.

The virtio-net driver depends on a transport driver to talk to the real
device; usually PCI is used as the transport. In this case the
virtio-pci driver is in charge of irq allocation/freeing/configuration,
and it needs to cooperate with the platform-specific irqchip
(virtualized by KVM) to finish work like irq acknowledgement. E.g. on
x86, irq offloading can only work when there is hardware support for a
virtual irqchip (APICv); then everything can be done without vmexits.

So there is no vendor-specific part, since the device and transport are
all standard.


>   Is there an option to bounce the
> interrupt back to the vendor-specific driver on the host so it can take
> these actions?


Currently not, and even if we could do this, I'm afraid we would lose
the performance advantage of irq bypassing.


>
>>> Does this mean that the host will not handle this interrupt? How does it
>>> work in the case of level-triggered interrupts?
>>
>> There's no guarantee that the KVM arch code can make irq bypassing
>> work for every type of irq, so in that case the irq will still need
>> to be handled by the host first. This means we should keep the host
>> interrupt handler as a slowpath (fallback).
>>
>>> In the case of ConnectX, I need to execute some code to acknowledge the
>>> interrupt.
>>
>> That makes it hard for irq bypassing to work. Is it because
>> the irq is shared, or what kind of ack do you need to do?
> I have an EQ, which is a queue for events coming from the hardware. This
> EQ can be created so that it reports only completion events, but I still
> need to execute code that roughly tells the device that I saw these
> event records and then arms it again so it can report more interrupts
> (e.g. if more packets are received or sent). This is device-specific code.


Any chance that the hardware can use MSI (which is not the case here)?

Thanks


>> Thanks
>>
>>
>>> Can you explain how this should be done?
>>>
Eli Cohen July 29, 2020, 11:13 a.m. UTC | #7
On Wed, Jul 29, 2020 at 06:19:52PM +0800, Jason Wang wrote:
I am checking internally whether we can work in a mode that does not
require acknowledging the interrupt. I will update.

Thanks for the explanations.

Eli Cohen July 29, 2020, 2:15 p.m. UTC | #8
OK, we have a mode of operation that does not require driver
intervention to manipulate the event queues, so I think we're OK with
this design.

Jason Wang July 31, 2020, 3:11 a.m. UTC | #9
On 2020/7/29 10:15 PM, Eli Cohen wrote:
> OK, we have a mode of operation that does not require driver
> intervention to manipulate the event queues, so I think we're OK with
> this design.


Good to know this.

Thanks
diff mbox series

Patch

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index d3688c6afb87..587fbae06182 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -65,6 +65,7 @@  config VHOST_VDPA
 	tristate "Vhost driver for vDPA-based backend"
 	depends on EVENTFD
 	select VHOST
+	select IRQ_BYPASS_MANAGER
 	depends on VDPA
 	help
 	  This kernel module can be loaded in host kernel to accelerate
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index df3cf386b0cd..1dccced321f8 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -115,6 +115,55 @@  static irqreturn_t vhost_vdpa_config_cb(void *private)
 	return IRQ_HANDLED;
 }
 
+static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, int qid)
+{
+	struct vhost_virtqueue *vq = &v->vqs[qid];
+	const struct vdpa_config_ops *ops = v->vdpa->config;
+	struct vdpa_device *vdpa = v->vdpa;
+	int ret, irq;
+
+	spin_lock(&vq->call_ctx.ctx_lock);
+	irq = ops->get_vq_irq(vdpa, qid);
+	if (!vq->call_ctx.ctx || irq == -EINVAL) {
+		spin_unlock(&vq->call_ctx.ctx_lock);
+		return;
+	}
+
+	vq->call_ctx.producer.token = vq->call_ctx.ctx;
+	vq->call_ctx.producer.irq = irq;
+	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
+	spin_unlock(&vq->call_ctx.ctx_lock);
+}
+
+static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, int qid)
+{
+	struct vhost_virtqueue *vq = &v->vqs[qid];
+
+	spin_lock(&vq->call_ctx.ctx_lock);
+	irq_bypass_unregister_producer(&vq->call_ctx.producer);
+	spin_unlock(&vq->call_ctx.ctx_lock);
+}
+
+static void vhost_vdpa_update_vq_irq(struct vhost_virtqueue *vq)
+{
+	spin_lock(&vq->call_ctx.ctx_lock);
+	/*
+	 * If it has a non-zero irq, it means there is a
+	 * previously registered irq_bypass_producer;
+	 * we should update it when ctx (its token)
+	 * changes.
+	 */
+	if (!vq->call_ctx.producer.irq) {
+		spin_unlock(&vq->call_ctx.ctx_lock);
+		return;
+	}
+
+	irq_bypass_unregister_producer(&vq->call_ctx.producer);
+	vq->call_ctx.producer.token = vq->call_ctx.ctx;
+	irq_bypass_register_producer(&vq->call_ctx.producer);
+	spin_unlock(&vq->call_ctx.ctx_lock);
+}
+
 static void vhost_vdpa_reset(struct vhost_vdpa *v)
 {
 	struct vdpa_device *vdpa = v->vdpa;
@@ -155,11 +204,15 @@  static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
 {
 	struct vdpa_device *vdpa = v->vdpa;
 	const struct vdpa_config_ops *ops = vdpa->config;
-	u8 status;
+	u8 status, status_old;
+	int i, nvqs;
 
 	if (copy_from_user(&status, statusp, sizeof(status)))
 		return -EFAULT;
 
+	status_old = ops->get_status(vdpa);
+	nvqs = v->nvqs;
+
 	/*
 	 * Userspace shouldn't remove status bits unless reset the
 	 * status to 0.
@@ -167,6 +220,15 @@  static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
 	if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
 		return -EINVAL;
 
+	/* vq irq is not expected to be changed once DRIVER_OK is set */
+	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
+		for (i = 0; i < nvqs; i++)
+			vhost_vdpa_setup_vq_irq(v, i);
+
+	if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK))
+		for (i = 0; i < nvqs; i++)
+			vhost_vdpa_unsetup_vq_irq(v, i);
+
 	ops->set_status(vdpa, status);
 
 	return 0;
@@ -332,6 +394,7 @@  static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
 
 	return 0;
 }
+
 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 				   void __user *argp)
 {
@@ -390,6 +453,7 @@  static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 			cb.private = NULL;
 		}
 		ops->set_vq_cb(vdpa, idx, &cb);
+		vhost_vdpa_update_vq_irq(vq);
 		break;
 
 	case VHOST_SET_VRING_NUM:
@@ -765,6 +829,18 @@  static int vhost_vdpa_open(struct inode *inode, struct file *filep)
 	return r;
 }
 
+static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
+{
+	struct vhost_virtqueue *vq;
+	int i;
+
+	for (i = 0; i < v->nvqs; i++) {
+		vq = &v->vqs[i];
+		if (vq->call_ctx.producer.irq)
+			irq_bypass_unregister_producer(&vq->call_ctx.producer);
+	}
+}
+
 static int vhost_vdpa_release(struct inode *inode, struct file *filep)
 {
 	struct vhost_vdpa *v = filep->private_data;
@@ -777,6 +853,7 @@  static int vhost_vdpa_release(struct inode *inode, struct file *filep)
 	vhost_vdpa_iotlb_free(v);
 	vhost_vdpa_free_domain(v);
 	vhost_vdpa_config_put(v);
+	vhost_vdpa_clean_irq(v);
 	vhost_dev_cleanup(&v->vdev);
 	kfree(v->vdev.vqs);
 	mutex_unlock(&d->mutex);