diff mbox series

[v6,15/19] vfio-user: handle device interrupts

Message ID ba5a32f3def72ce66a894191747dcafd63e7a517.1645079934.git.jag.raman@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user server in QEMU | expand

Commit Message

Jag Raman Feb. 17, 2022, 7:49 a.m. UTC
Forward remote device's interrupts to the guest

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 include/hw/pci/pci.h              |   6 ++
 include/hw/remote/vfio-user-obj.h |   6 ++
 hw/pci/msi.c                      |  13 +++-
 hw/pci/msix.c                     |  12 +++-
 hw/remote/machine.c               |  11 +--
 hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
 stubs/vfio-user-obj.c             |   6 ++
 MAINTAINERS                       |   1 +
 hw/remote/trace-events            |   1 +
 stubs/meson.build                 |   1 +
 10 files changed, 158 insertions(+), 6 deletions(-)
 create mode 100644 include/hw/remote/vfio-user-obj.h
 create mode 100644 stubs/vfio-user-obj.c

Comments

Stefan Hajnoczi March 7, 2022, 10:24 a.m. UTC | #1
On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
> Forward remote device's interrupts to the guest
> 
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  include/hw/pci/pci.h              |   6 ++
>  include/hw/remote/vfio-user-obj.h |   6 ++
>  hw/pci/msi.c                      |  13 +++-
>  hw/pci/msix.c                     |  12 +++-
>  hw/remote/machine.c               |  11 +--
>  hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
>  stubs/vfio-user-obj.c             |   6 ++
>  MAINTAINERS                       |   1 +
>  hw/remote/trace-events            |   1 +
>  stubs/meson.build                 |   1 +
>  10 files changed, 158 insertions(+), 6 deletions(-)
>  create mode 100644 include/hw/remote/vfio-user-obj.h
>  create mode 100644 stubs/vfio-user-obj.c
> 
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index c3f3c90473..d42d526a48 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
>  typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
>                                  pcibus_t addr, pcibus_t size, int type);
>  typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
>  
>  typedef struct PCIIORegion {
>      pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
> @@ -323,6 +325,10 @@ struct PCIDevice {
>      /* Space to store MSIX table & pending bit array */
>      uint8_t *msix_table;
>      uint8_t *msix_pba;
> +
> +    PCIMSINotify *msi_notify;
> +    PCIMSIxNotify *msix_notify;
> +
>      /* MemoryRegion container for msix exclusive BAR setup */
>      MemoryRegion msix_exclusive_bar;
>      /* Memory Regions for MSIX table and pending bit entries. */
> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
> new file mode 100644
> index 0000000000..87ab78b875
> --- /dev/null
> +++ b/include/hw/remote/vfio-user-obj.h
> @@ -0,0 +1,6 @@
> +#ifndef VFIO_USER_OBJ_H
> +#define VFIO_USER_OBJ_H
> +
> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
> +
> +#endif
> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
> index 47d2b0f33c..93f5e400cc 100644
> --- a/hw/pci/msi.c
> +++ b/hw/pci/msi.c
> @@ -51,6 +51,8 @@
>   */
>  bool msi_nonbroken;
>  
> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
> +
>  /* If we get rid of cap allocator, we won't need this. */
>  static inline uint8_t msi_cap_sizeof(uint16_t flags)
>  {
> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
>      dev->msi_cap = config_offset;
>      dev->cap_present |= QEMU_PCI_CAP_MSI;
>  
> +    dev->msi_notify = pci_msi_notify;

Are you sure it's correct to skip the msi_is_masked() logic? I think the
callback function should only override the behavior of
msi_send_message(), not the entire msi_notify() function.

The same applies to MSI-X.

> +
>      pci_set_word(dev->config + msi_flags_off(dev), flags);
>      pci_set_word(dev->wmask + msi_flags_off(dev),
>                   PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
> @@ -307,7 +311,7 @@ bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
>      return mask & (1U << vector);
>  }
>  
> -void msi_notify(PCIDevice *dev, unsigned int vector)
> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector)
>  {
>      uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>      bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> @@ -332,6 +336,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
>      msi_send_message(dev, msg);
>  }
>  
> +void msi_notify(PCIDevice *dev, unsigned int vector)
> +{
> +    if (dev->msi_notify) {

Can this ever be NULL?

> +        dev->msi_notify(dev, vector);
> +    }
> +}
> +
>  void msi_send_message(PCIDevice *dev, MSIMessage msg)
>  {
>      MemTxAttrs attrs = {};
> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
> index ae9331cd0b..1c71e67f53 100644
> --- a/hw/pci/msix.c
> +++ b/hw/pci/msix.c
> @@ -31,6 +31,8 @@
>  #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
>  #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
>  
> +static void pci_msix_notify(PCIDevice *dev, unsigned vector);
> +
>  MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>  {
>      uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
> @@ -334,6 +336,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>      dev->msix_table = g_malloc0(table_size);
>      dev->msix_pba = g_malloc0(pba_size);
>      dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
> +    dev->msix_notify = pci_msix_notify;
>  
>      msix_mask_all(dev, nentries);
>  
> @@ -485,7 +488,7 @@ int msix_enabled(PCIDevice *dev)
>  }
>  
>  /* Send an MSI-X message */
> -void msix_notify(PCIDevice *dev, unsigned vector)
> +static void pci_msix_notify(PCIDevice *dev, unsigned vector)
>  {
>      MSIMessage msg;
>  
> @@ -503,6 +506,13 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>      msi_send_message(dev, msg);
>  }
>  
> +void msix_notify(PCIDevice *dev, unsigned vector)
> +{
> +    if (dev->msix_notify) {

Can this ever be NULL?

> +        dev->msix_notify(dev, vector);
> +    }
> +}
> +
>  void msix_reset(PCIDevice *dev)
>  {
>      if (!msix_present(dev)) {
> diff --git a/hw/remote/machine.c b/hw/remote/machine.c
> index db4ae30710..a8b4a3aef3 100644
> --- a/hw/remote/machine.c
> +++ b/hw/remote/machine.c
> @@ -23,6 +23,7 @@
>  #include "hw/remote/iohub.h"
>  #include "hw/qdev-core.h"
>  #include "hw/remote/iommu.h"
> +#include "hw/remote/vfio-user-obj.h"
>  
>  static void remote_machine_init(MachineState *machine)
>  {
> @@ -54,12 +55,14 @@ static void remote_machine_init(MachineState *machine)
>  
>      if (s->vfio_user) {
>          remote_configure_iommu(pci_host->bus);
> -    }
>  
> -    remote_iohub_init(&s->iohub);
> +        vfu_object_set_bus_irq(pci_host->bus);
> +    } else {
> +        remote_iohub_init(&s->iohub);
>  
> -    pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
> -                 &s->iohub, REMOTE_IOHUB_NB_PIRQS);
> +        pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
> +                     &s->iohub, REMOTE_IOHUB_NB_PIRQS);
> +    }
>  
>      qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
>  }
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 2feabd06a4..d79bab87f1 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -54,6 +54,9 @@
>  #include "hw/pci/pci.h"
>  #include "qemu/timer.h"
>  #include "exec/memory.h"
> +#include "hw/pci/msi.h"
> +#include "hw/pci/msix.h"
> +#include "hw/remote/vfio-user-obj.h"
>  
>  #define TYPE_VFU_OBJECT "x-vfio-user-server"
>  OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
> @@ -107,6 +110,10 @@ struct VfuObject {
>      int vfu_poll_fd;
>  };
>  
> +static GHashTable *vfu_object_bdf_to_ctx_table;

I suggest adding a void *msi_notify_opaque field to PCIDevice and
passing the value as an argument to ->msi_notify(). vfio-user-obj.c can
set the value to vfu_ctx and eliminate the vfu_object_bdf_to_ctx_table
hash table.

This simplifies the code, makes it faster, and solves race conditions
during hot plug/unplug if other instances are running in IOThreads.

> +
> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
> +
>  static void vfu_object_init_ctx(VfuObject *o, Error **errp);
>  
>  static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
> @@ -463,6 +470,86 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>      }
>  }
>  
> +static void vfu_object_irq_trigger(int pci_bdf, unsigned vector)
> +{
> +    vfu_ctx_t *vfu_ctx = NULL;
> +
> +    if (!vfu_object_bdf_to_ctx_table) {

Can this ever be NULL?

> +        return;
> +    }
> +
> +    vfu_ctx = g_hash_table_lookup(vfu_object_bdf_to_ctx_table,
> +                                  INT2VOIDP(pci_bdf));
> +
> +    if (vfu_ctx) {
> +        vfu_irq_trigger(vfu_ctx, vector);
> +    }
> +}
> +
> +static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
> +{
> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
> +                                pci_dev->devfn);
> +
> +    return pci_bdf;
> +}
> +
> +static void vfu_object_set_irq(void *opaque, int pirq, int level)
> +{
> +    if (level) {
> +        vfu_object_irq_trigger(pirq, 0);
> +    }
> +}
> +
> +static void vfu_object_msi_notify(PCIDevice *pci_dev, unsigned vector)
> +{
> +    int pci_bdf;
> +
> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> +
> +    vfu_object_irq_trigger(pci_bdf, vector);
> +}
> +
> +static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
> +{
> +    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
> +    int ret, pci_bdf;
> +
> +    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = 0;
> +    if (msix_nr_vectors_allocated(pci_dev)) {
> +        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
> +                                       msix_nr_vectors_allocated(pci_dev));
> +
> +        pci_dev->msix_notify = vfu_object_msi_notify;
> +    } else if (msi_nr_vectors_allocated(pci_dev)) {
> +        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
> +                                       msi_nr_vectors_allocated(pci_dev));
> +
> +        pci_dev->msi_notify = vfu_object_msi_notify;
> +    }
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
> +
> +    g_hash_table_insert(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf),
> +                        o->vfu_ctx);
> +
> +    return 0;
> +}
> +
> +void vfu_object_set_bus_irq(PCIBus *pci_bus)
> +{
> +    pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, NULL, 1);
> +}
> +
>  /*
>   * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
>   * properties. It also depends on devices instantiated in QEMU. These
> @@ -559,6 +646,13 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
>  
>      vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
>  
> +    ret = vfu_object_setup_irqs(o, o->pci_dev);
> +    if (ret < 0) {
> +        error_setg(errp, "vfu: Failed to setup interrupts for %s",
> +                   o->device);
> +        goto fail;
> +    }
> +
>      ret = vfu_realize_ctx(o->vfu_ctx);
>      if (ret < 0) {
>          error_setg(errp, "vfu: Failed to realize device %s- %s",
> @@ -612,6 +706,7 @@ static void vfu_object_finalize(Object *obj)
>  {
>      VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
>      VfuObject *o = VFU_OBJECT(obj);
> +    int pci_bdf;
>  
>      k->nr_devs--;
>  
> @@ -638,9 +733,17 @@ static void vfu_object_finalize(Object *obj)
>          o->unplug_blocker = NULL;
>      }
>  
> +    if (o->pci_dev) {
> +        pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(o->pci_dev)),
> +                                o->pci_dev->devfn);
> +        g_hash_table_remove(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf));
> +    }
> +
>      o->pci_dev = NULL;
>  
>      if (!k->nr_devs && k->auto_shutdown) {
> +        g_hash_table_destroy(vfu_object_bdf_to_ctx_table);
> +        vfu_object_bdf_to_ctx_table = NULL;
>          qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
>      }
>  
> @@ -658,6 +761,10 @@ static void vfu_object_class_init(ObjectClass *klass, void *data)
>  
>      k->auto_shutdown = true;
>  
> +    msi_nonbroken = true;

This should go in hw/remote/machine.c. It's a global variable related to
the machine's interrupt controller capabilities. The value is not
related to vfu_object_class_init(), which will be called by any QEMU
binary that links hw/remote/vfio-user-obj.o regardless of which machine
type is instantiated.

> +
> +    vfu_object_bdf_to_ctx_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
> +
>      object_class_property_add(klass, "socket", "SocketAddress", NULL,
>                                vfu_object_set_socket, NULL, NULL);
>      object_class_property_set_description(klass, "socket",
> diff --git a/stubs/vfio-user-obj.c b/stubs/vfio-user-obj.c
> new file mode 100644
> index 0000000000..79100d768e
> --- /dev/null
> +++ b/stubs/vfio-user-obj.c
> @@ -0,0 +1,6 @@
> +#include "qemu/osdep.h"
> +#include "hw/remote/vfio-user-obj.h"
> +
> +void vfu_object_set_bus_irq(PCIBus *pci_bus)
> +{
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f47232c78c..e274cb46af 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3569,6 +3569,7 @@ F: hw/remote/iohub.c
>  F: include/hw/remote/iohub.h
>  F: subprojects/libvfio-user
>  F: hw/remote/vfio-user-obj.c
> +F: include/hw/remote/vfio-user-obj.h
>  F: hw/remote/iommu.c
>  F: include/hw/remote/iommu.h
>  
> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> index 847d50d88f..c167b3c7a5 100644
> --- a/hw/remote/trace-events
> +++ b/hw/remote/trace-events
> @@ -12,3 +12,4 @@ vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
>  vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
>  vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
>  vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
> +vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d"
> diff --git a/stubs/meson.build b/stubs/meson.build
> index d359cbe1ad..c5ce979dc3 100644
> --- a/stubs/meson.build
> +++ b/stubs/meson.build
> @@ -57,3 +57,4 @@ if have_system
>  else
>    stub_ss.add(files('qdev.c'))
>  endif
> +stub_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_false: files('vfio-user-obj.c'))
> -- 
> 2.20.1
>
Jag Raman March 7, 2022, 3:10 p.m. UTC | #2
> On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
>> Forward remote device's interrupts to the guest
>> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/pci/pci.h              |   6 ++
>> include/hw/remote/vfio-user-obj.h |   6 ++
>> hw/pci/msi.c                      |  13 +++-
>> hw/pci/msix.c                     |  12 +++-
>> hw/remote/machine.c               |  11 +--
>> hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
>> stubs/vfio-user-obj.c             |   6 ++
>> MAINTAINERS                       |   1 +
>> hw/remote/trace-events            |   1 +
>> stubs/meson.build                 |   1 +
>> 10 files changed, 158 insertions(+), 6 deletions(-)
>> create mode 100644 include/hw/remote/vfio-user-obj.h
>> create mode 100644 stubs/vfio-user-obj.c
>> 
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index c3f3c90473..d42d526a48 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
>> typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
>>                                 pcibus_t addr, pcibus_t size, int type);
>> typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
>> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
>> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
>> 
>> typedef struct PCIIORegion {
>>     pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
>> @@ -323,6 +325,10 @@ struct PCIDevice {
>>     /* Space to store MSIX table & pending bit array */
>>     uint8_t *msix_table;
>>     uint8_t *msix_pba;
>> +
>> +    PCIMSINotify *msi_notify;
>> +    PCIMSIxNotify *msix_notify;
>> +
>>     /* MemoryRegion container for msix exclusive BAR setup */
>>     MemoryRegion msix_exclusive_bar;
>>     /* Memory Regions for MSIX table and pending bit entries. */
>> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
>> new file mode 100644
>> index 0000000000..87ab78b875
>> --- /dev/null
>> +++ b/include/hw/remote/vfio-user-obj.h
>> @@ -0,0 +1,6 @@
>> +#ifndef VFIO_USER_OBJ_H
>> +#define VFIO_USER_OBJ_H
>> +
>> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
>> +
>> +#endif
>> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
>> index 47d2b0f33c..93f5e400cc 100644
>> --- a/hw/pci/msi.c
>> +++ b/hw/pci/msi.c
>> @@ -51,6 +51,8 @@
>>  */
>> bool msi_nonbroken;
>> 
>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
>> +
>> /* If we get rid of cap allocator, we won't need this. */
>> static inline uint8_t msi_cap_sizeof(uint16_t flags)
>> {
>> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
>>     dev->msi_cap = config_offset;
>>     dev->cap_present |= QEMU_PCI_CAP_MSI;
>> 
>> +    dev->msi_notify = pci_msi_notify;
> 
> Are you sure it's correct to skip the msi_is_masked() logic? I think the

pci_msi_notify() callback includes the test for msi_is_masked() - that
covers the non vfio-user case.

The vfio-user callback should add this test - will do it if we continue
this approach. Thanks for the catch!

> callback function should only override the behavior of
> msi_send_message(), not the entire msi_notify() function.

OK, this sounds like a better approach.

> 
> The same applies to MSI-X.
> 
>> +
>>     pci_set_word(dev->config + msi_flags_off(dev), flags);
>>     pci_set_word(dev->wmask + msi_flags_off(dev),
>>                  PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
>> @@ -307,7 +311,7 @@ bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
>>     return mask & (1U << vector);
>> }
>> 
>> -void msi_notify(PCIDevice *dev, unsigned int vector)
>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector)
>> {
>>     uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>     bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>> @@ -332,6 +336,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
>>     msi_send_message(dev, msg);
>> }
>> 
>> +void msi_notify(PCIDevice *dev, unsigned int vector)
>> +{
>> +    if (dev->msi_notify) {
> 
> Can this ever be NULL?

Unlikely in the current code flow, but it could change in the future.

As a matter of principle, I thought that we should check if a function
pointer is non-NULL before invoking it in QEMU. Is that not the case?

> 
>> +        dev->msi_notify(dev, vector);
>> +    }
>> +}
>> +
>> void msi_send_message(PCIDevice *dev, MSIMessage msg)
>> {
>>     MemTxAttrs attrs = {};
>> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
>> index ae9331cd0b..1c71e67f53 100644
>> --- a/hw/pci/msix.c
>> +++ b/hw/pci/msix.c
>> @@ -31,6 +31,8 @@
>> #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
>> #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
>> 
>> +static void pci_msix_notify(PCIDevice *dev, unsigned vector);
>> +
>> MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>> {
>>     uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
>> @@ -334,6 +336,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>>     dev->msix_table = g_malloc0(table_size);
>>     dev->msix_pba = g_malloc0(pba_size);
>>     dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
>> +    dev->msix_notify = pci_msix_notify;
>> 
>>     msix_mask_all(dev, nentries);
>> 
>> @@ -485,7 +488,7 @@ int msix_enabled(PCIDevice *dev)
>> }
>> 
>> /* Send an MSI-X message */
>> -void msix_notify(PCIDevice *dev, unsigned vector)
>> +static void pci_msix_notify(PCIDevice *dev, unsigned vector)
>> {
>>     MSIMessage msg;
>> 
>> @@ -503,6 +506,13 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>>     msi_send_message(dev, msg);
>> }
>> 
>> +void msix_notify(PCIDevice *dev, unsigned vector)
>> +{
>> +    if (dev->msix_notify) {
> 
> Can this ever be NULL?
> 
>> +        dev->msix_notify(dev, vector);
>> +    }
>> +}
>> +
>> void msix_reset(PCIDevice *dev)
>> {
>>     if (!msix_present(dev)) {
>> diff --git a/hw/remote/machine.c b/hw/remote/machine.c
>> index db4ae30710..a8b4a3aef3 100644
>> --- a/hw/remote/machine.c
>> +++ b/hw/remote/machine.c
>> @@ -23,6 +23,7 @@
>> #include "hw/remote/iohub.h"
>> #include "hw/qdev-core.h"
>> #include "hw/remote/iommu.h"
>> +#include "hw/remote/vfio-user-obj.h"
>> 
>> static void remote_machine_init(MachineState *machine)
>> {
>> @@ -54,12 +55,14 @@ static void remote_machine_init(MachineState *machine)
>> 
>>     if (s->vfio_user) {
>>         remote_configure_iommu(pci_host->bus);
>> -    }
>> 
>> -    remote_iohub_init(&s->iohub);
>> +        vfu_object_set_bus_irq(pci_host->bus);
>> +    } else {
>> +        remote_iohub_init(&s->iohub);
>> 
>> -    pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
>> -                 &s->iohub, REMOTE_IOHUB_NB_PIRQS);
>> +        pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
>> +                     &s->iohub, REMOTE_IOHUB_NB_PIRQS);
>> +    }
>> 
>>     qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
>> }
>> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
>> index 2feabd06a4..d79bab87f1 100644
>> --- a/hw/remote/vfio-user-obj.c
>> +++ b/hw/remote/vfio-user-obj.c
>> @@ -54,6 +54,9 @@
>> #include "hw/pci/pci.h"
>> #include "qemu/timer.h"
>> #include "exec/memory.h"
>> +#include "hw/pci/msi.h"
>> +#include "hw/pci/msix.h"
>> +#include "hw/remote/vfio-user-obj.h"
>> 
>> #define TYPE_VFU_OBJECT "x-vfio-user-server"
>> OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
>> @@ -107,6 +110,10 @@ struct VfuObject {
>>     int vfu_poll_fd;
>> };
>> 
>> +static GHashTable *vfu_object_bdf_to_ctx_table;
> 
> I suggest adding a void *msi_notify_opaque field to PCIDevice and
> passing the value as an argument to ->msi_notify(). vfio-user-obj.c can
> set the value to vfu_ctx and eliminate the vfu_object_bdf_to_ctx_table
> hash table.
> 
> This simplifies the code, makes it faster, and solves race conditions
> during hot plug/unplug if other instances are running in IOThreads.

OK, will do.

> 
>> +
>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>> +
>> static void vfu_object_init_ctx(VfuObject *o, Error **errp);
>> 
>> static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
>> @@ -463,6 +470,86 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>>     }
>> }
>> 
>> +static void vfu_object_irq_trigger(int pci_bdf, unsigned vector)
>> +{
>> +    vfu_ctx_t *vfu_ctx = NULL;
>> +
>> +    if (!vfu_object_bdf_to_ctx_table) {
> 
> Can this ever be NULL?
> 
>> +        return;
>> +    }
>> +
>> +    vfu_ctx = g_hash_table_lookup(vfu_object_bdf_to_ctx_table,
>> +                                  INT2VOIDP(pci_bdf));
>> +
>> +    if (vfu_ctx) {
>> +        vfu_irq_trigger(vfu_ctx, vector);
>> +    }
>> +}
>> +
>> +static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
>> +{
>> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
>> +                                pci_dev->devfn);
>> +
>> +    return pci_bdf;
>> +}
>> +
>> +static void vfu_object_set_irq(void *opaque, int pirq, int level)
>> +{
>> +    if (level) {
>> +        vfu_object_irq_trigger(pirq, 0);
>> +    }
>> +}
>> +
>> +static void vfu_object_msi_notify(PCIDevice *pci_dev, unsigned vector)
>> +{
>> +    int pci_bdf;
>> +
>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>> +
>> +    vfu_object_irq_trigger(pci_bdf, vector);
>> +}
>> +
>> +static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
>> +{
>> +    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
>> +    int ret, pci_bdf;
>> +
>> +    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = 0;
>> +    if (msix_nr_vectors_allocated(pci_dev)) {
>> +        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
>> +                                       msix_nr_vectors_allocated(pci_dev));
>> +
>> +        pci_dev->msix_notify = vfu_object_msi_notify;
>> +    } else if (msi_nr_vectors_allocated(pci_dev)) {
>> +        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
>> +                                       msi_nr_vectors_allocated(pci_dev));
>> +
>> +        pci_dev->msi_notify = vfu_object_msi_notify;
>> +    }
>> +
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>> +
>> +    g_hash_table_insert(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf),
>> +                        o->vfu_ctx);
>> +
>> +    return 0;
>> +}
>> +
>> +void vfu_object_set_bus_irq(PCIBus *pci_bus)
>> +{
>> +    pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, NULL, 1);
>> +}
>> +
>> /*
>>  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
>>  * properties. It also depends on devices instantiated in QEMU. These
>> @@ -559,6 +646,13 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
>> 
>>     vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
>> 
>> +    ret = vfu_object_setup_irqs(o, o->pci_dev);
>> +    if (ret < 0) {
>> +        error_setg(errp, "vfu: Failed to setup interrupts for %s",
>> +                   o->device);
>> +        goto fail;
>> +    }
>> +
>>     ret = vfu_realize_ctx(o->vfu_ctx);
>>     if (ret < 0) {
>>         error_setg(errp, "vfu: Failed to realize device %s- %s",
>> @@ -612,6 +706,7 @@ static void vfu_object_finalize(Object *obj)
>> {
>>     VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
>>     VfuObject *o = VFU_OBJECT(obj);
>> +    int pci_bdf;
>> 
>>     k->nr_devs--;
>> 
>> @@ -638,9 +733,17 @@ static void vfu_object_finalize(Object *obj)
>>         o->unplug_blocker = NULL;
>>     }
>> 
>> +    if (o->pci_dev) {
>> +        pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(o->pci_dev)),
>> +                                o->pci_dev->devfn);
>> +        g_hash_table_remove(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf));
>> +    }
>> +
>>     o->pci_dev = NULL;
>> 
>>     if (!k->nr_devs && k->auto_shutdown) {
>> +        g_hash_table_destroy(vfu_object_bdf_to_ctx_table);
>> +        vfu_object_bdf_to_ctx_table = NULL;
>>         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
>>     }
>> 
>> @@ -658,6 +761,10 @@ static void vfu_object_class_init(ObjectClass *klass, void *data)
>> 
>>     k->auto_shutdown = true;
>> 
>> +    msi_nonbroken = true;
> 
> This should go in hw/remote/machine.c. It's a global variable related to
> the machine's interrupt controller capabilities. The value is not
> related to vfu_object_class_init(), which will be called by any QEMU
> binary that links hw/remote/vfio-user-obj.o regardless of which machine
> type is instantiated.

multiprocess QEMU, which also uses the remote machine, doesn’t support MSI and
that’s why we placed it here originally. In subsequent series, we have added
‘vfio-user’ machine sub-option to discern vfio-user and multiprocess, so this could be
moved to the machine initialization code as you just pointed out.

Thank you!
--
Jag

> 
>> +
>> +    vfu_object_bdf_to_ctx_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
>> +
>>     object_class_property_add(klass, "socket", "SocketAddress", NULL,
>>                               vfu_object_set_socket, NULL, NULL);
>>     object_class_property_set_description(klass, "socket",
>> diff --git a/stubs/vfio-user-obj.c b/stubs/vfio-user-obj.c
>> new file mode 100644
>> index 0000000000..79100d768e
>> --- /dev/null
>> +++ b/stubs/vfio-user-obj.c
>> @@ -0,0 +1,6 @@
>> +#include "qemu/osdep.h"
>> +#include "hw/remote/vfio-user-obj.h"
>> +
>> +void vfu_object_set_bus_irq(PCIBus *pci_bus)
>> +{
>> +}
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index f47232c78c..e274cb46af 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -3569,6 +3569,7 @@ F: hw/remote/iohub.c
>> F: include/hw/remote/iohub.h
>> F: subprojects/libvfio-user
>> F: hw/remote/vfio-user-obj.c
>> +F: include/hw/remote/vfio-user-obj.h
>> F: hw/remote/iommu.c
>> F: include/hw/remote/iommu.h
>> 
>> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
>> index 847d50d88f..c167b3c7a5 100644
>> --- a/hw/remote/trace-events
>> +++ b/hw/remote/trace-events
>> @@ -12,3 +12,4 @@ vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
>> vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
>> vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
>> vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
>> +vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d"
>> diff --git a/stubs/meson.build b/stubs/meson.build
>> index d359cbe1ad..c5ce979dc3 100644
>> --- a/stubs/meson.build
>> +++ b/stubs/meson.build
>> @@ -57,3 +57,4 @@ if have_system
>> else
>>   stub_ss.add(files('qdev.c'))
>> endif
>> +stub_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_false: files('vfio-user-obj.c'))
>> -- 
>> 2.20.1
>>
Stefan Hajnoczi March 8, 2022, 10:15 a.m. UTC | #3
On Mon, Mar 07, 2022 at 03:10:41PM +0000, Jag Raman wrote:
> > On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
> >> @@ -332,6 +336,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
> >>     msi_send_message(dev, msg);
> >> }
> >> 
> >> +void msi_notify(PCIDevice *dev, unsigned int vector)
> >> +{
> >> +    if (dev->msi_notify) {
> > 
> > Can this ever be NULL?
> 
> Unlikely in the current code flow, but it could change in the future.
> 
> As a matter of principle, I thought that we should check if a function
> pointer is non-NULL before invoking it in QEMU. Is that not the case?

No, it's better to dump core with a backtrace when a program invariant
is violated than to silently suppress it. If msi_notify() is called but
the function pointer is NULL then there is a bug in the program that
needs to be fixed.

Stefan
Jag Raman March 26, 2022, 11:47 p.m. UTC | #4
> On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
>> Forward remote device's interrupts to the guest
>> 
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> include/hw/pci/pci.h              |   6 ++
>> include/hw/remote/vfio-user-obj.h |   6 ++
>> hw/pci/msi.c                      |  13 +++-
>> hw/pci/msix.c                     |  12 +++-
>> hw/remote/machine.c               |  11 +--
>> hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
>> stubs/vfio-user-obj.c             |   6 ++
>> MAINTAINERS                       |   1 +
>> hw/remote/trace-events            |   1 +
>> stubs/meson.build                 |   1 +
>> 10 files changed, 158 insertions(+), 6 deletions(-)
>> create mode 100644 include/hw/remote/vfio-user-obj.h
>> create mode 100644 stubs/vfio-user-obj.c
>> 
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index c3f3c90473..d42d526a48 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
>> typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
>>                                 pcibus_t addr, pcibus_t size, int type);
>> typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
>> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
>> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
>> 
>> typedef struct PCIIORegion {
>>     pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
>> @@ -323,6 +325,10 @@ struct PCIDevice {
>>     /* Space to store MSIX table & pending bit array */
>>     uint8_t *msix_table;
>>     uint8_t *msix_pba;
>> +
>> +    PCIMSINotify *msi_notify;
>> +    PCIMSIxNotify *msix_notify;
>> +
>>     /* MemoryRegion container for msix exclusive BAR setup */
>>     MemoryRegion msix_exclusive_bar;
>>     /* Memory Regions for MSIX table and pending bit entries. */
>> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
>> new file mode 100644
>> index 0000000000..87ab78b875
>> --- /dev/null
>> +++ b/include/hw/remote/vfio-user-obj.h
>> @@ -0,0 +1,6 @@
>> +#ifndef VFIO_USER_OBJ_H
>> +#define VFIO_USER_OBJ_H
>> +
>> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
>> +
>> +#endif
>> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
>> index 47d2b0f33c..93f5e400cc 100644
>> --- a/hw/pci/msi.c
>> +++ b/hw/pci/msi.c
>> @@ -51,6 +51,8 @@
>>  */
>> bool msi_nonbroken;
>> 
>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
>> +
>> /* If we get rid of cap allocator, we won't need this. */
>> static inline uint8_t msi_cap_sizeof(uint16_t flags)
>> {
>> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
>>     dev->msi_cap = config_offset;
>>     dev->cap_present |= QEMU_PCI_CAP_MSI;
>> 
>> +    dev->msi_notify = pci_msi_notify;
> 
> Are you sure it's correct to skip the msi_is_masked() logic? I think the
> callback function should only override the behavior of
> msi_send_message(), not the entire msi_notify() function.
> 
> The same applies to MSI-X.

Hi Stefan,

We noticed that the client is handling the masking and unmasking of MSIx
interrupts.

Concerning MSIx, vfio_msix_vector_use() handles unmasking and
vfio_msix_vector_release() handles masking operations. The server triggers
an MSIx interrupt by signaling the eventfd associated with the vector. If the vector
is unmasked, the interrupt bypasses the client/QEMU and takes this
path: “server -> KVM -> guest”. Whereas, if the vector is masked, it lands on the
client via: “server -> vfio_msi_interrupt()”. vfio_msi_interrupt() suppresses the
interrupt if the vector is masked. The use and release functions switch the server’s
eventfd between VFIOPCIDevice->VFIOMSIVector[i]->kvm_interrupt and
VFIOPCIDevice->VFIOMSIVector[i]->interrupt using the
VFIO_DEVICE_SET_IRQS message.

Concerning MSI, the server should check if the vector is unmasked before
triggering. The server is not doing it presently, will update it. For some reason,
I had assumed that MSI handling is similar to MSIx in terms of masking - sorry
about that. The masking and unmasking information for MSI is in the config space
registers, so the server should have this information.

You had previously suggested using callbacks for msi_get_message &
msi_send_message, considering the masking issue. Given MSIx masking
(including the MSIx table BAR) is handled at the client, the masking information
doesn’t reach the server - so msix_notify will never invoke the
msi_send_message callback - all the vectors remain masked at the server
end (msix_init() -> msix_mask_all()).

Thank you!
--
Jag

> 
>> +
>>     pci_set_word(dev->config + msi_flags_off(dev), flags);
>>     pci_set_word(dev->wmask + msi_flags_off(dev),
>>                  PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
>> @@ -307,7 +311,7 @@ bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
>>     return mask & (1U << vector);
>> }
>> 
>> -void msi_notify(PCIDevice *dev, unsigned int vector)
>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector)
>> {
>>     uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
>>     bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
>> @@ -332,6 +336,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector)
>>     msi_send_message(dev, msg);
>> }
>> 
>> +void msi_notify(PCIDevice *dev, unsigned int vector)
>> +{
>> +    if (dev->msi_notify) {
> 
> Can this ever be NULL?
> 
>> +        dev->msi_notify(dev, vector);
>> +    }
>> +}
>> +
>> void msi_send_message(PCIDevice *dev, MSIMessage msg)
>> {
>>     MemTxAttrs attrs = {};
>> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
>> index ae9331cd0b..1c71e67f53 100644
>> --- a/hw/pci/msix.c
>> +++ b/hw/pci/msix.c
>> @@ -31,6 +31,8 @@
>> #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
>> #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
>> 
>> +static void pci_msix_notify(PCIDevice *dev, unsigned vector);
>> +
>> MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
>> {
>>     uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
>> @@ -334,6 +336,7 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries,
>>     dev->msix_table = g_malloc0(table_size);
>>     dev->msix_pba = g_malloc0(pba_size);
>>     dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
>> +    dev->msix_notify = pci_msix_notify;
>> 
>>     msix_mask_all(dev, nentries);
>> 
>> @@ -485,7 +488,7 @@ int msix_enabled(PCIDevice *dev)
>> }
>> 
>> /* Send an MSI-X message */
>> -void msix_notify(PCIDevice *dev, unsigned vector)
>> +static void pci_msix_notify(PCIDevice *dev, unsigned vector)
>> {
>>     MSIMessage msg;
>> 
>> @@ -503,6 +506,13 @@ void msix_notify(PCIDevice *dev, unsigned vector)
>>     msi_send_message(dev, msg);
>> }
>> 
>> +void msix_notify(PCIDevice *dev, unsigned vector)
>> +{
>> +    if (dev->msix_notify) {
> 
> Can this ever be NULL?
> 
>> +        dev->msix_notify(dev, vector);
>> +    }
>> +}
>> +
>> void msix_reset(PCIDevice *dev)
>> {
>>     if (!msix_present(dev)) {
>> diff --git a/hw/remote/machine.c b/hw/remote/machine.c
>> index db4ae30710..a8b4a3aef3 100644
>> --- a/hw/remote/machine.c
>> +++ b/hw/remote/machine.c
>> @@ -23,6 +23,7 @@
>> #include "hw/remote/iohub.h"
>> #include "hw/qdev-core.h"
>> #include "hw/remote/iommu.h"
>> +#include "hw/remote/vfio-user-obj.h"
>> 
>> static void remote_machine_init(MachineState *machine)
>> {
>> @@ -54,12 +55,14 @@ static void remote_machine_init(MachineState *machine)
>> 
>>     if (s->vfio_user) {
>>         remote_configure_iommu(pci_host->bus);
>> -    }
>> 
>> -    remote_iohub_init(&s->iohub);
>> +        vfu_object_set_bus_irq(pci_host->bus);
>> +    } else {
>> +        remote_iohub_init(&s->iohub);
>> 
>> -    pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
>> -                 &s->iohub, REMOTE_IOHUB_NB_PIRQS);
>> +        pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
>> +                     &s->iohub, REMOTE_IOHUB_NB_PIRQS);
>> +    }
>> 
>>     qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
>> }
>> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
>> index 2feabd06a4..d79bab87f1 100644
>> --- a/hw/remote/vfio-user-obj.c
>> +++ b/hw/remote/vfio-user-obj.c
>> @@ -54,6 +54,9 @@
>> #include "hw/pci/pci.h"
>> #include "qemu/timer.h"
>> #include "exec/memory.h"
>> +#include "hw/pci/msi.h"
>> +#include "hw/pci/msix.h"
>> +#include "hw/remote/vfio-user-obj.h"
>> 
>> #define TYPE_VFU_OBJECT "x-vfio-user-server"
>> OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
>> @@ -107,6 +110,10 @@ struct VfuObject {
>>     int vfu_poll_fd;
>> };
>> 
>> +static GHashTable *vfu_object_bdf_to_ctx_table;
> 
> I suggest adding a void *msi_notify_opaque field to PCIDevice and
> passing the value as an argument to ->msi_notify(). vfio-user-obj.c can
> set the value to vfu_ctx and eliminate the vfu_object_bdf_to_ctx_table
> hash table.
> 
> This simplifies the code, makes it faster, and solves race conditions
> during hot plug/unplug if other instances are running in IOThreads.
> 
>> +
>> +#define INT2VOIDP(i) (void *)(uintptr_t)(i)
>> +
>> static void vfu_object_init_ctx(VfuObject *o, Error **errp);
>> 
>> static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
>> @@ -463,6 +470,86 @@ static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
>>     }
>> }
>> 
>> +static void vfu_object_irq_trigger(int pci_bdf, unsigned vector)
>> +{
>> +    vfu_ctx_t *vfu_ctx = NULL;
>> +
>> +    if (!vfu_object_bdf_to_ctx_table) {
> 
> Can this ever be NULL?
> 
>> +        return;
>> +    }
>> +
>> +    vfu_ctx = g_hash_table_lookup(vfu_object_bdf_to_ctx_table,
>> +                                  INT2VOIDP(pci_bdf));
>> +
>> +    if (vfu_ctx) {
>> +        vfu_irq_trigger(vfu_ctx, vector);
>> +    }
>> +}
>> +
>> +static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
>> +{
>> +    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
>> +                                pci_dev->devfn);
>> +
>> +    return pci_bdf;
>> +}
>> +
>> +static void vfu_object_set_irq(void *opaque, int pirq, int level)
>> +{
>> +    if (level) {
>> +        vfu_object_irq_trigger(pirq, 0);
>> +    }
>> +}
>> +
>> +static void vfu_object_msi_notify(PCIDevice *pci_dev, unsigned vector)
>> +{
>> +    int pci_bdf;
>> +
>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>> +
>> +    vfu_object_irq_trigger(pci_bdf, vector);
>> +}
>> +
>> +static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
>> +{
>> +    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
>> +    int ret, pci_bdf;
>> +
>> +    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = 0;
>> +    if (msix_nr_vectors_allocated(pci_dev)) {
>> +        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
>> +                                       msix_nr_vectors_allocated(pci_dev));
>> +
>> +        pci_dev->msix_notify = vfu_object_msi_notify;
>> +    } else if (msi_nr_vectors_allocated(pci_dev)) {
>> +        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
>> +                                       msi_nr_vectors_allocated(pci_dev));
>> +
>> +        pci_dev->msi_notify = vfu_object_msi_notify;
>> +    }
>> +
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
>> +
>> +    g_hash_table_insert(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf),
>> +                        o->vfu_ctx);
>> +
>> +    return 0;
>> +}
>> +
>> +void vfu_object_set_bus_irq(PCIBus *pci_bus)
>> +{
>> +    pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, NULL, 1);
>> +}
>> +
>> /*
>>  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
>>  * properties. It also depends on devices instantiated in QEMU. These
>> @@ -559,6 +646,13 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
>> 
>>     vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
>> 
>> +    ret = vfu_object_setup_irqs(o, o->pci_dev);
>> +    if (ret < 0) {
>> +        error_setg(errp, "vfu: Failed to setup interrupts for %s",
>> +                   o->device);
>> +        goto fail;
>> +    }
>> +
>>     ret = vfu_realize_ctx(o->vfu_ctx);
>>     if (ret < 0) {
>>         error_setg(errp, "vfu: Failed to realize device %s- %s",
>> @@ -612,6 +706,7 @@ static void vfu_object_finalize(Object *obj)
>> {
>>     VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
>>     VfuObject *o = VFU_OBJECT(obj);
>> +    int pci_bdf;
>> 
>>     k->nr_devs--;
>> 
>> @@ -638,9 +733,17 @@ static void vfu_object_finalize(Object *obj)
>>         o->unplug_blocker = NULL;
>>     }
>> 
>> +    if (o->pci_dev) {
>> +        pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(o->pci_dev)),
>> +                                o->pci_dev->devfn);
>> +        g_hash_table_remove(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf));
>> +    }
>> +
>>     o->pci_dev = NULL;
>> 
>>     if (!k->nr_devs && k->auto_shutdown) {
>> +        g_hash_table_destroy(vfu_object_bdf_to_ctx_table);
>> +        vfu_object_bdf_to_ctx_table = NULL;
>>         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
>>     }
>> 
>> @@ -658,6 +761,10 @@ static void vfu_object_class_init(ObjectClass *klass, void *data)
>> 
>>     k->auto_shutdown = true;
>> 
>> +    msi_nonbroken = true;
> 
> This should go in hw/remote/machine.c. It's a global variable related to
> the machine's interrupt controller capabilities. The value is not
> related to vfu_object_class_init(), which will be called by any QEMU
> binary that links hw/remote/vfio-user-obj.o regardless of which machine
> type is instantiated.
> 
>> +
>> +    vfu_object_bdf_to_ctx_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
>> +
>>     object_class_property_add(klass, "socket", "SocketAddress", NULL,
>>                               vfu_object_set_socket, NULL, NULL);
>>     object_class_property_set_description(klass, "socket",
>> diff --git a/stubs/vfio-user-obj.c b/stubs/vfio-user-obj.c
>> new file mode 100644
>> index 0000000000..79100d768e
>> --- /dev/null
>> +++ b/stubs/vfio-user-obj.c
>> @@ -0,0 +1,6 @@
>> +#include "qemu/osdep.h"
>> +#include "hw/remote/vfio-user-obj.h"
>> +
>> +void vfu_object_set_bus_irq(PCIBus *pci_bus)
>> +{
>> +}
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index f47232c78c..e274cb46af 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -3569,6 +3569,7 @@ F: hw/remote/iohub.c
>> F: include/hw/remote/iohub.h
>> F: subprojects/libvfio-user
>> F: hw/remote/vfio-user-obj.c
>> +F: include/hw/remote/vfio-user-obj.h
>> F: hw/remote/iommu.c
>> F: include/hw/remote/iommu.h
>> 
>> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
>> index 847d50d88f..c167b3c7a5 100644
>> --- a/hw/remote/trace-events
>> +++ b/hw/remote/trace-events
>> @@ -12,3 +12,4 @@ vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
>> vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
>> vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
>> vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
>> +vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d"
>> diff --git a/stubs/meson.build b/stubs/meson.build
>> index d359cbe1ad..c5ce979dc3 100644
>> --- a/stubs/meson.build
>> +++ b/stubs/meson.build
>> @@ -57,3 +57,4 @@ if have_system
>> else
>>   stub_ss.add(files('qdev.c'))
>> endif
>> +stub_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_false: files('vfio-user-obj.c'))
>> -- 
>> 2.20.1
>>
Stefan Hajnoczi March 29, 2022, 2:24 p.m. UTC | #5
On Sat, Mar 26, 2022 at 11:47:36PM +0000, Jag Raman wrote:
> 
> 
> > On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
> >> Forward remote device's interrupts to the guest
> >> 
> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >> ---
> >> include/hw/pci/pci.h              |   6 ++
> >> include/hw/remote/vfio-user-obj.h |   6 ++
> >> hw/pci/msi.c                      |  13 +++-
> >> hw/pci/msix.c                     |  12 +++-
> >> hw/remote/machine.c               |  11 +--
> >> hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
> >> stubs/vfio-user-obj.c             |   6 ++
> >> MAINTAINERS                       |   1 +
> >> hw/remote/trace-events            |   1 +
> >> stubs/meson.build                 |   1 +
> >> 10 files changed, 158 insertions(+), 6 deletions(-)
> >> create mode 100644 include/hw/remote/vfio-user-obj.h
> >> create mode 100644 stubs/vfio-user-obj.c
> >> 
> >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >> index c3f3c90473..d42d526a48 100644
> >> --- a/include/hw/pci/pci.h
> >> +++ b/include/hw/pci/pci.h
> >> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
> >> typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
> >>                                 pcibus_t addr, pcibus_t size, int type);
> >> typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
> >> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
> >> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
> >> 
> >> typedef struct PCIIORegion {
> >>     pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
> >> @@ -323,6 +325,10 @@ struct PCIDevice {
> >>     /* Space to store MSIX table & pending bit array */
> >>     uint8_t *msix_table;
> >>     uint8_t *msix_pba;
> >> +
> >> +    PCIMSINotify *msi_notify;
> >> +    PCIMSIxNotify *msix_notify;
> >> +
> >>     /* MemoryRegion container for msix exclusive BAR setup */
> >>     MemoryRegion msix_exclusive_bar;
> >>     /* Memory Regions for MSIX table and pending bit entries. */
> >> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
> >> new file mode 100644
> >> index 0000000000..87ab78b875
> >> --- /dev/null
> >> +++ b/include/hw/remote/vfio-user-obj.h
> >> @@ -0,0 +1,6 @@
> >> +#ifndef VFIO_USER_OBJ_H
> >> +#define VFIO_USER_OBJ_H
> >> +
> >> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
> >> +
> >> +#endif
> >> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
> >> index 47d2b0f33c..93f5e400cc 100644
> >> --- a/hw/pci/msi.c
> >> +++ b/hw/pci/msi.c
> >> @@ -51,6 +51,8 @@
> >>  */
> >> bool msi_nonbroken;
> >> 
> >> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
> >> +
> >> /* If we get rid of cap allocator, we won't need this. */
> >> static inline uint8_t msi_cap_sizeof(uint16_t flags)
> >> {
> >> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
> >>     dev->msi_cap = config_offset;
> >>     dev->cap_present |= QEMU_PCI_CAP_MSI;
> >> 
> >> +    dev->msi_notify = pci_msi_notify;
> > 
> > Are you sure it's correct to skip the msi_is_masked() logic? I think the
> > callback function should only override the behavior of
> > msi_send_message(), not the entire msi_notify() function.
> > 
> > The same applies to MSI-X.
> 
> Hi Stefan,
> 
> We noticed that the client is handling the masking and unmasking of MSIx
> interrupts.
> 
> Concerning MSIx, vfio_msix_vector_use() handles unmasking and
> vfio_msix_vector_release() handles masking operations. The server triggers
> an MSIx interrupt by signaling the eventfd associated with the vector. If the vector
> is unmasked, the interrupt bypasses the client/QEMU and takes this
> path: “server -> KVM -> guest”. Whereas, if the vector is masked, it lands on the
> client via: “server -> vfio_msi_interrupt()”. vfio_msi_interrupt() suppresses the
> interrupt if the vector is masked. The use and release functions switch the server’s
> eventfd between VFIOPCIDevice->VFIOMSIVector[i]->kvm_interrupt and
> VFIOPCIDevice->VFIOMSIVector[i]->interrupt using the
> VFIO_DEVICE_SET_IRQS message.
> 
> Concerning MSI, the server should check if the vector is unmasked before
> triggering. The server is not doing it presently, will update it. For some reason,
> I had assumed that MSI handling is similar to MSIx in terms of masking - sorry
> about that. The masking and unmasking information for MSI is in the config space
> registers, so the server should have this information.
> 
> You had previously suggested using callbacks for msi_get_message &
> msi_send_message, considering the masking issue. Given MSIx masking
> (including the MSIx table BAR) is handled at the client, the masking information
> doesn’t reach the server - so msix_notify will never invoke the
> msi_send_message callback - all the vectors remain masked at the server
> end (msix_init() -> msix_mask_all()).

I was expecting vfio-user devices to be involved in MSI-X masking so
they can implement the Pending bit semantics described in the spec:

  If a masked vector has its Pending bit Set, and the associated
  underlying interrupt events are somehow satisfied (usually by software
  though the exact manner is Function-specific), the Function must Clear
  the Pending bit, to avoid sending a spurious interrupt message later
  when software unmasks the vector.

Does QEMU VFIO support this?

What happens when a hw/net/e1000e_core.c vfio-user device uses
msix_clr_pending() and related APIs?

Also, having the vfio-user daemon write to the eventfd while the vector
is masked is a waste of CPU cycles. The PCIe spec describes using MSI-X
masking for poll mode operation and going via eventfd is suboptimal:

  Software is permitted to mask one or more vectors indefinitely, and
  service their associated interrupt events strictly based on polling
  their Pending bits. A Function must Set and Clear its Pending bits as
  necessary to support this “pure polling” mode of operation.

Maybe the answer is these issues don't matter in practice because MSI-X
masking is not used much?

Stefan
Jag Raman March 29, 2022, 7:06 p.m. UTC | #6
> On Mar 29, 2022, at 10:24 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Sat, Mar 26, 2022 at 11:47:36PM +0000, Jag Raman wrote:
>> 
>> 
>>> On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> 
>>> On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
>>>> Forward remote device's interrupts to the guest
>>>> 
>>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>>> ---
>>>> include/hw/pci/pci.h              |   6 ++
>>>> include/hw/remote/vfio-user-obj.h |   6 ++
>>>> hw/pci/msi.c                      |  13 +++-
>>>> hw/pci/msix.c                     |  12 +++-
>>>> hw/remote/machine.c               |  11 +--
>>>> hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
>>>> stubs/vfio-user-obj.c             |   6 ++
>>>> MAINTAINERS                       |   1 +
>>>> hw/remote/trace-events            |   1 +
>>>> stubs/meson.build                 |   1 +
>>>> 10 files changed, 158 insertions(+), 6 deletions(-)
>>>> create mode 100644 include/hw/remote/vfio-user-obj.h
>>>> create mode 100644 stubs/vfio-user-obj.c
>>>> 
>>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>>>> index c3f3c90473..d42d526a48 100644
>>>> --- a/include/hw/pci/pci.h
>>>> +++ b/include/hw/pci/pci.h
>>>> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
>>>> typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
>>>>                                pcibus_t addr, pcibus_t size, int type);
>>>> typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
>>>> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
>>>> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
>>>> 
>>>> typedef struct PCIIORegion {
>>>>    pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
>>>> @@ -323,6 +325,10 @@ struct PCIDevice {
>>>>    /* Space to store MSIX table & pending bit array */
>>>>    uint8_t *msix_table;
>>>>    uint8_t *msix_pba;
>>>> +
>>>> +    PCIMSINotify *msi_notify;
>>>> +    PCIMSIxNotify *msix_notify;
>>>> +
>>>>    /* MemoryRegion container for msix exclusive BAR setup */
>>>>    MemoryRegion msix_exclusive_bar;
>>>>    /* Memory Regions for MSIX table and pending bit entries. */
>>>> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
>>>> new file mode 100644
>>>> index 0000000000..87ab78b875
>>>> --- /dev/null
>>>> +++ b/include/hw/remote/vfio-user-obj.h
>>>> @@ -0,0 +1,6 @@
>>>> +#ifndef VFIO_USER_OBJ_H
>>>> +#define VFIO_USER_OBJ_H
>>>> +
>>>> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
>>>> +
>>>> +#endif
>>>> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
>>>> index 47d2b0f33c..93f5e400cc 100644
>>>> --- a/hw/pci/msi.c
>>>> +++ b/hw/pci/msi.c
>>>> @@ -51,6 +51,8 @@
>>>> */
>>>> bool msi_nonbroken;
>>>> 
>>>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
>>>> +
>>>> /* If we get rid of cap allocator, we won't need this. */
>>>> static inline uint8_t msi_cap_sizeof(uint16_t flags)
>>>> {
>>>> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
>>>>    dev->msi_cap = config_offset;
>>>>    dev->cap_present |= QEMU_PCI_CAP_MSI;
>>>> 
>>>> +    dev->msi_notify = pci_msi_notify;
>>> 
>>> Are you sure it's correct to skip the msi_is_masked() logic? I think the
>>> callback function should only override the behavior of
>>> msi_send_message(), not the entire msi_notify() function.
>>> 
>>> The same applies to MSI-X.
>> 
>> Hi Stefan,
>> 
>> We noticed that the client is handling the masking and unmasking of MSIx
>> interrupts.
>> 
>> Concerning MSIx, vfio_msix_vector_use() handles unmasking and
>> vfio_msix_vector_release() handles masking operations. The server triggers
>> an MSIx interrupt by signaling the eventfd associated with the vector. If the vector
>> is unmasked, the interrupt bypasses the client/QEMU and takes this
>> path: “server -> KVM -> guest”. Whereas, if the vector is masked, it lands on the
>> client via: “server -> vfio_msi_interrupt()”. vfio_msi_interrupt() suppresses the
>> interrupt if the vector is masked. The use and release functions switch the server’s
>> eventfd between VFIOPCIDevice->VFIOMSIVector[i]->kvm_interrupt and
>> VFIOPCIDevice->VFIOMSIVector[i]->interrupt using the
>> VFIO_DEVICE_SET_IRQS message.
>> 
>> Concerning MSI, the server should check if the vector is unmasked before
>> triggering. The server is not doing it presently, will update it. For some reason,
>> I had assumed that MSI handling is similar to MSIx in terms of masking - sorry
>> about that. The masking and unmasking information for MSI is in the config space
>> registers, so the server should have this information.
>> 
>> You had previously suggested using callbacks for msi_get_message &
>> msi_send_message, considering the masking issue. Given MSIx masking
>> (including the MSIx table BAR) is handled at the client, the masking information
>> doesn’t reach the server - so msix_notify will never invoke the
>> msi_send_message callback - all the vectors remain masked at the server
>> end (msix_init() -> msix_mask_all()).
> 
> I was expecting vfio-user devices to be involved in MSI-X masking so
> they can implement the Pending bit semantics described in the spec:
> 
>  If a masked vector has its Pending bit Set, and the associated
>  underlying interrupt events are somehow satisfied (usually by software
>  though the exact manner is Function-specific), the Function must Clear
>  the Pending bit, to avoid sending a spurious interrupt message later
>  when software unmasks the vector.
> 
> Does QEMU VFIO support this?

QEMU VFIO doesn’t seem to support it - I couldn’t find a place where
an assigned/passthru PCI device clears the pending bits in QEMU.

> 
> What happens when a hw/net/e1000e_core.c vfio-user device uses
> msix_clr_pending() and related APIs?
> 
> Also, having the vfio-user daemon write to the eventfd while the vector
> is masked is a waste of CPU cycles. The PCIe spec describes using MSI-X
> masking for poll mode operation and going via eventfd is suboptimal:
> 
>  Software is permitted to mask one or more vectors indefinitely, and
>  service their associated interrupt events strictly based on polling
>  their Pending bits. A Function must Set and Clear its Pending bits as
>  necessary to support this “pure polling” mode of operation.
> 
> Maybe the answer is these issues don't matter in practice because MSI-X
> masking is not used much?

From what I can tell, “pure polling” is used by ivshmem and virtio-pci devices in QEMU.

e1000e doesn’t use “pure polling”, but it does clear pending interrupts.

John Johnson, John Levon & Thanos,

    Any thoughts?

Thank you!
--
Jag

> 
> Stefan
Thanos Makatos March 30, 2022, 9:40 a.m. UTC | #7
> -----Original Message-----
> From: Jag Raman <jag.raman@oracle.com>
> Sent: 29 March 2022 20:07
> To: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>; qemu-devel <qemu-
> devel@nongnu.org>; Michael S. Tsirkin <mst@redhat.com>; Philippe Mathieu-
> Daudé <f4bug@amsat.org>; Paolo Bonzini <pbonzini@redhat.com>; Beraldo
> Leal <bleal@redhat.com>; Daniel P. Berrangé <berrange@redhat.com>;
> eduardo@habkost.net; Marcel Apfelbaum <marcel.apfelbaum@gmail.com>;
> Eric Blake <eblake@redhat.com>; Markus Armbruster <armbru@redhat.com>;
> Juan Quintela <quintela@redhat.com>; dgilbert@redhat.com; John Levon
> <john.levon@nutanix.com>; Thanos Makatos <thanos.makatos@nutanix.com>;
> Elena Ufimtseva <elena.ufimtseva@oracle.com>; John Johnson
> <john.g.johnson@oracle.com>; Kanth Ghatraju <kanth.ghatraju@oracle.com>
> Subject: Re: [PATCH v6 15/19] vfio-user: handle device interrupts
> 
> 
> 
> > On Mar 29, 2022, at 10:24 AM, Stefan Hajnoczi <stefanha@redhat.com>
> wrote:
> >
> > On Sat, Mar 26, 2022 at 11:47:36PM +0000, Jag Raman wrote:
> >>
> >>
> >>> On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com>
> wrote:
> >>>
> >>> On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
> >>>> Forward remote device's interrupts to the guest
> >>>>
> >>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >>>> ---
> >>>> include/hw/pci/pci.h              |   6 ++
> >>>> include/hw/remote/vfio-user-obj.h |   6 ++
> >>>> hw/pci/msi.c                      |  13 +++-
> >>>> hw/pci/msix.c                     |  12 +++-
> >>>> hw/remote/machine.c               |  11 +--
> >>>> hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
> >>>> stubs/vfio-user-obj.c             |   6 ++
> >>>> MAINTAINERS                       |   1 +
> >>>> hw/remote/trace-events            |   1 +
> >>>> stubs/meson.build                 |   1 +
> >>>> 10 files changed, 158 insertions(+), 6 deletions(-)
> >>>> create mode 100644 include/hw/remote/vfio-user-obj.h
> >>>> create mode 100644 stubs/vfio-user-obj.c
> >>>>
> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >>>> index c3f3c90473..d42d526a48 100644
> >>>> --- a/include/hw/pci/pci.h
> >>>> +++ b/include/hw/pci/pci.h
> >>>> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice
> *pci_dev,
> >>>> typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
> >>>>                                pcibus_t addr, pcibus_t size, int type);
> >>>> typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
> >>>> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
> >>>> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
> >>>>
> >>>> typedef struct PCIIORegion {
> >>>>    pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
> >>>> @@ -323,6 +325,10 @@ struct PCIDevice {
> >>>>    /* Space to store MSIX table & pending bit array */
> >>>>    uint8_t *msix_table;
> >>>>    uint8_t *msix_pba;
> >>>> +
> >>>> +    PCIMSINotify *msi_notify;
> >>>> +    PCIMSIxNotify *msix_notify;
> >>>> +
> >>>>    /* MemoryRegion container for msix exclusive BAR setup */
> >>>>    MemoryRegion msix_exclusive_bar;
> >>>>    /* Memory Regions for MSIX table and pending bit entries. */
> >>>> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-
> user-obj.h
> >>>> new file mode 100644
> >>>> index 0000000000..87ab78b875
> >>>> --- /dev/null
> >>>> +++ b/include/hw/remote/vfio-user-obj.h
> >>>> @@ -0,0 +1,6 @@
> >>>> +#ifndef VFIO_USER_OBJ_H
> >>>> +#define VFIO_USER_OBJ_H
> >>>> +
> >>>> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
> >>>> +
> >>>> +#endif
> >>>> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
> >>>> index 47d2b0f33c..93f5e400cc 100644
> >>>> --- a/hw/pci/msi.c
> >>>> +++ b/hw/pci/msi.c
> >>>> @@ -51,6 +51,8 @@
> >>>> */
> >>>> bool msi_nonbroken;
> >>>>
> >>>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
> >>>> +
> >>>> /* If we get rid of cap allocator, we won't need this. */
> >>>> static inline uint8_t msi_cap_sizeof(uint16_t flags)
> >>>> {
> >>>> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
> >>>>    dev->msi_cap = config_offset;
> >>>>    dev->cap_present |= QEMU_PCI_CAP_MSI;
> >>>>
> >>>> +    dev->msi_notify = pci_msi_notify;
> >>>
> >>> Are you sure it's correct to skip the msi_is_masked() logic? I think the
> >>> callback function should only override the behavior of
> >>> msi_send_message(), not the entire msi_notify() function.
> >>>
> >>> The same applies to MSI-X.
> >>
> >> Hi Stefan,
> >>
> >> We noticed that the client is handling the masking and unmasking of MSIx
> >> interrupts.
> >>
> >> Concerning MSIx, vfio_msix_vector_use() handles unmasking and
> >> vfio_msix_vector_release() handles masking operations. The server triggers
> >> an MSIx interrupt by signaling the eventfd associated with the vector. If the
> vector
> >> is unmasked, the interrupt bypasses the client/QEMU and takes this
> >> path: “server -> KVM -> guest”. Whereas, if the vector is masked, it lands on
> the
> >> client via: “server -> vfio_msi_interrupt()”. vfio_msi_interrupt() suppresses
> the
> >> interrupt if the vector is masked. The use and release functions switch the
> server’s
> >> eventfd between VFIOPCIDevice->VFIOMSIVector[i]->kvm_interrupt and
> >> VFIOPCIDevice->VFIOMSIVector[i]->interrupt using the
> >> VFIO_DEVICE_SET_IRQS message.
> >>
> >> Concerning MSI, the server should check if the vector is unmasked before
> >> triggering. The server is not doing it presently, will update it. For some
> reason,
> >> I had assumed that MSI handling is similar to MSIx in terms of masking - sorry
> >> about that. The masking and unmasking information for MSI is in the config
> space
> >> registers, so the server should have this information.
> >>
> >> You had previously suggested using callbacks for msi_get_message &
> >> msi_send_message, considering the masking issue. Given MSIx masking
> >> (including the MSIx table BAR) is handled at the client, the masking
> information
> >> doesn’t reach the server - so msix_notify will never invoke the
> >> msi_send_message callback - all the vectors remain masked at the server
> >> end (msix_init() -> msix_mask_all()).
> >
> > I was expecting vfio-user devices to be involved in MSI-X masking so

libvfio-user can't be involved in the first place since QEMU emulates MSI/X:
https://lore.kernel.org/all/20200121101911.64701afd@w520.home/T/

> > they can implement the Pending bit semantics described in the spec:
> >
> >  If a masked vector has its Pending bit Set, and the associated
> >  underlying interrupt events are somehow satisfied (usually by software
> >  though the exact manner is Function-specific), the Function must Clear
> >  the Pending bit, to avoid sending a spurious interrupt message later
> >  when software unmasks the vector.
> >
> > Does QEMU VFIO support this?
> 
> QEMU VFIO doesn’t seem to support it - I couldn’t find a place where
> an assigned/passthru PCI device clears the pending bits in QEMU.
> 
> >
> > What happens when a hw/net/e1000e_core.c vfio-user device uses
> > msix_clr_pending() and related APIs?
> >
> > Also, having the vfio-user daemon write to the eventfd while the vector
> > is masked is a waste of CPU cycles. The PCIe spec describes using MSI-X
> > masking for poll mode operation and going via eventfd is suboptimal:
> >
> >  Software is permitted to mask one or more vectors indefinitely, and
> >  service their associated interrupt events strictly based on polling
> >  their Pending bits. A Function must Set and Clear its Pending bits as
> >  necessary to support this “pure polling” mode of operation.
> >
> > Maybe the answer is these issues don't matter in practice because MSI-X
> > masking is not used much?
> 
> From what I can tell, “pure polling” is used by ivshmem and virtio-pci devices in
> QEMU.
> 
> e1000e doesn’t use “pure polling”, but it does clear pending interrupts.
> 
> John Johnson, John Levon & Thanos,
> 
>     Any thoughts?

If QEMU stops emulating MSI/X then we libvfio-user would have to do it.
Stefan Hajnoczi April 4, 2022, 9:44 a.m. UTC | #8
On Wed, Mar 30, 2022 at 09:40:42AM +0000, Thanos Makatos wrote:
> > -----Original Message-----
> > From: Jag Raman <jag.raman@oracle.com>
> > Sent: 29 March 2022 20:07
> > To: Stefan Hajnoczi <stefanha@redhat.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>; qemu-devel <qemu-
> > devel@nongnu.org>; Michael S. Tsirkin <mst@redhat.com>; Philippe Mathieu-
> > Daudé <f4bug@amsat.org>; Paolo Bonzini <pbonzini@redhat.com>; Beraldo
> > Leal <bleal@redhat.com>; Daniel P. Berrangé <berrange@redhat.com>;
> > eduardo@habkost.net; Marcel Apfelbaum <marcel.apfelbaum@gmail.com>;
> > Eric Blake <eblake@redhat.com>; Markus Armbruster <armbru@redhat.com>;
> > Juan Quintela <quintela@redhat.com>; dgilbert@redhat.com; John Levon
> > <john.levon@nutanix.com>; Thanos Makatos <thanos.makatos@nutanix.com>;
> > Elena Ufimtseva <elena.ufimtseva@oracle.com>; John Johnson
> > <john.g.johnson@oracle.com>; Kanth Ghatraju <kanth.ghatraju@oracle.com>
> > Subject: Re: [PATCH v6 15/19] vfio-user: handle device interrupts
> > 
> > 
> > 
> > > On Mar 29, 2022, at 10:24 AM, Stefan Hajnoczi <stefanha@redhat.com>
> > wrote:
> > >
> > > On Sat, Mar 26, 2022 at 11:47:36PM +0000, Jag Raman wrote:
> > >>
> > >>
> > >>> On Mar 7, 2022, at 5:24 AM, Stefan Hajnoczi <stefanha@redhat.com>
> > wrote:
> > >>>
> > >>> On Thu, Feb 17, 2022 at 02:49:02AM -0500, Jagannathan Raman wrote:
> > >>>> Forward remote device's interrupts to the guest
> > >>>>
> > >>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> > >>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> > >>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> > >>>> ---
> > >>>> include/hw/pci/pci.h              |   6 ++
> > >>>> include/hw/remote/vfio-user-obj.h |   6 ++
> > >>>> hw/pci/msi.c                      |  13 +++-
> > >>>> hw/pci/msix.c                     |  12 +++-
> > >>>> hw/remote/machine.c               |  11 +--
> > >>>> hw/remote/vfio-user-obj.c         | 107 ++++++++++++++++++++++++++++++
> > >>>> stubs/vfio-user-obj.c             |   6 ++
> > >>>> MAINTAINERS                       |   1 +
> > >>>> hw/remote/trace-events            |   1 +
> > >>>> stubs/meson.build                 |   1 +
> > >>>> 10 files changed, 158 insertions(+), 6 deletions(-)
> > >>>> create mode 100644 include/hw/remote/vfio-user-obj.h
> > >>>> create mode 100644 stubs/vfio-user-obj.c
> > >>>>
> > >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > >>>> index c3f3c90473..d42d526a48 100644
> > >>>> --- a/include/hw/pci/pci.h
> > >>>> +++ b/include/hw/pci/pci.h
> > >>>> @@ -129,6 +129,8 @@ typedef uint32_t PCIConfigReadFunc(PCIDevice
> > *pci_dev,
> > >>>> typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
> > >>>>                                pcibus_t addr, pcibus_t size, int type);
> > >>>> typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
> > >>>> +typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
> > >>>> +typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
> > >>>>
> > >>>> typedef struct PCIIORegion {
> > >>>>    pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
> > >>>> @@ -323,6 +325,10 @@ struct PCIDevice {
> > >>>>    /* Space to store MSIX table & pending bit array */
> > >>>>    uint8_t *msix_table;
> > >>>>    uint8_t *msix_pba;
> > >>>> +
> > >>>> +    PCIMSINotify *msi_notify;
> > >>>> +    PCIMSIxNotify *msix_notify;
> > >>>> +
> > >>>>    /* MemoryRegion container for msix exclusive BAR setup */
> > >>>>    MemoryRegion msix_exclusive_bar;
> > >>>>    /* Memory Regions for MSIX table and pending bit entries. */
> > >>>> diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-
> > user-obj.h
> > >>>> new file mode 100644
> > >>>> index 0000000000..87ab78b875
> > >>>> --- /dev/null
> > >>>> +++ b/include/hw/remote/vfio-user-obj.h
> > >>>> @@ -0,0 +1,6 @@
> > >>>> +#ifndef VFIO_USER_OBJ_H
> > >>>> +#define VFIO_USER_OBJ_H
> > >>>> +
> > >>>> +void vfu_object_set_bus_irq(PCIBus *pci_bus);
> > >>>> +
> > >>>> +#endif
> > >>>> diff --git a/hw/pci/msi.c b/hw/pci/msi.c
> > >>>> index 47d2b0f33c..93f5e400cc 100644
> > >>>> --- a/hw/pci/msi.c
> > >>>> +++ b/hw/pci/msi.c
> > >>>> @@ -51,6 +51,8 @@
> > >>>> */
> > >>>> bool msi_nonbroken;
> > >>>>
> > >>>> +static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
> > >>>> +
> > >>>> /* If we get rid of cap allocator, we won't need this. */
> > >>>> static inline uint8_t msi_cap_sizeof(uint16_t flags)
> > >>>> {
> > >>>> @@ -225,6 +227,8 @@ int msi_init(struct PCIDevice *dev, uint8_t offset,
> > >>>>    dev->msi_cap = config_offset;
> > >>>>    dev->cap_present |= QEMU_PCI_CAP_MSI;
> > >>>>
> > >>>> +    dev->msi_notify = pci_msi_notify;
> > >>>
> > >>> Are you sure it's correct to skip the msi_is_masked() logic? I think the
> > >>> callback function should only override the behavior of
> > >>> msi_send_message(), not the entire msi_notify() function.
> > >>>
> > >>> The same applies to MSI-X.
> > >>
> > >> Hi Stefan,
> > >>
> > >> We noticed that the client is handling the masking and unmasking of MSIx
> > >> interrupts.
> > >>
> > >> Concerning MSIx, vfio_msix_vector_use() handles unmasking and
> > >> vfio_msix_vector_release() handles masking operations. The server triggers
> > >> an MSIx interrupt by signaling the eventfd associated with the vector. If the
> > vector
> > >> is unmasked, the interrupt bypasses the client/QEMU and takes this
> > >> path: “server -> KVM -> guest”. Whereas, if the vector is masked, it lands on
> > the
> > >> client via: “server -> vfio_msi_interrupt()”. vfio_msi_interrupt() suppresses
> > the
> > >> interrupt if the vector is masked. The use and release functions switch the
> > server’s
> > >> eventfd between VFIOPCIDevice->VFIOMSIVector[i]->kvm_interrupt and
> > >> VFIOPCIDevice->VFIOMSIVector[i]->interrupt using the
> > >> VFIO_DEVICE_SET_IRQS message.
> > >>
> > >> Concerning MSI, the server should check if the vector is unmasked before
> > >> triggering. The server is not doing it presently, will update it. For some
> > reason,
> > >> I had assumed that MSI handling is similar to MSIx in terms of masking - sorry
> > >> about that. The masking and unmasking information for MSI is in the config
> > space
> > >> registers, so the server should have this information.
> > >>
> > >> You had previously suggested using callbacks for msi_get_message &
> > >> msi_send_message, considering the masking issue. Given MSIx masking
> > >> (including the MSIx table BAR) is handled at the client, the masking
> > information
> > >> doesn’t reach the server - so msix_notify will never invoke the
> > >> msi_send_message callback - all the vectors remain masked at the server
> > >> end (msix_init() -> msix_mask_all()).
> > >
> > > I was expecting vfio-user devices to be involved in MSI-X masking so
> 
> libvfio-user can't be involved in the first place since QEMU emulates MSI/X:
> https://lore.kernel.org/all/20200121101911.64701afd@w520.home/T/
> 
> > > they can implement the Pending bit semantics described in the spec:
> > >
> > >  If a masked vector has its Pending bit Set, and the associated
> > >  underlying interrupt events are somehow satisfied (usually by software
> > >  though the exact manner is Function-specific), the Function must Clear
> > >  the Pending bit, to avoid sending a spurious interrupt message later
> > >  when software unmasks the vector.
> > >
> > > Does QEMU VFIO support this?
> > 
> > QEMU VFIO doesn’t seem to support it - I couldn’t find a place where
> > an assigned/passthru PCI device clears the pending bits in QEMU.
> > 
> > >
> > > What happens when a hw/net/e1000e_core.c vfio-user device uses
> > > msix_clr_pending() and related APIs?
> > >
> > > Also, having the vfio-user daemon write to the eventfd while the vector
> > > is masked is a waste of CPU cycles. The PCIe spec describes using MSI-X
> > > masking for poll mode operation and going via eventfd is suboptimal:
> > >
> > >  Software is permitted to mask one or more vectors indefinitely, and
> > >  service their associated interrupt events strictly based on polling
> > >  their Pending bits. A Function must Set and Clear its Pending bits as
> > >  necessary to support this “pure polling” mode of operation.
> > >
> > > Maybe the answer is these issues don't matter in practice because MSI-X
> > > masking is not used much?
> > 
> > From what I can tell, “pure polling” is used by ivshmem and virtio-pci devices in
> > QEMU.
> > 
> > e1000e doesn’t use “pure polling”, but it does clear pending interrupts.
> > 
> > John Johnson, John Levon & Thanos,
> > 
> >     Any thoughts?
> 
> If QEMU stops emulating MSI/X then we libvfio-user would have to do it.

If Alex is happy with VFIO not supporting the exact Pending bits
semantics in the PCIe spec, then vfio-user probably doesn't need to
either.

I just wanted to raise this design issue and maybe we can document
somewhere that this is intentional.

Stefan
diff mbox series

Patch

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index c3f3c90473..d42d526a48 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -129,6 +129,8 @@  typedef uint32_t PCIConfigReadFunc(PCIDevice *pci_dev,
 typedef void PCIMapIORegionFunc(PCIDevice *pci_dev, int region_num,
                                 pcibus_t addr, pcibus_t size, int type);
 typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
+typedef void PCIMSINotify(PCIDevice *pci_dev, unsigned vector);
+typedef void PCIMSIxNotify(PCIDevice *pci_dev, unsigned vector);
 
 typedef struct PCIIORegion {
     pcibus_t addr; /* current PCI mapping address. -1 means not mapped */
@@ -323,6 +325,10 @@  struct PCIDevice {
     /* Space to store MSIX table & pending bit array */
     uint8_t *msix_table;
     uint8_t *msix_pba;
+
+    PCIMSINotify *msi_notify;
+    PCIMSIxNotify *msix_notify;
+
     /* MemoryRegion container for msix exclusive BAR setup */
     MemoryRegion msix_exclusive_bar;
     /* Memory Regions for MSIX table and pending bit entries. */
diff --git a/include/hw/remote/vfio-user-obj.h b/include/hw/remote/vfio-user-obj.h
new file mode 100644
index 0000000000..87ab78b875
--- /dev/null
+++ b/include/hw/remote/vfio-user-obj.h
@@ -0,0 +1,6 @@ 
+#ifndef VFIO_USER_OBJ_H
+#define VFIO_USER_OBJ_H
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus);
+
+#endif
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 47d2b0f33c..93f5e400cc 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -51,6 +51,8 @@ 
  */
 bool msi_nonbroken;
 
+static void pci_msi_notify(PCIDevice *dev, unsigned int vector);
+
 /* If we get rid of cap allocator, we won't need this. */
 static inline uint8_t msi_cap_sizeof(uint16_t flags)
 {
@@ -225,6 +227,8 @@  int msi_init(struct PCIDevice *dev, uint8_t offset,
     dev->msi_cap = config_offset;
     dev->cap_present |= QEMU_PCI_CAP_MSI;
 
+    dev->msi_notify = pci_msi_notify;
+
     pci_set_word(dev->config + msi_flags_off(dev), flags);
     pci_set_word(dev->wmask + msi_flags_off(dev),
                  PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
@@ -307,7 +311,7 @@  bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
     return mask & (1U << vector);
 }
 
-void msi_notify(PCIDevice *dev, unsigned int vector)
+static void pci_msi_notify(PCIDevice *dev, unsigned int vector)
 {
     uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
     bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
@@ -332,6 +336,13 @@  void msi_notify(PCIDevice *dev, unsigned int vector)
     msi_send_message(dev, msg);
 }
 
+void msi_notify(PCIDevice *dev, unsigned int vector)
+{
+    if (dev->msi_notify) {
+        dev->msi_notify(dev, vector);
+    }
+}
+
 void msi_send_message(PCIDevice *dev, MSIMessage msg)
 {
     MemTxAttrs attrs = {};
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index ae9331cd0b..1c71e67f53 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -31,6 +31,8 @@ 
 #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
 #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)
 
+static void pci_msix_notify(PCIDevice *dev, unsigned vector);
+
 MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
 {
     uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
@@ -334,6 +336,7 @@  int msix_init(struct PCIDevice *dev, unsigned short nentries,
     dev->msix_table = g_malloc0(table_size);
     dev->msix_pba = g_malloc0(pba_size);
     dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);
+    dev->msix_notify = pci_msix_notify;
 
     msix_mask_all(dev, nentries);
 
@@ -485,7 +488,7 @@  int msix_enabled(PCIDevice *dev)
 }
 
 /* Send an MSI-X message */
-void msix_notify(PCIDevice *dev, unsigned vector)
+static void pci_msix_notify(PCIDevice *dev, unsigned vector)
 {
     MSIMessage msg;
 
@@ -503,6 +506,13 @@  void msix_notify(PCIDevice *dev, unsigned vector)
     msi_send_message(dev, msg);
 }
 
+void msix_notify(PCIDevice *dev, unsigned vector)
+{
+    if (dev->msix_notify) {
+        dev->msix_notify(dev, vector);
+    }
+}
+
 void msix_reset(PCIDevice *dev)
 {
     if (!msix_present(dev)) {
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index db4ae30710..a8b4a3aef3 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -23,6 +23,7 @@ 
 #include "hw/remote/iohub.h"
 #include "hw/qdev-core.h"
 #include "hw/remote/iommu.h"
+#include "hw/remote/vfio-user-obj.h"
 
 static void remote_machine_init(MachineState *machine)
 {
@@ -54,12 +55,14 @@  static void remote_machine_init(MachineState *machine)
 
     if (s->vfio_user) {
         remote_configure_iommu(pci_host->bus);
-    }
 
-    remote_iohub_init(&s->iohub);
+        vfu_object_set_bus_irq(pci_host->bus);
+    } else {
+        remote_iohub_init(&s->iohub);
 
-    pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
-                 &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+        pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
+                     &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+    }
 
     qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s));
 }
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 2feabd06a4..d79bab87f1 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -54,6 +54,9 @@ 
 #include "hw/pci/pci.h"
 #include "qemu/timer.h"
 #include "exec/memory.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/remote/vfio-user-obj.h"
 
 #define TYPE_VFU_OBJECT "x-vfio-user-server"
 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -107,6 +110,10 @@  struct VfuObject {
     int vfu_poll_fd;
 };
 
+static GHashTable *vfu_object_bdf_to_ctx_table;
+
+#define INT2VOIDP(i) (void *)(uintptr_t)(i)
+
 static void vfu_object_init_ctx(VfuObject *o, Error **errp);
 
 static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
@@ -463,6 +470,86 @@  static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
     }
 }
 
+static void vfu_object_irq_trigger(int pci_bdf, unsigned vector)
+{
+    vfu_ctx_t *vfu_ctx = NULL;
+
+    if (!vfu_object_bdf_to_ctx_table) {
+        return;
+    }
+
+    vfu_ctx = g_hash_table_lookup(vfu_object_bdf_to_ctx_table,
+                                  INT2VOIDP(pci_bdf));
+
+    if (vfu_ctx) {
+        vfu_irq_trigger(vfu_ctx, vector);
+    }
+}
+
+static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
+{
+    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
+                                pci_dev->devfn);
+
+    return pci_bdf;
+}
+
+static void vfu_object_set_irq(void *opaque, int pirq, int level)
+{
+    if (level) {
+        vfu_object_irq_trigger(pirq, 0);
+    }
+}
+
+static void vfu_object_msi_notify(PCIDevice *pci_dev, unsigned vector)
+{
+    int pci_bdf;
+
+    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
+
+    vfu_object_irq_trigger(pci_bdf, vector);
+}
+
+static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
+{
+    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
+    int ret, pci_bdf;
+
+    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = 0;
+    if (msix_nr_vectors_allocated(pci_dev)) {
+        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
+                                       msix_nr_vectors_allocated(pci_dev));
+
+        pci_dev->msix_notify = vfu_object_msi_notify;
+    } else if (msi_nr_vectors_allocated(pci_dev)) {
+        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
+                                       msi_nr_vectors_allocated(pci_dev));
+
+        pci_dev->msi_notify = vfu_object_msi_notify;
+    }
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), pci_dev->devfn);
+
+    g_hash_table_insert(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf),
+                        o->vfu_ctx);
+
+    return 0;
+}
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus)
+{
+    pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, NULL, 1);
+}
+
 /*
  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
  * properties. It also depends on devices instantiated in QEMU. These
@@ -559,6 +646,13 @@  static void vfu_object_init_ctx(VfuObject *o, Error **errp)
 
     vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
 
+    ret = vfu_object_setup_irqs(o, o->pci_dev);
+    if (ret < 0) {
+        error_setg(errp, "vfu: Failed to setup interrupts for %s",
+                   o->device);
+        goto fail;
+    }
+
     ret = vfu_realize_ctx(o->vfu_ctx);
     if (ret < 0) {
         error_setg(errp, "vfu: Failed to realize device %s- %s",
@@ -612,6 +706,7 @@  static void vfu_object_finalize(Object *obj)
 {
     VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
     VfuObject *o = VFU_OBJECT(obj);
+    int pci_bdf;
 
     k->nr_devs--;
 
@@ -638,9 +733,17 @@  static void vfu_object_finalize(Object *obj)
         o->unplug_blocker = NULL;
     }
 
+    if (o->pci_dev) {
+        pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(o->pci_dev)),
+                                o->pci_dev->devfn);
+        g_hash_table_remove(vfu_object_bdf_to_ctx_table, INT2VOIDP(pci_bdf));
+    }
+
     o->pci_dev = NULL;
 
     if (!k->nr_devs && k->auto_shutdown) {
+        g_hash_table_destroy(vfu_object_bdf_to_ctx_table);
+        vfu_object_bdf_to_ctx_table = NULL;
         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
     }
 
@@ -658,6 +761,10 @@  static void vfu_object_class_init(ObjectClass *klass, void *data)
 
     k->auto_shutdown = true;
 
+    msi_nonbroken = true;
+
+    vfu_object_bdf_to_ctx_table = g_hash_table_new_full(NULL, NULL, NULL, NULL);
+
     object_class_property_add(klass, "socket", "SocketAddress", NULL,
                               vfu_object_set_socket, NULL, NULL);
     object_class_property_set_description(klass, "socket",
diff --git a/stubs/vfio-user-obj.c b/stubs/vfio-user-obj.c
new file mode 100644
index 0000000000..79100d768e
--- /dev/null
+++ b/stubs/vfio-user-obj.c
@@ -0,0 +1,6 @@ 
+#include "qemu/osdep.h"
+#include "hw/remote/vfio-user-obj.h"
+
+void vfu_object_set_bus_irq(PCIBus *pci_bus)
+{
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index f47232c78c..e274cb46af 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3569,6 +3569,7 @@  F: hw/remote/iohub.c
 F: include/hw/remote/iohub.h
 F: subprojects/libvfio-user
 F: hw/remote/vfio-user-obj.c
+F: include/hw/remote/vfio-user-obj.h
 F: hw/remote/iommu.c
 F: include/hw/remote/iommu.h
 
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
index 847d50d88f..c167b3c7a5 100644
--- a/hw/remote/trace-events
+++ b/hw/remote/trace-events
@@ -12,3 +12,4 @@  vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
 vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
 vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
 vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
+vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d"
diff --git a/stubs/meson.build b/stubs/meson.build
index d359cbe1ad..c5ce979dc3 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -57,3 +57,4 @@  if have_system
 else
   stub_ss.add(files('qdev.c'))
 endif
+stub_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_false: files('vfio-user-obj.c'))