diff mbox series

[v3,4/5] vfio-pci: Add FAILOVER_PRIMARY_CHANGED event to shorten downtime during failover

Message ID 1546900184-27403-5-git-send-email-venu.busireddy@oracle.com (mailing list archive)
State New, archived
Headers show
Series Support for datapath switching during live migration | expand

Commit Message

Venu Busireddy Jan. 7, 2019, 10:29 p.m. UTC
From: Si-Wei Liu <si-wei.liu@oracle.com>

When a VF is hotplugged into the guest, datapath switching will be
performed immediately, which is sub-optimal in terms of timing, and
could end up with substantial network downtime. One of ways to shorten
this downtime is to switch the datapath only after the VF is seen to get
enabled by guest, indicated by the bus master bit in VF's PCI config
space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
at that time to indicate this condition. Then management stack can kick
off datapath switching upon receiving the event.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
---
 hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qapi/net.json | 26 ++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

Comments

Alex Williamson Jan. 7, 2019, 11:17 p.m. UTC | #1
On Mon,  7 Jan 2019 17:29:43 -0500
Venu Busireddy <venu.busireddy@oracle.com> wrote:

> From: Si-Wei Liu <si-wei.liu@oracle.com>
> 
> When a VF is hotplugged into the guest, datapath switching will be
> performed immediately, which is sub-optimal in terms of timing, and
> could end up with substantial network downtime. One of ways to shorten
> this downtime is to switch the datapath only after the VF is seen to get
> enabled by guest, indicated by the bus master bit in VF's PCI config
> space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
> at that time to indicate this condition. Then management stack can kick
> off datapath switching upon receiving the event.
> 
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
> ---
>  hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qapi/net.json | 26 ++++++++++++++++++++++++++
>  2 files changed, 83 insertions(+)

Why is this done at the vfio driver layer rather than the PCI core
layer?  We write everything through using pci_default_write_config(), I
don't see that anything here is particularly vfio specific.  Please copy
me on any changes in hw/vfio.  Thanks,

Alex

> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index bd83b58..adcc95a 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -34,6 +34,7 @@
>  #include "pci.h"
>  #include "trace.h"
>  #include "qapi/error.h"
> +#include "qapi/qapi-events-net.h"
>  
>  #define MSIX_CAP_LENGTH 12
>  
> @@ -42,6 +43,7 @@
>  
>  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
>  
>  /*
>   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
>  {
>      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
>      uint32_t val_le = cpu_to_le32(val);
> +    bool may_notify = false;
> +    bool master_was = false;
>  
>      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
>  
> @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
>                       __func__, vdev->vbasedev.name, addr, val, len);
>      }
>  
> +    /* Bus Master Enabling/Disabling */
> +    if (pdev->failover_primary && current_cpu &&
> +        range_covers_byte(addr, len, PCI_COMMAND)) {
> +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> +                        PCI_COMMAND_MASTER);
> +        may_notify = true;
> +    }
> +
>      /* MSI/MSI-X Enabling/Disabling */
>      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
>          ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
>          /* Write everything to QEMU to keep emulated bits correct */
>          pci_default_write_config(pdev, addr, val, len);
>      }
> +
> +    if (may_notify) {
> +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> +                             PCI_COMMAND_MASTER);
> +        if (master_was != master_now) {
> +            vfio_failover_notify(vdev, master_now);
> +        }
> +    }
>  }
>  
>  /*
> @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
>  
> +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    const char *n;
> +    gchar *path;
> +
> +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
> +    path = object_get_canonical_path(OBJECT(vdev));
> +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
> +}
> +
>  static void vfio_realize(PCIDevice *pdev, Error **errp)
>  {
>      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
>      vfio_put_group(group);
>  }
>  
> +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +
> +    /*
> +     * Guest driver may not get the chance to disable bus mastering
> +     * before the device object gets to be unrealized. In that event,
> +     * send out a "disabled" notification on behalf of guest driver.
> +     */
> +    if (pdev->failover_primary &&
> +        pdev->bus_master_enable_region.enabled) {
> +        vfio_failover_notify(vdev, false);
> +    }
> +}
> +
>  static void vfio_exitfn(PCIDevice *pdev)
>  {
>      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
>  
> +    /*
> +     * During the guest reboot sequence, it is sometimes possible that
> +     * the guest may not get sufficient time to complete the entire driver
> +     * removal sequence, near the end of which a PCI config space write to
> +     * disable bus mastering can be intercepted by device. In such cases,
> +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
> +     * is imperative to generate the event on the guest's behalf if the
> +     * guest fails to make it.
> +     */
> +    vfio_exit_failover_notify(vdev);
> +
>      vfio_unregister_req_notifier(vdev);
>      vfio_unregister_err_notifier(vdev);
>      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> diff --git a/qapi/net.json b/qapi/net.json
> index 633ac87..a5b8d70 100644
> --- a/qapi/net.json
> +++ b/qapi/net.json
> @@ -757,3 +757,29 @@
>  ##
>  { 'command': 'query-standby-status', 'data': { '*device': 'str' },
>    'returns': ['StandbyStatusInfo'] }
> +
> +##
> +# @FAILOVER_PRIMARY_CHANGED:
> +#
> +# Emitted whenever the driver of failover primary is loaded or unloaded
> +# by the guest.
> +#
> +# @device: device name
> +#
> +# @path: device path
> +#
> +# @enabled: true if driver is loaded thus device is enabled in guest
> +#
> +# Since: 3.0
> +#
> +# Example:
> +#
> +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
> +#      "data": { "device": "vfio-0",
> +#                "path": "/machine/peripheral/vfio-0" },
> +#                "enabled": "true" },
> +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
> +#
> +##
> +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
> +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
>
Michael S. Tsirkin Jan. 7, 2019, 11:22 p.m. UTC | #2
On Mon, Jan 07, 2019 at 04:17:17PM -0700, Alex Williamson wrote:
> On Mon,  7 Jan 2019 17:29:43 -0500
> Venu Busireddy <venu.busireddy@oracle.com> wrote:
> 
> > From: Si-Wei Liu <si-wei.liu@oracle.com>
> > 
> > When a VF is hotplugged into the guest, datapath switching will be
> > performed immediately, which is sub-optimal in terms of timing, and
> > could end up with substantial network downtime. One of ways to shorten
> > this downtime is to switch the datapath only after the VF is seen to get
> > enabled by guest, indicated by the bus master bit in VF's PCI config
> > space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
> > at that time to indicate this condition. Then management stack can kick
> > off datapath switching upon receiving the event.
> > 
> > Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> > Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
> > ---
> >  hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  qapi/net.json | 26 ++++++++++++++++++++++++++
> >  2 files changed, 83 insertions(+)
> 
> Why is this done at the vfio driver layer rather than the PCI core
> layer?  We write everything through using pci_default_write_config(), I
> don't see that anything here is particularly vfio specific.  Please copy
> me on any changes in hw/vfio.  Thanks,
> 
> Alex

Hmm so you are saying let's send events for each device?
I don't have a problem with this but in this case
I think I would like to see a per-device option "send events".
We don't want a ton of events in the simple default config.

> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index bd83b58..adcc95a 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -34,6 +34,7 @@
> >  #include "pci.h"
> >  #include "trace.h"
> >  #include "qapi/error.h"
> > +#include "qapi/qapi-events-net.h"
> >  
> >  #define MSIX_CAP_LENGTH 12
> >  
> > @@ -42,6 +43,7 @@
> >  
> >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
> >  
> >  /*
> >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
> >  {
> >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> >      uint32_t val_le = cpu_to_le32(val);
> > +    bool may_notify = false;
> > +    bool master_was = false;
> >  
> >      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
> >  
> > @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> >                       __func__, vdev->vbasedev.name, addr, val, len);
> >      }
> >  
> > +    /* Bus Master Enabling/Disabling */
> > +    if (pdev->failover_primary && current_cpu &&
> > +        range_covers_byte(addr, len, PCI_COMMAND)) {
> > +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > +                        PCI_COMMAND_MASTER);
> > +        may_notify = true;
> > +    }
> > +
> >      /* MSI/MSI-X Enabling/Disabling */
> >      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> >          ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> > @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> >          /* Write everything to QEMU to keep emulated bits correct */
> >          pci_default_write_config(pdev, addr, val, len);
> >      }
> > +
> > +    if (may_notify) {
> > +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > +                             PCI_COMMAND_MASTER);
> > +        if (master_was != master_now) {
> > +            vfio_failover_notify(vdev, master_now);
> > +        }
> > +    }
> >  }
> >  
> >  /*
> > @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> >      vdev->req_enabled = false;
> >  }
> >  
> > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
> > +{
> > +    PCIDevice *pdev = &vdev->pdev;
> > +    const char *n;
> > +    gchar *path;
> > +
> > +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
> > +    path = object_get_canonical_path(OBJECT(vdev));
> > +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
> > +}
> > +
> >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> >  {
> >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
> >      vfio_put_group(group);
> >  }
> >  
> > +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
> > +{
> > +    PCIDevice *pdev = &vdev->pdev;
> > +
> > +    /*
> > +     * Guest driver may not get the chance to disable bus mastering
> > +     * before the device object gets to be unrealized. In that event,
> > +     * send out a "disabled" notification on behalf of guest driver.
> > +     */
> > +    if (pdev->failover_primary &&
> > +        pdev->bus_master_enable_region.enabled) {
> > +        vfio_failover_notify(vdev, false);
> > +    }
> > +}
> > +
> >  static void vfio_exitfn(PCIDevice *pdev)
> >  {
> >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> >  
> > +    /*
> > +     * During the guest reboot sequence, it is sometimes possible that
> > +     * the guest may not get sufficient time to complete the entire driver
> > +     * removal sequence, near the end of which a PCI config space write to
> > +     * disable bus mastering can be intercepted by device. In such cases,
> > +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
> > +     * is imperative to generate the event on the guest's behalf if the
> > +     * guest fails to make it.
> > +     */
> > +    vfio_exit_failover_notify(vdev);
> > +
> >      vfio_unregister_req_notifier(vdev);
> >      vfio_unregister_err_notifier(vdev);
> >      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > diff --git a/qapi/net.json b/qapi/net.json
> > index 633ac87..a5b8d70 100644
> > --- a/qapi/net.json
> > +++ b/qapi/net.json
> > @@ -757,3 +757,29 @@
> >  ##
> >  { 'command': 'query-standby-status', 'data': { '*device': 'str' },
> >    'returns': ['StandbyStatusInfo'] }
> > +
> > +##
> > +# @FAILOVER_PRIMARY_CHANGED:
> > +#
> > +# Emitted whenever the driver of failover primary is loaded or unloaded
> > +# by the guest.
> > +#
> > +# @device: device name
> > +#
> > +# @path: device path
> > +#
> > +# @enabled: true if driver is loaded thus device is enabled in guest
> > +#
> > +# Since: 3.0
> > +#
> > +# Example:
> > +#
> > +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
> > +#      "data": { "device": "vfio-0",
> > +#                "path": "/machine/peripheral/vfio-0" },
> > +#                "enabled": "true" },
> > +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
> > +#
> > +##
> > +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
> > +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
> >
Alex Williamson Jan. 7, 2019, 11:41 p.m. UTC | #3
On Mon, 7 Jan 2019 18:22:20 -0500
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Jan 07, 2019 at 04:17:17PM -0700, Alex Williamson wrote:
> > On Mon,  7 Jan 2019 17:29:43 -0500
> > Venu Busireddy <venu.busireddy@oracle.com> wrote:
> >   
> > > From: Si-Wei Liu <si-wei.liu@oracle.com>
> > > 
> > > When a VF is hotplugged into the guest, datapath switching will be
> > > performed immediately, which is sub-optimal in terms of timing, and
> > > could end up with substantial network downtime. One of ways to shorten
> > > this downtime is to switch the datapath only after the VF is seen to get
> > > enabled by guest, indicated by the bus master bit in VF's PCI config
> > > space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
> > > at that time to indicate this condition. Then management stack can kick
> > > off datapath switching upon receiving the event.
> > > 
> > > Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> > > Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
> > > ---
> > >  hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  qapi/net.json | 26 ++++++++++++++++++++++++++
> > >  2 files changed, 83 insertions(+)  
> > 
> > Why is this done at the vfio driver layer rather than the PCI core
> > layer?  We write everything through using pci_default_write_config(), I
> > don't see that anything here is particularly vfio specific.  Please copy
> > me on any changes in hw/vfio.  Thanks,
> > 
> > Alex  
> 
> Hmm so you are saying let's send events for each device?
> I don't have a problem with this but in this case
> I think I would like to see a per-device option "send events".
> We don't want a ton of events in the simple default config.

In the below we're only sending events for PCIDevice.failover_primary,
seems like that would filter out the rest of the non-NIC PCI devices as
well as it does for non-NIC VFIO PCI devices.  The only thing remotely
vfio specific below is that it might notify based on the vfio device
name, but it's a fallback to PCIDevice.qdev.id.  A real ID could just
be a requirement to make use of this.  Thanks,

Alex

> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index bd83b58..adcc95a 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -34,6 +34,7 @@
> > >  #include "pci.h"
> > >  #include "trace.h"
> > >  #include "qapi/error.h"
> > > +#include "qapi/qapi-events-net.h"
> > >  
> > >  #define MSIX_CAP_LENGTH 12
> > >  
> > > @@ -42,6 +43,7 @@
> > >  
> > >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> > >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
> > >  
> > >  /*
> > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > >  {
> > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > >      uint32_t val_le = cpu_to_le32(val);
> > > +    bool may_notify = false;
> > > +    bool master_was = false;
> > >  
> > >      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
> > >  
> > > @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > >                       __func__, vdev->vbasedev.name, addr, val, len);
> > >      }
> > >  
> > > +    /* Bus Master Enabling/Disabling */
> > > +    if (pdev->failover_primary && current_cpu &&
> > > +        range_covers_byte(addr, len, PCI_COMMAND)) {
> > > +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > +                        PCI_COMMAND_MASTER);
> > > +        may_notify = true;
> > > +    }
> > > +
> > >      /* MSI/MSI-X Enabling/Disabling */
> > >      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> > >          ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> > > @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > >          /* Write everything to QEMU to keep emulated bits correct */
> > >          pci_default_write_config(pdev, addr, val, len);
> > >      }
> > > +
> > > +    if (may_notify) {
> > > +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > +                             PCI_COMMAND_MASTER);
> > > +        if (master_was != master_now) {
> > > +            vfio_failover_notify(vdev, master_now);
> > > +        }
> > > +    }
> > >  }
> > >  
> > >  /*
> > > @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > >      vdev->req_enabled = false;
> > >  }
> > >  
> > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
> > > +{
> > > +    PCIDevice *pdev = &vdev->pdev;
> > > +    const char *n;
> > > +    gchar *path;
> > > +
> > > +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
> > > +    path = object_get_canonical_path(OBJECT(vdev));
> > > +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
> > > +}
> > > +
> > >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> > >  {
> > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
> > >      vfio_put_group(group);
> > >  }
> > >  
> > > +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
> > > +{
> > > +    PCIDevice *pdev = &vdev->pdev;
> > > +
> > > +    /*
> > > +     * Guest driver may not get the chance to disable bus mastering
> > > +     * before the device object gets to be unrealized. In that event,
> > > +     * send out a "disabled" notification on behalf of guest driver.
> > > +     */
> > > +    if (pdev->failover_primary &&
> > > +        pdev->bus_master_enable_region.enabled) {
> > > +        vfio_failover_notify(vdev, false);
> > > +    }
> > > +}
> > > +
> > >  static void vfio_exitfn(PCIDevice *pdev)
> > >  {
> > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > >  
> > > +    /*
> > > +     * During the guest reboot sequence, it is sometimes possible that
> > > +     * the guest may not get sufficient time to complete the entire driver
> > > +     * removal sequence, near the end of which a PCI config space write to
> > > +     * disable bus mastering can be intercepted by device. In such cases,
> > > +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
> > > +     * is imperative to generate the event on the guest's behalf if the
> > > +     * guest fails to make it.
> > > +     */
> > > +    vfio_exit_failover_notify(vdev);
> > > +
> > >      vfio_unregister_req_notifier(vdev);
> > >      vfio_unregister_err_notifier(vdev);
> > >      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > > diff --git a/qapi/net.json b/qapi/net.json
> > > index 633ac87..a5b8d70 100644
> > > --- a/qapi/net.json
> > > +++ b/qapi/net.json
> > > @@ -757,3 +757,29 @@
> > >  ##
> > >  { 'command': 'query-standby-status', 'data': { '*device': 'str' },
> > >    'returns': ['StandbyStatusInfo'] }
> > > +
> > > +##
> > > +# @FAILOVER_PRIMARY_CHANGED:
> > > +#
> > > +# Emitted whenever the driver of failover primary is loaded or unloaded
> > > +# by the guest.
> > > +#
> > > +# @device: device name
> > > +#
> > > +# @path: device path
> > > +#
> > > +# @enabled: true if driver is loaded thus device is enabled in guest
> > > +#
> > > +# Since: 3.0
> > > +#
> > > +# Example:
> > > +#
> > > +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
> > > +#      "data": { "device": "vfio-0",
> > > +#                "path": "/machine/peripheral/vfio-0" },
> > > +#                "enabled": "true" },
> > > +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
> > > +#
> > > +##
> > > +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
> > > +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
> > >
Michael S. Tsirkin Jan. 8, 2019, 12:12 a.m. UTC | #4
On Mon, Jan 07, 2019 at 04:41:15PM -0700, Alex Williamson wrote:
> On Mon, 7 Jan 2019 18:22:20 -0500
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Mon, Jan 07, 2019 at 04:17:17PM -0700, Alex Williamson wrote:
> > > On Mon,  7 Jan 2019 17:29:43 -0500
> > > Venu Busireddy <venu.busireddy@oracle.com> wrote:
> > >   
> > > > From: Si-Wei Liu <si-wei.liu@oracle.com>
> > > > 
> > > > When a VF is hotplugged into the guest, datapath switching will be
> > > > performed immediately, which is sub-optimal in terms of timing, and
> > > > could end up with substantial network downtime. One of ways to shorten
> > > > this downtime is to switch the datapath only after the VF is seen to get
> > > > enabled by guest, indicated by the bus master bit in VF's PCI config
> > > > space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
> > > > at that time to indicate this condition. Then management stack can kick
> > > > off datapath switching upon receiving the event.
> > > > 
> > > > Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> > > > Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
> > > > ---
> > > >  hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  qapi/net.json | 26 ++++++++++++++++++++++++++
> > > >  2 files changed, 83 insertions(+)  
> > > 
> > > Why is this done at the vfio driver layer rather than the PCI core
> > > layer?  We write everything through using pci_default_write_config(), I
> > > don't see that anything here is particularly vfio specific.  Please copy
> > > me on any changes in hw/vfio.  Thanks,
> > > 
> > > Alex  
> > 
> > Hmm so you are saying let's send events for each device?
> > I don't have a problem with this but in this case
> > I think I would like to see a per-device option "send events".
> > We don't want a ton of events in the simple default config.
> 
> In the below we're only sending events for PCIDevice.failover_primary,

Well failover_primary in this patch is a vfio property, not a
pci device property.


> seems like that would filter out the rest of the non-NIC PCI devices as
> well as it does for non-NIC VFIO PCI devices.  The only thing remotely
> vfio specific below is that it might notify based on the vfio device
> name, but it's a fallback to PCIDevice.qdev.id.  A real ID could just
> be a requirement to make use of this.


Right and in fact I don't see why we can't make reporting
bus master status a capability of all devices.


>  Thanks,
> 
> Alex
> 
> > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > index bd83b58..adcc95a 100644
> > > > --- a/hw/vfio/pci.c
> > > > +++ b/hw/vfio/pci.c
> > > > @@ -34,6 +34,7 @@
> > > >  #include "pci.h"
> > > >  #include "trace.h"
> > > >  #include "qapi/error.h"
> > > > +#include "qapi/qapi-events-net.h"
> > > >  
> > > >  #define MSIX_CAP_LENGTH 12
> > > >  
> > > > @@ -42,6 +43,7 @@
> > > >  
> > > >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> > > >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> > > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
> > > >  
> > > >  /*
> > > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > > @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > >  {
> > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > >      uint32_t val_le = cpu_to_le32(val);
> > > > +    bool may_notify = false;
> > > > +    bool master_was = false;
> > > >  
> > > >      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
> > > >  
> > > > @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > >                       __func__, vdev->vbasedev.name, addr, val, len);
> > > >      }
> > > >  
> > > > +    /* Bus Master Enabling/Disabling */
> > > > +    if (pdev->failover_primary && current_cpu &&
> > > > +        range_covers_byte(addr, len, PCI_COMMAND)) {
> > > > +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > > +                        PCI_COMMAND_MASTER);
> > > > +        may_notify = true;
> > > > +    }
> > > > +
> > > >      /* MSI/MSI-X Enabling/Disabling */
> > > >      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> > > >          ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> > > > @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > >          /* Write everything to QEMU to keep emulated bits correct */
> > > >          pci_default_write_config(pdev, addr, val, len);
> > > >      }
> > > > +
> > > > +    if (may_notify) {
> > > > +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > > +                             PCI_COMMAND_MASTER);
> > > > +        if (master_was != master_now) {
> > > > +            vfio_failover_notify(vdev, master_now);
> > > > +        }
> > > > +    }
> > > >  }
> > > >  
> > > >  /*
> > > > @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > > >      vdev->req_enabled = false;
> > > >  }
> > > >  
> > > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
> > > > +{
> > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > +    const char *n;
> > > > +    gchar *path;
> > > > +
> > > > +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
> > > > +    path = object_get_canonical_path(OBJECT(vdev));
> > > > +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
> > > > +}
> > > > +
> > > >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> > > >  {
> > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
> > > >      vfio_put_group(group);
> > > >  }
> > > >  
> > > > +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
> > > > +{
> > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > +
> > > > +    /*
> > > > +     * Guest driver may not get the chance to disable bus mastering
> > > > +     * before the device object gets to be unrealized. In that event,
> > > > +     * send out a "disabled" notification on behalf of guest driver.
> > > > +     */
> > > > +    if (pdev->failover_primary &&
> > > > +        pdev->bus_master_enable_region.enabled) {
> > > > +        vfio_failover_notify(vdev, false);
> > > > +    }
> > > > +}
> > > > +
> > > >  static void vfio_exitfn(PCIDevice *pdev)
> > > >  {
> > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > >  
> > > > +    /*
> > > > +     * During the guest reboot sequence, it is sometimes possible that
> > > > +     * the guest may not get sufficient time to complete the entire driver
> > > > +     * removal sequence, near the end of which a PCI config space write to
> > > > +     * disable bus mastering can be intercepted by device. In such cases,
> > > > +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
> > > > +     * is imperative to generate the event on the guest's behalf if the
> > > > +     * guest fails to make it.
> > > > +     */
> > > > +    vfio_exit_failover_notify(vdev);
> > > > +
> > > >      vfio_unregister_req_notifier(vdev);
> > > >      vfio_unregister_err_notifier(vdev);
> > > >      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > > > diff --git a/qapi/net.json b/qapi/net.json
> > > > index 633ac87..a5b8d70 100644
> > > > --- a/qapi/net.json
> > > > +++ b/qapi/net.json
> > > > @@ -757,3 +757,29 @@
> > > >  ##
> > > >  { 'command': 'query-standby-status', 'data': { '*device': 'str' },
> > > >    'returns': ['StandbyStatusInfo'] }
> > > > +
> > > > +##
> > > > +# @FAILOVER_PRIMARY_CHANGED:
> > > > +#
> > > > +# Emitted whenever the driver of failover primary is loaded or unloaded
> > > > +# by the guest.
> > > > +#
> > > > +# @device: device name
> > > > +#
> > > > +# @path: device path
> > > > +#
> > > > +# @enabled: true if driver is loaded thus device is enabled in guest
> > > > +#
> > > > +# Since: 3.0
> > > > +#
> > > > +# Example:
> > > > +#
> > > > +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
> > > > +#      "data": { "device": "vfio-0",
> > > > +#                "path": "/machine/peripheral/vfio-0" },
> > > > +#                "enabled": "true" },
> > > > +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
> > > > +#
> > > > +##
> > > > +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
> > > > +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
> > > >
Alex Williamson Jan. 8, 2019, 12:24 a.m. UTC | #5
On Mon, 7 Jan 2019 19:12:06 -0500
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Jan 07, 2019 at 04:41:15PM -0700, Alex Williamson wrote:
> > On Mon, 7 Jan 2019 18:22:20 -0500
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Mon, Jan 07, 2019 at 04:17:17PM -0700, Alex Williamson wrote:  
> > > > On Mon,  7 Jan 2019 17:29:43 -0500
> > > > Venu Busireddy <venu.busireddy@oracle.com> wrote:
> > > >     
> > > > > From: Si-Wei Liu <si-wei.liu@oracle.com>
> > > > > 
> > > > > When a VF is hotplugged into the guest, datapath switching will be
> > > > > performed immediately, which is sub-optimal in terms of timing, and
> > > > > could end up with substantial network downtime. One of ways to shorten
> > > > > this downtime is to switch the datapath only after the VF is seen to get
> > > > > enabled by guest, indicated by the bus master bit in VF's PCI config
> > > > > space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
> > > > > at that time to indicate this condition. Then management stack can kick
> > > > > off datapath switching upon receiving the event.
> > > > > 
> > > > > Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> > > > > Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
> > > > > ---
> > > > >  hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  qapi/net.json | 26 ++++++++++++++++++++++++++
> > > > >  2 files changed, 83 insertions(+)    
> > > > 
> > > > Why is this done at the vfio driver layer rather than the PCI core
> > > > layer?  We write everything through using pci_default_write_config(), I
> > > > don't see that anything here is particularly vfio specific.  Please copy
> > > > me on any changes in hw/vfio.  Thanks,
> > > > 
> > > > Alex    
> > > 
> > > Hmm so you are saying let's send events for each device?
> > > I don't have a problem with this but in this case
> > > I think I would like to see a per-device option "send events".
> > > We don't want a ton of events in the simple default config.  
> > 
> > In the below we're only sending events for PCIDevice.failover_primary,  
> 
> Well failover_primary in this patch is a vfio property, not a
> pci device property.

It's both and it's kind of a kludge (from 2/5):

--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3077,6 +3077,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_register_err_notifier(vdev);
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
+    pdev->failover_primary = vdev->failover_primary;
 
     return;
 
@@ -3219,6 +3220,8 @@ static Property vfio_pci_dev_properties[] = {
                                    qdev_prop_nv_gpudirect_clique, uint8_t),
     DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
                                 OFF_AUTOPCIBAR_OFF),
+    DEFINE_PROP_BOOL("failover-primary", VFIOPCIDevice, failover_primary,
+                     false),
     /*
      * TODO - support passed fds... is this necessary?
      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),

The property could have set VFIOPCIDevice.pdev.failover_primary
directly.  I'm not thrilled about that name either, it's a very NIC
centric property whereas vfio-pci supports plenty of non-networking
devices, as of course does PCIDevice.  Maybe the concept needs to be
more general or the name needs to be more NIC specific and fail for
devices that don't have the correct class code.  Thanks,

Alex

> > seems like that would filter out the rest of the non-NIC PCI devices as
> > well as it does for non-NIC VFIO PCI devices.  The only thing remotely
> > vfio specific below is that it might notify based on the vfio device
> > name, but it's a fallback to PCIDevice.qdev.id.  A real ID could just
> > be a requirement to make use of this.  
> 
> 
> Right and in fact I don't see why we can't make reporting
> bus master status a capability of all devices.
> 
> 
> >  Thanks,
> > 
> > Alex
> >   
> > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > > index bd83b58..adcc95a 100644
> > > > > --- a/hw/vfio/pci.c
> > > > > +++ b/hw/vfio/pci.c
> > > > > @@ -34,6 +34,7 @@
> > > > >  #include "pci.h"
> > > > >  #include "trace.h"
> > > > >  #include "qapi/error.h"
> > > > > +#include "qapi/qapi-events-net.h"
> > > > >  
> > > > >  #define MSIX_CAP_LENGTH 12
> > > > >  
> > > > > @@ -42,6 +43,7 @@
> > > > >  
> > > > >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> > > > >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> > > > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
> > > > >  
> > > > >  /*
> > > > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > > > @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > > >  {
> > > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > >      uint32_t val_le = cpu_to_le32(val);
> > > > > +    bool may_notify = false;
> > > > > +    bool master_was = false;
> > > > >  
> > > > >      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
> > > > >  
> > > > > @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > > >                       __func__, vdev->vbasedev.name, addr, val, len);
> > > > >      }
> > > > >  
> > > > > +    /* Bus Master Enabling/Disabling */
> > > > > +    if (pdev->failover_primary && current_cpu &&
> > > > > +        range_covers_byte(addr, len, PCI_COMMAND)) {
> > > > > +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > > > +                        PCI_COMMAND_MASTER);
> > > > > +        may_notify = true;
> > > > > +    }
> > > > > +
> > > > >      /* MSI/MSI-X Enabling/Disabling */
> > > > >      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> > > > >          ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> > > > > @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > > >          /* Write everything to QEMU to keep emulated bits correct */
> > > > >          pci_default_write_config(pdev, addr, val, len);
> > > > >      }
> > > > > +
> > > > > +    if (may_notify) {
> > > > > +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > > > +                             PCI_COMMAND_MASTER);
> > > > > +        if (master_was != master_now) {
> > > > > +            vfio_failover_notify(vdev, master_now);
> > > > > +        }
> > > > > +    }
> > > > >  }
> > > > >  
> > > > >  /*
> > > > > @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > > > >      vdev->req_enabled = false;
> > > > >  }
> > > > >  
> > > > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
> > > > > +{
> > > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > > +    const char *n;
> > > > > +    gchar *path;
> > > > > +
> > > > > +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
> > > > > +    path = object_get_canonical_path(OBJECT(vdev));
> > > > > +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
> > > > > +}
> > > > > +
> > > > >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> > > > >  {
> > > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > > @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
> > > > >      vfio_put_group(group);
> > > > >  }
> > > > >  
> > > > > +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
> > > > > +{
> > > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > > +
> > > > > +    /*
> > > > > +     * Guest driver may not get the chance to disable bus mastering
> > > > > +     * before the device object gets to be unrealized. In that event,
> > > > > +     * send out a "disabled" notification on behalf of guest driver.
> > > > > +     */
> > > > > +    if (pdev->failover_primary &&
> > > > > +        pdev->bus_master_enable_region.enabled) {
> > > > > +        vfio_failover_notify(vdev, false);
> > > > > +    }
> > > > > +}
> > > > > +
> > > > >  static void vfio_exitfn(PCIDevice *pdev)
> > > > >  {
> > > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > >  
> > > > > +    /*
> > > > > +     * During the guest reboot sequence, it is sometimes possible that
> > > > > +     * the guest may not get sufficient time to complete the entire driver
> > > > > +     * removal sequence, near the end of which a PCI config space write to
> > > > > +     * disable bus mastering can be intercepted by device. In such cases,
> > > > > +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
> > > > > +     * is imperative to generate the event on the guest's behalf if the
> > > > > +     * guest fails to make it.
> > > > > +     */
> > > > > +    vfio_exit_failover_notify(vdev);
> > > > > +
> > > > >      vfio_unregister_req_notifier(vdev);
> > > > >      vfio_unregister_err_notifier(vdev);
> > > > >      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > > > > diff --git a/qapi/net.json b/qapi/net.json
> > > > > index 633ac87..a5b8d70 100644
> > > > > --- a/qapi/net.json
> > > > > +++ b/qapi/net.json
> > > > > @@ -757,3 +757,29 @@
> > > > >  ##
> > > > >  { 'command': 'query-standby-status', 'data': { '*device': 'str' },
> > > > >    'returns': ['StandbyStatusInfo'] }
> > > > > +
> > > > > +##
> > > > > +# @FAILOVER_PRIMARY_CHANGED:
> > > > > +#
> > > > > +# Emitted whenever the driver of failover primary is loaded or unloaded
> > > > > +# by the guest.
> > > > > +#
> > > > > +# @device: device name
> > > > > +#
> > > > > +# @path: device path
> > > > > +#
> > > > > +# @enabled: true if driver is loaded thus device is enabled in guest
> > > > > +#
> > > > > +# Since: 3.0
> > > > > +#
> > > > > +# Example:
> > > > > +#
> > > > > +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
> > > > > +#      "data": { "device": "vfio-0",
> > > > > +#                "path": "/machine/peripheral/vfio-0" },
> > > > > +#                "enabled": "true" },
> > > > > +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
> > > > > +#
> > > > > +##
> > > > > +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
> > > > > +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
> > > > >
Michael S. Tsirkin Jan. 8, 2019, 12:43 a.m. UTC | #6
On Mon, Jan 07, 2019 at 05:24:15PM -0700, Alex Williamson wrote:
> On Mon, 7 Jan 2019 19:12:06 -0500
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Mon, Jan 07, 2019 at 04:41:15PM -0700, Alex Williamson wrote:
> > > On Mon, 7 Jan 2019 18:22:20 -0500
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > >   
> > > > On Mon, Jan 07, 2019 at 04:17:17PM -0700, Alex Williamson wrote:  
> > > > > On Mon,  7 Jan 2019 17:29:43 -0500
> > > > > Venu Busireddy <venu.busireddy@oracle.com> wrote:
> > > > >     
> > > > > > From: Si-Wei Liu <si-wei.liu@oracle.com>
> > > > > > 
> > > > > > When a VF is hotplugged into the guest, datapath switching will be
> > > > > > performed immediately, which is sub-optimal in terms of timing, and
> > > > > > could end up with substantial network downtime. One of ways to shorten
> > > > > > this downtime is to switch the datapath only after the VF is seen to get
> > > > > > enabled by guest, indicated by the bus master bit in VF's PCI config
> > > > > > space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
> > > > > > at that time to indicate this condition. Then management stack can kick
> > > > > > off datapath switching upon receiving the event.
> > > > > > 
> > > > > > Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> > > > > > Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
> > > > > > ---
> > > > > >  hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > > >  qapi/net.json | 26 ++++++++++++++++++++++++++
> > > > > >  2 files changed, 83 insertions(+)    
> > > > > 
> > > > > Why is this done at the vfio driver layer rather than the PCI core
> > > > > layer?  We write everything through using pci_default_write_config(), I
> > > > > don't see that anything here is particularly vfio specific.  Please copy
> > > > > me on any changes in hw/vfio.  Thanks,
> > > > > 
> > > > > Alex    
> > > > 
> > > > Hmm so you are saying let's send events for each device?
> > > > I don't have a problem with this but in this case
> > > > I think I would like to see a per-device option "send events".
> > > > We don't want a ton of events in the simple default config.  
> > > 
> > > In the below we're only sending events for PCIDevice.failover_primary,  
> > 
> > Well failover_primary in this patch is a vfio property, not a
> > pci device property.
> 
> It's both and it's kind of a kludge (from 2/5):
> 
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3077,6 +3077,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>      vfio_register_err_notifier(vdev);
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
> +    pdev->failover_primary = vdev->failover_primary;
>  
>      return;
>  
> @@ -3219,6 +3220,8 @@ static Property vfio_pci_dev_properties[] = {
>                                     qdev_prop_nv_gpudirect_clique, uint8_t),
>      DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
>                                  OFF_AUTOPCIBAR_OFF),
> +    DEFINE_PROP_BOOL("failover-primary", VFIOPCIDevice, failover_primary,
> +                     false),
>      /*
>       * TODO - support passed fds... is this necessary?
>       * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
> 
> The property could have set VFIOPCIDevice.pdev.failover_primary
> directly.  I'm not thrilled about that name either, it's a very NIC
> centric property whereas vfio-pci supports plenty of non-networking
> devices, as of course does PCIDevice.  Maybe the concept needs to be
> more general or the name needs to be more NIC specific and fail for
> devices that don't have the correct class code.  Thanks,
> 
> Alex

I actually think it's generic concept. I came with a name failover
exactly to avoid the "bonding" name that was used originally
and was net specific.

In particular
https://fedoraproject.org/wiki/Features/Virt_Device_Failover
suggests using multipath for storage.

Can in theory easily be imagined to work with  rng, crypto
even though I don't think Linux makes supporting this easy.



> > > seems like that would filter out the rest of the non-NIC PCI devices as
> > > well as it does for non-NIC VFIO PCI devices.  The only thing remotely
> > > vfio specific below is that it might notify based on the vfio device
> > > name, but it's a fallback to PCIDevice.qdev.id.  A real ID could just
> > > be a requirement to make use of this.  
> > 
> > 
> > Right and in fact I don't see why we can't make reporting
> > bus master status a capability of all devices.
> > 
> > 
> > >  Thanks,
> > > 
> > > Alex
> > >   
> > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > > > index bd83b58..adcc95a 100644
> > > > > > --- a/hw/vfio/pci.c
> > > > > > +++ b/hw/vfio/pci.c
> > > > > > @@ -34,6 +34,7 @@
> > > > > >  #include "pci.h"
> > > > > >  #include "trace.h"
> > > > > >  #include "qapi/error.h"
> > > > > > +#include "qapi/qapi-events-net.h"
> > > > > >  
> > > > > >  #define MSIX_CAP_LENGTH 12
> > > > > >  
> > > > > > @@ -42,6 +43,7 @@
> > > > > >  
> > > > > >  static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> > > > > >  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
> > > > > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
> > > > > >  
> > > > > >  /*
> > > > > >   * Disabling BAR mmaping can be slow, but toggling it around INTx can
> > > > > > @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > > > >  {
> > > > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > > >      uint32_t val_le = cpu_to_le32(val);
> > > > > > +    bool may_notify = false;
> > > > > > +    bool master_was = false;
> > > > > >  
> > > > > >      trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
> > > > > >  
> > > > > > @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > > > >                       __func__, vdev->vbasedev.name, addr, val, len);
> > > > > >      }
> > > > > >  
> > > > > > +    /* Bus Master Enabling/Disabling */
> > > > > > +    if (pdev->failover_primary && current_cpu &&
> > > > > > +        range_covers_byte(addr, len, PCI_COMMAND)) {
> > > > > > +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > > > > +                        PCI_COMMAND_MASTER);
> > > > > > +        may_notify = true;
> > > > > > +    }
> > > > > > +
> > > > > >      /* MSI/MSI-X Enabling/Disabling */
> > > > > >      if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
> > > > > >          ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
> > > > > > @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
> > > > > >          /* Write everything to QEMU to keep emulated bits correct */
> > > > > >          pci_default_write_config(pdev, addr, val, len);
> > > > > >      }
> > > > > > +
> > > > > > +    if (may_notify) {
> > > > > > +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
> > > > > > +                             PCI_COMMAND_MASTER);
> > > > > > +        if (master_was != master_now) {
> > > > > > +            vfio_failover_notify(vdev, master_now);
> > > > > > +        }
> > > > > > +    }
> > > > > >  }
> > > > > >  
> > > > > >  /*
> > > > > > @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > > > > >      vdev->req_enabled = false;
> > > > > >  }
> > > > > >  
> > > > > > +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
> > > > > > +{
> > > > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > > > +    const char *n;
> > > > > > +    gchar *path;
> > > > > > +
> > > > > > +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
> > > > > > +    path = object_get_canonical_path(OBJECT(vdev));
> > > > > > +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
> > > > > > +}
> > > > > > +
> > > > > >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> > > > > >  {
> > > > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > > > @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
> > > > > >      vfio_put_group(group);
> > > > > >  }
> > > > > >  
> > > > > > +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
> > > > > > +{
> > > > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > > > +
> > > > > > +    /*
> > > > > > +     * Guest driver may not get the chance to disable bus mastering
> > > > > > +     * before the device object gets to be unrealized. In that event,
> > > > > > +     * send out a "disabled" notification on behalf of guest driver.
> > > > > > +     */
> > > > > > +    if (pdev->failover_primary &&
> > > > > > +        pdev->bus_master_enable_region.enabled) {
> > > > > > +        vfio_failover_notify(vdev, false);
> > > > > > +    }
> > > > > > +}
> > > > > > +
> > > > > >  static void vfio_exitfn(PCIDevice *pdev)
> > > > > >  {
> > > > > >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> > > > > >  
> > > > > > +    /*
> > > > > > +     * During the guest reboot sequence, it is sometimes possible that
> > > > > > +     * the guest may not get sufficient time to complete the entire driver
> > > > > > +     * removal sequence, near the end of which a PCI config space write to
> > > > > > +     * disable bus mastering can be intercepted by device. In such cases,
> > > > > > +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
> > > > > > +     * is imperative to generate the event on the guest's behalf if the
> > > > > > +     * guest fails to make it.
> > > > > > +     */
> > > > > > +    vfio_exit_failover_notify(vdev);
> > > > > > +
> > > > > >      vfio_unregister_req_notifier(vdev);
> > > > > >      vfio_unregister_err_notifier(vdev);
> > > > > >      pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > > > > > diff --git a/qapi/net.json b/qapi/net.json
> > > > > > index 633ac87..a5b8d70 100644
> > > > > > --- a/qapi/net.json
> > > > > > +++ b/qapi/net.json
> > > > > > @@ -757,3 +757,29 @@
> > > > > >  ##
> > > > > >  { 'command': 'query-standby-status', 'data': { '*device': 'str' },
> > > > > >    'returns': ['StandbyStatusInfo'] }
> > > > > > +
> > > > > > +##
> > > > > > +# @FAILOVER_PRIMARY_CHANGED:
> > > > > > +#
> > > > > > +# Emitted whenever the driver of failover primary is loaded or unloaded
> > > > > > +# by the guest.
> > > > > > +#
> > > > > > +# @device: device name
> > > > > > +#
> > > > > > +# @path: device path
> > > > > > +#
> > > > > > +# @enabled: true if driver is loaded thus device is enabled in guest
> > > > > > +#
> > > > > > +# Since: 3.0
> > > > > > +#
> > > > > > +# Example:
> > > > > > +#
> > > > > > +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
> > > > > > +#      "data": { "device": "vfio-0",
> > > > > > +#                "path": "/machine/peripheral/vfio-0" },
> > > > > > +#                "enabled": "true" },
> > > > > > +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
> > > > > > +#
> > > > > > +##
> > > > > > +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
> > > > > > +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
> > > > > >
Si-Wei Liu Jan. 8, 2019, 1:13 a.m. UTC | #7
On 01/07/2019 03:41 PM, Alex Williamson wrote:
> On Mon, 7 Jan 2019 18:22:20 -0500
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
>
>> On Mon, Jan 07, 2019 at 04:17:17PM -0700, Alex Williamson wrote:
>>> On Mon,  7 Jan 2019 17:29:43 -0500
>>> Venu Busireddy <venu.busireddy@oracle.com> wrote:
>>>    
>>>> From: Si-Wei Liu <si-wei.liu@oracle.com>
>>>>
>>>> When a VF is hotplugged into the guest, datapath switching will be
>>>> performed immediately, which is sub-optimal in terms of timing, and
>>>> could end up with substantial network downtime. One of ways to shorten
>>>> this downtime is to switch the datapath only after the VF is seen to get
>>>> enabled by guest, indicated by the bus master bit in VF's PCI config
>>>> space getting enabled. The FAILOVER_PRIMARY_CHANGED event is emitted
>>>> at that time to indicate this condition. Then management stack can kick
>>>> off datapath switching upon receiving the event.
>>>>
>>>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>>>> Signed-off-by: Venu Busireddy <venu.busireddy@oracle.com>
>>>> ---
>>>>   hw/vfio/pci.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>   qapi/net.json | 26 ++++++++++++++++++++++++++
>>>>   2 files changed, 83 insertions(+)
>>> Why is this done at the vfio driver layer rather than the PCI core
>>> layer?  We write everything through using pci_default_write_config(), I
>>> don't see that anything here is particularly vfio specific.  Please copy
>>> me on any changes in hw/vfio.  Thanks,
>>>
>>> Alex
>> Hmm so you are saying let's send events for each device?
>> I don't have a problem with this but in this case
>> I think I would like to see a per-device option "send events".
>> We don't want a ton of events in the simple default config.
> In the below we're only sending events for PCIDevice.failover_primary,
> seems like that would filter out the rest of the non-NIC PCI devices as
> well as it does for non-NIC VFIO PCI devices.  The only thing remotely
> vfio specific below is that it might notify based on the vfio device
> name, but it's a fallback to PCIDevice.qdev.id.
Not exactly. It will first try to use the qdev ID to notify. If qdev id 
is missing (vfio-pci device could live without it),  then sysfsdev name 
will be used instead (in the form of host device 
"<bus>:<device>.<function>" location rather than ID). The intent was 
indeed to make this notification applicable to every possible vfio-pci 
device, even those without a qdev ID.

>   A real ID could just
> be a requirement to make use of this.
I'm fine to make qdev-id required for failover_primary PCI device. But 
please be noted, this is a shrinkage rather than generalization that has 
to apply to all other non-VFIO PCI devices that don't have to specify a 
qdev ID today.  I'm not sure if it's a good idea to make it restricted 
this early.

Thanks,
-Siwei

>   Thanks,
>
> Alex
>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index bd83b58..adcc95a 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -34,6 +34,7 @@
>>>>   #include "pci.h"
>>>>   #include "trace.h"
>>>>   #include "qapi/error.h"
>>>> +#include "qapi/qapi-events-net.h"
>>>>   
>>>>   #define MSIX_CAP_LENGTH 12
>>>>   
>>>> @@ -42,6 +43,7 @@
>>>>   
>>>>   static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>>>   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
>>>> +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
>>>>   
>>>>   /*
>>>>    * Disabling BAR mmaping can be slow, but toggling it around INTx can
>>>> @@ -1170,6 +1172,8 @@ void vfio_pci_write_config(PCIDevice *pdev,
>>>>   {
>>>>       VFIOPCIDevice *vdev = PCI_VFIO(pdev);
>>>>       uint32_t val_le = cpu_to_le32(val);
>>>> +    bool may_notify = false;
>>>> +    bool master_was = false;
>>>>   
>>>>       trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
>>>>   
>>>> @@ -1180,6 +1184,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
>>>>                        __func__, vdev->vbasedev.name, addr, val, len);
>>>>       }
>>>>   
>>>> +    /* Bus Master Enabling/Disabling */
>>>> +    if (pdev->failover_primary && current_cpu &&
>>>> +        range_covers_byte(addr, len, PCI_COMMAND)) {
>>>> +        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
>>>> +                        PCI_COMMAND_MASTER);
>>>> +        may_notify = true;
>>>> +    }
>>>> +
>>>>       /* MSI/MSI-X Enabling/Disabling */
>>>>       if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
>>>>           ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
>>>> @@ -1235,6 +1247,14 @@ void vfio_pci_write_config(PCIDevice *pdev,
>>>>           /* Write everything to QEMU to keep emulated bits correct */
>>>>           pci_default_write_config(pdev, addr, val, len);
>>>>       }
>>>> +
>>>> +    if (may_notify) {
>>>> +        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
>>>> +                             PCI_COMMAND_MASTER);
>>>> +        if (master_was != master_now) {
>>>> +            vfio_failover_notify(vdev, master_now);
>>>> +        }
>>>> +    }
>>>>   }
>>>>   
>>>>   /*
>>>> @@ -2801,6 +2821,17 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>>>>       vdev->req_enabled = false;
>>>>   }
>>>>   
>>>> +static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
>>>> +{
>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>> +    const char *n;
>>>> +    gchar *path;
>>>> +
>>>> +    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
>>>> +    path = object_get_canonical_path(OBJECT(vdev));
>>>> +    qapi_event_send_failover_primary_changed(!!n, n, path, status);
>>>> +}
>>>> +
>>>>   static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>   {
>>>>       VFIOPCIDevice *vdev = PCI_VFIO(pdev);
>>>> @@ -3109,10 +3140,36 @@ static void vfio_instance_finalize(Object *obj)
>>>>       vfio_put_group(group);
>>>>   }
>>>>   
>>>> +static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
>>>> +{
>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>> +
>>>> +    /*
>>>> +     * Guest driver may not get the chance to disable bus mastering
>>>> +     * before the device object gets to be unrealized. In that event,
>>>> +     * send out a "disabled" notification on behalf of guest driver.
>>>> +     */
>>>> +    if (pdev->failover_primary &&
>>>> +        pdev->bus_master_enable_region.enabled) {
>>>> +        vfio_failover_notify(vdev, false);
>>>> +    }
>>>> +}
>>>> +
>>>>   static void vfio_exitfn(PCIDevice *pdev)
>>>>   {
>>>>       VFIOPCIDevice *vdev = PCI_VFIO(pdev);
>>>>   
>>>> +    /*
>>>> +     * During the guest reboot sequence, it is sometimes possible that
>>>> +     * the guest may not get sufficient time to complete the entire driver
>>>> +     * removal sequence, near the end of which a PCI config space write to
>>>> +     * disable bus mastering can be intercepted by device. In such cases,
>>>> +     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
>>>> +     * is imperative to generate the event on the guest's behalf if the
>>>> +     * guest fails to make it.
>>>> +     */
>>>> +    vfio_exit_failover_notify(vdev);
>>>> +
>>>>       vfio_unregister_req_notifier(vdev);
>>>>       vfio_unregister_err_notifier(vdev);
>>>>       pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
>>>> diff --git a/qapi/net.json b/qapi/net.json
>>>> index 633ac87..a5b8d70 100644
>>>> --- a/qapi/net.json
>>>> +++ b/qapi/net.json
>>>> @@ -757,3 +757,29 @@
>>>>   ##
>>>>   { 'command': 'query-standby-status', 'data': { '*device': 'str' },
>>>>     'returns': ['StandbyStatusInfo'] }
>>>> +
>>>> +##
>>>> +# @FAILOVER_PRIMARY_CHANGED:
>>>> +#
>>>> +# Emitted whenever the driver of failover primary is loaded or unloaded
>>>> +# by the guest.
>>>> +#
>>>> +# @device: device name
>>>> +#
>>>> +# @path: device path
>>>> +#
>>>> +# @enabled: true if driver is loaded thus device is enabled in guest
>>>> +#
>>>> +# Since: 3.0
>>>> +#
>>>> +# Example:
>>>> +#
>>>> +# <- { "event": "FAILOVER_PRIMARY_CHANGED",
>>>> +#      "data": { "device": "vfio-0",
>>>> +#                "path": "/machine/peripheral/vfio-0" },
>>>> +#                "enabled": "true" },
>>>> +#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
>>>> +#
>>>> +##
>>>> +{ 'event': 'FAILOVER_PRIMARY_CHANGED',
>>>> +  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }
>>>>    
>
diff mbox series

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index bd83b58..adcc95a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -34,6 +34,7 @@ 
 #include "pci.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "qapi/qapi-events-net.h"
 
 #define MSIX_CAP_LENGTH 12
 
@@ -42,6 +43,7 @@ 
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -1170,6 +1172,8 @@  void vfio_pci_write_config(PCIDevice *pdev,
 {
     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
     uint32_t val_le = cpu_to_le32(val);
+    bool may_notify = false;
+    bool master_was = false;
 
     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
 
@@ -1180,6 +1184,14 @@  void vfio_pci_write_config(PCIDevice *pdev,
                      __func__, vdev->vbasedev.name, addr, val, len);
     }
 
+    /* Bus Master Enabling/Disabling */
+    if (pdev->failover_primary && current_cpu &&
+        range_covers_byte(addr, len, PCI_COMMAND)) {
+        master_was = !!(pci_get_word(pdev->config + PCI_COMMAND) &
+                        PCI_COMMAND_MASTER);
+        may_notify = true;
+    }
+
     /* MSI/MSI-X Enabling/Disabling */
     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
@@ -1235,6 +1247,14 @@  void vfio_pci_write_config(PCIDevice *pdev,
         /* Write everything to QEMU to keep emulated bits correct */
         pci_default_write_config(pdev, addr, val, len);
     }
+
+    if (may_notify) {
+        bool master_now = !!(pci_get_word(pdev->config + PCI_COMMAND) &
+                             PCI_COMMAND_MASTER);
+        if (master_was != master_now) {
+            vfio_failover_notify(vdev, master_now);
+        }
+    }
 }
 
 /*
@@ -2801,6 +2821,17 @@  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static void vfio_failover_notify(VFIOPCIDevice *vdev, bool status)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    const char *n;
+    gchar *path;
+
+    n = pdev->qdev.id ? pdev->qdev.id : vdev->vbasedev.name;
+    path = object_get_canonical_path(OBJECT(vdev));
+    qapi_event_send_failover_primary_changed(!!n, n, path, status);
+}
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
@@ -3109,10 +3140,36 @@  static void vfio_instance_finalize(Object *obj)
     vfio_put_group(group);
 }
 
+static void vfio_exit_failover_notify(VFIOPCIDevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+
+    /*
+     * Guest driver may not get the chance to disable bus mastering
+     * before the device object gets to be unrealized. In that event,
+     * send out a "disabled" notification on behalf of guest driver.
+     */
+    if (pdev->failover_primary &&
+        pdev->bus_master_enable_region.enabled) {
+        vfio_failover_notify(vdev, false);
+    }
+}
+
 static void vfio_exitfn(PCIDevice *pdev)
 {
     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
 
+    /*
+     * During the guest reboot sequence, it is sometimes possible that
+     * the guest may not get sufficient time to complete the entire driver
+     * removal sequence, near the end of which a PCI config space write to
+     * disable bus mastering can be intercepted by device. In such cases,
+     * the FAILOVER_PRIMARY_CHANGED "disable" event will not be emitted. It
+     * is imperative to generate the event on the guest's behalf if the
+     * guest fails to make it.
+     */
+    vfio_exit_failover_notify(vdev);
+
     vfio_unregister_req_notifier(vdev);
     vfio_unregister_err_notifier(vdev);
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
diff --git a/qapi/net.json b/qapi/net.json
index 633ac87..a5b8d70 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -757,3 +757,29 @@ 
 ##
 { 'command': 'query-standby-status', 'data': { '*device': 'str' },
   'returns': ['StandbyStatusInfo'] }
+
+##
+# @FAILOVER_PRIMARY_CHANGED:
+#
+# Emitted whenever the driver of failover primary is loaded or unloaded
+# by the guest.
+#
+# @device: device name
+#
+# @path: device path
+#
+# @enabled: true if driver is loaded thus device is enabled in guest
+#
+# Since: 3.0
+#
+# Example:
+#
+# <- { "event": "FAILOVER_PRIMARY_CHANGED",
+#      "data": { "device": "vfio-0",
+#                "path": "/machine/peripheral/vfio-0" },
+#                "enabled": "true" },
+#      "timestamp": { "seconds": 1539935213, "microseconds": 753529 } }
+#
+##
+{ 'event': 'FAILOVER_PRIMARY_CHANGED',
+  'data': { '*device': 'str', 'path': 'str', 'enabled': 'bool' } }