diff mbox series

[V6,21/27] vfio-pci: cpr part 3 (intx)

Message ID 1628286241-217457-22-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series Live Update | expand

Commit Message

Steven Sistare Aug. 6, 2021, 9:43 p.m. UTC
Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
follows:
  pin : Recover this from the vfio config in kernel space
  interrupt : Preserve its eventfd descriptor across exec.
  unmask : Ditto
  route.irq : This could perhaps be recovered in vfio_pci_post_load by
    calling pci_device_route_intx_to_irq(pin), whose implementation reads
    config space for a bridge device such as ich9.  However, there is no
    guarantee that the bridge vmstate is read before vfio vmstate.  Rather
    than fiddling with MigrationPriority for vmstate handlers, explicitly
    save route.irq in vfio vmstate.
  pending : save in vfio vmstate.
  mmap_timeout, mmap_timer : Re-initialize
  bool kvm_accel : Re-initialize

In vfio_realize, defer calling vfio_intx_enable until the vmstate
is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
vfio_intx_kvm_enable to skip vfio initialization, but still perform
kvm initialization.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 hw/vfio/pci.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 83 insertions(+), 11 deletions(-)

Comments

Fam Zheng March 29, 2022, 11:03 a.m. UTC | #1
On 2021-08-06 14:43, Steve Sistare wrote:
> Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
> follows:
>   pin : Recover this from the vfio config in kernel space
>   interrupt : Preserve its eventfd descriptor across exec.
>   unmask : Ditto
>   route.irq : This could perhaps be recovered in vfio_pci_post_load by
>     calling pci_device_route_intx_to_irq(pin), whose implementation reads
>     config space for a bridge device such as ich9.  However, there is no
>     guarantee that the bridge vmstate is read before vfio vmstate.  Rather
>     than fiddling with MigrationPriority for vmstate handlers, explicitly
>     save route.irq in vfio vmstate.
>   pending : save in vfio vmstate.
>   mmap_timeout, mmap_timer : Re-initialize
>   bool kvm_accel : Re-initialize
> 
> In vfio_realize, defer calling vfio_intx_enable until the vmstate
> is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
> vfio_intx_kvm_enable to skip vfio initialization, but still perform
> kvm initialization.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>

Hi Steve,

Not directly related to this patch, but since the context is close: it looks
like this series only takes care of exec restart mode of vfio-pci, have you had
any thoughts on kexec reboot mode with vfio-pci?

The general idea is if DMAR context is not lost during kexec, we should be able
to set up irqfds again and things will just work?

Fam

--

PS some more info below:

I have some local kernel patches to kexec reboot most part of the host kernel
while keeping IOMMU DMAR tables in a valid state; with that, not many extra
things are needed in addition to restore it. A PoC is like below (I can share
more details of the kernel changes if this patch makes any sense):


commit f8951e58be86bd6e37f816394a9a73f28d8059fc
Author: Fam Zheng <fam.zheng@bytedance.com>
Date:   Mon Mar 21 13:19:49 2022 +0000

    cpr: Add live-update support to vfio-pci devices
    
    In cpr-save, always serialize the vfio-pci states.
    
    In cpr-load, add a '-restore' mode that will do
    VFIO_GROUP_GET_DEVICE_FD_INTACT and skip DMAR setup, somewhat similar to
    the current cpr exec mode.
    
    Signed-off-by: Fam Zheng <fam.zheng@bytedance.com>

diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 73f4259556..e36f0ef97d 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -584,10 +584,15 @@ void msix_init_vector_notifiers(PCIDevice *dev,
                                 MSIVectorReleaseNotifier release_notifier,
                                 MSIVectorPollNotifier poll_notifier)
 {
+    int vector;
+
     assert(use_notifier && release_notifier);
     dev->msix_vector_use_notifier = use_notifier;
     dev->msix_vector_release_notifier = release_notifier;
     dev->msix_vector_poll_notifier = poll_notifier;
+    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
+        msix_handle_mask_update(dev, vector, true);
+    }
 }
 
 int msix_set_vector_notifiers(PCIDevice *dev,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 605ffbb5d0..f1240410a8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -2066,6 +2066,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     bool reused;
     VFIOAddressSpace *space;
 
+    if (restore) {
+        return 0;
+    }
     space = vfio_get_address_space(as);
     fd = cpr_find_fd("vfio_container_for_group", group->groupid);
     reused = (fd > 0);
@@ -2486,7 +2489,8 @@ int vfio_get_device(VFIOGroup *group, const char *name,
     fd = cpr_find_fd(name, 0);
     reused = (fd >= 0);
     if (!reused) {
-        fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+        int op = restore ? VFIO_GROUP_GET_DEVICE_FD_INTACT : VFIO_GROUP_GET_DEVICE_FD;
+        fd = ioctl(group->fd, op, name);
     }
 
     if (fd < 0) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e32513c668..9da5f93228 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -361,7 +361,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
      * Do not alter interrupt state during vfio_realize and cpr-load.  The
      * reused flag is cleared thereafter.
      */
-    if (!vdev->pdev.reused) {
+    if (!vdev->pdev.reused && !restore) {
         vfio_disable_interrupts(vdev);
     }
 
@@ -388,7 +388,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
     fd = event_notifier_get_fd(&vdev->intx.interrupt);
     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
 
-    if (vdev->pdev.reused) {
+    if (vdev->pdev.reused && !restore) {
         vfio_intx_reenable_kvm(vdev, &err);
         goto finish;
     }
@@ -2326,6 +2326,9 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
     int ret, i, count;
     bool multi = false;
 
+    if (restore) {
+        return 0;
+    }
     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
 
     if (!single) {
@@ -3185,7 +3188,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
 
         /* Wait until cpr-load reads intx routing data to enable */
-        if (!pdev->reused) {
+        if (!pdev->reused && !restore) {
             ret = vfio_intx_enable(vdev, errp);
             if (ret) {
                 goto out_deregister;
@@ -3295,7 +3298,7 @@ static void vfio_pci_reset(DeviceState *dev)
     VFIOPCIDevice *vdev = VFIO_PCI(dev);
 
     /* Do not reset the device during qemu_system_reset prior to cpr-load */
-    if (vdev->pdev.reused) {
+    if (vdev->pdev.reused || restore) {
         return;
     }
 
@@ -3429,33 +3432,40 @@ static void vfio_merge_config(VFIOPCIDevice *vdev)
 
 static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix)
 {
-    int i, fd;
+    int i, fd, ret;
     bool pending = false;
     PCIDevice *pdev = &vdev->pdev;
 
+    pdev->msix_entries_nr = nr_vectors;
     vdev->nr_vectors = nr_vectors;
     vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
     vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
 
-    for (i = 0; i < nr_vectors; i++) {
-        VFIOMSIVector *vector = &vdev->msi_vectors[i];
-
-        fd = load_event_fd(vdev, "interrupt", i);
-        if (fd >= 0) {
-            vfio_vector_init(vdev, i);
-            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+    if (restore) {
+        ret = vfio_enable_vectors(vdev, true);
+        if (ret) {
+            error_report("vfio: failed to enable vectors, %d", ret);
         }
+    } else {
+        for (i = 0; i < nr_vectors; i++) {
+            VFIOMSIVector *vector = &vdev->msi_vectors[i];
 
-        if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
-            vfio_add_kvm_msi_virq(vdev, vector, i, msix);
-        }
+            fd = load_event_fd(vdev, "interrupt", i);
+            if (fd >= 0) {
+                vfio_vector_init(vdev, i);
+                qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+            }
 
-        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
-            set_bit(i, vdev->msix->pending);
-            pending = true;
+            if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
+                vfio_add_kvm_msi_virq(vdev, vector, i, msix);
+            }
+
+            if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
+                set_bit(i, vdev->msix->pending);
+                pending = true;
+            }
         }
     }
-
     if (msix) {
         memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
     }
@@ -3534,7 +3544,7 @@ static const VMStateDescription vfio_intx_vmstate = {
 
 static bool vfio_pci_needed(void *opaque)
 {
-    return cpr_get_mode() == CPR_MODE_RESTART;
+    return 1;
 }
 
 static const VMStateDescription vfio_pci_vmstate = {
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 6241c20fb1..0179b0aa90 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -26,6 +26,7 @@ void configure_rtc(QemuOpts *opts);
 void qemu_init_subsystems(void);
 
 extern int autostart;
+extern int restore;
 
 typedef enum {
     VGA_NONE, VGA_STD, VGA_CIRRUS, VGA_VMWARE, VGA_XENFB, VGA_QXL,
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index e680594f27..65c3bab074 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -188,6 +188,8 @@ struct vfio_group_status {
  */
 #define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
 
+#define VFIO_GROUP_GET_DEVICE_FD_INTACT	_IO(VFIO_TYPE, VFIO_BASE + 21)
+
 /* --------------- IOCTLs for DEVICE file descriptors --------------- */
 
 /**
diff --git a/qemu-options.hx b/qemu-options.hx
index 8b90d04cb9..03666a59b3 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3984,6 +3984,10 @@ SRST
     option is experimental.
 ERST
 
+DEF("restore", 0, QEMU_OPTION_restore, \
+    "-restore              restore mode",
+    QEMU_ARCH_ALL)
+
 DEF("S", 0, QEMU_OPTION_S, \
     "-S              freeze CPU at startup (use 'c' to start execution)\n",
     QEMU_ARCH_ALL)
diff --git a/softmmu/globals.c b/softmmu/globals.c
index a18fd8dcf3..6fcb5846b4 100644
--- a/softmmu/globals.c
+++ b/softmmu/globals.c
@@ -41,6 +41,7 @@ bool enable_cpu_pm;
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int autostart = 1;
+int restore;
 int vga_interface_type = VGA_NONE;
 Chardev *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack;
diff --git a/softmmu/vl.c b/softmmu/vl.c
index f14e29e622..fba6b577cb 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -3088,6 +3088,9 @@ void qemu_init(int argc, char **argv, char **envp)
             case QEMU_OPTION_S:
                 autostart = 0;
                 break;
+            case QEMU_OPTION_restore:
+                restore = 1;
+                break;
             case QEMU_OPTION_k:
                 keyboard_layout = optarg;
                 break;
Steven Sistare April 11, 2022, 4:23 p.m. UTC | #2
On 3/29/2022 7:03 AM, Fam Zheng wrote:
> On 2021-08-06 14:43, Steve Sistare wrote:
>> Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
>> follows:
>>   pin : Recover this from the vfio config in kernel space
>>   interrupt : Preserve its eventfd descriptor across exec.
>>   unmask : Ditto
>>   route.irq : This could perhaps be recovered in vfio_pci_post_load by
>>     calling pci_device_route_intx_to_irq(pin), whose implementation reads
>>     config space for a bridge device such as ich9.  However, there is no
>>     guarantee that the bridge vmstate is read before vfio vmstate.  Rather
>>     than fiddling with MigrationPriority for vmstate handlers, explicitly
>>     save route.irq in vfio vmstate.
>>   pending : save in vfio vmstate.
>>   mmap_timeout, mmap_timer : Re-initialize
>>   bool kvm_accel : Re-initialize
>>
>> In vfio_realize, defer calling vfio_intx_enable until the vmstate
>> is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
>> vfio_intx_kvm_enable to skip vfio initialization, but still perform
>> kvm initialization.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> 
> Hi Steve,
> 
> Not directly related to this patch, but since the context is close: it looks
> like this series only takes care of exec restart mode of vfio-pci, have you had
> any thoughts on kexec reboot mode with vfio-pci?
> 
> The general idea is if DMAR context is not lost during kexec, we should be able
> to set up irqfds again and things will just work?
> 
> Fam

Hi Fam,
  I have thought about that use case, but only in general terms.
IMO it best fits in the cpr framework as a new mode (rather than as 
a new -restore command line argument).  

In your code below, you would have fewer code changes if you set 
'reused = true' for the new mode, rather than testing both 'reused and restored' 
at multiple sites. Lastly, I cleaned up the vector handling somewhat from V6 
to V7, so you may want to try your code using V7 as a base.

- Steve

> PS some more info below:
> 
> I have some local kernel patches to kexec reboot most part of the host kernel
> while keeping IOMMU DMAR tables in a valid state; with that, not many extra
> things are needed in addition to restore it. A PoC is like below (I can share
> more details of the kernel changes if this patch makes any sense):
> 
> 
> commit f8951e58be86bd6e37f816394a9a73f28d8059fc
> Author: Fam Zheng <fam.zheng@bytedance.com>
> Date:   Mon Mar 21 13:19:49 2022 +0000
> 
>     cpr: Add live-update support to vfio-pci devices
>     
>     In cpr-save, always serialize the vfio-pci states.
>     
>     In cpr-load, add a '-restore' mode that will do
>     VFIO_GROUP_GET_DEVICE_FD_INTACT and skip DMAR setup, somewhat similar to
>     the current cpr exec mode.
>     
>     Signed-off-by: Fam Zheng <fam.zheng@bytedance.com>
> 
> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
> index 73f4259556..e36f0ef97d 100644
> --- a/hw/pci/msix.c
> +++ b/hw/pci/msix.c
> @@ -584,10 +584,15 @@ void msix_init_vector_notifiers(PCIDevice *dev,
>                                  MSIVectorReleaseNotifier release_notifier,
>                                  MSIVectorPollNotifier poll_notifier)
>  {
> +    int vector;
> +
>      assert(use_notifier && release_notifier);
>      dev->msix_vector_use_notifier = use_notifier;
>      dev->msix_vector_release_notifier = release_notifier;
>      dev->msix_vector_poll_notifier = poll_notifier;
> +    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
> +        msix_handle_mask_update(dev, vector, true);
> +    }
>  }
>  
>  int msix_set_vector_notifiers(PCIDevice *dev,
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 605ffbb5d0..f1240410a8 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -2066,6 +2066,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>      bool reused;
>      VFIOAddressSpace *space;
>  
> +    if (restore) {
> +        return 0;
> +    }
>      space = vfio_get_address_space(as);
>      fd = cpr_find_fd("vfio_container_for_group", group->groupid);
>      reused = (fd > 0);
> @@ -2486,7 +2489,8 @@ int vfio_get_device(VFIOGroup *group, const char *name,
>      fd = cpr_find_fd(name, 0);
>      reused = (fd >= 0);
>      if (!reused) {
> -        fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
> +        int op = restore ? VFIO_GROUP_GET_DEVICE_FD_INTACT : VFIO_GROUP_GET_DEVICE_FD;
> +        fd = ioctl(group->fd, op, name);
>      }
>  
>      if (fd < 0) {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index e32513c668..9da5f93228 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -361,7 +361,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
>       * Do not alter interrupt state during vfio_realize and cpr-load.  The
>       * reused flag is cleared thereafter.
>       */
> -    if (!vdev->pdev.reused) {
> +    if (!vdev->pdev.reused && !restore) {
>          vfio_disable_interrupts(vdev);
>      }
>  
> @@ -388,7 +388,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
>      qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
>  
> -    if (vdev->pdev.reused) {
> +    if (vdev->pdev.reused && !restore) {
>          vfio_intx_reenable_kvm(vdev, &err);
>          goto finish;
>      }
> @@ -2326,6 +2326,9 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
>      int ret, i, count;
>      bool multi = false;
>  
> +    if (restore) {
> +        return 0;
> +    }
>      trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
>  
>      if (!single) {
> @@ -3185,7 +3188,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>          kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
>  
>          /* Wait until cpr-load reads intx routing data to enable */
> -        if (!pdev->reused) {
> +        if (!pdev->reused && !restore) {
>              ret = vfio_intx_enable(vdev, errp);
>              if (ret) {
>                  goto out_deregister;
> @@ -3295,7 +3298,7 @@ static void vfio_pci_reset(DeviceState *dev)
>      VFIOPCIDevice *vdev = VFIO_PCI(dev);
>  
>      /* Do not reset the device during qemu_system_reset prior to cpr-load */
> -    if (vdev->pdev.reused) {
> +    if (vdev->pdev.reused || restore) {
>          return;
>      }
>  
> @@ -3429,33 +3432,40 @@ static void vfio_merge_config(VFIOPCIDevice *vdev)
>  
>  static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix)
>  {
> -    int i, fd;
> +    int i, fd, ret;
>      bool pending = false;
>      PCIDevice *pdev = &vdev->pdev;
>  
> +    pdev->msix_entries_nr = nr_vectors;
>      vdev->nr_vectors = nr_vectors;
>      vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
>      vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
>  
> -    for (i = 0; i < nr_vectors; i++) {
> -        VFIOMSIVector *vector = &vdev->msi_vectors[i];
> -
> -        fd = load_event_fd(vdev, "interrupt", i);
> -        if (fd >= 0) {
> -            vfio_vector_init(vdev, i);
> -            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
> +    if (restore) {
> +        ret = vfio_enable_vectors(vdev, true);
> +        if (ret) {
> +            error_report("vfio: failed to enable vectors, %d", ret);
>          }
> +    } else {
> +        for (i = 0; i < nr_vectors; i++) {
> +            VFIOMSIVector *vector = &vdev->msi_vectors[i];
>  
> -        if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
> -            vfio_add_kvm_msi_virq(vdev, vector, i, msix);
> -        }
> +            fd = load_event_fd(vdev, "interrupt", i);
> +            if (fd >= 0) {
> +                vfio_vector_init(vdev, i);
> +                qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
> +            }
>  
> -        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
> -            set_bit(i, vdev->msix->pending);
> -            pending = true;
> +            if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
> +                vfio_add_kvm_msi_virq(vdev, vector, i, msix);
> +            }
> +
> +            if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
> +                set_bit(i, vdev->msix->pending);
> +                pending = true;
> +            }
>          }
>      }
> -
>      if (msix) {
>          memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
>      }
> @@ -3534,7 +3544,7 @@ static const VMStateDescription vfio_intx_vmstate = {
>  
>  static bool vfio_pci_needed(void *opaque)
>  {
> -    return cpr_get_mode() == CPR_MODE_RESTART;
> +    return 1;
>  }
>  
>  static const VMStateDescription vfio_pci_vmstate = {
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 6241c20fb1..0179b0aa90 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -26,6 +26,7 @@ void configure_rtc(QemuOpts *opts);
>  void qemu_init_subsystems(void);
>  
>  extern int autostart;
> +extern int restore;
>  
>  typedef enum {
>      VGA_NONE, VGA_STD, VGA_CIRRUS, VGA_VMWARE, VGA_XENFB, VGA_QXL,
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index e680594f27..65c3bab074 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -188,6 +188,8 @@ struct vfio_group_status {
>   */
>  #define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
>  
> +#define VFIO_GROUP_GET_DEVICE_FD_INTACT	_IO(VFIO_TYPE, VFIO_BASE + 21)
> +
>  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
>  
>  /**
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 8b90d04cb9..03666a59b3 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -3984,6 +3984,10 @@ SRST
>      option is experimental.
>  ERST
>  
> +DEF("restore", 0, QEMU_OPTION_restore, \
> +    "-restore              restore mode",
> +    QEMU_ARCH_ALL)
> +
>  DEF("S", 0, QEMU_OPTION_S, \
>      "-S              freeze CPU at startup (use 'c' to start execution)\n",
>      QEMU_ARCH_ALL)
> diff --git a/softmmu/globals.c b/softmmu/globals.c
> index a18fd8dcf3..6fcb5846b4 100644
> --- a/softmmu/globals.c
> +++ b/softmmu/globals.c
> @@ -41,6 +41,7 @@ bool enable_cpu_pm;
>  int nb_nics;
>  NICInfo nd_table[MAX_NICS];
>  int autostart = 1;
> +int restore;
>  int vga_interface_type = VGA_NONE;
>  Chardev *parallel_hds[MAX_PARALLEL_PORTS];
>  int win2k_install_hack;
> diff --git a/softmmu/vl.c b/softmmu/vl.c
> index f14e29e622..fba6b577cb 100644
> --- a/softmmu/vl.c
> +++ b/softmmu/vl.c
> @@ -3088,6 +3088,9 @@ void qemu_init(int argc, char **argv, char **envp)
>              case QEMU_OPTION_S:
>                  autostart = 0;
>                  break;
> +            case QEMU_OPTION_restore:
> +                restore = 1;
> +                break;
>              case QEMU_OPTION_k:
>                  keyboard_layout = optarg;
>                  break;
Fam April 12, 2022, 11:01 a.m. UTC | #3
On 2022-04-11 12:23, Steven Sistare wrote:
> On 3/29/2022 7:03 AM, Fam Zheng wrote:
> > On 2021-08-06 14:43, Steve Sistare wrote:
> >> Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
> >> follows:
> >>   pin : Recover this from the vfio config in kernel space
> >>   interrupt : Preserve its eventfd descriptor across exec.
> >>   unmask : Ditto
> >>   route.irq : This could perhaps be recovered in vfio_pci_post_load by
> >>     calling pci_device_route_intx_to_irq(pin), whose implementation reads
> >>     config space for a bridge device such as ich9.  However, there is no
> >>     guarantee that the bridge vmstate is read before vfio vmstate.  Rather
> >>     than fiddling with MigrationPriority for vmstate handlers, explicitly
> >>     save route.irq in vfio vmstate.
> >>   pending : save in vfio vmstate.
> >>   mmap_timeout, mmap_timer : Re-initialize
> >>   bool kvm_accel : Re-initialize
> >>
> >> In vfio_realize, defer calling vfio_intx_enable until the vmstate
> >> is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
> >> vfio_intx_kvm_enable to skip vfio initialization, but still perform
> >> kvm initialization.
> >>
> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> > 
> > Hi Steve,
> > 
> > Not directly related to this patch, but since the context is close: it looks
> > like this series only takes care of exec restart mode of vfio-pci, have you had
> > any thoughts on kexec reboot mode with vfio-pci?
> > 
> > The general idea is if DMAR context is not lost during kexec, we should be able
> > to set up irqfds again and things will just work?
> > 
> > Fam
> 
> Hi Fam,
>   I have thought about that use case, but only in general terms.
> IMO it best fits in the cpr framework as a new mode (rather than as 
> a new -restore command line argument).  

Yes I think that is better, I will try that.

> 
> In your code below, you would have fewer code changes if you set 
> 'reused = true' for the new mode, rather than testing both 'reused and restored' 
> at multiple sites. Lastly, I cleaned up the vector handling somewhat from V6 
> to V7, so you may want to try your code using V7 as a base.

I am cleaning up the kernel patches and will post both parts once ready.

Fam
diff mbox series

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 1cee52a..7e59f4f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -145,14 +145,45 @@  static void vfio_intx_eoi(VFIODevice *vbasedev)
     vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 }
 
+#ifdef CONFIG_KVM
+static bool vfio_no_kvm_intx(VFIOPCIDevice *vdev)
+{
+    return vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
+           vdev->intx.route.mode != PCI_INTX_ENABLED ||
+           !kvm_resamplefds_enabled();
+}
+#endif
+
+static void vfio_intx_reenable_kvm(VFIOPCIDevice *vdev, Error **errp)
+{
+#ifdef CONFIG_KVM
+    if (vfio_no_kvm_intx(vdev)) {
+        return;
+    }
+
+    if (vfio_named_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) {
+        error_setg(errp, "vfio_named_notifier_init failed");
+        return;
+    }
+
+    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                           &vdev->intx.interrupt,
+                                           &vdev->intx.unmask,
+                                           vdev->intx.route.irq)) {
+        error_setg_errno(errp, errno, "failed to setup resample irqfd");
+        return;
+    }
+
+    vdev->intx.kvm_accel = true;
+#endif
+}
+
 static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
 {
 #ifdef CONFIG_KVM
     int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
 
-    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
-        vdev->intx.route.mode != PCI_INTX_ENABLED ||
-        !kvm_resamplefds_enabled()) {
+    if (vfio_no_kvm_intx(vdev)) {
         return;
     }
 
@@ -300,7 +331,9 @@  static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
         return 0;
     }
 
-    vfio_disable_interrupts(vdev);
+    if (!vdev->pdev.reused) {
+        vfio_disable_interrupts(vdev);
+    }
 
     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
@@ -316,7 +349,8 @@  static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
     }
 #endif
 
-    ret = event_notifier_init(&vdev->intx.interrupt, 0);
+    ret = vfio_named_notifier_init(vdev, &vdev->intx.interrupt,
+                                   "intx-interrupt", 0);
     if (ret) {
         error_setg_errno(errp, -ret, "event_notifier_init failed");
         return ret;
@@ -324,6 +358,11 @@  static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
     fd = event_notifier_get_fd(&vdev->intx.interrupt);
     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
 
+    if (vdev->pdev.reused) {
+        vfio_intx_reenable_kvm(vdev, &err);
+        goto finish;
+    }
+
     if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
                                VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
         qemu_set_fd_handler(fd, NULL, NULL, vdev);
@@ -336,6 +375,7 @@  static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
         warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
     }
 
+finish:
     vdev->interrupt = VFIO_INT_INTx;
 
     trace_vfio_intx_enable(vdev->vbasedev.name);
@@ -3092,9 +3132,13 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
                                              vfio_intx_routing_notifier);
         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
-        ret = vfio_intx_enable(vdev, errp);
-        if (ret) {
-            goto out_deregister;
+
+        /* Wait until cpr-load reads intx routing data to enable */
+        if (!pdev->reused) {
+            ret = vfio_intx_enable(vdev, errp);
+            if (ret) {
+                goto out_deregister;
+            }
         }
     }
 
@@ -3338,7 +3382,8 @@  static int vfio_pci_pre_save(void *opaque)
     int i;
 
     if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
-        assert(0);      /* completed in a subsequent patch */
+        save_event_fd(vdev, "intx-interrupt", 0, &vdev->intx.interrupt);
+        save_event_fd(vdev, "intx-unmask", 0, &vdev->intx.unmask);
     }
 
     for (i = 0; i < vdev->nr_vectors; i++) {
@@ -3395,6 +3440,7 @@  static int vfio_pci_post_load(void *opaque, int version_id)
     VFIOPCIDevice *vdev = opaque;
     PCIDevice *pdev = &vdev->pdev;
     int nr_vectors;
+    int ret = 0;
 
     vfio_merge_config(vdev);
 
@@ -3409,12 +3455,37 @@  static int vfio_pci_post_load(void *opaque, int version_id)
         vfio_claim_vectors(vdev, nr_vectors, false);
 
     } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
-        assert(0);      /* completed in a subsequent patch */
+        Error *err = 0;
+        ret = vfio_intx_enable(vdev, &err);
+        if (ret) {
+            error_report_err(err);
+        }
     }
 
     pdev->reused = false;
 
-    return 0;
+    return ret;
+}
+
+static const VMStateDescription vfio_intx_vmstate = {
+    .name = "vfio-intx",
+    .unmigratable = 1,
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(pending, VFIOINTx),
+        VMSTATE_UINT32(route.mode, VFIOINTx),
+        VMSTATE_INT32(route.irq, VFIOINTx),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#define VMSTATE_VFIO_INTX(_field, _state) {                         \
+    .name       = (stringify(_field)),                              \
+    .size       = sizeof(VFIOINTx),                                 \
+    .vmsd       = &vfio_intx_vmstate,                               \
+    .flags      = VMS_STRUCT,                                       \
+    .offset     = vmstate_offset_value(_state, _field, VFIOINTx),   \
 }
 
 static bool vfio_pci_needed(void *opaque)
@@ -3433,6 +3504,7 @@  static const VMStateDescription vfio_pci_vmstate = {
     .fields = (VMStateField[]) {
         VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
         VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
+        VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
         VMSTATE_END_OF_LIST()
     }
 };