Message ID | 1620390320-301716-14-git-send-email-steven.sistare@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Live Update | expand |
On Fri, 7 May 2021 05:25:11 -0700 Steve Sistare <steven.sistare@oracle.com> wrote: > Finish cpr for vfio-pci by preserving eventfd's and vector state. > > Signed-off-by: Steve Sistare <steven.sistare@oracle.com> > --- > hw/vfio/pci.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 108 insertions(+), 2 deletions(-) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index f7ac9f03..e983db4 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -2661,6 +2661,27 @@ static void vfio_put_device(VFIOPCIDevice *vdev) > vfio_put_base_device(&vdev->vbasedev); > } > > +static void setenv_event_fd(VFIOPCIDevice *vdev, int nr, const char *name, > + EventNotifier *ev) > +{ > + char envname[256]; > + int fd = event_notifier_get_fd(ev); > + const char *vfname = vdev->vbasedev.name; > + > + if (fd >= 0) { > + snprintf(envname, sizeof(envname), "%s_%s_%d", vfname, name, nr); > + setenv_fd(envname, fd); > + } > +} > + > +static int getenv_event_fd(VFIOPCIDevice *vdev, int nr, const char *name) > +{ > + char envname[256]; > + const char *vfname = vdev->vbasedev.name; > + snprintf(envname, sizeof(envname), "%s_%s_%d", vfname, name, nr); > + return getenv_fd(envname); > +} > + > static void vfio_err_notifier_handler(void *opaque) > { > VFIOPCIDevice *vdev = opaque; > @@ -2692,7 +2713,13 @@ static void vfio_err_notifier_handler(void *opaque) > static void vfio_register_err_notifier(VFIOPCIDevice *vdev) > { > Error *err = NULL; > - int32_t fd; > + int32_t fd = getenv_event_fd(vdev, 0, "err"); Arg order should match the actual env names, device name, interrupt name, interrupt number. > + > + if (fd >= 0) { > + event_notifier_init_fd(&vdev->err_notifier, fd); > + qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); > + return; > + } > > if (!vdev->pci_aer) { > return; > @@ -2753,7 +2780,14 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) > struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), > .index = VFIO_PCI_REQ_IRQ_INDEX }; > Error *err = NULL; > - int32_t fd; > + int32_t fd = getenv_event_fd(vdev, 0, "req"); > + > + if (fd >= 0) { > + event_notifier_init_fd(&vdev->req_notifier, fd); > + qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); > + vdev->req_enabled = true; > + return; > + } > > if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { > return; > @@ -3286,12 +3320,82 @@ static Property vfio_pci_dev_properties[] = { > DEFINE_PROP_END_OF_LIST(), > }; > > +static int vfio_pci_pre_save(void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + int i; > + > + for (i = 0; i < vdev->nr_vectors; i++) { > + VFIOMSIVector *vector = &vdev->msi_vectors[i]; > + if (vector->use) { > + setenv_event_fd(vdev, i, "interrupt", &vector->interrupt); > + if (vector->virq >= 0) { > + setenv_event_fd(vdev, i, "kvm_interrupt", > + &vector->kvm_interrupt); > + } > + } > + } > + setenv_event_fd(vdev, 0, "err", &vdev->err_notifier); > + setenv_event_fd(vdev, 0, "req", &vdev->req_notifier); > + return 0; > +} > + > +static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix) > +{ > + int i, fd; > + bool pending = false; > + PCIDevice *pdev = &vdev->pdev; > + > + vdev->nr_vectors = nr_vectors; > + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); > + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; > + > + for (i = 0; i < nr_vectors; i++) { > + VFIOMSIVector *vector = &vdev->msi_vectors[i]; > + > + fd = getenv_event_fd(vdev, i, "interrupt"); > + if (fd >= 0) { > + vfio_vector_init(vdev, i, fd); > + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); > + } > + > + fd = getenv_event_fd(vdev, i, "kvm_interrupt"); > + if (fd >= 0) { > + vfio_add_kvm_msi_virq(vdev, vector, i, msix, fd); > + } > + > + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { > + set_bit(i, vdev->msix->pending); > + pending = true; > + } > + } > + > + if (msix) { > + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); > + } > +} > + > static int vfio_pci_post_load(void *opaque, int version_id) > { > VFIOPCIDevice *vdev = opaque; > PCIDevice *pdev = &vdev->pdev; > + int nr_vectors; > bool enabled; > > + if (msix_enabled(pdev)) { > + nr_vectors = vdev->msix->entries; > + vfio_claim_vectors(vdev, nr_vectors, true); > + msix_init_vector_notifiers(pdev, vfio_msix_vector_use, > + vfio_msix_vector_release, NULL); > + > + } else if (msi_enabled(pdev)) { > + nr_vectors = msi_nr_vectors_allocated(pdev); > + vfio_claim_vectors(vdev, nr_vectors, false); > + > + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { > + error_report("vfio_pci_post_load does not support INTX"); > + } Why? Is post-load where we really want to find this out? Thanks, Alex > + > pdev->reused = false; > enabled = pci_get_word(pdev->config + PCI_COMMAND) & PCI_COMMAND_MASTER; > memory_region_set_enabled(&pdev->bus_master_enable_region, enabled); > @@ -3310,8 +3414,10 @@ static const VMStateDescription vfio_pci_vmstate = { > .version_id = 0, > .minimum_version_id = 0, > .post_load = vfio_pci_post_load, > + .pre_save = vfio_pci_pre_save, > .needed = vfio_pci_needed, > .fields = (VMStateField[]) { > + VMSTATE_MSIX(pdev, VFIOPCIDevice), > VMSTATE_END_OF_LIST() > } > };
On 5/21/2021 6:24 PM, Alex Williamson wrote: > On Fri, 7 May 2021 05:25:11 -0700 > Steve Sistare <steven.sistare@oracle.com> wrote: > >> Finish cpr for vfio-pci by preserving eventfd's and vector state. >> >> Signed-off-by: Steve Sistare <steven.sistare@oracle.com> >> --- >> hw/vfio/pci.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 108 insertions(+), 2 deletions(-) >> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c >> index f7ac9f03..e983db4 100644 >> --- a/hw/vfio/pci.c >> +++ b/hw/vfio/pci.c >> @@ -2661,6 +2661,27 @@ static void vfio_put_device(VFIOPCIDevice *vdev) >> vfio_put_base_device(&vdev->vbasedev); >> } >> >> +static void setenv_event_fd(VFIOPCIDevice *vdev, int nr, const char *name, >> + EventNotifier *ev) >> +{ >> + char envname[256]; >> + int fd = event_notifier_get_fd(ev); >> + const char *vfname = vdev->vbasedev.name; >> + >> + if (fd >= 0) { >> + snprintf(envname, sizeof(envname), "%s_%s_%d", vfname, name, nr); >> + setenv_fd(envname, fd); >> + } >> +} >> + >> +static int getenv_event_fd(VFIOPCIDevice *vdev, int nr, const char *name) >> +{ >> + char envname[256]; >> + const char *vfname = vdev->vbasedev.name; >> + snprintf(envname, sizeof(envname), "%s_%s_%d", vfname, name, nr); >> + return getenv_fd(envname); >> +} >> + >> static void vfio_err_notifier_handler(void *opaque) >> { >> VFIOPCIDevice *vdev = opaque; >> @@ -2692,7 +2713,13 @@ static void vfio_err_notifier_handler(void *opaque) >> static void vfio_register_err_notifier(VFIOPCIDevice *vdev) >> { >> Error *err = NULL; >> - int32_t fd; >> + int32_t fd = getenv_event_fd(vdev, 0, "err"); > > Arg order should match the actual env names, device name, interrupt > name, interrupt number. I am happy to swap interrupt name and interrupt order, here and in setenv_event_fd(). However, I pass vdev so the getenv_event_fd() caller does not need to generate the env var name, and the details of the name are confined to {get,set}_event_fd. I could pass the vdev name instead, but IMO it would be uglier at every call site, eg: fd = getenv_event_fd(vdev->vbasedev.name, "err", 0); vs fd = getenv_event_fd(vdev, "err", 0); I could rename the functions so they do not imply argument similarity with getenv_fd: getenv_event_fd --> save_event_fd setenv_event_fd --> load_event_fd >> + if (fd >= 0) { >> + event_notifier_init_fd(&vdev->err_notifier, fd); >> + qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); >> + return; >> + } >> >> if (!vdev->pci_aer) { >> return; >> @@ -2753,7 +2780,14 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) >> struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), >> .index = VFIO_PCI_REQ_IRQ_INDEX }; >> Error *err = NULL; >> - int32_t fd; >> + int32_t fd = getenv_event_fd(vdev, 0, "req"); >> + >> + if (fd >= 0) { >> + event_notifier_init_fd(&vdev->req_notifier, fd); >> + qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); >> + vdev->req_enabled = true; >> + return; >> + } >> >> if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { >> return; >> @@ -3286,12 +3320,82 @@ static Property vfio_pci_dev_properties[] = { >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> +static int vfio_pci_pre_save(void *opaque) >> +{ >> + VFIOPCIDevice *vdev = opaque; >> + int i; >> + >> + for (i = 0; i < vdev->nr_vectors; i++) { >> + VFIOMSIVector *vector = &vdev->msi_vectors[i]; >> + if (vector->use) { >> + setenv_event_fd(vdev, i, "interrupt", &vector->interrupt); >> + if (vector->virq >= 0) { >> + setenv_event_fd(vdev, i, "kvm_interrupt", >> + &vector->kvm_interrupt); >> + } >> + } >> + } >> + setenv_event_fd(vdev, 0, "err", &vdev->err_notifier); >> + setenv_event_fd(vdev, 0, "req", &vdev->req_notifier); >> + return 0; >> +} >> + >> +static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix) >> +{ >> + int i, fd; >> + bool pending = false; >> + PCIDevice *pdev = &vdev->pdev; >> + >> + vdev->nr_vectors = nr_vectors; >> + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); >> + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; >> + >> + for (i = 0; i < nr_vectors; i++) { >> + VFIOMSIVector *vector = &vdev->msi_vectors[i]; >> + >> + fd = getenv_event_fd(vdev, i, "interrupt"); >> + if (fd >= 0) { >> + vfio_vector_init(vdev, i, fd); >> + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); >> + } >> + >> + fd = getenv_event_fd(vdev, i, "kvm_interrupt"); >> + if (fd >= 0) { >> + vfio_add_kvm_msi_virq(vdev, vector, i, msix, fd); >> + } >> + >> + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { >> + set_bit(i, vdev->msix->pending); >> + pending = true; >> + } >> + } >> + >> + if (msix) { >> + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); >> + } >> +} >> + >> static int vfio_pci_post_load(void *opaque, int version_id) >> { >> VFIOPCIDevice *vdev = opaque; >> PCIDevice *pdev = &vdev->pdev; >> + int nr_vectors; >> bool enabled; >> >> + if (msix_enabled(pdev)) { >> + nr_vectors = vdev->msix->entries; >> + vfio_claim_vectors(vdev, nr_vectors, true); >> + msix_init_vector_notifiers(pdev, vfio_msix_vector_use, >> + vfio_msix_vector_release, NULL); >> + >> + } else if (msi_enabled(pdev)) { >> + nr_vectors = msi_nr_vectors_allocated(pdev); >> + vfio_claim_vectors(vdev, nr_vectors, false); >> + >> + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { >> + error_report("vfio_pci_post_load does not support INTX"); >> + } > > Why? Is post-load where we really want to find this out? Thanks, This is also checked at VM creation time if only-cpr-capable is specified. I could also check it in vfio_pci_pre_save. - Steve >> + >> pdev->reused = false; >> enabled = pci_get_word(pdev->config + PCI_COMMAND) & PCI_COMMAND_MASTER; >> memory_region_set_enabled(&pdev->bus_master_enable_region, enabled); >> @@ -3310,8 +3414,10 @@ static const VMStateDescription vfio_pci_vmstate = { >> .version_id = 0, >> .minimum_version_id = 0, >> .post_load = vfio_pci_post_load, >> + .pre_save = vfio_pci_pre_save, >> .needed = vfio_pci_needed, >> .fields = (VMStateField[]) { >> + VMSTATE_MSIX(pdev, VFIOPCIDevice), >> VMSTATE_END_OF_LIST() >> } >> }; >
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f7ac9f03..e983db4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2661,6 +2661,27 @@ static void vfio_put_device(VFIOPCIDevice *vdev) vfio_put_base_device(&vdev->vbasedev); } +static void setenv_event_fd(VFIOPCIDevice *vdev, int nr, const char *name, + EventNotifier *ev) +{ + char envname[256]; + int fd = event_notifier_get_fd(ev); + const char *vfname = vdev->vbasedev.name; + + if (fd >= 0) { + snprintf(envname, sizeof(envname), "%s_%s_%d", vfname, name, nr); + setenv_fd(envname, fd); + } +} + +static int getenv_event_fd(VFIOPCIDevice *vdev, int nr, const char *name) +{ + char envname[256]; + const char *vfname = vdev->vbasedev.name; + snprintf(envname, sizeof(envname), "%s_%s_%d", vfname, name, nr); + return getenv_fd(envname); +} + static void vfio_err_notifier_handler(void *opaque) { VFIOPCIDevice *vdev = opaque; @@ -2692,7 +2713,13 @@ static void vfio_err_notifier_handler(void *opaque) static void vfio_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; - int32_t fd; + int32_t fd = getenv_event_fd(vdev, 0, "err"); + + if (fd >= 0) { + event_notifier_init_fd(&vdev->err_notifier, fd); + qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); + return; + } if (!vdev->pci_aer) { return; @@ -2753,7 +2780,14 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), .index = VFIO_PCI_REQ_IRQ_INDEX }; Error *err = NULL; - int32_t fd; + int32_t fd = getenv_event_fd(vdev, 0, "req"); + + if (fd >= 0) { + event_notifier_init_fd(&vdev->req_notifier, fd); + qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); + vdev->req_enabled = true; + return; + } if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { return; @@ -3286,12 +3320,82 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static int vfio_pci_pre_save(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + int i; + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + if (vector->use) { + setenv_event_fd(vdev, i, "interrupt", &vector->interrupt); + if (vector->virq >= 0) { + setenv_event_fd(vdev, i, "kvm_interrupt", + &vector->kvm_interrupt); + } + } + } + setenv_event_fd(vdev, 0, "err", &vdev->err_notifier); + setenv_event_fd(vdev, 0, "req", &vdev->req_notifier); + return 0; +} + +static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix) +{ + int i, fd; + bool pending = false; + PCIDevice *pdev = &vdev->pdev; + + vdev->nr_vectors = nr_vectors; + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + fd = getenv_event_fd(vdev, i, "interrupt"); + if (fd >= 0) { + vfio_vector_init(vdev, i, fd); + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); + } + + fd = getenv_event_fd(vdev, i, "kvm_interrupt"); + if (fd >= 0) { + vfio_add_kvm_msi_virq(vdev, vector, i, msix, fd); + } + + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { + set_bit(i, vdev->msix->pending); + pending = true; + } + } + + if (msix) { + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); + } +} + static int vfio_pci_post_load(void *opaque, int version_id) { VFIOPCIDevice *vdev = opaque; PCIDevice *pdev = &vdev->pdev; + int nr_vectors; bool enabled; + if (msix_enabled(pdev)) { + nr_vectors = vdev->msix->entries; + vfio_claim_vectors(vdev, nr_vectors, true); + msix_init_vector_notifiers(pdev, vfio_msix_vector_use, + vfio_msix_vector_release, NULL); + + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + vfio_claim_vectors(vdev, nr_vectors, false); + + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + error_report("vfio_pci_post_load does not support INTX"); + } + pdev->reused = false; enabled = pci_get_word(pdev->config + PCI_COMMAND) & PCI_COMMAND_MASTER; memory_region_set_enabled(&pdev->bus_master_enable_region, enabled); @@ -3310,8 +3414,10 @@ static const VMStateDescription vfio_pci_vmstate = { .version_id = 0, .minimum_version_id = 0, .post_load = vfio_pci_post_load, + .pre_save = vfio_pci_pre_save, .needed = vfio_pci_needed, .fields = (VMStateField[]) { + VMSTATE_MSIX(pdev, VFIOPCIDevice), VMSTATE_END_OF_LIST() } };
Finish cpr for vfio-pci by preserving eventfd's and vector state. Signed-off-by: Steve Sistare <steven.sistare@oracle.com> --- hw/vfio/pci.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 2 deletions(-)