diff mbox

[RFC,3/3] Qemu: Introduce pci-sriov device type to support VF live migration

Message ID 1445446357-5539-4-git-send-email-tianyu.lan@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

lan,Tianyu Oct. 21, 2015, 4:52 p.m. UTC
This patch migrates VF state between the source and target machines
during live migration.

There are three kinds of VF state involved:
1) PCI configuration space registers
2) MSI-X configuration
3) VF state in the PF driver

The PCI configuration space registers and the MSI-X configuration are
already held in QEMU.

VF state in the PF driver can be saved and restored via a new sysfs
node, state_in_pf, under the VF's sysfs directory.

A fake PCI configuration space register at 0xF0 lets the VF driver learn
the migration status. QEMU sets register 0xF0 to 1 when migration starts
and back to 0 when migration completes. The VF driver tells QEMU that it
is ready for migration by writing 1 to register 0xF1.

QEMU notifies the VF driver of migration status changes via a new sysfs
node, notify_vf, which sends a mailbox message to the VF driver.

Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
---
 hw/i386/kvm/Makefile.objs |   2 +-
 hw/i386/kvm/pci-assign.c  |   2 +-
 hw/i386/kvm/sriov.c       | 213 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 215 insertions(+), 2 deletions(-)
 create mode 100644 hw/i386/kvm/sriov.c
diff mbox

Patch

diff --git a/hw/i386/kvm/Makefile.objs b/hw/i386/kvm/Makefile.objs
index d8bce20..09324e9 100644
--- a/hw/i386/kvm/Makefile.objs
+++ b/hw/i386/kvm/Makefile.objs
@@ -1 +1 @@ 
-obj-y += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o
+obj-y += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o sriov.o
diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c
index 616532d..84c5ff5 100644
--- a/hw/i386/kvm/pci-assign.c
+++ b/hw/i386/kvm/pci-assign.c
@@ -1770,7 +1770,7 @@  static void assign_class_init(ObjectClass *klass, void *data)
     k->config_read  = assigned_dev_pci_read_config;
     k->config_write = assigned_dev_pci_write_config;
     dc->props       = assigned_dev_properties;
-    dc->vmsd        = &vmstate_assigned_device;
+//    dc->vmsd        = &vmstate_assigned_device;
     dc->reset       = reset_assigned_device;
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     dc->desc        = "KVM-based PCI passthrough";
diff --git a/hw/i386/kvm/sriov.c b/hw/i386/kvm/sriov.c
new file mode 100644
index 0000000..ac37035
--- /dev/null
+++ b/hw/i386/kvm/sriov.c
@@ -0,0 +1,213 @@ 
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include "hw/hw.h"
+#include "hw/i386/pc.h"
+#include "pci-assign.h"
+
+
+/* QOM type name of the migratable assigned-VF device. */
+#define TYPE_PCI_SRIOV "pci-sriov"
+
+/*
+ * One-byte markers written at the start of each chunk this device puts
+ * into the migration stream.  The loader only parses device state from
+ * chunks that begin with SRIOV_LM_COMPLETE.
+ */
+enum {
+    SRIOV_LM_SETUP    = 0x01,   /* written by the save_live_setup handler */
+    SRIOV_LM_COMPLETE = 0x02,   /* written by the save_live_complete handler */
+};
+
+/*
+ * Read the VF state kept by the PF driver from the VF's "state_in_pf"
+ * sysfs node.
+ *
+ * @pdev: assigned device whose host address selects the sysfs directory
+ * @buf:  receives a g_malloc()ed buffer holding the state; the caller
+ *        owns it and must g_free() it
+ * @len:  receives the number of bytes read (at most PF_STATE_MAX)
+ *
+ * Returns 0 on success.  On failure returns a negative errno value and
+ * leaves *buf NULL and *len 0, so the caller may g_free(*buf)
+ * unconditionally.
+ */
+static int pt_save_pf_buf(struct PCIDevice *pdev, unsigned char **buf,
+                          int *len)
+{
+    enum { PF_STATE_MAX = 4096 };
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    size_t nread;
+    int err;
+    FILE *f;
+
+    *buf = NULL;
+    *len = 0;
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/state_in_pf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    f = fopen(file, "rb");
+    if (!f) {
+        /* Report why the open failed instead of a fixed -EEXIST. */
+        err = errno;
+        return err ? -err : -ENOENT;
+    }
+
+    *buf = g_malloc(PF_STATE_MAX);
+    nread = fread(*buf, 1, PF_STATE_MAX, f);
+    if (ferror(f)) {
+        /* A failed read must not be passed on as valid (partial) state. */
+        fclose(f);
+        g_free(*buf);
+        *buf = NULL;
+        return -EIO;
+    }
+    fclose(f);
+
+    *len = (int)nread;
+    return 0;
+}
+
+/*
+ * Write previously saved PF-side VF state back through the VF's
+ * "state_in_pf" sysfs node.
+ *
+ * @pdev: assigned device whose host address selects the sysfs directory
+ * @buf:  state bytes to write
+ * @len:  number of bytes in @buf; nothing is written when it is <= 0
+ *
+ * The function cannot return an error, so open/write/close failures are
+ * reported on stderr rather than being dropped silently.
+ */
+static void pt_restore_pf_buf(struct PCIDevice *pdev, unsigned char *buf, int len)
+{
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    FILE *f;
+    char file[128];
+    int failed;
+
+    /* A negative length would turn into a huge size_t in fwrite(). */
+    if (!buf || len <= 0) {
+        return;
+    }
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/state_in_pf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    printf("path: %s\n", file);
+    f = fopen(file, "wb");
+    if (!f) {
+        perror(file);
+        return;
+    }
+
+    failed = fwrite(buf, 1, len, f) != (size_t)len;
+    /* stdio buffers the data, so a write error may only show at close. */
+    if (fclose(f) != 0) {
+        failed = 1;
+    }
+    if (failed) {
+        perror(file);
+    }
+}
+
+/*
+ * post_load_state handler: write "1" to the VF's "notify_vf" sysfs node.
+ * Per the commit message this makes the PF driver send a mailbox message
+ * telling the VF driver that the migration status changed.
+ *
+ * @opaque: the PCIDevice registered together with the handlers
+ *
+ * The handler cannot return an error, so a failed notification is
+ * reported on stderr rather than being dropped silently.
+ */
+static void assign_dev_post_load(void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char file[128];
+    int failed;
+    FILE *f;
+
+    snprintf(file, sizeof(file),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/notify_vf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    printf("notify path %s\n", file);
+    f = fopen(file, "wb");
+    if (!f) {
+        perror(file);
+        return;
+    }
+
+    failed = fwrite("1", 1, 1, f) != 1;
+    /* stdio buffers the byte, so a write error may only show at close. */
+    if (fclose(f) != 0) {
+        failed = 1;
+    }
+    if (failed) {
+        perror(file);
+    }
+}
+
+/*
+ * load_state handler for the device's live-migration section.
+ *
+ * A chunk that does not start with SRIOV_LM_COMPLETE (i.e. the setup
+ * marker) carries no device state and is accepted as-is.  Otherwise the
+ * chunk is parsed in the order the save side writes it:
+ *   1. generic PCI device state (pci_device_load)
+ *   2. sbe32 MSI-X entry count followed by that many table entries
+ *   3. be32 length followed by the PF driver's state blob
+ * Afterwards the fake config bytes 0xf0 (migration in progress) and
+ * 0xf1 (VF driver ready) are cleared.
+ *
+ * @f:          migration stream to read from
+ * @opaque:     the PCIDevice registered together with the handlers
+ * @version_id: section version (unused)
+ *
+ * Returns 0 on success, the pci_device_load() error, or -EINVAL when a
+ * count or length read from the stream is out of range.
+ */
+static int assign_dev_load(QEMUFile *f, void *opaque, int version_id)
+{
+    /* Upper bound of the PF state blob; the save side reads at most this. */
+    enum { PF_STATE_MAX = 4096 };
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    unsigned char *buf = NULL;
+    int32_t num = 0;
+    uint32_t len;
+    int ret;
+
+    if (qemu_get_byte(f) != SRIOV_LM_COMPLETE) {
+        return 0;
+    }
+
+    ret = pci_device_load(pdev, f);
+    if (ret) {
+        printf("pci config error %d\n", ret);
+        return ret;
+    }
+
+    /*
+     * The entry count comes from the stream; never let it write past the
+     * local MSI-X table (or into a table that does not exist).
+     */
+    qemu_get_sbe32s(f, &num);
+    if (num < 0 || num > adev->msix_max || (num > 0 && !adev->msix_table)) {
+        fprintf(stderr, "pci-sriov: invalid MSI-X entry count %d\n", (int)num);
+        return -EINVAL;
+    }
+    if (num > 0) {
+        qemu_get_buffer(f, (unsigned char *)adev->msix_table,
+                        num * PCI_MSIX_ENTRY_SIZE);
+    }
+    assigned_dev_update_msix(pdev);
+
+    /* Likewise bound the blob length before allocating for it. */
+    len = qemu_get_be32(f);
+    if (len > PF_STATE_MAX) {
+        fprintf(stderr, "pci-sriov: invalid PF state size %u\n", (unsigned)len);
+        return -EINVAL;
+    }
+    if (len) {
+        buf = g_malloc(len);
+        qemu_get_buffer(f, buf, len);
+        pt_restore_pf_buf(pdev, buf, len);
+        g_free(buf);
+    }
+
+    pci_default_write_config(pdev, 0xf0, 0x00, 1);
+    pci_default_write_config(pdev, 0xf1, 0x00, 1);
+    return 0;
+}
+
+/*
+ * save_live_complete handler: write the final device state.
+ *
+ * Stream layout (must match the load handler):
+ *   byte   SRIOV_LM_COMPLETE
+ *   ...    generic PCI device state (pci_device_save)
+ *   sbe32  number of MSI-X table entries, followed by the entries
+ *   be32   length of the PF driver's state blob, followed by the blob
+ *
+ * The whole MSI-X table is sent.  The loader copies the entries to the
+ * start of its own table, so sending only a prefix sized by the number
+ * of unmasked entries would drop any unmasked entry that sits behind a
+ * masked one.
+ *
+ * @f:      migration stream to write to
+ * @opaque: the PCIDevice registered together with the handlers
+ *
+ * Returns 0 on success, -EFAULT if the PF driver state cannot be read.
+ */
+static int assign_dev_save_complete(QEMUFile *f, void *opaque)
+{
+    struct PCIDevice *pdev = (struct PCIDevice *)opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    int32_t entries_nr = adev->msix_table ? adev->msix_max : 0;
+    unsigned char *buf = NULL;
+    int len = 0;
+
+    qemu_put_byte(f, SRIOV_LM_COMPLETE);
+    pci_device_save(pdev, f);
+
+    qemu_put_sbe32s(f, &entries_nr);
+    if (entries_nr > 0) {
+        qemu_put_buffer(f, (unsigned char *)adev->msix_table,
+                        entries_nr * PCI_MSIX_ENTRY_SIZE);
+    }
+
+    if (pt_save_pf_buf(pdev, &buf, &len)) {
+        g_free(buf);
+        return -EFAULT;
+    }
+
+    qemu_put_be32(f, len);
+    if (len) {
+        printf("pf state saved, size %d\n", len);
+        qemu_put_buffer(f, buf, len);
+    }
+
+    /* The blob was allocated by pt_save_pf_buf(); release it here. */
+    g_free(buf);
+    return 0;
+}
+
+/*
+ * save_live_setup handler: announce the start of migration to the VF
+ * driver.
+ *
+ * Sets the fake config byte 0xf0 to 1, writes "1" to the VF's
+ * "notify_vf" sysfs node and then emits the SRIOV_LM_SETUP marker into
+ * the stream.
+ *
+ * @f:      migration stream to write to
+ * @opaque: the PCIDevice registered together with the handlers
+ *
+ * Returns 0 on success, -EFAULT if the sysfs node cannot be opened.
+ */
+static int assign_dev_setup(QEMUFile *f, void *opaque)
+{
+    struct PCIDevice *pdev = opaque;
+    AssignedDevice *adev = PCI_ASSIGN(pdev);
+    char notify_path[128];
+    FILE *notify;
+
+    /* Raise the "migration in progress" flag before poking the driver. */
+    pci_default_write_config(pdev, 0xf0, 0x01, 1);
+
+    snprintf(notify_path, sizeof(notify_path),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/notify_vf",
+             adev->host.domain, adev->host.bus, adev->host.slot,
+             adev->host.function);
+
+    notify = fopen(notify_path, "wb");
+    if (notify == NULL) {
+        return -EFAULT;
+    }
+    fwrite("1", 1, 1, notify);
+    fclose(notify);
+
+    printf("notify path %s\n", notify_path);
+    qemu_put_byte(f, SRIOV_LM_SETUP);
+    return 0;
+}
+
+/*
+ * save_live_pending handler: report how much data is still outstanding.
+ *
+ * Config byte 0xf1 is the VF driver's "ready for migration" flag.  While
+ * it reads as zero the full @max_size is reported as pending; once it is
+ * non-zero nothing is pending.
+ *
+ * @f:        migration stream (unused)
+ * @opaque:   the PCIDevice registered together with the handlers
+ * @max_size: value to report while the VF driver is not ready
+ */
+static uint64_t assign_dev_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    struct PCIDevice *pdev = opaque;
+    uint32_t vf_ready = pci_default_read_config(pdev, 0xf1, 1);
+
+    if (vf_ready) {
+        return 0;
+    }
+    return max_size;
+}
+
+/* Live-migration callbacks for the assigned VF, grouped by direction. */
+static SaveVMHandlers savevm_pt_handlers = {
+    /* Source side. */
+    .save_live_setup    = assign_dev_setup,
+    .save_live_pending  = assign_dev_save_pending,
+    .save_live_complete = assign_dev_save_complete,
+    /* Destination side. */
+    .load_state         = assign_dev_load,
+    .post_load_state    = assign_dev_post_load,
+};
+
+/*
+ * Instance initializer: hook the new object into live migration by
+ * registering savevm_pt_handlers with the object itself (as a PCIDevice)
+ * as the opaque pointer every handler receives.
+ */
+static void sriov_pci_instance_init(Object *obj)
+{
+    register_savevm_live(NULL, "pci-assign", 1, 1,
+                         &savevm_pt_handlers, PCI_DEVICE(obj));
+}
+
+/*
+ * QOM description of the "pci-sriov" type: a TYPE_PCI_ASSIGN child whose
+ * only addition is the instance initializer above.
+ */
+static const TypeInfo sriov_pci_type_info = {
+    .parent        = TYPE_PCI_ASSIGN,
+    .name          = TYPE_PCI_SRIOV,
+    .instance_init = sriov_pci_instance_init,
+};
+
+/* Register the "pci-sriov" type with QOM; invoked through type_init(). */
+static void sriov_register_types(void)
+{
+    type_register_static(&sriov_pci_type_info);
+}
+
+type_init(sriov_register_types)