@@ -1,6 +1,7 @@
#ifndef KVM__VFIO_H
#define KVM__VFIO_H
+#include "kvm/mutex.h"
#include "kvm/parse-options.h"
#include "kvm/pci.h"
@@ -24,8 +25,59 @@ enum vfio_device_type {
VFIO_DEVICE_PCI,
};
+/* MSI/MSI-X capability enabled */
+#define VFIO_PCI_MSI_STATE_ENABLED (1 << 0)
+/* MSI/MSI-X capability or individual vector masked */
+#define VFIO_PCI_MSI_STATE_MASKED (1 << 1)
+/* MSI-X capability has no vector enabled yet */
+#define VFIO_PCI_MSI_STATE_EMPTY (1 << 2)
+
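+/* One MSI/MSI-X vector: its guest-visible configuration and host routing state */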
+struct vfio_pci_msi_entry {
+ struct msix_table config;
+ int gsi;
+ int eventfd;
+ u8 phys_state;
+ u8 virt_state;
+};
+
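+/* Guest location and size of the emulated MSI-X table */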
+struct vfio_pci_msix_table {
+ size_t size;
+ unsigned int bar;
+ u32 guest_phys_addr;
+};
+
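+/* Guest location of the virtual PBA; reads are served from the device fd at 'offset' */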
+struct vfio_pci_msix_pba {
+ size_t size;
+ off_t offset; /* in VFIO device fd */
+ unsigned int bar;
+ u32 guest_phys_addr;
+};
+
+/* Common data for MSI and MSI-X */
+struct vfio_pci_msi_common {
+ off_t pos;
+ u8 virt_state;
+ u8 phys_state;
+ struct mutex mutex;
+ struct vfio_irq_info info;
+ struct vfio_irq_set *irq_set;
+ size_t nr_entries;
+ struct vfio_pci_msi_entry *entries;
+};
+
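+/* Interrupt delivery modes available on the device */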
+#define VFIO_PCI_IRQ_MODE_INTX (1 << 0)
+#define VFIO_PCI_IRQ_MODE_MSI (1 << 1)
+#define VFIO_PCI_IRQ_MODE_MSIX (1 << 2)
+
struct vfio_pci_device {
struct pci_device_header hdr;
+
+ unsigned long irq_modes;
+ int intx_fd;
+ unsigned int intx_gsi;
+ struct vfio_pci_msi_common msix;
+ struct vfio_pci_msix_table msix_table;
+ struct vfio_pci_msix_pba msix_pba;
};
struct vfio_region {
@@ -5,6 +5,8 @@
#include <sys/ioctl.h>
#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include <sys/time.h>
/* Wrapper around UAPI vfio_irq_set */
struct vfio_irq_eventfd {
@@ -12,6 +14,318 @@ struct vfio_irq_eventfd {
int fd;
};
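+/* Helpers to query and update the virt_state/phys_state bitmasks */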
+#define msi_is_enabled(state) ((state) & VFIO_PCI_MSI_STATE_ENABLED)
+#define msi_is_masked(state) ((state) & VFIO_PCI_MSI_STATE_MASKED)
+#define msi_is_empty(state) ((state) & VFIO_PCI_MSI_STATE_EMPTY)
+
+#define msi_update_state(state, val, bit) \
+ (state) = (val) ? ((state) | (bit)) : ((state) & ~(bit));
+#define msi_set_enabled(state, val) \
+ msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED)
+#define msi_set_masked(state, val) \
+ msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED)
+#define msi_set_empty(state, val) \
+ msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY)
+
+static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev);
+
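+/*
+ * Apply the guest's MSI/MSI-X configuration to the device: issue the initial
+ * VFIO_DEVICE_SET_IRQS over the whole vector range, then update individual
+ * vectors as the guest changes them.
+ */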
+static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev)
+{
+ size_t i;
+ int ret = 0;
+ int *eventfds;
+ struct vfio_pci_device *pdev = &vdev->pci;
+ struct vfio_pci_msi_common *msis = &pdev->msix;
+ struct vfio_irq_eventfd single = {
+ .irq = {
+ .argsz = sizeof(single),
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = msis->info.index,
+ .count = 1,
+ },
+ };
+
+ if (!msi_is_enabled(msis->virt_state))
+ return 0;
+
+ if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) {
+ /*
+ * PCI (and VFIO) forbids enabling INTx, MSI or MSI-X at the same
+ * time. Since INTx has to be enabled from the start (we don't
+ * have a reliable way to know when the user starts using it),
+ * disable it now.
+ */
+ vfio_pci_disable_intx(kvm, vdev);
+ /* Permanently disable INTx */
+ pdev->irq_modes &= ~VFIO_PCI_IRQ_MODE_INTX;
+ }
+
+ eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
+
+ /*
+ * Initial registration of the full range. This enables the physical
+ * MSI/MSI-X capability, which might have desired side effects. For
+ * instance when assigning virtio legacy devices, enabling the MSI
+ * capability modifies the config space layout!
+ *
+ * As an optimization, only update MSIs when the guest unmasks the
+ * capability. This greatly reduces the initialization time for a Linux
+ * guest with 2048+ MSIs: Linux starts by enabling the MSI-X capability
+ * masked, then fills individual vectors, then unmasks the whole
+ * function. So we only issue one VFIO ioctl when enabling for the
+ * first time, and another one when unmasking.
+ *
+ * phys_state is empty when the capability is enabled but no vector has
+ * been registered via SET_IRQS yet.
+ */
+ if (!msi_is_enabled(msis->phys_state) ||
+ (!msi_is_masked(msis->virt_state) &&
+ msi_is_empty(msis->phys_state))) {
+ bool empty = true;
+
+ for (i = 0; i < msis->nr_entries; i++) {
+ eventfds[i] = msis->entries[i].gsi >= 0 ?
+ msis->entries[i].eventfd : -1;
+
+ if (eventfds[i] >= 0)
+ empty = false;
+ }
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set);
+ if (ret < 0) {
+ perror("VFIO_DEVICE_SET_IRQS(multi)");
+ return ret;
+ }
+
+ msi_set_enabled(msis->phys_state, true);
+ msi_set_empty(msis->phys_state, empty);
+
+ return 0;
+ }
+
+ if (msi_is_masked(msis->virt_state)) {
+ /* TODO: if phys_state is neither empty nor masked, mask all vectors */
+ return 0;
+ }
+
+ /* Update individual vectors to avoid breaking those in use */
+ for (i = 0; i < msis->nr_entries; i++) {
+ struct vfio_pci_msi_entry *entry = &msis->entries[i];
+ int fd = entry->gsi >= 0 ? entry->eventfd : -1;
+
+ if (fd == eventfds[i])
+ continue;
+
+ single.irq.start = i;
+ single.fd = fd;
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single);
+ if (ret < 0) {
+ perror("VFIO_DEVICE_SET_IRQS(single)");
+ break;
+ }
+
+ eventfds[i] = fd;
+
+ if (msi_is_empty(msis->phys_state) && fd >= 0)
+ msi_set_empty(msis->phys_state, false);
+ }
+
+ return ret;
+}
+
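+/* Disable the physical MSI/MSI-X capability by clearing all triggers at once */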
+static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev)
+{
+ int ret;
+ struct vfio_pci_device *pdev = &vdev->pci;
+ struct vfio_pci_msi_common *msis = &pdev->msix;
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = msis->info.index,
+ .start = 0,
+ .count = 0,
+ };
+
+ if (!msi_is_enabled(msis->phys_state))
+ return 0;
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+ if (ret < 0) {
+ perror("VFIO_DEVICE_SET_IRQS(NONE)");
+ return ret;
+ }
+
+ msi_set_enabled(msis->phys_state, false);
+ msi_set_empty(msis->phys_state, true);
+
+ return 0;
+}
+
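+/*
+ * Create the eventfd and GSI route for a vector on first use, then keep the
+ * KVM irqfd registration in sync with the guest's mask bit.
+ */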
+static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev,
+ struct vfio_pci_msi_entry *entry)
+{
+ int ret;
+
+ if (entry->eventfd < 0) {
+ entry->eventfd = eventfd(0, 0);
+ if (entry->eventfd < 0) {
+ ret = -errno;
+ vfio_dev_err(vdev, "cannot create eventfd");
+ return ret;
+ }
+ }
+
+ /* Allocate IRQ if necessary */
+ if (entry->gsi < 0) {
+ int ret = irq__add_msix_route(kvm, &entry->config.msg,
+ vdev->dev_hdr.dev_num << 3);
+ if (ret < 0) {
+ vfio_dev_err(vdev, "cannot create MSI-X route");
+ return ret;
+ }
+ entry->gsi = ret;
+ } else {
+ irq__update_msix_route(kvm, entry->gsi, &entry->config.msg);
+ }
+
+ /*
+ * MSI masking is unimplemented in VFIO, so we have to handle it by
+ * disabling/enabling the IRQ route instead. We do it on the KVM side
+ * rather than in VFIO, because:
+ * - it is 8x faster
+ * - it decouples the masking logic from the capability state.
+ * - in the masked state, after removing the irqfd route, we could easily
+ *   plug the eventfd into a local handler, in order to serve Pending Bit
+ *   reads to the guest.
+ *
+ * So entry->phys_state is masked when there is no active irqfd route.
+ */
+ if (msi_is_masked(entry->virt_state) == msi_is_masked(entry->phys_state))
+ return 0;
+
+ if (msi_is_masked(entry->phys_state)) {
+ ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1);
+ if (ret < 0) {
+ vfio_dev_err(vdev, "cannot setup irqfd");
+ return ret;
+ }
+ } else {
+ irq__del_irqfd(kvm, entry->gsi, entry->eventfd);
+ }
+
+ msi_set_masked(entry->phys_state, msi_is_masked(entry->virt_state));
+
+ return 0;
+}
+
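+/* MMIO handler for the virtual Pending Bit Array */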
+static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+ u32 len, u8 is_write, void *ptr)
+{
+ struct vfio_pci_device *pdev = ptr;
+ struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+ u64 offset = addr - pba->guest_phys_addr;
+ struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+
+ if (is_write)
+ return;
+
+ /*
+ * TODO: emulate the PBA. Since we never mask the physical MSI-X vectors,
+ * the hardware PBA never reports a pending vector, so reading it here is
+ * of little use. Note that Linux doesn't use the PBA.
+ */
+ if (pread(vdev->fd, data, len, pba->offset + offset) != (ssize_t)len)
+ vfio_dev_err(vdev, "cannot access MSIX PBA\n");
+}
+
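+/* MMIO handler for the emulated MSI-X table */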
+static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+ u32 len, u8 is_write, void *ptr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct vfio_pci_msi_entry *entry;
+ struct vfio_pci_device *pdev = ptr;
+ struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci);
+
+ u64 offset = addr - pdev->msix_table.guest_phys_addr;
+
+ size_t vector = offset / PCI_MSIX_ENTRY_SIZE;
+ off_t field = offset % PCI_MSIX_ENTRY_SIZE;
+
+ /*
+ * The PCI spec says that software must use aligned 4 or 8 byte accesses
+ * for the MSI-X table.
+ */
+ if ((len != 4 && len != 8) || addr & (len - 1)) {
+ vfio_dev_warn(vdev, "invalid MSI-X table access");
+ return;
+ }
+
+ /* The table mapping is page-aligned, so accesses may land past the last entry */
+ if (vector >= pdev->msix.nr_entries) {
+ vfio_dev_warn(vdev, "MSI-X table access out of range");
+ return;
+ }
+
+ entry = &pdev->msix.entries[vector];
+
+ mutex_lock(&pdev->msix.mutex);
+
+ if (!is_write) {
+ memcpy(data, (void *)&entry->config + field, len);
+ goto out_unlock;
+ }
+
+ memcpy((void *)&entry->config + field, data, len);
+
+ /*
+ * Check if access touched the vector control register, which is at the
+ * end of the MSI-X entry.
+ */
+ if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
+ goto out_unlock;
+
+ msi_set_masked(entry->virt_state, entry->config.ctrl &
+ PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
+ if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0)
+ /* Not much we can do here. */
+ vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector);
+
+ /* Update the physical capability if necessary */
+ if (vfio_pci_enable_msis(kvm, vdev))
+ vfio_dev_err(vdev, "cannot enable MSIX");
+
+out_unlock:
+ mutex_unlock(&pdev->msix.mutex);
+}
+
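+/*
+ * Intercept config space writes that touch the MSI-X Message Control word, in
+ * order to track the Enable and Function Mask bits.
+ */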
+static void vfio_pci_msix_cap_write(struct kvm *kvm,
+ struct vfio_device *vdev, u8 off,
+ void *data, int sz)
+{
+ struct vfio_pci_device *pdev = &vdev->pci;
+ off_t enable_pos = PCI_MSIX_FLAGS + 1;
+ bool enable;
+ u16 flags;
+
+ off -= pdev->msix.pos;
+
+ /* Check if access intersects with the MSI-X Enable bit */
+ if (off > enable_pos || off + sz <= enable_pos)
+ return;
+
+ /* Read byte that contains the Enable bit */
+ flags = *(u8 *)(data + enable_pos - off) << 8;
+
+ mutex_lock(&pdev->msix.mutex);
+
+ msi_set_masked(pdev->msix.virt_state, flags & PCI_MSIX_FLAGS_MASKALL);
+ enable = flags & PCI_MSIX_FLAGS_ENABLE;
+ msi_set_enabled(pdev->msix.virt_state, enable);
+
+ if (enable && vfio_pci_enable_msis(kvm, vdev))
+ vfio_dev_err(vdev, "cannot enable MSIX");
+ else if (!enable && vfio_pci_disable_msis(kvm, vdev))
+ vfio_dev_err(vdev, "cannot disable MSIX");
+
+ mutex_unlock(&pdev->msix.mutex);
+}
+
static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr,
u8 offset, void *data, int sz)
{
@@ -46,29 +360,102 @@ static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hd
vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x",
sz, offset);
+ /* Handle the MSI-X write now, since it might update the hardware capability */
+ if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX)
+ vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz);
+
if (pread(vdev->fd, base + offset, sz, info->offset + offset) != sz)
vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x",
sz, offset);
}
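+/* Size of a capability, used when copying it into the virtual config space */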
+static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr)
+{
+ switch (cap_hdr->type) {
+ case PCI_CAP_ID_MSIX:
+ return PCI_CAP_MSIX_SIZEOF;
+ default:
+ pr_err("unknown PCI capability 0x%x", cap_hdr->type);
+ return 0;
+ }
+}
+
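+/* Install a capability into the virtual config space and link it into the cap list */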
+static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr,
+ struct pci_cap_hdr *cap, off_t pos)
+{
+ struct pci_cap_hdr *last;
+ struct pci_device_header *hdr = &vdev->pci.hdr;
+
+ cap->next = 0;
+
+ if (!hdr->capabilities) {
+ hdr->capabilities = pos;
+ hdr->status |= PCI_STATUS_CAP_LIST;
+ } else {
+ last = PCI_CAP(virt_hdr, hdr->capabilities);
+
+ while (last->next)
+ last = PCI_CAP(virt_hdr, last->next);
+
+ last->next = pos;
+ }
+
+ memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap));
+
+ return 0;
+}
+
static int vfio_pci_parse_caps(struct vfio_device *vdev)
{
+ int ret;
+ size_t size;
+ u8 pos, next;
+ struct pci_cap_hdr *cap;
+ u8 virt_hdr[PCI_DEV_CFG_SIZE];
struct vfio_pci_device *pdev = &vdev->pci;
if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST))
return 0;
+ memset(virt_hdr, 0, PCI_DEV_CFG_SIZE);
+
+ pos = pdev->hdr.capabilities & ~3;
+
pdev->hdr.status &= ~PCI_STATUS_CAP_LIST;
pdev->hdr.capabilities = 0;
- /* TODO: install virtual capabilities */
+ for (; pos; pos = next) {
+ if (pos >= PCI_DEV_CFG_SIZE) {
+ vfio_dev_warn(vdev, "ignoring cap outside of config space");
+ return -EINVAL;
+ }
+
+ cap = PCI_CAP(&pdev->hdr, pos);
+ next = cap->next;
+
+ switch (cap->type) {
+ case PCI_CAP_ID_MSIX:
+ ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos);
+ if (ret)
+ return ret;
+
+ pdev->msix.pos = pos;
+ pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX;
+ break;
+ }
+ }
+
+ /* Wipe remaining capabilities */
+ pos = PCI_STD_HEADER_SIZEOF;
+ size = PCI_DEV_CFG_SIZE - PCI_STD_HEADER_SIZEOF;
+ memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size);
return 0;
}
static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
{
- ssize_t sz = PCI_STD_HEADER_SIZEOF;
+ ssize_t sz = PCI_DEV_CFG_SIZE;
struct vfio_region_info *info;
struct vfio_pci_device *pdev = &vdev->pci;
@@ -89,6 +476,7 @@ static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
return -EINVAL;
}
+ /* Read standard headers and capabilities */
if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) {
vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz);
return -EIO;
@@ -103,6 +491,9 @@ static int vfio_pci_parse_cfg_space(struct vfio_device *vdev)
return -EOPNOTSUPP;
}
+ if (pdev->hdr.irq_pin)
+ pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX;
+
vfio_pci_parse_caps(vdev);
return 0;
@@ -112,6 +503,7 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
{
int i;
ssize_t hdr_sz;
+ struct msix_cap *msix;
struct vfio_region_info *info;
struct vfio_pci_device *pdev = &vdev->pci;
@@ -144,6 +536,22 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
*/
pdev->hdr.exp_rom_bar = 0;
+ /* Plumb in our fake MSI-X capability, if we have it. */
+ msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX);
+ if (msix) {
+ /* Add a shortcut to the PBA region for the MMIO handler */
+ int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar;
+ pdev->msix_pba.offset = vdev->regions[pba_index].info.offset +
+ (msix->pba_offset & PCI_MSIX_PBA_OFFSET);
+
+ /* Tidy up the capability */
+ msix->table_offset &= PCI_MSIX_TABLE_BIR;
+ msix->pba_offset &= PCI_MSIX_PBA_BIR;
+ if (pdev->msix_table.bar == pdev->msix_pba.bar)
+ msix->pba_offset |= pdev->msix_table.size &
+ PCI_MSIX_PBA_OFFSET;
+ }
+
/* Install our fake Configuration Space */
info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
hdr_sz = PCI_DEV_CFG_SIZE;
@@ -164,11 +572,84 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
return 0;
}
+static int vfio_pci_create_msix_table(struct kvm *kvm,
+ struct vfio_pci_device *pdev)
+{
+ int ret;
+ size_t i;
+ size_t mmio_size;
+ size_t nr_entries;
+ struct vfio_pci_msi_entry *entries;
+ struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
+ struct vfio_pci_msix_table *table = &pdev->msix_table;
+ struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos);
+
+ table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
+ pba->bar = msix->pba_offset & PCI_MSIX_PBA_BIR;
+
+ /*
+ * KVM needs memory regions to be a multiple of, and aligned on,
+ * PAGE_SIZE.
+ */
+ nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+ table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, PAGE_SIZE);
+ /* The PBA packs one pending bit per vector into 64-bit words */
+ pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64) * sizeof(u64), PAGE_SIZE);
+
+ entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry));
+ if (!entries)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_entries; i++)
+ entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+
+ /*
+ * To ease MSI-X capability configuration when the table and PBA share
+ * the same BAR, place them contiguously in a single region. The size of
+ * BAR regions must be a power of two.
+ */
+ mmio_size = roundup_pow_of_two(table->size + pba->size);
+ table->guest_phys_addr = pci_get_io_space_block(mmio_size);
+ if (!table->guest_phys_addr) {
+ pr_err("cannot allocate IO space");
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ pba->guest_phys_addr = table->guest_phys_addr + table->size;
+
+ ret = kvm__register_mmio(kvm, table->guest_phys_addr, table->size,
+ false, vfio_pci_msix_table_access, pdev);
+ if (ret < 0)
+ goto out_free;
+
+ /*
+ * We could map the physical PBA directly into the guest, but it's likely
+ * smaller than a page, and we can only hand full pages to the guest.
+ * Even though the PCI spec disallows sharing a page used for MSI-X with
+ * any other resource, it does allow the MSI-X table and PBA to share the
+ * same page. For the sake of isolation, create a virtual PBA instead.
+ */
+ ret = kvm__register_mmio(kvm, pba->guest_phys_addr, pba->size, false,
+ vfio_pci_msix_pba_access, pdev);
+ if (ret < 0)
+ goto out_free;
+
+ pdev->msix.entries = entries;
+ pdev->msix.nr_entries = nr_entries;
+
+ return 0;
+
+out_free:
+ free(entries);
+
+ return ret;
+}
+
static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
size_t nr)
{
int ret;
size_t map_size;
+ struct vfio_pci_device *pdev = &vdev->pci;
struct vfio_region *region = &vdev->regions[nr];
if (nr >= vdev->info.num_regions)
@@ -190,6 +671,17 @@ static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev,
if (!region->info.size)
return 0;
+ if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
+ /* Trap and emulate MSI-X table */
+ if (nr == pdev->msix_table.bar) {
+ region->guest_phys_addr = pdev->msix_table.guest_phys_addr;
+ return 0;
+ } else if (nr == pdev->msix_pba.bar) {
+ region->guest_phys_addr = pdev->msix_pba.guest_phys_addr;
+ return 0;
+ }
+ }
+
/* Grab some MMIO space in the guest */
map_size = ALIGN(region->info.size, PAGE_SIZE);
region->guest_phys_addr = pci_get_io_space_block(map_size);
@@ -218,6 +710,12 @@ static int vfio_pci_configure_dev_regions(struct kvm *kvm,
if (ret)
return ret;
+ if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
+ ret = vfio_pci_create_msix_table(kvm, pdev);
+ if (ret)
+ return ret;
+ }
+
for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
/* Ignore top half of 64-bit BAR */
if (i % 2 && is_64bit)
@@ -237,6 +735,122 @@ static int vfio_pci_configure_dev_regions(struct kvm *kvm,
return vfio_pci_fixup_cfg_space(vdev);
}
+/*
+ * Attempt to raise the FD limit if opening an eventfd for each IRQ vector
+ * would hit it, which is likely to happen when a device uses 2048 MSIs.
+ */
+static int vfio_pci_reserve_irq_fds(size_t num)
+{
+ /*
+ * I counted around 27 fds under normal load. Let's add 100 for good
+ * measure.
+ */
+ static size_t needed = 128;
+ struct rlimit fd_limit, new_limit;
+
+ needed += num;
+
+ if (getrlimit(RLIMIT_NOFILE, &fd_limit)) {
+ perror("getrlimit(RLIMIT_NOFILE)");
+ return 0;
+ }
+
+ if (fd_limit.rlim_cur >= needed)
+ return 0;
+
+ new_limit.rlim_cur = needed;
+
+ if (fd_limit.rlim_max < needed)
+ /* Try to bump hard limit (root only) */
+ new_limit.rlim_max = needed;
+ else
+ new_limit.rlim_max = fd_limit.rlim_max;
+
+ if (setrlimit(RLIMIT_NOFILE, &new_limit)) {
+ perror("setrlimit(RLIMIT_NOFILE)");
+ pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)",
+ (size_t)(needed - fd_limit.rlim_cur));
+ }
+
+ return 0;
+}
+
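+/*
+ * Query MSI/MSI-X information from VFIO and prepare the vfio_irq_set buffer
+ * passed to VFIO_DEVICE_SET_IRQS, with one eventfd slot per vector.
+ */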
+static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev,
+ struct vfio_pci_msi_common *msis)
+{
+ int ret;
+ size_t i;
+ int *eventfds;
+ size_t irq_set_size;
+ struct vfio_pci_msi_entry *entry;
+ size_t nr_entries = msis->nr_entries;
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info);
+ if (ret || msis->info.count == 0) {
+ vfio_dev_err(vdev, "no MSI reported by VFIO");
+ return -ENODEV;
+ }
+
+ if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+ vfio_dev_err(vdev, "interrupt not EVENTFD capable");
+ return -EINVAL;
+ }
+
+ if (msis->info.count != nr_entries) {
+ vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO");
+ return -EINVAL;
+ }
+
+ mutex_init(&msis->mutex);
+
+ vfio_pci_reserve_irq_fds(nr_entries);
+
+ irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int);
+ msis->irq_set = malloc(irq_set_size);
+ if (!msis->irq_set)
+ return -ENOMEM;
+
+ *msis->irq_set = (struct vfio_irq_set) {
+ .argsz = irq_set_size,
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = msis->info.index,
+ .start = 0,
+ .count = nr_entries,
+ };
+
+ eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set);
+
+ for (i = 0; i < nr_entries; i++) {
+ entry = &msis->entries[i];
+ entry->gsi = -1;
+ entry->eventfd = -1;
+ msi_set_masked(entry->virt_state, true);
+ msi_set_masked(entry->phys_state, true);
+ eventfds[i] = -1;
+ }
+
+ return 0;
+}
+
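+/* Tear down the INTx route and trigger before switching to MSI or MSI-X */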
+static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev)
+{
+ struct vfio_pci_device *pdev = &vdev->pci;
+ int gsi = pdev->intx_gsi;
+ struct vfio_irq_set irq_set = {
+ .argsz = sizeof(irq_set),
+ .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = VFIO_PCI_INTX_IRQ_INDEX,
+ };
+
+ pr_debug("user requested MSI, disabling INTx %d", gsi);
+
+ ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+ irq__del_irqfd(kvm, gsi, pdev->intx_fd);
+
+ close(pdev->intx_fd);
+}
+
static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
{
int ret;
@@ -251,6 +865,8 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
.index = VFIO_PCI_INTX_IRQ_INDEX,
};
+ vfio_pci_reserve_irq_fds(2);
+
ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
if (ret || irq_info.count == 0) {
vfio_dev_err(vdev, "no INTx reported by VFIO");
@@ -319,6 +935,10 @@ static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev)
goto err_remove_event;
}
+ pdev->intx_fd = trigger_fd;
+ /* The guest is going to overwrite our irq_line... */
+ pdev->intx_gsi = gsi;
+
return 0;
err_remove_event:
@@ -338,20 +958,23 @@ err_close:
static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev)
{
+ int ret = 0;
struct vfio_pci_device *pdev = &vdev->pci;
- struct vfio_irq_info irq_info = {
- .argsz = sizeof(irq_info),
- .index = VFIO_PCI_INTX_IRQ_INDEX,
- };
-
- if (!pdev->hdr.irq_pin) {
- /* TODO: add MSI support */
- vfio_dev_err(vdev, "INTx not available, MSI-X not implemented");
- return -ENOSYS;
+ if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) {
+ pdev->msix.info = (struct vfio_irq_info) {
+ .argsz = sizeof(pdev->msix.info),
+ .index = VFIO_PCI_MSIX_IRQ_INDEX,
+ };
+ ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix);
+ if (ret)
+ return ret;
}
- return vfio_pci_enable_intx(kvm, vdev);
+ if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX)
+ ret = vfio_pci_enable_intx(kvm, vdev);
+
+ return ret;
}
int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
@@ -387,9 +1010,13 @@ int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev)
void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev)
{
size_t i;
+ struct vfio_pci_device *pdev = &vdev->pci;
for (i = 0; i < vdev->info.num_regions; i++)
vfio_unmap_region(kvm, &vdev->regions[i]);
device__unregister(&vdev->dev_hdr);
+
+ free(pdev->msix.irq_set);
+ free(pdev->msix.entries);
}