@@ -203,6 +203,7 @@ obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
obj-i386-y += debugcon.o multiboot.o
obj-i386-y += pc_piix.o
+obj-i386-y += vfio.o
# shared objects
obj-ppc-y = ppc.o
new file mode 100644
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Portions derived from drivers/uio/uio.c:
+ * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
+ * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
+ * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Portions derived from drivers/uio/uio_pci_generic.c:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ */
+#include <linux/types.h>
+
+/*
+ * VFIO driver - allow mapping and use of certain PCI devices
+ * in unprivileged user processes, provided an IOMMU is present.
+ * Especially useful for the Virtual Function parts of SR-IOV devices.
+ */
+
+#ifdef __KERNEL__
+
+struct vfio_nl_client {
+ struct list_head list;
+ u64 msgcap;
+ struct net *net;
+ u32 pid;
+};
+
+struct perm_bits;
+struct vfio_dev {
+ struct device *dev;
+ struct pci_dev *pdev;
+ char name[8];
+ u8 *pci_config_map;
+ int pci_config_size;
+ int devnum;
+ void __iomem *barmap[PCI_ROM_RESOURCE+1];
+ spinlock_t irqlock; /* guards command register accesses */
+ int listeners;
+ u32 locked_pages;
+ struct mutex lgate; /* listener gate */
+ struct mutex dgate; /* dma op gate */
+ struct mutex igate; /* intr op gate */
+ struct mutex ngate; /* netlink op gate */
+ struct list_head nlc_list; /* netlink clients */
+ wait_queue_head_t dev_idle_q;
+ wait_queue_head_t nl_wait_q;
+ u32 nl_reply_seq;
+ u32 nl_reply_value;
+ int mapcount;
+ struct uiommu_domain *udomain;
+ int cachec;
+ struct msix_entry *msix;
+ struct eventfd_ctx *ev_irq;
+ struct eventfd_ctx **ev_msi;
+ struct eventfd_ctx **ev_msix;
+ int msi_nvec;
+ int msix_nvec;
+ u8 *vconfig;
+ u32 rbar[7]; /* copies of real bars */
+ u8 msi_qmax;
+ u8 bardirty;
+ struct perm_bits *msi_perm;
+};
+
+struct vfio_listener {
+ struct vfio_dev *vdev;
+ struct list_head dm_list;
+ struct mm_struct *mm;
+ struct mmu_notifier mmu_notifier;
+};
+
+/*
+ * Structure for keeping track of memory pinned by the
+ * user for DMA
+ */
+struct dma_map_page {
+ struct list_head list;
+ struct page **pages;
+ dma_addr_t daddr;
+ unsigned long vaddr;
+ int npage;
+ int rdwr;
+};
+
+/* VFIO class infrastructure */
+struct vfio_class {
+ struct kref kref;
+ struct class *class;
+};
+extern struct vfio_class *vfio_class;
+
+ssize_t vfio_io_readwrite(int, struct vfio_dev *,
+ char __user *, size_t, loff_t *);
+ssize_t vfio_mem_readwrite(int, struct vfio_dev *,
+ char __user *, size_t, loff_t *);
+ssize_t vfio_config_readwrite(int, struct vfio_dev *,
+ char __user *, size_t, loff_t *);
+
+void vfio_drop_msi(struct vfio_dev *);
+void vfio_drop_msix(struct vfio_dev *);
+int vfio_setup_msi(struct vfio_dev *, int, void __user *);
+int vfio_setup_msix(struct vfio_dev *, int, void __user *);
+
+#ifndef PCI_MSIX_ENTRY_SIZE
+#define PCI_MSIX_ENTRY_SIZE 16
+#endif
+#ifndef PCI_STATUS_INTERRUPT
+#define PCI_STATUS_INTERRUPT 0x08
+#endif
+
+struct vfio_dma_map;
+void vfio_dma_unmapall(struct vfio_listener *);
+int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *);
+int vfio_dma_map_common(struct vfio_listener *, unsigned int,
+ struct vfio_dma_map *);
+int vfio_domain_set(struct vfio_dev *, int, int);
+int vfio_domain_unset(struct vfio_dev *);
+
+int vfio_class_init(void);
+void vfio_class_destroy(void);
+int vfio_dev_add_attributes(struct vfio_dev *);
+int vfio_build_config_map(struct vfio_dev *);
+
+int vfio_nl_init(void);
+void vfio_nl_freeclients(struct vfio_dev *);
+void vfio_nl_exit(void);
+int vfio_nl_remove(struct vfio_dev *);
+int vfio_validate(struct vfio_dev *);
+int vfio_nl_upcall(struct vfio_dev *, u8, int, int);
+void vfio_pm_process_reply(int);
+pci_ers_result_t vfio_error_detected(struct pci_dev *, pci_channel_state_t);
+pci_ers_result_t vfio_mmio_enabled(struct pci_dev *);
+pci_ers_result_t vfio_link_reset(struct pci_dev *);
+pci_ers_result_t vfio_slot_reset(struct pci_dev *);
+void vfio_error_resume(struct pci_dev *);
+#define VFIO_ERROR_REPLY_TIMEOUT (3*HZ)
+#define VFIO_SUSPEND_REPLY_TIMEOUT (5*HZ)
+
+irqreturn_t vfio_interrupt(int, void *);
+
+#endif /* __KERNEL__ */
+
+/* Kernel & User level defines for ioctls */
+
+/*
+ * Structure for DMA mapping of user buffers
+ * vaddr, dmaaddr, and size must all be page aligned
+ * The buffer may only be larger than 1 page if (a) there is
+ * an IOMMU in the system, or (b) the buffer is part of a huge page
+ */
+struct vfio_dma_map {
+ __u64 vaddr; /* process virtual addr */
+ __u64 dmaaddr; /* desired and/or returned dma address */
+ __u64 size; /* size in bytes */
+ __u64 flags; /* bool: 0 for r/o; 1 for r/w */
+#define VFIO_FLAG_WRITE 0x1 /* req writeable DMA mem */
+};
+
+/* map user pages at specific dma address */
+/* requires previous VFIO_DOMAIN_SET */
+#define VFIO_DMA_MAP_IOVA _IOWR(';', 101, struct vfio_dma_map)
+
+/* unmap user pages */
+#define VFIO_DMA_UNMAP _IOW(';', 102, struct vfio_dma_map)
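+
+/*
+ * Illustrative sketch (comment only, not part of the interface): pinning
+ * one page of a user buffer for device DMA at a caller-chosen IOVA,
+ * assuming "fd" is an open vfio device on which VFIO_DOMAIN_SET has
+ * already succeeded and "buf"/"iova" are page aligned:
+ *
+ *	struct vfio_dma_map map = {
+ *		.vaddr   = (__u64)(unsigned long)buf,
+ *		.dmaaddr = iova,
+ *		.size    = 4096,
+ *		.flags   = VFIO_FLAG_WRITE,	-- device may write to buf
+ *	};
+ *
+ *	if (ioctl(fd, VFIO_DMA_MAP_IOVA, &map))
+ *		perror("VFIO_DMA_MAP_IOVA");
+ *	...
+ *	ioctl(fd, VFIO_DMA_UNMAP, &map);
+ */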
+
+/* request IRQ interrupts; use given eventfd */
+#define VFIO_EVENTFD_IRQ _IOW(';', 103, int)
+
+/* Request MSI interrupts: arg[0] is #, arg[1-n] are eventfds */
+#define VFIO_EVENTFDS_MSI _IOW(';', 104, int)
+
+/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */
+#define VFIO_EVENTFDS_MSIX _IOW(';', 105, int)
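+
+/*
+ * Sketch (comment only) of the eventfd array layout used by
+ * VFIO_EVENTFDS_MSI and VFIO_EVENTFDS_MSIX, e.g. requesting two MSI
+ * vectors on an open vfio device "fd"; event_fd0/event_fd1 are assumed
+ * to have been created with eventfd(2):
+ *
+ *	int fds[3];
+ *
+ *	fds[0] = 2;		-- number of vectors requested
+ *	fds[1] = event_fd0;
+ *	fds[2] = event_fd1;
+ *	if (ioctl(fd, VFIO_EVENTFDS_MSI, fds))
+ *		perror("VFIO_EVENTFDS_MSI");
+ *
+ * A vector count of 0 releases any previously requested vectors.
+ */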
+
+/* Get length of a BAR */
+#define VFIO_BAR_LEN _IOWR(';', 106, __u32)
+
+/* Set the IOMMU domain - arg is fd from uiommu driver */
+#define VFIO_DOMAIN_SET _IOW(';', 107, int)
+
+/* Unset the IOMMU domain */
+#define VFIO_DOMAIN_UNSET _IO(';', 108)
+
+/* Re-enable INTx */
+#define VFIO_IRQ_EOI _IO(';', 109)
+
+/* Re-enable INTx via eventfd */
+#define VFIO_IRQ_EOI_EVENTFD _IOW(';', 110, int)
+
+/*
+ * Reads, writes, and mmaps determine which PCI BAR (or config space)
+ * to access from the high bits of the file offset
+ */
+#define VFIO_PCI_BAR0_RESOURCE 0x0
+#define VFIO_PCI_BAR1_RESOURCE 0x1
+#define VFIO_PCI_BAR2_RESOURCE 0x2
+#define VFIO_PCI_BAR3_RESOURCE 0x3
+#define VFIO_PCI_BAR4_RESOURCE 0x4
+#define VFIO_PCI_BAR5_RESOURCE 0x5
+#define VFIO_PCI_ROM_RESOURCE 0x6
+#define VFIO_PCI_CONFIG_RESOURCE 0xF
+#define VFIO_PCI_SPACE_SHIFT 32
+#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE)
+
+static inline int vfio_offset_to_pci_space(__u64 off)
+{
+ return (off >> VFIO_PCI_SPACE_SHIFT) & 0xF;
+}
+
+static inline __u32 vfio_offset_to_pci_offset(__u64 off)
+{
+ return off & (__u32)0xFFFFFFFF;
+}
+
+static inline __u64 vfio_pci_space_to_offset(int sp)
+{
+ return (__u64)(sp) << VFIO_PCI_SPACE_SHIFT;
+}
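+
+/*
+ * Sketch (comment only): reading 32 bits from BAR0 offset 0x10 and from
+ * config space offset 0 of an open vfio device "fd", using the offset
+ * encoding above:
+ *
+ *	__u32 val;
+ *
+ *	pread(fd, &val, 4,
+ *	      vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE) + 0x10);
+ *	pread(fd, &val, 4, VFIO_PCI_CONFIG_OFF + 0);
+ */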
+
+/*
+ * Netlink defines:
+ */
+#define VFIO_GENL_NAME "VFIO"
+
+/* message types */
+enum {
+ VFIO_MSG_INVAL = 0,
+ /* kernel to user */
+ VFIO_MSG_REMOVE, /* unbind, module or hotplug remove */
+ VFIO_MSG_ERROR_DETECTED, /* pci err handling - error detected */
+ VFIO_MSG_MMIO_ENABLED, /* pci err handling - mmio enabled */
+ VFIO_MSG_LINK_RESET, /* pci err handling - link reset */
+ VFIO_MSG_SLOT_RESET, /* pci err handling - slot reset */
+ VFIO_MSG_ERROR_RESUME, /* pci err handling - resume normal */
+ VFIO_MSG_PM_SUSPEND, /* suspend or hibernate notification */
+ VFIO_MSG_PM_RESUME, /* resume after suspend or hibernate */
+ /* user to kernel */
+ VFIO_MSG_REGISTER,
+ VFIO_MSG_ERROR_HANDLING_REPLY, /* err handling reply */
+ VFIO_MSG_PM_SUSPEND_REPLY, /* suspend notify reply */
+};
+
+/* attributes */
+enum {
+ VFIO_ATTR_UNSPEC,
+ VFIO_ATTR_MSGCAP, /* bitmask of messages desired */
+ VFIO_ATTR_PCI_DOMAIN,
+ VFIO_ATTR_PCI_BUS,
+ VFIO_ATTR_PCI_SLOT,
+ VFIO_ATTR_PCI_FUNC,
+ VFIO_ATTR_CHANNEL_STATE,
+ VFIO_ATTR_ERROR_HANDLING_REPLY,
+ VFIO_ATTR_PM_SUSPEND_REPLY,
+ __VFIO_NL_ATTR_MAX
+};
+#define VFIO_NL_ATTR_MAX (__VFIO_NL_ATTR_MAX - 1)
new file mode 100644
@@ -0,0 +1,1398 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ * Adapted for KVM by Qumranet.
+ * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <dirent.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "config.h"
+#include "event_notifier.h"
+#include "hw.h"
+#include "kvm.h"
+#include "memory.h"
+#include "monitor.h"
+#include "msi.h"
+#include "msix.h"
+#include "pc.h"
+#include "qemu-error.h"
+#include "range.h"
+#include "vfio.h"
+#include <pci/header.h>
+#include <pci/types.h>
+#include <linux/types.h>
+#include "linux-vfio.h"
+
+//#define DEBUG_VFIO
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+ do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+ do { } while (0)
+#endif
+
+/* TODO: msix.h should define these */
+#define MSIX_CAP_LENGTH 12
+#define MSIX_PAGE_SIZE 0x1000
+
+/* XXX: on qemu-kvm.git we have msix/intx notifiers and irqfds. With these
+ * we can allow interrupts to bypass userspace. There's no good #define to
+ * figure out when these are present, so we toggle on the device assignment
+ * ifdef even though it has no relation to the bits we're looking for. */
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+#define QEMU_KVM_BUILD
+#endif
+
+static void vfio_disable_interrupts(VFIODevice *vdev);
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+ uint32_t val, int len);
+/*
+ * Generic
+ */
+static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap)
+{
+ int id;
+ int max_cap = 48;
+ int pos = PCI_CAPABILITY_LIST;
+ int status;
+
+ status = pdev->config[PCI_STATUS];
+ if ((status & PCI_STATUS_CAP_LIST) == 0) {
+ return 0;
+ }
+
+ while (max_cap--) {
+ pos = pdev->config[pos];
+ if (pos < 0x40) {
+ break;
+ }
+
+ pos &= ~3;
+ id = pdev->config[pos + PCI_CAP_LIST_ID];
+
+ if (id == 0xff) {
+ break;
+ }
+ if (id == cap) {
+ return pos;
+ }
+
+ pos += PCI_CAP_LIST_NEXT;
+ }
+ return 0;
+}
+
+static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str)
+{
+ PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop);
+ const char *p = str;
+ int n, seg, bus, dev, func;
+ char field[5];
+
+ if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') {
+ return -EINVAL;
+ }
+
+ seg = strtol(field, NULL, 16);
+ p += n + 1;
+
+ if (sscanf(p, "%4[^:]%n", field, &n) != 1) {
+ return -EINVAL;
+ }
+
+ if (p[n] == ':') {
+ bus = strtol(field, NULL, 16);
+ p += n + 1;
+ } else {
+ bus = seg;
+ seg = 0;
+ }
+
+ if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') {
+ return -EINVAL;
+ }
+
+ dev = strtol(field, NULL, 16);
+ p += n + 1;
+
+ if (!qemu_isdigit(*p)) {
+ return -EINVAL;
+ }
+
+ func = *p - '0';
+
+ ptr->seg = seg;
+ ptr->bus = bus;
+ ptr->dev = dev;
+ ptr->func = func;
+ return 0;
+}
+
+static int print_hostaddr(DeviceState *qdev, Property *prop,
+ char *dest, size_t len)
+{
+ PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop);
+
+ return snprintf(dest, len, "%04x:%02x:%02x.%x",
+ ptr->seg, ptr->bus, ptr->dev, ptr->func);
+}
+
+/*
+ * INTx
+ */
+static inline void vfio_unmask_intx(VFIODevice *vdev)
+{
+ ioctl(vdev->vfiofd, VFIO_IRQ_EOI);
+}
+
+static void vfio_intx_interrupt(void *opaque)
+{
+ VFIODevice *vdev = opaque;
+
+ if (!event_notifier_test_and_clear(&vdev->intx.notifier)) {
+ return;
+ }
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func,
+ 'A' + vdev->intx.pin);
+
+ vdev->intx.pending = true;
+ qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+}
+
+static void vfio_eoi(ioapic_eoi_client *client)
+{
+ VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client);
+
+ if (!vdev->intx.irqfd_enabled) {
+ if (!vdev->intx.pending) {
+ return;
+ }
+
+ vdev->intx.pending = false;
+
+ /* If the interrupt is injected via qemu (not irqfd), we need to
+ * deassert the interrupt here so qemu knows about the level change.
+ * Otherwise the next interrupt won't make it out of qemu. Interrupts
+ * via irqfd are completely outside of qemu, so we can skip it. */
+ qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+ }
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func);
+
+ vfio_unmask_intx(vdev);
+}
+
+/* Wrappers for EOI client setup that allow VFIO to directly consume the
+ * eventfd from KVM. This serves the same purpose as irqfd for the EOI. */
+static int vfio_enable_eoi_client(VFIODevice *vdev)
+{
+ int fd, ret;
+
+ ret = ioapic_register_eoi_client(&vdev->intx.eoi_client);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Exiting here is ok, it just means EOIs bounce through qemu */
+ fd = ioapic_eoi_client_get_fd(&vdev->intx.eoi_client);
+ if (fd < 0) {
+ return 0;
+ }
+
+ ret = ioctl(vdev->vfiofd, VFIO_IRQ_EOI_EVENTFD, &fd);
+ if (ret < 0) {
+ fprintf(stderr, "vfio: VFIO_IRQ_EOI_EVENTFD setup - %s (%d)\n",
+ strerror(-ret), ret);
+ return ret;
+ }
+ qemu_set_fd_handler(fd, NULL, NULL, NULL);
+
+ return 0;
+}
+
+static void vfio_disable_eoi_client(VFIODevice *vdev)
+{
+ int fd = -1;
+
+ ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+ ioctl(vdev->vfiofd, VFIO_IRQ_EOI_EVENTFD, &fd);
+}
+
+/* Attempt to send the VFIO eventfd directly into the KVM irqchip */
+static void vfio_set_intx_handler(VFIODevice *vdev, IOHandler *fd_read,
+ bool irqfd_enable)
+{
+ int fd = event_notifier_get_fd(&vdev->intx.notifier);
+#ifdef QEMU_KVM_BUILD
+ int ret;
+
+ ret = kvm_set_irqfd(vdev->intx.eoi_client.irq, fd, irqfd_enable);
+ if (ret < 0) {
+ if (kvm_enabled() && kvm_irqchip_in_kernel()) {
+ fprintf(stderr, "vfio: Error: irqfd %s failed - %s (%d)\n",
+ irqfd_enable ? "enable" : "disable", strerror(-ret), ret);
+ goto out;
+ }
+ }
+
+ vdev->intx.irqfd_enabled = irqfd_enable;
+out:
+#endif
+ if (vdev->intx.irqfd_enabled) {
+ qemu_set_fd_handler(fd, NULL, NULL, NULL);
+ } else {
+ qemu_set_fd_handler(fd, fd_read, NULL, vdev);
+ }
+}
+
+static void vfio_update_irqs(PCIDevice *pdev)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+ int irq = pci_get_irq(pdev, vdev->intx.pin);
+
+ if (irq == vdev->intx.eoi_client.irq) {
+ return;
+ }
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, vdev->intx.eoi_client.irq, irq);
+
+ vfio_set_intx_handler(vdev, vfio_intx_interrupt, false);
+ vfio_disable_eoi_client(vdev);
+
+ vdev->intx.eoi_client.irq = irq;
+
+ if (irq < 0) {
+ fprintf(stderr, "vfio: Error - INTx moved to IRQ %d\n", irq);
+ return;
+ }
+
+ vfio_enable_eoi_client(vdev);
+ vfio_set_intx_handler(vdev, vfio_intx_interrupt, true);
+
+ /* Re-enable the interrupt in case we missed an EOI */
+ vfio_eoi(&vdev->intx.eoi_client);
+}
+
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+ int fd;
+ uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+
+ if (!pin) {
+ return 0;
+ }
+
+ vfio_disable_interrupts(vdev);
+
+ vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+ vdev->intx.eoi_client.eoi = vfio_eoi;
+ vdev->intx.eoi_client.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
+
+ vfio_enable_eoi_client(vdev);
+
+ pci_register_update_irqs(&vdev->pdev, vfio_update_irqs);
+
+ if (event_notifier_init(&vdev->intx.notifier, 0)) {
+ fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+ return -1;
+ }
+
+ vfio_set_intx_handler(vdev, vfio_intx_interrupt, true);
+
+ fd = event_notifier_get_fd(&vdev->intx.notifier);
+
+ if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) {
+ fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n",
+ strerror(errno));
+ return -1;
+ }
+ vfio_unmask_intx(vdev);
+
+ vdev->interrupt = INT_INTx;
+
+ DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func);
+
+ return 0;
+}
+
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+ int fd = -1;
+
+ if (vdev->interrupt != INT_INTx) {
+ return;
+ }
+
+ pci_register_update_irqs(&vdev->pdev, NULL);
+ vfio_set_intx_handler(vdev, NULL, false);
+ vfio_disable_eoi_client(vdev);
+ ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd);
+ event_notifier_cleanup(&vdev->intx.notifier);
+ vdev->interrupt = INT_NONE;
+
+ DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func);
+}
+
+/*
+ * MSI-X
+ */
+static void vfio_msix_interrupt(void *opaque)
+{
+ MSIVector *vec = opaque;
+ VFIODevice *vdev = vec->vdev;
+
+ if (!event_notifier_test_and_clear(&vec->notifier)) {
+ return;
+ }
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func, vec->vector);
+
+ msix_notify(&vdev->pdev, vec->vector);
+}
+
+#ifdef QEMU_KVM_BUILD
+/* When a vector is masked, we disable the irqfd, forcing the interrupt
+ * through qemu userspace. We can then filter masked vectors in msix_notify. */
+static int vfio_msix_mask_notify(PCIDevice *pdev, unsigned vector, int masked)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+ int fd, ret;
+
+ fd = event_notifier_get_fd(&vdev->msi_vectors[vector].notifier);
+ ret = kvm_set_irqfd(pdev->msix_irq_entries[vector].gsi, fd, !masked);
+ if (ret == -ENOSYS) {
+ return 0; /* w/o irqfd, interrupts pass through qemu anyway */
+ } else if (ret < 0) {
+ fprintf(stderr, "vfio: Error - irqfd setup failed\n");
+ return ret;
+ }
+
+ if (masked) {
+ qemu_set_fd_handler(fd, vfio_msix_interrupt, NULL,
+ &vdev->msi_vectors[vector]);
+ } else {
+ qemu_set_fd_handler(fd, NULL, NULL, NULL);
+ }
+
+ return ret;
+}
+#endif
+
+static void vfio_enable_msix(VFIODevice *vdev)
+{
+ int i, *fds;
+
+ vfio_disable_interrupts(vdev);
+
+ vdev->nr_vectors = vdev->pdev.msix_entries_nr;
+ vdev->msi_vectors = qemu_malloc(vdev->nr_vectors * sizeof(MSIVector));
+
+ fds = qemu_malloc((vdev->nr_vectors + 1) * sizeof(int));
+ fds[0] = vdev->nr_vectors;
+
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ vdev->msi_vectors[i].vdev = vdev;
+ vdev->msi_vectors[i].vector = i;
+
+ if (event_notifier_init(&vdev->msi_vectors[i].notifier, 0)) {
+ fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+ }
+
+ fds[i + 1] = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+ qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL,
+ &vdev->msi_vectors[i]);
+
+ if (msix_vector_use(&vdev->pdev, i) < 0) {
+ fprintf(stderr, "vfio: Error msix_vector_use\n");
+ }
+ }
+
+ if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) {
+ fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n",
+ strerror(errno));
+ qemu_free(fds);
+ return;
+ }
+
+ vdev->interrupt = INT_MSIX;
+
+ qemu_free(fds);
+
+#ifdef QEMU_KVM_BUILD
+ if (msix_set_mask_notifier(&vdev->pdev, vfio_msix_mask_notify)) {
+ fprintf(stderr, "vfio: Error msix_set_mask_notifier\n");
+ }
+#endif
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d vectors\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, vdev->nr_vectors);
+}
+
+static void vfio_disable_msix(VFIODevice *vdev)
+{
+ int i, vectors = 0;
+
+ if (vdev->interrupt != INT_MSIX) {
+ return;
+ }
+
+ ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &vectors);
+
+#ifdef QEMU_KVM_BUILD
+ if (msix_unset_mask_notifier(&vdev->pdev)) {
+ fprintf(stderr, "vfio: Error msix_unset_mask_notifier\n");
+ }
+#endif
+
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ int fd = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+
+ msix_vector_unuse(&vdev->pdev, i);
+
+ qemu_set_fd_handler(fd, NULL, NULL, NULL);
+ event_notifier_cleanup(&vdev->msi_vectors[i].notifier);
+ }
+
+ qemu_free(vdev->msi_vectors);
+ vdev->nr_vectors = 0;
+ vdev->interrupt = INT_NONE;
+ vfio_enable_intx(vdev);
+
+ DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func);
+}
+
+/*
+ * MSI
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+ MSIVector *vec = opaque;
+ VFIODevice *vdev = vec->vdev;
+
+ if (!event_notifier_test_and_clear(&vec->notifier)) {
+ return;
+ }
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func, vec->vector);
+
+ msi_notify(&vdev->pdev, vec->vector);
+}
+
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+ int i, *fds;
+
+ vfio_disable_interrupts(vdev);
+
+ vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+ vdev->msi_vectors = qemu_malloc(vdev->nr_vectors * sizeof(MSIVector));
+
+ fds = qemu_malloc((vdev->nr_vectors + 1) * sizeof(int));
+ fds[0] = vdev->nr_vectors;
+
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ vdev->msi_vectors[i].vdev = vdev;
+ vdev->msi_vectors[i].vector = i;
+
+ if (event_notifier_init(&vdev->msi_vectors[i].notifier, 0)) {
+ fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+ }
+
+ fds[i + 1] = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+ qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL,
+ &vdev->msi_vectors[i]);
+ }
+
+ if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSI, fds)) {
+ fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n",
+ strerror(errno));
+ qemu_free(fds);
+ return;
+ }
+
+ vdev->interrupt = INT_MSI;
+
+ qemu_free(fds);
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d vectors\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, vdev->nr_vectors);
+}
+
+static void vfio_disable_msi(VFIODevice *vdev)
+{
+ int i, vectors = 0;
+
+ if (vdev->interrupt != INT_MSI) {
+ return;
+ }
+
+ ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSI, &vectors);
+
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ int fd = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+ qemu_set_fd_handler(fd, NULL, NULL, NULL);
+ event_notifier_cleanup(&vdev->msi_vectors[i].notifier);
+ }
+
+ qemu_free(vdev->msi_vectors);
+ vdev->nr_vectors = 0;
+ vdev->interrupt = INT_NONE;
+ vfio_enable_intx(vdev);
+
+ DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func);
+}
+
+/*
+ * IO Port/MMIO
+ */
+static void vfio_resource_write(PCIResource *res, uint32_t addr,
+ uint32_t val, int len)
+{
+ size_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar);
+
+ if (pwrite(res->vfiofd, &val, len, offset + addr) != len) {
+ fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: %s\n",
+ __FUNCTION__, addr, val, len, strerror(errno));
+ }
+ DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n", __FUNCTION__, res->bar,
+ addr, val, len);
+}
+
+static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr,
+ uint32_t val)
+{
+ vfio_resource_write(opaque, addr, val, 1);
+}
+
+static void vfio_resource_writew(void *opaque, target_phys_addr_t addr,
+ uint32_t val)
+{
+ vfio_resource_write(opaque, addr, val, 2);
+}
+
+static void vfio_resource_writel(void *opaque, target_phys_addr_t addr,
+ uint32_t val)
+{
+ vfio_resource_write(opaque, addr, val, 4);
+}
+
+static CPUWriteMemoryFunc * const vfio_resource_writes[] = {
+ &vfio_resource_writeb,
+ &vfio_resource_writew,
+ &vfio_resource_writel
+};
+
+static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val)
+{
+ PCIResource *res = opaque;
+ vfio_resource_write(res, addr - res->e_phys, val, 1);
+}
+
+static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val)
+{
+ PCIResource *res = opaque;
+ vfio_resource_write(res, addr - res->e_phys, val, 2);
+}
+
+static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
+{
+ PCIResource *res = opaque;
+ vfio_resource_write(res, addr - res->e_phys, val, 4);
+}
+
+static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len)
+{
+ size_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar);
+ uint32_t val;
+
+ if (pread(res->vfiofd, &val, len, offset + addr) != len) {
+ fprintf(stderr, "%s(,0x%x, %d) failed: %s\n",
+ __FUNCTION__, addr, len, strerror(errno));
+ return 0xffffffffU;
+ }
+ DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n", __FUNCTION__, res->bar,
+ addr, len, val);
+ return val;
+}
+
+static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr)
+{
+ return vfio_resource_read(opaque, addr, 1) & 0xff;
+}
+
+static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr)
+{
+ return vfio_resource_read(opaque, addr, 2) & 0xffff;
+}
+
+static uint32_t vfio_resource_readl(void *opaque, target_phys_addr_t addr)
+{
+ return vfio_resource_read(opaque, addr, 4);
+}
+
+static CPUReadMemoryFunc * const vfio_resource_reads[] = {
+ &vfio_resource_readb,
+ &vfio_resource_readw,
+ &vfio_resource_readl
+};
+
+static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr)
+{
+ PCIResource *res = opaque;
+ return vfio_resource_read(res, addr - res->e_phys, 1) & 0xff;
+}
+
+static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr)
+{
+ PCIResource *res = opaque;
+ return vfio_resource_read(res, addr - res->e_phys, 2) & 0xffff;
+}
+
+static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr)
+{
+ PCIResource *res = opaque;
+ return vfio_resource_read(res, addr - res->e_phys, 4);
+}
+
+static void vfio_ioport_map(PCIDevice *pdev, int bar,
+ pcibus_t e_phys, pcibus_t e_size, int type)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+ PCIResource *res = &vdev->resources[bar];
+
+ DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, bar, e_phys, e_size, type);
+
+ res->e_phys = e_phys;
+ res->e_size = e_size;
+
+ register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, res);
+ register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, res);
+ register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, res);
+ register_ioport_read(e_phys, e_size, 1, vfio_ioport_readb, res);
+ register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, res);
+ register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, res);
+}
+
+static void vfio_iomem_map(PCIDevice *pdev, int bar,
+ pcibus_t e_phys, pcibus_t e_size, int type)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+ PCIResource *res = &vdev->resources[bar];
+
+ DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, bar, e_phys, e_size, type);
+
+ res->e_phys = e_phys;
+ res->e_size = e_size;
+
+ if (res->msix) {
+ if (res->msix_offset > 0) {
+ cpu_register_physical_memory(e_phys, res->msix_offset, res->slow ?
+ res->io_mem : res->memory_index[0]);
+ }
+
+ DPRINTF("Overlaying MSI-X table page\n");
+ msix_mmio_map(pdev, bar, e_phys, e_size, type);
+
+ if (e_size > res->msix_offset + MSIX_PAGE_SIZE) {
+ uint32_t offset = res->msix_offset + MSIX_PAGE_SIZE;
+ e_phys += offset;
+ e_size -= offset;
+ cpu_register_physical_memory_offset(e_phys, e_size,
+ res->slow ? res->io_mem : res->memory_index[1],
+ res->slow ? offset : 0);
+ }
+ } else {
+ cpu_register_physical_memory(e_phys, e_size, res->slow ?
+ res->io_mem : res->memory_index[0]);
+ }
+}
+
+/*
+ * PCI config space
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+ uint32_t val = 0;
+
+ if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
+ (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+ ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
+ (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+ ranges_overlap(addr, len, pdev->msi_cap, pdev->msi_cap_size))) {
+
+ val = pci_default_read_config(pdev, addr, len);
+ } else {
+ if (pread(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) {
+ fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
+ __FUNCTION__, vdev->host.seg, vdev->host.bus,
+ vdev->host.dev, vdev->host.func, addr, len,
+ strerror(errno));
+ return -1;
+ }
+ }
+ DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, addr, len, val);
+ return val;
+}
+
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+ uint32_t val, int len)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+ DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n", __FUNCTION__,
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, addr, val, len);
+
+ /* Write everything to VFIO, let it filter out what we can't write */
+ if (pwrite(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) {
+ fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n",
+ __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, addr, val, len, strerror(errno));
+ }
+
+ /* Write standard header bits to emulation */
+ if (addr < 0x40) {
+ pci_default_write_config(pdev, addr, val, len);
+ return;
+ }
+
+ /* MSI/MSI-X Enabling/Disabling */
+ if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+ ranges_overlap(addr, len, pdev->msi_cap, pdev->msi_cap_size)) {
+ int is_enabled, was_enabled = msi_enabled(pdev);
+
+ pci_default_write_config(pdev, addr, val, len);
+ msi_write_config(pdev, addr, val, len);
+
+ is_enabled = msi_enabled(pdev);
+
+ if (!was_enabled && is_enabled) {
+ vfio_enable_msi(vdev);
+ } else if (was_enabled && !is_enabled) {
+ vfio_disable_msi(vdev);
+ }
+ }
+
+ if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+ ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
+ int is_enabled, was_enabled = msix_enabled(pdev);
+
+ pci_default_write_config(pdev, addr, val, len);
+ msix_write_config(pdev, addr, val, len);
+
+ is_enabled = msix_enabled(pdev);
+
+ if (!was_enabled && is_enabled) {
+ vfio_enable_msix(vdev);
+ } else if (was_enabled && !is_enabled) {
+ vfio_disable_msix(vdev);
+ }
+ }
+}
+
+/*
+ * DMA
+ */
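+/*
+ * Each RAM slot is registered with the IOMMU so the device sees a 1:1
+ * guest-physical to DMA address mapping: dmaaddr is the guest physical
+ * start address and vaddr is qemu's mapping of that RAM, so the guest
+ * can program the device with guest physical addresses directly.
+ */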
+static int vfio_dma_map(void *opaque, target_phys_addr_t start_addr,
+ ram_addr_t size, ram_addr_t phys_offset)
+{
+ VFIODevice *vdev = opaque;
+ struct vfio_dma_map dma_map;
+
+ dma_map.vaddr = (uint64_t)qemu_get_ram_ptr(phys_offset);
+ dma_map.dmaaddr = start_addr;
+ dma_map.size = size;
+ dma_map.flags = VFIO_FLAG_WRITE;
+
+ return ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map);
+}
+
+static int vfio_dma_unmap(void *opaque, target_phys_addr_t start_addr,
+ ram_addr_t size, ram_addr_t phys_offset)
+{
+ VFIODevice *vdev = opaque;
+ struct vfio_dma_map dma_map;
+
+ dma_map.vaddr = (uint64_t)qemu_get_ram_ptr(phys_offset);
+ dma_map.dmaaddr = start_addr;
+ dma_map.size = size;
+ dma_map.flags = VFIO_FLAG_WRITE;
+
+ return ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map);
+}
+
+static int vfio_map_iommu(VFIODevice *vdev)
+{
+ return qemu_ram_for_each_slot(vdev, vfio_dma_map);
+}
+
+static int vfio_unmap_iommu(VFIODevice *vdev)
+{
+ return qemu_ram_for_each_slot(vdev, vfio_dma_unmap);
+}
+
+/*
+ * Interrupt setup
+ */
+static void vfio_disable_interrupts(VFIODevice *vdev)
+{
+ switch (vdev->interrupt) {
+ case INT_INTx:
+ vfio_disable_intx(vdev);
+ break;
+ case INT_MSI:
+ vfio_disable_msi(vdev);
+ break;
+ case INT_MSIX:
+ vfio_disable_msix(vdev);
+ }
+}
+
+static int vfio_setup_msi(VFIODevice *vdev)
+{
+ int pos;
+
+ if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI))) {
+ uint16_t ctrl;
+ bool msi_64bit, msi_maskbit;
+ int entries;
+
+ if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl),
+ VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+ return -1;
+ }
+
+ ctrl = le16_to_cpu(ctrl);
+ msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
+ msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
+ entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+
+ DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.seg,
+ vdev->host.bus, vdev->host.dev, vdev->host.func, pos);
+
+ if (msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit) < 0) {
+ fprintf(stderr, "vfio: msi_init failed\n");
+ return -1;
+ }
+ }
+
+ if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX))) {
+ uint16_t ctrl;
+ uint32_t table, len, offset;
+ int bar, entries;
+
+ if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl),
+ VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+ return -1;
+ }
+
+ if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF +
+ pos + PCI_MSIX_TABLE) != sizeof(table)) {
+ return -1;
+ }
+
+ ctrl = le16_to_cpu(ctrl);
+ table = le32_to_cpu(table);
+
+ bar = table & PCI_MSIX_BIR;
+ offset = table & ~PCI_MSIX_BIR;
+ entries = (ctrl & PCI_MSIX_TABSIZE) + 1;
+
+ vdev->resources[bar].msix = true;
+ vdev->resources[bar].msix_offset = offset;
+
+ DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x\n",
+ vdev->host.seg, vdev->host.bus, vdev->host.dev,
+ vdev->host.func, pos, bar, offset);
+
+ len = table & PCI_MSIX_BIR;
+ if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+ fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n");
+ return -1;
+ }
+
+ if (msix_init(&vdev->pdev, entries, bar, len) < 0) {
+ fprintf(stderr, "vfio: msix_init failed\n");
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+ msi_uninit(&vdev->pdev);
+ msix_uninit(&vdev->pdev);
+}
+
+/*
+ * Resource setup
+ */
+static int vfio_setup_resources(VFIODevice *vdev)
+{
+ int i;
+
+ for (i = 0; i < PCI_ROM_SLOT; i++) {
+ uint32_t len, bar;
+ PCIResource *res;
+ uint8_t offset;
+ int ret, space;
+
+ res = &vdev->resources[i];
+ res->vfiofd = vdev->vfiofd;
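+ /* VFIO_BAR_LEN takes the BAR number as input and returns its length */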
+ res->bar = len = i;
+
+ if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+ fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i);
+ return -1;
+ }
+ if (!len) {
+ continue;
+ }
+
+ offset = PCI_BASE_ADDRESS_0 + (4 * i);
+ ret = pread(vdev->vfiofd, &bar, sizeof(bar),
+ VFIO_PCI_CONFIG_OFF + offset);
+ if (ret != sizeof(bar)) {
+ fprintf(stderr, "vfio: Failed to read BAR %d\n", i);
+ return -1;
+ }
+ bar = le32_to_cpu(bar);
+ space = bar & PCI_BASE_ADDRESS_SPACE;
+
+ if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) {
+ int off = VFIO_PCI_BAR0_RESOURCE + i;
+ int flags = PROT_READ | PROT_WRITE;
+ char name[32];
+
+ res->mem = true;
+ res->size = len;
+
+ if (vdev->pdev.qdev.info->vmsd) {
+ snprintf(name, sizeof(name), "%s.bar%d",
+ vdev->pdev.qdev.info->vmsd->name, i);
+ } else {
+ snprintf(name, sizeof(name), "%s.bar%d",
+ vdev->pdev.qdev.info->name, i);
+ }
+
+ if (res->msix) {
+ if (res->msix_offset) {
+ char *c = &name[strlen(name)];
+
+ res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags,
+ MAP_SHARED, vdev->vfiofd,
+ vfio_pci_space_to_offset(off));
+
+ if (res->r_virtbase[0] == MAP_FAILED) {
+ fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+ return -1;
+ }
+ strncat(name, ".0", sizeof(name) - strlen(name) - 1);
+ res->memory_index[0] =
+ qemu_ram_alloc_from_ptr(&vdev->pdev.qdev,
+ name, res->msix_offset,
+ res->r_virtbase[0]);
+ *c = 0;
+ }
+ if (len > res->msix_offset + MSIX_PAGE_SIZE) {
+ char *c = &name[strlen(name)];
+
+ res->r_virtbase[1] = mmap(NULL,
+ len - res->msix_offset - MSIX_PAGE_SIZE,
+ flags, MAP_SHARED, vdev->vfiofd,
+ vfio_pci_space_to_offset(off) +
+ res->msix_offset + MSIX_PAGE_SIZE);
+
+ if (res->r_virtbase[1] == MAP_FAILED) {
+ fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+ return -1;
+ }
+ strncat(name, ".1", sizeof(name) - strlen(name) - 1);
+ res->memory_index[1] =
+ qemu_ram_alloc_from_ptr(&vdev->pdev.qdev, name,
+ len - MSIX_PAGE_SIZE - res->msix_offset,
+ res->r_virtbase[1]);
+ *c = 0;
+ }
+ } else {
+ res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED,
+ vdev->vfiofd,
+ vfio_pci_space_to_offset(off));
+
+ if (res->r_virtbase[0] == MAP_FAILED) {
+ fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+ return -1;
+ }
+ res->memory_index[0] =
+ qemu_ram_alloc_from_ptr(&vdev->pdev.qdev,
+ name, len, res->r_virtbase[0]);
+ }
+
+ pci_register_bar(&vdev->pdev, i, res->size,
+ bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+ PCI_BASE_ADDRESS_MEM_PREFETCH :
+ PCI_BASE_ADDRESS_SPACE_MEMORY,
+ vfio_iomem_map);
+
+ if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ i++;
+ }
+ } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+ res->mem = true;
+ res->size = len;
+ res->slow = true;
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n",
+ __FUNCTION__, vdev->host.seg, vdev->host.bus,
+ vdev->host.dev, vdev->host.func, i);
+
+ res->io_mem = cpu_register_io_memory(vfio_resource_reads,
+ vfio_resource_writes, res);
+
+ pci_register_bar(&vdev->pdev, i, res->size,
+ bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+ PCI_BASE_ADDRESS_MEM_PREFETCH :
+ PCI_BASE_ADDRESS_SPACE_MEMORY,
+ vfio_iomem_map);
+
+ if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ i++;
+ }
+ } else if (space == PCI_BASE_ADDRESS_SPACE_IO) {
+ res->size = len;
+ pci_register_bar(&vdev->pdev, i, res->size,
+ PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map);
+ }
+ res->valid = true;
+ }
+ return 0;
+}
+
+static void vfio_unmap_resources(VFIODevice *vdev)
+{
+ int i;
+ PCIResource *res = vdev->resources;
+
+ for (i = 0; i < PCI_ROM_SLOT; i++, res++) {
+ if (res->valid && res->mem) {
+ if (res->msix) {
+ if (res->msix_offset) {
+ cpu_register_physical_memory(res->e_phys, res->msix_offset,
+ IO_MEM_UNASSIGNED);
+ qemu_ram_free_from_ptr(res->memory_index[0]);
+ munmap(res->r_virtbase[0], res->msix_offset);
+ }
+ if (res->size > res->msix_offset + MSIX_PAGE_SIZE) {
+ cpu_register_physical_memory(res->e_phys + MSIX_PAGE_SIZE +
+ res->msix_offset,
+ res->e_size - MSIX_PAGE_SIZE -
+ res->msix_offset,
+ IO_MEM_UNASSIGNED);
+ qemu_ram_free_from_ptr(res->memory_index[1]);
+ munmap(res->r_virtbase[1],
+ res->size - MSIX_PAGE_SIZE - res->msix_offset);
+ }
+ } else {
+ if (!res->slow) {
+ cpu_register_physical_memory(res->e_phys, res->e_size,
+ IO_MEM_UNASSIGNED);
+ qemu_ram_free_from_ptr(res->memory_index[0]);
+ munmap(res->r_virtbase[0], res->size);
+ } else {
+ cpu_unregister_io_memory(res->io_mem);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * General setup
+ */
+static int get_vfio_fd(VFIODevice *vdev)
+{
+ if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) {
+ if (qemu_isdigit(vdev->vfiofd_name[0])) {
+ vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0);
+ return 0;
+ } else {
+ vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name);
+ if (vdev->vfiofd < 0) {
+ fprintf(stderr, "%s: (%s) unknown\n", __func__,
+ vdev->vfiofd_name);
+ return -1;
+ }
+ return 0;
+ }
+ } else {
+ char vfio_dir[64], vfio_dev[16];
+ DIR *dir;
+ struct dirent *de;
+
+ sprintf(vfio_dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/",
+ vdev->host.seg, vdev->host.bus,
+ vdev->host.dev, vdev->host.func);
+ dir = opendir(vfio_dir);
+ if (!dir) {
+ error_report("vfio: error: Driver not attached\n");
+ return -1;
+ }
+
+ while ((de = readdir(dir))) {
+ if (de->d_name[0] == '.')
+ continue;
+ if (!strncmp(de->d_name, "vfio", 4))
+ break;
+ }
+
+ if (!de) {
+ error_report("vfio: error: Cannot find vfio* in %s\n", vfio_dir);
+ return -1;
+ }
+
+ sprintf(vfio_dev, "/dev/%s", de->d_name);
+ vdev->vfiofd = open(vfio_dev, O_RDWR);
+ if (vdev->vfiofd < 0) {
+ error_report("pci-assign: vfio: Failed to open %s: %s\n",
+ vfio_dev, strerror(errno));
+ return -1;
+ }
+ return 0;
+ }
+}
+
+static int get_uiommu_fd(VFIODevice *vdev)
+{
+ if (vdev->uiommufd_name && strlen(vdev->uiommufd_name) > 0) {
+ if (qemu_isdigit(vdev->uiommufd_name[0])) {
+ vdev->uiommufd = strtol(vdev->uiommufd_name, NULL, 0);
+ return 0;
+ } else {
+ vdev->uiommufd = monitor_get_fd(cur_mon, vdev->uiommufd_name);
+ if (vdev->uiommufd < 0) {
+ fprintf(stderr, "%s: (%s) unknown\n", __func__,
+ vdev->uiommufd_name);
+ return -1;
+ }
+ return 0;
+ }
+ } else {
+ vdev->uiommufd = open("/dev/uiommu", O_RDONLY);
+ if (vdev->uiommufd < 0) {
+ return -1;
+ }
+ vdev->uiommufd_name = NULL; /* easier test later */
+ return 0;
+ }
+}
+
+static int vfio_load_rom(VFIODevice *vdev)
+{
+ uint32_t len, size = PCI_ROM_SLOT;
+ char name[32];
+ off_t off = 0, voff = vfio_pci_space_to_offset(VFIO_PCI_ROM_RESOURCE);
+ ssize_t bytes;
+ void *ptr;
+
+ /* If loading ROM from file, pci handles it */
+ if (vdev->pdev.romfile || !vdev->pdev.rom_bar)
+ return 0;
+
+ if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &size)) {
+ fprintf(stderr, "vfio: VFIO_BAR_LEN failed for option ROM\n");
+ return -1;
+ }
+
+ if (!size)
+ return 0;
+
+ len = size;
+ snprintf(name, sizeof(name), "%s.rom", vdev->pdev.qdev.info->name);
+ vdev->pdev.rom_offset = qemu_ram_alloc(&vdev->pdev.qdev, name, size);
+ ptr = qemu_get_ram_ptr(vdev->pdev.rom_offset);
+ memset(ptr, 0xff, size);
+
+ while (size) {
+ bytes = pread(vdev->vfiofd, ptr + off, size, voff + off);
+ if (bytes == 0) {
+ break; /* expect that we could get back less than the ROM BAR */
+ } else if (bytes > 0) {
+ off += bytes;
+ size -= bytes;
+ } else {
+ if (errno == EINTR || errno == EAGAIN) {
+ continue;
+ }
+ fprintf(stderr, "vfio: Error reading device ROM: %s\n",
+ strerror(errno));
+ qemu_ram_free(vdev->pdev.rom_offset);
+ vdev->pdev.rom_offset = 0;
+ return -1;
+ }
+ }
+
+ pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, len, 0, pci_map_option_rom);
+ return 0;
+}
+
+static int vfio_initfn(struct PCIDevice *pdev)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+ char sys[64];
+ struct stat st;
+ int ret;
+
+ /* Check that the host device exists */
+ sprintf(sys, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+ vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
+ if (stat(sys, &st) < 0) {
+ error_report("vfio: error: no such host device "
+ "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus,
+ vdev->host.dev, vdev->host.func);
+ return -1;
+ }
+
+ if (get_uiommu_fd(vdev))
+ return -1;
+
+ if (get_vfio_fd(vdev))
+ goto out_close_uiommu;
+
+ if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd))
+ goto out_close_vfiofd;
+
+ /* Get a copy of config space */
+ ret = pread(vdev->vfiofd, vdev->pdev.config,
+ pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF);
+ if (ret < pci_config_size(&vdev->pdev)) {
+ fprintf(stderr, "vfio: Failed to read device config space\n");
+ goto out_unset_domain;
+ }
+
+ /* Clear host resource mapping info. If we choose not to register a
+ * BAR, such as might be the case with the option ROM, we can get
+ * confusing, unwritable, residual addresses from the host here. */
+ memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+ memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+ vfio_load_rom(vdev);
+
+ if (vfio_setup_msi(vdev))
+ goto out_unset_domain;
+
+ if (vfio_setup_resources(vdev))
+ goto out_disable_msix;
+
+ if (vfio_map_iommu(vdev))
+ goto out_unmap_resources;
+
+ if (vfio_enable_intx(vdev))
+ goto out_unmap_iommu;
+
+ return 0;
+
+out_unmap_iommu:
+ vfio_unmap_iommu(vdev);
+out_unmap_resources:
+ vfio_unmap_resources(vdev);
+out_disable_msix:
+ vfio_teardown_msi(vdev);
+out_unset_domain:
+ ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+out_close_vfiofd:
+ close(vdev->vfiofd);
+out_close_uiommu:
+ if (!vdev->uiommufd_name)
+ close(vdev->uiommufd);
+ return -1;
+}
+
+static int vfio_exitfn(struct PCIDevice *pdev)
+{
+ VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+ vfio_disable_interrupts(vdev);
+ vfio_teardown_msi(vdev);
+ vfio_unmap_iommu(vdev);
+ vfio_unmap_resources(vdev);
+ ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+ close(vdev->vfiofd);
+ if (!vdev->uiommufd_name)
+ close(vdev->uiommufd);
+ return 0;
+}
+
+static PropertyInfo qdev_prop_hostaddr = {
+ .name = "pci-hostaddr",
+ .type = -1,
+ .size = sizeof(PCIHostDevice),
+ .parse = parse_hostaddr,
+ .print = print_hostaddr,
+};
+
+static PCIDeviceInfo vfio_info = {
+ .qdev.name = "vfio",
+ .qdev.desc = "pass through host pci devices to the guest via vfio",
+ .qdev.size = sizeof(VFIODevice),
+ .init = vfio_initfn,
+ .exit = vfio_exitfn,
+ .config_read = vfio_pci_read_config,
+ .config_write = vfio_pci_write_config,
+ .qdev.props = (Property[]) {
+ DEFINE_PROP("host", VFIODevice, host,
+ qdev_prop_hostaddr, PCIHostDevice),
+ DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+ DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name),
+ DEFINE_PROP_END_OF_LIST(),
+ },
+};
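+
+/*
+ * Example usage (sketch, host address is illustrative): attach a host
+ * device that is already bound to the host vfio driver with
+ *
+ *   -device vfio,host=0000:01:00.0
+ *
+ * The optional "vfiofd" and "uiommufd" properties accept a numeric fd or
+ * the name of an fd passed in via the monitor; otherwise the device node
+ * and /dev/uiommu are opened directly (see get_vfio_fd()/get_uiommu_fd()).
+ */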
+
+static void vfio_register_devices(void)
+{
+ pci_qdev_register(&vfio_info);
+}
+
+device_init(vfio_register_devices)
new file mode 100644
@@ -0,0 +1,68 @@
+#ifndef __VFIO_H__
+#define __VFIO_H__
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+
+typedef struct PCIHostDevice {
+ uint16_t seg;
+ uint8_t bus;
+ uint8_t dev:5;
+ uint8_t func:3;
+} PCIHostDevice;
+
+typedef struct PCIResource {
+ bool valid;
+ bool mem;
+ bool msix;
+ bool slow;
+ uint8_t bar;
+ uint64_t size;
+ ram_addr_t memory_index[2]; /* RAM indexes: [0] whole BAR or region below MSI-X table, [1] region above it */
+ void *r_virtbase[2]; /* mmapped addresses of the range(s) above */
+ int io_mem; /* cpu_register_io_memory index */
+ pcibus_t e_phys; /* emulated base address */
+ pcibus_t e_size; /* emulated size of region in bytes */
+ uint32_t msix_offset;
+ int vfiofd; /* see vfio_resource_read/write */
+} PCIResource;
+
+typedef struct INTx {
+ bool pending;
+ uint8_t pin;
+ bool irqfd_enabled;
+ EventNotifier notifier;
+ ioapic_eoi_client eoi_client;
+} INTx;
+
+struct VFIODevice;
+
+typedef struct MSIVector {
+ EventNotifier notifier;
+ struct VFIODevice *vdev;
+ int vector;
+} MSIVector;
+
+enum {
+ INT_NONE = 0,
+ INT_INTx = 1,
+ INT_MSI = 2,
+ INT_MSIX = 3,
+};
+
+typedef struct VFIODevice {
+ PCIDevice pdev;
+ PCIHostDevice host;
+ PCIResource resources[PCI_NUM_REGIONS - 1]; /* No ROM */
+ INTx intx;
+ MSIVector *msi_vectors;
+ int nr_vectors;
+ int interrupt;
+ int vfiofd;
+ int uiommufd;
+ char *vfiofd_name;
+ char *uiommufd_name;
+} VFIODevice;
+
+#endif /* __VFIO_H__ */