From patchwork Fri Nov 5 20:16:14 2010
X-Patchwork-Submitter: Alex Williamson
X-Patchwork-Id: 304772
From: Alex Williamson
Subject: [RFC PATCH v2] VFIO based device assignment
To: qemu-devel@nongnu.org
Cc: pugs@cisco.com, kvm@vger.kernel.org, alex.williamson@redhat.com, mst@redhat.com
Date: Fri, 05 Nov 2010 14:16:14 -0600
Message-ID: <20101105200558.26484.87430.stgit@s20.home>
User-Agent: StGIT/0.14.3

diff --git a/Makefile.target b/Makefile.target
index 91e6e74..f67490a 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -203,6 +203,7 @@ obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
 obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
 obj-i386-y += debugcon.o multiboot.o
 obj-i386-y += pc_piix.o
+obj-i386-y += vfio.o
 
 # shared objects
 obj-ppc-y = ppc.o
diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h
new file mode 100644
index 0000000..5f2e52e
--- /dev/null
+++ b/hw/linux-vfio.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Portions derived from drivers/uio/uio.c:
+ * Copyright(C) 2005, Benedikt Spranger
+ * Copyright(C) 2005, Thomas Gleixner
+ * Copyright(C) 2006, Hans J. Koch
+ * Copyright(C) 2006, Greg Kroah-Hartman
+ *
+ * Portions derived from drivers/uio/uio_pci_generic.c:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S.
Tsirkin + */ +#include + +/* + * VFIO driver - allow mapping and use of certain PCI devices + * in unprivileged user processes. (If IOMMU is present) + * Especially useful for Virtual Function parts of SR-IOV devices + */ + +#ifdef __KERNEL__ + +struct vfio_nl_client { + struct list_head list; + u64 msgcap; + struct net *net; + u32 pid; +}; + +struct perm_bits; +struct vfio_dev { + struct device *dev; + struct pci_dev *pdev; + char name[8]; + u8 *pci_config_map; + int pci_config_size; + int devnum; + void __iomem *barmap[PCI_ROM_RESOURCE+1]; + spinlock_t irqlock; /* guards command register accesses */ + int listeners; + u32 locked_pages; + struct mutex lgate; /* listener gate */ + struct mutex dgate; /* dma op gate */ + struct mutex igate; /* intr op gate */ + struct mutex ngate; /* netlink op gate */ + struct list_head nlc_list; /* netlink clients */ + wait_queue_head_t dev_idle_q; + wait_queue_head_t nl_wait_q; + u32 nl_reply_seq; + u32 nl_reply_value; + int mapcount; + struct uiommu_domain *udomain; + int cachec; + struct msix_entry *msix; + struct eventfd_ctx *ev_irq; + struct eventfd_ctx **ev_msi; + struct eventfd_ctx **ev_msix; + int msi_nvec; + int msix_nvec; + u8 *vconfig; + u32 rbar[7]; /* copies of real bars */ + u8 msi_qmax; + u8 bardirty; + struct perm_bits *msi_perm; +}; + +struct vfio_listener { + struct vfio_dev *vdev; + struct list_head dm_list; + struct mm_struct *mm; + struct mmu_notifier mmu_notifier; +}; + +/* + * Structure for keeping track of memory nailed down by the + * user for DMA + */ +struct dma_map_page { + struct list_head list; + struct page **pages; + dma_addr_t daddr; + unsigned long vaddr; + int npage; + int rdwr; +}; + +/* VFIO class infrastructure */ +struct vfio_class { + struct kref kref; + struct class *class; +}; +extern struct vfio_class *vfio_class; + +ssize_t vfio_io_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); +ssize_t vfio_mem_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); +ssize_t vfio_config_readwrite(int, struct vfio_dev *, + char __user *, size_t, loff_t *); + +void vfio_drop_msi(struct vfio_dev *); +void vfio_drop_msix(struct vfio_dev *); +int vfio_setup_msi(struct vfio_dev *, int, void __user *); +int vfio_setup_msix(struct vfio_dev *, int, void __user *); + +#ifndef PCI_MSIX_ENTRY_SIZE +#define PCI_MSIX_ENTRY_SIZE 16 +#endif +#ifndef PCI_STATUS_INTERRUPT +#define PCI_STATUS_INTERRUPT 0x08 +#endif + +struct vfio_dma_map; +void vfio_dma_unmapall(struct vfio_listener *); +int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *); +int vfio_dma_map_common(struct vfio_listener *, unsigned int, + struct vfio_dma_map *); +int vfio_domain_set(struct vfio_dev *, int, int); +int vfio_domain_unset(struct vfio_dev *); + +int vfio_class_init(void); +void vfio_class_destroy(void); +int vfio_dev_add_attributes(struct vfio_dev *); +int vfio_build_config_map(struct vfio_dev *); + +int vfio_nl_init(void); +void vfio_nl_freeclients(struct vfio_dev *); +void vfio_nl_exit(void); +int vfio_nl_remove(struct vfio_dev *); +int vfio_validate(struct vfio_dev *); +int vfio_nl_upcall(struct vfio_dev *, u8, int, int); +void vfio_pm_process_reply(int); +pci_ers_result_t vfio_error_detected(struct pci_dev *, pci_channel_state_t); +pci_ers_result_t vfio_mmio_enabled(struct pci_dev *); +pci_ers_result_t vfio_link_reset(struct pci_dev *); +pci_ers_result_t vfio_slot_reset(struct pci_dev *); +void vfio_error_resume(struct pci_dev *); +#define VFIO_ERROR_REPLY_TIMEOUT (3*HZ) +#define VFIO_SUSPEND_REPLY_TIMEOUT (5*HZ) 
+ +irqreturn_t vfio_interrupt(int, void *); + +#endif /* __KERNEL__ */ + +/* Kernel & User level defines for ioctls */ + +/* + * Structure for DMA mapping of user buffers + * vaddr, dmaaddr, and size must all be page aligned + * buffer may only be larger than 1 page if (a) there is + * an iommu in the system, or (b) buffer is part of a huge page + */ +struct vfio_dma_map { + __u64 vaddr; /* process virtual addr */ + __u64 dmaaddr; /* desired and/or returned dma address */ + __u64 size; /* size in bytes */ + __u64 flags; /* bool: 0 for r/o; 1 for r/w */ +#define VFIO_FLAG_WRITE 0x1 /* req writeable DMA mem */ +}; + +/* map user pages at specific dma address */ +/* requires previous VFIO_DOMAIN_SET */ +#define VFIO_DMA_MAP_IOVA _IOWR(';', 101, struct vfio_dma_map) + +/* unmap user pages */ +#define VFIO_DMA_UNMAP _IOW(';', 102, struct vfio_dma_map) + +/* request IRQ interrupts; use given eventfd */ +#define VFIO_EVENTFD_IRQ _IOW(';', 103, int) + +/* Request MSI interrupts: arg[0] is #, arg[1-n] are eventfds */ +#define VFIO_EVENTFDS_MSI _IOW(';', 104, int) + +/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */ +#define VFIO_EVENTFDS_MSIX _IOW(';', 105, int) + +/* Get length of a BAR */ +#define VFIO_BAR_LEN _IOWR(';', 167, __u32) + +/* Set the IOMMU domain - arg is fd from uiommu driver */ +#define VFIO_DOMAIN_SET _IOW(';', 107, int) + +/* Unset the IOMMU domain */ +#define VFIO_DOMAIN_UNSET _IO(';', 108) + +/* Re-enable INTx */ +#define VFIO_IRQ_EOI _IO(';', 109) + +/* Re-enable INTx via eventfd*/ +#define VFIO_IRQ_EOI_EVENTFD _IOW(';', 110, int) + +/* + * Reads, writes, and mmaps determine which PCI BAR (or config space) + * from the high level bits of the file offset + */ +#define VFIO_PCI_BAR0_RESOURCE 0x0 +#define VFIO_PCI_BAR1_RESOURCE 0x1 +#define VFIO_PCI_BAR2_RESOURCE 0x2 +#define VFIO_PCI_BAR3_RESOURCE 0x3 +#define VFIO_PCI_BAR4_RESOURCE 0x4 +#define VFIO_PCI_BAR5_RESOURCE 0x5 +#define VFIO_PCI_ROM_RESOURCE 0x6 +#define VFIO_PCI_CONFIG_RESOURCE 0xF +#define VFIO_PCI_SPACE_SHIFT 32 +#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE) + +static inline int vfio_offset_to_pci_space(__u64 off) +{ + return (off >> VFIO_PCI_SPACE_SHIFT) & 0xF; +} + +static inline __u32 vfio_offset_to_pci_offset(__u64 off) +{ + return off & (__u32)0xFFFFFFFF; +} + +static inline __u64 vfio_pci_space_to_offset(int sp) +{ + return (__u64)(sp) << VFIO_PCI_SPACE_SHIFT; +} + +/* + * Netlink defines: + */ +#define VFIO_GENL_NAME "VFIO" + +/* message types */ +enum { + VFIO_MSG_INVAL = 0, + /* kernel to user */ + VFIO_MSG_REMOVE, /* unbind, module or hotplug remove */ + VFIO_MSG_ERROR_DETECTED, /* pci err handling - error detected */ + VFIO_MSG_MMIO_ENABLED, /* pci err handling - mmio enabled */ + VFIO_MSG_LINK_RESET, /* pci err handling - link reset */ + VFIO_MSG_SLOT_RESET, /* pci err handling - slot reset */ + VFIO_MSG_ERROR_RESUME, /* pci err handling - resume normal */ + VFIO_MSG_PM_SUSPEND, /* suspend or hibernate notification */ + VFIO_MSG_PM_RESUME, /* resume after suspend or hibernate */ + /* user to kernel */ + VFIO_MSG_REGISTER, + VFIO_MSG_ERROR_HANDLING_REPLY, /* err handling reply */ + VFIO_MSG_PM_SUSPEND_REPLY, /* suspend notify reply */ +}; + +/* attributes */ +enum { + VFIO_ATTR_UNSPEC, + VFIO_ATTR_MSGCAP, /* bitmask of messages desired */ + VFIO_ATTR_PCI_DOMAIN, + VFIO_ATTR_PCI_BUS, + VFIO_ATTR_PCI_SLOT, + VFIO_ATTR_PCI_FUNC, + VFIO_ATTR_CHANNEL_STATE, + VFIO_ATTR_ERROR_HANDLING_REPLY, + VFIO_ATTR_PM_SUSPEND_REPLY, + __VFIO_NL_ATTR_MAX +}; +#define 
VFIO_NL_ATTR_MAX (__VFIO_NL_ATTR_MAX - 1) diff --git a/hw/vfio.c b/hw/vfio.c new file mode 100644 index 0000000..922a47a --- /dev/null +++ b/hw/vfio.c @@ -0,0 +1,1398 @@ +/* + * vfio based device assignment support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Alex Williamson + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "event_notifier.h" +#include "hw.h" +#include "kvm.h" +#include "memory.h" +#include "monitor.h" +#include "msi.h" +#include "msix.h" +#include "pc.h" +#include "qemu-error.h" +#include "range.h" +#include "vfio.h" +#include +#include +#include +#include "linux-vfio.h" + +//#define DEBUG_VFIO +#ifdef DEBUG_VFIO +#define DPRINTF(fmt, ...) \ + do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +/* TODO: msix.h should define these */ +#define MSIX_CAP_LENGTH 12 +#define MSIX_PAGE_SIZE 0x1000 + +/* XXX: on qemu-kvm.git we have msix/intx notifiers and irqfds. With these + * we can allow interrupts to bypass userspace. There's no good #define to + * figure out when these are present, so we toggle on the device assignment + * ifdef even though it has no relation to the bits we're looking for. 
*/ +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT +#define QEMU_KVM_BUILD +#endif + +static void vfio_disable_interrupts(VFIODevice *vdev); +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len); +/* + * Generic + */ +static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap) +{ + int id; + int max_cap = 48; + int pos = PCI_CAPABILITY_LIST; + int status; + + status = pdev->config[PCI_STATUS]; + if ((status & PCI_STATUS_CAP_LIST) == 0) { + return 0; + } + + while (max_cap--) { + pos = pdev->config[pos]; + if (pos < 0x40) { + break; + } + + pos &= ~3; + id = pdev->config[pos + PCI_CAP_LIST_ID]; + + if (id == 0xff) { + break; + } + if (id == cap) { + return pos; + } + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop); + const char *p = str; + int n, seg, bus, dev, func; + char field[5]; + + if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') { + return -EINVAL; + } + + seg = strtol(field, NULL, 16); + p += n + 1; + + if (sscanf(p, "%4[^:]%n", field, &n) != 1) { + return -EINVAL; + } + + if (p[n] == ':') { + bus = strtol(field, NULL, 16); + p += n + 1; + } else { + bus = seg; + seg = 0; + } + + if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') { + return -EINVAL; + } + + dev = strtol(field, NULL, 16); + p += n + 1; + + if (!qemu_isdigit(*p)) { + return -EINVAL; + } + + func = *p - '0'; + + ptr->seg = seg; + ptr->bus = bus; + ptr->dev = dev; + ptr->func = func; + return 0; +} + +static int print_hostaddr(DeviceState *qdev, Property *prop, + char *dest, size_t len) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop); + + return snprintf(dest, len, "%04x:%02x:%02x.%x", + ptr->seg, ptr->bus, ptr->dev, ptr->func); +} + +/* + * INTx + */ +static inline void vfio_unmask_intx(VFIODevice *vdev) +{ + ioctl(vdev->vfiofd, VFIO_IRQ_EOI); +} + +static void vfio_intx_interrupt(void *opaque) +{ + VFIODevice *vdev = opaque; + + if (!event_notifier_test_and_clear(&vdev->intx.notifier)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func, + 'A' + vdev->intx.pin); + + vdev->intx.pending = true; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1); +} + +static void vfio_eoi(ioapic_eoi_client *client) +{ + VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client); + + if (!vdev->intx.irqfd_enabled) { + if (!vdev->intx.pending) { + return; + } + + vdev->intx.pending = false; + + /* If the interrupt is injected via qemu (not irqfd), we need to + * deassert the interrupt here so qemu knows about the level change. + * Otherwise the next interrupt won't make it out of qemu. Interrupts + * via irqfd are completely outside of qemu, so we can skip it. */ + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); + } + + DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func); + + vfio_unmask_intx(vdev); +} + +/* Wrappers for EOI client setup that allow VFIO to directly consume the + * eventfd from KVM. This serves the same purpose as irqfd for the EOI. 
*/ +static int vfio_enable_eoi_client(VFIODevice *vdev) +{ + int fd, ret; + + ret = ioapic_register_eoi_client(&vdev->intx.eoi_client); + if (ret < 0) { + return ret; + } + + /* Exit here is ok, just means EOIs bounce through qemu */ + fd = ioapic_eoi_client_get_fd(&vdev->intx.eoi_client); + if (fd < 0) { + return 0; + } + + ret = ioctl(vdev->vfiofd, VFIO_IRQ_EOI_EVENTFD, &fd); + if (ret < 0) { + fprintf(stderr, "vfio: VFIO_IRQ_EOI_EVENTFD setup - %s (%d)\n", + strerror(-ret), ret); + return ret; + } + qemu_set_fd_handler(fd, NULL, NULL, NULL); + + return 0; +} + +static void vfio_disable_eoi_client(VFIODevice *vdev) +{ + int fd = -1; + + ioapic_unregister_eoi_client(&vdev->intx.eoi_client); + ioctl(vdev->vfiofd, VFIO_IRQ_EOI_EVENTFD, &fd); +} + +/* Attempt to send the VFIO eventfd directly into the KVM irqchip */ +static void vfio_set_intx_handler(VFIODevice *vdev, IOHandler *fd_read, + bool irqfd_enable) +{ + int fd = event_notifier_get_fd(&vdev->intx.notifier); +#ifdef QEMU_KVM_BUILD + int ret; + + ret = kvm_set_irqfd(vdev->intx.eoi_client.irq, fd, irqfd_enable); + if (ret < 0) { + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + fprintf(stderr, "vfio: Error: irqfd %s failed - %s (%d)\n", + irqfd_enable ? "enable" : "disable", strerror(-ret), ret); + goto out; + } + } + + vdev->intx.irqfd_enabled = irqfd_enable; +out: +#endif + if (vdev->intx.irqfd_enabled) { + qemu_set_fd_handler(fd, NULL, NULL, NULL); + } else { + qemu_set_fd_handler(fd, fd_read, NULL, vdev); + } +} + +static void vfio_update_irqs(PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + int irq = pci_get_irq(pdev, vdev->intx.pin); + + if (irq == vdev->intx.eoi_client.irq) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, vdev->intx.eoi_client.irq, irq); + + vfio_set_intx_handler(vdev, vfio_intx_interrupt, false); + vfio_disable_eoi_client(vdev); + + vdev->intx.eoi_client.irq = irq; + + if (irq < 0) { + fprintf(stderr, "vfio: Error - INTx moved to IRQ %d\n", irq); + return; + } + + vfio_enable_eoi_client(vdev); + vfio_set_intx_handler(vdev, vfio_intx_interrupt, true); + + /* Re-enable the interrupt in cased we missed an EOI */ + vfio_eoi(&vdev->intx.eoi_client); +} + +static int vfio_enable_intx(VFIODevice *vdev) +{ + int fd; + uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); + + if (!pin) { + return 0; + } + + vfio_disable_interrupts(vdev); + + vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ + vdev->intx.eoi_client.eoi = vfio_eoi; + vdev->intx.eoi_client.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin); + + vfio_enable_eoi_client(vdev); + + pci_register_update_irqs(&vdev->pdev, vfio_update_irqs); + + if (event_notifier_init(&vdev->intx.notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + return -1; + } + + vfio_set_intx_handler(vdev, vfio_intx_interrupt, true); + + fd = event_notifier_get_fd(&vdev->intx.notifier); + + if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) { + fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n", + strerror(errno)); + return -1; + } + vfio_unmask_intx(vdev); + + vdev->interrupt = INT_INTx; + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func); + + return 0; +} + +static void vfio_disable_intx(VFIODevice *vdev) +{ + int fd = -1; + + if (vdev->interrupt != INT_INTx) { + return; + } + + pci_register_update_irqs(&vdev->pdev, NULL); + 
vfio_set_intx_handler(vdev, NULL, false); + vfio_disable_eoi_client(vdev); + ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd); + event_notifier_cleanup(&vdev->intx.notifier); + vdev->interrupt = INT_NONE; + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func); +} + +/* + * MSI-X + */ +static void vfio_msix_interrupt(void *opaque) +{ + MSIVector *vec = opaque; + VFIODevice *vdev = vec->vdev; + + if (!event_notifier_test_and_clear(&vec->notifier)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func, vec->vector); + + msix_notify(&vdev->pdev, vec->vector); +} + +#ifdef QEMU_KVM_BUILD +/* When a vector is masked, we disable the irqfd, forcing the interrupt + * through qemu userspace. We can then filter masked vectors in msix_notify. */ +static int vfio_msix_mask_notify(PCIDevice *pdev, unsigned vector, int masked) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + int fd, ret; + + fd = event_notifier_get_fd(&vdev->msi_vectors[vector].notifier); + ret = kvm_set_irqfd(pdev->msix_irq_entries[vector].gsi, fd, !masked); + if (ret == -ENOSYS) { + return 0; /* w/o irqfd, interrupts pass through qemu anyway */ + } else if (ret < 0) { + fprintf(stderr, "vfio: Error - irqfd setup failed\n"); + return ret; + } + + if (masked) { + qemu_set_fd_handler(fd, vfio_msix_interrupt, NULL, + &vdev->msi_vectors[vector]); + } else { + qemu_set_fd_handler(fd, NULL, NULL, NULL); + } + + return ret; +} +#endif + +static void vfio_enable_msix(VFIODevice *vdev) +{ + int i, *fds; + + vfio_disable_interrupts(vdev); + + vdev->nr_vectors = vdev->pdev.msix_entries_nr; + vdev->msi_vectors = qemu_malloc(vdev->nr_vectors * sizeof(MSIVector)); + + fds = qemu_malloc((vdev->nr_vectors + 1) * sizeof(int)); + fds[0] = vdev->nr_vectors; + + for (i = 0; i < vdev->nr_vectors; i++) { + vdev->msi_vectors[i].vdev = vdev; + vdev->msi_vectors[i].vector = i; + + if (event_notifier_init(&vdev->msi_vectors[i].notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + } + + fds[i + 1] = event_notifier_get_fd(&vdev->msi_vectors[i].notifier); + qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL, + &vdev->msi_vectors[i]); + + if (msix_vector_use(&vdev->pdev, i) < 0) { + fprintf(stderr, "vfio: Error msix_vector_use\n"); + } + } + + if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) { + fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n", + strerror(errno)); + qemu_free(fds); + return; + } + + vdev->interrupt = INT_MSIX; + + qemu_free(fds); + +#ifdef QEMU_KVM_BUILD + if (msix_set_mask_notifier(&vdev->pdev, vfio_msix_mask_notify)) { + fprintf(stderr, "vfio: Error msix_set_mask_notifier\n"); + } +#endif + + DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d vectors\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, vdev->nr_vectors); +} + +static void vfio_disable_msix(VFIODevice *vdev) +{ + int i, vectors = 0; + + if (vdev->interrupt != INT_MSIX) { + return; + } + + ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &vectors); + +#ifdef QEMU_KVM_BUILD + if (msix_unset_mask_notifier(&vdev->pdev)) { + fprintf(stderr, "vfio: Error msix_unset_mask_notifier\n"); + } +#endif + + for (i = 0; i < vdev->nr_vectors; i++) { + int fd = event_notifier_get_fd(&vdev->msi_vectors[i].notifier); + + msix_vector_unuse(&vdev->pdev, i); + + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->msi_vectors[i].notifier); + } + + 
qemu_free(vdev->msi_vectors); + vdev->nr_vectors = 0; + vdev->interrupt = INT_NONE; + vfio_enable_intx(vdev); + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func); +} + +/* + * MSI + */ +static void vfio_msi_interrupt(void *opaque) +{ + MSIVector *vec = opaque; + VFIODevice *vdev = vec->vdev; + + if (!event_notifier_test_and_clear(&vec->notifier)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func, vec->vector); + + msi_notify(&vdev->pdev, vec->vector); +} + +static void vfio_enable_msi(VFIODevice *vdev) +{ + int i, *fds; + + vfio_disable_interrupts(vdev); + + vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); + vdev->msi_vectors = qemu_malloc(vdev->nr_vectors * sizeof(MSIVector)); + + fds = qemu_malloc((vdev->nr_vectors + 1) * sizeof(int)); + fds[0] = vdev->nr_vectors; + + for (i = 0; i < vdev->nr_vectors; i++) { + vdev->msi_vectors[i].vdev = vdev; + vdev->msi_vectors[i].vector = i; + + if (event_notifier_init(&vdev->msi_vectors[i].notifier, 0)) { + fprintf(stderr, "vfio: Error: event_notifier_init failed\n"); + } + + fds[i + 1] = event_notifier_get_fd(&vdev->msi_vectors[i].notifier); + qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL, + &vdev->msi_vectors[i]); + } + + if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSI, fds)) { + fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n", + strerror(errno)); + qemu_free(fds); + return; + } + + vdev->interrupt = INT_MSI; + + qemu_free(fds); + + DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d vectors\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, vdev->nr_vectors); +} + +static void vfio_disable_msi(VFIODevice *vdev) +{ + int i, vectors = 0; + + if (vdev->interrupt != INT_MSI) { + return; + } + + ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSI, &vectors); + + for (i = 0; i < vdev->nr_vectors; i++) { + int fd = event_notifier_get_fd(&vdev->msi_vectors[i].notifier); + qemu_set_fd_handler(fd, NULL, NULL, NULL); + event_notifier_cleanup(&vdev->msi_vectors[i].notifier); + } + + qemu_free(vdev->msi_vectors); + vdev->nr_vectors = 0; + vdev->interrupt = INT_NONE; + vfio_enable_intx(vdev); + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func); +} + +/* + * IO Port/MMIO + */ +static void vfio_resource_write(PCIResource *res, uint32_t addr, + uint32_t val, int len) +{ + size_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar); + + if (pwrite(res->vfiofd, &val, len, offset + addr) != len) { + fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: %s\n", + __FUNCTION__, addr, val, len, strerror(errno)); + } + DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n", __FUNCTION__, res->bar, + addr, val, len); +} + +static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 1); +} + +static void vfio_resource_writew(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 2); +} + +static void vfio_resource_writel(void *opaque, target_phys_addr_t addr, + uint32_t val) +{ + vfio_resource_write(opaque, addr, val, 4); +} + +static CPUWriteMemoryFunc * const vfio_resource_writes[] = { + &vfio_resource_writeb, + &vfio_resource_writew, + &vfio_resource_writel +}; + +static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - 
res->e_phys, val, 1); +} + +static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 2); +} + +static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val) +{ + PCIResource *res = opaque; + vfio_resource_write(res, addr - res->e_phys, val, 4); +} + +static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len) +{ + size_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar); + uint32_t val; + + if (pread(res->vfiofd, &val, len, offset + addr) != len) { + fprintf(stderr, "%s(,0x%x, %d) failed: %s\n", + __FUNCTION__, addr, len, strerror(errno)); + return 0xffffffffU; + } + DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n", __FUNCTION__, res->bar, + addr, len, val); + return val; +} + +static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 1) & 0xff; +} + +static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 2) & 0xffff; +} + +static uint32_t vfio_resource_readl(void *opaque, target_phys_addr_t addr) +{ + return vfio_resource_read(opaque, addr, 4); +} + +static CPUReadMemoryFunc * const vfio_resource_reads[] = { + &vfio_resource_readb, + &vfio_resource_readw, + &vfio_resource_readl +}; + +static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 1) & 0xff; +} + +static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 2) & 0xffff; +} + +static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr) +{ + PCIResource *res = opaque; + return vfio_resource_read(res, addr - res->e_phys, 4); +} + +static void vfio_ioport_map(PCIDevice *pdev, int bar, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + PCIResource *res = &vdev->resources[bar]; + + DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, bar, e_phys, e_size, type); + + res->e_phys = e_phys; + res->e_size = e_size; + + register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, res); + register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, res); + register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, res); + register_ioport_read(e_phys, e_size, 1, vfio_ioport_readb, res); + register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, res); + register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, res); +} + +static void vfio_iomem_map(PCIDevice *pdev, int bar, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + PCIResource *res = &vdev->resources[bar]; + + DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, bar, e_phys, e_size, type); + + res->e_phys = e_phys; + res->e_size = e_size; + + if (res->msix) { + if (res->msix_offset > 0) { + cpu_register_physical_memory(e_phys, res->msix_offset, res->slow ? 
+ res->io_mem : res->memory_index[0]); + } + + DPRINTF("Overlaying MSI-X table page\n"); + msix_mmio_map(pdev, bar, e_phys, e_size, type); + + if (e_size > res->msix_offset + MSIX_PAGE_SIZE) { + uint32_t offset = res->msix_offset + MSIX_PAGE_SIZE; + e_phys += offset; + e_size -= offset; + cpu_register_physical_memory_offset(e_phys, e_size, + res->slow ? res->io_mem : res->memory_index[1], + res->slow ? offset : 0); + } + } else { + cpu_register_physical_memory(e_phys, e_size, res->slow ? + res->io_mem : res->memory_index[0]); + } +} + +/* + * PCI config space + */ +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + uint32_t val = 0; + + if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) || + (pdev->cap_present & QEMU_PCI_CAP_MSIX && + ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) || + (pdev->cap_present & QEMU_PCI_CAP_MSI && + ranges_overlap(addr, len, pdev->msi_cap, pdev->msi_cap_size))) { + + val = pci_default_read_config(pdev, addr, len); + } else { + if (pread(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) { + fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, addr, len, + strerror(errno)); + return -1; + } + } + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, len, val); + return val; +} + +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n", __FUNCTION__, + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len); + + /* Write everything to VFIO, let it filter out what we can't write */ + if (pwrite(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) { + fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, addr, val, len, strerror(errno)); + } + + /* Write standard header bits to emulation */ + if (addr < 0x40) { + pci_default_write_config(pdev, addr, val, len); + return; + } + + /* MSI/MSI-X Enabling/Disabling */ + if (pdev->cap_present & QEMU_PCI_CAP_MSI && + ranges_overlap(addr, len, pdev->msi_cap, pdev->msi_cap_size)) { + int is_enabled, was_enabled = msi_enabled(pdev); + + pci_default_write_config(pdev, addr, val, len); + msi_write_config(pdev, addr, val, len); + + is_enabled = msi_enabled(pdev); + + if (!was_enabled && is_enabled) { + vfio_enable_msi(vdev); + } else if (was_enabled && !is_enabled) { + vfio_disable_msi(vdev); + } + } + + if (pdev->cap_present & QEMU_PCI_CAP_MSIX && + ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) { + int is_enabled, was_enabled = msix_enabled(pdev); + + pci_default_write_config(pdev, addr, val, len); + msix_write_config(pdev, addr, val, len); + + is_enabled = msix_enabled(pdev); + + if (!was_enabled && is_enabled) { + vfio_enable_msix(vdev); + } else if (was_enabled && !is_enabled) { + vfio_disable_msix(vdev); + } + } +} + +/* + * DMA + */ +static int vfio_dma_map(void *opaque, target_phys_addr_t start_addr, + ram_addr_t size, ram_addr_t phys_offset) +{ + VFIODevice *vdev = opaque; + struct vfio_dma_map dma_map; + + dma_map.vaddr = (uint64_t)qemu_get_ram_ptr(phys_offset); + dma_map.dmaaddr = start_addr; + dma_map.size = size; + dma_map.flags = 
VFIO_FLAG_WRITE; + + return ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map); +} + +static int vfio_dma_unmap(void *opaque, target_phys_addr_t start_addr, + ram_addr_t size, ram_addr_t phys_offset) +{ + VFIODevice *vdev = opaque; + struct vfio_dma_map dma_map; + + dma_map.vaddr = (uint64_t)qemu_get_ram_ptr(phys_offset); + dma_map.dmaaddr = start_addr; + dma_map.size = size; + dma_map.flags = VFIO_FLAG_WRITE; + + return ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map); +} + +static int vfio_map_iommu(VFIODevice *vdev) +{ + return qemu_ram_for_each_slot(vdev, vfio_dma_map); +} + +static int vfio_unmap_iommu(VFIODevice *vdev) +{ + return qemu_ram_for_each_slot(vdev, vfio_dma_unmap); +} + +/* + * Interrupt setup + */ +static void vfio_disable_interrupts(VFIODevice *vdev) +{ + switch (vdev->interrupt) { + case INT_INTx: + vfio_disable_intx(vdev); + break; + case INT_MSI: + vfio_disable_msi(vdev); + break; + case INT_MSIX: + vfio_disable_msix(vdev); + } +} + +static int vfio_setup_msi(VFIODevice *vdev) +{ + int pos; + + if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI))) { + uint16_t ctrl; + bool msi_64bit, msi_maskbit; + int entries; + + if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl), + VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + return -1; + } + + msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT); + msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); + entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); + + DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.seg, + vdev->host.bus, vdev->host.dev, vdev->host.func, pos); + + if (msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit) < 0) { + fprintf(stderr, "vfio: msi_init failed\n"); + return -1; + } + } + + if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX))) { + uint16_t ctrl; + uint32_t table, len, offset; + int bar, entries; + + if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl), + VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + return -1; + } + + if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF + + pos + PCI_MSIX_TABLE) != sizeof(table)) { + return -1; + } + + ctrl = le16_to_cpu(ctrl); + table = le32_to_cpu(table); + + bar = table & PCI_MSIX_BIR; + offset = table & ~PCI_MSIX_BIR; + entries = (ctrl & PCI_MSIX_TABSIZE) + 1; + + vdev->resources[bar].msix = true; + vdev->resources[bar].msix_offset = offset; + + DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x\n", + vdev->host.seg, vdev->host.bus, vdev->host.dev, + vdev->host.func, pos, bar, offset); + + len = table & PCI_MSIX_BIR; + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n"); + return -1; + } + + if (msix_init(&vdev->pdev, entries, bar, len) < 0) { + fprintf(stderr, "vfio: msix_init failed\n"); + return -1; + } + } + return 0; +} + +static void vfio_teardown_msi(VFIODevice *vdev) +{ + msi_uninit(&vdev->pdev); + msix_uninit(&vdev->pdev); +} + +/* + * Resource setup + */ +static int vfio_setup_resources(VFIODevice *vdev) +{ + int i; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + uint32_t len, bar; + PCIResource *res; + uint8_t offset; + int ret, space; + + res = &vdev->resources[i]; + res->vfiofd = vdev->vfiofd; + res->bar = len = i; + + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i); + return -1; + } + if (!len) { + continue; + } + + offset = PCI_BASE_ADDRESS_0 + (4 * i); + ret = pread(vdev->vfiofd, &bar, sizeof(bar), + VFIO_PCI_CONFIG_OFF + offset); + if (ret != sizeof(bar)) { + 
fprintf(stderr, "vfio: Failed to read BAR %d\n", i); + return -1; + } + bar = le32_to_cpu(bar); + space = bar & PCI_BASE_ADDRESS_SPACE; + + if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) { + int off = VFIO_PCI_BAR0_RESOURCE + i; + int flags = PROT_READ | PROT_WRITE; + char name[32]; + + res->mem = true; + res->size = len; + + if (vdev->pdev.qdev.info->vmsd) { + snprintf(name, sizeof(name), "%s.bar%d", + vdev->pdev.qdev.info->vmsd->name, i); + } else { + snprintf(name, sizeof(name), "%s.bar%d", + vdev->pdev.qdev.info->name, i); + } + + if (res->msix) { + if (res->msix_offset) { + char *c = &name[strlen(name)]; + + res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags, + MAP_SHARED, vdev->vfiofd, + vfio_pci_space_to_offset(off)); + + if (res->r_virtbase[0] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + strncat(name, ".0", sizeof(name)); + res->memory_index[0] = + qemu_ram_alloc_from_ptr(&vdev->pdev.qdev, + name, res->msix_offset, + res->r_virtbase[0]); + *c = 0; + } + if (len > res->msix_offset + MSIX_PAGE_SIZE) { + char *c = &name[strlen(name)]; + + res->r_virtbase[1] = mmap(NULL, + len - res->msix_offset - MSIX_PAGE_SIZE, + flags, MAP_SHARED, vdev->vfiofd, + vfio_pci_space_to_offset(off) + + res->msix_offset + MSIX_PAGE_SIZE); + + if (res->r_virtbase[1] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + strncat(name, ".1", sizeof(name)); + res->memory_index[1] = + qemu_ram_alloc_from_ptr(&vdev->pdev.qdev, name, + len - MSIX_PAGE_SIZE - res->msix_offset, + res->r_virtbase[1]); + *c = 0; + } + } else { + res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED, + vdev->vfiofd, + vfio_pci_space_to_offset(off)); + + if (res->r_virtbase[0] == MAP_FAILED) { + fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i); + return -1; + } + res->memory_index[0] = + qemu_ram_alloc_from_ptr(&vdev->pdev.qdev, + name, len, res->r_virtbase[0]); + } + + pci_register_bar(&vdev->pdev, i, res->size, + bar & PCI_BASE_ADDRESS_MEM_PREFETCH ? + PCI_BASE_ADDRESS_MEM_PREFETCH : + PCI_BASE_ADDRESS_SPACE_MEMORY, + vfio_iomem_map); + + if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) { + i++; + } + } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) { + res->mem = true; + res->size = len; + res->slow = true; + + DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n", + __FUNCTION__, vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func, i); + + res->io_mem = cpu_register_io_memory(vfio_resource_reads, + vfio_resource_writes, res); + + pci_register_bar(&vdev->pdev, i, res->size, + bar & PCI_BASE_ADDRESS_MEM_PREFETCH ? 
+ PCI_BASE_ADDRESS_MEM_PREFETCH : + PCI_BASE_ADDRESS_SPACE_MEMORY, + vfio_iomem_map); + + if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) { + i++; + } + } else if (space == PCI_BASE_ADDRESS_SPACE_IO) { + res->size = len; + pci_register_bar(&vdev->pdev, i, res->size, + PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map); + } + res->valid = true; + } + return 0; +} + +static void vfio_unmap_resources(VFIODevice *vdev) +{ + int i; + PCIResource *res = vdev->resources; + + for (i = 0; i < PCI_ROM_SLOT; i++, res++) { + if (res->valid && res->mem) { + if (res->msix) { + if (res->msix_offset) { + cpu_register_physical_memory(res->e_phys, res->msix_offset, + IO_MEM_UNASSIGNED); + qemu_ram_free_from_ptr(res->memory_index[0]); + munmap(res->r_virtbase[0], res->msix_offset); + } + if (res->size > res->msix_offset + MSIX_PAGE_SIZE) { + cpu_register_physical_memory(res->e_phys + MSIX_PAGE_SIZE + + res->msix_offset, + res->e_size - MSIX_PAGE_SIZE - + res->msix_offset, + IO_MEM_UNASSIGNED); + qemu_ram_free_from_ptr(res->memory_index[1]); + munmap(res->r_virtbase[1], + res->size - MSIX_PAGE_SIZE - res->msix_offset); + } + } else { + if (!res->slow) { + cpu_register_physical_memory(res->e_phys, res->e_size, + IO_MEM_UNASSIGNED); + qemu_ram_free_from_ptr(res->memory_index[0]); + munmap(res->r_virtbase[0], res->size); + } else { + cpu_unregister_io_memory(res->io_mem); + } + } + } + } +} + +/* + * General setup + */ +static int get_vfio_fd(VFIODevice *vdev) +{ + if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) { + if (qemu_isdigit(vdev->vfiofd_name[0])) { + vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0); + return 0; + } else { + vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name); + if (vdev->vfiofd < 0) { + fprintf(stderr, "%s: (%s) unkown\n", __func__, + vdev->vfiofd_name); + return -1; + } + return 0; + } + } else { + char vfio_dir[64], vfio_dev[16]; + DIR *dir; + struct dirent *de; + + sprintf(vfio_dir, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/", + vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func); + dir = opendir(vfio_dir); + if (!dir) { + error_report("vfio: error: Driver not attached\n"); + return -1; + } + + while ((de = readdir(dir))) { + if (de->d_name[0] == '.') + continue; + if (!strncmp(de->d_name, "vfio", 4)) + break; + } + + if (!de) { + error_report("vfio: error: Cannot find vfio* in %s\n", vfio_dir); + return -1; + } + + sprintf(vfio_dev, "/dev/%s", de->d_name); + vdev->vfiofd = open(vfio_dev, O_RDWR); + if (vdev->vfiofd < 0) { + error_report("pci-assign: vfio: Failed to open %s: %s\n", + vfio_dev, strerror(errno)); + return -1; + } + return 0; + } +} + +static int get_uiommu_fd(VFIODevice *vdev) +{ + if (vdev->uiommufd_name && strlen(vdev->uiommufd_name) > 0) { + if (qemu_isdigit(vdev->uiommufd_name[0])) { + vdev->uiommufd = strtol(vdev->uiommufd_name, NULL, 0); + return 0; + } else { + vdev->uiommufd = monitor_get_fd(cur_mon, vdev->uiommufd_name); + if (vdev->uiommufd < 0) { + fprintf(stderr, "%s: (%s) unkown\n", __func__, + vdev->uiommufd_name); + return -1; + } + return 0; + } + } else { + vdev->uiommufd = open("/dev/uiommu", O_RDONLY); + if (vdev->uiommufd < 0) { + return -1; + } + vdev->uiommufd_name = NULL; /* easier test later */ + return 0; + } +} + +static int vfio_load_rom(VFIODevice *vdev) +{ + uint32_t len, size = PCI_ROM_SLOT; + char name[32]; + off_t off = 0, voff = vfio_pci_space_to_offset(VFIO_PCI_ROM_RESOURCE); + ssize_t bytes; + void *ptr; + + /* If loading ROM from file, pci handles it */ + if (vdev->pdev.romfile || !vdev->pdev.rom_bar) + return 0; 
+ + if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &size)) { + fprintf(stderr, "vfio: VFIO_BAR_LEN failed for OPTION ROM"); + return -1; + } + + if (!size) + return 0; + + len = size; + snprintf(name, sizeof(name), "%s.rom", vdev->pdev.qdev.info->name); + vdev->pdev.rom_offset = qemu_ram_alloc(&vdev->pdev.qdev, name, size); + ptr = qemu_get_ram_ptr(vdev->pdev.rom_offset); + memset(ptr, 0xff, size); + + while (size) { + bytes = pread(vdev->vfiofd, ptr + off, size, voff + off); + if (bytes == 0) { + break; /* expect that we could get back less than the ROM BAR */ + } else if (bytes > 0) { + off += bytes; + size -= bytes; + } else { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + fprintf(stderr, "vfio: Error reading device ROM: %s\n", + strerror(errno)); + qemu_ram_free(vdev->pdev.rom_offset); + vdev->pdev.rom_offset = 0; + return -1; + } + } + + pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, len, 0, pci_map_option_rom); + return 0; +} + +static int vfio_initfn(struct PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + char sys[64]; + struct stat st; + int ret; + + /* Check that the host device exists */ + sprintf(sys, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", + vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func); + if (stat(sys, &st) < 0) { + error_report("vfio: error: no such host device " + "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus, + vdev->host.dev, vdev->host.func); + return -1; + } + + if (get_uiommu_fd(vdev)) + return -1; + + if (get_vfio_fd(vdev)) + goto out_close_uiommu; + + if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd)) + goto out_close_vfiofd; + + /* Get a copy of config space */ + ret = pread(vdev->vfiofd, vdev->pdev.config, + pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF); + if (ret < pci_config_size(&vdev->pdev)) { + fprintf(stderr, "vfio: Failed to read device config space\n"); + goto out_unset_domain; + } + + /* Clear host resource mapping info. If we choose not to register a + * BAR, such as might be the case with the option ROM, we can get + * confusing, unwritable, residual addresses from the host here. 
*/ + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); + + vfio_load_rom(vdev); + + if (vfio_setup_msi(vdev)) + goto out_unset_domain; + + if (vfio_setup_resources(vdev)) + goto out_disable_msix; + + if (vfio_map_iommu(vdev)) + goto out_unmap_resources; + + if (vfio_enable_intx(vdev)) + goto out_unmap_iommu; + + return 0; + +out_unmap_iommu: + vfio_unmap_iommu(vdev); +out_unmap_resources: + vfio_unmap_resources(vdev); +out_disable_msix: + vfio_teardown_msi(vdev); +out_unset_domain: + ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET); +out_close_vfiofd: + close(vdev->vfiofd); +out_close_uiommu: + if (!vdev->uiommufd_name) + close(vdev->uiommufd); + return -1; +} + +static int vfio_exitfn(struct PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + vfio_disable_interrupts(vdev); + vfio_teardown_msi(vdev); + vfio_unmap_iommu(vdev); + vfio_unmap_resources(vdev); + ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET); + close(vdev->vfiofd); + if (!vdev->uiommufd_name) + close(vdev->uiommufd); + return 0; +} + +static PropertyInfo qdev_prop_hostaddr = { + .name = "pci-hostaddr", + .type = -1, + .size = sizeof(PCIHostDevice), + .parse = parse_hostaddr, + .print = print_hostaddr, +}; + +static PCIDeviceInfo vfio_info = { + .qdev.name = "vfio", + .qdev.desc = "pass through host pci devices to the guest via vfio", + .qdev.size = sizeof(VFIODevice), + .init = vfio_initfn, + .exit = vfio_exitfn, + .config_read = vfio_pci_read_config, + .config_write = vfio_pci_write_config, + .qdev.props = (Property[]) { + DEFINE_PROP("host", VFIODevice, host, + qdev_prop_hostaddr, PCIHostDevice), + DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), + DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name), + DEFINE_PROP_END_OF_LIST(), + }, +}; + +static void vfio_register_devices(void) +{ + pci_qdev_register(&vfio_info); +} + +device_init(vfio_register_devices) diff --git a/hw/vfio.h b/hw/vfio.h new file mode 100644 index 0000000..20ae5db --- /dev/null +++ b/hw/vfio.h @@ -0,0 +1,68 @@ +#ifndef __VFIO_H__ +#define __VFIO_H__ + +#include "qemu-common.h" +#include "qemu-queue.h" +#include "pci.h" + +typedef struct PCIHostDevice { + uint16_t seg; + uint8_t bus; + uint8_t dev:5; + uint8_t func:3; +} PCIHostDevice; + +typedef struct PCIResource { + bool valid; + bool mem; + bool msix; + bool slow; + uint8_t bar; + uint64_t size; + ram_addr_t memory_index[2]; /* cpu_register_physical_memory() index */ + void *r_virtbase[2]; /* mmapped address */ + int io_mem; /* cpu_register_io_memory index */ + pcibus_t e_phys; /* emulated base address */ + pcibus_t e_size; /* emulated size of region in bytes */ + uint32_t msix_offset; + int vfiofd; /* see vfio_resource_read/write */ +} PCIResource; + +typedef struct INTx { + bool pending; + uint8_t pin; + bool irqfd_enabled; + EventNotifier notifier; + ioapic_eoi_client eoi_client; +} INTx; + +struct VFIODevice; + +typedef struct MSIVector { + EventNotifier notifier; + struct VFIODevice *vdev; + int vector; +} MSIVector; + +enum { + INT_NONE = 0, + INT_INTx = 1, + INT_MSI = 2, + INT_MSIX = 3, +}; + +typedef struct VFIODevice { + PCIDevice pdev; + PCIHostDevice host; + PCIResource resources[PCI_NUM_REGIONS - 1]; /* No ROM */ + INTx intx; + MSIVector *msi_vectors; + int nr_vectors; + int interrupt; + int vfiofd; + int uiommufd; + char *vfiofd_name; + char *uiommufd_name; +} VFIODevice; + +#endif /* __VFIO_H__ */
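
As a rough illustration of the kernel interface the code above drives, here is a minimal
userspace sketch of the ioctl flow: open the device, attach it to a uiommu domain, pin and
map a buffer for DMA, and route INTx to an eventfd. The device node name /dev/vfio0, the
single 4 KiB mapping and the minimal error handling are illustrative assumptions, not
something this patch defines; only the ioctls and struct vfio_dma_map come from
linux-vfio.h above.

/*
 * Illustrative only: a minimal user of the chardev interface declared in
 * hw/linux-vfio.h. The node name /dev/vfio0, the 4 KiB buffer and the
 * bare-bones error handling are assumptions for the example.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/types.h>
#include "linux-vfio.h"

int main(void)
{
    struct vfio_dma_map map;
    int uiommu, vfio, irqfd;
    void *buf;

    uiommu = open("/dev/uiommu", O_RDONLY);   /* new IOMMU domain */
    vfio = open("/dev/vfio0", O_RDWR);        /* device bound to the vfio driver */
    irqfd = eventfd(0, 0);                    /* will receive INTx notifications */
    if (uiommu < 0 || vfio < 0 || irqfd < 0)
        return 1;

    /* DMA mappings require the device to be attached to a domain first */
    if (ioctl(vfio, VFIO_DOMAIN_SET, &uiommu))
        return 1;

    /* Pin one page of our memory and make it visible to the device at IOVA 0 */
    buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    memset(&map, 0, sizeof(map));
    map.vaddr = (__u64)(uintptr_t)buf;
    map.dmaaddr = 0;
    map.size = 4096;
    map.flags = VFIO_FLAG_WRITE;
    if (ioctl(vfio, VFIO_DMA_MAP_IOVA, &map))
        return 1;

    /* Route INTx to the eventfd, then unmask so the first interrupt can fire */
    if (ioctl(vfio, VFIO_EVENTFD_IRQ, &irqfd))
        return 1;
    ioctl(vfio, VFIO_IRQ_EOI);

    /* ... read(irqfd, ...) for interrupts, pread/pwrite/mmap on vfio for BARs ... */

    ioctl(vfio, VFIO_DMA_UNMAP, &map);
    ioctl(vfio, VFIO_DOMAIN_UNSET);
    close(irqfd);
    close(vfio);
    close(uiommu);
    return 0;
}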
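
On the qemu side, assuming the host device is already bound to the vfio kernel driver, the
device is assigned to a guest through the qdev properties defined above, e.g. something like
"-device vfio,host=0000:01:00.0" (the PCI address is only an example). parse_hostaddr() also
accepts the short bus:dev.fn form, and the vfiofd/uiommufd properties optionally take file
descriptors that were either inherited as plain numbers or handed in through the monitor,
instead of letting the device open /dev/vfio* and /dev/uiommu itself.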