Message ID | 1462214441-3732-3-git-send-email-kwankhede@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, 3 May 2016 00:10:40 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > VFIO driver registers with vGPU core driver. vGPU core driver creates vGPU > device and calls probe routine of vGPU VFIO driver. This vGPU VFIO driver adds > vGPU device to VFIO core module. > Main aim of this module is to manage all VFIO APIs for each vGPU device. > Those are: > - get region information from GPU driver. > - trap and emulate PCI config space and BAR region. > - Send interrupt configuration information to GPU driver. > - mmap mappable region with invalidate mapping and fault on access to remap pfn. > > Thanks, > Kirti. > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > Signed-off-by: Neo Jia <cjia@nvidia.com> > Change-Id: I949a6b499d2e98d9c3352ae579535a608729b223 > --- > drivers/vgpu/Makefile | 1 + > drivers/vgpu/vgpu_vfio.c | 671 ++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 672 insertions(+), 0 deletions(-) > create mode 100644 drivers/vgpu/vgpu_vfio.c > > diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile > index f5be980..a0a2655 100644 > --- a/drivers/vgpu/Makefile > +++ b/drivers/vgpu/Makefile > @@ -2,3 +2,4 @@ > vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o > > obj-$(CONFIG_VGPU) += vgpu.o > +obj-$(CONFIG_VGPU_VFIO) += vgpu_vfio.o This is where we should add a new Kconfig entry for VGPU_VFIO, nothing in patch 1 has any vfio dependency. Perhaps it should also depend on VFIO_PCI rather than VFIO since you are getting very PCI specific below. > diff --git a/drivers/vgpu/vgpu_vfio.c b/drivers/vgpu/vgpu_vfio.c > new file mode 100644 > index 0000000..460a4dc > --- /dev/null > +++ b/drivers/vgpu/vgpu_vfio.c > @@ -0,0 +1,671 @@ > +/* > + * VGPU VFIO device > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
> + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/device.h> > +#include <linux/kernel.h> > +#include <linux/fs.h> > +#include <linux/poll.h> > +#include <linux/slab.h> > +#include <linux/cdev.h> > +#include <linux/sched.h> > +#include <linux/wait.h> > +#include <linux/uuid.h> > +#include <linux/vfio.h> > +#include <linux/iommu.h> > +#include <linux/vgpu.h> > + > +#include "vgpu_private.h" > + > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "NVIDIA Corporation" > +#define DRIVER_DESC "VGPU VFIO Driver" > + > +#define VFIO_PCI_OFFSET_SHIFT 40 > + > +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) > +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) > +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) Change the name of these from vfio-pci please or shift code around to use them directly. You're certainly free to redefine these, but using the same name is confusing. > + > +struct vfio_vgpu_device { > + struct iommu_group *group; > + struct vgpu_device *vgpu_dev; > + int refcnt; > + struct pci_bar_info bar_info[VFIO_PCI_NUM_REGIONS]; > + u8 *vconfig; > +}; > + > +static DEFINE_MUTEX(vfio_vgpu_lock); > + > +static int get_virtual_bar_info(struct vgpu_device *vgpu_dev, > + struct pci_bar_info *bar_info, > + int index) > +{ > + int ret = -1; Use a real errno. > + struct gpu_device *gpu_dev = vgpu_dev->gpu_dev; > + > + if (gpu_dev->ops->vgpu_bar_info) > + ret = gpu_dev->ops->vgpu_bar_info(vgpu_dev, index, bar_info); vgpu_bar_info is already optional, further validating that the vgpu core is not PCI specific. 
> + return ret; > +} > + > +static int vdev_read_base(struct vfio_vgpu_device *vdev) > +{ > + int index, pos; > + u32 start_lo, start_hi; > + u32 mem_type; > + > + pos = PCI_BASE_ADDRESS_0; > + > + for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) { > + > + if (!vdev->bar_info[index].size) > + continue; > + > + start_lo = (*(u32 *)(vdev->vconfig + pos)) & > + PCI_BASE_ADDRESS_MEM_MASK; > + mem_type = (*(u32 *)(vdev->vconfig + pos)) & > + PCI_BASE_ADDRESS_MEM_TYPE_MASK; > + > + switch (mem_type) { > + case PCI_BASE_ADDRESS_MEM_TYPE_64: > + start_hi = (*(u32 *)(vdev->vconfig + pos + 4)); > + pos += 4; > + break; > + case PCI_BASE_ADDRESS_MEM_TYPE_32: > + case PCI_BASE_ADDRESS_MEM_TYPE_1M: > + /* 1M mem BAR treated as 32-bit BAR */ > + default: > + /* mem unknown type treated as 32-bit BAR */ > + start_hi = 0; > + break; > + } Let's not neglect ioport BARs here, IO_MASK is different. > + pos += 4; > + vdev->bar_info[index].start = ((u64)start_hi << 32) | start_lo; > + } > + return 0; > +} > + > +static int vgpu_dev_open(void *device_data) > +{ > + int ret = 0; > + struct vfio_vgpu_device *vdev = device_data; > + > + if (!try_module_get(THIS_MODULE)) > + return -ENODEV; > + > + mutex_lock(&vfio_vgpu_lock); > + > + if (!vdev->refcnt) { > + u8 *vconfig; > + int vconfig_size, index; > + > + for (index = 0; index < VFIO_PCI_NUM_REGIONS; index++) { nit, region indexes are not all BARs. > + ret = get_virtual_bar_info(vdev->vgpu_dev, > + &vdev->bar_info[index], > + index); > + if (ret) > + goto open_error; > + } > + vconfig_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size; nit, config space is not a BAR. 
> + if (!vconfig_size) > + goto open_error; > + > + vconfig = kzalloc(vconfig_size, GFP_KERNEL); > + if (!vconfig) { > + ret = -ENOMEM; > + goto open_error; > + } > + > + vdev->vconfig = vconfig; > + } > + > + vdev->refcnt++; > +open_error: > + > + mutex_unlock(&vfio_vgpu_lock); > + > + if (ret) > + module_put(THIS_MODULE); > + > + return ret; > +} > + > +static void vgpu_dev_close(void *device_data) > +{ > + struct vfio_vgpu_device *vdev = device_data; > + > + mutex_lock(&vfio_vgpu_lock); > + > + vdev->refcnt--; > + if (!vdev->refcnt) { > + memset(&vdev->bar_info, 0, sizeof(vdev->bar_info)); Why? > + if (vdev->vconfig) How would we ever achieve that? > + kfree(vdev->vconfig); > + } > + > + mutex_unlock(&vfio_vgpu_lock); > + > + module_put(THIS_MODULE); > +} > + > +static int vgpu_get_irq_count(struct vfio_vgpu_device *vdev, int irq_type) > +{ > + // Don't support MSIX for now > + if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) > + return -1; How are we going to expand the API later for it? Shouldn't this just be a passthrough to a gpu_devices_ops.vgpu_vfio_get_irq_info callback? 
> + > + return 1; > +} > + > +static long vgpu_dev_unlocked_ioctl(void *device_data, > + unsigned int cmd, unsigned long arg) > +{ > + int ret = 0; > + struct vfio_vgpu_device *vdev = device_data; > + unsigned long minsz; > + > + switch (cmd) > + { > + case VFIO_DEVICE_GET_INFO: > + { > + struct vfio_device_info info; > + printk(KERN_INFO "%s VFIO_DEVICE_GET_INFO cmd index ", __FUNCTION__); > + minsz = offsetofend(struct vfio_device_info, num_irqs); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + info.flags = VFIO_DEVICE_FLAGS_PCI; > + info.num_regions = VFIO_PCI_NUM_REGIONS; > + info.num_irqs = VFIO_PCI_NUM_IRQS; > + > + return copy_to_user((void __user *)arg, &info, minsz); > + } > + > + case VFIO_DEVICE_GET_REGION_INFO: > + { > + struct vfio_region_info info; > + > + minsz = offsetofend(struct vfio_region_info, offset); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + printk(KERN_INFO "%s VFIO_DEVICE_GET_REGION_INFO cmd for region_index %d", __FUNCTION__, info.index); > + switch (info.index) { > + case VFIO_PCI_CONFIG_REGION_INDEX: > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); > + info.size = vdev->bar_info[info.index].size; > + if (!info.size) { > + info.flags = 0; > + break; > + } > + > + info.flags = vdev->bar_info[info.index].flags; Ah, so bar_info.flags are vfio region info flags, that's not documented anywhere in the API. > + break; > + case VFIO_PCI_VGA_REGION_INDEX: > + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); > + info.size = 0xc0000; > + info.flags = VFIO_REGION_INFO_FLAG_READ | > + VFIO_REGION_INFO_FLAG_WRITE; > + break; I think VGA support needs to be at the discretion of the vendor driver. There are certainly use cases that don't require VGA. 
> + > + case VFIO_PCI_ROM_REGION_INDEX: So should ROM support. What's the assumption here, that QEMU will provide a ROM, much like is required for SR-IOV VFs? > + default: > + return -EINVAL; > + } > + > + return copy_to_user((void __user *)arg, &info, minsz); > + > + } > + case VFIO_DEVICE_GET_IRQ_INFO: > + { > + struct vfio_irq_info info; > + > + printk(KERN_INFO "%s VFIO_DEVICE_GET_IRQ_INFO cmd", __FUNCTION__); Clearly lots of debug remaining in these functions. > + minsz = offsetofend(struct vfio_irq_info, count); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) > + return -EINVAL; > + > + switch (info.index) { > + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: > + case VFIO_PCI_REQ_IRQ_INDEX: > + break; > + /* pass thru to return error */ > + case VFIO_PCI_MSIX_IRQ_INDEX: Lots of assumptions about what the vendor driver is going to support. > + default: > + return -EINVAL; > + } > + > + info.count = VFIO_PCI_NUM_IRQS; > + > + info.flags = VFIO_IRQ_INFO_EVENTFD; > + info.count = vgpu_get_irq_count(vdev, info.index); > + > + if (info.count == -1) > + return -EINVAL; > + > + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) > + info.flags |= (VFIO_IRQ_INFO_MASKABLE | > + VFIO_IRQ_INFO_AUTOMASKED); > + else > + info.flags |= VFIO_IRQ_INFO_NORESIZE; > + > + return copy_to_user((void __user *)arg, &info, minsz); > + } > + > + case VFIO_DEVICE_SET_IRQS: > + { > + struct vfio_irq_set hdr; > + struct gpu_device *gpu_dev = vdev->vgpu_dev->gpu_dev; > + u8 *data = NULL; > + int ret = 0; > + minsz = offsetofend(struct vfio_irq_set, count); > + > + if (copy_from_user(&hdr, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || > + hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | > + VFIO_IRQ_SET_ACTION_TYPE_MASK)) > + return -EINVAL; > + > + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { > + size_t size; > + int max = 
vgpu_get_irq_count(vdev, hdr.index); > + > + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) > + size = sizeof(uint8_t); > + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) > + size = sizeof(int32_t); > + else > + return -EINVAL; > + > + if (hdr.argsz - minsz < hdr.count * size || > + hdr.start >= max || hdr.start + hdr.count > max) > + return -EINVAL; > + > + data = memdup_user((void __user *)(arg + minsz), > + hdr.count * size); > + if (IS_ERR(data)) > + return PTR_ERR(data); > + > + } > + > + if (gpu_dev->ops->vgpu_set_irqs) { > + ret = gpu_dev->ops->vgpu_set_irqs(vdev->vgpu_dev, > + hdr.flags, > + hdr.index, hdr.start, > + hdr.count, data); > + } > + kfree(data); > + return ret; > + } > + > + default: > + return -EINVAL; > + } > + return ret; > +} > + > +ssize_t vgpu_dev_config_rw(struct vfio_vgpu_device *vdev, char __user *buf, > + size_t count, loff_t *ppos, bool iswrite) > +{ > + struct vgpu_device *vgpu_dev = vdev->vgpu_dev; > + struct gpu_device *gpu_dev = vgpu_dev->gpu_dev; > + int cfg_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size; > + int ret = 0; > + uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK; > + > + if (pos < 0 || pos >= cfg_size || > + pos + count > cfg_size) { > + printk(KERN_ERR "%s pos 0x%llx out of range\n", __FUNCTION__, pos); > + ret = -EFAULT; > + goto config_rw_exit; > + } > + > + if (iswrite) { > + char *user_data = kmalloc(count, GFP_KERNEL); > + > + if (user_data == NULL) { > + ret = -ENOMEM; > + goto config_rw_exit; > + } > + > + if (copy_from_user(user_data, buf, count)) { > + ret = -EFAULT; > + kfree(user_data); > + goto config_rw_exit; > + } memdup_user()? > + > + if (gpu_dev->ops->write) { > + ret = gpu_dev->ops->write(vgpu_dev, > + user_data, > + count, > + vgpu_emul_space_config, > + pos); > + } > + > + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); So write is expected to user_data to allow only the writable bits to be changed? What's really being saved in the vconfig here vs the vendor vgpu driver? 
It seems like we're only using it to cache the BAR values, but we're not providing the BAR emulation here, which seems like one of the few things we could provide so it's not duplicated in every vendor driver. But then we only need a few u32s to do that, not all of config space. > + kfree(user_data); > + } > + else > + { > + char *ret_data = kzalloc(count, GFP_KERNEL); > + > + if (ret_data == NULL) { > + ret = -ENOMEM; > + goto config_rw_exit; > + } > + > + if (gpu_dev->ops->read) { > + ret = gpu_dev->ops->read(vgpu_dev, > + ret_data, > + count, > + vgpu_emul_space_config, > + pos); > + } > + > + if (ret > 0 ) { > + if (copy_to_user(buf, ret_data, ret)) { > + ret = -EFAULT; > + kfree(ret_data); > + goto config_rw_exit; > + } > + > + memcpy((void *)(vdev->vconfig + pos), (void *)ret_data, count); > + } > + kfree(ret_data); > + } > +config_rw_exit: > + return ret; > +} > + > +ssize_t vgpu_dev_bar_rw(struct vfio_vgpu_device *vdev, char __user *buf, > + size_t count, loff_t *ppos, bool iswrite) > +{ > + struct vgpu_device *vgpu_dev = vdev->vgpu_dev; > + struct gpu_device *gpu_dev = vgpu_dev->gpu_dev; > + loff_t offset = *ppos & VFIO_PCI_OFFSET_MASK; > + loff_t pos; > + int bar_index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); > + int ret = 0; > + > + if (!vdev->bar_info[bar_index].start) { > + ret = vdev_read_base(vdev); > + if (ret) > + goto bar_rw_exit; > + } > + > + if (offset >= vdev->bar_info[bar_index].size) { > + ret = -EINVAL; > + goto bar_rw_exit; > + } > + > + pos = vdev->bar_info[bar_index].start + offset; > + if (iswrite) { > + char *user_data = kmalloc(count, GFP_KERNEL); > + > + if (user_data == NULL) { > + ret = -ENOMEM; > + goto bar_rw_exit; > + } > + > + if (copy_from_user(user_data, buf, count)) { > + ret = -EFAULT; > + kfree(user_data); > + goto bar_rw_exit; > + } memdup_user() again. 
> + > + if (gpu_dev->ops->write) { > + ret = gpu_dev->ops->write(vgpu_dev, > + user_data, > + count, > + vgpu_emul_space_mmio, > + pos); > + } What's the usefulness in a vendor driver that doesn't provide read/write? > + > + kfree(user_data); > + } > + else > + { > + char *ret_data = kmalloc(count, GFP_KERNEL); > + > + if (ret_data == NULL) { > + ret = -ENOMEM; > + goto bar_rw_exit; > + } > + > + memset(ret_data, 0, count); > + > + if (gpu_dev->ops->read) { > + ret = gpu_dev->ops->read(vgpu_dev, > + ret_data, > + count, > + vgpu_emul_space_mmio, > + pos); > + } > + > + if (ret > 0 ) { > + if (copy_to_user(buf, ret_data, ret)) { > + ret = -EFAULT; > + } > + } > + kfree(ret_data); > + } > + > +bar_rw_exit: > + return ret; No freeing, no lock releasing, no cleanup, just return from the point of error. > +} > + > + > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf, > + size_t count, loff_t *ppos, bool iswrite) > +{ > + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); > + struct vfio_vgpu_device *vdev = device_data; > + > + if (index >= VFIO_PCI_NUM_REGIONS) > + return -EINVAL; > + > + switch (index) { > + case VFIO_PCI_CONFIG_REGION_INDEX: > + return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite); > + > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > + return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite); > + > + case VFIO_PCI_ROM_REGION_INDEX: > + case VFIO_PCI_VGA_REGION_INDEX: Wait a sec, who's doing the VGA emulation? We can't be claiming to support a VGA region and then fail to provide read/write access to it like we said it has. 
> + break; > + } > + > + return -EINVAL; > +} > + > + > +static ssize_t vgpu_dev_read(void *device_data, char __user *buf, > + size_t count, loff_t *ppos) > +{ > + int ret = 0; > + > + if (count) > + ret = vgpu_dev_rw(device_data, buf, count, ppos, false); > + > + return ret; > +} > + > +static ssize_t vgpu_dev_write(void *device_data, const char __user *buf, > + size_t count, loff_t *ppos) > +{ > + int ret = 0; > + > + if (count) > + ret = vgpu_dev_rw(device_data, (char *)buf, count, ppos, true); > + > + return ret; > +} > + > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > +{ > + int ret = 0; > + struct vfio_vgpu_device *vdev = vma->vm_private_data; > + struct vgpu_device *vgpu_dev; > + struct gpu_device *gpu_dev; > + u64 virtaddr = (u64)vmf->virtual_address; > + u64 offset, phyaddr; > + unsigned long req_size, pgoff; > + pgprot_t pg_prot; > + > + if (!vdev && !vdev->vgpu_dev) > + return -EINVAL; > + > + vgpu_dev = vdev->vgpu_dev; > + gpu_dev = vgpu_dev->gpu_dev; > + > + offset = vma->vm_pgoff << PAGE_SHIFT; > + phyaddr = virtaddr - vma->vm_start + offset; > + pgoff = phyaddr >> PAGE_SHIFT; > + req_size = vma->vm_end - virtaddr; > + pg_prot = vma->vm_page_prot; > + > + if (gpu_dev->ops->validate_map_request) { > + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff, > + &req_size, &pg_prot); > + if (ret) > + return ret; > + > + if (!req_size) > + return -EINVAL; > + } > + > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); So not supporting validate_map_request() means that the user can directly mmap BARs of the host GPU and as shown below, we assume a 1:1 mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU scenario or should this callback be required? It's not clear to me how the vendor driver determines what this maps to, do they compare it to the physical device's own BAR addresses? 
> + > + return ret | VM_FAULT_NOPAGE; > +} > + > +static const struct vm_operations_struct vgpu_dev_mmio_ops = { > + .fault = vgpu_dev_mmio_fault, > +}; > + > + > +static int vgpu_dev_mmap(void *device_data, struct vm_area_struct *vma) > +{ > + unsigned int index; > + struct vfio_vgpu_device *vdev = device_data; > + struct vgpu_device *vgpu_dev = vdev->vgpu_dev; > + struct pci_dev *pdev = vgpu_dev->gpu_dev->dev; > + unsigned long pgoff; > + > + loff_t offset = vma->vm_pgoff << PAGE_SHIFT; > + > + index = VFIO_PCI_OFFSET_TO_INDEX(offset); > + > + if (index >= VFIO_PCI_ROM_REGION_INDEX) > + return -EINVAL; ioport BARs? > + > + pgoff = vma->vm_pgoff & > + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); > + > + vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; > + > + vma->vm_private_data = vdev; > + vma->vm_ops = &vgpu_dev_mmio_ops; > + > + return 0; > +} > + > +static const struct vfio_device_ops vgpu_vfio_dev_ops = { > + .name = "vfio-vgpu", Should all of this be vfio-pci-vgpu? We've certainly gotten PCI specific here. 
> + .open = vgpu_dev_open, > + .release = vgpu_dev_close, > + .ioctl = vgpu_dev_unlocked_ioctl, > + .read = vgpu_dev_read, > + .write = vgpu_dev_write, > + .mmap = vgpu_dev_mmap, > +}; > + > +int vgpu_vfio_probe(struct device *dev) > +{ > + struct vfio_vgpu_device *vdev; > + struct vgpu_device *vgpu_dev = to_vgpu_device(dev); > + int ret = 0; > + > + if (vgpu_dev == NULL) > + return -EINVAL; > + > + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); > + if (!vdev) { > + return -ENOMEM; > + } > + > + vdev->vgpu_dev = vgpu_dev; > + vdev->group = vgpu_dev->group; > + > + ret = vfio_add_group_dev(dev, &vgpu_vfio_dev_ops, vdev); > + if (ret) > + kfree(vdev); > + > + printk(KERN_INFO "%s ret = %d\n", __FUNCTION__, ret); > + return ret; > +} > + > +void vgpu_vfio_remove(struct device *dev) > +{ > + struct vfio_vgpu_device *vdev; > + > + printk(KERN_INFO "%s \n", __FUNCTION__); > + vdev = vfio_del_group_dev(dev); > + if (vdev) { > + printk(KERN_INFO "%s vdev being freed\n", __FUNCTION__); > + kfree(vdev); > + } > +} > + > +struct vgpu_driver vgpu_vfio_driver = { > + .name = "vgpu-vfio", > + .probe = vgpu_vfio_probe, > + .remove = vgpu_vfio_remove, > +}; > + > +static int __init vgpu_vfio_init(void) > +{ > + printk(KERN_INFO "%s \n", __FUNCTION__); > + return vgpu_register_driver(&vgpu_vfio_driver, THIS_MODULE); > +} > + > +static void __exit vgpu_vfio_exit(void) > +{ > + printk(KERN_INFO "%s \n", __FUNCTION__); > + vgpu_unregister_driver(&vgpu_vfio_driver); > +} > + > +module_init(vgpu_vfio_init) > +module_exit(vgpu_vfio_exit) > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> From: Alex Williamson [mailto:alex.williamson@redhat.com] > Sent: Wednesday, May 04, 2016 6:43 AM > > + > > + if (gpu_dev->ops->write) { > > + ret = gpu_dev->ops->write(vgpu_dev, > > + user_data, > > + count, > > + vgpu_emul_space_config, > > + pos); > > + } > > + > > + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); > > So write is expected to user_data to allow only the writable bits to be > changed? What's really being saved in the vconfig here vs the vendor > vgpu driver? It seems like we're only using it to cache the BAR > values, but we're not providing the BAR emulation here, which seems > like one of the few things we could provide so it's not duplicated in > every vendor driver. But then we only need a few u32s to do that, not > all of config space. We can borrow same vconfig emulation from existing vfio-pci driver. But doing so doesn't mean that vendor vgpu driver cannot have its own vconfig emulation further. vGPU is not like a real device, since there may be no physical config space implemented for each vGPU. So anyway vendor vGPU driver needs to create/emulate the virtualized config space while the way how is created might be vendor specific. So better to keep the interface to access raw vconfig space from vendor vGPU driver. > > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf, > > + size_t count, loff_t *ppos, bool iswrite) > > +{ > > + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); > > + struct vfio_vgpu_device *vdev = device_data; > > + > > + if (index >= VFIO_PCI_NUM_REGIONS) > > + return -EINVAL; > > + > > + switch (index) { > > + case VFIO_PCI_CONFIG_REGION_INDEX: > > + return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite); > > + > > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > > + return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite); > > + > > + case VFIO_PCI_ROM_REGION_INDEX: > > + case VFIO_PCI_VGA_REGION_INDEX: > > Wait a sec, who's doing the VGA emulation? 
We can't be claiming to > support a VGA region and then fail to provide read/write access to it > like we said it has. For Intel side we plan to not support VGA region when upstreaming our KVMGT work, which means Intel vGPU will be exposed only as a secondary graphics card then so legacy VGA is not required. Also no VBIOS/ROM requirement. Guess we can remove above two regions. > > + > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > > +{ > > + int ret = 0; > > + struct vfio_vgpu_device *vdev = vma->vm_private_data; > > + struct vgpu_device *vgpu_dev; > > + struct gpu_device *gpu_dev; > > + u64 virtaddr = (u64)vmf->virtual_address; > > + u64 offset, phyaddr; > > + unsigned long req_size, pgoff; > > + pgprot_t pg_prot; > > + > > + if (!vdev && !vdev->vgpu_dev) > > + return -EINVAL; > > + > > + vgpu_dev = vdev->vgpu_dev; > > + gpu_dev = vgpu_dev->gpu_dev; > > + > > + offset = vma->vm_pgoff << PAGE_SHIFT; > > + phyaddr = virtaddr - vma->vm_start + offset; > > + pgoff = phyaddr >> PAGE_SHIFT; > > + req_size = vma->vm_end - virtaddr; > > + pg_prot = vma->vm_page_prot; > > + > > + if (gpu_dev->ops->validate_map_request) { > > + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff, > > + &req_size, &pg_prot); > > + if (ret) > > + return ret; > > + > > + if (!req_size) > > + return -EINVAL; > > + } > > + > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > So not supporting validate_map_request() means that the user can > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > scenario or should this callback be required? It's not clear to me how > the vendor driver determines what this maps to, do they compare it to > the physical device's own BAR addresses? I didn't quite understand too. 
Based on earlier discussion, do we need something like this, or could we achieve the purpose just by leveraging recent sparse mmap support? Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 5/4/2016 4:13 AM, Alex Williamson wrote: > On Tue, 3 May 2016 00:10:40 +0530 >> obj-$(CONFIG_VGPU) += vgpu.o >> +obj-$(CONFIG_VGPU_VFIO) += vgpu_vfio.o > > This is where we should add a new Kconfig entry for VGPU_VFIO, nothing > in patch 1 has any vfio dependency. Perhaps it should also depend on > VFIO_PCI rather than VFIO since you are getting very PCI specific below. VGPU_VFIO depends on VFIO but is independent of VFIO_PCI. VGPU_VFIO uses VFIO apis defined for PCI devices and uses common #defines but that doesn't mean it depends on VFIO_PCI. I'll move Kconfig entry for VGPU_VFIO here in next version of patch. >> +#define VFIO_PCI_OFFSET_SHIFT 40 >> + >> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) >> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) >> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) > > Change the name of these from vfio-pci please or shift code around to > use them directly. You're certainly free to redefine these, but using > the same name is confusing. > I'll move these defines to common location. >> + if (gpu_dev->ops->vgpu_bar_info) >> + ret = gpu_dev->ops->vgpu_bar_info(vgpu_dev, index, bar_info); > > vgpu_bar_info is already optional, further validating that the vgpu > core is not PCI specific. It is not optional if vgpu_vfio module should work on the device. If vgpu_bar_info is not provided by vendor driver, open() would fail. vgpu_vfio expects a PCI device. We also need to add PCI device validation. > > Let's not neglect ioport BARs here, IO_MASK is different. > vgpu_device is virtual device, it is not going to drive VGA signals. Nvidia vGPU would not support IO BAR. >> + vdev->refcnt--; >> + if (!vdev->refcnt) { >> + memset(&vdev->bar_info, 0, sizeof(vdev->bar_info)); > > Why? vfio_vgpu_device is allocated when vgpu device is created by vgpu core, then QEMU/VMM call open() on that device, where vdev->bar_info is populated and allocates vconfig.
In teardown path, QEMU/VMM call close() on the device and vfio_vgpu_device is destroyed when vgpu device is destroyed by vgpu core. If QEMU/VMM restarts and in that case vgpu device is not destroyed, vdev->bar_info should be cleared to fetch it again from vendor driver. It should not keep any stale addresses. >> + if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) >> + return -1; > > How are we going to expand the API later for it? Shouldn't this just > be a passthrough to a gpu_devices_ops.vgpu_vfio_get_irq_info callback? Vendor driver conveys interrupt type by defining capabilities in config space. I don't think we should add new callback for it. >> + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); > > So write is expected to user_data to allow only the writable bits to be > changed? What's really being saved in the vconfig here vs the vendor > vgpu driver? It seems like we're only using it to cache the BAR > values, but we're not providing the BAR emulation here, which seems > like one of the few things we could provide so it's not duplicated in > every vendor driver. But then we only need a few u32s to do that, not > all of config space. > Vendor driver should emulate config space. It is not just BAR addresses, vendor driver should add the capabilities supported by its vGPU device. >> + >> + if (gpu_dev->ops->write) { >> + ret = gpu_dev->ops->write(vgpu_dev, >> + user_data, >> + count, >> + vgpu_emul_space_mmio, >> + pos); >> + } > > What's the usefulness in a vendor driver that doesn't provide > read/write? The checks are to avoid NULL pointer dereference if these callbacks are not provided. Whether it will work or not completely depends on vendor driver stack in host and guest. >> + case VFIO_PCI_ROM_REGION_INDEX: >> + case VFIO_PCI_VGA_REGION_INDEX: > > Wait a sec, who's doing the VGA emulation? We can't be claiming to > support a VGA region and then fail to provide read/write access to it > like we said it has.
> Nvidia vGPU doesn't support IO BAR and ROM BAR. But I can move these cases to case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: So that if vendor driver supports IO BAR or ROM BAR emulation, it would be same as other BARs. >> + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > So not supporting validate_map_request() means that the user can > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > scenario or should this callback be required? Yes, if restrictions are imposed such that only one vGPU device can be created on one physical GPU, i.e. 1:1 vGPU to host GPU. > It's not clear to me how > the vendor driver determines what this maps to, do they compare it to > the physical device's own BAR addresses? > Yes. Thanks, Kirti -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 4 May 2016 03:23:13 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson [mailto:alex.williamson@redhat.com] > > Sent: Wednesday, May 04, 2016 6:43 AM > > > + > > > + if (gpu_dev->ops->write) { > > > + ret = gpu_dev->ops->write(vgpu_dev, > > > + user_data, > > > + count, > > > + vgpu_emul_space_config, > > > + pos); > > > + } > > > + > > > + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); > > > > So write is expected to user_data to allow only the writable bits to be > > changed? What's really being saved in the vconfig here vs the vendor > > vgpu driver? It seems like we're only using it to cache the BAR > > values, but we're not providing the BAR emulation here, which seems > > like one of the few things we could provide so it's not duplicated in > > every vendor driver. But then we only need a few u32s to do that, not > > all of config space. > > We can borrow same vconfig emulation from existing vfio-pci driver. > But doing so doesn't mean that vendor vgpu driver cannot have its > own vconfig emulation further. vGPU is not like a real device, since > there may be no physical config space implemented for each vGPU. > So anyway vendor vGPU driver needs to create/emulate the virtualized > config space while the way how is created might be vendor specific. > So better to keep the interface to access raw vconfig space from > vendor vGPU driver. I'm hoping config space will be very simple for a vgpu, so I don't know that it makes sense to add that complexity early on. Neo/Kirti, what capabilities do you expect to provide? Who provides the MSI capability? Is a PCIe capability provided? Others? 
> > > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf, > > > + size_t count, loff_t *ppos, bool iswrite) > > > +{ > > > + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); > > > + struct vfio_vgpu_device *vdev = device_data; > > > + > > > + if (index >= VFIO_PCI_NUM_REGIONS) > > > + return -EINVAL; > > > + > > > + switch (index) { > > > + case VFIO_PCI_CONFIG_REGION_INDEX: > > > + return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite); > > > + > > > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > > > + return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite); > > > + > > > + case VFIO_PCI_ROM_REGION_INDEX: > > > + case VFIO_PCI_VGA_REGION_INDEX: > > > > Wait a sec, who's doing the VGA emulation? We can't be claiming to > > support a VGA region and then fail to provide read/write access to it > > like we said it has. > > For Intel side we plan to not support VGA region when upstreaming our > KVMGT work, which means Intel vGPU will be exposed only as a > secondary graphics card then so legacy VGA is not required. Also no > VBIOS/ROM requirement. Guess we can remove above two regions. So this needs to be optional based on what the mediation driver provides. It seems like we're just making passthroughs for the vendor mediation driver to speak vfio. 
> > > + > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > > > +{ > > > + int ret = 0; > > > + struct vfio_vgpu_device *vdev = vma->vm_private_data; > > > + struct vgpu_device *vgpu_dev; > > > + struct gpu_device *gpu_dev; > > > + u64 virtaddr = (u64)vmf->virtual_address; > > > + u64 offset, phyaddr; > > > + unsigned long req_size, pgoff; > > > + pgprot_t pg_prot; > > > + > > > + if (!vdev && !vdev->vgpu_dev) > > > + return -EINVAL; > > > + > > > + vgpu_dev = vdev->vgpu_dev; > > > + gpu_dev = vgpu_dev->gpu_dev; > > > + > > > + offset = vma->vm_pgoff << PAGE_SHIFT; > > > + phyaddr = virtaddr - vma->vm_start + offset; > > > + pgoff = phyaddr >> PAGE_SHIFT; > > > + req_size = vma->vm_end - virtaddr; > > > + pg_prot = vma->vm_page_prot; > > > + > > > + if (gpu_dev->ops->validate_map_request) { > > > + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff, > > > + &req_size, &pg_prot); > > > + if (ret) > > > + return ret; > > > + > > > + if (!req_size) > > > + return -EINVAL; > > > + } > > > + > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > So not supporting validate_map_request() means that the user can > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > scenario or should this callback be required? It's not clear to me how > > the vendor driver determines what this maps to, do they compare it to > > the physical device's own BAR addresses? > > I didn't quite understand too. Based on earlier discussion, do we need > something like this, or could achieve the purpose just by leveraging > recent sparse mmap support? The reason for faulting in the mmio space, if I recall correctly, is to enable an ordering where the user driver (QEMU) can mmap regions of the device prior to resources being allocated on the host GPU to handle them. Sparse mmap only partially handles that, it's not dynamic. 
With this faulting mechanism, the host GPU doesn't need to commit resources until the mmap is actually accessed. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, May 04, 2016 at 11:06:19AM -0600, Alex Williamson wrote: > On Wed, 4 May 2016 03:23:13 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com] > > > Sent: Wednesday, May 04, 2016 6:43 AM > > > > + > > > > + if (gpu_dev->ops->write) { > > > > + ret = gpu_dev->ops->write(vgpu_dev, > > > > + user_data, > > > > + count, > > > > + vgpu_emul_space_config, > > > > + pos); > > > > + } > > > > + > > > > + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); > > > > > > So write is expected to user_data to allow only the writable bits to be > > > changed? What's really being saved in the vconfig here vs the vendor > > > vgpu driver? It seems like we're only using it to cache the BAR > > > values, but we're not providing the BAR emulation here, which seems > > > like one of the few things we could provide so it's not duplicated in > > > every vendor driver. But then we only need a few u32s to do that, not > > > all of config space. > > > > We can borrow same vconfig emulation from existing vfio-pci driver. > > But doing so doesn't mean that vendor vgpu driver cannot have its > > own vconfig emulation further. vGPU is not like a real device, since > > there may be no physical config space implemented for each vGPU. > > So anyway vendor vGPU driver needs to create/emulate the virtualized > > config space while the way how is created might be vendor specific. > > So better to keep the interface to access raw vconfig space from > > vendor vGPU driver. > > I'm hoping config space will be very simple for a vgpu, so I don't know > that it makes sense to add that complexity early on. Neo/Kirti, what > capabilities do you expect to provide? Who provides the MSI > capability? Is a PCIe capability provided? Others? Currently only standard PCI caps. MSI cap is emulated by the vendor drivers via the above interface. No PCIe caps so far. 
> > > > > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf, > > > > + size_t count, loff_t *ppos, bool iswrite) > > > > +{ > > > > + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); > > > > + struct vfio_vgpu_device *vdev = device_data; > > > > + > > > > + if (index >= VFIO_PCI_NUM_REGIONS) > > > > + return -EINVAL; > > > > + > > > > + switch (index) { > > > > + case VFIO_PCI_CONFIG_REGION_INDEX: > > > > + return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite); > > > > + > > > > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > > > > + return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite); > > > > + > > > > + case VFIO_PCI_ROM_REGION_INDEX: > > > > + case VFIO_PCI_VGA_REGION_INDEX: > > > > > > Wait a sec, who's doing the VGA emulation? We can't be claiming to > > > support a VGA region and then fail to provide read/write access to it > > > like we said it has. > > > > For Intel side we plan to not support VGA region when upstreaming our > > KVMGT work, which means Intel vGPU will be exposed only as a > > secondary graphics card then so legacy VGA is not required. Also no > > VBIOS/ROM requirement. Guess we can remove above two regions. > > So this needs to be optional based on what the mediation driver > provides. It seems like we're just making passthroughs for the vendor > mediation driver to speak vfio. 
> > > > > + > > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > > > > +{ > > > > + int ret = 0; > > > > + struct vfio_vgpu_device *vdev = vma->vm_private_data; > > > > + struct vgpu_device *vgpu_dev; > > > > + struct gpu_device *gpu_dev; > > > > + u64 virtaddr = (u64)vmf->virtual_address; > > > > + u64 offset, phyaddr; > > > > + unsigned long req_size, pgoff; > > > > + pgprot_t pg_prot; > > > > + > > > > + if (!vdev && !vdev->vgpu_dev) > > > > + return -EINVAL; > > > > + > > > > + vgpu_dev = vdev->vgpu_dev; > > > > + gpu_dev = vgpu_dev->gpu_dev; > > > > + > > > > + offset = vma->vm_pgoff << PAGE_SHIFT; > > > > + phyaddr = virtaddr - vma->vm_start + offset; > > > > + pgoff = phyaddr >> PAGE_SHIFT; > > > > + req_size = vma->vm_end - virtaddr; > > > > + pg_prot = vma->vm_page_prot; > > > > + > > > > + if (gpu_dev->ops->validate_map_request) { > > > > + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff, > > > > + &req_size, &pg_prot); > > > > + if (ret) > > > > + return ret; > > > > + > > > > + if (!req_size) > > > > + return -EINVAL; > > > > + } > > > > + > > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > > > So not supporting validate_map_request() means that the user can > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > > scenario or should this callback be required? It's not clear to me how > > > the vendor driver determines what this maps to, do they compare it to > > > the physical device's own BAR addresses? > > > > I didn't quite understand too. Based on earlier discussion, do we need > > something like this, or could achieve the purpose just by leveraging > > recent sparse mmap support? 
> > The reason for faulting in the mmio space, if I recall correctly, is to > enable an ordering where the user driver (QEMU) can mmap regions of the > device prior to resources being allocated on the host GPU to handle > them. Sparse mmap only partially handles that, it's not dynamic. With > this faulting mechanism, the host GPU doesn't need to commit resources > until the mmap is actually accessed. Thanks, Correct. Thanks, Neo > > Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 5/5/2016 2:44 AM, Neo Jia wrote: > On Wed, May 04, 2016 at 11:06:19AM -0600, Alex Williamson wrote: >> On Wed, 4 May 2016 03:23:13 +0000 >> "Tian, Kevin" <kevin.tian@intel.com> wrote: >> >>>> From: Alex Williamson [mailto:alex.williamson@redhat.com] >>>> Sent: Wednesday, May 04, 2016 6:43 AM >>>>> + >>>>> + if (gpu_dev->ops->write) { >>>>> + ret = gpu_dev->ops->write(vgpu_dev, >>>>> + user_data, >>>>> + count, >>>>> + vgpu_emul_space_config, >>>>> + pos); >>>>> + } >>>>> + >>>>> + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); >>>> >>>> So write is expected to user_data to allow only the writable bits to be >>>> changed? What's really being saved in the vconfig here vs the vendor >>>> vgpu driver? It seems like we're only using it to cache the BAR >>>> values, but we're not providing the BAR emulation here, which seems >>>> like one of the few things we could provide so it's not duplicated in >>>> every vendor driver. But then we only need a few u32s to do that, not >>>> all of config space. >>> >>> We can borrow same vconfig emulation from existing vfio-pci driver. >>> But doing so doesn't mean that vendor vgpu driver cannot have its >>> own vconfig emulation further. vGPU is not like a real device, since >>> there may be no physical config space implemented for each vGPU. >>> So anyway vendor vGPU driver needs to create/emulate the virtualized >>> config space while the way how is created might be vendor specific. >>> So better to keep the interface to access raw vconfig space from >>> vendor vGPU driver. >> >> I'm hoping config space will be very simple for a vgpu, so I don't know >> that it makes sense to add that complexity early on. Neo/Kirti, what >> capabilities do you expect to provide? Who provides the MSI >> capability? Is a PCIe capability provided? Others? > From VGPU_VFIO point of view, VGPU_VFIO would not provide or modify any capabilities. Vendor vGPU driver should provide config space. 
Then the vendor driver can provide PCI capabilities or PCIe capabilities; it might also have vendor-specific information. The VGPU_VFIO driver would not intercept that information. > Currently only standard PCI caps. > > MSI cap is emulated by the vendor drivers via the above interface. > > No PCIe caps so far. > The Nvidia vGPU device is a standard PCI device. We tested standard PCI caps. Thanks, Kirti. >> >>>>> +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf, >>>>> + size_t count, loff_t *ppos, bool iswrite) >>>>> +{ >>>>> + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); >>>>> + struct vfio_vgpu_device *vdev = device_data; >>>>> + >>>>> + if (index >= VFIO_PCI_NUM_REGIONS) >>>>> + return -EINVAL; >>>>> + >>>>> + switch (index) { >>>>> + case VFIO_PCI_CONFIG_REGION_INDEX: >>>>> + return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite); >>>>> + >>>>> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: >>>>> + return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite); >>>>> + >>>>> + case VFIO_PCI_ROM_REGION_INDEX: >>>>> + case VFIO_PCI_VGA_REGION_INDEX: >>>> >>>> Wait a sec, who's doing the VGA emulation? We can't be claiming to >>>> support a VGA region and then fail to provide read/write access to it >>>> like we said it has. >>> >>> For Intel side we plan to not support VGA region when upstreaming our >>> KVMGT work, which means Intel vGPU will be exposed only as a >>> secondary graphics card then so legacy VGA is not required. Also no >>> VBIOS/ROM requirement. Guess we can remove above two regions. >> >> So this needs to be optional based on what the mediation driver >> provides. It seems like we're just making passthroughs for the vendor >> mediation driver to speak vfio.
>> >>>>> + >>>>> +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) >>>>> +{ >>>>> + int ret = 0; >>>>> + struct vfio_vgpu_device *vdev = vma->vm_private_data; >>>>> + struct vgpu_device *vgpu_dev; >>>>> + struct gpu_device *gpu_dev; >>>>> + u64 virtaddr = (u64)vmf->virtual_address; >>>>> + u64 offset, phyaddr; >>>>> + unsigned long req_size, pgoff; >>>>> + pgprot_t pg_prot; >>>>> + >>>>> + if (!vdev && !vdev->vgpu_dev) >>>>> + return -EINVAL; >>>>> + >>>>> + vgpu_dev = vdev->vgpu_dev; >>>>> + gpu_dev = vgpu_dev->gpu_dev; >>>>> + >>>>> + offset = vma->vm_pgoff << PAGE_SHIFT; >>>>> + phyaddr = virtaddr - vma->vm_start + offset; >>>>> + pgoff = phyaddr >> PAGE_SHIFT; >>>>> + req_size = vma->vm_end - virtaddr; >>>>> + pg_prot = vma->vm_page_prot; >>>>> + >>>>> + if (gpu_dev->ops->validate_map_request) { >>>>> + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff, >>>>> + &req_size, &pg_prot); >>>>> + if (ret) >>>>> + return ret; >>>>> + >>>>> + if (!req_size) >>>>> + return -EINVAL; >>>>> + } >>>>> + >>>>> + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); >>>> >>>> So not supporting validate_map_request() means that the user can >>>> directly mmap BARs of the host GPU and as shown below, we assume a 1:1 >>>> mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU >>>> scenario or should this callback be required? It's not clear to me how >>>> the vendor driver determines what this maps to, do they compare it to >>>> the physical device's own BAR addresses? >>> >>> I didn't quite understand too. Based on earlier discussion, do we need >>> something like this, or could achieve the purpose just by leveraging >>> recent sparse mmap support? >> >> The reason for faulting in the mmio space, if I recall correctly, is to >> enable an ordering where the user driver (QEMU) can mmap regions of the >> device prior to resources being allocated on the host GPU to handle >> them. 
Sparse mmap only partially handles that, it's not dynamic. With >> this faulting mechanism, the host GPU doesn't need to commit resources >> until the mmap is actually accessed. Thanks, > > Correct. > > Thanks, > Neo > >> >> Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> From: Alex Williamson > Sent: Thursday, May 05, 2016 1:06 AM > > > > + > > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault > *vmf) > > > > +{ > > > > + int ret = 0; > > > > + struct vfio_vgpu_device *vdev = vma->vm_private_data; > > > > + struct vgpu_device *vgpu_dev; > > > > + struct gpu_device *gpu_dev; > > > > + u64 virtaddr = (u64)vmf->virtual_address; > > > > + u64 offset, phyaddr; > > > > + unsigned long req_size, pgoff; > > > > + pgprot_t pg_prot; > > > > + > > > > + if (!vdev && !vdev->vgpu_dev) > > > > + return -EINVAL; > > > > + > > > > + vgpu_dev = vdev->vgpu_dev; > > > > + gpu_dev = vgpu_dev->gpu_dev; > > > > + > > > > + offset = vma->vm_pgoff << PAGE_SHIFT; > > > > + phyaddr = virtaddr - vma->vm_start + offset; > > > > + pgoff = phyaddr >> PAGE_SHIFT; > > > > + req_size = vma->vm_end - virtaddr; > > > > + pg_prot = vma->vm_page_prot; > > > > + > > > > + if (gpu_dev->ops->validate_map_request) { > > > > + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, > &pgoff, > > > > + &req_size, &pg_prot); > > > > + if (ret) > > > > + return ret; > > > > + > > > > + if (!req_size) > > > > + return -EINVAL; > > > > + } > > > > + > > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > > > So not supporting validate_map_request() means that the user can > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > > scenario or should this callback be required? It's not clear to me how > > > the vendor driver determines what this maps to, do they compare it to > > > the physical device's own BAR addresses? > > > > I didn't quite understand too. Based on earlier discussion, do we need > > something like this, or could achieve the purpose just by leveraging > > recent sparse mmap support? 
> > The reason for faulting in the mmio space, if I recall correctly, is to > enable an ordering where the user driver (QEMU) can mmap regions of the > device prior to resources being allocated on the host GPU to handle > them. Sparse mmap only partially handles that, it's not dynamic. With > this faulting mechanism, the host GPU doesn't need to commit resources > until the mmap is actually accessed. Thanks, > > Alex Neo/Kirti, any specific example how above exactly works? I can see difference from sparse mmap based on Alex's explanation, but still cannot map the 1st sentence to a real scenario clearly. Now our side doesn't use such faulting-based method. So I'd like to understand it clearly and then see any value to do same thing for Intel GPU. Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, May 05, 2016 at 09:24:26AM +0000, Tian, Kevin wrote: > > From: Alex Williamson > > Sent: Thursday, May 05, 2016 1:06 AM > > > > > + > > > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault > > *vmf) > > > > > +{ > > > > > + int ret = 0; > > > > > + struct vfio_vgpu_device *vdev = vma->vm_private_data; > > > > > + struct vgpu_device *vgpu_dev; > > > > > + struct gpu_device *gpu_dev; > > > > > + u64 virtaddr = (u64)vmf->virtual_address; > > > > > + u64 offset, phyaddr; > > > > > + unsigned long req_size, pgoff; > > > > > + pgprot_t pg_prot; > > > > > + > > > > > + if (!vdev && !vdev->vgpu_dev) > > > > > + return -EINVAL; > > > > > + > > > > > + vgpu_dev = vdev->vgpu_dev; > > > > > + gpu_dev = vgpu_dev->gpu_dev; > > > > > + > > > > > + offset = vma->vm_pgoff << PAGE_SHIFT; > > > > > + phyaddr = virtaddr - vma->vm_start + offset; > > > > > + pgoff = phyaddr >> PAGE_SHIFT; > > > > > + req_size = vma->vm_end - virtaddr; > > > > > + pg_prot = vma->vm_page_prot; > > > > > + > > > > > + if (gpu_dev->ops->validate_map_request) { > > > > > + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, > > &pgoff, > > > > > + &req_size, &pg_prot); > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > + if (!req_size) > > > > > + return -EINVAL; > > > > > + } > > > > > + > > > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > > > > > So not supporting validate_map_request() means that the user can > > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > > > scenario or should this callback be required? It's not clear to me how > > > > the vendor driver determines what this maps to, do they compare it to > > > > the physical device's own BAR addresses? > > > > > > I didn't quite understand too. 
Based on earlier discussion, do we need > > > something like this, or could achieve the purpose just by leveraging > > > recent sparse mmap support? > > > > The reason for faulting in the mmio space, if I recall correctly, is to > > enable an ordering where the user driver (QEMU) can mmap regions of the > > device prior to resources being allocated on the host GPU to handle > > them. Sparse mmap only partially handles that, it's not dynamic. With > > this faulting mechanism, the host GPU doesn't need to commit resources > > until the mmap is actually accessed. Thanks, > > > > Alex > > Neo/Kirti, any specific example how above exactly works? I can see > difference from sparse mmap based on Alex's explanation, but still > cannot map the 1st sentence to a real scenario clearly. Now our side > doesn't use such faulting-based method. So I'd like to understand it > clearly and then see any value to do same thing for Intel GPU. Hi Kevin, The short answer is: CPU access to GPU resources via the MMIO region. Thanks, Neo > > Thanks > Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> From: Alex Williamson > Sent: Thursday, May 05, 2016 1:06 AM > > > > + > > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > > > So not supporting validate_map_request() means that the user can > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > > scenario or should this callback be required? It's not clear to me how > > > the vendor driver determines what this maps to, do they compare it to > > > the physical device's own BAR addresses? > > > > I didn't quite understand too. Based on earlier discussion, do we need > > something like this, or could achieve the purpose just by leveraging > > recent sparse mmap support? > > The reason for faulting in the mmio space, if I recall correctly, is to > enable an ordering where the user driver (QEMU) can mmap regions of the > device prior to resources being allocated on the host GPU to handle > them. Sparse mmap only partially handles that, it's not dynamic. With > this faulting mechanism, the host GPU doesn't need to commit resources > until the mmap is actually accessed. Thanks, > > Alex Just double confirm. I assume this faulting mechanism can work with sparse mmap, right? Regardless of whether it's a full or partial region, this faulting mechanism would commit resource only when accessed page has MMAP flag set... Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 11 May 2016 06:45:41 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Alex Williamson > > Sent: Thursday, May 05, 2016 1:06 AM > > > > > + > > > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > > > > > So not supporting validate_map_request() means that the user can > > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > > > scenario or should this callback be required? It's not clear to me how > > > > the vendor driver determines what this maps to, do they compare it to > > > > the physical device's own BAR addresses? > > > > > > I didn't quite understand too. Based on earlier discussion, do we need > > > something like this, or could achieve the purpose just by leveraging > > > recent sparse mmap support? > > > > The reason for faulting in the mmio space, if I recall correctly, is to > > enable an ordering where the user driver (QEMU) can mmap regions of the > > device prior to resources being allocated on the host GPU to handle > > them. Sparse mmap only partially handles that, it's not dynamic. With > > this faulting mechanism, the host GPU doesn't need to commit resources > > until the mmap is actually accessed. Thanks, > > > > Alex > > Just double confirm. I assume this faulting mechanism can work with > sparse mmap, right? Regardless of whether it's a full or partial region, > this faulting mechanism would commit resource only when accessed > page has MMAP flag set... Yes, the vfio sparse mmap just solves the problem that a vfio region maps to an entire device resource, for example in the case of vfio-pci, a PCI BAR. It turns out that specifying mmap on a whole region doesn't give us the granularity we need. Sparse mmap gives us a generic way to tell userspace which areas within a region support mmap and which should use read/write access through the vfio device file descriptor. 
The latter allows us to protect specific regions or provide further emulation/virtualization for that sub-area. How the mmap vma is populated for the portions that do support mmap is an orthogonal issue. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> From: Alex Williamson [mailto:alex.williamson@redhat.com] > Sent: Thursday, May 12, 2016 4:11 AM > On Wed, 11 May 2016 06:45:41 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Alex Williamson > > > Sent: Thursday, May 05, 2016 1:06 AM > > > > > > + > > > > > > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > > > > > > > > > > So not supporting validate_map_request() means that the user can > > > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1 > > > > > mapping of vGPU BAR to host GPU BAR. Is that ever valid in a vGPU > > > > > scenario or should this callback be required? It's not clear to me how > > > > > the vendor driver determines what this maps to, do they compare it to > > > > > the physical device's own BAR addresses? > > > > > > > > I didn't quite understand too. Based on earlier discussion, do we need > > > > something like this, or could achieve the purpose just by leveraging > > > > recent sparse mmap support? > > > > > > The reason for faulting in the mmio space, if I recall correctly, is to > > > enable an ordering where the user driver (QEMU) can mmap regions of the > > > device prior to resources being allocated on the host GPU to handle > > > them. Sparse mmap only partially handles that, it's not dynamic. With > > > this faulting mechanism, the host GPU doesn't need to commit resources > > > until the mmap is actually accessed. Thanks, > > > > > > Alex > > > > Just double confirm. I assume this faulting mechanism can work with > > sparse mmap, right? Regardless of whether it's a full or partial region, > > this faulting mechanism would commit resource only when accessed > > page has MMAP flag set... > > Yes, the vfio sparse mmap just solves the problem that a vfio region > maps to an entire device resource, for example in the case of vfio-pci, > a PCI BAR. It turns out that specifying mmap on a whole region doesn't > give us the granularity we need. 
Sparse mmap gives us a generic way to > tell userspace which areas within a region support mmap and which > should use read/write access through the vfio device file descriptor. > The latter allows us to protect specific regions or provide further > emulation/virtualization for that sub-area. How the mmap vma is > populated for the portions that do support mmap is an orthogonal > issue. Thanks, > Exactly! Thanks for the confirmation. Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile index f5be980..a0a2655 100644 --- a/drivers/vgpu/Makefile +++ b/drivers/vgpu/Makefile @@ -2,3 +2,4 @@ vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o obj-$(CONFIG_VGPU) += vgpu.o +obj-$(CONFIG_VGPU_VFIO) += vgpu_vfio.o diff --git a/drivers/vgpu/vgpu_vfio.c b/drivers/vgpu/vgpu_vfio.c new file mode 100644 index 0000000..460a4dc --- /dev/null +++ b/drivers/vgpu/vgpu_vfio.c @@ -0,0 +1,671 @@ +/* + * VGPU VFIO device + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/cdev.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/uuid.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/vgpu.h> + +#include "vgpu_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "VGPU VFIO Driver" + +#define VFIO_PCI_OFFSET_SHIFT 40 + +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +struct vfio_vgpu_device { + struct iommu_group *group; + struct vgpu_device *vgpu_dev; + int refcnt; + struct pci_bar_info bar_info[VFIO_PCI_NUM_REGIONS]; + u8 *vconfig; +}; + +static DEFINE_MUTEX(vfio_vgpu_lock); + +static int get_virtual_bar_info(struct vgpu_device *vgpu_dev, + struct pci_bar_info *bar_info, + int index) +{ + int ret = -1; + struct gpu_device *gpu_dev = vgpu_dev->gpu_dev; + + if 
(gpu_dev->ops->vgpu_bar_info) + ret = gpu_dev->ops->vgpu_bar_info(vgpu_dev, index, bar_info); + return ret; +} + +static int vdev_read_base(struct vfio_vgpu_device *vdev) +{ + int index, pos; + u32 start_lo, start_hi; + u32 mem_type; + + pos = PCI_BASE_ADDRESS_0; + + for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) { + + if (!vdev->bar_info[index].size) + continue; + + start_lo = (*(u32 *)(vdev->vconfig + pos)) & + PCI_BASE_ADDRESS_MEM_MASK; + mem_type = (*(u32 *)(vdev->vconfig + pos)) & + PCI_BASE_ADDRESS_MEM_TYPE_MASK; + + switch (mem_type) { + case PCI_BASE_ADDRESS_MEM_TYPE_64: + start_hi = (*(u32 *)(vdev->vconfig + pos + 4)); + pos += 4; + break; + case PCI_BASE_ADDRESS_MEM_TYPE_32: + case PCI_BASE_ADDRESS_MEM_TYPE_1M: + /* 1M mem BAR treated as 32-bit BAR */ + default: + /* mem unknown type treated as 32-bit BAR */ + start_hi = 0; + break; + } + pos += 4; + vdev->bar_info[index].start = ((u64)start_hi << 32) | start_lo; + } + return 0; +} + +static int vgpu_dev_open(void *device_data) +{ + int ret = 0; + struct vfio_vgpu_device *vdev = device_data; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&vfio_vgpu_lock); + + if (!vdev->refcnt) { + u8 *vconfig; + int vconfig_size, index; + + for (index = 0; index < VFIO_PCI_NUM_REGIONS; index++) { + ret = get_virtual_bar_info(vdev->vgpu_dev, + &vdev->bar_info[index], + index); + if (ret) + goto open_error; + } + vconfig_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size; + if (!vconfig_size) + goto open_error; + + vconfig = kzalloc(vconfig_size, GFP_KERNEL); + if (!vconfig) { + ret = -ENOMEM; + goto open_error; + } + + vdev->vconfig = vconfig; + } + + vdev->refcnt++; +open_error: + + mutex_unlock(&vfio_vgpu_lock); + + if (ret) + module_put(THIS_MODULE); + + return ret; +} + +static void vgpu_dev_close(void *device_data) +{ + struct vfio_vgpu_device *vdev = device_data; + + mutex_lock(&vfio_vgpu_lock); + + vdev->refcnt--; + if (!vdev->refcnt) { + memset(&vdev->bar_info, 0, 
sizeof(vdev->bar_info)); + if (vdev->vconfig) + kfree(vdev->vconfig); + } + + mutex_unlock(&vfio_vgpu_lock); + + module_put(THIS_MODULE); +} + +static int vgpu_get_irq_count(struct vfio_vgpu_device *vdev, int irq_type) +{ + // Don't support MSIX for now + if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) + return -1; + + return 1; +} + +static long vgpu_dev_unlocked_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + int ret = 0; + struct vfio_vgpu_device *vdev = device_data; + unsigned long minsz; + + switch (cmd) + { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + printk(KERN_INFO "%s VFIO_DEVICE_GET_INFO cmd index ", __FUNCTION__); + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_PCI; + info.num_regions = VFIO_PCI_NUM_REGIONS; + info.num_irqs = VFIO_PCI_NUM_IRQS; + + return copy_to_user((void __user *)arg, &info, minsz); + } + + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + printk(KERN_INFO "%s VFIO_DEVICE_GET_REGION_INFO cmd for region_index %d", __FUNCTION__, info.index); + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vdev->bar_info[info.index].size; + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = vdev->bar_info[info.index].flags; + break; + case VFIO_PCI_VGA_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + + case VFIO_PCI_ROM_REGION_INDEX: + default: + return -EINVAL; + } + + return copy_to_user((void __user *)arg, &info, minsz); + + } + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + printk(KERN_INFO "%s VFIO_DEVICE_GET_IRQ_INFO cmd", __FUNCTION__); + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + /* pass thru to return error */ + case VFIO_PCI_MSIX_IRQ_INDEX: + default: + return -EINVAL; + } + + info.count = VFIO_PCI_NUM_IRQS; + + info.flags = VFIO_IRQ_INFO_EVENTFD; + info.count = vgpu_get_irq_count(vdev, info.index); + + if (info.count == -1) + return -EINVAL; + + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; + + return copy_to_user((void __user *)arg, &info, minsz); + } + + case VFIO_DEVICE_SET_IRQS: + { + struct vfio_irq_set hdr; + struct gpu_device *gpu_dev = vdev->vgpu_dev->gpu_dev; + u8 *data = NULL; + int ret = 0; + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || + hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK)) + return -EINVAL; + + if (!(hdr.flags 
& VFIO_IRQ_SET_DATA_NONE)) { + size_t size; + int max = vgpu_get_irq_count(vdev, hdr.index); + + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) + size = sizeof(uint8_t); + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) + size = sizeof(int32_t); + else + return -EINVAL; + + if (hdr.argsz - minsz < hdr.count * size || + hdr.start >= max || hdr.start + hdr.count > max) + return -EINVAL; + + data = memdup_user((void __user *)(arg + minsz), + hdr.count * size); + if (IS_ERR(data)) + return PTR_ERR(data); + + } + + if (gpu_dev->ops->vgpu_set_irqs) { + ret = gpu_dev->ops->vgpu_set_irqs(vdev->vgpu_dev, + hdr.flags, + hdr.index, hdr.start, + hdr.count, data); + } + kfree(data); + return ret; + } + + default: + return -EINVAL; + } + return ret; +} + +ssize_t vgpu_dev_config_rw(struct vfio_vgpu_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + struct vgpu_device *vgpu_dev = vdev->vgpu_dev; + struct gpu_device *gpu_dev = vgpu_dev->gpu_dev; + int cfg_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size; + int ret = 0; + uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (pos < 0 || pos >= cfg_size || + pos + count > cfg_size) { + printk(KERN_ERR "%s pos 0x%llx out of range\n", __FUNCTION__, pos); + ret = -EFAULT; + goto config_rw_exit; + } + + if (iswrite) { + char *user_data = kmalloc(count, GFP_KERNEL); + + if (user_data == NULL) { + ret = -ENOMEM; + goto config_rw_exit; + } + + if (copy_from_user(user_data, buf, count)) { + ret = -EFAULT; + kfree(user_data); + goto config_rw_exit; + } + + if (gpu_dev->ops->write) { + ret = gpu_dev->ops->write(vgpu_dev, + user_data, + count, + vgpu_emul_space_config, + pos); + } + + memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count); + kfree(user_data); + } + else + { + char *ret_data = kzalloc(count, GFP_KERNEL); + + if (ret_data == NULL) { + ret = -ENOMEM; + goto config_rw_exit; + } + + if (gpu_dev->ops->read) { + ret = gpu_dev->ops->read(vgpu_dev, + ret_data, + count, + vgpu_emul_space_config, + 
pos); + } + + if (ret > 0 ) { + if (copy_to_user(buf, ret_data, ret)) { + ret = -EFAULT; + kfree(ret_data); + goto config_rw_exit; + } + + memcpy((void *)(vdev->vconfig + pos), (void *)ret_data, count); + } + kfree(ret_data); + } +config_rw_exit: + return ret; +} + +ssize_t vgpu_dev_bar_rw(struct vfio_vgpu_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + struct vgpu_device *vgpu_dev = vdev->vgpu_dev; + struct gpu_device *gpu_dev = vgpu_dev->gpu_dev; + loff_t offset = *ppos & VFIO_PCI_OFFSET_MASK; + loff_t pos; + int bar_index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret = 0; + + if (!vdev->bar_info[bar_index].start) { + ret = vdev_read_base(vdev); + if (ret) + goto bar_rw_exit; + } + + if (offset >= vdev->bar_info[bar_index].size) { + ret = -EINVAL; + goto bar_rw_exit; + } + + pos = vdev->bar_info[bar_index].start + offset; + if (iswrite) { + char *user_data = kmalloc(count, GFP_KERNEL); + + if (user_data == NULL) { + ret = -ENOMEM; + goto bar_rw_exit; + } + + if (copy_from_user(user_data, buf, count)) { + ret = -EFAULT; + kfree(user_data); + goto bar_rw_exit; + } + + if (gpu_dev->ops->write) { + ret = gpu_dev->ops->write(vgpu_dev, + user_data, + count, + vgpu_emul_space_mmio, + pos); + } + + kfree(user_data); + } + else + { + char *ret_data = kmalloc(count, GFP_KERNEL); + + if (ret_data == NULL) { + ret = -ENOMEM; + goto bar_rw_exit; + } + + memset(ret_data, 0, count); + + if (gpu_dev->ops->read) { + ret = gpu_dev->ops->read(vgpu_dev, + ret_data, + count, + vgpu_emul_space_mmio, + pos); + } + + if (ret > 0 ) { + if (copy_to_user(buf, ret_data, ret)) { + ret = -EFAULT; + } + } + kfree(ret_data); + } + +bar_rw_exit: + return ret; +} + + +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct vfio_vgpu_device *vdev = device_data; + + if (index >= VFIO_PCI_NUM_REGIONS) + return -EINVAL; + + switch (index) { + case 
VFIO_PCI_CONFIG_REGION_INDEX: + return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + break; + } + + return -EINVAL; +} + + +static ssize_t vgpu_dev_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + int ret = 0; + + if (count) + ret = vgpu_dev_rw(device_data, buf, count, ppos, false); + + return ret; +} + +static ssize_t vgpu_dev_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + int ret = 0; + + if (count) + ret = vgpu_dev_rw(device_data, (char *)buf, count, ppos, true); + + return ret; +} + +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int ret = 0; + struct vfio_vgpu_device *vdev = vma->vm_private_data; + struct vgpu_device *vgpu_dev; + struct gpu_device *gpu_dev; + u64 virtaddr = (u64)vmf->virtual_address; + u64 offset, phyaddr; + unsigned long req_size, pgoff; + pgprot_t pg_prot; + + if (!vdev && !vdev->vgpu_dev) + return -EINVAL; + + vgpu_dev = vdev->vgpu_dev; + gpu_dev = vgpu_dev->gpu_dev; + + offset = vma->vm_pgoff << PAGE_SHIFT; + phyaddr = virtaddr - vma->vm_start + offset; + pgoff = phyaddr >> PAGE_SHIFT; + req_size = vma->vm_end - virtaddr; + pg_prot = vma->vm_page_prot; + + if (gpu_dev->ops->validate_map_request) { + ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff, + &req_size, &pg_prot); + if (ret) + return ret; + + if (!req_size) + return -EINVAL; + } + + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); + + return ret | VM_FAULT_NOPAGE; +} + +static const struct vm_operations_struct vgpu_dev_mmio_ops = { + .fault = vgpu_dev_mmio_fault, +}; + + +static int vgpu_dev_mmap(void *device_data, struct vm_area_struct *vma) +{ + unsigned int index; + struct vfio_vgpu_device *vdev = device_data; + struct vgpu_device *vgpu_dev 
= vdev->vgpu_dev; + struct pci_dev *pdev = vgpu_dev->gpu_dev->dev; + unsigned long pgoff; + + loff_t offset = vma->vm_pgoff << PAGE_SHIFT; + + index = VFIO_PCI_OFFSET_TO_INDEX(offset); + + if (index >= VFIO_PCI_ROM_REGION_INDEX) + return -EINVAL; + + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + + vma->vm_private_data = vdev; + vma->vm_ops = &vgpu_dev_mmio_ops; + + return 0; +} + +static const struct vfio_device_ops vgpu_vfio_dev_ops = { + .name = "vfio-vgpu", + .open = vgpu_dev_open, + .release = vgpu_dev_close, + .ioctl = vgpu_dev_unlocked_ioctl, + .read = vgpu_dev_read, + .write = vgpu_dev_write, + .mmap = vgpu_dev_mmap, +}; + +int vgpu_vfio_probe(struct device *dev) +{ + struct vfio_vgpu_device *vdev; + struct vgpu_device *vgpu_dev = to_vgpu_device(dev); + int ret = 0; + + if (vgpu_dev == NULL) + return -EINVAL; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) { + return -ENOMEM; + } + + vdev->vgpu_dev = vgpu_dev; + vdev->group = vgpu_dev->group; + + ret = vfio_add_group_dev(dev, &vgpu_vfio_dev_ops, vdev); + if (ret) + kfree(vdev); + + printk(KERN_INFO "%s ret = %d\n", __FUNCTION__, ret); + return ret; +} + +void vgpu_vfio_remove(struct device *dev) +{ + struct vfio_vgpu_device *vdev; + + printk(KERN_INFO "%s \n", __FUNCTION__); + vdev = vfio_del_group_dev(dev); + if (vdev) { + printk(KERN_INFO "%s vdev being freed\n", __FUNCTION__); + kfree(vdev); + } +} + +struct vgpu_driver vgpu_vfio_driver = { + .name = "vgpu-vfio", + .probe = vgpu_vfio_probe, + .remove = vgpu_vfio_remove, +}; + +static int __init vgpu_vfio_init(void) +{ + printk(KERN_INFO "%s \n", __FUNCTION__); + return vgpu_register_driver(&vgpu_vfio_driver, THIS_MODULE); +} + +static void __exit vgpu_vfio_exit(void) +{ + printk(KERN_INFO "%s \n", __FUNCTION__); + vgpu_unregister_driver(&vgpu_vfio_driver); +} + +module_init(vgpu_vfio_init) +module_exit(vgpu_vfio_exit) + 
+MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC);