Message ID | 1470251034-1555-3-git-send-email-kwankhede@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Kirti, [auto build test WARNING on vfio/next] [also build test WARNING on v4.7 next-20160803] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Kirti-Wankhede/Add-Mediated-device-support/20160804-032209 base: https://github.com/awilliam/linux-vfio.git next config: i386-allmodconfig (attached as .config) compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705 reproduce: # save the attached .config to linux build tree make ARCH=i386 All warnings (new ones prefixed by >>): drivers/vfio/mdev/vfio_mpci.c: In function 'mdev_dev_mmio_fault': >> drivers/vfio/mdev/vfio_mpci.c:384:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] u64 virtaddr = (u64)vmf->virtual_address; ^ In file included from drivers/vfio/mdev/vfio_mpci.c:19:0: >> include/linux/vfio.h:23:46: warning: right shift count >= width of type [-Wshift-count-overflow] #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) ^ >> drivers/vfio/mdev/vfio_mpci.c:424:11: note: in expansion of macro 'VFIO_PCI_OFFSET_TO_INDEX' index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); ^~~~~~~~~~~~~~~~~~~~~~~~ vim +384 drivers/vfio/mdev/vfio_mpci.c 378 static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 379 { 380 int ret; 381 struct vfio_mdev *vmdev = vma->vm_private_data; 382 struct mdev_device *mdev; 383 struct parent_device *parent; > 384 u64 virtaddr = (u64)vmf->virtual_address; 385 unsigned long req_size, pgoff = 0; 386 pgprot_t pg_prot; 387 unsigned int index; 388 389 if (!vmdev && !vmdev->mdev) 390 return -EINVAL; 391 392 mdev = vmdev->mdev; 393 parent = mdev->parent; 394 395 pg_prot = vma->vm_page_prot; 396 397 if (parent->ops->validate_map_request) { 398 u64 offset; 399 loff_t pos; 400 401 offset = virtaddr - vma->vm_start; 402 req_size = vma->vm_end - virtaddr; 403 pos = (vma->vm_pgoff << PAGE_SHIFT) + offset; 404 405 ret = parent->ops->validate_map_request(mdev, pos, &virtaddr, 406 &pgoff, &req_size, &pg_prot); 407 if (ret) 408 return ret; 409 410 /* 411 * Verify pgoff and req_size are valid and virtaddr is within 412 * vma range 413 */ 414 if (!pgoff || !req_size || (virtaddr < vma->vm_start) || 415 ((virtaddr + req_size) >= vma->vm_end)) 416 return -EINVAL; 417 } else { 418 struct pci_dev *pdev; 419 420 virtaddr = vma->vm_start; 421 req_size = vma->vm_end - vma->vm_start; 422 423 pdev = to_pci_dev(parent->dev); > 424 index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); 425 pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT; 426 } 427 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Kirti, [auto build test WARNING on vfio/next] [also build test WARNING on v4.7 next-20160803] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Kirti-Wankhede/Add-Mediated-device-support/20160804-032209 base: https://github.com/awilliam/linux-vfio.git next config: i386-allyesconfig (attached as .config) compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705 reproduce: # save the attached .config to linux build tree make ARCH=i386 All warnings (new ones prefixed by >>): drivers/vfio/mdev/vfio_mpci.c: In function 'mdev_dev_mmio_fault': drivers/vfio/mdev/vfio_mpci.c:384:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] u64 virtaddr = (u64)vmf->virtual_address; ^ >> drivers/vfio/mdev/vfio_mpci.c:424:32: warning: right shift count >= width of type [-Wshift-count-overflow] index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); ^~ vim +424 drivers/vfio/mdev/vfio_mpci.c 378 static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 379 { 380 int ret; 381 struct vfio_mdev *vmdev = vma->vm_private_data; 382 struct mdev_device *mdev; 383 struct parent_device *parent; > 384 u64 virtaddr = (u64)vmf->virtual_address; 385 unsigned long req_size, pgoff = 0; 386 pgprot_t pg_prot; 387 unsigned int index; 388 389 if (!vmdev && !vmdev->mdev) 390 return -EINVAL; 391 392 mdev = vmdev->mdev; 393 parent = mdev->parent; 394 395 pg_prot = vma->vm_page_prot; 396 397 if (parent->ops->validate_map_request) { 398 u64 offset; 399 loff_t pos; 400 401 offset = virtaddr - vma->vm_start; 402 req_size = vma->vm_end - virtaddr; 403 pos = (vma->vm_pgoff << PAGE_SHIFT) + offset; 404 405 ret = parent->ops->validate_map_request(mdev, pos, &virtaddr, 406 &pgoff, &req_size, &pg_prot); 407 if (ret) 408 return ret; 409 410 /* 411 * Verify pgoff and req_size are valid and virtaddr is within 412 * vma range 413 */ 414 if (!pgoff || !req_size || (virtaddr < vma->vm_start) || 415 ((virtaddr + req_size) >= vma->vm_end)) 416 return -EINVAL; 417 } else { 418 struct pci_dev *pdev; 419 420 virtaddr = vma->vm_start; 421 req_size = vma->vm_end - vma->vm_start; 422 423 pdev = to_pci_dev(parent->dev); > 424 index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); 425 pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT; 426 } 427 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Thu, 4 Aug 2016 00:33:52 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > MPCI VFIO driver registers with MDEV core driver. MDEV core driver creates > mediated device and calls probe routine of MPCI VFIO driver. This driver > adds mediated device to VFIO core module. > Main aim of this module is to manage all VFIO APIs for each mediated PCI > device. Those are: > - get region information from vendor driver. > - trap and emulate PCI config space and BAR region. > - Send interrupt configuration information to vendor driver. > - Device reset > - mmap mappable region with invalidate mapping and fault on access to > remap pfns. If validate_map_request() is not provided by vendor driver, > fault handler maps physical devices region. > - Add and delete mappable region's physical mappings to mdev's mapping > tracking logic. > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com> > Signed-off-by: Neo Jia <cjia@nvidia.com> > Change-Id: I583f4734752971d3d112324d69e2508c88f359ec > --- > drivers/vfio/mdev/Kconfig | 6 + > drivers/vfio/mdev/Makefile | 1 + > drivers/vfio/mdev/vfio_mpci.c | 536 ++++++++++++++++++++++++++++++++++++ > drivers/vfio/pci/vfio_pci_private.h | 6 - > drivers/vfio/pci/vfio_pci_rdwr.c | 1 + > include/linux/vfio.h | 7 + > 6 files changed, 551 insertions(+), 6 deletions(-) > create mode 100644 drivers/vfio/mdev/vfio_mpci.c > > diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig > index a34fbc66f92f..431ed595c8da 100644 > --- a/drivers/vfio/mdev/Kconfig > +++ b/drivers/vfio/mdev/Kconfig > @@ -9,4 +9,10 @@ config VFIO_MDEV > > If you don't know what do here, say N. > > +config VFIO_MPCI > + tristate "VFIO support for Mediated PCI devices" > + depends on VFIO && PCI && VFIO_MDEV > + default n > + help > + VFIO based driver for mediated PCI devices. > > diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile > index 56a75e689582..264fb03dd0e3 100644 > --- a/drivers/vfio/mdev/Makefile > +++ b/drivers/vfio/mdev/Makefile > @@ -2,4 +2,5 @@ > mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o > > obj-$(CONFIG_VFIO_MDEV) += mdev.o > +obj-$(CONFIG_VFIO_MPCI) += vfio_mpci.o > > diff --git a/drivers/vfio/mdev/vfio_mpci.c b/drivers/vfio/mdev/vfio_mpci.c > new file mode 100644 > index 000000000000..9da94b76ae3e > --- /dev/null > +++ b/drivers/vfio/mdev/vfio_mpci.c > @@ -0,0 +1,536 @@ > +/* > + * VFIO based Mediated PCI device driver > + * > + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. > + * Author: Neo Jia <cjia@nvidia.com> > + * Kirti Wankhede <kwankhede@nvidia.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/device.h> > +#include <linux/kernel.h> > +#include <linux/slab.h> > +#include <linux/uuid.h> > +#include <linux/vfio.h> > +#include <linux/iommu.h> > +#include <linux/mdev.h> > + > +#include "mdev_private.h" > + > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "NVIDIA Corporation" > +#define DRIVER_DESC "VFIO based Mediated PCI device driver" > + > +struct vfio_mdev { > + struct iommu_group *group; > + struct mdev_device *mdev; > + int refcnt; > + struct vfio_region_info vfio_region_info[VFIO_PCI_NUM_REGIONS]; > + struct mutex vfio_mdev_lock; > +}; > + > +static int vfio_mpci_open(void *device_data) > +{ > + int ret = 0; > + struct vfio_mdev *vmdev = device_data; > + struct parent_device *parent = vmdev->mdev->parent; > + > + if (!try_module_get(THIS_MODULE)) > + return -ENODEV; > + > + mutex_lock(&vmdev->vfio_mdev_lock); > + if (!vmdev->refcnt && parent->ops->get_region_info) { > + int index; > + > + for (index = VFIO_PCI_BAR0_REGION_INDEX; > + index < VFIO_PCI_NUM_REGIONS; index++) { > + ret = parent->ops->get_region_info(vmdev->mdev, index, > + &vmdev->vfio_region_info[index]); > + if (ret) > + goto open_error; > + } > + } > + > + vmdev->refcnt++; > + > +open_error: > + mutex_unlock(&vmdev->vfio_mdev_lock); > + if (ret) > + module_put(THIS_MODULE); > + > + return ret; > +} > + > +static void vfio_mpci_close(void *device_data) > +{ > + struct vfio_mdev *vmdev = device_data; > + > + mutex_lock(&vmdev->vfio_mdev_lock); > + vmdev->refcnt--; > + if (!vmdev->refcnt) { > + memset(&vmdev->vfio_region_info, 0, > + sizeof(vmdev->vfio_region_info)); > + } > + mutex_unlock(&vmdev->vfio_mdev_lock); > + module_put(THIS_MODULE); > +} > + > +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability) > +{ > + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); This creates a fixed ABI between vfio-mdev-pci and vendor drivers that a given region starts at a pre-defined offset. We have the offset stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset, use it. It's just as unacceptable to impose this fixed relationship with a vendor driver here as if a userspace driver were to do the same. > + struct parent_device *parent = mdev->parent; > + u16 status; > + u8 cap_ptr, cap_id = 0xff; > + > + parent->ops->read(mdev, (char *)&status, sizeof(status), > + pos + PCI_STATUS); > + if (!(status & PCI_STATUS_CAP_LIST)) > + return 0; > + > + parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr), > + pos + PCI_CAPABILITY_LIST); > + > + do { > + cap_ptr &= 0xfc; > + parent->ops->read(mdev, &cap_id, sizeof(cap_id), > + pos + cap_ptr + PCI_CAP_LIST_ID); > + if (cap_id == capability) > + return cap_ptr; > + parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr), > + pos + cap_ptr + PCI_CAP_LIST_NEXT); > + } while (cap_ptr && cap_id != 0xff); > + > + return 0; > +} > + > +static int mpci_get_irq_count(struct vfio_mdev *vmdev, int irq_type) > +{ > + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); > + struct mdev_device *mdev = vmdev->mdev; > + struct parent_device *parent = mdev->parent; > + > + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { > + u8 pin; > + > + parent->ops->read(mdev, &pin, sizeof(pin), > + pos + PCI_INTERRUPT_PIN); > + if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin) > + return 1; > + > + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { > + u8 cap_ptr; > + u16 flags; > + > + cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSI); > + if (cap_ptr) { > + parent->ops->read(mdev, (char *)&flags, sizeof(flags), > + pos + cap_ptr + PCI_MSI_FLAGS); > + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); > + } > + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { > + u8 cap_ptr; > + u16 flags; > + > + cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSIX); > + if (cap_ptr) { > + parent->ops->read(mdev, (char *)&flags, sizeof(flags), > + pos + cap_ptr + PCI_MSIX_FLAGS); > + > + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; > + } > + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { > + u8 cap_ptr; > + > + cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_EXP); > + if (cap_ptr) > + return 1; > + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { > + return 1; > + } Much better than previous versions, but use the region_info provided by the vendor driver. Maybe you want helpers such as mpci_config_{read,write}{b,w,l}. > + > + return 0; > +} > + > +static long vfio_mpci_unlocked_ioctl(void *device_data, > + unsigned int cmd, unsigned long arg) > +{ > + int ret = 0; > + struct vfio_mdev *vmdev = device_data; > + unsigned long minsz; > + > + switch (cmd) { > + case VFIO_DEVICE_GET_INFO: > + { > + struct vfio_device_info info; > + struct parent_device *parent = vmdev->mdev->parent; > + > + minsz = offsetofend(struct vfio_device_info, num_irqs); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + info.flags = VFIO_DEVICE_FLAGS_PCI; > + > + if (parent->ops->reset) > + info.flags |= VFIO_DEVICE_FLAGS_RESET; > + > + info.num_regions = VFIO_PCI_NUM_REGIONS; > + info.num_irqs = VFIO_PCI_NUM_IRQS; > + > + return copy_to_user((void __user *)arg, &info, minsz); > + } > + case VFIO_DEVICE_GET_REGION_INFO: > + { > + struct vfio_region_info info; > + > + minsz = offsetofend(struct vfio_region_info, offset); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + switch (info.index) { > + case VFIO_PCI_CONFIG_REGION_INDEX: > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); No, vmdev->vfio_region_info[info.index].offset > + info.size = vmdev->vfio_region_info[info.index].size; > + if (!info.size) { > + info.flags = 0; > + break; > + } > + > + info.flags = vmdev->vfio_region_info[info.index].flags; > + break; > + case VFIO_PCI_VGA_REGION_INDEX: > + case VFIO_PCI_ROM_REGION_INDEX: Why? Let the vendor driver decide. > + default: > + return -EINVAL; > + } > + > + return copy_to_user((void __user *)arg, &info, minsz); > + } > + case VFIO_DEVICE_GET_IRQ_INFO: > + { > + struct vfio_irq_info info; > + > + minsz = offsetofend(struct vfio_irq_info, count); > + > + if (copy_from_user(&info, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) > + return -EINVAL; > + > + switch (info.index) { > + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: > + case VFIO_PCI_REQ_IRQ_INDEX: > + break; > + /* pass thru to return error */ > + case VFIO_PCI_MSIX_IRQ_INDEX: ??? > + default: > + return -EINVAL; > + } > + > + info.flags = VFIO_IRQ_INFO_EVENTFD; > + info.count = mpci_get_irq_count(vmdev, info.index); > + > + if (info.count == -1) > + return -EINVAL; > + > + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) > + info.flags |= (VFIO_IRQ_INFO_MASKABLE | > + VFIO_IRQ_INFO_AUTOMASKED); > + else > + info.flags |= VFIO_IRQ_INFO_NORESIZE; > + > + return copy_to_user((void __user *)arg, &info, minsz); > + } > + case VFIO_DEVICE_SET_IRQS: > + { > + struct vfio_irq_set hdr; > + struct mdev_device *mdev = vmdev->mdev; > + struct parent_device *parent = mdev->parent; > + u8 *data = NULL, *ptr = NULL; > + > + minsz = offsetofend(struct vfio_irq_set, count); > + > + if (copy_from_user(&hdr, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || > + hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | > + VFIO_IRQ_SET_ACTION_TYPE_MASK)) > + return -EINVAL; > + > + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { > + size_t size; > + int max = mpci_get_irq_count(vmdev, hdr.index); > + > + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) > + size = sizeof(uint8_t); > + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) > + size = sizeof(int32_t); > + else > + return -EINVAL; > + > + if (hdr.argsz - minsz < hdr.count * size || > + hdr.start >= max || hdr.start + hdr.count > max) > + return -EINVAL; > + > + ptr = data = memdup_user((void __user *)(arg + minsz), > + hdr.count * size); > + if (IS_ERR(data)) > + return PTR_ERR(data); > + } > + > + if (parent->ops->set_irqs) > + ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index, > + hdr.start, hdr.count, data); > + > + kfree(ptr); > + return ret; Return success if no set_irqs callback? > + } > + case VFIO_DEVICE_RESET: > + { > + struct parent_device *parent = vmdev->mdev->parent; > + > + if (parent->ops->reset) > + return parent->ops->reset(vmdev->mdev); > + > + return -EINVAL; > + } > + } > + return -ENOTTY; > +} > + > +static ssize_t vfio_mpci_read(void *device_data, char __user *buf, > + size_t count, loff_t *ppos) > +{ > + struct vfio_mdev *vmdev = device_data; > + struct mdev_device *mdev = vmdev->mdev; > + struct parent_device *parent = mdev->parent; > + int ret = 0; > + > + if (!count) > + return 0; > + > + if (parent->ops->read) { > + char *ret_data, *ptr; > + > + ptr = ret_data = kzalloc(count, GFP_KERNEL); Do we really need to support arbitrary lengths in one shot? Seems like we could just use a 4 or 8 byte variable on the stack and iterate until done. > + > + if (!ret_data) > + return -ENOMEM; > + > + ret = parent->ops->read(mdev, ret_data, count, *ppos); > + > + if (ret > 0) { > + if (copy_to_user(buf, ret_data, ret)) > + ret = -EFAULT; > + else > + *ppos += ret; > + } > + kfree(ptr); > + } > + > + return ret; > +} > + > +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf, > + size_t count, loff_t *ppos) > +{ > + struct vfio_mdev *vmdev = device_data; > + struct mdev_device *mdev = vmdev->mdev; > + struct parent_device *parent = mdev->parent; > + int ret = 0; > + > + if (!count) > + return 0; > + > + if (parent->ops->write) { > + char *usr_data, *ptr; > + > + ptr = usr_data = memdup_user(buf, count); Same here, how much do we care to let the user write in one pass and is there any advantage to it? When QEMU is our userspace we're only likely to see 4-byte accesses anyway. > + if (IS_ERR(usr_data)) > + return PTR_ERR(usr_data); > + > + ret = parent->ops->write(mdev, usr_data, count, *ppos); > + > + if (ret > 0) > + *ppos += ret; > + > + kfree(ptr); > + } > + > + return ret; > +} > + > +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > +{ > + int ret; > + struct vfio_mdev *vmdev = vma->vm_private_data; > + struct mdev_device *mdev; > + struct parent_device *parent; > + u64 virtaddr = (u64)vmf->virtual_address; > + unsigned long req_size, pgoff = 0; > + pgprot_t pg_prot; > + unsigned int index; > + > + if (!vmdev && !vmdev->mdev) > + return -EINVAL; > + > + mdev = vmdev->mdev; > + parent = mdev->parent; > + > + pg_prot = vma->vm_page_prot; > + > + if (parent->ops->validate_map_request) { > + u64 offset; > + loff_t pos; > + > + offset = virtaddr - vma->vm_start; > + req_size = vma->vm_end - virtaddr; > + pos = (vma->vm_pgoff << PAGE_SHIFT) + offset; > + > + ret = parent->ops->validate_map_request(mdev, pos, &virtaddr, > + &pgoff, &req_size, &pg_prot); > + if (ret) > + return ret; > + > + /* > + * Verify pgoff and req_size are valid and virtaddr is within > + * vma range > + */ > + if (!pgoff || !req_size || (virtaddr < vma->vm_start) || > + ((virtaddr + req_size) >= vma->vm_end)) > + return -EINVAL; > + } else { > + struct pci_dev *pdev; > + > + virtaddr = vma->vm_start; > + req_size = vma->vm_end - vma->vm_start; > + > + pdev = to_pci_dev(parent->dev); > + index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); Iterate through region_info[*].offset/size provided by vendor driver. > + pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT; > + } > + > + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); > + > + return ret | VM_FAULT_NOPAGE; > +} > + > +void mdev_dev_mmio_close(struct vm_area_struct *vma) > +{ > + struct vfio_mdev *vmdev = vma->vm_private_data; > + struct mdev_device *mdev = vmdev->mdev; > + > + mdev_del_phys_mapping(mdev, vma->vm_pgoff << PAGE_SHIFT); > +} > + > +static const struct vm_operations_struct mdev_dev_mmio_ops = { > + .fault = mdev_dev_mmio_fault, > + .close = mdev_dev_mmio_close, > +}; > + > +static int vfio_mpci_mmap(void *device_data, struct vm_area_struct *vma) > +{ > + unsigned int index; > + struct vfio_mdev *vmdev = device_data; > + struct mdev_device *mdev = vmdev->mdev; > + > + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); > + > + if (index >= VFIO_PCI_ROM_REGION_INDEX) > + return -EINVAL; > + > + vma->vm_private_data = vmdev; > + vma->vm_ops = &mdev_dev_mmio_ops; > + > + return mdev_add_phys_mapping(mdev, vma->vm_file->f_mapping, > + vma->vm_pgoff << PAGE_SHIFT, > + vma->vm_end - vma->vm_start); > +} > + > +static const struct vfio_device_ops vfio_mpci_dev_ops = { > + .name = "vfio-mpci", > + .open = vfio_mpci_open, > + .release = vfio_mpci_close, > + .ioctl = vfio_mpci_unlocked_ioctl, > + .read = vfio_mpci_read, > + .write = vfio_mpci_write, > + .mmap = vfio_mpci_mmap, > +}; > + > +int vfio_mpci_probe(struct device *dev) > +{ > + struct vfio_mdev *vmdev; > + struct mdev_device *mdev = to_mdev_device(dev); > + int ret; > + > + vmdev = kzalloc(sizeof(*vmdev), GFP_KERNEL); > + if (IS_ERR(vmdev)) > + return PTR_ERR(vmdev); > + > + vmdev->mdev = mdev_get_device(mdev); > + vmdev->group = mdev->group; > + mutex_init(&vmdev->vfio_mdev_lock); > + > + ret = vfio_add_group_dev(dev, &vfio_mpci_dev_ops, vmdev); > + if (ret) > + kfree(vmdev); > + > + mdev_put_device(mdev); > + return ret; > +} > + > +void vfio_mpci_remove(struct device *dev) > +{ > + struct vfio_mdev *vmdev; > + > + vmdev = vfio_del_group_dev(dev); > + kfree(vmdev); > +} > + > +int vfio_mpci_match(struct device *dev) > +{ > + if (dev_is_pci(dev->parent)) This is the wrong test, there's really no requirement that a pci mdev device is hosted by a real pci device. Can't we check that the device is on an mdev_pci_bus_type? > + return 1; > + > + return 0; > +} > + > +struct mdev_driver vfio_mpci_driver = { > + .name = "vfio_mpci", > + .probe = vfio_mpci_probe, > + .remove = vfio_mpci_remove, > + .match = vfio_mpci_match, > +}; > + > +static int __init vfio_mpci_init(void) > +{ > + return mdev_register_driver(&vfio_mpci_driver, THIS_MODULE); > +} > + > +static void __exit vfio_mpci_exit(void) > +{ > + mdev_unregister_driver(&vfio_mpci_driver); > +} > + > +module_init(vfio_mpci_init) > +module_exit(vfio_mpci_exit) > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > index 8a7d546d18a0..04a450908ffb 100644 > --- a/drivers/vfio/pci/vfio_pci_private.h > +++ b/drivers/vfio/pci/vfio_pci_private.h > @@ -19,12 +19,6 @@ > #ifndef VFIO_PCI_PRIVATE_H > #define VFIO_PCI_PRIVATE_H > > -#define VFIO_PCI_OFFSET_SHIFT 40 > - > -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) > -#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) > -#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) > - > /* Special capability IDs predefined access */ > #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ > #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ > diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c > index 5ffd1d9ad4bd..5b912be9d9c3 100644 > --- a/drivers/vfio/pci/vfio_pci_rdwr.c > +++ b/drivers/vfio/pci/vfio_pci_rdwr.c > @@ -18,6 +18,7 @@ > #include <linux/uaccess.h> > #include <linux/io.h> > #include <linux/vgaarb.h> > +#include <linux/vfio.h> > > #include "vfio_pci_private.h" > > diff --git a/include/linux/vfio.h b/include/linux/vfio.h > index 0ecae0b1cd34..431b824b0d3e 100644 > --- a/include/linux/vfio.h > +++ b/include/linux/vfio.h > @@ -18,6 +18,13 @@ > #include <linux/poll.h> > #include <uapi/linux/vfio.h> > > +#define VFIO_PCI_OFFSET_SHIFT 40 > + > +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) > +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) > +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) > + > + Nak this, I'm not interested in making this any sort of ABI. > /** > * struct vfio_device_ops - VFIO bus driver device callbacks > * -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/10/2016 12:30 AM, Alex Williamson wrote: > On Thu, 4 Aug 2016 00:33:52 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > ... >> + >> + switch (info.index) { >> + case VFIO_PCI_CONFIG_REGION_INDEX: >> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: >> + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); > > No, vmdev->vfio_region_info[info.index].offset > Ok. >> + info.size = vmdev->vfio_region_info[info.index].size; >> + if (!info.size) { >> + info.flags = 0; >> + break; >> + } >> + >> + info.flags = vmdev->vfio_region_info[info.index].flags; >> + break; >> + case VFIO_PCI_VGA_REGION_INDEX: >> + case VFIO_PCI_ROM_REGION_INDEX: > > Why? Let the vendor driver decide. > Ok. >> + switch (info.index) { >> + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: >> + case VFIO_PCI_REQ_IRQ_INDEX: >> + break; >> + /* pass thru to return error */ >> + case VFIO_PCI_MSIX_IRQ_INDEX: > > ??? Sorry, I missed to update this. Updating it. >> + case VFIO_DEVICE_SET_IRQS: >> + { ... >> + >> + if (parent->ops->set_irqs) >> + ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index, >> + hdr.start, hdr.count, data); >> + >> + kfree(ptr); >> + return ret; > > Return success if no set_irqs callback? > Ideally, vendor driver should provide this function. If vendor driver doesn't provide it, do we really need to fail here? >> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct vfio_mdev *vmdev = device_data; >> + struct mdev_device *mdev = vmdev->mdev; >> + struct parent_device *parent = mdev->parent; >> + int ret = 0; >> + >> + if (!count) >> + return 0; >> + >> + if (parent->ops->read) { >> + char *ret_data, *ptr; >> + >> + ptr = ret_data = kzalloc(count, GFP_KERNEL); > > Do we really need to support arbitrary lengths in one shot? Seems like > we could just use a 4 or 8 byte variable on the stack and iterate until > done. > We just want to pass the arguments to vendor driver as is here. Vendor driver could take care of that. >> + >> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct vfio_mdev *vmdev = device_data; >> + struct mdev_device *mdev = vmdev->mdev; >> + struct parent_device *parent = mdev->parent; >> + int ret = 0; >> + >> + if (!count) >> + return 0; >> + >> + if (parent->ops->write) { >> + char *usr_data, *ptr; >> + >> + ptr = usr_data = memdup_user(buf, count); > > Same here, how much do we care to let the user write in one pass and is > there any advantage to it? When QEMU is our userspace we're only > likely to see 4-byte accesses anyway. Same as above. >> + >> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) >> +{ ... >> + } else { >> + struct pci_dev *pdev; >> + >> + virtaddr = vma->vm_start; >> + req_size = vma->vm_end - vma->vm_start; >> + >> + pdev = to_pci_dev(parent->dev); >> + index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); > > Iterate through region_info[*].offset/size provided by vendor driver. > Yes, makes sense. >> + >> +int vfio_mpci_match(struct device *dev) >> +{ >> + if (dev_is_pci(dev->parent)) > > This is the wrong test, there's really no requirement that a pci mdev > device is hosted by a real pci device. Ideally this module is for the mediated device whose parent is PCI device. And we are relying on kernel functions like pci_resource_start(), to_pci_dev() in this module, so better to check it while loading. > Can't we check that the device > is on an mdev_pci_bus_type? > I didn't get this part. Each mediated device is of mdev_bus_type. But VFIO module could be different based on parent device type and loaded at the same time. For example, there should be different modules for channel IO or any other type of devices and could be loaded at the same time. Then when mdev device is created based on check in match() function of each module, and proper driver would be linked for that mdev device. If this check is not based on parent device type, do you expect to set parent device type by vendor driver and accordingly load corresponding VFIO driver? >> @@ -18,6 +18,7 @@ >> #include <linux/uaccess.h> >> #include <linux/io.h> >> #include <linux/vgaarb.h> >> +#include <linux/vfio.h> >> >> #include "vfio_pci_private.h" >> >> diff --git a/include/linux/vfio.h b/include/linux/vfio.h >> index 0ecae0b1cd34..431b824b0d3e 100644 >> --- a/include/linux/vfio.h >> +++ b/include/linux/vfio.h >> @@ -18,6 +18,13 @@ >> #include <linux/poll.h> >> #include <uapi/linux/vfio.h> >> >> +#define VFIO_PCI_OFFSET_SHIFT 40 >> + >> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) >> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) >> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) >> + >> + > > Nak this, I'm not interested in making this any sort of ABI. > These macros are used by drivers/vfio/pci/vfio_pci.c and drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules, they should be moved to common place as you suggested in earlier reviews. I think this is better common place. Are there any other suggestion? >> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability) >> +{ >> + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); > > This creates a fixed ABI between vfio-mdev-pci and vendor drivers that > a given region starts at a pre-defined offset. We have the offset > stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset, > use it. It's just as unacceptable to impose this fixed relationship > with a vendor driver here as if a userspace driver were to do the same. > In the v5 version, where config space was cached in this module, suggestion was to don't care about data or caching it at read/write, just pass it through. Now since VFIO_PCI_* macros are also available here, vendor driver can use it to decode pos to find region index and offset of access. Then vendor driver itself add vmdev->vfio_region_info[info.index].offset, which is known to him. Either we do this in VFIO module or vendor driver? Thanks, Kirti. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 11 Aug 2016 02:53:10 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 8/10/2016 12:30 AM, Alex Williamson wrote: > > On Thu, 4 Aug 2016 00:33:52 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > > ... > > >> + > >> + switch (info.index) { > >> + case VFIO_PCI_CONFIG_REGION_INDEX: > >> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > >> + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); > > > > No, vmdev->vfio_region_info[info.index].offset > > > > Ok. > > >> + info.size = vmdev->vfio_region_info[info.index].size; > >> + if (!info.size) { > >> + info.flags = 0; > >> + break; > >> + } > >> + > >> + info.flags = vmdev->vfio_region_info[info.index].flags; > >> + break; > >> + case VFIO_PCI_VGA_REGION_INDEX: > >> + case VFIO_PCI_ROM_REGION_INDEX: > > > > Why? Let the vendor driver decide. > > > > Ok. > > >> + switch (info.index) { > >> + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: > >> + case VFIO_PCI_REQ_IRQ_INDEX: > >> + break; > >> + /* pass thru to return error */ > >> + case VFIO_PCI_MSIX_IRQ_INDEX: > > > > ??? > > Sorry, I missed to update this. Updating it. > > >> + case VFIO_DEVICE_SET_IRQS: > >> + { > ... > >> + > >> + if (parent->ops->set_irqs) > >> + ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index, > >> + hdr.start, hdr.count, data); > >> + > >> + kfree(ptr); > >> + return ret; > > > > Return success if no set_irqs callback? > > > > Ideally, vendor driver should provide this function. If vendor driver > doesn't provide it, do we really need to fail here? Wouldn't you as a user expect to get an error if you try to call an ioctl that has no backing rather than assume success and never receive and interrupt? > >> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf, > >> + size_t count, loff_t *ppos) > >> +{ > >> + struct vfio_mdev *vmdev = device_data; > >> + struct mdev_device *mdev = vmdev->mdev; > >> + struct parent_device *parent = mdev->parent; > >> + int ret = 0; > >> + > >> + if (!count) > >> + return 0; > >> + > >> + if (parent->ops->read) { > >> + char *ret_data, *ptr; > >> + > >> + ptr = ret_data = kzalloc(count, GFP_KERNEL); > > > > Do we really need to support arbitrary lengths in one shot? Seems like > > we could just use a 4 or 8 byte variable on the stack and iterate until > > done. > > > > We just want to pass the arguments to vendor driver as is here. Vendor > driver could take care of that. But I think this is exploitable, it lets the user make the kernel allocate an arbitrarily sized buffer. > >> + > >> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf, > >> + size_t count, loff_t *ppos) > >> +{ > >> + struct vfio_mdev *vmdev = device_data; > >> + struct mdev_device *mdev = vmdev->mdev; > >> + struct parent_device *parent = mdev->parent; > >> + int ret = 0; > >> + > >> + if (!count) > >> + return 0; > >> + > >> + if (parent->ops->write) { > >> + char *usr_data, *ptr; > >> + > >> + ptr = usr_data = memdup_user(buf, count); > > > > Same here, how much do we care to let the user write in one pass and is > > there any advantage to it? When QEMU is our userspace we're only > > likely to see 4-byte accesses anyway. > > Same as above. > > >> + > >> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > >> +{ > ... > >> + } else { > >> + struct pci_dev *pdev; > >> + > >> + virtaddr = vma->vm_start; > >> + req_size = vma->vm_end - vma->vm_start; > >> + > >> + pdev = to_pci_dev(parent->dev); > >> + index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); > > > > Iterate through region_info[*].offset/size provided by vendor driver. > > > > Yes, makes sense. > > >> + > >> +int vfio_mpci_match(struct device *dev) > >> +{ > >> + if (dev_is_pci(dev->parent)) > > > > This is the wrong test, there's really no requirement that a pci mdev > > device is hosted by a real pci device. > > Ideally this module is for the mediated device whose parent is PCI > device. And we are relying on kernel functions like > pci_resource_start(), to_pci_dev() in this module, so better to check it > while loading. IMO, we don't want to care what the parent device is, it's not ideal, it's actually a limitation to impose that it is a PCI device. I want to be able to make purely virtual mediated devices. I only see that you use these functions in the mmio fault handling. Is it useful to assume that on mmio fault we map to the parent device PCI BAR regions? Just require that the vendor driver provides a fault mapping function or SIGBUS if we get a fault and it doesn't. > > Can't we check that the device > > is on an mdev_pci_bus_type? > > > > I didn't get this part. > > Each mediated device is of mdev_bus_type. But VFIO module could be > different based on parent device type and loaded at the same time. For > example, there should be different modules for channel IO or any other > type of devices and could be loaded at the same time. Then when mdev > device is created based on check in match() function of each module, and > proper driver would be linked for that mdev device. > > If this check is not based on parent device type, do you expect to set > parent device type by vendor driver and accordingly load corresponding > VFIO driver? mdev_pci_bus_type was an off the cuff response since the driver.bus controls which devices a probe function will see. If we have a unique bus for a driver and create devices appropriately, we really don't even need a match function. That would still work, but what if you made a get_device_info callback to the vendor driver rather than creating that info in the mediated bus driver layer. Then the probe function here could simply check the flags to see if the device is VFIO_DEVICE_FLAGS_PCI? > >> @@ -18,6 +18,7 @@ > >> #include <linux/uaccess.h> > >> #include <linux/io.h> > >> #include <linux/vgaarb.h> > >> +#include <linux/vfio.h> > >> > >> #include "vfio_pci_private.h" > >> > >> diff --git a/include/linux/vfio.h b/include/linux/vfio.h > >> index 0ecae0b1cd34..431b824b0d3e 100644 > >> --- a/include/linux/vfio.h > >> +++ b/include/linux/vfio.h > >> @@ -18,6 +18,13 @@ > >> #include <linux/poll.h> > >> #include <uapi/linux/vfio.h> > >> > >> +#define VFIO_PCI_OFFSET_SHIFT 40 > >> + > >> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) > >> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) > >> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) > >> + > >> + > > > > Nak this, I'm not interested in making this any sort of ABI. > > > > These macros are used by drivers/vfio/pci/vfio_pci.c and > drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules, > they should be moved to common place as you suggested in earlier > reviews. I think this is better common place. Are there any other > suggestion? They're only used in ways that I objected to above and you've agreed to. These define implementation details that must not become part of the mediated vendor driver ABI. A vendor driver is free to redefine this the same if they want, but as we can see with how easily they slip into code where they don't belong, the only way to make sure they don't become ABI is to keep them in private headers. > >> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability) > >> +{ > >> + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); > > > > This creates a fixed ABI between vfio-mdev-pci and vendor drivers that > > a given region starts at a pre-defined offset. We have the offset > > stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset, > > use it. It's just as unacceptable to impose this fixed relationship > > with a vendor driver here as if a userspace driver were to do the same. > > > > In the v5 version, where config space was cached in this module, > suggestion was to don't care about data or caching it at read/write, > just pass it through. Now since VFIO_PCI_* macros are also available > here, vendor driver can use it to decode pos to find region index and > offset of access. Then vendor driver itself add > vmdev->vfio_region_info[info.index].offset, which is known to him. > Either we do this in VFIO module or vendor driver? As I say above, a vendor driver is absolutely free to use the same index/offset scheme, but it absolutely must not be part of the ABI between vendor drivers and the mediated driver core. It's up to the vendor driver to define that relation and moving these to a common header is clearly too dangerous. I'm sorry if I've said otherwise in the past, but I've only recently discovered a userspace driver (DPDK) copying these defines and ignoring the index offsets reported through the REGION_INFO API. So I'm now bitterly aware how an internal implementation detail can be abused and if we don't catch them, it's going to lock us into an implementation that was designed to be flexible. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/11/2016 4:30 AM, Alex Williamson wrote: > On Thu, 11 Aug 2016 02:53:10 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> On 8/10/2016 12:30 AM, Alex Williamson wrote: >>> On Thu, 4 Aug 2016 00:33:52 +0530 >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: >>> >> >> ... >> >>>> + >>>> + switch (info.index) { >>>> + case VFIO_PCI_CONFIG_REGION_INDEX: >>>> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: >>>> + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); >>> >>> No, vmdev->vfio_region_info[info.index].offset >>> >> >> Ok. >> >>>> + info.size = vmdev->vfio_region_info[info.index].size; >>>> + if (!info.size) { >>>> + info.flags = 0; >>>> + break; >>>> + } >>>> + >>>> + info.flags = vmdev->vfio_region_info[info.index].flags; >>>> + break; >>>> + case VFIO_PCI_VGA_REGION_INDEX: >>>> + case VFIO_PCI_ROM_REGION_INDEX: >>> >>> Why? Let the vendor driver decide. >>> >> >> Ok. >> >>>> + switch (info.index) { >>>> + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: >>>> + case VFIO_PCI_REQ_IRQ_INDEX: >>>> + break; >>>> + /* pass thru to return error */ >>>> + case VFIO_PCI_MSIX_IRQ_INDEX: >>> >>> ??? >> >> Sorry, I missed to update this. Updating it. >> >>>> + case VFIO_DEVICE_SET_IRQS: >>>> + { >> ... >>>> + >>>> + if (parent->ops->set_irqs) >>>> + ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index, >>>> + hdr.start, hdr.count, data); >>>> + >>>> + kfree(ptr); >>>> + return ret; >>> >>> Return success if no set_irqs callback? >>> >> >> Ideally, vendor driver should provide this function. If vendor driver >> doesn't provide it, do we really need to fail here? > > Wouldn't you as a user expect to get an error if you try to call an > ioctl that has no backing rather than assume success and never receive > and interrupt? > If we really don't want to proceed if set_irqs() is not provided then its better to add it in mandatory list in mdev_register_device() in mdev_core.c and fail earlier, i.e. fail to register the device. >>>> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf, >>>> + size_t count, loff_t *ppos) >>>> +{ >>>> + struct vfio_mdev *vmdev = device_data; >>>> + struct mdev_device *mdev = vmdev->mdev; >>>> + struct parent_device *parent = mdev->parent; >>>> + int ret = 0; >>>> + >>>> + if (!count) >>>> + return 0; >>>> + >>>> + if (parent->ops->read) { >>>> + char *ret_data, *ptr; >>>> + >>>> + ptr = ret_data = kzalloc(count, GFP_KERNEL); >>> >>> Do we really need to support arbitrary lengths in one shot? Seems like >>> we could just use a 4 or 8 byte variable on the stack and iterate until >>> done. >>> >> >> We just want to pass the arguments to vendor driver as is here. Vendor >> driver could take care of that. > > But I think this is exploitable, it lets the user make the kernel > allocate an arbitrarily sized buffer. > >>>> + >>>> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf, >>>> + size_t count, loff_t *ppos) >>>> +{ >>>> + struct vfio_mdev *vmdev = device_data; >>>> + struct mdev_device *mdev = vmdev->mdev; >>>> + struct parent_device *parent = mdev->parent; >>>> + int ret = 0; >>>> + >>>> + if (!count) >>>> + return 0; >>>> + >>>> + if (parent->ops->write) { >>>> + char *usr_data, *ptr; >>>> + >>>> + ptr = usr_data = memdup_user(buf, count); >>> >>> Same here, how much do we care to let the user write in one pass and is >>> there any advantage to it? When QEMU is our userspace we're only >>> likely to see 4-byte accesses anyway. >> >> Same as above. >> >>>> + >>>> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) >>>> +{ >> ... >>>> + } else { >>>> + struct pci_dev *pdev; >>>> + >>>> + virtaddr = vma->vm_start; >>>> + req_size = vma->vm_end - vma->vm_start; >>>> + >>>> + pdev = to_pci_dev(parent->dev); >>>> + index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); >>> >>> Iterate through region_info[*].offset/size provided by vendor driver. >>> >> >> Yes, makes sense. >> >>>> + >>>> +int vfio_mpci_match(struct device *dev) >>>> +{ >>>> + if (dev_is_pci(dev->parent)) >>> >>> This is the wrong test, there's really no requirement that a pci mdev >>> device is hosted by a real pci device. >> >> Ideally this module is for the mediated device whose parent is PCI >> device. And we are relying on kernel functions like >> pci_resource_start(), to_pci_dev() in this module, so better to check it >> while loading. > > IMO, we don't want to care what the parent device is, it's not ideal, > it's actually a limitation to impose that it is a PCI device. I want to > be able to make purely virtual mediated devices. I only see that you > use these functions in the mmio fault handling. Is it useful to assume > that on mmio fault we map to the parent device PCI BAR regions? Just > require that the vendor driver provides a fault mapping function or > SIGBUS if we get a fault and it doesn't. > >>> Can't we check that the device >>> is on an mdev_pci_bus_type? >>> >> >> I didn't get this part. >> >> Each mediated device is of mdev_bus_type. But VFIO module could be >> different based on parent device type and loaded at the same time. For >> example, there should be different modules for channel IO or any other >> type of devices and could be loaded at the same time. Then when mdev >> device is created based on check in match() function of each module, and >> proper driver would be linked for that mdev device. >> >> If this check is not based on parent device type, do you expect to set >> parent device type by vendor driver and accordingly load corresponding >> VFIO driver? > > mdev_pci_bus_type was an off the cuff response since the driver.bus > controls which devices a probe function will see. If we have a unique > bus for a driver and create devices appropriately, we really don't > even need a match function. I still think that all types of mdev devices should have unique bus type so that VFIO IOMMU module could be used for any type of mediated device without any change. Otherwise we have to add checks for all supported bus types in vfio_iommu_type1_attach_group(). > That would still work, but what if you > made a get_device_info callback to the vendor driver rather than > creating that info in the mediated bus driver layer. Then the probe > function here could simply check the flags to see if the device is > VFIO_DEVICE_FLAGS_PCI? > Right. get_device_info() would be a mandatory callback and it would be vendor driver's responsibility to return proper flag. >>>> @@ -18,6 +18,7 @@ >>>> #include <linux/uaccess.h> >>>> #include <linux/io.h> >>>> #include <linux/vgaarb.h> >>>> +#include <linux/vfio.h> >>>> >>>> #include "vfio_pci_private.h" >>>> >>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h >>>> index 0ecae0b1cd34..431b824b0d3e 100644 >>>> --- a/include/linux/vfio.h >>>> +++ b/include/linux/vfio.h >>>> @@ -18,6 +18,13 @@ >>>> #include <linux/poll.h> >>>> #include <uapi/linux/vfio.h> >>>> >>>> +#define VFIO_PCI_OFFSET_SHIFT 40 >>>> + >>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) >>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) >>>> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) >>>> + >>>> + >>> >>> Nak this, I'm not interested in making this any sort of ABI. >>> >> >> These macros are used by drivers/vfio/pci/vfio_pci.c and >> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules, >> they should be moved to common place as you suggested in earlier >> reviews. I think this is better common place. Are there any other >> suggestion? > > They're only used in ways that I objected to above and you've agreed > to. These define implementation details that must not become part of > the mediated vendor driver ABI. A vendor driver is free to redefine > this the same if they want, but as we can see with how easily they slip > into code where they don't belong, the only way to make sure they don't > become ABI is to keep them in private headers. > Then I think, I can't use these macros in mdev modules, they are defined in drivers/vfio/pci/vfio_pci_private.h I have to define similar macros in drivers/vfio/mdev/mdev_private.h? parent->ops->get_region_info() is called from vfio_mpci_open() that is before PCI config space is setup. Main expectation from get_region_info() was to get flags and size. At this point of time vendor driver also don't know about the base addresses of regions. case VFIO_DEVICE_GET_REGION_INFO: ... info.offset = vmdev->vfio_region_info[info.index].offset; In that case, as suggested in previous reply, above is not going to work. I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above offset according to these macros. Then on first access to any BAR region, i.e. after PCI config space is populated, call parent->ops->get_region_info() again so that vfio_region_info[index].offset for all regions are set by vendor driver. Then use these offsets to calculate 'pos' for read/write/validate_map_request(). Does this seems reasonable? Thanks, Kirti >>>> +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability) >>>> +{ >>>> + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); >>> >>> This creates a fixed ABI between vfio-mdev-pci and vendor drivers that >>> a given region starts at a pre-defined offset. We have the offset >>> stored in vfio_mdev.region_info[VFIO_PCI_CONFIG_REGION_INDEX].offset, >>> use it. It's just as unacceptable to impose this fixed relationship >>> with a vendor driver here as if a userspace driver were to do the same. >>> >> >> In the v5 version, where config space was cached in this module, >> suggestion was to don't care about data or caching it at read/write, >> just pass it through. Now since VFIO_PCI_* macros are also available >> here, vendor driver can use it to decode pos to find region index and >> offset of access. Then vendor driver itself add >> vmdev->vfio_region_info[info.index].offset, which is known to him. >> Either we do this in VFIO module or vendor driver? > > As I say above, a vendor driver is absolutely free to use the same > index/offset scheme, but it absolutely must not be part of the ABI > between vendor drivers and the mediated driver core. It's up to the > vendor driver to define that relation and moving these to a common > header is clearly too dangerous. I'm sorry if I've said otherwise in > the past, but I've only recently discovered a userspace driver (DPDK) > copying these defines and ignoring the index offsets reported through > the REGION_INFO API. So I'm now bitterly aware how an internal > implementation detail can be abused and if we don't catch them, it's > going to lock us into an implementation that was designed to be > flexible. Thanks, > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 11 Aug 2016 21:29:35 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 8/11/2016 4:30 AM, Alex Williamson wrote: > > On Thu, 11 Aug 2016 02:53:10 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > >> On 8/10/2016 12:30 AM, Alex Williamson wrote: > >>> On Thu, 4 Aug 2016 00:33:52 +0530 > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: > >>> > >> > >> ... > >> > >>>> + > >>>> + switch (info.index) { > >>>> + case VFIO_PCI_CONFIG_REGION_INDEX: > >>>> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: > >>>> + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); > >>> > >>> No, vmdev->vfio_region_info[info.index].offset > >>> > >> > >> Ok. > >> > >>>> + info.size = vmdev->vfio_region_info[info.index].size; > >>>> + if (!info.size) { > >>>> + info.flags = 0; > >>>> + break; > >>>> + } > >>>> + > >>>> + info.flags = vmdev->vfio_region_info[info.index].flags; > >>>> + break; > >>>> + case VFIO_PCI_VGA_REGION_INDEX: > >>>> + case VFIO_PCI_ROM_REGION_INDEX: > >>> > >>> Why? Let the vendor driver decide. > >>> > >> > >> Ok. > >> > >>>> + switch (info.index) { > >>>> + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: > >>>> + case VFIO_PCI_REQ_IRQ_INDEX: > >>>> + break; > >>>> + /* pass thru to return error */ > >>>> + case VFIO_PCI_MSIX_IRQ_INDEX: > >>> > >>> ??? > >> > >> Sorry, I missed to update this. Updating it. > >> > >>>> + case VFIO_DEVICE_SET_IRQS: > >>>> + { > >> ... > >>>> + > >>>> + if (parent->ops->set_irqs) > >>>> + ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index, > >>>> + hdr.start, hdr.count, data); > >>>> + > >>>> + kfree(ptr); > >>>> + return ret; > >>> > >>> Return success if no set_irqs callback? > >>> > >> > >> Ideally, vendor driver should provide this function. If vendor driver > >> doesn't provide it, do we really need to fail here? > > > > Wouldn't you as a user expect to get an error if you try to call an > > ioctl that has no backing rather than assume success and never receive > > and interrupt? > > > > If we really don't want to proceed if set_irqs() is not provided then > its better to add it in mandatory list in mdev_register_device() in > mdev_core.c and fail earlier, i.e. fail to register the device. Is a device required to implement some form of interrupt to be useful? What if there's a memory-only device that does not report INTx or provide MSI or MSI-X capabilities? It could still be PCI spec complaint. Really though it's just a matter of whether we're going to require the mediated driver to provide a set_irqs() stub or let them skip it and return error ourselves. Either is really fine with me, but we can't return success for an ioctl that has no backing. > >>>> +static ssize_t vfio_mpci_read(void *device_data, char __user *buf, > >>>> + size_t count, loff_t *ppos) > >>>> +{ > >>>> + struct vfio_mdev *vmdev = device_data; > >>>> + struct mdev_device *mdev = vmdev->mdev; > >>>> + struct parent_device *parent = mdev->parent; > >>>> + int ret = 0; > >>>> + > >>>> + if (!count) > >>>> + return 0; > >>>> + > >>>> + if (parent->ops->read) { > >>>> + char *ret_data, *ptr; > >>>> + > >>>> + ptr = ret_data = kzalloc(count, GFP_KERNEL); > >>> > >>> Do we really need to support arbitrary lengths in one shot? Seems like > >>> we could just use a 4 or 8 byte variable on the stack and iterate until > >>> done. > >>> > >> > >> We just want to pass the arguments to vendor driver as is here. Vendor > >> driver could take care of that. > > > > But I think this is exploitable, it lets the user make the kernel > > allocate an arbitrarily sized buffer. > > > >>>> + > >>>> +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf, > >>>> + size_t count, loff_t *ppos) > >>>> +{ > >>>> + struct vfio_mdev *vmdev = device_data; > >>>> + struct mdev_device *mdev = vmdev->mdev; > >>>> + struct parent_device *parent = mdev->parent; > >>>> + int ret = 0; > >>>> + > >>>> + if (!count) > >>>> + return 0; > >>>> + > >>>> + if (parent->ops->write) { > >>>> + char *usr_data, *ptr; > >>>> + > >>>> + ptr = usr_data = memdup_user(buf, count); > >>> > >>> Same here, how much do we care to let the user write in one pass and is > >>> there any advantage to it? When QEMU is our userspace we're only > >>> likely to see 4-byte accesses anyway. > >> > >> Same as above. > >> > >>>> + > >>>> +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > >>>> +{ > >> ... > >>>> + } else { > >>>> + struct pci_dev *pdev; > >>>> + > >>>> + virtaddr = vma->vm_start; > >>>> + req_size = vma->vm_end - vma->vm_start; > >>>> + > >>>> + pdev = to_pci_dev(parent->dev); > >>>> + index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); > >>> > >>> Iterate through region_info[*].offset/size provided by vendor driver. > >>> > >> > >> Yes, makes sense. > >> > >>>> + > >>>> +int vfio_mpci_match(struct device *dev) > >>>> +{ > >>>> + if (dev_is_pci(dev->parent)) > >>> > >>> This is the wrong test, there's really no requirement that a pci mdev > >>> device is hosted by a real pci device. > >> > >> Ideally this module is for the mediated device whose parent is PCI > >> device. And we are relying on kernel functions like > >> pci_resource_start(), to_pci_dev() in this module, so better to check it > >> while loading. > > > > IMO, we don't want to care what the parent device is, it's not ideal, > > it's actually a limitation to impose that it is a PCI device. I want to > > be able to make purely virtual mediated devices. I only see that you > > use these functions in the mmio fault handling. Is it useful to assume > > that on mmio fault we map to the parent device PCI BAR regions? Just > > require that the vendor driver provides a fault mapping function or > > SIGBUS if we get a fault and it doesn't. > > > >>> Can't we check that the device > >>> is on an mdev_pci_bus_type? > >>> > >> > >> I didn't get this part. > >> > >> Each mediated device is of mdev_bus_type. But VFIO module could be > >> different based on parent device type and loaded at the same time. For > >> example, there should be different modules for channel IO or any other > >> type of devices and could be loaded at the same time. Then when mdev > >> device is created based on check in match() function of each module, and > >> proper driver would be linked for that mdev device. > >> > >> If this check is not based on parent device type, do you expect to set > >> parent device type by vendor driver and accordingly load corresponding > >> VFIO driver? > > > > mdev_pci_bus_type was an off the cuff response since the driver.bus > > controls which devices a probe function will see. If we have a unique > > bus for a driver and create devices appropriately, we really don't > > even need a match function. > > I still think that all types of mdev devices should have unique bus type > so that VFIO IOMMU module could be used for any type of mediated device > without any change. Otherwise we have to add checks for all supported > bus types in vfio_iommu_type1_attach_group(). Good point, so perhaps the vendor driver reporting the type through vfio_device_info.flags is the way to go. > > That would still work, but what if you > > made a get_device_info callback to the vendor driver rather than > > creating that info in the mediated bus driver layer. Then the probe > > function here could simply check the flags to see if the device is > > VFIO_DEVICE_FLAGS_PCI? > > > > Right. get_device_info() would be a mandatory callback and it would be > vendor driver's responsibility to return proper flag. Yep, then we don't care what the parent device is, the flags will tell us that the mediated device adheres to PCI and that's all we care about for binding here. > >>>> @@ -18,6 +18,7 @@ > >>>> #include <linux/uaccess.h> > >>>> #include <linux/io.h> > >>>> #include <linux/vgaarb.h> > >>>> +#include <linux/vfio.h> > >>>> > >>>> #include "vfio_pci_private.h" > >>>> > >>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h > >>>> index 0ecae0b1cd34..431b824b0d3e 100644 > >>>> --- a/include/linux/vfio.h > >>>> +++ b/include/linux/vfio.h > >>>> @@ -18,6 +18,13 @@ > >>>> #include <linux/poll.h> > >>>> #include <uapi/linux/vfio.h> > >>>> > >>>> +#define VFIO_PCI_OFFSET_SHIFT 40 > >>>> + > >>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) > >>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) > >>>> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) > >>>> + > >>>> + > >>> > >>> Nak this, I'm not interested in making this any sort of ABI. > >>> > >> > >> These macros are used by drivers/vfio/pci/vfio_pci.c and > >> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules, > >> they should be moved to common place as you suggested in earlier > >> reviews. I think this is better common place. Are there any other > >> suggestion? > > > > They're only used in ways that I objected to above and you've agreed > > to. These define implementation details that must not become part of > > the mediated vendor driver ABI. A vendor driver is free to redefine > > this the same if they want, but as we can see with how easily they slip > > into code where they don't belong, the only way to make sure they don't > > become ABI is to keep them in private headers. > > > > Then I think, I can't use these macros in mdev modules, they are defined > in drivers/vfio/pci/vfio_pci_private.h > I have to define similar macros in drivers/vfio/mdev/mdev_private.h? > > parent->ops->get_region_info() is called from vfio_mpci_open() that is > before PCI config space is setup. Main expectation from > get_region_info() was to get flags and size. At this point of time > vendor driver also don't know about the base addresses of regions. > > case VFIO_DEVICE_GET_REGION_INFO: > ... > > info.offset = vmdev->vfio_region_info[info.index].offset; > > In that case, as suggested in previous reply, above is not going to work. > I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above > offset according to these macros. Then on first access to any BAR > region, i.e. after PCI config space is populated, call > parent->ops->get_region_info() again so that > vfio_region_info[index].offset for all regions are set by vendor driver. > Then use these offsets to calculate 'pos' for > read/write/validate_map_request(). Does this seems reasonable? This doesn't make any sense to me, there should be absolutely no reason for the mid-layer mediated device infrastructure to impose region offsets. vfio-pci is a leaf driver, like the mediated vendor driver. Only the leaf drivers can define how they layout the offsets within the device file descriptor. Being a VFIO_PCI device only defines region indexes to resources, not offsets (ie. region 0 is BAR0, region 1 is BAR1,... region 7 is PCI config space). If this mid-layer even needs to know region offsets, then caching them on opening the vendor device is certainly sufficient. Remember we're talking about the offset into the vfio device file descriptor, how that potentially maps onto a physical MMIO space later doesn't matter here. It seems like maybe we're confusing those points. Anyway, the more I hear about needing to reproduce these INDEX/OFFSET translation macros in places they shouldn't be used, the more confident I am in keeping them private. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/11/2016 9:54 PM, Alex Williamson wrote: > On Thu, 11 Aug 2016 21:29:35 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> On 8/11/2016 4:30 AM, Alex Williamson wrote: >>> On Thu, 11 Aug 2016 02:53:10 +0530 >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: >>> >>>> On 8/10/2016 12:30 AM, Alex Williamson wrote: >>>>> On Thu, 4 Aug 2016 00:33:52 +0530 >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote: >>>>> >>>> >>>> ... >>>>>> #include "vfio_pci_private.h" >>>>>> >>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h >>>>>> index 0ecae0b1cd34..431b824b0d3e 100644 >>>>>> --- a/include/linux/vfio.h >>>>>> +++ b/include/linux/vfio.h >>>>>> @@ -18,6 +18,13 @@ >>>>>> #include <linux/poll.h> >>>>>> #include <uapi/linux/vfio.h> >>>>>> >>>>>> +#define VFIO_PCI_OFFSET_SHIFT 40 >>>>>> + >>>>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) >>>>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) >>>>>> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) >>>>>> + >>>>>> + >>>>> >>>>> Nak this, I'm not interested in making this any sort of ABI. >>>>> >>>> >>>> These macros are used by drivers/vfio/pci/vfio_pci.c and >>>> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules, >>>> they should be moved to common place as you suggested in earlier >>>> reviews. I think this is better common place. Are there any other >>>> suggestion? >>> >>> They're only used in ways that I objected to above and you've agreed >>> to. These define implementation details that must not become part of >>> the mediated vendor driver ABI. A vendor driver is free to redefine >>> this the same if they want, but as we can see with how easily they slip >>> into code where they don't belong, the only way to make sure they don't >>> become ABI is to keep them in private headers. >>> >> >> Then I think, I can't use these macros in mdev modules, they are defined >> in drivers/vfio/pci/vfio_pci_private.h >> I have to define similar macros in drivers/vfio/mdev/mdev_private.h? >> >> parent->ops->get_region_info() is called from vfio_mpci_open() that is >> before PCI config space is setup. Main expectation from >> get_region_info() was to get flags and size. At this point of time >> vendor driver also don't know about the base addresses of regions. >> >> case VFIO_DEVICE_GET_REGION_INFO: >> ... >> >> info.offset = vmdev->vfio_region_info[info.index].offset; >> >> In that case, as suggested in previous reply, above is not going to work. >> I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above >> offset according to these macros. Then on first access to any BAR >> region, i.e. after PCI config space is populated, call >> parent->ops->get_region_info() again so that >> vfio_region_info[index].offset for all regions are set by vendor driver. >> Then use these offsets to calculate 'pos' for >> read/write/validate_map_request(). Does this seems reasonable? > > This doesn't make any sense to me, there should be absolutely no reason > for the mid-layer mediated device infrastructure to impose region > offsets. vfio-pci is a leaf driver, like the mediated vendor driver. > Only the leaf drivers can define how they layout the offsets within the > device file descriptor. Being a VFIO_PCI device only defines region > indexes to resources, not offsets (ie. region 0 is BAR0, region 1 is > BAR1,... region 7 is PCI config space). If this mid-layer even needs > to know region offsets, then caching them on opening the vendor device > is certainly sufficient. Remember we're talking about the offset into > the vfio device file descriptor, how that potentially maps onto a > physical MMIO space later doesn't matter here. It seems like maybe > we're confusing those points. Anyway, the more I hear about needing to > reproduce these INDEX/OFFSET translation macros in places they > shouldn't be used, the more confident I am in keeping them private. If vendor driver defines the offsets into vfio device file descriptor, it will be vendor drivers responsibility that the ranges defined (offset to offset + size) are not overlapping with other regions ranges. There will be no validation in vfio-mpci, right? In current implementation there is a provision that if validate_map_request() callback is not provided, map it to physical device's region and start of physical device's BAR address is queried using pci_resource_start(). Since with the above change that you are proposing, index could not be extracted from offset. Then if vendor driver doesn't provide validate_map_request(), return SIGBUS from fault handler. So that impose indirect requirement that if vendor driver sets VFIO_REGION_INFO_FLAG_MMAP for any region, they should provide validate_map_request(). Thanks, Kirti. > Thanks, > > Alex > ----------------------------------------------------------------------------------- This email message is for the sole use of the intended recipient(s) and may contain confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message. ----------------------------------------------------------------------------------- -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 11 Aug 2016 23:16:06 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 8/11/2016 9:54 PM, Alex Williamson wrote: > > On Thu, 11 Aug 2016 21:29:35 +0530 > > Kirti Wankhede <kwankhede@nvidia.com> wrote: > > > >> On 8/11/2016 4:30 AM, Alex Williamson wrote: > >>> On Thu, 11 Aug 2016 02:53:10 +0530 > >>> Kirti Wankhede <kwankhede@nvidia.com> wrote: > >>> > >>>> On 8/10/2016 12:30 AM, Alex Williamson wrote: > >>>>> On Thu, 4 Aug 2016 00:33:52 +0530 > >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote: > >>>>> > >>>> > >>>> ... > >>>>>> #include "vfio_pci_private.h" > >>>>>> > >>>>>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h > >>>>>> index 0ecae0b1cd34..431b824b0d3e 100644 > >>>>>> --- a/include/linux/vfio.h > >>>>>> +++ b/include/linux/vfio.h > >>>>>> @@ -18,6 +18,13 @@ > >>>>>> #include <linux/poll.h> > >>>>>> #include <uapi/linux/vfio.h> > >>>>>> > >>>>>> +#define VFIO_PCI_OFFSET_SHIFT 40 > >>>>>> + > >>>>>> +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) > >>>>>> +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) > >>>>>> +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) > >>>>>> + > >>>>>> + > >>>>> > >>>>> Nak this, I'm not interested in making this any sort of ABI. > >>>>> > >>>> > >>>> These macros are used by drivers/vfio/pci/vfio_pci.c and > >>>> drivers/vfio/mdev/vfio_mpci.c and to use those in both these modules, > >>>> they should be moved to common place as you suggested in earlier > >>>> reviews. I think this is better common place. Are there any other > >>>> suggestion? > >>> > >>> They're only used in ways that I objected to above and you've agreed > >>> to. These define implementation details that must not become part of > >>> the mediated vendor driver ABI. A vendor driver is free to redefine > >>> this the same if they want, but as we can see with how easily they slip > >>> into code where they don't belong, the only way to make sure they don't > >>> become ABI is to keep them in private headers. > >>> > >> > >> Then I think, I can't use these macros in mdev modules, they are defined > >> in drivers/vfio/pci/vfio_pci_private.h > >> I have to define similar macros in drivers/vfio/mdev/mdev_private.h? > >> > >> parent->ops->get_region_info() is called from vfio_mpci_open() that is > >> before PCI config space is setup. Main expectation from > >> get_region_info() was to get flags and size. At this point of time > >> vendor driver also don't know about the base addresses of regions. > >> > >> case VFIO_DEVICE_GET_REGION_INFO: > >> ... > >> > >> info.offset = vmdev->vfio_region_info[info.index].offset; > >> > >> In that case, as suggested in previous reply, above is not going to work. > >> I'll define such macros in drivers/vfio/mdev/mdev_private.h, set above > >> offset according to these macros. Then on first access to any BAR > >> region, i.e. after PCI config space is populated, call > >> parent->ops->get_region_info() again so that > >> vfio_region_info[index].offset for all regions are set by vendor driver. > >> Then use these offsets to calculate 'pos' for > >> read/write/validate_map_request(). Does this seems reasonable? > > > > This doesn't make any sense to me, there should be absolutely no reason > > for the mid-layer mediated device infrastructure to impose region > > offsets. vfio-pci is a leaf driver, like the mediated vendor driver. > > Only the leaf drivers can define how they layout the offsets within the > > device file descriptor. Being a VFIO_PCI device only defines region > > indexes to resources, not offsets (ie. region 0 is BAR0, region 1 is > > BAR1,... region 7 is PCI config space). If this mid-layer even needs > > to know region offsets, then caching them on opening the vendor device > > is certainly sufficient. Remember we're talking about the offset into > > the vfio device file descriptor, how that potentially maps onto a > > physical MMIO space later doesn't matter here. It seems like maybe > > we're confusing those points. Anyway, the more I hear about needing to > > reproduce these INDEX/OFFSET translation macros in places they > > shouldn't be used, the more confident I am in keeping them private. > > If vendor driver defines the offsets into vfio device file descriptor, > it will be vendor drivers responsibility that the ranges defined (offset > to offset + size) are not overlapping with other regions ranges. There > will be no validation in vfio-mpci, right? Right, this seems like a pretty basic requirement of the vendor driver to offer region ranges that do not overlap and there's plenty else about the vendor driver that the mid-layer can't validate its behavior... > In current implementation there is a provision that if > validate_map_request() callback is not provided, map it to physical > device's region and start of physical device's BAR address is queried > using pci_resource_start(). Since with the above change that you are > proposing, index could not be extracted from offset. Then if vendor > driver doesn't provide validate_map_request(), return SIGBUS from fault > handler. > So that impose indirect requirement that if vendor driver sets > VFIO_REGION_INFO_FLAG_MMAP for any region, they should provide > validate_map_request(). TBH, I don't see how providing a default implementation of validate_map_request() is useful. How many mediated devices are going to want to identity map resources from the parent? Even if they do, it seems we can only support a single mediated device per parent device since each will map the same parent resource offset. Let's not even try to define a default. If we get a fault and the vendor driver hasn't provided a handler, send a SIGBUS. I expect we should also allow vendor drivers to fill the mapping at mmap() time rather than expecting this map on fault scheme. Maybe the mid-level driver should not even be interacting with mmap() and should let the vendor driver entirely determine the handling. For the most part these mid-level drivers, like mediated pci, should be as thin as possible, and to some extent I wonder if we need them at all. We mostly want user interaction with the vfio device file descriptor to pass directly to the vendor driver and we should only be adding logic to the mid-level driver when it actually provides some useful and generic simplification to the vendor driver. Things like this default fault handling scheme don't appear to be generic at all, it's actually a very unique use case I think. For the most part I think the mediated interface is just a shim to standardize the lifecycle of a mediated device for management purposes, integrate "fake/virtual" devices into the vfio infrastructure, provide common page tracking, pinning and mapping services, but the device interface itself should mostly just pass through the vfio device API straight through to the vendor driver. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/12/2016 12:13 AM, Alex Williamson wrote: > > TBH, I don't see how providing a default implementation of > validate_map_request() is useful. How many mediated devices are going > to want to identity map resources from the parent? Even if they do, it > seems we can only support a single mediated device per parent device > since each will map the same parent resource offset. Let's not even try > to define a default. If we get a fault and the vendor driver hasn't > provided a handler, send a SIGBUS. I expect we should also allow > vendor drivers to fill the mapping at mmap() time rather than expecting > this map on fault scheme. Maybe the mid-level driver should not even be > interacting with mmap() and should let the vendor driver entirely > determine the handling. > Should we go ahead with pass through mmap() call to vendor driver and let vendor driver decide what to do in mmap() call, either remap_pfn_range in mmap() or do fault on access and handle the fault in their driver. In that case we don't need to track mappings in mdev core. Let vendor driver do that on their own, right? > For the most part these mid-level drivers, like mediated pci, should be > as thin as possible, and to some extent I wonder if we need them at > all. We mostly want user interaction with the vfio device file > descriptor to pass directly to the vendor driver and we should only be > adding logic to the mid-level driver when it actually provides some > useful and generic simplification to the vendor driver. Things like > this default fault handling scheme don't appear to be generic at all, > it's actually a very unique use case I think. For the most part > I think the mediated interface is just a shim to standardize the > lifecycle of a mediated device for management purposes, > integrate "fake/virtual" devices into the vfio infrastructure, > provide common page tracking, pinning and mapping services, but > the device interface itself should mostly just pass through the > vfio device API straight through to the vendor driver. Thanks, > > Alex > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 12 Aug 2016 23:27:01 +0530 Kirti Wankhede <kwankhede@nvidia.com> wrote: > On 8/12/2016 12:13 AM, Alex Williamson wrote: > > > > > TBH, I don't see how providing a default implementation of > > validate_map_request() is useful. How many mediated devices are going > > to want to identity map resources from the parent? Even if they do, it > > seems we can only support a single mediated device per parent device > > since each will map the same parent resource offset. Let's not even try > > to define a default. If we get a fault and the vendor driver hasn't > > provided a handler, send a SIGBUS. I expect we should also allow > > vendor drivers to fill the mapping at mmap() time rather than expecting > > this map on fault scheme. Maybe the mid-level driver should not even be > > interacting with mmap() and should let the vendor driver entirely > > determine the handling. > > > > Should we go ahead with pass through mmap() call to vendor driver and > let vendor driver decide what to do in mmap() call, either > remap_pfn_range in mmap() or do fault on access and handle the fault in > their driver. In that case we don't need to track mappings in mdev core. > Let vendor driver do that on their own, right? This sounds right to me, I don't think we want to impose either model on the vendor driver. The vendor driver owns the vfio device file descriptor and is responsible for managing it should they expose mmap support for regions on the file descriptor. They either need to insert mappings at the point where mmap() is called or setup fault handlers to insert them on demand. If we can provide helper functions so that each vendor driver doesn't need to re-invent either of those, that would be a bonus. Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 8/13/2016 2:55 AM, Alex Williamson wrote: > On Fri, 12 Aug 2016 23:27:01 +0530 > Kirti Wankhede <kwankhede@nvidia.com> wrote: > >> On 8/12/2016 12:13 AM, Alex Williamson wrote: >> >>> >>> TBH, I don't see how providing a default implementation of >>> validate_map_request() is useful. How many mediated devices are going >>> to want to identity map resources from the parent? Even if they do, it >>> seems we can only support a single mediated device per parent device >>> since each will map the same parent resource offset. Let's not even try >>> to define a default. If we get a fault and the vendor driver hasn't >>> provided a handler, send a SIGBUS. I expect we should also allow >>> vendor drivers to fill the mapping at mmap() time rather than expecting >>> this map on fault scheme. Maybe the mid-level driver should not even be >>> interacting with mmap() and should let the vendor driver entirely >>> determine the handling. >>> >> >> Should we go ahead with pass through mmap() call to vendor driver and >> let vendor driver decide what to do in mmap() call, either >> remap_pfn_range in mmap() or do fault on access and handle the fault in >> their driver. In that case we don't need to track mappings in mdev core. >> Let vendor driver do that on their own, right? > > This sounds right to me, I don't think we want to impose either model > on the vendor driver. The vendor driver owns the vfio device file > descriptor and is responsible for managing it should they expose mmap > support for regions on the file descriptor. They either need to insert > mappings at the point where mmap() is called or setup fault handlers to > insert them on demand. If we can provide helper functions so that each > vendor driver doesn't need to re-invent either of those, that would be > a bonus. Thanks, > Since mmap() is going to be handled in vendor driver, let vendor driver do their own tracking logic of mappings based on which way they decide to go. No need to keep it in mdev coer module and try to handle all the cases in one function. Thanks, Kirti > Alex > ----------------------------------------------------------------------------------- This email message is for the sole use of the intended recipient(s) and may contain confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message. ----------------------------------------------------------------------------------- -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index a34fbc66f92f..431ed595c8da 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -9,4 +9,10 @@ config VFIO_MDEV If you don't know what do here, say N. +config VFIO_MPCI + tristate "VFIO support for Mediated PCI devices" + depends on VFIO && PCI && VFIO_MDEV + default n + help + VFIO based driver for mediated PCI devices. diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile index 56a75e689582..264fb03dd0e3 100644 --- a/drivers/vfio/mdev/Makefile +++ b/drivers/vfio/mdev/Makefile @@ -2,4 +2,5 @@ mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o obj-$(CONFIG_VFIO_MDEV) += mdev.o +obj-$(CONFIG_VFIO_MPCI) += vfio_mpci.o diff --git a/drivers/vfio/mdev/vfio_mpci.c b/drivers/vfio/mdev/vfio_mpci.c new file mode 100644 index 000000000000..9da94b76ae3e --- /dev/null +++ b/drivers/vfio/mdev/vfio_mpci.c @@ -0,0 +1,536 @@ +/* + * VFIO based Mediated PCI device driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "VFIO based Mediated PCI device driver" + +struct vfio_mdev { + struct iommu_group *group; + struct mdev_device *mdev; + int refcnt; + struct vfio_region_info vfio_region_info[VFIO_PCI_NUM_REGIONS]; + struct mutex vfio_mdev_lock; +}; + +static int vfio_mpci_open(void *device_data) +{ + int ret = 0; + struct vfio_mdev *vmdev = device_data; + struct parent_device *parent = vmdev->mdev->parent; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&vmdev->vfio_mdev_lock); + if (!vmdev->refcnt && parent->ops->get_region_info) { + int index; + + for (index = VFIO_PCI_BAR0_REGION_INDEX; + index < VFIO_PCI_NUM_REGIONS; index++) { + ret = parent->ops->get_region_info(vmdev->mdev, index, + &vmdev->vfio_region_info[index]); + if (ret) + goto open_error; + } + } + + vmdev->refcnt++; + +open_error: + mutex_unlock(&vmdev->vfio_mdev_lock); + if (ret) + module_put(THIS_MODULE); + + return ret; +} + +static void vfio_mpci_close(void *device_data) +{ + struct vfio_mdev *vmdev = device_data; + + mutex_lock(&vmdev->vfio_mdev_lock); + vmdev->refcnt--; + if (!vmdev->refcnt) { + memset(&vmdev->vfio_region_info, 0, + sizeof(vmdev->vfio_region_info)); + } + mutex_unlock(&vmdev->vfio_mdev_lock); + module_put(THIS_MODULE); +} + +static u8 mpci_find_pci_capability(struct mdev_device *mdev, u8 capability) +{ + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); + struct parent_device *parent = mdev->parent; + u16 status; + u8 cap_ptr, cap_id = 0xff; + + parent->ops->read(mdev, (char *)&status, sizeof(status), + pos + PCI_STATUS); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr), + pos + PCI_CAPABILITY_LIST); + + do { + cap_ptr &= 0xfc; + parent->ops->read(mdev, &cap_id, sizeof(cap_id), + pos + cap_ptr + PCI_CAP_LIST_ID); + if (cap_id == capability) + return cap_ptr; + parent->ops->read(mdev, &cap_ptr, sizeof(cap_ptr), + pos + cap_ptr + PCI_CAP_LIST_NEXT); + } while (cap_ptr && cap_id != 0xff); + + return 0; +} + +static int mpci_get_irq_count(struct vfio_mdev *vmdev, int irq_type) +{ + loff_t pos = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_CONFIG_REGION_INDEX); + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { + u8 pin; + + parent->ops->read(mdev, &pin, sizeof(pin), + pos + PCI_INTERRUPT_PIN); + if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin) + return 1; + + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { + u8 cap_ptr; + u16 flags; + + cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSI); + if (cap_ptr) { + parent->ops->read(mdev, (char *)&flags, sizeof(flags), + pos + cap_ptr + PCI_MSI_FLAGS); + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); + } + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { + u8 cap_ptr; + u16 flags; + + cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_MSIX); + if (cap_ptr) { + parent->ops->read(mdev, (char *)&flags, sizeof(flags), + pos + cap_ptr + PCI_MSIX_FLAGS); + + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; + } + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + u8 cap_ptr; + + cap_ptr = mpci_find_pci_capability(mdev, PCI_CAP_ID_EXP); + if (cap_ptr) + return 1; + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { + return 1; + } + + return 0; +} + +static long vfio_mpci_unlocked_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + int ret = 0; + struct vfio_mdev *vmdev = device_data; + unsigned long minsz; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + struct parent_device *parent = vmdev->mdev->parent; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_PCI; + + if (parent->ops->reset) + info.flags |= VFIO_DEVICE_FLAGS_RESET; + + info.num_regions = VFIO_PCI_NUM_REGIONS; + info.num_irqs = VFIO_PCI_NUM_IRQS; + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vmdev->vfio_region_info[info.index].size; + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = vmdev->vfio_region_info[info.index].flags; + break; + case VFIO_PCI_VGA_REGION_INDEX: + case VFIO_PCI_ROM_REGION_INDEX: + default: + return -EINVAL; + } + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + /* pass thru to return error */ + case VFIO_PCI_MSIX_IRQ_INDEX: + default: + return -EINVAL; + } + + info.flags = VFIO_IRQ_INFO_EVENTFD; + info.count = mpci_get_irq_count(vmdev, info.index); + + if (info.count == -1) + return -EINVAL; + + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_SET_IRQS: + { + struct vfio_irq_set hdr; + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + u8 *data = NULL, *ptr = NULL; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || + hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK)) + return -EINVAL; + + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { + size_t size; + int max = mpci_get_irq_count(vmdev, hdr.index); + + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) + size = sizeof(uint8_t); + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) + size = sizeof(int32_t); + else + return -EINVAL; + + if (hdr.argsz - minsz < hdr.count * size || + hdr.start >= max || hdr.start + hdr.count > max) + return -EINVAL; + + ptr = data = memdup_user((void __user *)(arg + minsz), + hdr.count * size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + if (parent->ops->set_irqs) + ret = parent->ops->set_irqs(mdev, hdr.flags, hdr.index, + hdr.start, hdr.count, data); + + kfree(ptr); + return ret; + } + case VFIO_DEVICE_RESET: + { + struct parent_device *parent = vmdev->mdev->parent; + + if (parent->ops->reset) + return parent->ops->reset(vmdev->mdev); + + return -EINVAL; + } + } + return -ENOTTY; +} + +static ssize_t vfio_mpci_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_mdev *vmdev = device_data; + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + int ret = 0; + + if (!count) + return 0; + + if (parent->ops->read) { + char *ret_data, *ptr; + + ptr = ret_data = kzalloc(count, GFP_KERNEL); + + if (!ret_data) + return -ENOMEM; + + ret = parent->ops->read(mdev, ret_data, count, *ppos); + + if (ret > 0) { + if (copy_to_user(buf, ret_data, ret)) + ret = -EFAULT; + else + *ppos += ret; + } + kfree(ptr); + } + + return ret; +} + +static ssize_t vfio_mpci_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_mdev *vmdev = device_data; + struct mdev_device *mdev = vmdev->mdev; + struct parent_device *parent = mdev->parent; + int ret = 0; + + if (!count) + return 0; + + if (parent->ops->write) { + char *usr_data, *ptr; + + ptr = usr_data = memdup_user(buf, count); + if (IS_ERR(usr_data)) + return PTR_ERR(usr_data); + + ret = parent->ops->write(mdev, usr_data, count, *ppos); + + if (ret > 0) + *ppos += ret; + + kfree(ptr); + } + + return ret; +} + +static int mdev_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int ret; + struct vfio_mdev *vmdev = vma->vm_private_data; + struct mdev_device *mdev; + struct parent_device *parent; + u64 virtaddr = (u64)vmf->virtual_address; + unsigned long req_size, pgoff = 0; + pgprot_t pg_prot; + unsigned int index; + + if (!vmdev && !vmdev->mdev) + return -EINVAL; + + mdev = vmdev->mdev; + parent = mdev->parent; + + pg_prot = vma->vm_page_prot; + + if (parent->ops->validate_map_request) { + u64 offset; + loff_t pos; + + offset = virtaddr - vma->vm_start; + req_size = vma->vm_end - virtaddr; + pos = (vma->vm_pgoff << PAGE_SHIFT) + offset; + + ret = parent->ops->validate_map_request(mdev, pos, &virtaddr, + &pgoff, &req_size, &pg_prot); + if (ret) + return ret; + + /* + * Verify pgoff and req_size are valid and virtaddr is within + * vma range + */ + if (!pgoff || !req_size || (virtaddr < vma->vm_start) || + ((virtaddr + req_size) >= vma->vm_end)) + return -EINVAL; + } else { + struct pci_dev *pdev; + + virtaddr = vma->vm_start; + req_size = vma->vm_end - vma->vm_start; + + pdev = to_pci_dev(parent->dev); + index = VFIO_PCI_OFFSET_TO_INDEX(vma->vm_pgoff << PAGE_SHIFT); + pgoff = pci_resource_start(pdev, index) >> PAGE_SHIFT; + } + + ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot); + + return ret | VM_FAULT_NOPAGE; +} + +void mdev_dev_mmio_close(struct vm_area_struct *vma) +{ + struct vfio_mdev *vmdev = vma->vm_private_data; + struct mdev_device *mdev = vmdev->mdev; + + mdev_del_phys_mapping(mdev, vma->vm_pgoff << PAGE_SHIFT); +} + +static const struct vm_operations_struct mdev_dev_mmio_ops = { + .fault = mdev_dev_mmio_fault, + .close = mdev_dev_mmio_close, +}; + +static int vfio_mpci_mmap(void *device_data, struct vm_area_struct *vma) +{ + unsigned int index; + struct vfio_mdev *vmdev = device_data; + struct mdev_device *mdev = vmdev->mdev; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + + if (index >= VFIO_PCI_ROM_REGION_INDEX) + return -EINVAL; + + vma->vm_private_data = vmdev; + vma->vm_ops = &mdev_dev_mmio_ops; + + return mdev_add_phys_mapping(mdev, vma->vm_file->f_mapping, + vma->vm_pgoff << PAGE_SHIFT, + vma->vm_end - vma->vm_start); +} + +static const struct vfio_device_ops vfio_mpci_dev_ops = { + .name = "vfio-mpci", + .open = vfio_mpci_open, + .release = vfio_mpci_close, + .ioctl = vfio_mpci_unlocked_ioctl, + .read = vfio_mpci_read, + .write = vfio_mpci_write, + .mmap = vfio_mpci_mmap, +}; + +int vfio_mpci_probe(struct device *dev) +{ + struct vfio_mdev *vmdev; + struct mdev_device *mdev = to_mdev_device(dev); + int ret; + + vmdev = kzalloc(sizeof(*vmdev), GFP_KERNEL); + if (IS_ERR(vmdev)) + return PTR_ERR(vmdev); + + vmdev->mdev = mdev_get_device(mdev); + vmdev->group = mdev->group; + mutex_init(&vmdev->vfio_mdev_lock); + + ret = vfio_add_group_dev(dev, &vfio_mpci_dev_ops, vmdev); + if (ret) + kfree(vmdev); + + mdev_put_device(mdev); + return ret; +} + +void vfio_mpci_remove(struct device *dev) +{ + struct vfio_mdev *vmdev; + + vmdev = vfio_del_group_dev(dev); + kfree(vmdev); +} + +int vfio_mpci_match(struct device *dev) +{ + if (dev_is_pci(dev->parent)) + return 1; + + return 0; +} + +struct mdev_driver vfio_mpci_driver = { + .name = "vfio_mpci", + .probe = vfio_mpci_probe, + .remove = vfio_mpci_remove, + .match = vfio_mpci_match, +}; + +static int __init vfio_mpci_init(void) +{ + return mdev_register_driver(&vfio_mpci_driver, THIS_MODULE); +} + +static void __exit vfio_mpci_exit(void) +{ + mdev_unregister_driver(&vfio_mpci_driver); +} + +module_init(vfio_mpci_init) +module_exit(vfio_mpci_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 8a7d546d18a0..04a450908ffb 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -19,12 +19,6 @@ #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H -#define VFIO_PCI_OFFSET_SHIFT 40 - -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) -#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) -#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) - /* Special capability IDs predefined access */ #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 5ffd1d9ad4bd..5b912be9d9c3 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -18,6 +18,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/vgaarb.h> +#include <linux/vfio.h> #include "vfio_pci_private.h" diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0ecae0b1cd34..431b824b0d3e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -18,6 +18,13 @@ #include <linux/poll.h> #include <uapi/linux/vfio.h> +#define VFIO_PCI_OFFSET_SHIFT 40 + +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + + /** * struct vfio_device_ops - VFIO bus driver device callbacks *