Message ID | 20180228201520.25283.97532.stgit@gimli.home (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Alex, I love your patch! Perhaps something to improve: [auto build test WARNING on linus/master] [also build test WARNING on v4.16-rc4 next-20180306] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Alex-Williamson/vfio-pci-Pull-BAR-mapping-setup-from-read-write-path/20180303-015851 reproduce: # apt-get install sparse make ARCH=x86_64 allmodconfig make C=1 CF=-D__CHECK_ENDIAN__ sparse warnings: (new ones prefixed by >>) >> drivers/vfio/pci/vfio_pci_rdwr.c:290:1: sparse: incorrect type in argument 2 (different address spaces) @@ expected void [noderef] <asn:2>*<noident> @@ got void *opaque @@ drivers/vfio/pci/vfio_pci_rdwr.c:290:1: expected void [noderef] <asn:2>*<noident> drivers/vfio/pci/vfio_pci_rdwr.c:290:1: got void *opaque drivers/vfio/pci/vfio_pci_rdwr.c:291:1: sparse: incorrect type in argument 2 (different address spaces) @@ expected void [noderef] <asn:2>*<noident> @@ got void *opaque @@ drivers/vfio/pci/vfio_pci_rdwr.c:291:1: expected void [noderef] <asn:2>*<noident> drivers/vfio/pci/vfio_pci_rdwr.c:291:1: got void *opaque drivers/vfio/pci/vfio_pci_rdwr.c:292:1: sparse: incorrect type in argument 2 (different address spaces) @@ expected void [noderef] <asn:2>*<noident> @@ got void *opaque @@ drivers/vfio/pci/vfio_pci_rdwr.c:292:1: expected void [noderef] <asn:2>*<noident> drivers/vfio/pci/vfio_pci_rdwr.c:292:1: got void *opaque >> drivers/vfio/pci/vfio_pci_rdwr.c:378:52: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *opaque @@ got void [noderef] <asn:2>* @@ drivers/vfio/pci/vfio_pci_rdwr.c:378:52: expected void *opaque drivers/vfio/pci/vfio_pci_rdwr.c:378:52: got void [noderef] <asn:2>* vim +290 drivers/vfio/pci/vfio_pci_rdwr.c 286 287 #ifdef iowrite64 288 VFIO_PCI_IOEVENTFD_HANDLER(64) 289 #endif > 290 VFIO_PCI_IOEVENTFD_HANDLER(32) 291 VFIO_PCI_IOEVENTFD_HANDLER(16) 292 VFIO_PCI_IOEVENTFD_HANDLER(8) 293 294 
long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, 295 uint64_t data, int count, int fd) 296 { 297 struct pci_dev *pdev = vdev->pdev; 298 loff_t pos = offset & VFIO_PCI_OFFSET_MASK; 299 int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); 300 struct vfio_pci_ioeventfd *ioeventfd; 301 int (*handler)(void *addr, void *value); 302 303 /* Only support ioeventfds into BARs */ 304 if (bar > VFIO_PCI_BAR5_REGION_INDEX) 305 return -EINVAL; 306 307 if (pos + count > pci_resource_len(pdev, bar)) 308 return -EINVAL; 309 310 /* Disallow ioeventfds working around MSI-X table writes */ 311 if (bar == vdev->msix_bar && 312 !(pos + count <= vdev->msix_offset || 313 pos >= vdev->msix_offset + vdev->msix_size)) 314 return -EINVAL; 315 316 switch (count) { 317 case 1: 318 handler = &vfio_pci_ioeventfd_handler8; 319 break; 320 case 2: 321 handler = &vfio_pci_ioeventfd_handler16; 322 break; 323 case 4: 324 handler = &vfio_pci_ioeventfd_handler32; 325 break; 326 #ifdef iowrite64 327 case 8: 328 handler = &vfio_pci_ioeventfd_handler64; 329 break; 330 #endif 331 default: 332 return -EINVAL; 333 } 334 335 ret = vfio_pci_setup_barmap(vdev, bar); 336 if (ret) 337 return ret; 338 339 mutex_lock(&vdev->ioeventfds_lock); 340 341 list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) { 342 if (ioeventfd->pos == pos && ioeventfd->bar == bar && 343 ioeventfd->data == data && ioeventfd->count == count) { 344 if (fd == -1) { 345 vfio_virqfd_disable(&ioeventfd->virqfd); 346 list_del(&ioeventfd->next); 347 vdev->ioeventfds_nr--; 348 kfree(ioeventfd); 349 ret = 0; 350 } else 351 ret = -EEXIST; 352 353 goto out_unlock; 354 } 355 } 356 357 if (fd < 0) { 358 ret = -ENODEV; 359 goto out_unlock; 360 } 361 362 if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) { 363 ret = -ENOSPC; 364 goto out_unlock; 365 } 366 367 ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL); 368 if (!ioeventfd) { 369 ret = -ENOMEM; 370 goto out_unlock; 371 } 372 373 ioeventfd->pos = pos; 374 ioeventfd->bar = bar; 
375 ioeventfd->data = data; 376 ioeventfd->count = count; 377 > 378 ret = vfio_virqfd_enable(vdev->barmap[bar] + pos, handler, NULL, --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Wed, Feb 28, 2018 at 01:15:20PM -0700, Alex Williamson wrote: [...] > @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > vdev->irq_type = VFIO_PCI_NUM_IRQS; > mutex_init(&vdev->igate); > spin_lock_init(&vdev->irqlock); > + mutex_init(&vdev->ioeventfds_lock); Do we better need to destroy the mutex in vfio_pci_remove? I see that vfio_pci_device.igate is also without a destructor. I'm not sure on both. Thanks, > + INIT_LIST_HEAD(&vdev->ioeventfds_list); > > ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); > if (ret) {
Hi Alex, On 28/02/18 21:15, Alex Williamson wrote: > The ioeventfd here is actually irqfd handling of an ioeventfd such as > supported in KVM. A user is able to pre-program a device write to > occur when the eventfd triggers. This is yet another instance of > eventfd-irqfd triggering between KVM and vfio. The impetus for this > is high frequency writes to pages which are virtualized in QEMU. > Enabling this near-direct write path for selected registers within > the virtualized page can improve performance and reduce overhead. > Specifically this is initially targeted at NVIDIA graphics cards where > the driver issues a write to an MMIO register within a virtualized > region in order to allow the MSI interrupt to re-trigger. > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > --- > drivers/vfio/pci/vfio_pci.c | 34 ++++++++++ > drivers/vfio/pci/vfio_pci_private.h | 18 +++++ > drivers/vfio/pci/vfio_pci_rdwr.c | 115 +++++++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 27 ++++++++ > 4 files changed, 194 insertions(+) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index b0f759476900..ad18ed266dc0 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -305,6 +305,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > { > struct pci_dev *pdev = vdev->pdev; > struct vfio_pci_dummy_resource *dummy_res, *tmp; > + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; > int i, bar; > > /* Stop the device from further DMA */ > @@ -314,6 +315,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > VFIO_IRQ_SET_ACTION_TRIGGER, > vdev->irq_type, 0, 0, NULL); > > + /* Device closed, don't need mutex here */ > + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, > + &vdev->ioeventfds_list, next) { > + vfio_virqfd_disable(&ioeventfd->virqfd); > + list_del(&ioeventfd->next); > + kfree(ioeventfd); > + } > + vdev->ioeventfds_nr = 0; > + > vdev->virq_disabled = false; > > for (i = 
0; i < vdev->num_regions; i++) > @@ -1012,6 +1022,28 @@ static long vfio_pci_ioctl(void *device_data, > > kfree(groups); > return ret; > + } else if (cmd == VFIO_DEVICE_IOEVENTFD) { > + struct vfio_device_ioeventfd ioeventfd; > + int count; > + > + minsz = offsetofend(struct vfio_device_ioeventfd, fd); > + > + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (ioeventfd.argsz < minsz) > + return -EINVAL; > + > + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) > + return -EINVAL; > + > + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; > + > + if (hweight8(count) != 1 || ioeventfd.fd < -1) > + return -EINVAL; > + > + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, > + ioeventfd.data, count, ioeventfd.fd); > } > > return -ENOTTY; > @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > vdev->irq_type = VFIO_PCI_NUM_IRQS; > mutex_init(&vdev->igate); > spin_lock_init(&vdev->irqlock); > + mutex_init(&vdev->ioeventfds_lock); > + INIT_LIST_HEAD(&vdev->ioeventfds_list); > > ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); > if (ret) { > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > index f561ac1c78a0..33a48c3ba11c 100644 > --- a/drivers/vfio/pci/vfio_pci_private.h > +++ b/drivers/vfio/pci/vfio_pci_private.h > @@ -29,6 +29,18 @@ > #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ > #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ > > +/* Cap maximum number of ioeventfds per device (arbitrary) */ > +#define VFIO_PCI_IOEVENTFD_MAX 1000 > + > +struct vfio_pci_ioeventfd { > + struct list_head next; > + struct virqfd *virqfd; > + loff_t pos; > + uint64_t data; > + int bar; > + int count; > +}; > + > struct vfio_pci_irq_ctx { > struct eventfd_ctx *trigger; > struct virqfd *unmask; > @@ -92,9 +104,12 @@ struct vfio_pci_device { > bool nointx; > struct pci_saved_state *pci_saved_state; > int refcnt; > 
+ int ioeventfds_nr; > struct eventfd_ctx *err_trigger; > struct eventfd_ctx *req_trigger; > struct list_head dummy_resources_list; > + struct mutex ioeventfds_lock; > + struct list_head ioeventfds_list; > }; > > #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) > @@ -120,6 +135,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, > extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, > size_t count, loff_t *ppos, bool iswrite); > > +extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, > + uint64_t data, int count, int fd); > + > extern int vfio_pci_init_perm_bits(void); > extern void vfio_pci_uninit_perm_bits(void); > > diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c > index 925419e0f459..43e4b5112337 100644 > --- a/drivers/vfio/pci/vfio_pci_rdwr.c > +++ b/drivers/vfio/pci/vfio_pci_rdwr.c > @@ -17,6 +17,7 @@ > #include <linux/pci.h> > #include <linux/uaccess.h> > #include <linux/io.h> > +#include <linux/vfio.h> > #include <linux/vgaarb.h> > > #include "vfio_pci_private.h" > @@ -275,3 +276,117 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, > > return done; > } > + > +#define VFIO_PCI_IOEVENTFD_HANDLER(size) \ > +static int vfio_pci_ioeventfd_handler##size(void *opaque, void *data) \ > +{ \ > + vfio_iowrite##size((unsigned long)data, opaque); \ > + return 0; \ > +} > + > +#ifdef iowrite64 > +VFIO_PCI_IOEVENTFD_HANDLER(64) > +#endif > +VFIO_PCI_IOEVENTFD_HANDLER(32) > +VFIO_PCI_IOEVENTFD_HANDLER(16) > +VFIO_PCI_IOEVENTFD_HANDLER(8) > + > +long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, > + uint64_t data, int count, int fd) > +{ > + struct pci_dev *pdev = vdev->pdev; > + loff_t pos = offset & VFIO_PCI_OFFSET_MASK; > + int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); > + struct vfio_pci_ioeventfd *ioeventfd; > + int (*handler)(void *addr, void *value); > + > + /* Only support ioeventfds into 
BARs */ > + if (bar > VFIO_PCI_BAR5_REGION_INDEX) > + return -EINVAL; > + > + if (pos + count > pci_resource_len(pdev, bar)) > + return -EINVAL; > + > + /* Disallow ioeventfds working around MSI-X table writes */ > + if (bar == vdev->msix_bar && > + !(pos + count <= vdev->msix_offset || > + pos >= vdev->msix_offset + vdev->msix_size)) > + return -EINVAL; > + > + switch (count) { > + case 1: > + handler = &vfio_pci_ioeventfd_handler8; > + break; > + case 2: > + handler = &vfio_pci_ioeventfd_handler16; > + break; > + case 4: > + handler = &vfio_pci_ioeventfd_handler32; > + break; > +#ifdef iowrite64 > + case 8: > + handler = &vfio_pci_ioeventfd_handler64; > + break; from a user point of view, it is straightforward this setup will be rejected? This is not documented in the uapi at the moment. Thanks Eric > +#endif > + default: > + return -EINVAL; > + } > + > + ret = vfio_pci_setup_barmap(vdev, bar); > + if (ret) > + return ret; > + > + mutex_lock(&vdev->ioeventfds_lock); > + > + list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) { > + if (ioeventfd->pos == pos && ioeventfd->bar == bar && > + ioeventfd->data == data && ioeventfd->count == count) { > + if (fd == -1) { > + vfio_virqfd_disable(&ioeventfd->virqfd); > + list_del(&ioeventfd->next); > + vdev->ioeventfds_nr--; > + kfree(ioeventfd); > + ret = 0; > + } else > + ret = -EEXIST; > + > + goto out_unlock; > + } > + } > + > + if (fd < 0) { > + ret = -ENODEV; > + goto out_unlock; > + } > + > + if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) { > + ret = -ENOSPC; > + goto out_unlock; > + } > + > + ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL); > + if (!ioeventfd) { > + ret = -ENOMEM; > + goto out_unlock; > + } > + > + ioeventfd->pos = pos; > + ioeventfd->bar = bar; > + ioeventfd->data = data; > + ioeventfd->count = count; > + > + ret = vfio_virqfd_enable(vdev->barmap[bar] + pos, handler, NULL, > + (void *)data, &ioeventfd->virqfd, fd); > + if (ret) { > + kfree(ioeventfd); > + goto out_unlock; > + } > + 
> + list_add(&ioeventfd->next, &vdev->ioeventfds_list); > + vdev->ioeventfds_nr++; > + > +out_unlock: > + mutex_unlock(&vdev->ioeventfds_lock); > + > + return ret; > +} > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index c74372163ed2..7e9d76203e86 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -575,6 +575,33 @@ struct vfio_device_gfx_plane_info { > > #define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15) > > +/** > + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16, > + * struct vfio_device_ioeventfd) > + * > + * Perform a write to the device at the specified device fd offset, with > + * the specified data and width when the provided eventfd is triggered. > + * vfio bus drivers may not support this for all regions, or at all. > + * vfio-pci currently only enables support for BAR regions and excludes > + * the MSI-X vector table. > + * > + * Return: 0 on success, -errno on failure. > + */ > +struct vfio_device_ioeventfd { > + __u32 argsz; > + __u32 flags; > +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */ > +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */ > +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ > +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ > +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) > + __u64 offset; /* device fd offset of write */ > + __u64 data; /* data to be written */ > + __s32 fd; /* -1 for de-assignment */ > +}; > + > +#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) > + > /* -------- API for Type1 VFIO IOMMU -------- */ > > /** >
On Tue, 13 Mar 2018 14:12:34 +0100 Auger Eric <eric.auger@redhat.com> wrote: > On 28/02/18 21:15, Alex Williamson wrote: > > +long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, > > + uint64_t data, int count, int fd) > > +{ > > + struct pci_dev *pdev = vdev->pdev; > > + loff_t pos = offset & VFIO_PCI_OFFSET_MASK; > > + int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); > > + struct vfio_pci_ioeventfd *ioeventfd; > > + int (*handler)(void *addr, void *value); > > + > > + /* Only support ioeventfds into BARs */ > > + if (bar > VFIO_PCI_BAR5_REGION_INDEX) > > + return -EINVAL; > > + > > + if (pos + count > pci_resource_len(pdev, bar)) > > + return -EINVAL; > > + > > + /* Disallow ioeventfds working around MSI-X table writes */ > > + if (bar == vdev->msix_bar && > > + !(pos + count <= vdev->msix_offset || > > + pos >= vdev->msix_offset + vdev->msix_size)) > > + return -EINVAL; > > + > > + switch (count) { > > + case 1: > > + handler = &vfio_pci_ioeventfd_handler8; > > + break; > > + case 2: > > + handler = &vfio_pci_ioeventfd_handler16; > > + break; > > + case 4: > > + handler = &vfio_pci_ioeventfd_handler32; > > + break; > > +#ifdef iowrite64 > > + case 8: > > + handler = &vfio_pci_ioeventfd_handler64; > > + break; > from a user point of view, it is straightforward this setup will be > rejected? This is not documented in the uapi at the moment. I added a mention in the uapi, do you see any need for more? Essentially I consider this an entirely optional accelerator, bus drivers are free to implement as much or little as they want. Userspace can clearly make do without it, we've gone this long, and it's easy to reject cases we don't want to support. Thanks, Alex
On Wed, 7 Mar 2018 13:56:44 +0800 Peter Xu <peterx@redhat.com> wrote: > On Wed, Feb 28, 2018 at 01:15:20PM -0700, Alex Williamson wrote: > > [...] > > > @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > > vdev->irq_type = VFIO_PCI_NUM_IRQS; > > mutex_init(&vdev->igate); > > spin_lock_init(&vdev->irqlock); > > + mutex_init(&vdev->ioeventfds_lock); > > Do we better need to destroy the mutex in vfio_pci_remove? > > I see that vfio_pci_device.igate is also without a destructor. I'm > not sure on both. Yeah, mutex_destroy() is purely for debugging and I must have missed it when implementing vfio. I'll add it in the remove function and try to clean up the others in a separate patch, at some point. Thanks, Alex
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b0f759476900..ad18ed266dc0 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -305,6 +305,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; int i, bar; /* Stop the device from further DMA */ @@ -314,6 +315,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, vdev->irq_type, 0, 0, NULL); + /* Device closed, don't need mutex here */ + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, + &vdev->ioeventfds_list, next) { + vfio_virqfd_disable(&ioeventfd->virqfd); + list_del(&ioeventfd->next); + kfree(ioeventfd); + } + vdev->ioeventfds_nr = 0; + vdev->virq_disabled = false; for (i = 0; i < vdev->num_regions; i++) @@ -1012,6 +1022,28 @@ static long vfio_pci_ioctl(void *device_data, kfree(groups); return ret; + } else if (cmd == VFIO_DEVICE_IOEVENTFD) { + struct vfio_device_ioeventfd ioeventfd; + int count; + + minsz = offsetofend(struct vfio_device_ioeventfd, fd); + + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + return -EFAULT; + + if (ioeventfd.argsz < minsz) + return -EINVAL; + + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) + return -EINVAL; + + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; + + if (hweight8(count) != 1 || ioeventfd.fd < -1) + return -EINVAL; + + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, + ioeventfd.data, count, ioeventfd.fd); } return -ENOTTY; @@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); + mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->ioeventfds_list); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { diff --git 
a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index f561ac1c78a0..33a48c3ba11c 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -29,6 +29,18 @@ #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ +/* Cap maximum number of ioeventfds per device (arbitrary) */ +#define VFIO_PCI_IOEVENTFD_MAX 1000 + +struct vfio_pci_ioeventfd { + struct list_head next; + struct virqfd *virqfd; + loff_t pos; + uint64_t data; + int bar; + int count; +}; + struct vfio_pci_irq_ctx { struct eventfd_ctx *trigger; struct virqfd *unmask; @@ -92,9 +104,12 @@ struct vfio_pci_device { bool nointx; struct pci_saved_state *pci_saved_state; int refcnt; + int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; struct list_head dummy_resources_list; + struct mutex ioeventfds_lock; + struct list_head ioeventfds_list; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) @@ -120,6 +135,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, + uint64_t data, int count, int fd); + extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 925419e0f459..43e4b5112337 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,6 +17,7 @@ #include <linux/pci.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/vfio.h> #include <linux/vgaarb.h> #include "vfio_pci_private.h" @@ -275,3 +276,117 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return done; } + +#define VFIO_PCI_IOEVENTFD_HANDLER(size) \ +static int 
vfio_pci_ioeventfd_handler##size(void *opaque, void *data) \ +{ \ + vfio_iowrite##size((unsigned long)data, opaque); \ + return 0; \ +} + +#ifdef iowrite64 +VFIO_PCI_IOEVENTFD_HANDLER(64) +#endif +VFIO_PCI_IOEVENTFD_HANDLER(32) +VFIO_PCI_IOEVENTFD_HANDLER(16) +VFIO_PCI_IOEVENTFD_HANDLER(8) + +long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, + uint64_t data, int count, int fd) +{ + struct pci_dev *pdev = vdev->pdev; + loff_t pos = offset & VFIO_PCI_OFFSET_MASK; + int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); + struct vfio_pci_ioeventfd *ioeventfd; + int (*handler)(void *addr, void *value); + + /* Only support ioeventfds into BARs */ + if (bar > VFIO_PCI_BAR5_REGION_INDEX) + return -EINVAL; + + if (pos + count > pci_resource_len(pdev, bar)) + return -EINVAL; + + /* Disallow ioeventfds working around MSI-X table writes */ + if (bar == vdev->msix_bar && + !(pos + count <= vdev->msix_offset || + pos >= vdev->msix_offset + vdev->msix_size)) + return -EINVAL; + + switch (count) { + case 1: + handler = &vfio_pci_ioeventfd_handler8; + break; + case 2: + handler = &vfio_pci_ioeventfd_handler16; + break; + case 4: + handler = &vfio_pci_ioeventfd_handler32; + break; +#ifdef iowrite64 + case 8: + handler = &vfio_pci_ioeventfd_handler64; + break; +#endif + default: + return -EINVAL; + } + + ret = vfio_pci_setup_barmap(vdev, bar); + if (ret) + return ret; + + mutex_lock(&vdev->ioeventfds_lock); + + list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) { + if (ioeventfd->pos == pos && ioeventfd->bar == bar && + ioeventfd->data == data && ioeventfd->count == count) { + if (fd == -1) { + vfio_virqfd_disable(&ioeventfd->virqfd); + list_del(&ioeventfd->next); + vdev->ioeventfds_nr--; + kfree(ioeventfd); + ret = 0; + } else + ret = -EEXIST; + + goto out_unlock; + } + } + + if (fd < 0) { + ret = -ENODEV; + goto out_unlock; + } + + if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) { + ret = -ENOSPC; + goto out_unlock; + } + + ioeventfd = 
kzalloc(sizeof(*ioeventfd), GFP_KERNEL); + if (!ioeventfd) { + ret = -ENOMEM; + goto out_unlock; + } + + ioeventfd->pos = pos; + ioeventfd->bar = bar; + ioeventfd->data = data; + ioeventfd->count = count; + + ret = vfio_virqfd_enable(vdev->barmap[bar] + pos, handler, NULL, + (void *)data, &ioeventfd->virqfd, fd); + if (ret) { + kfree(ioeventfd); + goto out_unlock; + } + + list_add(&ioeventfd->next, &vdev->ioeventfds_list); + vdev->ioeventfds_nr++; + +out_unlock: + mutex_unlock(&vdev->ioeventfds_lock); + + return ret; +} diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c74372163ed2..7e9d76203e86 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -575,6 +575,33 @@ struct vfio_device_gfx_plane_info { #define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15) +/** + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16, + * struct vfio_device_ioeventfd) + * + * Perform a write to the device at the specified device fd offset, with + * the specified data and width when the provided eventfd is triggered. + * vfio bus drivers may not support this for all regions, or at all. + * vfio-pci currently only enables support for BAR regions and excludes + * the MSI-X vector table. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_ioeventfd { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */ +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */ +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) + __u64 offset; /* device fd offset of write */ + __u64 data; /* data to be written */ + __s32 fd; /* -1 for de-assignment */ +}; + +#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) + /* -------- API for Type1 VFIO IOMMU -------- */ /**
The ioeventfd here is actually irqfd handling of an ioeventfd such as supported in KVM. A user is able to pre-program a device write to occur when the eventfd triggers. This is yet another instance of eventfd-irqfd triggering between KVM and vfio. The impetus for this is high frequency writes to pages which are virtualized in QEMU. Enabling this near-direct write path for selected registers within the virtualized page can improve performance and reduce overhead. Specifically this is initially targeted at NVIDIA graphics cards where the driver issues a write to an MMIO register within a virtualized region in order to allow the MSI interrupt to re-trigger. Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/vfio/pci/vfio_pci.c | 34 ++++++++++ drivers/vfio/pci/vfio_pci_private.h | 18 +++++ drivers/vfio/pci/vfio_pci_rdwr.c | 115 +++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 27 ++++++++ 4 files changed, 194 insertions(+)