Message ID | 20230227111135.61728-17-yi.l.liu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add vfio_device cdev for iommufd support | expand |
On Mon, Feb 27, 2023 at 03:11:32AM -0800, Yi Liu wrote: > This adds ioctl for userspace to bind device cdev fd to iommufd. > > VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA > control provided by the iommufd. open_device > op is called after bind_iommufd op. > VFIO no iommu mode is indicated by passing > a negative iommufd value. > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > --- > drivers/vfio/device_cdev.c | 146 +++++++++++++++++++++++++++++++++++++ > drivers/vfio/vfio.h | 17 ++++- > drivers/vfio/vfio_main.c | 54 ++++++++++++-- > include/linux/iommufd.h | 6 ++ > include/uapi/linux/vfio.h | 34 +++++++++ > 5 files changed, 248 insertions(+), 9 deletions(-) > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c > index 9e2c1ecaaf4f..37f80e368551 100644 > --- a/drivers/vfio/device_cdev.c > +++ b/drivers/vfio/device_cdev.c > @@ -3,6 +3,7 @@ > * Copyright (c) 2023 Intel Corporation. > */ > #include <linux/vfio.h> > +#include <linux/iommufd.h> > > #include "vfio.h" > > @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) > return ret; > } > > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df) > +{ > + spin_lock(&df->kvm_ref_lock); > + if (!df->kvm) > + goto unlock; > + > + _vfio_device_get_kvm_safe(df->device, df->kvm); > + > +unlock: Just if (df->kvm) _vfio_device_get_kvm_safe(df->device, df->kvm); Without the goto > + spin_unlock(&df->kvm_ref_lock); > +} > + > +void vfio_device_cdev_close(struct vfio_device_file *df) > +{ > + struct vfio_device *device = df->device; > + > + mutex_lock(&device->dev_set->lock); > + /* > + * As df->access_granted writer is under dev_set->lock as well, > + * so this read no need to use smp_load_acquire() to pair with > + * smp_store_release() in the caller of vfio_device_open(). > + */ This is a bit misleading, we are about to free df in the caller, so at this moment df has no current access. We don't even need to have the mutex to test it. > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > + unsigned long arg) struct device __user *arg and remove all the casts. > +{ > + struct vfio_device *device = df->device; > + struct vfio_device_bind_iommufd bind; > + struct iommufd_ctx *iommufd = NULL; > + unsigned long minsz; > + int ret; > + > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz || bind.flags) > + return -EINVAL; > + > + if (!device->ops->bind_iommufd) > + return -ENODEV; > + > + ret = vfio_device_block_group(device); > + if (ret) > + return ret; > + > + mutex_lock(&device->dev_set->lock); > + /* > + * If already been bound to an iommufd, or already set noiommu > + * then fail it. > + */ > + if (df->iommufd || df->noiommu) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + /* iommufd < 0 means noiommu mode */ > + if (bind.iommufd < 0) { > + if (!capable(CAP_SYS_RAWIO)) { > + ret = -EPERM; > + goto out_unlock; > + } > + df->noiommu = true; > + } else { > + iommufd = vfio_get_iommufd_from_fd(bind.iommufd); > + if (IS_ERR(iommufd)) { > + ret = PTR_ERR(iommufd); > + goto out_unlock; > + } > + } > + > + /* > + * Before the device open, get the KVM pointer currently > + * associated with the device file (if there is) and obtain > + * a reference. This reference is held until device closed. > + * Save the pointer in the device for use by drivers. > + */ > + vfio_device_get_kvm_safe(df); > + > + df->iommufd = iommufd; > + ret = vfio_device_open(df, &bind.out_devid, NULL); > + if (ret) > + goto out_put_kvm; > + > + ret = copy_to_user((void __user *)arg + > + offsetofend(struct vfio_device_bind_iommufd, iommufd), ?? &arg->out_dev_id static_assert(__same_type...) > diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h > index 650d45629647..9672cf839687 100644 > --- a/include/linux/iommufd.h > +++ b/include/linux/iommufd.h > @@ -17,6 +17,12 @@ struct iommufd_ctx; > struct iommufd_access; > struct file; > > +/* > + * iommufd core init xarray with flags==XA_FLAGS_ALLOC1, so valid > + * ID starts from 1. > + */ > +#define IOMMUFD_INVALID_ID 0 Why? vfio doesn't need to check this just to generate EINVAL. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Tuesday, February 28, 2023 3:20 AM > > On Mon, Feb 27, 2023 at 03:11:32AM -0800, Yi Liu wrote: > > This adds ioctl for userspace to bind device cdev fd to iommufd. > > > > VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain > DMA > > control provided by the iommufd. open_device > > op is called after bind_iommufd op. > > VFIO no iommu mode is indicated by passing > > a negative iommufd value. > > > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > > --- > > drivers/vfio/device_cdev.c | 146 > +++++++++++++++++++++++++++++++++++++ > > drivers/vfio/vfio.h | 17 ++++- > > drivers/vfio/vfio_main.c | 54 ++++++++++++-- > > include/linux/iommufd.h | 6 ++ > > include/uapi/linux/vfio.h | 34 +++++++++ > > 5 files changed, 248 insertions(+), 9 deletions(-) > > > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c > > index 9e2c1ecaaf4f..37f80e368551 100644 > > --- a/drivers/vfio/device_cdev.c > > +++ b/drivers/vfio/device_cdev.c > > @@ -3,6 +3,7 @@ > > * Copyright (c) 2023 Intel Corporation. > > */ > > #include <linux/vfio.h> > > +#include <linux/iommufd.h> > > > > #include "vfio.h" > > > > @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode > *inode, struct file *filep) > > return ret; > > } > > > > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df) > > +{ > > + spin_lock(&df->kvm_ref_lock); > > + if (!df->kvm) > > + goto unlock; > > + > > + _vfio_device_get_kvm_safe(df->device, df->kvm); > > + > > +unlock: > > Just > > if (df->kvm) > _vfio_device_get_kvm_safe(df->device, df->kvm); > > Without the goto Got it. > > + spin_unlock(&df->kvm_ref_lock); > > +} > > + > > +void vfio_device_cdev_close(struct vfio_device_file *df) > > +{ > > + struct vfio_device *device = df->device; > > + > > + mutex_lock(&device->dev_set->lock); > > + /* > > + * As df->access_granted writer is under dev_set->lock as well, > > + * so this read no need to use smp_load_acquire() to pair with > > + * smp_store_release() in the caller of vfio_device_open(). > > + */ > > This is a bit misleading, we are about to free df in the caller, so at > this moment df has no current access. We don't even need to have the > mutex to test it. Ok. so I can test it outside the lock and make the comment more clear? How about below? Or simply no need to have a comment here? /* * caller of vfio_device_cdev_close() is going to free df, so there * is no need to use smp_load_acquire() to pair with * smp_store_release() in the writer path of df->access_granted. */ > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > > + unsigned long arg) > > struct device __user *arg and remove all the casts. > > > +{ > > + struct vfio_device *device = df->device; > > + struct vfio_device_bind_iommufd bind; > > + struct iommufd_ctx *iommufd = NULL; > > + unsigned long minsz; > > + int ret; > > + > > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz || bind.flags) > > + return -EINVAL; > > + > > + if (!device->ops->bind_iommufd) > > + return -ENODEV; > > + > > + ret = vfio_device_block_group(device); > > + if (ret) > > + return ret; > > + > > + mutex_lock(&device->dev_set->lock); > > + /* > > + * If already been bound to an iommufd, or already set noiommu > > + * then fail it. > > + */ > > + if (df->iommufd || df->noiommu) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + /* iommufd < 0 means noiommu mode */ > > + if (bind.iommufd < 0) { > > + if (!capable(CAP_SYS_RAWIO)) { > > + ret = -EPERM; > > + goto out_unlock; > > + } > > + df->noiommu = true; > > + } else { > > + iommufd = vfio_get_iommufd_from_fd(bind.iommufd); > > + if (IS_ERR(iommufd)) { > > + ret = PTR_ERR(iommufd); > > + goto out_unlock; > > + } > > + } > > + > > + /* > > + * Before the device open, get the KVM pointer currently > > + * associated with the device file (if there is) and obtain > > + * a reference. This reference is held until device closed. > > + * Save the pointer in the device for use by drivers. > > + */ > > + vfio_device_get_kvm_safe(df); > > + > > + df->iommufd = iommufd; > > + ret = vfio_device_open(df, &bind.out_devid, NULL); > > + if (ret) > > + goto out_put_kvm; > > + > > + ret = copy_to_user((void __user *)arg + > > + offsetofend(struct vfio_device_bind_iommufd, > iommufd), > > ?? > > &arg->out_dev_id > > static_assert(__same_type...) Yes, all the above comments are similar with other two patches. Will refine accordingly. > > diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h > > index 650d45629647..9672cf839687 100644 > > --- a/include/linux/iommufd.h > > +++ b/include/linux/iommufd.h > > @@ -17,6 +17,12 @@ struct iommufd_ctx; > > struct iommufd_access; > > struct file; > > > > +/* > > + * iommufd core init xarray with flags==XA_FLAGS_ALLOC1, so valid > > + * ID starts from 1. > > + */ > > +#define IOMMUFD_INVALID_ID 0 > > Why? vfio doesn't need to check this just to generate EINVAL. Hmmm, you are right. Not needed any more. Regards, Yi Liu
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Monday, February 27, 2023 7:12 PM [...] > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > + unsigned long arg) > +{ > + struct vfio_device *device = df->device; > + struct vfio_device_bind_iommufd bind; > + struct iommufd_ctx *iommufd = NULL; > + unsigned long minsz; > + int ret; > + > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz || bind.flags) > + return -EINVAL; > + > + if (!device->ops->bind_iommufd) > + return -ENODEV; Hi Jason, Per the comment in vfio_iommufd_bind(), such device driver won't provide .bind_iommufd(). So shall we allow this ioctl to go longer to call .open_device() instead of failing it here? I think we need to allow it to go further. E.g. leave the check to be in vfio_iommufd_bind(). Otherwise, user may not able to use such devices. Is it? > + > + ret = vfio_device_block_group(device); > + if (ret) > + return ret; > + > + mutex_lock(&device->dev_set->lock); > + /* > + * If already been bound to an iommufd, or already set noiommu > + * then fail it. > + */ > + if (df->iommufd || df->noiommu) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + /* iommufd < 0 means noiommu mode */ > + if (bind.iommufd < 0) { > + if (!capable(CAP_SYS_RAWIO)) { > + ret = -EPERM; > + goto out_unlock; > + } > + df->noiommu = true; > + } else { > + iommufd = vfio_get_iommufd_from_fd(bind.iommufd); > + if (IS_ERR(iommufd)) { > + ret = PTR_ERR(iommufd); > + goto out_unlock; > + } > + } > + > + /* > + * Before the device open, get the KVM pointer currently > + * associated with the device file (if there is) and obtain > + * a reference. This reference is held until device closed. > + * Save the pointer in the device for use by drivers. > + */ > + vfio_device_get_kvm_safe(df); > + > + df->iommufd = iommufd; > + ret = vfio_device_open(df, &bind.out_devid, NULL); > + if (ret) > + goto out_put_kvm; [...] > > /* --------------- IOCTLs for DEVICE file descriptors --------------- */ > > +/* > + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19, > + * struct vfio_device_bind_iommufd) > + * > + * Bind a vfio_device to the specified iommufd. > + * > + * The user should provide a device cookie when calling this ioctl. The > + * cookie is carried only in event e.g. I/O fault reported to userspace > + * via iommufd. The user should use devid returned by this ioctl to mark > + * the target device in other ioctls (e.g. capability query via iommufd). > + * > + * User is not allowed to access the device before the binding operation > + * is completed. > + * > + * Unbind is automatically conducted when device fd is closed. > + * > + * @argsz: user filled size of this data. > + * @flags: reserved for future extension. > + * @dev_cookie: a per device cookie provided by userspace. > + * @iommufd: iommufd to bind. a negative value means noiommu. > + * @out_devid: the device id generated by this bind. > + * > + * Return: 0 on success, -errno on failure. > + */ > +struct vfio_device_bind_iommufd { > + __u32 argsz; > + __u32 flags; > + __aligned_u64 dev_cookie; > + __s32 iommufd; > + __u32 out_devid; As above, for the devices that do not do DMA, there is no .bind_iommufd op, hence no iommufd_device generated. This means no good value can be filled in this out_devid field. So this field is optional. Only for the devices which do DMA, should this out_devid field return a valid ID otherwise an invalid ID would be filled (e.g. value #0 is an invalid value in the iommufd object id pool). Userspace needs to check if the out_devid is valid or not before use. This ID can be further used in iommufd uAPIs like IOMMU_HWPT_ALLOC, IOMMU_DEVICE_GET_INFO and etc. > +}; > + > +#define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE > + 19) > + > /** > * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7, > * struct vfio_device_info) > -- > 2.34.1 Regards, Yi Liu
On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote: > > From: Liu, Yi L <yi.l.liu@intel.com> > > Sent: Monday, February 27, 2023 7:12 PM > [...] > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > > + unsigned long arg) > > +{ > > + struct vfio_device *device = df->device; > > + struct vfio_device_bind_iommufd bind; > > + struct iommufd_ctx *iommufd = NULL; > > + unsigned long minsz; > > + int ret; > > + > > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz || bind.flags) > > + return -EINVAL; > > + > > + if (!device->ops->bind_iommufd) > > + return -ENODEV; > > Hi Jason, > > Per the comment in vfio_iommufd_bind(), such device driver > won't provide .bind_iommufd(). So shall we allow this ioctl > to go longer to call .open_device() instead of failing it here? > I think we need to allow it to go further. E.g. leave the check > to be in vfio_iommufd_bind(). Otherwise, user may not able > to use such devices. Is it? You are thinking about the crazy mdev samples? We should probably just change them to provide a 'no dma' set of ops. > > +struct vfio_device_bind_iommufd { > > + __u32 argsz; > > + __u32 flags; > > + __aligned_u64 dev_cookie; > > + __s32 iommufd; > > + __u32 out_devid; > > As above, for the devices that do not do DMA, there is no .bind_iommufd > op, hence no iommufd_device generated. This means no good value > can be filled in this out_devid field. So this field is optional. Only > for the devices which do DMA, should this out_devid field return a > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an > invalid value in the iommufd object id pool). Userspace needs to > check if the out_devid is valid or not before use. This ID can be further > used in iommufd uAPIs like IOMMU_HWPT_ALLOC, IOMMU_DEVICE_GET_INFO > and etc. I would say create an access and harmonize the no-DMA devices with the emulated devices. What should we return here anyhow if an access was created? Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Thursday, March 2, 2023 1:47 AM > > On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote: > > > From: Liu, Yi L <yi.l.liu@intel.com> > > > Sent: Monday, February 27, 2023 7:12 PM > > [...] > > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > > > + unsigned long arg) > > > +{ > > > + struct vfio_device *device = df->device; > > > + struct vfio_device_bind_iommufd bind; > > > + struct iommufd_ctx *iommufd = NULL; > > > + unsigned long minsz; > > > + int ret; > > > + > > > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > > > + > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (bind.argsz < minsz || bind.flags) > > > + return -EINVAL; > > > + > > > + if (!device->ops->bind_iommufd) > > > + return -ENODEV; > > > > Hi Jason, > > > > Per the comment in vfio_iommufd_bind(), such device driver > > won't provide .bind_iommufd(). So shall we allow this ioctl > > to go longer to call .open_device() instead of failing it here? > > I think we need to allow it to go further. E.g. leave the check > > to be in vfio_iommufd_bind(). Otherwise, user may not able > > to use such devices. Is it? > > You are thinking about the crazy mdev samples? Yes. we don't have real devices which don't do DMA. Is it? > We should probably just change them to provide a 'no dma' set of ops. Yes. at least generate iommufd_device I suppose. > > > +struct vfio_device_bind_iommufd { > > > + __u32 argsz; > > > + __u32 flags; > > > + __aligned_u64 dev_cookie; > > > + __s32 iommufd; > > > + __u32 out_devid; > > > > As above, for the devices that do not do DMA, there is no .bind_iommufd > > op, hence no iommufd_device generated. This means no good value > > can be filled in this out_devid field. So this field is optional. Only > > for the devices which do DMA, should this out_devid field return a > > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an > > invalid value in the iommufd object id pool). Userspace needs to > > check if the out_devid is valid or not before use. This ID can be further > > used in iommufd uAPIs like IOMMU_HWPT_ALLOC, > IOMMU_DEVICE_GET_INFO > > and etc. > > I would say create an access and harmonize the no-DMA devices with the > emulated devices. In this case, iommufd_access would be created instead of iommufd_device. > What should we return here anyhow if an access was created? It depends on what can be done with this id and whether this field is mandatory. For iommufd_device ID, the user could further use it to query iommu device info and alloc hwpt. Do we have a similar usage for iommufd_access? And if we define this field as optional, then we may return iommufd_access object Id in future if it is needed. Regards, Yi Liu
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Thursday, March 2, 2023 1:47 AM > > On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote: > > > From: Liu, Yi L <yi.l.liu@intel.com> > > > Sent: Monday, February 27, 2023 7:12 PM > > [...] > > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > > > + unsigned long arg) > > > +{ > > > + struct vfio_device *device = df->device; > > > + struct vfio_device_bind_iommufd bind; > > > + struct iommufd_ctx *iommufd = NULL; > > > + unsigned long minsz; > > > + int ret; > > > + > > > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > > > + > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > + return -EFAULT; > > > + > > > + if (bind.argsz < minsz || bind.flags) > > > + return -EINVAL; > > > + > > > + if (!device->ops->bind_iommufd) > > > + return -ENODEV; > > > > Hi Jason, > > > > Per the comment in vfio_iommufd_bind(), such device driver > > won't provide .bind_iommufd(). So shall we allow this ioctl > > to go longer to call .open_device() instead of failing it here? > > I think we need to allow it to go further. E.g. leave the check > > to be in vfio_iommufd_bind(). Otherwise, user may not able > > to use such devices. Is it? > > You are thinking about the crazy mdev samples? > > We should probably just change them to provide a 'no dma' set of ops. > > > > +struct vfio_device_bind_iommufd { > > > + __u32 argsz; > > > + __u32 flags; > > > + __aligned_u64 dev_cookie; > > > + __s32 iommufd; > > > + __u32 out_devid; > > > > As above, for the devices that do not do DMA, there is no .bind_iommufd > > op, hence no iommufd_device generated. This means no good value > > can be filled in this out_devid field. So this field is optional. Only > > for the devices which do DMA, should this out_devid field return a > > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an > > invalid value in the iommufd object id pool). Userspace needs to > > check if the out_devid is valid or not before use. This ID can be further > > used in iommufd uAPIs like IOMMU_HWPT_ALLOC, > IOMMU_DEVICE_GET_INFO > > and etc. > > I would say create an access and harmonize the no-DMA devices with the > emulated devices. How about below change? diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 4f82a6fa7c6c..e536515086d7 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -18,12 +18,8 @@ int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx) lockdep_assert_held(&vdev->dev_set->lock); - /* - * If the driver doesn't provide this op then it means the device does - * not do DMA at all. So nothing to do. - */ - if (!vdev->ops->bind_iommufd) - return 0; + if (WARN_ON(!vdev->ops->bind_iommufd)) + return -ENODEV; ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id); if (ret) @@ -102,7 +98,9 @@ EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); /* * The emulated standard ops mean that vfio_device is going to use the * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this - * ops set should call vfio_register_emulated_iommu_dev(). + * ops set should call vfio_register_emulated_iommu_dev(). Drivers that do + * not call vfio_pin_pages()/vfio_dma_rw() has no need to provide dma_unmap + * callback. */ static void vfio_emulated_unmap(void *data, unsigned long iova, @@ -110,7 +107,8 @@ static void vfio_emulated_unmap(void *data, unsigned long iova, { struct vfio_device *vdev = data; - vdev->ops->dma_unmap(vdev, iova, length); + if (vdev->ops->dma_unmap) + vdev->ops->dma_unmap(vdev, iova, length); } static const struct iommufd_access_ops vfio_user_ops = { diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index e54eb752e1ba..19391dda5fba 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1374,6 +1374,9 @@ static const struct vfio_device_ops mbochs_dev_ops = { .write = mbochs_write, .ioctl = mbochs_ioctl, .mmap = mbochs_mmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static struct mdev_driver mbochs_driver = { diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index e8400fdab71d..5f48aef36995 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -663,6 +663,9 @@ static const struct vfio_device_ops mdpy_dev_ops = { .write = mdpy_write, .ioctl = mdpy_ioctl, .mmap = mdpy_mmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static struct mdev_driver mdpy_driver = { diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index e887de672c52..35460901b9f7 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1269,6 +1269,9 @@ static const struct vfio_device_ops mtty_dev_ops = { .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static struct mdev_driver mtty_driver = { > What should we return here anyhow if an access was created? iommufd_access->obj.id. should be fine. Is it? Regards, Yi Liu
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Friday, March 3, 2023 2:58 PM > > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Thursday, March 2, 2023 1:47 AM > > > > On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote: > > > > From: Liu, Yi L <yi.l.liu@intel.com> > > > > Sent: Monday, February 27, 2023 7:12 PM > > > [...] > > > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > > > > + unsigned long arg) > > > > +{ > > > > + struct vfio_device *device = df->device; > > > > + struct vfio_device_bind_iommufd bind; > > > > + struct iommufd_ctx *iommufd = NULL; > > > > + unsigned long minsz; > > > > + int ret; > > > > + > > > > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > > > > + > > > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > > > + return -EFAULT; > > > > + > > > > + if (bind.argsz < minsz || bind.flags) > > > > + return -EINVAL; > > > > + > > > > + if (!device->ops->bind_iommufd) > > > > + return -ENODEV; > > > > > > Hi Jason, > > > > > > Per the comment in vfio_iommufd_bind(), such device driver > > > won't provide .bind_iommufd(). So shall we allow this ioctl > > > to go longer to call .open_device() instead of failing it here? > > > I think we need to allow it to go further. E.g. leave the check > > > to be in vfio_iommufd_bind(). Otherwise, user may not able > > > to use such devices. Is it? > > > > You are thinking about the crazy mdev samples? > > > > We should probably just change them to provide a 'no dma' set of ops. > > > > > > +struct vfio_device_bind_iommufd { > > > > + __u32 argsz; > > > > + __u32 flags; > > > > + __aligned_u64 dev_cookie; > > > > + __s32 iommufd; > > > > + __u32 out_devid; > > > > > > As above, for the devices that do not do DMA, there is > no .bind_iommufd > > > op, hence no iommufd_device generated. This means no good value > > > can be filled in this out_devid field. So this field is optional. Only > > > for the devices which do DMA, should this out_devid field return a > > > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an > > > invalid value in the iommufd object id pool). Userspace needs to > > > check if the out_devid is valid or not before use. This ID can be further > > > used in iommufd uAPIs like IOMMU_HWPT_ALLOC, > > IOMMU_DEVICE_GET_INFO > > > and etc. > > > > I would say create an access and harmonize the no-DMA devices with the > > emulated devices. > > How about below change? > > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c > index 4f82a6fa7c6c..e536515086d7 100644 > --- a/drivers/vfio/iommufd.c > +++ b/drivers/vfio/iommufd.c > @@ -18,12 +18,8 @@ int vfio_iommufd_bind(struct vfio_device *vdev, > struct iommufd_ctx *ictx) > > lockdep_assert_held(&vdev->dev_set->lock); > > - /* > - * If the driver doesn't provide this op then it means the device does > - * not do DMA at all. So nothing to do. > - */ > - if (!vdev->ops->bind_iommufd) > - return 0; > + if (WARN_ON(!vdev->ops->bind_iommufd)) > + return -ENODEV; > > ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id); > if (ret) > @@ -102,7 +98,9 @@ > EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); > /* > * The emulated standard ops mean that vfio_device is going to use the > * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using > this > - * ops set should call vfio_register_emulated_iommu_dev(). > + * ops set should call vfio_register_emulated_iommu_dev(). Drivers that > do > + * not call vfio_pin_pages()/vfio_dma_rw() has no need to provide > dma_unmap > + * callback. > */ > > static void vfio_emulated_unmap(void *data, unsigned long iova, > @@ -110,7 +107,8 @@ static void vfio_emulated_unmap(void *data, > unsigned long iova, > { > struct vfio_device *vdev = data; > > - vdev->ops->dma_unmap(vdev, iova, length); > + if (vdev->ops->dma_unmap) > + vdev->ops->dma_unmap(vdev, iova, length); > } > > static const struct iommufd_access_ops vfio_user_ops = { > diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c > index e54eb752e1ba..19391dda5fba 100644 > --- a/samples/vfio-mdev/mbochs.c > +++ b/samples/vfio-mdev/mbochs.c > @@ -1374,6 +1374,9 @@ static const struct vfio_device_ops > mbochs_dev_ops = { > .write = mbochs_write, > .ioctl = mbochs_ioctl, > .mmap = mbochs_mmap, > + .bind_iommufd = vfio_iommufd_emulated_bind, > + .unbind_iommufd = vfio_iommufd_emulated_unbind, > + .attach_ioas = vfio_iommufd_emulated_attach_ioas, > }; > > static struct mdev_driver mbochs_driver = { > diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c > index e8400fdab71d..5f48aef36995 100644 > --- a/samples/vfio-mdev/mdpy.c > +++ b/samples/vfio-mdev/mdpy.c > @@ -663,6 +663,9 @@ static const struct vfio_device_ops mdpy_dev_ops = > { > .write = mdpy_write, > .ioctl = mdpy_ioctl, > .mmap = mdpy_mmap, > + .bind_iommufd = vfio_iommufd_emulated_bind, > + .unbind_iommufd = vfio_iommufd_emulated_unbind, > + .attach_ioas = vfio_iommufd_emulated_attach_ioas, > }; > > static struct mdev_driver mdpy_driver = { > diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c > index e887de672c52..35460901b9f7 100644 > --- a/samples/vfio-mdev/mtty.c > +++ b/samples/vfio-mdev/mtty.c > @@ -1269,6 +1269,9 @@ static const struct vfio_device_ops mtty_dev_ops > = { > .read = mtty_read, > .write = mtty_write, > .ioctl = mtty_ioctl, > + .bind_iommufd = vfio_iommufd_emulated_bind, > + .unbind_iommufd = vfio_iommufd_emulated_unbind, > + .attach_ioas = vfio_iommufd_emulated_attach_ioas, > }; > > static struct mdev_driver mtty_driver = { > > > What should we return here anyhow if an access was created? > > iommufd_access->obj.id. should be fine. Is it? btw. It requires creating iommufd_access in vfio_iommufd_emulated_bind() instead of in the attach(). Seems like Nicolin's replace domain series has a patch to move iommufd_access creation to the bind(). Regards, Yi Liu
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Friday, March 3, 2023 2:58 PM > > > What should we return here anyhow if an access was created? > > iommufd_access->obj.id. should be fine. Is it? > Thinking more I'm not sure whether it's a good idea to fill the dev_id field with an access object id and then later confuse the user to get an -ENOENT error when trying to allocate a hwpt with an access object id. How can user differentiate it from the real error case where invalid iommufd object is used? It sounds clearer to return dev_id only when there is a true device object being created by the bind_iommufd cmd. Then the user can use it to decide whether to further attempt dev_id related cmds.
On Tue, Mar 07, 2023 at 06:38:59AM +0000, Tian, Kevin wrote: > > From: Liu, Yi L <yi.l.liu@intel.com> > > Sent: Friday, March 3, 2023 2:58 PM > > > > > What should we return here anyhow if an access was created? > > > > iommufd_access->obj.id. should be fine. Is it? > > Thinking more I'm not sure whether it's a good idea to fill the > dev_id field with an access object id and then later confuse > the user to get an -ENOENT error when trying to allocate a > hwpt with an access object id. > > How can user differentiate it from the real error case where > invalid iommufd object is used? > > It sounds clearer to return dev_id only when there is a true > device object being created by the bind_iommufd cmd. Then > the user can use it to decide whether to further attempt > dev_id related cmds. It means we can never return an access_id I don't think this is a problem, the first thing userspace should do is a get info to the dev_id which is needed to learn which iommu driver is running it, if that returns EOPNOTSUPP then it isn't a physical iommu device. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Tuesday, March 7, 2023 8:38 PM > > On Tue, Mar 07, 2023 at 06:38:59AM +0000, Tian, Kevin wrote: > > > From: Liu, Yi L <yi.l.liu@intel.com> > > > Sent: Friday, March 3, 2023 2:58 PM > > > > > > > What should we return here anyhow if an access was created? > > > > > > iommufd_access->obj.id. should be fine. Is it? > > > > Thinking more I'm not sure whether it's a good idea to fill the > > dev_id field with an access object id and then later confuse > > the user to get an -ENOENT error when trying to allocate a > > hwpt with an access object id. > > > > How can user differentiate it from the real error case where > > invalid iommufd object is used? > > > > It sounds clearer to return dev_id only when there is a true > > device object being created by the bind_iommufd cmd. Then > > the user can use it to decide whether to further attempt > > dev_id related cmds. > > It means we can never return an access_id > > I don't think this is a problem, the first thing userspace should do > is a get info to the dev_id which is needed to learn which iommu > driver is running it, if that returns EOPNOTSUPP then it isn't a > physical iommu device. This may mean your below patch depends on the get info series.
> From: Liu, Yi L <yi.l.liu@intel.com> > Sent: Tuesday, March 7, 2023 9:04 PM > > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Tuesday, March 7, 2023 8:38 PM > > > > On Tue, Mar 07, 2023 at 06:38:59AM +0000, Tian, Kevin wrote: > > > > From: Liu, Yi L <yi.l.liu@intel.com> > > > > Sent: Friday, March 3, 2023 2:58 PM > > > > > > > > > What should we return here anyhow if an access was created? > > > > > > > > iommufd_access->obj.id. should be fine. Is it? > > > > > > Thinking more I'm not sure whether it's a good idea to fill the > > > dev_id field with an access object id and then later confuse > > > the user to get an -ENOENT error when trying to allocate a > > > hwpt with an access object id. > > > > > > How can user differentiate it from the real error case where > > > invalid iommufd object is used? > > > > > > It sounds clearer to return dev_id only when there is a true > > > device object being created by the bind_iommufd cmd. Then > > > the user can use it to decide whether to further attempt > > > dev_id related cmds. > > > > It means we can never return an access_id > > > > I don't think this is a problem, the first thing userspace should do > > is a get info to the dev_id which is needed to learn which iommu > > driver is running it, if that returns EOPNOTSUPP then it isn't a > > physical iommu device. > > This may mean your below patch depends on the get info series.
On 27/2/23 22:11, Yi Liu wrote: > This adds ioctl for userspace to bind device cdev fd to iommufd. > > VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA > control provided by the iommufd. open_device > op is called after bind_iommufd op. > VFIO no iommu mode is indicated by passing > a negative iommufd value. > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > --- > drivers/vfio/device_cdev.c | 146 +++++++++++++++++++++++++++++++++++++ > drivers/vfio/vfio.h | 17 ++++- > drivers/vfio/vfio_main.c | 54 ++++++++++++-- > include/linux/iommufd.h | 6 ++ > include/uapi/linux/vfio.h | 34 +++++++++ > 5 files changed, 248 insertions(+), 9 deletions(-) > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c > index 9e2c1ecaaf4f..37f80e368551 100644 > --- a/drivers/vfio/device_cdev.c > +++ b/drivers/vfio/device_cdev.c > @@ -3,6 +3,7 @@ > * Copyright (c) 2023 Intel Corporation. > */ > #include <linux/vfio.h> > +#include <linux/iommufd.h> > > #include "vfio.h" > > @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) > return ret; > } > > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df) > +{ > + spin_lock(&df->kvm_ref_lock); > + if (!df->kvm) > + goto unlock; > + > + _vfio_device_get_kvm_safe(df->device, df->kvm); > + > +unlock: > + spin_unlock(&df->kvm_ref_lock); > +} > + > +void vfio_device_cdev_close(struct vfio_device_file *df) > +{ > + struct vfio_device *device = df->device; > + > + mutex_lock(&device->dev_set->lock); > + /* > + * As df->access_granted writer is under dev_set->lock as well, > + * so this read no need to use smp_load_acquire() to pair with > + * smp_store_release() in the caller of vfio_device_open(). > + */ > + if (!df->access_granted) { > + mutex_unlock(&device->dev_set->lock); > + return; > + } > + vfio_device_close(df); > + vfio_device_put_kvm(device); > + if (df->iommufd) > + iommufd_ctx_put(df->iommufd); > + mutex_unlock(&device->dev_set->lock); > + vfio_device_unblock_group(device); > +} > + > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd) > +{ > + struct fd f; > + struct iommufd_ctx *iommufd; > + > + f = fdget(fd); > + if (!f.file) > + return ERR_PTR(-EBADF); > + > + iommufd = iommufd_ctx_from_file(f.file); > + > + fdput(f); > + return iommufd; > +} > + > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > + unsigned long arg) > +{ > + struct vfio_device *device = df->device; > + struct vfio_device_bind_iommufd bind; > + struct iommufd_ctx *iommufd = NULL; > + unsigned long minsz; > + int ret; > + > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz || bind.flags) > + return -EINVAL; > + > + if (!device->ops->bind_iommufd) > + return -ENODEV; > + > + ret = vfio_device_block_group(device); > + if (ret) > + return ret; > + > + mutex_lock(&device->dev_set->lock); > + /* > + * If already been bound to an iommufd, or already set noiommu > + * then fail it. > + */ > + if (df->iommufd || df->noiommu) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + /* iommufd < 0 means noiommu mode */ > + if (bind.iommufd < 0) { > + if (!capable(CAP_SYS_RAWIO)) { > + ret = -EPERM; > + goto out_unlock; > + } > + df->noiommu = true; > + } else { > + iommufd = vfio_get_iommufd_from_fd(bind.iommufd); > + if (IS_ERR(iommufd)) { > + ret = PTR_ERR(iommufd); > + goto out_unlock; > + } > + } > + > + /* > + * Before the device open, get the KVM pointer currently > + * associated with the device file (if there is) and obtain > + * a reference. This reference is held until device closed. > + * Save the pointer in the device for use by drivers. > + */ > + vfio_device_get_kvm_safe(df); > + > + df->iommufd = iommufd; > + ret = vfio_device_open(df, &bind.out_devid, NULL); This is unrelated to this patch but reminded me - while debugging QEMU, vfio_assert_device_open() kept firing as I was killing QEMU (which in turn made the kernel close all fds), device->open_count==0 as QEMU was dying before calling ioctl(VFIO_DEVICE_BIND_IOMMUFD) which would call this vfio_device_open() just above. Has this been reported/fixed, just curious?
> From: Alexey Kardashevskiy <aik@amd.com> > Sent: Friday, March 10, 2023 10:39 AM > > On 27/2/23 22:11, Yi Liu wrote: > > This adds ioctl for userspace to bind device cdev fd to iommufd. > > > > VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain > DMA > > control provided by the iommufd. open_device > > op is called after bind_iommufd op. > > VFIO no iommu mode is indicated by passing > > a negative iommufd value. > > > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > > --- > > drivers/vfio/device_cdev.c | 146 > +++++++++++++++++++++++++++++++++++++ > > drivers/vfio/vfio.h | 17 ++++- > > drivers/vfio/vfio_main.c | 54 ++++++++++++-- > > include/linux/iommufd.h | 6 ++ > > include/uapi/linux/vfio.h | 34 +++++++++ > > 5 files changed, 248 insertions(+), 9 deletions(-) > > > > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c > > index 9e2c1ecaaf4f..37f80e368551 100644 > > --- a/drivers/vfio/device_cdev.c > > +++ b/drivers/vfio/device_cdev.c > > @@ -3,6 +3,7 @@ > > * Copyright (c) 2023 Intel Corporation. > > */ > > #include <linux/vfio.h> > > +#include <linux/iommufd.h> > > > > #include "vfio.h" > > > > @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode > *inode, struct file *filep) > > return ret; > > } > > > > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df) > > +{ > > + spin_lock(&df->kvm_ref_lock); > > + if (!df->kvm) > > + goto unlock; > > + > > + _vfio_device_get_kvm_safe(df->device, df->kvm); > > + > > +unlock: > > + spin_unlock(&df->kvm_ref_lock); > > +} > > + > > +void vfio_device_cdev_close(struct vfio_device_file *df) > > +{ > > + struct vfio_device *device = df->device; > > + > > + mutex_lock(&device->dev_set->lock); > > + /* > > + * As df->access_granted writer is under dev_set->lock as well, > > + * so this read no need to use smp_load_acquire() to pair with > > + * smp_store_release() in the caller of vfio_device_open(). > > + */ device->open_count is sure to be non-zero if df->access_granted is true. Otherwise, it means this device file has not opened device successfully, so no need to do further tidy up. > > + if (!df->access_granted) { > > + mutex_unlock(&device->dev_set->lock); > > + return; > > + } > > + vfio_device_close(df); > > + vfio_device_put_kvm(device); > > + if (df->iommufd) > > + iommufd_ctx_put(df->iommufd); > > + mutex_unlock(&device->dev_set->lock); > > + vfio_device_unblock_group(device); > > +} > > + > > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd) > > +{ > > + struct fd f; > > + struct iommufd_ctx *iommufd; > > + > > + f = fdget(fd); > > + if (!f.file) > > + return ERR_PTR(-EBADF); > > + > > + iommufd = iommufd_ctx_from_file(f.file); > > + > > + fdput(f); > > + return iommufd; > > +} > > + > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, > > + unsigned long arg) > > +{ > > + struct vfio_device *device = df->device; > > + struct vfio_device_bind_iommufd bind; > > + struct iommufd_ctx *iommufd = NULL; > > + unsigned long minsz; > > + int ret; > > + > > + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); > > + > > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (bind.argsz < minsz || bind.flags) > > + return -EINVAL; > > + > > + if (!device->ops->bind_iommufd) > > + return -ENODEV; > > + > > + ret = vfio_device_block_group(device); > > + if (ret) > > + return ret; > > + > > + mutex_lock(&device->dev_set->lock); > > + /* > > + * If already been bound to an iommufd, or already set noiommu > > + * then fail it. > > + */ > > + if (df->iommufd || df->noiommu) { > > + ret = -EINVAL; > > + goto out_unlock; > > + } > > + > > + /* iommufd < 0 means noiommu mode */ > > + if (bind.iommufd < 0) { > > + if (!capable(CAP_SYS_RAWIO)) { > > + ret = -EPERM; > > + goto out_unlock; > > + } > > + df->noiommu = true; > > + } else { > > + iommufd = vfio_get_iommufd_from_fd(bind.iommufd); > > + if (IS_ERR(iommufd)) { > > + ret = PTR_ERR(iommufd); > > + goto out_unlock; > > + } > > + } > > + > > + /* > > + * Before the device open, get the KVM pointer currently > > + * associated with the device file (if there is) and obtain > > + * a reference. This reference is held until device closed. > > + * Save the pointer in the device for use by drivers. > > + */ > > + vfio_device_get_kvm_safe(df); > > + > > + df->iommufd = iommufd; > > + ret = vfio_device_open(df, &bind.out_devid, NULL); > > > This is unrelated to this patch but reminded me - while debugging QEMU, > vfio_assert_device_open() kept firing as I was killing QEMU (which in > turn made the kernel close all fds), device->open_count==0 as QEMU was > dying before calling ioctl(VFIO_DEVICE_BIND_IOMMUFD) which would call > this vfio_device_open() just above. Has this been reported/fixed, just > curious? Thanks, I think this was fixed by the code I marked above. I think it was raised in v2 review. Should have been fixed after that. Have tried v5 or v6? If still have issue, please feel free let me know it. https://lore.kernel.org/kvm/Y+HIWRM%2FTjWcuT6I@yzhao56-desk.sh.intel.com/ Regards, Yi Liu
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 9e2c1ecaaf4f..37f80e368551 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -3,6 +3,7 @@ * Copyright (c) 2023 Intel Corporation. */ #include <linux/vfio.h> +#include <linux/iommufd.h> #include "vfio.h" @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) return ret; } +static void vfio_device_get_kvm_safe(struct vfio_device_file *df) +{ + spin_lock(&df->kvm_ref_lock); + if (!df->kvm) + goto unlock; + + _vfio_device_get_kvm_safe(df->device, df->kvm); + +unlock: + spin_unlock(&df->kvm_ref_lock); +} + +void vfio_device_cdev_close(struct vfio_device_file *df) +{ + struct vfio_device *device = df->device; + + mutex_lock(&device->dev_set->lock); + /* + * As df->access_granted writer is under dev_set->lock as well, + * so this read no need to use smp_load_acquire() to pair with + * smp_store_release() in the caller of vfio_device_open(). + */ + if (!df->access_granted) { + mutex_unlock(&device->dev_set->lock); + return; + } + vfio_device_close(df); + vfio_device_put_kvm(device); + if (df->iommufd) + iommufd_ctx_put(df->iommufd); + mutex_unlock(&device->dev_set->lock); + vfio_device_unblock_group(device); +} + +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd) +{ + struct fd f; + struct iommufd_ctx *iommufd; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + iommufd = iommufd_ctx_from_file(f.file); + + fdput(f); + return iommufd; +} + +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, + unsigned long arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_bind_iommufd bind; + struct iommufd_ctx *iommufd = NULL; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); + + if (copy_from_user(&bind, (void __user *)arg, minsz)) + return -EFAULT; + + if (bind.argsz < minsz || bind.flags) + return -EINVAL; + + if (!device->ops->bind_iommufd) + return -ENODEV; + + ret = vfio_device_block_group(device); + if (ret) + return ret; + + mutex_lock(&device->dev_set->lock); + /* + * If already been bound to an iommufd, or already set noiommu + * then fail it. + */ + if (df->iommufd || df->noiommu) { + ret = -EINVAL; + goto out_unlock; + } + + /* iommufd < 0 means noiommu mode */ + if (bind.iommufd < 0) { + if (!capable(CAP_SYS_RAWIO)) { + ret = -EPERM; + goto out_unlock; + } + df->noiommu = true; + } else { + iommufd = vfio_get_iommufd_from_fd(bind.iommufd); + if (IS_ERR(iommufd)) { + ret = PTR_ERR(iommufd); + goto out_unlock; + } + } + + /* + * Before the device open, get the KVM pointer currently + * associated with the device file (if there is) and obtain + * a reference. This reference is held until device closed. + * Save the pointer in the device for use by drivers. + */ + vfio_device_get_kvm_safe(df); + + df->iommufd = iommufd; + ret = vfio_device_open(df, &bind.out_devid, NULL); + if (ret) + goto out_put_kvm; + + ret = copy_to_user((void __user *)arg + + offsetofend(struct vfio_device_bind_iommufd, iommufd), + &bind.out_devid, + sizeof(bind.out_devid)) ? -EFAULT : 0; + if (ret) + goto out_close_device; + + if (df->noiommu) + dev_warn(device->dev, "vfio-noiommu device used by user " + "(%s:%d)\n", current->comm, task_pid_nr(current)); + + /* + * Paired with smp_load_acquire() in vfio_device_fops::ioctl/ + * read/write/mmap + */ + smp_store_release(&df->access_granted, true); + mutex_unlock(&device->dev_set->lock); + + return 0; + +out_close_device: + vfio_device_close(df); +out_put_kvm: + df->iommufd = NULL; + df->noiommu = false; + vfio_device_put_kvm(device); + if (iommufd) + iommufd_ctx_put(iommufd); +out_unlock: + mutex_unlock(&device->dev_set->lock); + vfio_device_unblock_group(device); + return ret; +} + static char *vfio_device_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 8661de75f94b..4716a904e63b 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -23,7 +23,9 @@ struct vfio_device_file { bool access_granted; spinlock_t kvm_ref_lock; /* protect kvm field */ struct kvm *kvm; - struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */ + /* protected by struct vfio_device_set::lock */ + struct iommufd_ctx *iommufd; + bool noiommu; }; void vfio_device_put_registration(struct vfio_device *device); @@ -269,6 +271,9 @@ static inline void vfio_device_del(struct vfio_device *device) void vfio_init_device_cdev(struct vfio_device *device); int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep); +void vfio_device_cdev_close(struct vfio_device_file *df); +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, + unsigned long arg); int vfio_cdev_init(struct class *device_class); void vfio_cdev_cleanup(void); #else @@ -292,6 +297,16 @@ static inline int vfio_device_fops_cdev_open(struct inode *inode, return 0; } +static inline void vfio_device_cdev_close(struct vfio_device_file *df) +{ +} + +static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df, + unsigned long arg) +{ + return -EOPNOTSUPP; +} + static inline int vfio_cdev_init(struct class *device_class) { return 0; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3f83447d022e..69d0add930bb 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -37,6 +37,7 @@ #include <linux/interval_tree.h> #include <linux/iova_bitmap.h> #include <linux/iommufd.h> +#include <uapi/linux/iommufd.h> #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -422,16 +423,32 @@ static int vfio_device_first_open(struct vfio_device_file *df, { struct vfio_device *device = df->device; struct iommufd_ctx *iommufd = df->iommufd; - int ret; + int ret = 0; lockdep_assert_held(&device->dev_set->lock); + if (WARN_ON(iommufd && df->noiommu)) + return -EINVAL; + if (!try_module_get(device->dev->driver->owner)) return -ENODEV; + /* + * For group/container path, iommufd pointer is NULL when comes + * into this helper. Its noiommu support is handled by + * vfio_device_group_use_iommu() + * + * For iommufd compat mode, iommufd pointer here is a valid value. + * Its noiommu support is in vfio_iommufd_bind(). + * + * For device cdev path, iommufd pointer here is a valid value for + * normal cases, but it is NULL if it's noiommu. Check df->noiommu + * to differentiate cdev noiommu from the group/container path which + * also passes NULL iommufd pointer in. If set then do nothing. + */ if (iommufd) ret = vfio_iommufd_bind(device, iommufd, dev_id, pt_id); - else + else if (!df->noiommu) ret = vfio_device_group_use_iommu(device); if (ret) goto err_module_put; @@ -446,7 +463,7 @@ static int vfio_device_first_open(struct vfio_device_file *df, err_unuse_iommu: if (iommufd) vfio_iommufd_unbind(device); - else + else if (!df->noiommu) vfio_device_group_unuse_iommu(device); err_module_put: module_put(device->dev->driver->owner); @@ -464,7 +481,7 @@ static void vfio_device_last_close(struct vfio_device_file *df) device->ops->close_device(device); if (iommufd) vfio_iommufd_unbind(device); - else + else if (!df->noiommu) vfio_device_group_unuse_iommu(device); module_put(device->dev->driver->owner); } @@ -549,6 +566,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) if (!df->is_cdev_device) vfio_device_group_close(df); + else + vfio_device_cdev_close(df); vfio_device_put_registration(device); @@ -1122,7 +1141,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, struct vfio_device *device = df->device; int ret; - /* Paired with smp_store_release() in vfio_device_group_open() */ + if (cmd == VFIO_DEVICE_BIND_IOMMUFD) + return vfio_device_ioctl_bind_iommufd(df, arg); + + /* + * Paired with smp_store_release() in the caller of + * vfio_device_open(). e.g. vfio_device_group_open() + * and vfio_device_ioctl_bind_iommufd() + */ if (!smp_load_acquire(&df->access_granted)) return -EINVAL; @@ -1153,7 +1179,11 @@ static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, struct vfio_device_file *df = filep->private_data; struct vfio_device *device = df->device; - /* Paired with smp_store_release() in vfio_device_group_open() */ + /* + * Paired with smp_store_release() in the caller of + * vfio_device_open(). e.g. vfio_device_group_open() + * and vfio_device_ioctl_bind_iommufd() + */ if (!smp_load_acquire(&df->access_granted)) return -EINVAL; @@ -1170,7 +1200,11 @@ static ssize_t vfio_device_fops_write(struct file *filep, struct vfio_device_file *df = filep->private_data; struct vfio_device *device = df->device; - /* Paired with smp_store_release() in vfio_device_group_open() */ + /* + * Paired with smp_store_release() in the caller of + * vfio_device_open(). e.g. vfio_device_group_open() + * and vfio_device_ioctl_bind_iommufd() + */ if (!smp_load_acquire(&df->access_granted)) return -EINVAL; @@ -1185,7 +1219,11 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) struct vfio_device_file *df = filep->private_data; struct vfio_device *device = df->device; - /* Paired with smp_store_release() in vfio_device_group_open() */ + /* + * Paired with smp_store_release() in the caller of + * vfio_device_open(). e.g. vfio_device_group_open() + * and vfio_device_ioctl_bind_iommufd() + */ if (!smp_load_acquire(&df->access_granted)) return -EINVAL; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 650d45629647..9672cf839687 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -17,6 +17,12 @@ struct iommufd_ctx; struct iommufd_access; struct file; +/* + * iommufd core init xarray with flags==XA_FLAGS_ALLOC1, so valid + * ID starts from 1. + */ +#define IOMMUFD_INVALID_ID 0 + struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 4bf11ee8de53..92aa8dbc970a 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -194,6 +194,40 @@ struct vfio_group_status { /* --------------- IOCTLs for DEVICE file descriptors --------------- */ +/* + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19, + * struct vfio_device_bind_iommufd) + * + * Bind a vfio_device to the specified iommufd. + * + * The user should provide a device cookie when calling this ioctl. The + * cookie is carried only in event e.g. I/O fault reported to userspace + * via iommufd. The user should use devid returned by this ioctl to mark + * the target device in other ioctls (e.g. capability query via iommufd). + * + * User is not allowed to access the device before the binding operation + * is completed. + * + * Unbind is automatically conducted when device fd is closed. + * + * @argsz: user filled size of this data. + * @flags: reserved for future extension. + * @dev_cookie: a per device cookie provided by userspace. + * @iommufd: iommufd to bind. a negative value means noiommu. + * @out_devid: the device id generated by this bind. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_bind_iommufd { + __u32 argsz; + __u32 flags; + __aligned_u64 dev_cookie; + __s32 iommufd; + __u32 out_devid; +}; + +#define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 19) + /** * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7, * struct vfio_device_info)
This adds ioctl for userspace to bind device cdev fd to iommufd. VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA control provided by the iommufd. open_device op is called after bind_iommufd op. VFIO no iommu mode is indicated by passing a negative iommufd value. Signed-off-by: Yi Liu <yi.l.liu@intel.com> --- drivers/vfio/device_cdev.c | 146 +++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 17 ++++- drivers/vfio/vfio_main.c | 54 ++++++++++++-- include/linux/iommufd.h | 6 ++ include/uapi/linux/vfio.h | 34 +++++++++ 5 files changed, 248 insertions(+), 9 deletions(-)