diff mbox series

[v5,16/19] vfio: Add VFIO_DEVICE_BIND_IOMMUFD

Message ID 20230227111135.61728-17-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Add vfio_device cdev for iommufd support | expand

Commit Message

Yi Liu Feb. 27, 2023, 11:11 a.m. UTC
This adds ioctl for userspace to bind device cdev fd to iommufd.

    VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
			      control provided by the iommufd. open_device
			      op is called after bind_iommufd op.
			      VFIO no iommu mode is indicated by passing
			      a negative iommufd value.

Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/device_cdev.c | 146 +++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio.h        |  17 ++++-
 drivers/vfio/vfio_main.c   |  54 ++++++++++++--
 include/linux/iommufd.h    |   6 ++
 include/uapi/linux/vfio.h  |  34 +++++++++
 5 files changed, 248 insertions(+), 9 deletions(-)

Comments

Jason Gunthorpe Feb. 27, 2023, 7:19 p.m. UTC | #1
On Mon, Feb 27, 2023 at 03:11:32AM -0800, Yi Liu wrote:
> This adds ioctl for userspace to bind device cdev fd to iommufd.
> 
>     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> 			      control provided by the iommufd. open_device
> 			      op is called after bind_iommufd op.
> 			      VFIO no iommu mode is indicated by passing
> 			      a negative iommufd value.
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/device_cdev.c | 146 +++++++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio.h        |  17 ++++-
>  drivers/vfio/vfio_main.c   |  54 ++++++++++++--
>  include/linux/iommufd.h    |   6 ++
>  include/uapi/linux/vfio.h  |  34 +++++++++
>  5 files changed, 248 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index 9e2c1ecaaf4f..37f80e368551 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -3,6 +3,7 @@
>   * Copyright (c) 2023 Intel Corporation.
>   */
>  #include <linux/vfio.h>
> +#include <linux/iommufd.h>
>  
>  #include "vfio.h"
>  
> @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>  	return ret;
>  }
>  
> +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> +{
> +	spin_lock(&df->kvm_ref_lock);
> +	if (!df->kvm)
> +		goto unlock;
> +
> +	_vfio_device_get_kvm_safe(df->device, df->kvm);
> +
> +unlock:

Just 

if (df->kvm)
   _vfio_device_get_kvm_safe(df->device, df->kvm);

Without the goto

> +	spin_unlock(&df->kvm_ref_lock);
> +}
> +
> +void vfio_device_cdev_close(struct vfio_device_file *df)
> +{
> +	struct vfio_device *device = df->device;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/*
> +	 * As df->access_granted writer is under dev_set->lock as well,
> +	 * so this read no need to use smp_load_acquire() to pair with
> +	 * smp_store_release() in the caller of vfio_device_open().
> +	 */

This is a bit misleading, we are about to free df in the caller, so at
this moment df has no current access. We don't even need to have the
mutex to test it.

> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    unsigned long arg)

struct device __user *arg and remove all the casts.

> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_bind_iommufd bind;
> +	struct iommufd_ctx *iommufd = NULL;
> +	unsigned long minsz;
> +	int ret;
> +
> +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> +
> +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> +		return -EFAULT;
> +
> +	if (bind.argsz < minsz || bind.flags)
> +		return -EINVAL;
> +
> +	if (!device->ops->bind_iommufd)
> +		return -ENODEV;
> +
> +	ret = vfio_device_block_group(device);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/*
> +	 * If already been bound to an iommufd, or already set noiommu
> +	 * then fail it.
> +	 */
> +	if (df->iommufd || df->noiommu) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	/* iommufd < 0 means noiommu mode */
> +	if (bind.iommufd < 0) {
> +		if (!capable(CAP_SYS_RAWIO)) {
> +			ret = -EPERM;
> +			goto out_unlock;
> +		}
> +		df->noiommu = true;
> +	} else {
> +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> +		if (IS_ERR(iommufd)) {
> +			ret = PTR_ERR(iommufd);
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * Before the device open, get the KVM pointer currently
> +	 * associated with the device file (if there is) and obtain
> +	 * a reference.  This reference is held until device closed.
> +	 * Save the pointer in the device for use by drivers.
> +	 */
> +	vfio_device_get_kvm_safe(df);
> +
> +	df->iommufd = iommufd;
> +	ret = vfio_device_open(df, &bind.out_devid, NULL);
> +	if (ret)
> +		goto out_put_kvm;
> +
> +	ret = copy_to_user((void __user *)arg +
> +			   offsetofend(struct vfio_device_bind_iommufd, iommufd),

??

&arg->out_dev_id

static_assert(__same_type...)

> diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
> index 650d45629647..9672cf839687 100644
> --- a/include/linux/iommufd.h
> +++ b/include/linux/iommufd.h
> @@ -17,6 +17,12 @@ struct iommufd_ctx;
>  struct iommufd_access;
>  struct file;
>  
> +/*
> + * iommufd core init xarray with flags==XA_FLAGS_ALLOC1, so valid
> + * ID starts from 1.
> + */
> +#define IOMMUFD_INVALID_ID 0

Why? vfio doesn't need to check this just to generate EINVAL.

Jason
Yi Liu Feb. 28, 2023, 4:08 a.m. UTC | #2
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, February 28, 2023 3:20 AM
> 
> On Mon, Feb 27, 2023 at 03:11:32AM -0800, Yi Liu wrote:
> > This adds ioctl for userspace to bind device cdev fd to iommufd.
> >
> >     VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain
> DMA
> > 			      control provided by the iommufd. open_device
> > 			      op is called after bind_iommufd op.
> > 			      VFIO no iommu mode is indicated by passing
> > 			      a negative iommufd value.
> >
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/device_cdev.c | 146
> +++++++++++++++++++++++++++++++++++++
> >  drivers/vfio/vfio.h        |  17 ++++-
> >  drivers/vfio/vfio_main.c   |  54 ++++++++++++--
> >  include/linux/iommufd.h    |   6 ++
> >  include/uapi/linux/vfio.h  |  34 +++++++++
> >  5 files changed, 248 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index 9e2c1ecaaf4f..37f80e368551 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -3,6 +3,7 @@
> >   * Copyright (c) 2023 Intel Corporation.
> >   */
> >  #include <linux/vfio.h>
> > +#include <linux/iommufd.h>
> >
> >  #include "vfio.h"
> >
> > @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode
> *inode, struct file *filep)
> >  	return ret;
> >  }
> >
> > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> > +{
> > +	spin_lock(&df->kvm_ref_lock);
> > +	if (!df->kvm)
> > +		goto unlock;
> > +
> > +	_vfio_device_get_kvm_safe(df->device, df->kvm);
> > +
> > +unlock:
> 
> Just
> 
> if (df->kvm)
>    _vfio_device_get_kvm_safe(df->device, df->kvm);
> 
> Without the goto

Got it.

> > +	spin_unlock(&df->kvm_ref_lock);
> > +}
> > +
> > +void vfio_device_cdev_close(struct vfio_device_file *df)
> > +{
> > +	struct vfio_device *device = df->device;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	/*
> > +	 * As df->access_granted writer is under dev_set->lock as well,
> > +	 * so this read no need to use smp_load_acquire() to pair with
> > +	 * smp_store_release() in the caller of vfio_device_open().
> > +	 */
> 
> This is a bit misleading, we are about to free df in the caller, so at
> this moment df has no current access. We don't even need to have the
> mutex to test it.

Ok. so I can test it outside the lock and make the comment
more clear? How about below? Or simply no need to have
a comment here?

/*
  * caller of vfio_device_cdev_close() is going to free df, so there
  * is no need to use smp_load_acquire() to pair with
  * smp_store_release() in the writer path of df->access_granted.
  */

> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    unsigned long arg)
> 
> struct device __user *arg and remove all the casts.
> 
> > +{
> > +	struct vfio_device *device = df->device;
> > +	struct vfio_device_bind_iommufd bind;
> > +	struct iommufd_ctx *iommufd = NULL;
> > +	unsigned long minsz;
> > +	int ret;
> > +
> > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > +
> > +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (bind.argsz < minsz || bind.flags)
> > +		return -EINVAL;
> > +
> > +	if (!device->ops->bind_iommufd)
> > +		return -ENODEV;
> > +
> > +	ret = vfio_device_block_group(device);
> > +	if (ret)
> > +		return ret;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	/*
> > +	 * If already been bound to an iommufd, or already set noiommu
> > +	 * then fail it.
> > +	 */
> > +	if (df->iommufd || df->noiommu) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/* iommufd < 0 means noiommu mode */
> > +	if (bind.iommufd < 0) {
> > +		if (!capable(CAP_SYS_RAWIO)) {
> > +			ret = -EPERM;
> > +			goto out_unlock;
> > +		}
> > +		df->noiommu = true;
> > +	} else {
> > +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> > +		if (IS_ERR(iommufd)) {
> > +			ret = PTR_ERR(iommufd);
> > +			goto out_unlock;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Before the device open, get the KVM pointer currently
> > +	 * associated with the device file (if there is) and obtain
> > +	 * a reference.  This reference is held until device closed.
> > +	 * Save the pointer in the device for use by drivers.
> > +	 */
> > +	vfio_device_get_kvm_safe(df);
> > +
> > +	df->iommufd = iommufd;
> > +	ret = vfio_device_open(df, &bind.out_devid, NULL);
> > +	if (ret)
> > +		goto out_put_kvm;
> > +
> > +	ret = copy_to_user((void __user *)arg +
> > +			   offsetofend(struct vfio_device_bind_iommufd,
> iommufd),
> 
> ??
> 
> &arg->out_dev_id
>
> static_assert(__same_type...)

Yes, all the above comments are similar with other two patches. Will
refine accordingly.

> > diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
> > index 650d45629647..9672cf839687 100644
> > --- a/include/linux/iommufd.h
> > +++ b/include/linux/iommufd.h
> > @@ -17,6 +17,12 @@ struct iommufd_ctx;
> >  struct iommufd_access;
> >  struct file;
> >
> > +/*
> > + * iommufd core init xarray with flags==XA_FLAGS_ALLOC1, so valid
> > + * ID starts from 1.
> > + */
> > +#define IOMMUFD_INVALID_ID 0
> 
> Why? vfio doesn't need to check this just to generate EINVAL.

Hmmm, you are right. Not needed any more.

Regards,
Yi Liu
Yi Liu March 1, 2023, 9:19 a.m. UTC | #3
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, February 27, 2023 7:12 PM
[...]
> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    unsigned long arg)
> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_bind_iommufd bind;
> +	struct iommufd_ctx *iommufd = NULL;
> +	unsigned long minsz;
> +	int ret;
> +
> +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> +
> +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> +		return -EFAULT;
> +
> +	if (bind.argsz < minsz || bind.flags)
> +		return -EINVAL;
> +
> +	if (!device->ops->bind_iommufd)
> +		return -ENODEV;

Hi Jason,

Per the comment in vfio_iommufd_bind(), such device driver
won't provide .bind_iommufd(). So shall we allow this ioctl
to go longer to call .open_device() instead of failing it here?
I think we need to allow it to go further. E.g. leave the check
to be in vfio_iommufd_bind(). Otherwise, user may not able
to use such devices. Is it?

> +
> +	ret = vfio_device_block_group(device);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/*
> +	 * If already been bound to an iommufd, or already set noiommu
> +	 * then fail it.
> +	 */
> +	if (df->iommufd || df->noiommu) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	/* iommufd < 0 means noiommu mode */
> +	if (bind.iommufd < 0) {
> +		if (!capable(CAP_SYS_RAWIO)) {
> +			ret = -EPERM;
> +			goto out_unlock;
> +		}
> +		df->noiommu = true;
> +	} else {
> +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> +		if (IS_ERR(iommufd)) {
> +			ret = PTR_ERR(iommufd);
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * Before the device open, get the KVM pointer currently
> +	 * associated with the device file (if there is) and obtain
> +	 * a reference.  This reference is held until device closed.
> +	 * Save the pointer in the device for use by drivers.
> +	 */
> +	vfio_device_get_kvm_safe(df);
> +
> +	df->iommufd = iommufd;
> +	ret = vfio_device_open(df, &bind.out_devid, NULL);
> +	if (ret)
> +		goto out_put_kvm;
[...]
> 
>  /* --------------- IOCTLs for DEVICE file descriptors --------------- */
> 
> +/*
> + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> + *				   struct vfio_device_bind_iommufd)
> + *
> + * Bind a vfio_device to the specified iommufd.
> + *
> + * The user should provide a device cookie when calling this ioctl. The
> + * cookie is carried only in event e.g. I/O fault reported to userspace
> + * via iommufd. The user should use devid returned by this ioctl to mark
> + * the target device in other ioctls (e.g. capability query via iommufd).
> + *
> + * User is not allowed to access the device before the binding operation
> + * is completed.
> + *
> + * Unbind is automatically conducted when device fd is closed.
> + *
> + * @argsz:	 user filled size of this data.
> + * @flags:	 reserved for future extension.
> + * @dev_cookie:	 a per device cookie provided by userspace.
> + * @iommufd:	 iommufd to bind. a negative value means noiommu.
> + * @out_devid:	 the device id generated by this bind.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_bind_iommufd {
> +	__u32		argsz;
> +	__u32		flags;
> +	__aligned_u64	dev_cookie;
> +	__s32		iommufd;
> +	__u32		out_devid;

As above, for the devices that do not do DMA, there is no .bind_iommufd
op, hence no iommufd_device generated. This means no good value
can be filled in this out_devid field. So this field is optional. Only
for the devices which do DMA, should this out_devid field return a
valid ID otherwise an invalid ID would be filled (e.g. value #0 is an
invalid value in the iommufd object id pool). Userspace needs to
check if the out_devid is valid or not before use. This ID can be further
used in iommufd uAPIs like IOMMU_HWPT_ALLOC, IOMMU_DEVICE_GET_INFO
and etc.

> +};
> +
> +#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE
> + 19)
> +
>  /**
>   * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
>   *						struct vfio_device_info)
> --
> 2.34.1

Regards,
Yi Liu
Jason Gunthorpe March 1, 2023, 5:46 p.m. UTC | #4
On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, February 27, 2023 7:12 PM
> [...]
> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    unsigned long arg)
> > +{
> > +	struct vfio_device *device = df->device;
> > +	struct vfio_device_bind_iommufd bind;
> > +	struct iommufd_ctx *iommufd = NULL;
> > +	unsigned long minsz;
> > +	int ret;
> > +
> > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > +
> > +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (bind.argsz < minsz || bind.flags)
> > +		return -EINVAL;
> > +
> > +	if (!device->ops->bind_iommufd)
> > +		return -ENODEV;
> 
> Hi Jason,
> 
> Per the comment in vfio_iommufd_bind(), such device driver
> won't provide .bind_iommufd(). So shall we allow this ioctl
> to go longer to call .open_device() instead of failing it here?
> I think we need to allow it to go further. E.g. leave the check
> to be in vfio_iommufd_bind(). Otherwise, user may not able
> to use such devices. Is it?

You are thinking about the crazy mdev samples?

We should probably just change them to provide a 'no dma' set of ops.

> > +struct vfio_device_bind_iommufd {
> > +	__u32		argsz;
> > +	__u32		flags;
> > +	__aligned_u64	dev_cookie;
> > +	__s32		iommufd;
> > +	__u32		out_devid;
> 
> As above, for the devices that do not do DMA, there is no .bind_iommufd
> op, hence no iommufd_device generated. This means no good value
> can be filled in this out_devid field. So this field is optional. Only
> for the devices which do DMA, should this out_devid field return a
> valid ID otherwise an invalid ID would be filled (e.g. value #0 is an
> invalid value in the iommufd object id pool). Userspace needs to
> check if the out_devid is valid or not before use. This ID can be further
> used in iommufd uAPIs like IOMMU_HWPT_ALLOC, IOMMU_DEVICE_GET_INFO
> and etc.

I would say create an access and harmonize the no-DMA devices with the
emulated devices.

What should we return here anyhow if an access was created?

Jason
Yi Liu March 2, 2023, 4:09 a.m. UTC | #5
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 2, 2023 1:47 AM
> 
> On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Monday, February 27, 2023 7:12 PM
> > [...]
> > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > +				    unsigned long arg)
> > > +{
> > > +	struct vfio_device *device = df->device;
> > > +	struct vfio_device_bind_iommufd bind;
> > > +	struct iommufd_ctx *iommufd = NULL;
> > > +	unsigned long minsz;
> > > +	int ret;
> > > +
> > > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > > +
> > > +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> > > +		return -EFAULT;
> > > +
> > > +	if (bind.argsz < minsz || bind.flags)
> > > +		return -EINVAL;
> > > +
> > > +	if (!device->ops->bind_iommufd)
> > > +		return -ENODEV;
> >
> > Hi Jason,
> >
> > Per the comment in vfio_iommufd_bind(), such device driver
> > won't provide .bind_iommufd(). So shall we allow this ioctl
> > to go longer to call .open_device() instead of failing it here?
> > I think we need to allow it to go further. E.g. leave the check
> > to be in vfio_iommufd_bind(). Otherwise, user may not able
> > to use such devices. Is it?
> 
> You are thinking about the crazy mdev samples?

Yes. we don't have real devices which don't do DMA. Is it?
 
> We should probably just change them to provide a 'no dma' set of ops.

Yes. at least generate iommufd_device I suppose.

> > > +struct vfio_device_bind_iommufd {
> > > +	__u32		argsz;
> > > +	__u32		flags;
> > > +	__aligned_u64	dev_cookie;
> > > +	__s32		iommufd;
> > > +	__u32		out_devid;
> >
> > As above, for the devices that do not do DMA, there is no .bind_iommufd
> > op, hence no iommufd_device generated. This means no good value
> > can be filled in this out_devid field. So this field is optional. Only
> > for the devices which do DMA, should this out_devid field return a
> > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an
> > invalid value in the iommufd object id pool). Userspace needs to
> > check if the out_devid is valid or not before use. This ID can be further
> > used in iommufd uAPIs like IOMMU_HWPT_ALLOC,
> IOMMU_DEVICE_GET_INFO
> > and etc.
> 
> I would say create an access and harmonize the no-DMA devices with the
> emulated devices.

In this case, iommufd_access would be created instead of iommufd_device.

> What should we return here anyhow if an access was created?

It depends on what can be done with this id and whether this field is mandatory.
For iommufd_device ID, the user could further use it to query iommu device info and
alloc hwpt. Do we have a similar usage for iommufd_access? And if we define this
field as optional, then we may return iommufd_access object Id in future if it is
needed.

Regards,
Yi Liu
Yi Liu March 3, 2023, 6:57 a.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 2, 2023 1:47 AM
> 
> On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Monday, February 27, 2023 7:12 PM
> > [...]
> > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > +				    unsigned long arg)
> > > +{
> > > +	struct vfio_device *device = df->device;
> > > +	struct vfio_device_bind_iommufd bind;
> > > +	struct iommufd_ctx *iommufd = NULL;
> > > +	unsigned long minsz;
> > > +	int ret;
> > > +
> > > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > > +
> > > +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> > > +		return -EFAULT;
> > > +
> > > +	if (bind.argsz < minsz || bind.flags)
> > > +		return -EINVAL;
> > > +
> > > +	if (!device->ops->bind_iommufd)
> > > +		return -ENODEV;
> >
> > Hi Jason,
> >
> > Per the comment in vfio_iommufd_bind(), such device driver
> > won't provide .bind_iommufd(). So shall we allow this ioctl
> > to go longer to call .open_device() instead of failing it here?
> > I think we need to allow it to go further. E.g. leave the check
> > to be in vfio_iommufd_bind(). Otherwise, user may not able
> > to use such devices. Is it?
> 
> You are thinking about the crazy mdev samples?
> 
> We should probably just change them to provide a 'no dma' set of ops.
> 
> > > +struct vfio_device_bind_iommufd {
> > > +	__u32		argsz;
> > > +	__u32		flags;
> > > +	__aligned_u64	dev_cookie;
> > > +	__s32		iommufd;
> > > +	__u32		out_devid;
> >
> > As above, for the devices that do not do DMA, there is no .bind_iommufd
> > op, hence no iommufd_device generated. This means no good value
> > can be filled in this out_devid field. So this field is optional. Only
> > for the devices which do DMA, should this out_devid field return a
> > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an
> > invalid value in the iommufd object id pool). Userspace needs to
> > check if the out_devid is valid or not before use. This ID can be further
> > used in iommufd uAPIs like IOMMU_HWPT_ALLOC,
> IOMMU_DEVICE_GET_INFO
> > and etc.
> 
> I would say create an access and harmonize the no-DMA devices with the
> emulated devices.

How about below change?

diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 4f82a6fa7c6c..e536515086d7 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -18,12 +18,8 @@ int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx)
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
-	/*
-	 * If the driver doesn't provide this op then it means the device does
-	 * not do DMA at all. So nothing to do.
-	 */
-	if (!vdev->ops->bind_iommufd)
-		return 0;
+	if (WARN_ON(!vdev->ops->bind_iommufd))
+		return -ENODEV;
 
 	ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id);
 	if (ret)
@@ -102,7 +98,9 @@ EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas);
 /*
  * The emulated standard ops mean that vfio_device is going to use the
  * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this
- * ops set should call vfio_register_emulated_iommu_dev().
+ * ops set should call vfio_register_emulated_iommu_dev(). Drivers that do
+ * not call  vfio_pin_pages()/vfio_dma_rw() has no need to provide dma_unmap
+ * callback.
  */
 
 static void vfio_emulated_unmap(void *data, unsigned long iova,
@@ -110,7 +107,8 @@ static void vfio_emulated_unmap(void *data, unsigned long iova,
 {
 	struct vfio_device *vdev = data;
 
-	vdev->ops->dma_unmap(vdev, iova, length);
+	if (vdev->ops->dma_unmap)
+		vdev->ops->dma_unmap(vdev, iova, length);
 }
 
 static const struct iommufd_access_ops vfio_user_ops = {
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index e54eb752e1ba..19391dda5fba 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -1374,6 +1374,9 @@ static const struct vfio_device_ops mbochs_dev_ops = {
 	.write = mbochs_write,
 	.ioctl = mbochs_ioctl,
 	.mmap = mbochs_mmap,
+	.bind_iommufd	= vfio_iommufd_emulated_bind,
+	.unbind_iommufd	= vfio_iommufd_emulated_unbind,
+	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
 };
 
 static struct mdev_driver mbochs_driver = {
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index e8400fdab71d..5f48aef36995 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -663,6 +663,9 @@ static const struct vfio_device_ops mdpy_dev_ops = {
 	.write = mdpy_write,
 	.ioctl = mdpy_ioctl,
 	.mmap = mdpy_mmap,
+	.bind_iommufd	= vfio_iommufd_emulated_bind,
+	.unbind_iommufd	= vfio_iommufd_emulated_unbind,
+	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
 };
 
 static struct mdev_driver mdpy_driver = {
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index e887de672c52..35460901b9f7 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -1269,6 +1269,9 @@ static const struct vfio_device_ops mtty_dev_ops = {
 	.read = mtty_read,
 	.write = mtty_write,
 	.ioctl = mtty_ioctl,
+	.bind_iommufd	= vfio_iommufd_emulated_bind,
+	.unbind_iommufd	= vfio_iommufd_emulated_unbind,
+	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
 };
 
 static struct mdev_driver mtty_driver = {

> What should we return here anyhow if an access was created?

iommufd_access->obj.id. should be fine. Is it?

Regards,
Yi Liu
Yi Liu March 3, 2023, 7:23 a.m. UTC | #7
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, March 3, 2023 2:58 PM
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Thursday, March 2, 2023 1:47 AM
> >
> > On Wed, Mar 01, 2023 at 09:19:07AM +0000, Liu, Yi L wrote:
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Monday, February 27, 2023 7:12 PM
> > > [...]
> > > > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > > > +				    unsigned long arg)
> > > > +{
> > > > +	struct vfio_device *device = df->device;
> > > > +	struct vfio_device_bind_iommufd bind;
> > > > +	struct iommufd_ctx *iommufd = NULL;
> > > > +	unsigned long minsz;
> > > > +	int ret;
> > > > +
> > > > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > > > +
> > > > +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> > > > +		return -EFAULT;
> > > > +
> > > > +	if (bind.argsz < minsz || bind.flags)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!device->ops->bind_iommufd)
> > > > +		return -ENODEV;
> > >
> > > Hi Jason,
> > >
> > > Per the comment in vfio_iommufd_bind(), such device driver
> > > won't provide .bind_iommufd(). So shall we allow this ioctl
> > > to go longer to call .open_device() instead of failing it here?
> > > I think we need to allow it to go further. E.g. leave the check
> > > to be in vfio_iommufd_bind(). Otherwise, user may not able
> > > to use such devices. Is it?
> >
> > You are thinking about the crazy mdev samples?
> >
> > We should probably just change them to provide a 'no dma' set of ops.
> >
> > > > +struct vfio_device_bind_iommufd {
> > > > +	__u32		argsz;
> > > > +	__u32		flags;
> > > > +	__aligned_u64	dev_cookie;
> > > > +	__s32		iommufd;
> > > > +	__u32		out_devid;
> > >
> > > As above, for the devices that do not do DMA, there is
> no .bind_iommufd
> > > op, hence no iommufd_device generated. This means no good value
> > > can be filled in this out_devid field. So this field is optional. Only
> > > for the devices which do DMA, should this out_devid field return a
> > > valid ID otherwise an invalid ID would be filled (e.g. value #0 is an
> > > invalid value in the iommufd object id pool). Userspace needs to
> > > check if the out_devid is valid or not before use. This ID can be further
> > > used in iommufd uAPIs like IOMMU_HWPT_ALLOC,
> > IOMMU_DEVICE_GET_INFO
> > > and etc.
> >
> > I would say create an access and harmonize the no-DMA devices with the
> > emulated devices.
> 
> How about below change?
> 
> diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
> index 4f82a6fa7c6c..e536515086d7 100644
> --- a/drivers/vfio/iommufd.c
> +++ b/drivers/vfio/iommufd.c
> @@ -18,12 +18,8 @@ int vfio_iommufd_bind(struct vfio_device *vdev,
> struct iommufd_ctx *ictx)
> 
>  	lockdep_assert_held(&vdev->dev_set->lock);
> 
> -	/*
> -	 * If the driver doesn't provide this op then it means the device does
> -	 * not do DMA at all. So nothing to do.
> -	 */
> -	if (!vdev->ops->bind_iommufd)
> -		return 0;
> +	if (WARN_ON(!vdev->ops->bind_iommufd))
> +		return -ENODEV;
> 
>  	ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id);
>  	if (ret)
> @@ -102,7 +98,9 @@
> EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas);
>  /*
>   * The emulated standard ops mean that vfio_device is going to use the
>   * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using
> this
> - * ops set should call vfio_register_emulated_iommu_dev().
> + * ops set should call vfio_register_emulated_iommu_dev(). Drivers that
> do
> + * not call  vfio_pin_pages()/vfio_dma_rw() has no need to provide
> dma_unmap
> + * callback.
>   */
> 
>  static void vfio_emulated_unmap(void *data, unsigned long iova,
> @@ -110,7 +107,8 @@ static void vfio_emulated_unmap(void *data,
> unsigned long iova,
>  {
>  	struct vfio_device *vdev = data;
> 
> -	vdev->ops->dma_unmap(vdev, iova, length);
> +	if (vdev->ops->dma_unmap)
> +		vdev->ops->dma_unmap(vdev, iova, length);
>  }
> 
>  static const struct iommufd_access_ops vfio_user_ops = {
> diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
> index e54eb752e1ba..19391dda5fba 100644
> --- a/samples/vfio-mdev/mbochs.c
> +++ b/samples/vfio-mdev/mbochs.c
> @@ -1374,6 +1374,9 @@ static const struct vfio_device_ops
> mbochs_dev_ops = {
>  	.write = mbochs_write,
>  	.ioctl = mbochs_ioctl,
>  	.mmap = mbochs_mmap,
> +	.bind_iommufd	= vfio_iommufd_emulated_bind,
> +	.unbind_iommufd	= vfio_iommufd_emulated_unbind,
> +	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
>  };
> 
>  static struct mdev_driver mbochs_driver = {
> diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
> index e8400fdab71d..5f48aef36995 100644
> --- a/samples/vfio-mdev/mdpy.c
> +++ b/samples/vfio-mdev/mdpy.c
> @@ -663,6 +663,9 @@ static const struct vfio_device_ops mdpy_dev_ops =
> {
>  	.write = mdpy_write,
>  	.ioctl = mdpy_ioctl,
>  	.mmap = mdpy_mmap,
> +	.bind_iommufd	= vfio_iommufd_emulated_bind,
> +	.unbind_iommufd	= vfio_iommufd_emulated_unbind,
> +	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
>  };
> 
>  static struct mdev_driver mdpy_driver = {
> diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
> index e887de672c52..35460901b9f7 100644
> --- a/samples/vfio-mdev/mtty.c
> +++ b/samples/vfio-mdev/mtty.c
> @@ -1269,6 +1269,9 @@ static const struct vfio_device_ops mtty_dev_ops
> = {
>  	.read = mtty_read,
>  	.write = mtty_write,
>  	.ioctl = mtty_ioctl,
> +	.bind_iommufd	= vfio_iommufd_emulated_bind,
> +	.unbind_iommufd	= vfio_iommufd_emulated_unbind,
> +	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
>  };
> 
>  static struct mdev_driver mtty_driver = {
> 
> > What should we return here anyhow if an access was created?
> 
> iommufd_access->obj.id. should be fine. Is it?

btw. It requires creating iommufd_access in vfio_iommufd_emulated_bind()
instead of in the attach(). Seems like Nicolin's replace domain series has a patch
to move iommufd_access creation to the bind().

Regards,
Yi Liu
Tian, Kevin March 7, 2023, 6:38 a.m. UTC | #8
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Friday, March 3, 2023 2:58 PM
> 
> > What should we return here anyhow if an access was created?
> 
> iommufd_access->obj.id. should be fine. Is it?
> 

Thinking more I'm not sure whether it's a good idea to fill the
dev_id field with an access object id and then later confuse
the user to get an -ENOENT error when trying to allocate a
hwpt with an access object id.

How can user differentiate it from the real error case where
invalid iommufd object is used?

It sounds clearer to return dev_id only when there is a true
device object being created by the bind_iommufd cmd. Then
the user can use it to decide whether  to further attempt
dev_id related cmds.
Jason Gunthorpe March 7, 2023, 12:37 p.m. UTC | #9
On Tue, Mar 07, 2023 at 06:38:59AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Friday, March 3, 2023 2:58 PM
> > 
> > > What should we return here anyhow if an access was created?
> > 
> > iommufd_access->obj.id. should be fine. Is it?
> 
> Thinking more I'm not sure whether it's a good idea to fill the
> dev_id field with an access object id and then later confuse
> the user to get an -ENOENT error when trying to allocate a
> hwpt with an access object id.
> 
> How can user differentiate it from the real error case where
> invalid iommufd object is used?
> 
> It sounds clearer to return dev_id only when there is a true
> device object being created by the bind_iommufd cmd. Then
> the user can use it to decide whether  to further attempt
> dev_id related cmds.

It means we can never return an access_id

I don't think this is a problem, the first thing userspace should do
is a get info to the dev_id which is needed to learn which iommu
driver is running it, if that returns EOPNOTSUPP then it isn't a
physical iommu device.

Jason
Yi Liu March 7, 2023, 1:03 p.m. UTC | #10
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 7, 2023 8:38 PM
> 
> On Tue, Mar 07, 2023 at 06:38:59AM +0000, Tian, Kevin wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Friday, March 3, 2023 2:58 PM
> > >
> > > > What should we return here anyhow if an access was created?
> > >
> > > iommufd_access->obj.id. should be fine. Is it?
> >
> > Thinking more I'm not sure whether it's a good idea to fill the
> > dev_id field with an access object id and then later confuse
> > the user to get an -ENOENT error when trying to allocate a
> > hwpt with an access object id.
> >
> > How can user differentiate it from the real error case where
> > invalid iommufd object is used?
> >
> > It sounds clearer to return dev_id only when there is a true
> > device object being created by the bind_iommufd cmd. Then
> > the user can use it to decide whether  to further attempt
> > dev_id related cmds.
> 
> It means we can never return an access_id
> 
> I don't think this is a problem, the first thing userspace should do
> is a get info to the dev_id which is needed to learn which iommu
> driver is running it, if that returns EOPNOTSUPP then it isn't a
> physical iommu device.

This may mean your below patch depends on the get info series. 
Tian, Kevin March 8, 2023, 7:17 a.m. UTC | #11
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Tuesday, March 7, 2023 9:04 PM
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, March 7, 2023 8:38 PM
> >
> > On Tue, Mar 07, 2023 at 06:38:59AM +0000, Tian, Kevin wrote:
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Friday, March 3, 2023 2:58 PM
> > > >
> > > > > What should we return here anyhow if an access was created?
> > > >
> > > > iommufd_access->obj.id. should be fine. Is it?
> > >
> > > Thinking more I'm not sure whether it's a good idea to fill the
> > > dev_id field with an access object id and then later confuse
> > > the user to get an -ENOENT error when trying to allocate a
> > > hwpt with an access object id.
> > >
> > > How can user differentiate it from the real error case where
> > > invalid iommufd object is used?
> > >
> > > It sounds clearer to return dev_id only when there is a true
> > > device object being created by the bind_iommufd cmd. Then
> > > the user can use it to decide whether  to further attempt
> > > dev_id related cmds.
> >
> > It means we can never return an access_id
> >
> > I don't think this is a problem, the first thing userspace should do
> > is a get info to the dev_id which is needed to learn which iommu
> > driver is running it, if that returns EOPNOTSUPP then it isn't a
> > physical iommu device.
> 
> This may mean your below patch depends on the get info series. 
Alexey Kardashevskiy March 10, 2023, 2:39 a.m. UTC | #12
On 27/2/23 22:11, Yi Liu wrote:
> This adds ioctl for userspace to bind device cdev fd to iommufd.
> 
>      VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA
> 			      control provided by the iommufd. open_device
> 			      op is called after bind_iommufd op.
> 			      VFIO no iommu mode is indicated by passing
> 			      a negative iommufd value.
> 
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>   drivers/vfio/device_cdev.c | 146 +++++++++++++++++++++++++++++++++++++
>   drivers/vfio/vfio.h        |  17 ++++-
>   drivers/vfio/vfio_main.c   |  54 ++++++++++++--
>   include/linux/iommufd.h    |   6 ++
>   include/uapi/linux/vfio.h  |  34 +++++++++
>   5 files changed, 248 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index 9e2c1ecaaf4f..37f80e368551 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -3,6 +3,7 @@
>    * Copyright (c) 2023 Intel Corporation.
>    */
>   #include <linux/vfio.h>
> +#include <linux/iommufd.h>
>   
>   #include "vfio.h"
>   
> @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>   	return ret;
>   }
>   
> +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> +{
> +	spin_lock(&df->kvm_ref_lock);
> +	if (!df->kvm)
> +		goto unlock;
> +
> +	_vfio_device_get_kvm_safe(df->device, df->kvm);
> +
> +unlock:
> +	spin_unlock(&df->kvm_ref_lock);
> +}
> +
> +void vfio_device_cdev_close(struct vfio_device_file *df)
> +{
> +	struct vfio_device *device = df->device;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/*
> +	 * As df->access_granted writer is under dev_set->lock as well,
> +	 * so this read no need to use smp_load_acquire() to pair with
> +	 * smp_store_release() in the caller of vfio_device_open().
> +	 */
> +	if (!df->access_granted) {
> +		mutex_unlock(&device->dev_set->lock);
> +		return;
> +	}
> +	vfio_device_close(df);
> +	vfio_device_put_kvm(device);
> +	if (df->iommufd)
> +		iommufd_ctx_put(df->iommufd);
> +	mutex_unlock(&device->dev_set->lock);
> +	vfio_device_unblock_group(device);
> +}
> +
> +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> +{
> +	struct fd f;
> +	struct iommufd_ctx *iommufd;
> +
> +	f = fdget(fd);
> +	if (!f.file)
> +		return ERR_PTR(-EBADF);
> +
> +	iommufd = iommufd_ctx_from_file(f.file);
> +
> +	fdput(f);
> +	return iommufd;
> +}
> +
> +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> +				    unsigned long arg)
> +{
> +	struct vfio_device *device = df->device;
> +	struct vfio_device_bind_iommufd bind;
> +	struct iommufd_ctx *iommufd = NULL;
> +	unsigned long minsz;
> +	int ret;
> +
> +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> +
> +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> +		return -EFAULT;
> +
> +	if (bind.argsz < minsz || bind.flags)
> +		return -EINVAL;
> +
> +	if (!device->ops->bind_iommufd)
> +		return -ENODEV;
> +
> +	ret = vfio_device_block_group(device);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&device->dev_set->lock);
> +	/*
> +	 * If already been bound to an iommufd, or already set noiommu
> +	 * then fail it.
> +	 */
> +	if (df->iommufd || df->noiommu) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	/* iommufd < 0 means noiommu mode */
> +	if (bind.iommufd < 0) {
> +		if (!capable(CAP_SYS_RAWIO)) {
> +			ret = -EPERM;
> +			goto out_unlock;
> +		}
> +		df->noiommu = true;
> +	} else {
> +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> +		if (IS_ERR(iommufd)) {
> +			ret = PTR_ERR(iommufd);
> +			goto out_unlock;
> +		}
> +	}
> +
> +	/*
> +	 * Before the device open, get the KVM pointer currently
> +	 * associated with the device file (if there is) and obtain
> +	 * a reference.  This reference is held until device closed.
> +	 * Save the pointer in the device for use by drivers.
> +	 */
> +	vfio_device_get_kvm_safe(df);
> +
> +	df->iommufd = iommufd;
> +	ret = vfio_device_open(df, &bind.out_devid, NULL);


This is unrelated to this patch but reminded me - while debugging QEMU, 
vfio_assert_device_open() kept firing as I was killing QEMU (which in 
turn made the kernel close all fds), device->open_count==0 as QEMU was 
dying before calling ioctl(VFIO_DEVICE_BIND_IOMMUFD) which would call 
this vfio_device_open() just above. Has this been reported/fixed, just 
curious?
Yi Liu March 10, 2023, 5:49 a.m. UTC | #13
> From: Alexey Kardashevskiy <aik@amd.com>
> Sent: Friday, March 10, 2023 10:39 AM
> 
> On 27/2/23 22:11, Yi Liu wrote:
> > This adds ioctl for userspace to bind device cdev fd to iommufd.
> >
> >      VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain
> DMA
> > 			      control provided by the iommufd. open_device
> > 			      op is called after bind_iommufd op.
> > 			      VFIO no iommu mode is indicated by passing
> > 			      a negative iommufd value.
> >
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >   drivers/vfio/device_cdev.c | 146
> +++++++++++++++++++++++++++++++++++++
> >   drivers/vfio/vfio.h        |  17 ++++-
> >   drivers/vfio/vfio_main.c   |  54 ++++++++++++--
> >   include/linux/iommufd.h    |   6 ++
> >   include/uapi/linux/vfio.h  |  34 +++++++++
> >   5 files changed, 248 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> > index 9e2c1ecaaf4f..37f80e368551 100644
> > --- a/drivers/vfio/device_cdev.c
> > +++ b/drivers/vfio/device_cdev.c
> > @@ -3,6 +3,7 @@
> >    * Copyright (c) 2023 Intel Corporation.
> >    */
> >   #include <linux/vfio.h>
> > +#include <linux/iommufd.h>
> >
> >   #include "vfio.h"
> >
> > @@ -45,6 +46,151 @@ int vfio_device_fops_cdev_open(struct inode
> *inode, struct file *filep)
> >   	return ret;
> >   }
> >
> > +static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
> > +{
> > +	spin_lock(&df->kvm_ref_lock);
> > +	if (!df->kvm)
> > +		goto unlock;
> > +
> > +	_vfio_device_get_kvm_safe(df->device, df->kvm);
> > +
> > +unlock:
> > +	spin_unlock(&df->kvm_ref_lock);
> > +}
> > +
> > +void vfio_device_cdev_close(struct vfio_device_file *df)
> > +{
> > +	struct vfio_device *device = df->device;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	/*
> > +	 * As df->access_granted writer is under dev_set->lock as well,
> > +	 * so this read no need to use smp_load_acquire() to pair with
> > +	 * smp_store_release() in the caller of vfio_device_open().
> > +	 */

device->open_count is sure to be non-zero if df->access_granted
is true. Otherwise, it means this device file has not opened device
successfully, so no need to do further tidy up.

> > +	if (!df->access_granted) {
> > +		mutex_unlock(&device->dev_set->lock);
> > +		return;
> > +	}
> > +	vfio_device_close(df);
> > +	vfio_device_put_kvm(device);
> > +	if (df->iommufd)
> > +		iommufd_ctx_put(df->iommufd);
> > +	mutex_unlock(&device->dev_set->lock);
> > +	vfio_device_unblock_group(device);
> > +}
> > +
> > +static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
> > +{
> > +	struct fd f;
> > +	struct iommufd_ctx *iommufd;
> > +
> > +	f = fdget(fd);
> > +	if (!f.file)
> > +		return ERR_PTR(-EBADF);
> > +
> > +	iommufd = iommufd_ctx_from_file(f.file);
> > +
> > +	fdput(f);
> > +	return iommufd;
> > +}
> > +
> > +long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
> > +				    unsigned long arg)
> > +{
> > +	struct vfio_device *device = df->device;
> > +	struct vfio_device_bind_iommufd bind;
> > +	struct iommufd_ctx *iommufd = NULL;
> > +	unsigned long minsz;
> > +	int ret;
> > +
> > +	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
> > +
> > +	if (copy_from_user(&bind, (void __user *)arg, minsz))
> > +		return -EFAULT;
> > +
> > +	if (bind.argsz < minsz || bind.flags)
> > +		return -EINVAL;
> > +
> > +	if (!device->ops->bind_iommufd)
> > +		return -ENODEV;
> > +
> > +	ret = vfio_device_block_group(device);
> > +	if (ret)
> > +		return ret;
> > +
> > +	mutex_lock(&device->dev_set->lock);
> > +	/*
> > +	 * If already been bound to an iommufd, or already set noiommu
> > +	 * then fail it.
> > +	 */
> > +	if (df->iommufd || df->noiommu) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/* iommufd < 0 means noiommu mode */
> > +	if (bind.iommufd < 0) {
> > +		if (!capable(CAP_SYS_RAWIO)) {
> > +			ret = -EPERM;
> > +			goto out_unlock;
> > +		}
> > +		df->noiommu = true;
> > +	} else {
> > +		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
> > +		if (IS_ERR(iommufd)) {
> > +			ret = PTR_ERR(iommufd);
> > +			goto out_unlock;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Before the device open, get the KVM pointer currently
> > +	 * associated with the device file (if there is) and obtain
> > +	 * a reference.  This reference is held until device closed.
> > +	 * Save the pointer in the device for use by drivers.
> > +	 */
> > +	vfio_device_get_kvm_safe(df);
> > +
> > +	df->iommufd = iommufd;
> > +	ret = vfio_device_open(df, &bind.out_devid, NULL);
> 
> 
> This is unrelated to this patch but reminded me - while debugging QEMU,
> vfio_assert_device_open() kept firing as I was killing QEMU (which in
> turn made the kernel close all fds), device->open_count==0 as QEMU was
> dying before calling ioctl(VFIO_DEVICE_BIND_IOMMUFD) which would call
> this vfio_device_open() just above. Has this been reported/fixed, just
> curious?

Thanks, I think this was fixed by the code I marked above. I think
it was raised in v2 review. Should have been fixed after that. Have
tried v5 or v6? If still have issue, please feel free let me know it.

https://lore.kernel.org/kvm/Y+HIWRM%2FTjWcuT6I@yzhao56-desk.sh.intel.com/

Regards,
Yi Liu
diff mbox series

Patch

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 9e2c1ecaaf4f..37f80e368551 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -3,6 +3,7 @@ 
  * Copyright (c) 2023 Intel Corporation.
  */
 #include <linux/vfio.h>
+#include <linux/iommufd.h>
 
 #include "vfio.h"
 
@@ -45,6 +46,151 @@  int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
 	return ret;
 }
 
+static void vfio_device_get_kvm_safe(struct vfio_device_file *df)
+{
+	spin_lock(&df->kvm_ref_lock);
+	if (!df->kvm)
+		goto unlock;
+
+	_vfio_device_get_kvm_safe(df->device, df->kvm);
+
+unlock:
+	spin_unlock(&df->kvm_ref_lock);
+}
+
+void vfio_device_cdev_close(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	mutex_lock(&device->dev_set->lock);
+	/*
+	 * As df->access_granted writer is under dev_set->lock as well,
+	 * so this read no need to use smp_load_acquire() to pair with
+	 * smp_store_release() in the caller of vfio_device_open().
+	 */
+	if (!df->access_granted) {
+		mutex_unlock(&device->dev_set->lock);
+		return;
+	}
+	vfio_device_close(df);
+	vfio_device_put_kvm(device);
+	if (df->iommufd)
+		iommufd_ctx_put(df->iommufd);
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+}
+
+static struct iommufd_ctx *vfio_get_iommufd_from_fd(int fd)
+{
+	struct fd f;
+	struct iommufd_ctx *iommufd;
+
+	f = fdget(fd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	iommufd = iommufd_ctx_from_file(f.file);
+
+	fdput(f);
+	return iommufd;
+}
+
+long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+				    unsigned long arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_bind_iommufd bind;
+	struct iommufd_ctx *iommufd = NULL;
+	unsigned long minsz;
+	int ret;
+
+	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
+
+	if (copy_from_user(&bind, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (bind.argsz < minsz || bind.flags)
+		return -EINVAL;
+
+	if (!device->ops->bind_iommufd)
+		return -ENODEV;
+
+	ret = vfio_device_block_group(device);
+	if (ret)
+		return ret;
+
+	mutex_lock(&device->dev_set->lock);
+	/*
+	 * If already been bound to an iommufd, or already set noiommu
+	 * then fail it.
+	 */
+	if (df->iommufd || df->noiommu) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* iommufd < 0 means noiommu mode */
+	if (bind.iommufd < 0) {
+		if (!capable(CAP_SYS_RAWIO)) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+		df->noiommu = true;
+	} else {
+		iommufd = vfio_get_iommufd_from_fd(bind.iommufd);
+		if (IS_ERR(iommufd)) {
+			ret = PTR_ERR(iommufd);
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * Before the device open, get the KVM pointer currently
+	 * associated with the device file (if there is) and obtain
+	 * a reference.  This reference is held until device closed.
+	 * Save the pointer in the device for use by drivers.
+	 */
+	vfio_device_get_kvm_safe(df);
+
+	df->iommufd = iommufd;
+	ret = vfio_device_open(df, &bind.out_devid, NULL);
+	if (ret)
+		goto out_put_kvm;
+
+	ret = copy_to_user((void __user *)arg +
+			   offsetofend(struct vfio_device_bind_iommufd, iommufd),
+			   &bind.out_devid,
+			   sizeof(bind.out_devid)) ? -EFAULT : 0;
+	if (ret)
+		goto out_close_device;
+
+	if (df->noiommu)
+		dev_warn(device->dev, "vfio-noiommu device used by user "
+			 "(%s:%d)\n", current->comm, task_pid_nr(current));
+
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap
+	 */
+	smp_store_release(&df->access_granted, true);
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+
+out_close_device:
+	vfio_device_close(df);
+out_put_kvm:
+	df->iommufd = NULL;
+	df->noiommu = false;
+	vfio_device_put_kvm(device);
+	if (iommufd)
+		iommufd_ctx_put(iommufd);
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+	return ret;
+}
+
 static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 8661de75f94b..4716a904e63b 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -23,7 +23,9 @@  struct vfio_device_file {
 	bool access_granted;
 	spinlock_t kvm_ref_lock; /* protect kvm field */
 	struct kvm *kvm;
-	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
+	/* protected by struct vfio_device_set::lock */
+	struct iommufd_ctx *iommufd;
+	bool noiommu;
 };
 
 void vfio_device_put_registration(struct vfio_device *device);
@@ -269,6 +271,9 @@  static inline void vfio_device_del(struct vfio_device *device)
 
 void vfio_init_device_cdev(struct vfio_device *device);
 int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep);
+void vfio_device_cdev_close(struct vfio_device_file *df);
+long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+				    unsigned long arg);
 int vfio_cdev_init(struct class *device_class);
 void vfio_cdev_cleanup(void);
 #else
@@ -292,6 +297,16 @@  static inline int vfio_device_fops_cdev_open(struct inode *inode,
 	return 0;
 }
 
+static inline void vfio_device_cdev_close(struct vfio_device_file *df)
+{
+}
+
+static inline long vfio_device_ioctl_bind_iommufd(struct vfio_device_file *df,
+						  unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int vfio_cdev_init(struct class *device_class)
 {
 	return 0;
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 3f83447d022e..69d0add930bb 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -37,6 +37,7 @@ 
 #include <linux/interval_tree.h>
 #include <linux/iova_bitmap.h>
 #include <linux/iommufd.h>
+#include <uapi/linux/iommufd.h>
 #include "vfio.h"
 
 #define DRIVER_VERSION	"0.3"
@@ -422,16 +423,32 @@  static int vfio_device_first_open(struct vfio_device_file *df,
 {
 	struct vfio_device *device = df->device;
 	struct iommufd_ctx *iommufd = df->iommufd;
-	int ret;
+	int ret = 0;
 
 	lockdep_assert_held(&device->dev_set->lock);
 
+	if (WARN_ON(iommufd && df->noiommu))
+		return -EINVAL;
+
 	if (!try_module_get(device->dev->driver->owner))
 		return -ENODEV;
 
+	/*
+	 * For group/container path, iommufd pointer is NULL when comes
+	 * into this helper. Its noiommu support is handled by
+	 * vfio_device_group_use_iommu()
+	 *
+	 * For iommufd compat mode, iommufd pointer here is a valid value.
+	 * Its noiommu support is in vfio_iommufd_bind().
+	 *
+	 * For device cdev path, iommufd pointer here is a valid value for
+	 * normal cases, but it is NULL if it's noiommu. Check df->noiommu
+	 * to differentiate cdev noiommu from the group/container path which
+	 * also passes NULL iommufd pointer in. If set then do nothing.
+	 */
 	if (iommufd)
 		ret = vfio_iommufd_bind(device, iommufd, dev_id, pt_id);
-	else
+	else if (!df->noiommu)
 		ret = vfio_device_group_use_iommu(device);
 	if (ret)
 		goto err_module_put;
@@ -446,7 +463,7 @@  static int vfio_device_first_open(struct vfio_device_file *df,
 err_unuse_iommu:
 	if (iommufd)
 		vfio_iommufd_unbind(device);
-	else
+	else if (!df->noiommu)
 		vfio_device_group_unuse_iommu(device);
 err_module_put:
 	module_put(device->dev->driver->owner);
@@ -464,7 +481,7 @@  static void vfio_device_last_close(struct vfio_device_file *df)
 		device->ops->close_device(device);
 	if (iommufd)
 		vfio_iommufd_unbind(device);
-	else
+	else if (!df->noiommu)
 		vfio_device_group_unuse_iommu(device);
 	module_put(device->dev->driver->owner);
 }
@@ -549,6 +566,8 @@  static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 
 	if (!df->is_cdev_device)
 		vfio_device_group_close(df);
+	else
+		vfio_device_cdev_close(df);
 
 	vfio_device_put_registration(device);
 
@@ -1122,7 +1141,14 @@  static long vfio_device_fops_unl_ioctl(struct file *filep,
 	struct vfio_device *device = df->device;
 	int ret;
 
-	/* Paired with smp_store_release() in vfio_device_group_open() */
+	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
+		return vfio_device_ioctl_bind_iommufd(df, arg);
+
+	/*
+	 * Paired with smp_store_release() in the caller of
+	 * vfio_device_open(). e.g. vfio_device_group_open()
+	 * and vfio_device_ioctl_bind_iommufd()
+	 */
 	if (!smp_load_acquire(&df->access_granted))
 		return -EINVAL;
 
@@ -1153,7 +1179,11 @@  static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
 	struct vfio_device_file *df = filep->private_data;
 	struct vfio_device *device = df->device;
 
-	/* Paired with smp_store_release() in vfio_device_group_open() */
+	/*
+	 * Paired with smp_store_release() in the caller of
+	 * vfio_device_open(). e.g. vfio_device_group_open()
+	 * and vfio_device_ioctl_bind_iommufd()
+	 */
 	if (!smp_load_acquire(&df->access_granted))
 		return -EINVAL;
 
@@ -1170,7 +1200,11 @@  static ssize_t vfio_device_fops_write(struct file *filep,
 	struct vfio_device_file *df = filep->private_data;
 	struct vfio_device *device = df->device;
 
-	/* Paired with smp_store_release() in vfio_device_group_open() */
+	/*
+	 * Paired with smp_store_release() in the caller of
+	 * vfio_device_open(). e.g. vfio_device_group_open()
+	 * and vfio_device_ioctl_bind_iommufd()
+	 */
 	if (!smp_load_acquire(&df->access_granted))
 		return -EINVAL;
 
@@ -1185,7 +1219,11 @@  static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 	struct vfio_device_file *df = filep->private_data;
 	struct vfio_device *device = df->device;
 
-	/* Paired with smp_store_release() in vfio_device_group_open() */
+	/*
+	 * Paired with smp_store_release() in the caller of
+	 * vfio_device_open(). e.g. vfio_device_group_open()
+	 * and vfio_device_ioctl_bind_iommufd()
+	 */
 	if (!smp_load_acquire(&df->access_granted))
 		return -EINVAL;
 
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 650d45629647..9672cf839687 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -17,6 +17,12 @@  struct iommufd_ctx;
 struct iommufd_access;
 struct file;
 
+/*
+ * iommufd core init xarray with flags==XA_FLAGS_ALLOC1, so valid
+ * ID starts from 1.
+ */
+#define IOMMUFD_INVALID_ID 0
+
 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 					   struct device *dev, u32 *id);
 void iommufd_device_unbind(struct iommufd_device *idev);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4bf11ee8de53..92aa8dbc970a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -194,6 +194,40 @@  struct vfio_group_status {
 
 /* --------------- IOCTLs for DEVICE file descriptors --------------- */
 
+/*
+ * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
+ *				   struct vfio_device_bind_iommufd)
+ *
+ * Bind a vfio_device to the specified iommufd.
+ *
+ * The user should provide a device cookie when calling this ioctl. The
+ * cookie is carried only in event e.g. I/O fault reported to userspace
+ * via iommufd. The user should use devid returned by this ioctl to mark
+ * the target device in other ioctls (e.g. capability query via iommufd).
+ *
+ * User is not allowed to access the device before the binding operation
+ * is completed.
+ *
+ * Unbind is automatically conducted when device fd is closed.
+ *
+ * @argsz:	 user filled size of this data.
+ * @flags:	 reserved for future extension.
+ * @dev_cookie:	 a per device cookie provided by userspace.
+ * @iommufd:	 iommufd to bind. a negative value means noiommu.
+ * @out_devid:	 the device id generated by this bind.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_bind_iommufd {
+	__u32		argsz;
+	__u32		flags;
+	__aligned_u64	dev_cookie;
+	__s32		iommufd;
+	__u32		out_devid;
+};
+
+#define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
 /**
  * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
  *						struct vfio_device_info)