diff mbox series

[v6,10/10] vfio/pci: Allow passing zero-length fd array in VFIO_DEVICE_PCI_HOT_RESET

Message ID 20230522115751.326947-11-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Enhance vfio PCI hot reset for vfio cdev device | expand

Commit Message

Yi Liu May 22, 2023, 11:57 a.m. UTC
This is the way user to invoke hot-reset for the devices opened by cdev
interface. User should check the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED
in the output of VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl before doing
hot-reset for cdev devices.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Yanting Jiang <yanting.jiang@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 56 +++++++++++++++++++++++++-------
 include/uapi/linux/vfio.h        | 14 ++++++++
 2 files changed, 59 insertions(+), 11 deletions(-)

Comments

Alex Williamson May 24, 2023, 8:19 p.m. UTC | #1
On Mon, 22 May 2023 04:57:51 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> This is the way user to invoke hot-reset for the devices opened by cdev
> interface. User should check the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED
> in the output of VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl before doing
> hot-reset for cdev devices.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 56 +++++++++++++++++++++++++-------
>  include/uapi/linux/vfio.h        | 14 ++++++++
>  2 files changed, 59 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 890065f846e4..67f1cb426505 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -181,7 +181,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
>  struct vfio_pci_group_info;
>  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups);
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx);
>  
>  /*
>   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> @@ -1301,8 +1302,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>  	if (ret)
>  		return ret;
>  
> -	/* Somewhere between 1 and count is OK */
> -	if (!array_count || array_count > count)
> +	if (array_count > count || vfio_device_cdev_opened(&vdev->vdev))
>  		return -EINVAL;
>  
>  	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
> @@ -1351,7 +1351,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>  	info.count = array_count;
>  	info.files = files;
>  
> -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
>  
>  hot_reset_release:
>  	for (file_idx--; file_idx >= 0; file_idx--)
> @@ -1380,7 +1380,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>  	else if (pci_probe_reset_bus(vdev->pdev->bus))
>  		return -ENODEV;
>  
> -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> +	if (hdr.count)
> +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> +
> +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
> +					  vfio_iommufd_device_ictx(&vdev->vdev));
>  }
>  
>  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> @@ -2347,13 +2351,16 @@ const struct pci_error_handlers vfio_pci_core_err_handlers = {
>  };
>  EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
>  
> -static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
> +static bool vfio_dev_in_groups(struct vfio_device *vdev,
>  			       struct vfio_pci_group_info *groups)
>  {
>  	unsigned int i;
>  
> +	if (!groups)
> +		return false;
> +
>  	for (i = 0; i < groups->count; i++)
> -		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
> +		if (vfio_file_has_dev(groups->files[i], vdev))
>  			return true;
>  	return false;
>  }
> @@ -2429,7 +2436,8 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
>   * get each memory_lock.
>   */
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups)
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx)
>  {
>  	struct vfio_pci_core_device *cur_mem;
>  	struct vfio_pci_core_device *cur_vma;
> @@ -2459,11 +2467,37 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  		goto err_unlock;
>  
>  	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
> +		bool owned;
> +
>  		/*
> -		 * Test whether all the affected devices are contained by the
> -		 * set of groups provided by the user.
> +		 * Test whether all the affected devices can be reset by the
> +		 * user.
> +		 *
> +		 * If the user provides a set of groups, all the devices
> +		 * in the dev_set should be contained by the set of groups
> +		 * provided by the user.

"If called from a group opened device and the user provides a set of
groups,..."

> +		 *
> +		 * If the user provides a zero-length group fd array, then

"If called from a cdev opened device and the user provides a
zero-length array,..."


> +		 * all the devices in the dev_set must be bound to the same
> +		 * iommufd_ctx as the input iommufd_ctx.  If there is any
> +		 * device that has not been bound to iommufd_ctx yet, check
> +		 * if its iommu_group has any device bound to the input
> +		 * iommufd_ctx Such devices can be considered owned by

"."...........................^

> +		 * the input iommufd_ctx as the device cannot be owned
> +		 * by another iommufd_ctx when its iommu_group is owned.
> +		 *
> +		 * Otherwise, reset is not allowed.


In the case where a non-null array is provided,
vfio_pci_ioctl_pci_hot_reset_groups() explicitly tests
vfio_device_cdev_opened(), so we exclude cdev devices from providing a
group list.  However, what prevents a compat opened group device from
providing a null array?

I thought it would be that this function is called with groups == NULL
and therefore the vfio_dev_in_groups() test below fails, but I don't
think that's true for a compat opened device.  Thanks,

Alex


>  		 */
> -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> +		if (iommufd_ctx) {
> +			int devid = vfio_iommufd_device_hot_reset_devid(&cur_vma->vdev,
> +									iommufd_ctx);
> +
> +			owned = (devid != VFIO_PCI_DEVID_NOT_OWNED);
> +		} else {
> +			owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
> +		}
> +
> +		if (!owned) {
>  			ret = -EINVAL;
>  			goto err_undo;
>  		}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 01203215251a..24858b650562 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -686,6 +686,9 @@ enum {
>   *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
>   *	  affected devices are owned by the user.  This flag is available only
>   *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
> + *	  When set, user could invoke VFIO_DEVICE_PCI_HOT_RESET with a zero
> + *	  length fd array on the calling device as the ownership is validated
> + *	  by iommufd_ctx.
>   *
>   * Return: 0 on success, -errno on failure:
>   *	-enospc = insufficient buffer, -enodev = unsupported for device.
> @@ -717,6 +720,17 @@ struct vfio_pci_hot_reset_info {
>   * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
>   *				    struct vfio_pci_hot_reset)
>   *
> + * Userspace requests hot reset for the devices it operates.  Due to the
> + * underlying topology, multiple devices can be affected in the reset
> + * while some might be opened by another user.  To avoid interference
> + * the calling user must ensure all affected devices are owned by itself.
> + *
> + * As the ownership described by VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, the
> + * cdev opened devices must exclusively provide a zero-length fd array and
> + * the group opened devices must exclusively use an array of group fds for
> + * proof of ownership.  Mixed access to devices between cdev and legacy
> + * groups are not supported by this interface.
> + *
>   * Return: 0 on success, -errno on failure.
>   */
>  struct vfio_pci_hot_reset {
Yi Liu May 30, 2023, 4:23 a.m. UTC | #2
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, May 25, 2023 4:20 AM
> 
> On Mon, 22 May 2023 04:57:51 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > This is the way user to invoke hot-reset for the devices opened by cdev
> > interface. User should check the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED
> > in the output of VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl before doing
> > hot-reset for cdev devices.
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> > Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/pci/vfio_pci_core.c | 56 +++++++++++++++++++++++++-------
> >  include/uapi/linux/vfio.h        | 14 ++++++++
> >  2 files changed, 59 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 890065f846e4..67f1cb426505 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -181,7 +181,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device
> *vdev)
> >  struct vfio_pci_group_info;
> >  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
> >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > -				      struct vfio_pci_group_info *groups);
> > +				      struct vfio_pci_group_info *groups,
> > +				      struct iommufd_ctx *iommufd_ctx);
> >
> >  /*
> >   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> > @@ -1301,8 +1302,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct
> vfio_pci_core_device *vdev,
> >  	if (ret)
> >  		return ret;
> >
> > -	/* Somewhere between 1 and count is OK */
> > -	if (!array_count || array_count > count)
> > +	if (array_count > count || vfio_device_cdev_opened(&vdev->vdev))
> >  		return -EINVAL;
> >
> >  	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
> > @@ -1351,7 +1351,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct
> vfio_pci_core_device *vdev,
> >  	info.count = array_count;
> >  	info.files = files;
> >
> > -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> > +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
> >
> >  hot_reset_release:
> >  	for (file_idx--; file_idx >= 0; file_idx--)
> > @@ -1380,7 +1380,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct
> vfio_pci_core_device *vdev,
> >  	else if (pci_probe_reset_bus(vdev->pdev->bus))
> >  		return -ENODEV;
> >
> > -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> > +	if (hdr.count)
> > +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> > +
> > +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
> > +					  vfio_iommufd_device_ictx(&vdev->vdev));
> >  }
> >
> >  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> > @@ -2347,13 +2351,16 @@ const struct pci_error_handlers
> vfio_pci_core_err_handlers = {
> >  };
> >  EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
> >
> > -static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
> > +static bool vfio_dev_in_groups(struct vfio_device *vdev,
> >  			       struct vfio_pci_group_info *groups)
> >  {
> >  	unsigned int i;
> >
> > +	if (!groups)
> > +		return false;
> > +
> >  	for (i = 0; i < groups->count; i++)
> > -		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
> > +		if (vfio_file_has_dev(groups->files[i], vdev))
> >  			return true;
> >  	return false;
> >  }
> > @@ -2429,7 +2436,8 @@ static int vfio_pci_dev_set_pm_runtime_get(struct
> vfio_device_set *dev_set)
> >   * get each memory_lock.
> >   */
> >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > -				      struct vfio_pci_group_info *groups)
> > +				      struct vfio_pci_group_info *groups,
> > +				      struct iommufd_ctx *iommufd_ctx)
> >  {
> >  	struct vfio_pci_core_device *cur_mem;
> >  	struct vfio_pci_core_device *cur_vma;
> > @@ -2459,11 +2467,37 @@ static int vfio_pci_dev_set_hot_reset(struct
> vfio_device_set *dev_set,
> >  		goto err_unlock;
> >
> >  	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
> > +		bool owned;
> > +
> >  		/*
> > -		 * Test whether all the affected devices are contained by the
> > -		 * set of groups provided by the user.
> > +		 * Test whether all the affected devices can be reset by the
> > +		 * user.
> > +		 *
> > +		 * If the user provides a set of groups, all the devices
> > +		 * in the dev_set should be contained by the set of groups
> > +		 * provided by the user.
> 
> "If called from a group opened device and the user provides a set of
> groups,..."
> 
> > +		 *
> > +		 * If the user provides a zero-length group fd array, then
> 
> "If called from a cdev opened device and the user provides a
> zero-length array,..."
> 
> 
> > +		 * all the devices in the dev_set must be bound to the same
> > +		 * iommufd_ctx as the input iommufd_ctx.  If there is any
> > +		 * device that has not been bound to iommufd_ctx yet, check
> > +		 * if its iommu_group has any device bound to the input
> > +		 * iommufd_ctx Such devices can be considered owned by
> 
> "."...........................^

Thanks, above comments well received.

> > +		 * the input iommufd_ctx as the device cannot be owned
> > +		 * by another iommufd_ctx when its iommu_group is owned.
> > +		 *
> > +		 * Otherwise, reset is not allowed.
> 
> 
> In the case where a non-null array is provided,
> vfio_pci_ioctl_pci_hot_reset_groups() explicitly tests
> vfio_device_cdev_opened(), so we exclude cdev devices from providing a
> group list. 

Yes.

> However, what prevents a compat opened group device from
> providing a null array?

I think I've asked this question before. What I got is seems no need
to prevent it[1].

[1] https://lore.kernel.org/kvm/BN9PR11MB5276ABE919C04E06A0326E8E8CBC9@BN9PR11MB5276.namprd11.prod.outlook.com/

Now, I intend to disallow it. If compat mode user binds the devices
to different containers, it shall be able to do hot reset as it can use
group fd to prove ownership. But if using the zero-length array, it
would be failed. So we may add below logic and remove the
vfio_device_cdev_opened() in vfio_pci_ioctl_pci_hot_reset_groups().

vfio_pci_ioctl_pci_hot_reset()
{
...
	if (!!hdr.count == !!vfio_device_cdev_opened(&vdev->vdev))
		return -EINVAL;
	if (hdr.count)
		vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
					     vfio_iommufd_device_ictx(&vdev->vdev));
}

> 
> I thought it would be that this function is called with groups == NULL
> and therefore the vfio_dev_in_groups() test below fails, but I don't
> think that's true for a compat opened device.  Thanks,

How about above logic?

Regards,
Yi Liu

> 
> >  		 */
> > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > +		if (iommufd_ctx) {
> > +			int devid = vfio_iommufd_device_hot_reset_devid(&cur_vma-
> >vdev,
> > +									iommufd_ctx);
> > +
> > +			owned = (devid != VFIO_PCI_DEVID_NOT_OWNED);
> > +		} else {
> > +			owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
> > +		}
> > +
> > +		if (!owned) {
> >  			ret = -EINVAL;
> >  			goto err_undo;
> >  		}
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 01203215251a..24858b650562 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -686,6 +686,9 @@ enum {
> >   *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
> >   *	  affected devices are owned by the user.  This flag is available only
> >   *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
> > + *	  When set, user could invoke VFIO_DEVICE_PCI_HOT_RESET with a zero
> > + *	  length fd array on the calling device as the ownership is validated
> > + *	  by iommufd_ctx.
> >   *
> >   * Return: 0 on success, -errno on failure:
> >   *	-enospc = insufficient buffer, -enodev = unsupported for device.
> > @@ -717,6 +720,17 @@ struct vfio_pci_hot_reset_info {
> >   * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> >   *				    struct vfio_pci_hot_reset)
> >   *
> > + * Userspace requests hot reset for the devices it operates.  Due to the
> > + * underlying topology, multiple devices can be affected in the reset
> > + * while some might be opened by another user.  To avoid interference
> > + * the calling user must ensure all affected devices are owned by itself.
> > + *
> > + * As the ownership described by VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, the
> > + * cdev opened devices must exclusively provide a zero-length fd array and
> > + * the group opened devices must exclusively use an array of group fds for
> > + * proof of ownership.  Mixed access to devices between cdev and legacy
> > + * groups are not supported by this interface.
> > + *
> >   * Return: 0 on success, -errno on failure.
> >   */
> >  struct vfio_pci_hot_reset {
Alex Williamson May 31, 2023, 5:21 p.m. UTC | #3
On Tue, 30 May 2023 04:23:12 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Thursday, May 25, 2023 4:20 AM
> > 
> > On Mon, 22 May 2023 04:57:51 -0700
> > Yi Liu <yi.l.liu@intel.com> wrote:
> >   
> > > This is the way user to invoke hot-reset for the devices opened by cdev
> > > interface. User should check the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED
> > > in the output of VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl before doing
> > > hot-reset for cdev devices.
> > >
> > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> > > Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> > > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > > ---
> > >  drivers/vfio/pci/vfio_pci_core.c | 56 +++++++++++++++++++++++++-------
> > >  include/uapi/linux/vfio.h        | 14 ++++++++
> > >  2 files changed, 59 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > > index 890065f846e4..67f1cb426505 100644
> > > --- a/drivers/vfio/pci/vfio_pci_core.c
> > > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > > @@ -181,7 +181,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device  
> > *vdev)  
> > >  struct vfio_pci_group_info;
> > >  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
> > >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > > -				      struct vfio_pci_group_info *groups);
> > > +				      struct vfio_pci_group_info *groups,
> > > +				      struct iommufd_ctx *iommufd_ctx);
> > >
> > >  /*
> > >   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> > > @@ -1301,8 +1302,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct  
> > vfio_pci_core_device *vdev,  
> > >  	if (ret)
> > >  		return ret;
> > >
> > > -	/* Somewhere between 1 and count is OK */
> > > -	if (!array_count || array_count > count)
> > > +	if (array_count > count || vfio_device_cdev_opened(&vdev->vdev))
> > >  		return -EINVAL;
> > >
> > >  	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
> > > @@ -1351,7 +1351,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct  
> > vfio_pci_core_device *vdev,  
> > >  	info.count = array_count;
> > >  	info.files = files;
> > >
> > > -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> > > +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
> > >
> > >  hot_reset_release:
> > >  	for (file_idx--; file_idx >= 0; file_idx--)
> > > @@ -1380,7 +1380,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct  
> > vfio_pci_core_device *vdev,  
> > >  	else if (pci_probe_reset_bus(vdev->pdev->bus))
> > >  		return -ENODEV;
> > >
> > > -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> > > +	if (hdr.count)
> > > +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> > > +
> > > +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
> > > +					  vfio_iommufd_device_ictx(&vdev->vdev));
> > >  }
> > >
> > >  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> > > @@ -2347,13 +2351,16 @@ const struct pci_error_handlers  
> > vfio_pci_core_err_handlers = {  
> > >  };
> > >  EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
> > >
> > > -static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
> > > +static bool vfio_dev_in_groups(struct vfio_device *vdev,
> > >  			       struct vfio_pci_group_info *groups)
> > >  {
> > >  	unsigned int i;
> > >
> > > +	if (!groups)
> > > +		return false;
> > > +
> > >  	for (i = 0; i < groups->count; i++)
> > > -		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
> > > +		if (vfio_file_has_dev(groups->files[i], vdev))
> > >  			return true;
> > >  	return false;
> > >  }
> > > @@ -2429,7 +2436,8 @@ static int vfio_pci_dev_set_pm_runtime_get(struct  
> > vfio_device_set *dev_set)  
> > >   * get each memory_lock.
> > >   */
> > >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > > -				      struct vfio_pci_group_info *groups)
> > > +				      struct vfio_pci_group_info *groups,
> > > +				      struct iommufd_ctx *iommufd_ctx)
> > >  {
> > >  	struct vfio_pci_core_device *cur_mem;
> > >  	struct vfio_pci_core_device *cur_vma;
> > > @@ -2459,11 +2467,37 @@ static int vfio_pci_dev_set_hot_reset(struct  
> > vfio_device_set *dev_set,  
> > >  		goto err_unlock;
> > >
> > >  	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
> > > +		bool owned;
> > > +
> > >  		/*
> > > -		 * Test whether all the affected devices are contained by the
> > > -		 * set of groups provided by the user.
> > > +		 * Test whether all the affected devices can be reset by the
> > > +		 * user.
> > > +		 *
> > > +		 * If the user provides a set of groups, all the devices
> > > +		 * in the dev_set should be contained by the set of groups
> > > +		 * provided by the user.  
> > 
> > "If called from a group opened device and the user provides a set of
> > groups,..."
> >   
> > > +		 *
> > > +		 * If the user provides a zero-length group fd array, then  
> > 
> > "If called from a cdev opened device and the user provides a
> > zero-length array,..."
> > 
> >   
> > > +		 * all the devices in the dev_set must be bound to the same
> > > +		 * iommufd_ctx as the input iommufd_ctx.  If there is any
> > > +		 * device that has not been bound to iommufd_ctx yet, check
> > > +		 * if its iommu_group has any device bound to the input
> > > +		 * iommufd_ctx Such devices can be considered owned by  
> > 
> > "."...........................^  
> 
> Thanks, above comments well received.
> 
> > > +		 * the input iommufd_ctx as the device cannot be owned
> > > +		 * by another iommufd_ctx when its iommu_group is owned.
> > > +		 *
> > > +		 * Otherwise, reset is not allowed.  
> > 
> > 
> > In the case where a non-null array is provided,
> > vfio_pci_ioctl_pci_hot_reset_groups() explicitly tests
> > vfio_device_cdev_opened(), so we exclude cdev devices from providing a
> > group list.   
> 
> Yes.
> 
> > However, what prevents a compat opened group device from
> > providing a null array?  
> 
> I think I've asked this question before. What I got is seems no need
> to prevent it[1].
> 
> [1] https://lore.kernel.org/kvm/BN9PR11MB5276ABE919C04E06A0326E8E8CBC9@BN9PR11MB5276.namprd11.prod.outlook.com/
> 
> Now, I intend to disallow it. If compat mode user binds the devices
> to different containers, it shall be able to do hot reset as it can use
> group fd to prove ownership. But if using the zero-length array, it
> would be failed. So we may add below logic and remove the
> vfio_device_cdev_opened() in vfio_pci_ioctl_pci_hot_reset_groups().
> 
> vfio_pci_ioctl_pci_hot_reset()
> {
> ...
> 	if (!!hdr.count == !!vfio_device_cdev_opened(&vdev->vdev))
> 		return -EINVAL;
> 	if (hdr.count)
> 		vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> 	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
> 					     vfio_iommufd_device_ictx(&vdev->vdev));
> }
> 
> > 
> > I thought it would be that this function is called with groups == NULL
> > and therefore the vfio_dev_in_groups() test below fails, but I don't
> > think that's true for a compat opened device.  Thanks,  
> 
> How about above logic?

The double negating a function that already returns bool is a bit
excessive.  I might also move the test closer to the other parameter
checking with a comment noting the null array interface is only for
cdev opened devices.  Thanks,

Alex
Yi Liu June 1, 2023, 5:55 a.m. UTC | #4
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Thursday, June 1, 2023 1:22 AM

> > Now, I intend to disallow it. If compat mode user binds the devices
> > to different containers, it shall be able to do hot reset as it can use
> > group fd to prove ownership. But if using the zero-length array, it
> > would be failed. So we may add below logic and remove the
> > vfio_device_cdev_opened() in vfio_pci_ioctl_pci_hot_reset_groups().
> >
> > vfio_pci_ioctl_pci_hot_reset()
> > {
> > ...
> > 	if (!!hdr.count == !!vfio_device_cdev_opened(&vdev->vdev))
> > 		return -EINVAL;
> > 	if (hdr.count)
> > 		vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
> > 	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
> > 					     vfio_iommufd_device_ictx(&vdev->vdev));
> > }
> >
> > >
> > > I thought it would be that this function is called with groups == NULL
> > > and therefore the vfio_dev_in_groups() test below fails, but I don't
> > > think that's true for a compat opened device.  Thanks,
> >
> > How about above logic?
> 
> The double negating a function that already returns bool is a bit

Yes.

> excessive.  I might also move the test closer to the other parameter
> checking with a comment noting the null array interface is only for
> cdev opened devices.  Thanks,

Yes.

Regards,
Yi Liu
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 890065f846e4..67f1cb426505 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -181,7 +181,8 @@  static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
 struct vfio_pci_group_info;
 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups);
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx);
 
 /*
  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
@@ -1301,8 +1302,7 @@  vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
 	if (ret)
 		return ret;
 
-	/* Somewhere between 1 and count is OK */
-	if (!array_count || array_count > count)
+	if (array_count > count || vfio_device_cdev_opened(&vdev->vdev))
 		return -EINVAL;
 
 	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
@@ -1351,7 +1351,7 @@  vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
 	info.count = array_count;
 	info.files = files;
 
-	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
+	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
 
 hot_reset_release:
 	for (file_idx--; file_idx >= 0; file_idx--)
@@ -1380,7 +1380,11 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 	else if (pci_probe_reset_bus(vdev->pdev->bus))
 		return -ENODEV;
 
-	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
+	if (hdr.count)
+		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
+
+	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
+					  vfio_iommufd_device_ictx(&vdev->vdev));
 }
 
 static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
@@ -2347,13 +2351,16 @@  const struct pci_error_handlers vfio_pci_core_err_handlers = {
 };
 EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
 
-static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
+static bool vfio_dev_in_groups(struct vfio_device *vdev,
 			       struct vfio_pci_group_info *groups)
 {
 	unsigned int i;
 
+	if (!groups)
+		return false;
+
 	for (i = 0; i < groups->count; i++)
-		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
+		if (vfio_file_has_dev(groups->files[i], vdev))
 			return true;
 	return false;
 }
@@ -2429,7 +2436,8 @@  static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
  * get each memory_lock.
  */
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups)
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx)
 {
 	struct vfio_pci_core_device *cur_mem;
 	struct vfio_pci_core_device *cur_vma;
@@ -2459,11 +2467,37 @@  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 		goto err_unlock;
 
 	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+		bool owned;
+
 		/*
-		 * Test whether all the affected devices are contained by the
-		 * set of groups provided by the user.
+		 * Test whether all the affected devices can be reset by the
+		 * user.
+		 *
+		 * If the user provides a set of groups, all the devices
+		 * in the dev_set should be contained by the set of groups
+		 * provided by the user.
+		 *
+		 * If the user provides a zero-length group fd array, then
+		 * all the devices in the dev_set must be bound to the same
+		 * iommufd_ctx as the input iommufd_ctx.  If there is any
+		 * device that has not been bound to iommufd_ctx yet, check
+		 * if its iommu_group has any device bound to the input
+		 * iommufd_ctx Such devices can be considered owned by
+		 * the input iommufd_ctx as the device cannot be owned
+		 * by another iommufd_ctx when its iommu_group is owned.
+		 *
+		 * Otherwise, reset is not allowed.
 		 */
-		if (!vfio_dev_in_groups(cur_vma, groups)) {
+		if (iommufd_ctx) {
+			int devid = vfio_iommufd_device_hot_reset_devid(&cur_vma->vdev,
+									iommufd_ctx);
+
+			owned = (devid != VFIO_PCI_DEVID_NOT_OWNED);
+		} else {
+			owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+		}
+
+		if (!owned) {
 			ret = -EINVAL;
 			goto err_undo;
 		}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 01203215251a..24858b650562 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -686,6 +686,9 @@  enum {
  *	  Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the
  *	  affected devices are owned by the user.  This flag is available only
  *	  when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved.
+ *	  When set, user could invoke VFIO_DEVICE_PCI_HOT_RESET with a zero
+ *	  length fd array on the calling device as the ownership is validated
+ *	  by iommufd_ctx.
  *
  * Return: 0 on success, -errno on failure:
  *	-enospc = insufficient buffer, -enodev = unsupported for device.
@@ -717,6 +720,17 @@  struct vfio_pci_hot_reset_info {
  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
  *				    struct vfio_pci_hot_reset)
  *
+ * Userspace requests hot reset for the devices it operates.  Due to the
+ * underlying topology, multiple devices can be affected in the reset
+ * while some might be opened by another user.  To avoid interference
+ * the calling user must ensure all affected devices are owned by itself.
+ *
+ * As the ownership described by VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, the
+ * cdev opened devices must exclusively provide a zero-length fd array and
+ * the group opened devices must exclusively use an array of group fds for
+ * proof of ownership.  Mixed access to devices between cdev and legacy
+ * groups are not supported by this interface.
+ *
  * Return: 0 on success, -errno on failure.
  */
 struct vfio_pci_hot_reset {