diff mbox series

[v3,05/12] vfio/pci: Allow passing zero-length fd array in VFIO_DEVICE_PCI_HOT_RESET

Message ID 20230401144429.88673-6-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Introduce new methods for verifying ownership in vfio PCI hot reset | expand

Commit Message

Yi Liu April 1, 2023, 2:44 p.m. UTC
as an alternative method for ownership check when iommufd is used. In
this case all opened devices in the affected dev_set are verified to
be bound to a same valid iommufd value to allow reset. It's simpler
and faster as user does not need to pass a set of fds and kernel no
need to search the device within the given fds.

a device in noiommu mode doesn't have a valid iommufd, so this method
should not be used in a dev_set which contains multiple devices and one
of them is in noiommu. The only allowed noiommu scenario is that the
calling device is noiommu and it's in a singleton dev_set.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Yanting Jiang <yanting.jiang@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++++++++++++++++++++-----
 include/uapi/linux/vfio.h        |  9 ++++++-
 2 files changed, 44 insertions(+), 7 deletions(-)

Comments

Eric Auger April 4, 2023, 4:54 p.m. UTC | #1
Hi Yi,

On 4/1/23 16:44, Yi Liu wrote:
> as an alternative method for ownership check when iommufd is used. In
I don't understand the 1st sentence.
> this case all opened devices in the affected dev_set are verified to
> be bound to a same valid iommufd value to allow reset. It's simpler
> and faster as user does not need to pass a set of fds and kernel no
kernel does not need to search
> need to search the device within the given fds.
>
> a device in noiommu mode doesn't have a valid iommufd, so this method
> should not be used in a dev_set which contains multiple devices and one
> of them is in noiommu. The only allowed noiommu scenario is that the
> calling device is noiommu and it's in a singleton dev_set.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++++++++++++++++++++-----
>  include/uapi/linux/vfio.h        |  9 ++++++-
>  2 files changed, 44 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 3696b8e58445..b68fcba67a4b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -180,7 +180,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
>  struct vfio_pci_group_info;
>  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups);
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx);
>  
>  /*
>   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> @@ -1277,7 +1278,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>  		return ret;
>  
>  	/* Somewhere between 1 and count is OK */
> -	if (!hdr->count || hdr->count > count)
> +	if (hdr->count > count)
then I would simply remove the above comment since !count check is done
by the caller.
>  		return -EINVAL;
>  
>  	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
> @@ -1326,7 +1327,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>  	info.count = hdr->count;
>  	info.files = files;
>  
> -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
>  
>  hot_reset_release:
>  	for (file_idx--; file_idx >= 0; file_idx--)
> @@ -1341,6 +1342,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>  {
>  	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
>  	struct vfio_pci_hot_reset hdr;
> +	struct iommufd_ctx *iommufd;
>  	bool slot = false;
>  
>  	if (copy_from_user(&hdr, arg, minsz))
> @@ -1355,7 +1357,12 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>  	else if (pci_probe_reset_bus(vdev->pdev->bus))
>  		return -ENODEV;
>  
> -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> +	if (hdr.count)
> +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> +
> +	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> +
> +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
>  }
>  
>  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> @@ -2327,6 +2334,9 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
>  {
>  	unsigned int i;
>  
> +	if (!groups)
> +		return false;
> +
>  	for (i = 0; i < groups->count; i++)
>  		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
>  			return true;
> @@ -2402,13 +2412,25 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
>  	return ret;
>  }
>  
> +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
> +				    struct iommufd_ctx *iommufd_ctx)
> +{
> +	struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> +
> +	if (!iommufd)
> +		return false;
> +
> +	return iommufd == iommufd_ctx;
> +}
> +
>  /*
>   * We need to get memory_lock for each device, but devices can share mmap_lock,
>   * therefore we need to zap and hold the vma_lock for each device, and only then
>   * get each memory_lock.
>   */
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups)
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx)
>  {
>  	struct vfio_pci_core_device *cur_mem;
>  	struct vfio_pci_core_device *cur_vma;
> @@ -2448,9 +2470,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  		 *
>  		 * Otherwise all opened devices in the dev_set must be
>  		 * contained by the set of groups provided by the user.
> +		 *
> +		 * If user provides a zero-length array, then all the
> +		 * opened devices must be bound to a same iommufd_ctx.
> +		 *
> +		 * If all above checks are failed, reset is allowed only if
> +		 * the calling device is in a singleton dev_set.
>  		 */
>  		if (cur_vma->vdev.open_count &&
> -		    !vfio_dev_in_groups(cur_vma, groups)) {
> +		    !vfio_dev_in_groups(cur_vma, groups) &&
> +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx) &&
> +		    (dev_set->device_count > 1)) {
>  			ret = -EINVAL;
>  			goto err_undo;
>  		}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index f96e5689cffc..17aa5d09db41 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -679,7 +679,14 @@ struct vfio_pci_hot_reset_info {
>   * the calling user must ensure all affected devices, if opened, are
>   * owned by itself.
>   *
> - * The ownership is proved by an array of group fds.
> + * The ownership can be proved by:
> + *   - An array of group fds
> + *   - A zero-length array

I would suggest something alike
in case a non void group fd array is passed, the devices affected by the
reset must belong to those opened VFIO groups.
in case a zero length array is passed, the other devices affected by the
reset, if any, must be bound to the same iommufd as this VFIO device
Either of the 2 methods is applied to check the feasibility of the reset
> + *
> + * In the last case all affected devices which are opened by this user
> + * must have been bound to a same iommufd. If the calling device is in
> + * noiommu mode (no valid iommufd) then it can be reset only if the reset
> + * doesn't affect other devices.
and keep that too
>   *
>   * Return: 0 on success, -errno on failure.
>   */
Thanks

Eric
Alex Williamson April 4, 2023, 8:18 p.m. UTC | #2
On Sat,  1 Apr 2023 07:44:22 -0700
Yi Liu <yi.l.liu@intel.com> wrote:

> as an alternative method for ownership check when iommufd is used. In
> this case all opened devices in the affected dev_set are verified to
> be bound to a same valid iommufd value to allow reset. It's simpler
> and faster as user does not need to pass a set of fds and kernel no
> need to search the device within the given fds.
> 
> a device in noiommu mode doesn't have a valid iommufd, so this method
> should not be used in a dev_set which contains multiple devices and one
> of them is in noiommu. The only allowed noiommu scenario is that the
> calling device is noiommu and it's in a singleton dev_set.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++++++++++++++++++++-----
>  include/uapi/linux/vfio.h        |  9 ++++++-
>  2 files changed, 44 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 3696b8e58445..b68fcba67a4b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -180,7 +180,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
>  struct vfio_pci_group_info;
>  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups);
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx);
>  
>  /*
>   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> @@ -1277,7 +1278,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>  		return ret;
>  
>  	/* Somewhere between 1 and count is OK */
> -	if (!hdr->count || hdr->count > count)
> +	if (hdr->count > count)
>  		return -EINVAL;
>  
>  	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
> @@ -1326,7 +1327,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>  	info.count = hdr->count;
>  	info.files = files;
>  
> -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
>  
>  hot_reset_release:
>  	for (file_idx--; file_idx >= 0; file_idx--)
> @@ -1341,6 +1342,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>  {
>  	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
>  	struct vfio_pci_hot_reset hdr;
> +	struct iommufd_ctx *iommufd;
>  	bool slot = false;
>  
>  	if (copy_from_user(&hdr, arg, minsz))
> @@ -1355,7 +1357,12 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>  	else if (pci_probe_reset_bus(vdev->pdev->bus))
>  		return -ENODEV;
>  
> -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> +	if (hdr.count)
> +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> +
> +	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> +
> +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
>  }
>  
>  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> @@ -2327,6 +2334,9 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
>  {
>  	unsigned int i;
>  
> +	if (!groups)
> +		return false;
> +
>  	for (i = 0; i < groups->count; i++)
>  		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
>  			return true;
> @@ -2402,13 +2412,25 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
>  	return ret;
>  }
>  
> +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
> +				    struct iommufd_ctx *iommufd_ctx)
> +{
> +	struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> +
> +	if (!iommufd)
> +		return false;
> +
> +	return iommufd == iommufd_ctx;
> +}
> +
>  /*
>   * We need to get memory_lock for each device, but devices can share mmap_lock,
>   * therefore we need to zap and hold the vma_lock for each device, and only then
>   * get each memory_lock.
>   */
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups)
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx)
>  {
>  	struct vfio_pci_core_device *cur_mem;
>  	struct vfio_pci_core_device *cur_vma;
> @@ -2448,9 +2470,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>  		 *
>  		 * Otherwise all opened devices in the dev_set must be
>  		 * contained by the set of groups provided by the user.
> +		 *
> +		 * If user provides a zero-length array, then all the
> +		 * opened devices must be bound to a same iommufd_ctx.
> +		 *
> +		 * If all above checks are failed, reset is allowed only if
> +		 * the calling device is in a singleton dev_set.
>  		 */
>  		if (cur_vma->vdev.open_count &&
> -		    !vfio_dev_in_groups(cur_vma, groups)) {
> +		    !vfio_dev_in_groups(cur_vma, groups) &&
> +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx) &&
> +		    (dev_set->device_count > 1)) {

This last condition looks buggy to me, we need all conditions to be
true to generate an error here, which means that for a singleton
dev_set, it doesn't matter what group fds are passed, if any, or whether
the iommufd context matches.  I think in fact this means that the empty
array path is equally available for group use cases with a singleton
dev_set, but we don't enable it for multiple device dev_sets like we do
iommufd.

You pointed out a previous issue with hot-reset info and no-iommu where
if other affected devices are not bound to vfio-pci the info ioctl
returns error.  That's handled in the hot-reset ioctl by the fact that
all affected devices must be in the dev_set and therefore bound to
vfio-pci drivers.  So it seems to me that aside from the spurious error
because we can't report an iommu group when none exists, and didn't
spot it to invent an invalid group for debugging, hot-reset otherwise
works with no-iommu just like it does for iommu backed devices.  We
don't currently require singleton no-iommu dev_sets afaict.

I'll also note that if the dev_set is singleton, this suggests that
pci_reset_function() can make use of bus reset, so a hot-reset is
accessible via VFIO_DEVICE_RESET if the appropriate reset method is
selected.

Therefore, I think as written, the singleton dev_set hot-reset is
enabled for iommufd and (unintentionally?) for the group path, while
also negating a requirement for a group fd or that a provided group fd
actually matches the device in this latter case.  The null-array
approach is not however extended to groups for more general use.
Additionally, limiting no-iommu hot-reset to singleton dev_sets
provides only a marginal functional difference vs VFIO_DEVICE_RESET.
Thanks,

Alex

>  			ret = -EINVAL;
>  			goto err_undo;
>  		}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index f96e5689cffc..17aa5d09db41 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -679,7 +679,14 @@ struct vfio_pci_hot_reset_info {
>   * the calling user must ensure all affected devices, if opened, are
>   * owned by itself.
>   *
> - * The ownership is proved by an array of group fds.
> + * The ownership can be proved by:
> + *   - An array of group fds
> + *   - A zero-length array
> + *
> + * In the last case all affected devices which are opened by this user
> + * must have been bound to a same iommufd. If the calling device is in
> + * noiommu mode (no valid iommufd) then it can be reset only if the reset
> + * doesn't affect other devices.
>   *
>   * Return: 0 on success, -errno on failure.
>   */
Yi Liu April 5, 2023, 7:55 a.m. UTC | #3
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Wednesday, April 5, 2023 4:19 AM
> 
> On Sat,  1 Apr 2023 07:44:22 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
> 
> > as an alternative method for ownership check when iommufd is used. In
> > this case all opened devices in the affected dev_set are verified to
> > be bound to a same valid iommufd value to allow reset. It's simpler
> > and faster as user does not need to pass a set of fds and kernel no
> > need to search the device within the given fds.
> >
> > a device in noiommu mode doesn't have a valid iommufd, so this method
> > should not be used in a dev_set which contains multiple devices and one
> > of them is in noiommu. The only allowed noiommu scenario is that the
> > calling device is noiommu and it's in a singleton dev_set.
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> > Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++++++++++++++++++++-----
> >  include/uapi/linux/vfio.h        |  9 ++++++-
> >  2 files changed, 44 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 3696b8e58445..b68fcba67a4b 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -180,7 +180,8 @@ static void vfio_pci_probe_mmaps(struct
> vfio_pci_core_device *vdev)
> >  struct vfio_pci_group_info;
> >  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
> >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > -				      struct vfio_pci_group_info *groups);
> > +				      struct vfio_pci_group_info *groups,
> > +				      struct iommufd_ctx *iommufd_ctx);
> >
> >  /*
> >   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> > @@ -1277,7 +1278,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct
> vfio_pci_core_device *vdev,
> >  		return ret;
> >
> >  	/* Somewhere between 1 and count is OK */
> > -	if (!hdr->count || hdr->count > count)
> > +	if (hdr->count > count)
> >  		return -EINVAL;
> >
> >  	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
> > @@ -1326,7 +1327,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct
> vfio_pci_core_device *vdev,
> >  	info.count = hdr->count;
> >  	info.files = files;
> >
> > -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> > +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
> >
> >  hot_reset_release:
> >  	for (file_idx--; file_idx >= 0; file_idx--)
> > @@ -1341,6 +1342,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct
> vfio_pci_core_device *vdev,
> >  {
> >  	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
> >  	struct vfio_pci_hot_reset hdr;
> > +	struct iommufd_ctx *iommufd;
> >  	bool slot = false;
> >
> >  	if (copy_from_user(&hdr, arg, minsz))
> > @@ -1355,7 +1357,12 @@ static int vfio_pci_ioctl_pci_hot_reset(struct
> vfio_pci_core_device *vdev,
> >  	else if (pci_probe_reset_bus(vdev->pdev->bus))
> >  		return -ENODEV;
> >
> > -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> > +	if (hdr.count)
> > +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> > +
> > +	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> > +
> > +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
> >  }
> >
> >  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> > @@ -2327,6 +2334,9 @@ static bool vfio_dev_in_groups(struct
> vfio_pci_core_device *vdev,
> >  {
> >  	unsigned int i;
> >
> > +	if (!groups)
> > +		return false;
> > +
> >  	for (i = 0; i < groups->count; i++)
> >  		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
> >  			return true;
> > @@ -2402,13 +2412,25 @@ static int vfio_pci_dev_set_pm_runtime_get(struct
> vfio_device_set *dev_set)
> >  	return ret;
> >  }
> >
> > +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
> > +				    struct iommufd_ctx *iommufd_ctx)
> > +{
> > +	struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> > +
> > +	if (!iommufd)
> > +		return false;
> > +
> > +	return iommufd == iommufd_ctx;
> > +}
> > +
> >  /*
> >   * We need to get memory_lock for each device, but devices can share mmap_lock,
> >   * therefore we need to zap and hold the vma_lock for each device, and only then
> >   * get each memory_lock.
> >   */
> >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > -				      struct vfio_pci_group_info *groups)
> > +				      struct vfio_pci_group_info *groups,
> > +				      struct iommufd_ctx *iommufd_ctx)
> >  {
> >  	struct vfio_pci_core_device *cur_mem;
> >  	struct vfio_pci_core_device *cur_vma;
> > @@ -2448,9 +2470,17 @@ static int vfio_pci_dev_set_hot_reset(struct
> vfio_device_set *dev_set,
> >  		 *
> >  		 * Otherwise all opened devices in the dev_set must be
> >  		 * contained by the set of groups provided by the user.
> > +		 *
> > +		 * If user provides a zero-length array, then all the
> > +		 * opened devices must be bound to a same iommufd_ctx.
> > +		 *
> > +		 * If all above checks are failed, reset is allowed only if
> > +		 * the calling device is in a singleton dev_set.
> >  		 */
> >  		if (cur_vma->vdev.open_count &&
> > -		    !vfio_dev_in_groups(cur_vma, groups)) {
> > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx) &&
> > +		    (dev_set->device_count > 1)) {
> 
> This last condition looks buggy to me, we need all conditions to be
> true to generate an error here, which means that for a singleton
> dev_set, it doesn't matter what group fds are passed, if any, or whether
> the iommufd context matches.  I think in fact this means that the empty
> array path is equally available for group use cases with a singleton
> dev_set, but we don't enable it for multiple device dev_sets like we do
> iommufd.

you are right. The last condition allows the empty-fd array path to
work for the group use case if the dev_set happens to be a singleton.

> 
> You pointed out a previous issue with hot-reset info and no-iommu where
> if other affected devices are not bound to vfio-pci the info ioctl
> returns error.  That's handled in the hot-reset ioctl by the fact that
> all affected devices must be in the dev_set and therefore bound to
> vfio-pci drivers. 

yes, hot-reset ioctl requires all affected devices listed in the dev_set.
So for the case there are devices not bound to vfio yet, hot-reset ioctl
just fails. If all affected devices are in the dev_set, they will have a
fake group allocated by vfio. So the info ioctl won't fail.

> So it seems to me that aside from the spurious error
> because we can't report an iommu group when none exists, and didn't
> spot it to invent an invalid group for debugging, hot-reset otherwise
> works with no-iommu just like it does for iommu backed devices.  We
> don't currently require singleton no-iommu dev_sets afaict.

yes. the requirement for hot-reset is the same between no-iommu and
the iommufd backed devices.

> I'll also note that if the dev_set is singleton, this suggests that
> pci_reset_function() can make use of bus reset, so a hot-reset is
> accessible via VFIO_DEVICE_RESET if the appropriate reset method is
> selected.

yes. so does it mean not necessary to allow singleton dev_set support
in hot-reset ioctl? If user uses hot-reset, it should because of unable to
use VFIO_DEVICE_RESET, is it?

> 
> Therefore, I think as written, the singleton dev_set hot-reset is
> enabled for iommufd and (unintentionally?) for the group path, while
> also negating a requirement for a group fd or that a provided group fd
> actually matches the device in this latter case.  The null-array
> approach is not however extended to groups for more general use.
> Additionally, limiting no-iommu hot-reset to singleton dev_sets
> provides only a marginal functional difference vs VFIO_DEVICE_RESET.

I think the singletion dev_set hot-reset is for iommufd (or more accurately
for the noiommu case in cdev path). 

> Thanks,
> 
> Alex
> 
> >  			ret = -EINVAL;
> >  			goto err_undo;
> >  		}
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index f96e5689cffc..17aa5d09db41 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -679,7 +679,14 @@ struct vfio_pci_hot_reset_info {
> >   * the calling user must ensure all affected devices, if opened, are
> >   * owned by itself.
> >   *
> > - * The ownership is proved by an array of group fds.
> > + * The ownership can be proved by:
> > + *   - An array of group fds
> > + *   - A zero-length array
> > + *
> > + * In the last case all affected devices which are opened by this user
> > + * must have been bound to a same iommufd. If the calling device is in
> > + * noiommu mode (no valid iommufd) then it can be reset only if the reset
> > + * doesn't affect other devices.
> >   *
> >   * Return: 0 on success, -errno on failure.
> >   */
Yi Liu April 5, 2023, 8:01 a.m. UTC | #4
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, April 5, 2023 3:55 PM
 
> >
> > Therefore, I think as written, the singleton dev_set hot-reset is
> > enabled for iommufd and (unintentionally?) for the group path, while
> > also negating a requirement for a group fd or that a provided group fd
> > actually matches the device in this latter case.  The null-array
> > approach is not however extended to groups for more general use.
> > Additionally, limiting no-iommu hot-reset to singleton dev_sets
> > provides only a marginal functional difference vs VFIO_DEVICE_RESET.
> 
> I think the singletion dev_set hot-reset is for iommufd (or more accurately
> for the noiommu case in cdev path).

but actually, singleton dev_set hot-reset can work for group path as well.
Based on this, I'm also wondering do we really want to have singleton dev_set
hot-reset only for cdev noiommu case? or we allow it generally or just
don't support it as it is equivalent with VFIO_DEVICE_RESET?

If we don't support singletion dev_set hot-reset, noiommu devices in cdev
path shall fail the hot-reset if empty-fd array is provided. But we may just
document that empty-fd array does not work for noiommu. User should
use the device fd array.

Regards,
Yi Liu
Eric Auger April 5, 2023, 8:02 a.m. UTC | #5
On 4/4/23 22:18, Alex Williamson wrote:
> On Sat,  1 Apr 2023 07:44:22 -0700
> Yi Liu <yi.l.liu@intel.com> wrote:
>
>> as an alternative method for ownership check when iommufd is used. In
>> this case all opened devices in the affected dev_set are verified to
>> be bound to a same valid iommufd value to allow reset. It's simpler
>> and faster as user does not need to pass a set of fds and kernel no
>> need to search the device within the given fds.
>>
>> a device in noiommu mode doesn't have a valid iommufd, so this method
>> should not be used in a dev_set which contains multiple devices and one
>> of them is in noiommu. The only allowed noiommu scenario is that the
>> calling device is noiommu and it's in a singleton dev_set.
>>
>> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
>> Tested-by: Yanting Jiang <yanting.jiang@intel.com>
>> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
>> ---
>>  drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++++++++++++++++++++-----
>>  include/uapi/linux/vfio.h        |  9 ++++++-
>>  2 files changed, 44 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
>> index 3696b8e58445..b68fcba67a4b 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.c
>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>> @@ -180,7 +180,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
>>  struct vfio_pci_group_info;
>>  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
>>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>> -				      struct vfio_pci_group_info *groups);
>> +				      struct vfio_pci_group_info *groups,
>> +				      struct iommufd_ctx *iommufd_ctx);
>>  
>>  /*
>>   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
>> @@ -1277,7 +1278,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>>  		return ret;
>>  
>>  	/* Somewhere between 1 and count is OK */
>> -	if (!hdr->count || hdr->count > count)
>> +	if (hdr->count > count)
>>  		return -EINVAL;
>>  
>>  	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
>> @@ -1326,7 +1327,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
>>  	info.count = hdr->count;
>>  	info.files = files;
>>  
>> -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
>> +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
>>  
>>  hot_reset_release:
>>  	for (file_idx--; file_idx >= 0; file_idx--)
>> @@ -1341,6 +1342,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>>  {
>>  	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
>>  	struct vfio_pci_hot_reset hdr;
>> +	struct iommufd_ctx *iommufd;
>>  	bool slot = false;
>>  
>>  	if (copy_from_user(&hdr, arg, minsz))
>> @@ -1355,7 +1357,12 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
>>  	else if (pci_probe_reset_bus(vdev->pdev->bus))
>>  		return -ENODEV;
>>  
>> -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
>> +	if (hdr.count)
>> +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
>> +
>> +	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
>> +
>> +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
>>  }
>>  
>>  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
>> @@ -2327,6 +2334,9 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
>>  {
>>  	unsigned int i;
>>  
>> +	if (!groups)
>> +		return false;
>> +
>>  	for (i = 0; i < groups->count; i++)
>>  		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
>>  			return true;
>> @@ -2402,13 +2412,25 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
>>  	return ret;
>>  }
>>  
>> +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
>> +				    struct iommufd_ctx *iommufd_ctx)
>> +{
>> +	struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
>> +
>> +	if (!iommufd)
>> +		return false;
>> +
>> +	return iommufd == iommufd_ctx;
>> +}
>> +
>>  /*
>>   * We need to get memory_lock for each device, but devices can share mmap_lock,
>>   * therefore we need to zap and hold the vma_lock for each device, and only then
>>   * get each memory_lock.
>>   */
>>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>> -				      struct vfio_pci_group_info *groups)
>> +				      struct vfio_pci_group_info *groups,
>> +				      struct iommufd_ctx *iommufd_ctx)
>>  {
>>  	struct vfio_pci_core_device *cur_mem;
>>  	struct vfio_pci_core_device *cur_vma;
>> @@ -2448,9 +2470,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
>>  		 *
>>  		 * Otherwise all opened devices in the dev_set must be
>>  		 * contained by the set of groups provided by the user.
>> +		 *
>> +		 * If user provides a zero-length array, then all the
>> +		 * opened devices must be bound to a same iommufd_ctx.
>> +		 *
>> +		 * If all above checks are failed, reset is allowed only if
>> +		 * the calling device is in a singleton dev_set.
>>  		 */
>>  		if (cur_vma->vdev.open_count &&
>> -		    !vfio_dev_in_groups(cur_vma, groups)) {
>> +		    !vfio_dev_in_groups(cur_vma, groups) &&
>> +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx) &&
>> +		    (dev_set->device_count > 1)) {
> This last condition looks buggy to me, we need all conditions to be
> true to generate an error here, which means that for a singleton
> dev_set, it doesn't matter what group fds are passed, if any, or whether
> the iommufd context matches.  I think in fact this means that the empty
> array path is equally available for group use cases with a singleton
> dev_set, but we don't enable it for multiple device dev_sets like we do
> iommufd.
>
> You pointed out a previous issue with hot-reset info and no-iommu where
> if other affected devices are not bound to vfio-pci the info ioctl
> returns error.  That's handled in the hot-reset ioctl by the fact that
> all affected devices must be in the dev_set and therefore bound to
> vfio-pci drivers.  So it seems to me that aside from the spurious error
> because we can't report an iommu group when none exists, and didn't
> spot it to invent an invalid group for debugging, hot-reset otherwise
> works with no-iommu just like it does for iommu backed devices.  We
> don't currently require singleton no-iommu dev_sets afaict.
>
> I'll also note that if the dev_set is singleton, this suggests that
> pci_reset_function() can make use of bus reset, so a hot-reset is
> accessible via VFIO_DEVICE_RESET if the appropriate reset method is
> selected.
>
> Therefore, I think as written, the singleton dev_set hot-reset is
> enabled for iommufd and (unintentionally?) for the group path, while
> also negating a requirement for a group fd or that a provided group fd
> actually matches the device in this latter case.  The null-array
> approach is not however extended to groups for more general use.
> Additionally, limiting no-iommu hot-reset to singleton dev_sets
> provides only a marginal functional difference vs VFIO_DEVICE_RESET.
> Thanks,
>
> Alex
What bout introducing a helper
static bool is_reset_ok(pdev, groups, ctx) {
    if (!pdev->vdev.open_count)
        return true;
    if (groups && vfio_dev_in_groups(pdev, groups))
        return true;
    if (ctx && vfio_dev_in_iommufd_ctx(pdev, ctx)
        return true;
    return false;
}

Assuming the above logic is correct I think this would make the code
more readable

Thanks

Eric
>>  			ret = -EINVAL;
>>  			goto err_undo;
>>  		}
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index f96e5689cffc..17aa5d09db41 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -679,7 +679,14 @@ struct vfio_pci_hot_reset_info {
>>   * the calling user must ensure all affected devices, if opened, are
>>   * owned by itself.
>>   *
>> - * The ownership is proved by an array of group fds.
>> + * The ownership can be proved by:
>> + *   - An array of group fds
>> + *   - A zero-length array
>> + *
>> + * In the last case all affected devices which are opened by this user
>> + * must have been bound to a same iommufd. If the calling device is in
>> + * noiommu mode (no valid iommufd) then it can be reset only if the reset
>> + * doesn't affect other devices.
>>   *
>>   * Return: 0 on success, -errno on failure.
>>   */
Yi Liu April 5, 2023, 8:09 a.m. UTC | #6
Hi Eric,

> From: Eric Auger <eric.auger@redhat.com>
> Sent: Wednesday, April 5, 2023 4:02 PM 
> 
> On 4/4/23 22:18, Alex Williamson wrote:
> > On Sat,  1 Apr 2023 07:44:22 -0700
> > Yi Liu <yi.l.liu@intel.com> wrote:
> >
> >> as an alternative method for ownership check when iommufd is used. In
> >> this case all opened devices in the affected dev_set are verified to
> >> be bound to a same valid iommufd value to allow reset. It's simpler
> >> and faster as user does not need to pass a set of fds and kernel no
> >> need to search the device within the given fds.
> >>
> >> a device in noiommu mode doesn't have a valid iommufd, so this method
> >> should not be used in a dev_set which contains multiple devices and one
> >> of them is in noiommu. The only allowed noiommu scenario is that the
> >> calling device is noiommu and it's in a singleton dev_set.
> >>
> >> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> >> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> >> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> >> Tested-by: Yanting Jiang <yanting.jiang@intel.com>
> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> >> ---
> >>  drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++++++++++++++++++++-----
> >>  include/uapi/linux/vfio.h        |  9 ++++++-
> >>  2 files changed, 44 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> >> index 3696b8e58445..b68fcba67a4b 100644
> >> --- a/drivers/vfio/pci/vfio_pci_core.c
> >> +++ b/drivers/vfio/pci/vfio_pci_core.c
> >> @@ -180,7 +180,8 @@ static void vfio_pci_probe_mmaps(struct
> vfio_pci_core_device *vdev)
> >>  struct vfio_pci_group_info;
> >>  static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
> >>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> >> -				      struct vfio_pci_group_info *groups);
> >> +				      struct vfio_pci_group_info *groups,
> >> +				      struct iommufd_ctx *iommufd_ctx);
> >>
> >>  /*
> >>   * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> >> @@ -1277,7 +1278,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct
> vfio_pci_core_device *vdev,
> >>  		return ret;
> >>
> >>  	/* Somewhere between 1 and count is OK */
> >> -	if (!hdr->count || hdr->count > count)
> >> +	if (hdr->count > count)
> >>  		return -EINVAL;
> >>
> >>  	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
> >> @@ -1326,7 +1327,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct
> vfio_pci_core_device *vdev,
> >>  	info.count = hdr->count;
> >>  	info.files = files;
> >>
> >> -	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
> >> +	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
> >>
> >>  hot_reset_release:
> >>  	for (file_idx--; file_idx >= 0; file_idx--)
> >> @@ -1341,6 +1342,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct
> vfio_pci_core_device *vdev,
> >>  {
> >>  	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
> >>  	struct vfio_pci_hot_reset hdr;
> >> +	struct iommufd_ctx *iommufd;
> >>  	bool slot = false;
> >>
> >>  	if (copy_from_user(&hdr, arg, minsz))
> >> @@ -1355,7 +1357,12 @@ static int vfio_pci_ioctl_pci_hot_reset(struct
> vfio_pci_core_device *vdev,
> >>  	else if (pci_probe_reset_bus(vdev->pdev->bus))
> >>  		return -ENODEV;
> >>
> >> -	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> >> +	if (hdr.count)
> >> +		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
> >> +
> >> +	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> >> +
> >> +	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
> >>  }
> >>
> >>  static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
> >> @@ -2327,6 +2334,9 @@ static bool vfio_dev_in_groups(struct
> vfio_pci_core_device *vdev,
> >>  {
> >>  	unsigned int i;
> >>
> >> +	if (!groups)
> >> +		return false;
> >> +
> >>  	for (i = 0; i < groups->count; i++)
> >>  		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
> >>  			return true;
> >> @@ -2402,13 +2412,25 @@ static int vfio_pci_dev_set_pm_runtime_get(struct
> vfio_device_set *dev_set)
> >>  	return ret;
> >>  }
> >>
> >> +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
> >> +				    struct iommufd_ctx *iommufd_ctx)
> >> +{
> >> +	struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
> >> +
> >> +	if (!iommufd)
> >> +		return false;
> >> +
> >> +	return iommufd == iommufd_ctx;
> >> +}
> >> +
> >>  /*
> >>   * We need to get memory_lock for each device, but devices can share mmap_lock,
> >>   * therefore we need to zap and hold the vma_lock for each device, and only then
> >>   * get each memory_lock.
> >>   */
> >>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> >> -				      struct vfio_pci_group_info *groups)
> >> +				      struct vfio_pci_group_info *groups,
> >> +				      struct iommufd_ctx *iommufd_ctx)
> >>  {
> >>  	struct vfio_pci_core_device *cur_mem;
> >>  	struct vfio_pci_core_device *cur_vma;
> >> @@ -2448,9 +2470,17 @@ static int vfio_pci_dev_set_hot_reset(struct
> vfio_device_set *dev_set,
> >>  		 *
> >>  		 * Otherwise all opened devices in the dev_set must be
> >>  		 * contained by the set of groups provided by the user.
> >> +		 *
> >> +		 * If user provides a zero-length array, then all the
> >> +		 * opened devices must be bound to a same iommufd_ctx.
> >> +		 *
> >> +		 * If all above checks are failed, reset is allowed only if
> >> +		 * the calling device is in a singleton dev_set.
> >>  		 */
> >>  		if (cur_vma->vdev.open_count &&
> >> -		    !vfio_dev_in_groups(cur_vma, groups)) {
> >> +		    !vfio_dev_in_groups(cur_vma, groups) &&
> >> +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx) &&
> >> +		    (dev_set->device_count > 1)) {
> > This last condition looks buggy to me, we need all conditions to be
> > true to generate an error here, which means that for a singleton
> > dev_set, it doesn't matter what group fds are passed, if any, or whether
> > the iommufd context matches.  I think in fact this means that the empty
> > array path is equally available for group use cases with a singleton
> > dev_set, but we don't enable it for multiple device dev_sets like we do
> > iommufd.
> >
> > You pointed out a previous issue with hot-reset info and no-iommu where
> > if other affected devices are not bound to vfio-pci the info ioctl
> > returns error.  That's handled in the hot-reset ioctl by the fact that
> > all affected devices must be in the dev_set and therefore bound to
> > vfio-pci drivers.  So it seems to me that aside from the spurious error
> > because we can't report an iommu group when none exists, and didn't
> > spot it to invent an invalid group for debugging, hot-reset otherwise
> > works with no-iommu just like it does for iommu backed devices.  We
> > don't currently require singleton no-iommu dev_sets afaict.
> >
> > I'll also note that if the dev_set is singleton, this suggests that
> > pci_reset_function() can make use of bus reset, so a hot-reset is
> > accessible via VFIO_DEVICE_RESET if the appropriate reset method is
> > selected.
> >
> > Therefore, I think as written, the singleton dev_set hot-reset is
> > enabled for iommufd and (unintentionally?) for the group path, while
> > also negating a requirement for a group fd or that a provided group fd
> > actually matches the device in this latter case.  The null-array
> > approach is not however extended to groups for more general use.
> > Additionally, limiting no-iommu hot-reset to singleton dev_sets
> > provides only a marginal functional difference vs VFIO_DEVICE_RESET.
> > Thanks,
> >
> > Alex
> What bout introducing a helper
> static bool is_reset_ok(pdev, groups, ctx) {
>     if (!pdev->vdev.open_count)
>         return true;
>     if (groups && vfio_dev_in_groups(pdev, groups))
>         return true;
>     if (ctx && vfio_dev_in_iommufd_ctx(pdev, ctx)
>         return true;
>     return false;
> }
> 
> Assuming the above logic is correct I think this would make the code
> more readable

this logic may fail the noiommu devices in the cdev path as the
cdev path binds the devices to iommufd==-1. The ctx would be
NULL. So we agreed to allow the reset if the dev_set is sigletion.
Detail can be found in below paragraph. As I replied in another
email. Maybe this singleton support can be dropped since singleton
dev_set may just do reset with VFIO_DEVICE_RESET. Alex may
correct me if userspace is not so intelligent.

"However the iommufd method has difficulty working with noiommu devices
since those devices don't have a valid iommufd, unless the noiommu device
is in a singleton dev_set hence no ownership check is required. [3]

[3] https://lore.kernel.org/kvm/ZACX+Np%2FIY7ygqL5@nvidia.com/"

Regards,
Yi Liu

> Thanks
> 
> Eric
> >>  			ret = -EINVAL;
> >>  			goto err_undo;
> >>  		}
> >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >> index f96e5689cffc..17aa5d09db41 100644
> >> --- a/include/uapi/linux/vfio.h
> >> +++ b/include/uapi/linux/vfio.h
> >> @@ -679,7 +679,14 @@ struct vfio_pci_hot_reset_info {
> >>   * the calling user must ensure all affected devices, if opened, are
> >>   * owned by itself.
> >>   *
> >> - * The ownership is proved by an array of group fds.
> >> + * The ownership can be proved by:
> >> + *   - An array of group fds
> >> + *   - A zero-length array
> >> + *
> >> + * In the last case all affected devices which are opened by this user
> >> + * must have been bound to a same iommufd. If the calling device is in
> >> + * noiommu mode (no valid iommufd) then it can be reset only if the reset
> >> + * doesn't affect other devices.
> >>   *
> >>   * Return: 0 on success, -errno on failure.
> >>   */
Alex Williamson April 5, 2023, 3:36 p.m. UTC | #7
On Wed, 5 Apr 2023 08:01:49 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Wednesday, April 5, 2023 3:55 PM  
>  
> > >
> > > Therefore, I think as written, the singleton dev_set hot-reset is
> > > enabled for iommufd and (unintentionally?) for the group path, while
> > > also negating a requirement for a group fd or that a provided group fd
> > > actually matches the device in this latter case.  The null-array
> > > approach is not however extended to groups for more general use.
> > > Additionally, limiting no-iommu hot-reset to singleton dev_sets
> > > provides only a marginal functional difference vs VFIO_DEVICE_RESET.  
> > 
> > I think the singletion dev_set hot-reset is for iommufd (or more accurately
> > for the noiommu case in cdev path).  
> 
> but actually, singleton dev_set hot-reset can work for group path as well.
> Based on this, I'm also wondering do we really want to have singleton dev_set
> hot-reset only for cdev noiommu case? or we allow it generally or just
> don't support it as it is equivalent with VFIO_DEVICE_RESET?

I think you're taking the potential that VFIO_DEVICE_RESET and
hot-reset could do the same thing too far.  The former is more likely
to do an FLR, or even a PM reset.  QEMU even tries to guess what reset
VFIO_DEVICE_RESET might use in order to choose to do a hot-reset if it
seems like the device might only support a PM reset otherwise.

Changing the reset method of a device requires privilege, which is
maybe something we'd compromise on for no-iommu, but the general
expectation is that VFIO_DEVICE_RESET provides a device level scope and
hot-reset provides a... hot-reset, and sometimes those are the same
thing, but that doesn't mean we can lean on the former.

> If we don't support singletion dev_set hot-reset, noiommu devices in cdev
> path shall fail the hot-reset if empty-fd array is provided. But we may just
> document that empty-fd array does not work for noiommu. User should
> use the device fd array.

I don't see any replies to my comment on 08/12 where I again question
why we need an empty array option.  It's causing all sorts of headaches
and I don't see the justification for it beyond some hand waving that
it reduces complexity for the user.  This singleton dev-set notion
seems equally unjustified.  Do we just need to deal with hot-reset
being unsupported for no-iommu devices with iommufd?  Thanks,

Alex
Jason Gunthorpe April 5, 2023, 4:46 p.m. UTC | #8
On Wed, Apr 05, 2023 at 09:36:46AM -0600, Alex Williamson wrote:

> > If we don't support singletion dev_set hot-reset, noiommu devices in cdev
> > path shall fail the hot-reset if empty-fd array is provided. But we may just
> > document that empty-fd array does not work for noiommu. User should
> > use the device fd array.
> 
> I don't see any replies to my comment on 08/12 where I again question
> why we need an empty array option.

I was pressing we'd do empty-fd only and not do the device fd array at
all since it is such an ugly fit for the use cases we have.

But it is such a minor detail if you don't want it then take it out.

> This singleton dev-set notion seems equally unjustified.  Do we just
> need to deal with hot-reset being unsupported for no-iommu devices
> with iommufd?

It was to support no-iommu, if you want to de-support it then it can
go away too. AFAIK dpdk doesn't use this feature and it is the only
user we know of that has support for no-iommu so it is probably safe.

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 3696b8e58445..b68fcba67a4b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -180,7 +180,8 @@  static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
 struct vfio_pci_group_info;
 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups);
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx);
 
 /*
  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
@@ -1277,7 +1278,7 @@  vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
 		return ret;
 
 	/* Somewhere between 1 and count is OK */
-	if (!hdr->count || hdr->count > count)
+	if (hdr->count > count)
 		return -EINVAL;
 
 	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
@@ -1326,7 +1327,7 @@  vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
 	info.count = hdr->count;
 	info.files = files;
 
-	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
+	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
 
 hot_reset_release:
 	for (file_idx--; file_idx >= 0; file_idx--)
@@ -1341,6 +1342,7 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 {
 	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
 	struct vfio_pci_hot_reset hdr;
+	struct iommufd_ctx *iommufd;
 	bool slot = false;
 
 	if (copy_from_user(&hdr, arg, minsz))
@@ -1355,7 +1357,12 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 	else if (pci_probe_reset_bus(vdev->pdev->bus))
 		return -ENODEV;
 
-	return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
+	if (hdr.count)
+		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
+
+	iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
+
+	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
 }
 
 static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
@@ -2327,6 +2334,9 @@  static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
 {
 	unsigned int i;
 
+	if (!groups)
+		return false;
+
 	for (i = 0; i < groups->count; i++)
 		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
 			return true;
@@ -2402,13 +2412,25 @@  static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
 	return ret;
 }
 
+static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
+				    struct iommufd_ctx *iommufd_ctx)
+{
+	struct iommufd_ctx *iommufd = vfio_iommufd_physical_ictx(&vdev->vdev);
+
+	if (!iommufd)
+		return false;
+
+	return iommufd == iommufd_ctx;
+}
+
 /*
  * We need to get memory_lock for each device, but devices can share mmap_lock,
  * therefore we need to zap and hold the vma_lock for each device, and only then
  * get each memory_lock.
  */
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups)
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx)
 {
 	struct vfio_pci_core_device *cur_mem;
 	struct vfio_pci_core_device *cur_vma;
@@ -2448,9 +2470,17 @@  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 		 *
 		 * Otherwise all opened devices in the dev_set must be
 		 * contained by the set of groups provided by the user.
+		 *
+		 * If user provides a zero-length array, then all the
+		 * opened devices must be bound to a same iommufd_ctx.
+		 *
+		 * If all above checks are failed, reset is allowed only if
+		 * the calling device is in a singleton dev_set.
 		 */
 		if (cur_vma->vdev.open_count &&
-		    !vfio_dev_in_groups(cur_vma, groups)) {
+		    !vfio_dev_in_groups(cur_vma, groups) &&
+		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx) &&
+		    (dev_set->device_count > 1)) {
 			ret = -EINVAL;
 			goto err_undo;
 		}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index f96e5689cffc..17aa5d09db41 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -679,7 +679,14 @@  struct vfio_pci_hot_reset_info {
  * the calling user must ensure all affected devices, if opened, are
  * owned by itself.
  *
- * The ownership is proved by an array of group fds.
+ * The ownership can be proved by:
+ *   - An array of group fds
+ *   - A zero-length array
+ *
+ * In the last case all affected devices which are opened by this user
+ * must have been bound to a same iommufd. If the calling device is in
+ * noiommu mode (no valid iommufd) then it can be reset only if the reset
+ * doesn't affect other devices.
  *
  * Return: 0 on success, -errno on failure.
  */