diff mbox series

[v5,09/19] vfio/pci: Allow passing zero-length fd array in VFIO_DEVICE_PCI_HOT_RESET

Message ID 20230227111135.61728-10-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series Add vfio_device cdev for iommufd support | expand

Commit Message

Yi Liu Feb. 27, 2023, 11:11 a.m. UTC
to indicate kernel to use the device's bound iommufd_ctx for the device
ownership check. Kernel should loop all the opened devices in the dev_set,
and check if they are bound to the same iommufd_ctx. For the devices that
has not been opened yet but affected, they can be reset by the current
users as they cannot be opened by any other user. This applies to the
existing group/container path as well.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 111 +++++++++++++++++++++++--------
 drivers/vfio/vfio.h              |  11 +++
 include/uapi/linux/vfio.h        |  16 +++++
 3 files changed, 109 insertions(+), 29 deletions(-)

Comments

Jason Gunthorpe Feb. 27, 2023, 6:22 p.m. UTC | #1
On Mon, Feb 27, 2023 at 03:11:25AM -0800, Yi Liu wrote:
> to indicate kernel to use the device's bound iommufd_ctx for the device
> ownership check. Kernel should loop all the opened devices in the dev_set,
> and check if they are bound to the same iommufd_ctx. For the devices that
> has not been opened yet but affected, they can be reset by the current
> users as they cannot be opened by any other user. This applies to the
> existing group/container path as well.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 111 +++++++++++++++++++++++--------
>  drivers/vfio/vfio.h              |  11 +++
>  include/uapi/linux/vfio.h        |  16 +++++
>  3 files changed, 109 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 1bf54beeaef2..e0ebe55b4df0 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -27,11 +27,13 @@
>  #include <linux/vgaarb.h>
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
> +#include <linux/iommufd.h>

Is this needed anymore?

>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
>  
>  #include "vfio_pci_priv.h"
> +#include "../vfio.h"

Don't do this, put vfio_device_iommufd() in the normal public header

> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 0552e8dcf0cb..4bf11ee8de53 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -673,6 +673,22 @@ struct vfio_pci_hot_reset_info {
>   * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
>   *				    struct vfio_pci_hot_reset)
>   *
> + * Userspace requests hot reset for the devices it uses.  Due to the
> + * underlying topology, multiple devices may be affected in the reset.
> + * The affected devices may have been opened by the user or by other
> + * users or not opened yet.  Only when all the affected devices are
> + * either opened by the current user or not opened by any user, should
> + * the reset request be allowed.  Otherwise, this request is expected
> + * to return error.
> + *
> + * If the user uses group and container interface, it should pass down
> + * a set of group fds for ownership check.  If the user uses iommufd, it
> + * should pass down a zero-length group_fds array to indicate the kernel
> + * to use the bound iommufd for the ownership check.  User that uses the
> + * vfio iommufd compatible mode can also pass down a zero-length group_fds
> + * array as this mode uses iommufd in kernel, and there is no reason to
> + * forbide it.

'forbid'

Rest looks good

Thanks,
Jason
Yi Liu Feb. 28, 2023, 2:31 a.m. UTC | #2
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, February 28, 2023 2:23 AM
> 
> On Mon, Feb 27, 2023 at 03:11:25AM -0800, Yi Liu wrote:
> > to indicate kernel to use the device's bound iommufd_ctx for the device
> > ownership check. Kernel should loop all the opened devices in the
> dev_set,
> > and check if they are bound to the same iommufd_ctx. For the devices
> that
> > has not been opened yet but affected, they can be reset by the current
> > users as they cannot be opened by any other user. This applies to the
> > existing group/container path as well.
> >
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/pci/vfio_pci_core.c | 111 +++++++++++++++++++++++-------
> -
> >  drivers/vfio/vfio.h              |  11 +++
> >  include/uapi/linux/vfio.h        |  16 +++++
> >  3 files changed, 109 insertions(+), 29 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c
> b/drivers/vfio/pci/vfio_pci_core.c
> > index 1bf54beeaef2..e0ebe55b4df0 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -27,11 +27,13 @@
> >  #include <linux/vgaarb.h>
> >  #include <linux/nospec.h>
> >  #include <linux/sched/mm.h>
> > +#include <linux/iommufd.h>
> 
> Is this needed anymore?

No more. Will remove it.

> >  #if IS_ENABLED(CONFIG_EEH)
> >  #include <asm/eeh.h>
> >  #endif
> >
> >  #include "vfio_pci_priv.h"
> > +#include "../vfio.h"
> 
> Don't do this, put vfio_device_iommufd() in the normal public header

Ok. will put it in include/linux/vfio.h

> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 0552e8dcf0cb..4bf11ee8de53 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -673,6 +673,22 @@ struct vfio_pci_hot_reset_info {
> >   * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> >   *				    struct vfio_pci_hot_reset)
> >   *
> > + * Userspace requests hot reset for the devices it uses.  Due to the
> > + * underlying topology, multiple devices may be affected in the reset.
> > + * The affected devices may have been opened by the user or by other
> > + * users or not opened yet.  Only when all the affected devices are
> > + * either opened by the current user or not opened by any user, should
> > + * the reset request be allowed.  Otherwise, this request is expected
> > + * to return error.
> > + *
> > + * If the user uses group and container interface, it should pass down
> > + * a set of group fds for ownership check.  If the user uses iommufd, it
> > + * should pass down a zero-length group_fds array to indicate the kernel
> > + * to use the bound iommufd for the ownership check.  User that uses
> the
> > + * vfio iommufd compatible mode can also pass down a zero-length
> group_fds
> > + * array as this mode uses iommufd in kernel, and there is no reason to
> > + * forbide it.
> 
> 'forbid'

Oh, yes. will correct it.

Regards,
Yi Liu
Yi Liu March 2, 2023, 6:07 a.m. UTC | #3
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, February 27, 2023 7:11 PM
[...]
> @@ -2392,13 +2416,25 @@ static int
> vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
>  	return ret;
>  }
> 
> +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
> +				    struct iommufd_ctx *iommufd_ctx)
> +{
> +	struct iommufd_ctx *iommufd = vfio_device_iommufd(&vdev-
> >vdev);
> +
> +	if (!iommufd)
> +		return false;
> +
> +	return iommufd == iommufd_ctx;
> +}
> +
>  /*
>   * We need to get memory_lock for each device, but devices can share
> mmap_lock,
>   * therefore we need to zap and hold the vma_lock for each device, and
> only then
>   * get each memory_lock.
>   */
>  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> -				      struct vfio_pci_group_info *groups)
> +				      struct vfio_pci_group_info *groups,
> +				      struct iommufd_ctx *iommufd_ctx)
>  {
>  	struct vfio_pci_core_device *cur_mem;
>  	struct vfio_pci_core_device *cur_vma;
> @@ -2429,10 +2465,27 @@ static int vfio_pci_dev_set_hot_reset(struct
> vfio_device_set *dev_set,
> 
>  	list_for_each_entry(cur_vma, &dev_set->device_list,
> vdev.dev_set_list) {
>  		/*
> -		 * Test whether all the affected devices are contained by
> the
> -		 * set of groups provided by the user.
> +		 * Test whether all the affected devices can be reset by the
> +		 * user.  The affected devices may already been opened or
> not
> +		 * yet.
> +		 *
> +		 * For the devices not opened yet, user can reset them. The
> +		 * reason is that the hot reset is done under the protection
> +		 * of the dev_set->lock, and device open is also under this
> +		 * lock.  During the hot reset, such devices can not be
> opened
> +		 * by other users.
> +		 *
> +		 * For the devices that have been opened, needs to check
> the
> +		 * ownership.  If the user provides a set of group fds, the
> +		 * ownership check is done by checking if all the opened
> +		 * devices are contained by the groups.  If the user provides
> +		 * a zero-length fd array, the ownerhsip check is done by
> +		 * checking if all the opened devices are bound to the same
> +		 * iommufd_ctx.
>  		 */
> -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> +		if (cur_vma->vdev.open_count &&
> +		    !vfio_dev_in_groups(cur_vma, groups) &&
> +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx)) {

Hi Alex, Jason,

There is one concern on this approach which is related to the
cdev noiommu mode. As patch 16 of this series, cdev path
supports noiommu mode by passing a negative iommufd to
kernel. In such case, the vfio_device is not bound to a valid
iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
to be broken.

An idea is to add a cdev_noiommu flag in vfio_device, when
checking the iommufd_ictx, also check this flag. If all the opened
devices in the dev_set have vfio_device->cdev_noiommu==true,
then the reset is considered to be doable. But there is a special
case. If devices in this dev_set are opened by two applications
that operates in cdev noiommu mode, then this logic is not able
to differentiate them. In that case, should we allow the reset?
It seems to ok to allow reset since noiommu mode itself means
no security between the applications that use it. thoughts?

>  			ret = -EINVAL;
>  			goto err_undo;
>  		}
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 2e3cb284711d..64e862a02dad 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -225,6 +225,11 @@ static inline void vfio_container_cleanup(void)
>  #if IS_ENABLED(CONFIG_IOMMUFD)
>  int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx
> *ictx);
>  void vfio_iommufd_unbind(struct vfio_device *device);
> +static inline struct iommufd_ctx *
> +vfio_device_iommufd(struct vfio_device *device)
> +{
> +	return device->iommufd_ictx;
> +}
>  #else

Regards,
Yi Liu
Tian, Kevin March 2, 2023, 9:55 a.m. UTC | #4
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, March 2, 2023 2:07 PM
> 
> > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > +		if (cur_vma->vdev.open_count &&
> > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx)) {
> 
> Hi Alex, Jason,
> 
> There is one concern on this approach which is related to the
> cdev noiommu mode. As patch 16 of this series, cdev path
> supports noiommu mode by passing a negative iommufd to
> kernel. In such case, the vfio_device is not bound to a valid
> iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> to be broken.
> 
> An idea is to add a cdev_noiommu flag in vfio_device, when
> checking the iommufd_ictx, also check this flag. If all the opened
> devices in the dev_set have vfio_device->cdev_noiommu==true,
> then the reset is considered to be doable. But there is a special
> case. If devices in this dev_set are opened by two applications
> that operates in cdev noiommu mode, then this logic is not able
> to differentiate them. In that case, should we allow the reset?
> It seems to ok to allow reset since noiommu mode itself means
> no security between the applications that use it. thoughts?
> 

Probably we need still pass in a valid iommufd (instead of using
a negative value) in noiommu case to mark the ownership so the
check in the reset path can correctly catch whether an opened
device belongs to this user.

That implies we may instead use a flag bit to mark NOIOMMU
mode and in the kernel also has a noiommu flag in device
file to differentiate it from normal case.
Jason Gunthorpe March 2, 2023, 12:35 p.m. UTC | #5
On Thu, Mar 02, 2023 at 09:55:46AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, March 2, 2023 2:07 PM
> > 
> > > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > > +		if (cur_vma->vdev.open_count &&
> > > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > > +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx)) {
> > 
> > Hi Alex, Jason,
> > 
> > There is one concern on this approach which is related to the
> > cdev noiommu mode. As patch 16 of this series, cdev path
> > supports noiommu mode by passing a negative iommufd to
> > kernel. In such case, the vfio_device is not bound to a valid
> > iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> > to be broken.
> > 
> > An idea is to add a cdev_noiommu flag in vfio_device, when
> > checking the iommufd_ictx, also check this flag. If all the opened
> > devices in the dev_set have vfio_device->cdev_noiommu==true,
> > then the reset is considered to be doable. But there is a special
> > case. If devices in this dev_set are opened by two applications
> > that operates in cdev noiommu mode, then this logic is not able
> > to differentiate them. In that case, should we allow the reset?
> > It seems to ok to allow reset since noiommu mode itself means
> > no security between the applications that use it. thoughts?
> > 
> 
> Probably we need still pass in a valid iommufd (instead of using
> a negative value) in noiommu case to mark the ownership so the
> check in the reset path can correctly catch whether an opened
> device belongs to this user.

There should be no iommufd at all in no-iommu mode

Adding one just to deal with noiommu reset seems pretty sad :\

no-iommu is only really used by dpdk, and it doesn't invoke
VFIO_DEVICE_PCI_HOT_RESET at all.

I'd say as long as VFIO_DEVICE_PCI_HOT_RESET works if only one vfio
device is open using a empty list (eg we should ensure that the
invoking cdev itself is allowed) then I think it is OK.

Jason
Yi Liu March 2, 2023, 2:20 p.m. UTC | #6
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, March 2, 2023 8:35 PM
> 
> On Thu, Mar 02, 2023 at 09:55:46AM +0000, Tian, Kevin wrote:
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Thursday, March 2, 2023 2:07 PM
> > >
> > > > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > > > +		if (cur_vma->vdev.open_count &&
> > > > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > > > +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx)) {
> > >
> > > Hi Alex, Jason,
> > >
> > > There is one concern on this approach which is related to the
> > > cdev noiommu mode. As patch 16 of this series, cdev path
> > > supports noiommu mode by passing a negative iommufd to
> > > kernel. In such case, the vfio_device is not bound to a valid
> > > iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> > > to be broken.
> > >
> > > An idea is to add a cdev_noiommu flag in vfio_device, when
> > > checking the iommufd_ictx, also check this flag. If all the opened
> > > devices in the dev_set have vfio_device->cdev_noiommu==true,
> > > then the reset is considered to be doable. But there is a special
> > > case. If devices in this dev_set are opened by two applications
> > > that operates in cdev noiommu mode, then this logic is not able
> > > to differentiate them. In that case, should we allow the reset?
> > > It seems to ok to allow reset since noiommu mode itself means
> > > no security between the applications that use it. thoughts?
> > >
> >
> > Probably we need still pass in a valid iommufd (instead of using
> > a negative value) in noiommu case to mark the ownership so the
> > check in the reset path can correctly catch whether an opened
> > device belongs to this user.
> 
> There should be no iommufd at all in no-iommu mode
> 
> Adding one just to deal with noiommu reset seems pretty sad :\
> 
> no-iommu is only really used by dpdk, and it doesn't invoke
> VFIO_DEVICE_PCI_HOT_RESET at all.

Does it happen to be or by design, this ioctl is not needed by dpdk?

> I'd say as long as VFIO_DEVICE_PCI_HOT_RESET works if only one vfio
> device is open using a empty list (eg we should ensure that the
> invoking cdev itself is allowed) then I think it is OK.

Sorry, which empty list are your referring?

Regards,
Yi Liu
Alex Williamson March 2, 2023, 9:04 p.m. UTC | #7
On Thu, 2 Mar 2023 06:07:04 +0000
"Liu, Yi L" <yi.l.liu@intel.com> wrote:

> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, February 27, 2023 7:11 PM  
> [...]
> > @@ -2392,13 +2416,25 @@ static int
> > vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
> >  	return ret;
> >  }
> > 
> > +static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
> > +				    struct iommufd_ctx *iommufd_ctx)
> > +{
> > +	struct iommufd_ctx *iommufd = vfio_device_iommufd(&vdev-  
> > >vdev);  
> > +
> > +	if (!iommufd)
> > +		return false;
> > +
> > +	return iommufd == iommufd_ctx;
> > +}
> > +
> >  /*
> >   * We need to get memory_lock for each device, but devices can share
> > mmap_lock,
> >   * therefore we need to zap and hold the vma_lock for each device, and
> > only then
> >   * get each memory_lock.
> >   */
> >  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
> > -				      struct vfio_pci_group_info *groups)
> > +				      struct vfio_pci_group_info *groups,
> > +				      struct iommufd_ctx *iommufd_ctx)
> >  {
> >  	struct vfio_pci_core_device *cur_mem;
> >  	struct vfio_pci_core_device *cur_vma;
> > @@ -2429,10 +2465,27 @@ static int vfio_pci_dev_set_hot_reset(struct
> > vfio_device_set *dev_set,
> > 
> >  	list_for_each_entry(cur_vma, &dev_set->device_list,
> > vdev.dev_set_list) {
> >  		/*
> > -		 * Test whether all the affected devices are contained by
> > the
> > -		 * set of groups provided by the user.
> > +		 * Test whether all the affected devices can be reset by the
> > +		 * user.  The affected devices may already been opened or
> > not
> > +		 * yet.
> > +		 *
> > +		 * For the devices not opened yet, user can reset them. The
> > +		 * reason is that the hot reset is done under the protection
> > +		 * of the dev_set->lock, and device open is also under this
> > +		 * lock.  During the hot reset, such devices can not be
> > opened
> > +		 * by other users.
> > +		 *
> > +		 * For the devices that have been opened, needs to check
> > the
> > +		 * ownership.  If the user provides a set of group fds, the
> > +		 * ownership check is done by checking if all the opened
> > +		 * devices are contained by the groups.  If the user provides
> > +		 * a zero-length fd array, the ownerhsip check is done by
> > +		 * checking if all the opened devices are bound to the same
> > +		 * iommufd_ctx.
> >  		 */
> > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > +		if (cur_vma->vdev.open_count &&
> > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > +		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx)) {  
> 
> Hi Alex, Jason,
> 
> There is one concern on this approach which is related to the
> cdev noiommu mode. As patch 16 of this series, cdev path
> supports noiommu mode by passing a negative iommufd to
> kernel. In such case, the vfio_device is not bound to a valid
> iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> to be broken.
> 
> An idea is to add a cdev_noiommu flag in vfio_device, when
> checking the iommufd_ictx, also check this flag. If all the opened
> devices in the dev_set have vfio_device->cdev_noiommu==true,
> then the reset is considered to be doable. But there is a special
> case. If devices in this dev_set are opened by two applications
> that operates in cdev noiommu mode, then this logic is not able
> to differentiate them. In that case, should we allow the reset?
> It seems to ok to allow reset since noiommu mode itself means
> no security between the applications that use it. thoughts?

I don't think the existing vulnerabilities of no-iommu mode should be
carte blanche to add additional weaknesses.  Thanks,

Alex
Tian, Kevin March 3, 2023, 6:36 a.m. UTC | #8
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Thursday, March 2, 2023 10:20 PM
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Thursday, March 2, 2023 8:35 PM
> >
> > On Thu, Mar 02, 2023 at 09:55:46AM +0000, Tian, Kevin wrote:
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Thursday, March 2, 2023 2:07 PM
> > > >
> > > > > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > > > > +		if (cur_vma->vdev.open_count &&
> > > > > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > > > > +		    !vfio_dev_in_iommufd_ctx(cur_vma,
> iommufd_ctx)) {
> > > >
> > > > Hi Alex, Jason,
> > > >
> > > > There is one concern on this approach which is related to the
> > > > cdev noiommu mode. As patch 16 of this series, cdev path
> > > > supports noiommu mode by passing a negative iommufd to
> > > > kernel. In such case, the vfio_device is not bound to a valid
> > > > iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> > > > to be broken.
> > > >
> > > > An idea is to add a cdev_noiommu flag in vfio_device, when
> > > > checking the iommufd_ictx, also check this flag. If all the opened
> > > > devices in the dev_set have vfio_device->cdev_noiommu==true,
> > > > then the reset is considered to be doable. But there is a special
> > > > case. If devices in this dev_set are opened by two applications
> > > > that operates in cdev noiommu mode, then this logic is not able
> > > > to differentiate them. In that case, should we allow the reset?
> > > > It seems to ok to allow reset since noiommu mode itself means
> > > > no security between the applications that use it. thoughts?
> > > >
> > >
> > > Probably we need still pass in a valid iommufd (instead of using
> > > a negative value) in noiommu case to mark the ownership so the
> > > check in the reset path can correctly catch whether an opened
> > > device belongs to this user.
> >
> > There should be no iommufd at all in no-iommu mode
> >
> > Adding one just to deal with noiommu reset seems pretty sad :\
> >
> > no-iommu is only really used by dpdk, and it doesn't invoke
> > VFIO_DEVICE_PCI_HOT_RESET at all.
> 
> Does it happen to be or by design, this ioctl is not needed by dpdk?

use of noiommu should be discouraged.

if only known noiommu user doesn't use it then having certain
new restriction for noiommu in the hot reset path might be an
acceptable tradeoff.

but again needs Alex's input as he knows all the history about
noiommu. 
Alex Williamson March 3, 2023, 4:55 p.m. UTC | #9
On Fri, 3 Mar 2023 06:36:35 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Thursday, March 2, 2023 10:20 PM
> >   
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Thursday, March 2, 2023 8:35 PM
> > >
> > > On Thu, Mar 02, 2023 at 09:55:46AM +0000, Tian, Kevin wrote:  
> > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Sent: Thursday, March 2, 2023 2:07 PM
> > > > >  
> > > > > > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > > > > > +		if (cur_vma->vdev.open_count &&
> > > > > > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > > > > > +		    !vfio_dev_in_iommufd_ctx(cur_vma,  
> > iommufd_ctx)) {  
> > > > >
> > > > > Hi Alex, Jason,
> > > > >
> > > > > There is one concern on this approach which is related to the
> > > > > cdev noiommu mode. As patch 16 of this series, cdev path
> > > > > supports noiommu mode by passing a negative iommufd to
> > > > > kernel. In such case, the vfio_device is not bound to a valid
> > > > > iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> > > > > to be broken.
> > > > >
> > > > > An idea is to add a cdev_noiommu flag in vfio_device, when
> > > > > checking the iommufd_ictx, also check this flag. If all the opened
> > > > > devices in the dev_set have vfio_device->cdev_noiommu==true,
> > > > > then the reset is considered to be doable. But there is a special
> > > > > case. If devices in this dev_set are opened by two applications
> > > > > that operates in cdev noiommu mode, then this logic is not able
> > > > > to differentiate them. In that case, should we allow the reset?
> > > > > It seems to ok to allow reset since noiommu mode itself means
> > > > > no security between the applications that use it. thoughts?
> > > > >  
> > > >
> > > > Probably we need still pass in a valid iommufd (instead of using
> > > > a negative value) in noiommu case to mark the ownership so the
> > > > check in the reset path can correctly catch whether an opened
> > > > device belongs to this user.  
> > >
> > > There should be no iommufd at all in no-iommu mode
> > >
> > > Adding one just to deal with noiommu reset seems pretty sad :\
> > >
> > > no-iommu is only really used by dpdk, and it doesn't invoke
> > > VFIO_DEVICE_PCI_HOT_RESET at all.  
> > 
> > Does it happen to be or by design, this ioctl is not needed by dpdk?  

I can't think of a reason DPDK couldn't use hot-reset.  If we want to
make it a policy, it should be enforced by code, but creating that
policy based on a difficulty in supporting that mode with iommufd isn't
great.
 
> use of noiommu should be discouraged.
> 
> if only known noiommu user doesn't use it then having certain
> new restriction for noiommu in the hot reset path might be an
> acceptable tradeoff.
> 
> but again needs Alex's input as he knows all the history about
> noiommu. 
Yi Liu March 5, 2023, 2:48 p.m. UTC | #10
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Saturday, March 4, 2023 12:56 AM
> 
> On Fri, 3 Mar 2023 06:36:35 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Thursday, March 2, 2023 10:20 PM
> > >
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Thursday, March 2, 2023 8:35 PM
> > > >
> > > > On Thu, Mar 02, 2023 at 09:55:46AM +0000, Tian, Kevin wrote:
> > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > Sent: Thursday, March 2, 2023 2:07 PM
> > > > > >
> > > > > > > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > > > > > > +		if (cur_vma->vdev.open_count &&
> > > > > > > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > > > > > > +		    !vfio_dev_in_iommufd_ctx(cur_vma,
> > > iommufd_ctx)) {
> > > > > >
> > > > > > Hi Alex, Jason,
> > > > > >
> > > > > > There is one concern on this approach which is related to the
> > > > > > cdev noiommu mode. As patch 16 of this series, cdev path
> > > > > > supports noiommu mode by passing a negative iommufd to
> > > > > > kernel. In such case, the vfio_device is not bound to a valid
> > > > > > iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> > > > > > to be broken.
> > > > > >
> > > > > > An idea is to add a cdev_noiommu flag in vfio_device, when
> > > > > > checking the iommufd_ictx, also check this flag. If all the opened
> > > > > > devices in the dev_set have vfio_device->cdev_noiommu==true,
> > > > > > then the reset is considered to be doable. But there is a special
> > > > > > case. If devices in this dev_set are opened by two applications
> > > > > > that operates in cdev noiommu mode, then this logic is not able
> > > > > > to differentiate them. In that case, should we allow the reset?
> > > > > > It seems to ok to allow reset since noiommu mode itself means
> > > > > > no security between the applications that use it. thoughts?
> > > > > >
> > > > >
> > > > > Probably we need still pass in a valid iommufd (instead of using
> > > > > a negative value) in noiommu case to mark the ownership so the
> > > > > check in the reset path can correctly catch whether an opened
> > > > > device belongs to this user.
> > > >
> > > > There should be no iommufd at all in no-iommu mode
> > > >
> > > > Adding one just to deal with noiommu reset seems pretty sad :\
> > > >
> > > > no-iommu is only really used by dpdk, and it doesn't invoke
> > > > VFIO_DEVICE_PCI_HOT_RESET at all.
> > >
> > > Does it happen to be or by design, this ioctl is not needed by dpdk?
> 
> I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> make it a policy, it should be enforced by code, but creating that
> policy based on a difficulty in supporting that mode with iommufd isn't
> great.

Makes sense. A userspace driver should have the chance to reset
device.

> 
> > use of noiommu should be discouraged.
> >
> > if only known noiommu user doesn't use it then having certain
> > new restriction for noiommu in the hot reset path might be an
> > acceptable tradeoff.
> >
> > but again needs Alex's input as he knows all the history about
> > noiommu. 
Tian, Kevin March 6, 2023, 8:16 a.m. UTC | #11
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Sunday, March 5, 2023 10:49 PM
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Saturday, March 4, 2023 12:56 AM
> >
> > On Fri, 3 Mar 2023 06:36:35 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >
> > > use of noiommu should be discouraged.
> > >
> > > if only known noiommu user doesn't use it then having certain
> > > new restriction for noiommu in the hot reset path might be an
> > > acceptable tradeoff.
> > >
> > > but again needs Alex's input as he knows all the history about
> > > noiommu. 
Tian, Kevin March 6, 2023, 8:23 a.m. UTC | #12
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Monday, March 6, 2023 4:17 PM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Sunday, March 5, 2023 10:49 PM
> >
> >
> > How about falling back to prior solution. Allow userspace to pass a set
> > of device fd, and the kernel just checks the opened devices in the dev_set,
> > all the opened devices should be included in the device fd set. If not all
> > of them are included, fail it.
> >
> 
> looks this is a cleaner approach.
> 
> if a device is not opened we know it's safe to reset.
> 
> If a device is opened then it must be opened by the calling process to be
> reset.
> 
> from this angle we don't need to bother with noiommu vs. iommufd
> when iommufd is not always available.

btw there is one thing to be fixed in your next version.

noiommu shouldn't be enabled on a device which always has a iommu group.

We need a check on iommu_group in following place:

+	/* iommufd < 0 means noiommu mode */
+	if (bind.iommufd < 0) {
+		if (!capable(CAP_SYS_RAWIO)) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+		df->noiommu = true;
Yi Liu March 6, 2023, 8:33 a.m. UTC | #13
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Monday, March 6, 2023 4:23 PM
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Monday, March 6, 2023 4:17 PM
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Sunday, March 5, 2023 10:49 PM
> > >
> > >
> > > How about falling back to prior solution. Allow userspace to pass a set
> > > of device fd, and the kernel just checks the opened devices in the
> dev_set,
> > > all the opened devices should be included in the device fd set. If not all
> > > of them are included, fail it.
> > >
> >
> > looks this is a cleaner approach.
> >
> > if a device is not opened we know it's safe to reset.
> >
> > If a device is opened then it must be opened by the calling process to be
> > reset.
> >
> > from this angle we don't need to bother with noiommu vs. iommufd
> > when iommufd is not always available.
> 
> btw there is one thing to be fixed in your next version.
> 
> noiommu shouldn't be enabled on a device which always has a iommu
> group.
> 
> We need a check on iommu_group in following place:
> 
> +	/* iommufd < 0 means noiommu mode */
> +	if (bind.iommufd < 0) {
> +		if (!capable(CAP_SYS_RAWIO)) {
> +			ret = -EPERM;
> +			goto out_unlock;
> +		}
> +		df->noiommu = true;

Yes. it is. If there is iommu in the system, noiommu mode is not available.
Checking iommu_group presence could detect it. 
Yi Liu March 6, 2023, 9:59 a.m. UTC | #14
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Sunday, March 5, 2023 10:49 PM
> 
> > From: Alex Williamson <alex.williamson@redhat.com>
> > Sent: Saturday, March 4, 2023 12:56 AM
> >
> > On Fri, 3 Mar 2023 06:36:35 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Thursday, March 2, 2023 10:20 PM
> > > >
> > > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > > Sent: Thursday, March 2, 2023 8:35 PM
> > > > >
> > > > > On Thu, Mar 02, 2023 at 09:55:46AM +0000, Tian, Kevin wrote:
> > > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > > Sent: Thursday, March 2, 2023 2:07 PM
> > > > > > >
> > > > > > > > -		if (!vfio_dev_in_groups(cur_vma, groups)) {
> > > > > > > > +		if (cur_vma->vdev.open_count &&
> > > > > > > > +		    !vfio_dev_in_groups(cur_vma, groups) &&
> > > > > > > > +		    !vfio_dev_in_iommufd_ctx(cur_vma,
> > > > iommufd_ctx)) {
> > > > > > >
> > > > > > > Hi Alex, Jason,
> > > > > > >
> > > > > > > There is one concern on this approach which is related to the
> > > > > > > cdev noiommu mode. As patch 16 of this series, cdev path
> > > > > > > supports noiommu mode by passing a negative iommufd to
> > > > > > > kernel. In such case, the vfio_device is not bound to a valid
> > > > > > > iommufd. Then the check in vfio_dev_in_iommufd_ctx() is
> > > > > > > to be broken.
> > > > > > >
> > > > > > > An idea is to add a cdev_noiommu flag in vfio_device, when
> > > > > > > checking the iommufd_ictx, also check this flag. If all the opened
> > > > > > > devices in the dev_set have vfio_device->cdev_noiommu==true,
> > > > > > > then the reset is considered to be doable. But there is a special
> > > > > > > case. If devices in this dev_set are opened by two applications
> > > > > > > that operates in cdev noiommu mode, then this logic is not able
> > > > > > > to differentiate them. In that case, should we allow the reset?
> > > > > > > It seems to ok to allow reset since noiommu mode itself means
> > > > > > > no security between the applications that use it. thoughts?
> > > > > > >
> > > > > >
> > > > > > Probably we need still pass in a valid iommufd (instead of using
> > > > > > a negative value) in noiommu case to mark the ownership so the
> > > > > > check in the reset path can correctly catch whether an opened
> > > > > > device belongs to this user.
> > > > >
> > > > > There should be no iommufd at all in no-iommu mode
> > > > >
> > > > > Adding one just to deal with noiommu reset seems pretty sad :\
> > > > >
> > > > > no-iommu is only really used by dpdk, and it doesn't invoke
> > > > > VFIO_DEVICE_PCI_HOT_RESET at all.
> > > >
> > > > Does it happen to be or by design, this ioctl is not needed by dpdk?
> >
> > I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> > make it a policy, it should be enforced by code, but creating that
> > policy based on a difficulty in supporting that mode with iommufd isn't
> > great.
> 
> Makes sense. A userspace driver should have the chance to reset
> device.
> 
> >
> > > use of noiommu should be discouraged.
> > >
> > > if only known noiommu user doesn't use it then having certain
> > > new restriction for noiommu in the hot reset path might be an
> > > acceptable tradeoff.
> > >
> > > but again needs Alex's input as he knows all the history about
> > > noiommu. 
Jason Gunthorpe March 6, 2023, 1:16 p.m. UTC | #15
On Fri, Mar 03, 2023 at 09:55:42AM -0700, Alex Williamson wrote:

> I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> make it a policy, it should be enforced by code, but creating that
> policy based on a difficulty in supporting that mode with iommufd isn't
> great.

On the other hand adding code to allow device FDs in the hot reset
path that is never used and never tested isn't great either..

hot-reset does work for DPDK, it just doesn't work in the case where
DPDK would have many VFIO devices open and they have overlapping
device sets. Which, again, is something it doesn't do.

IMHO we should leave it out of the kernel and wait for a no-iommu user
to come forward that wants hot-reset of many devices. Then we can add
and test the device FD part. Most likely such a thing will never come
at this point.

Jason
Tian, Kevin March 7, 2023, 2:31 a.m. UTC | #16
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, March 6, 2023 9:17 PM
> 
> On Fri, Mar 03, 2023 at 09:55:42AM -0700, Alex Williamson wrote:
> 
> > I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> > make it a policy, it should be enforced by code, but creating that
> > policy based on a difficulty in supporting that mode with iommufd isn't
> > great.
> 
> On the other hand adding code to allow device FDs in the hot reset
> path that is never used and never tested isn't great either..
> 
> hot-reset does work for DPDK, it just doesn't work in the case where
> DPDK would have many VFIO devices open and they have overlapping
> device sets. Which, again, is something it doesn't do.
> 
> IMHO we should leave it out of the kernel and wait for a no-iommu user
> to come forward that wants hot-reset of many devices. Then we can add
> and test the device FD part. Most likely such a thing will never come
> at this point.
> 

I think we don't need to have this tradeoff if following Yi's last proposal
which requires every opened device in the set to be covered by the
device fd array. with dev_set->lock held in the reset/open path this is
a safe measure and fully contained in vfio-pci w/o need of further
checking noiommu or iommufd.

In the end same reset uAPI except the fd array can be device fd now. 
Yi Liu March 7, 2023, 2:35 a.m. UTC | #17
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Tuesday, March 7, 2023 10:31 AM
> 
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Monday, March 6, 2023 9:17 PM
> >
> > On Fri, Mar 03, 2023 at 09:55:42AM -0700, Alex Williamson wrote:
> >
> > > I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> > > make it a policy, it should be enforced by code, but creating that
> > > policy based on a difficulty in supporting that mode with iommufd isn't
> > > great.
> >
> > On the other hand adding code to allow device FDs in the hot reset
> > path that is never used and never tested isn't great either..
> >
> > hot-reset does work for DPDK, it just doesn't work in the case where
> > DPDK would have many VFIO devices open and they have overlapping
> > device sets. Which, again, is something it doesn't do.
> >
> > IMHO we should leave it out of the kernel and wait for a no-iommu user
> > to come forward that wants hot-reset of many devices. Then we can add
> > and test the device FD part. Most likely such a thing will never come
> > at this point.
> >
> 
> I think we don't need to have this tradeoff if following Yi's last proposal
> which requires every opened device in the set to be covered by the
> device fd array. with dev_set->lock held in the reset/open path this is
> a safe measure and fully contained in vfio-pci w/o need of further
> checking noiommu or iommufd.
> 
> In the end same reset uAPI except the fd array can be device fd now. 
Jason Gunthorpe March 7, 2023, 12:36 p.m. UTC | #18
On Tue, Mar 07, 2023 at 02:31:11AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Monday, March 6, 2023 9:17 PM
> > 
> > On Fri, Mar 03, 2023 at 09:55:42AM -0700, Alex Williamson wrote:
> > 
> > > I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> > > make it a policy, it should be enforced by code, but creating that
> > > policy based on a difficulty in supporting that mode with iommufd isn't
> > > great.
> > 
> > On the other hand adding code to allow device FDs in the hot reset
> > path that is never used and never tested isn't great either..
> > 
> > hot-reset does work for DPDK, it just doesn't work in the case where
> > DPDK would have many VFIO devices open and they have overlapping
> > device sets. Which, again, is something it doesn't do.
> > 
> > IMHO we should leave it out of the kernel and wait for a no-iommu user
> > to come forward that wants hot-reset of many devices. Then we can add
> > and test the device FD part. Most likely such a thing will never come
> > at this point.
> > 
> 
> I think we don't need to have this tradeoff if following Yi's last proposal
> which requires every opened device in the set to be covered by the
> device fd array. with dev_set->lock held in the reset/open path this is
> a safe measure and fully contained in vfio-pci w/o need of further
> checking noiommu or iommufd.

I really prefer the 'use the iommufd option' still exist, it is so
much cleaner and easier for the actual users of this API. We've lost
the point by worrying about no iommu.

Jason
Yi Liu March 7, 2023, 1:28 p.m. UTC | #19
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, March 7, 2023 8:37 PM
> 
> On Tue, Mar 07, 2023 at 02:31:11AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Monday, March 6, 2023 9:17 PM
> > >
> > > On Fri, Mar 03, 2023 at 09:55:42AM -0700, Alex Williamson wrote:
> > >
> > > > I can't think of a reason DPDK couldn't use hot-reset.  If we want to
> > > > make it a policy, it should be enforced by code, but creating that
> > > > policy based on a difficulty in supporting that mode with iommufd isn't
> > > > great.
> > >
> > > On the other hand adding code to allow device FDs in the hot reset
> > > path that is never used and never tested isn't great either..
> > >
> > > hot-reset does work for DPDK, it just doesn't work in the case where
> > > DPDK would have many VFIO devices open and they have overlapping
> > > device sets. Which, again, is something it doesn't do.
> > >
> > > IMHO we should leave it out of the kernel and wait for a no-iommu user
> > > to come forward that wants hot-reset of many devices. Then we can
> add
> > > and test the device FD part. Most likely such a thing will never come
> > > at this point.
> > >
> >
> > I think we don't need to have this tradeoff if following Yi's last proposal
> > which requires every opened device in the set to be covered by the
> > device fd array. with dev_set->lock held in the reset/open path this is
> > a safe measure and fully contained in vfio-pci w/o need of further
> > checking noiommu or iommufd.
> 
> I really prefer the 'use the iommufd option' still exist, it is so
> much cleaner and easier for the actual users of this API. We've lost
> the point by worrying about no iommu.

Hmmm, so you are suggesting to have both the device fd approach
and the zero-length array approach, let user to select the best way
based on their wisdom. Is it? how about something like below in the
uapi header.

/**
 * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
 *                                  struct vfio_pci_hot_reset)
 *
 * Userspace requests hot reset for the devices it uses.  Due to the
 * underlying topology, multiple devices may be affected in the reset.
 * The affected devices may have been opened by the user or by other
 * users or not opened yet.  Only when all the affected devices are
 * either opened by the current user or not opened by any user, should
 * the reset request be allowed.  Otherwise, this request is expected
 * to return error. group_fds array can accept either group fds or
 * device fds.  Users using iommufd (valid fd), could also passing a
 * zero-length group_fds array to indicate using the bound iommufd_ctx
 * for ownership check to the affected devices that are opened.
 *
 * Return: 0 on success, -errno on failure.
 */
struct vfio_pci_hot_reset {
        __u32   argsz;
        __u32   flags;
        __u32   count;
        __s32   group_fds[];
};

Regards,
Yi Liu
Tian, Kevin March 8, 2023, 7:26 a.m. UTC | #20
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Tuesday, March 7, 2023 9:29 PM
> 
> >
> > I really prefer the 'use the iommufd option' still exist, it is so
> > much cleaner and easier for the actual users of this API. We've lost
> > the point by worrying about no iommu.
> 
> Hmmm, so you are suggesting to have both the device fd approach
> and the zero-length array approach, let user to select the best way
> based on their wisdom. Is it? how about something like below in the
> uapi header.
> 
> /**
>  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
>  *                                  struct vfio_pci_hot_reset)
>  *
>  * Userspace requests hot reset for the devices it uses.  Due to the
>  * underlying topology, multiple devices may be affected in the reset.
>  * The affected devices may have been opened by the user or by other
>  * users or not opened yet.  Only when all the affected devices are
>  * either opened by the current user or not opened by any user, should
>  * the reset request be allowed.  Otherwise, this request is expected
>  * to return error. group_fds array can accept either group fds or
>  * device fds.  Users using iommufd (valid fd), could also passing a
>  * zero-length group_fds array to indicate using the bound iommufd_ctx
>  * for ownership check to the affected devices that are opened.
>  *
>  * Return: 0 on success, -errno on failure.
>  */
> struct vfio_pci_hot_reset {
>         __u32   argsz;
>         __u32   flags;
>         __u32   count;
>         __s32   group_fds[];
> };
> 

 * Userspace requests hot reset for the devices it uses.  Due to the
 * underlying topology, multiple devices can be affected in the reset
 * while some might be opened by another user. To avoid interference
 * the calling user must ensure all affected devices, if opened, are
 * owned by itself.
 *
 * The ownership can be proved in three ways:
 *   - An array of group fds
 *   - An array of device fds
 *   - A zero-length array
 *
 * In the last case all affected devices which are opened by this user must
 * have been bound to a same iommufd_ctx.

and with this change let's rename 'group_fds'  to 'fds'
Yi Liu March 8, 2023, 7:47 a.m. UTC | #21
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Wednesday, March 8, 2023 3:26 PM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Tuesday, March 7, 2023 9:29 PM
> >
> > >
> > > I really prefer the 'use the iommufd option' still exist, it is so
> > > much cleaner and easier for the actual users of this API. We've lost
> > > the point by worrying about no iommu.
> >
> > Hmmm, so you are suggesting to have both the device fd approach
> > and the zero-length array approach, let user to select the best way
> > based on their wisdom. Is it? how about something like below in the
> > uapi header.
> >
> > /**
> >  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> >  *                                  struct vfio_pci_hot_reset)
> >  *
> >  * Userspace requests hot reset for the devices it uses.  Due to the
> >  * underlying topology, multiple devices may be affected in the reset.
> >  * The affected devices may have been opened by the user or by other
> >  * users or not opened yet.  Only when all the affected devices are
> >  * either opened by the current user or not opened by any user, should
> >  * the reset request be allowed.  Otherwise, this request is expected
> >  * to return error. group_fds array can accept either group fds or
> >  * device fds.  Users using iommufd (valid fd), could also passing a
> >  * zero-length group_fds array to indicate using the bound iommufd_ctx
> >  * for ownership check to the affected devices that are opened.
> >  *
> >  * Return: 0 on success, -errno on failure.
> >  */
> > struct vfio_pci_hot_reset {
> >         __u32   argsz;
> >         __u32   flags;
> >         __u32   count;
> >         __s32   group_fds[];
> > };
> >
> 
>  * Userspace requests hot reset for the devices it uses.  Due to the
>  * underlying topology, multiple devices can be affected in the reset
>  * while some might be opened by another user. To avoid interference
>  * the calling user must ensure all affected devices, if opened, are
>  * owned by itself.
>  *
>  * The ownership can be proved in three ways:
>  *   - An array of group fds
>  *   - An array of device fds
>  *   - A zero-length array
>  *
Thanks.
>  * In the last case all affected devices which are opened by this user must
>  * have been bound to a same iommufd_ctx.

I think we only allow it when this iommufd_ctx is valid. Is it? To
user, it means device should be bound to a positive iommufd.

> and with this change let's rename 'group_fds'  to 'fds'

Sure. It would be something like below:

struct vfio_pci_hot_reset {
	__u32   argsz;
	__u32   flags;
	_u32   count;
	union {
		__s32   group_fds[0];
		__s32   fds[0];
	};
};

Regards,
Yi Liu
Tian, Kevin March 8, 2023, 7:55 a.m. UTC | #22
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, March 8, 2023 3:47 PM
> 
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Wednesday, March 8, 2023 3:26 PM
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Tuesday, March 7, 2023 9:29 PM
> > >
> > > >
> > > > I really prefer the 'use the iommufd option' still exist, it is so
> > > > much cleaner and easier for the actual users of this API. We've lost
> > > > the point by worrying about no iommu.
> > >
> > > Hmmm, so you are suggesting to have both the device fd approach
> > > and the zero-length array approach, let user to select the best way
> > > based on their wisdom. Is it? how about something like below in the
> > > uapi header.
> > >
> > > /**
> > >  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> > >  *                                  struct vfio_pci_hot_reset)
> > >  *
> > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > >  * underlying topology, multiple devices may be affected in the reset.
> > >  * The affected devices may have been opened by the user or by other
> > >  * users or not opened yet.  Only when all the affected devices are
> > >  * either opened by the current user or not opened by any user, should
> > >  * the reset request be allowed.  Otherwise, this request is expected
> > >  * to return error. group_fds array can accept either group fds or
> > >  * device fds.  Users using iommufd (valid fd), could also passing a
> > >  * zero-length group_fds array to indicate using the bound iommufd_ctx
> > >  * for ownership check to the affected devices that are opened.
> > >  *
> > >  * Return: 0 on success, -errno on failure.
> > >  */
> > > struct vfio_pci_hot_reset {
> > >         __u32   argsz;
> > >         __u32   flags;
> > >         __u32   count;
> > >         __s32   group_fds[];
> > > };
> > >
> >
> >  * Userspace requests hot reset for the devices it uses.  Due to the
> >  * underlying topology, multiple devices can be affected in the reset
> >  * while some might be opened by another user. To avoid interference
> >  * the calling user must ensure all affected devices, if opened, are
> >  * owned by itself.
> >  *
> >  * The ownership can be proved in three ways:
> >  *   - An array of group fds
> >  *   - An array of device fds
> >  *   - A zero-length array
> >  *
> Thanks.
> >  * In the last case all affected devices which are opened by this user must
> >  * have been bound to a same iommufd_ctx.
> 
> I think we only allow it when this iommufd_ctx is valid. Is it? To
> user, it means device should be bound to a positive iommufd.

I didn't get it. Do we have a iommufd_ctx created but marked as
invalid?

> 
> > and with this change let's rename 'group_fds'  to 'fds'
> 
> Sure. It would be something like below:
> 
> struct vfio_pci_hot_reset {
> 	__u32   argsz;
> 	__u32   flags;
> 	_u32   count;
> 	union {
> 		__s32   group_fds[0];
> 		__s32   fds[0];
> 	};
> };
> 

why union? Just renaming should work. In the kernel we will first
check whether it's group, whether it's device, then compare
iommufd_ctx is zero length.
Yi Liu March 8, 2023, 8 a.m. UTC | #23
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Wednesday, March 8, 2023 3:55 PM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Wednesday, March 8, 2023 3:47 PM
> >
> > > From: Tian, Kevin <kevin.tian@intel.com>
> > > Sent: Wednesday, March 8, 2023 3:26 PM
> > >
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Tuesday, March 7, 2023 9:29 PM
> > > >
> > > > >
> > > > > I really prefer the 'use the iommufd option' still exist, it is so
> > > > > much cleaner and easier for the actual users of this API. We've lost
> > > > > the point by worrying about no iommu.
> > > >
> > > > Hmmm, so you are suggesting to have both the device fd approach
> > > > and the zero-length array approach, let user to select the best way
> > > > based on their wisdom. Is it? how about something like below in the
> > > > uapi header.
> > > >
> > > > /**
> > > >  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> > > >  *                                  struct vfio_pci_hot_reset)
> > > >  *
> > > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > > >  * underlying topology, multiple devices may be affected in the reset.
> > > >  * The affected devices may have been opened by the user or by
> other
> > > >  * users or not opened yet.  Only when all the affected devices are
> > > >  * either opened by the current user or not opened by any user,
> should
> > > >  * the reset request be allowed.  Otherwise, this request is expected
> > > >  * to return error. group_fds array can accept either group fds or
> > > >  * device fds.  Users using iommufd (valid fd), could also passing a
> > > >  * zero-length group_fds array to indicate using the bound
> iommufd_ctx
> > > >  * for ownership check to the affected devices that are opened.
> > > >  *
> > > >  * Return: 0 on success, -errno on failure.
> > > >  */
> > > > struct vfio_pci_hot_reset {
> > > >         __u32   argsz;
> > > >         __u32   flags;
> > > >         __u32   count;
> > > >         __s32   group_fds[];
> > > > };
> > > >
> > >
> > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > >  * underlying topology, multiple devices can be affected in the reset
> > >  * while some might be opened by another user. To avoid interference
> > >  * the calling user must ensure all affected devices, if opened, are
> > >  * owned by itself.
> > >  *
> > >  * The ownership can be proved in three ways:
> > >  *   - An array of group fds
> > >  *   - An array of device fds
> > >  *   - A zero-length array
> > >  *
> > Thanks.
> > >  * In the last case all affected devices which are opened by this user
> must
> > >  * have been bound to a same iommufd_ctx.
> >
> > I think we only allow it when this iommufd_ctx is valid. Is it? To
> > user, it means device should be bound to a positive iommufd.
> 
> I didn't get it. Do we have a iommufd_ctx created but marked as
> invalid?

I mean iommufd_ctx==NULL. If a negative iommufd is provided,
then kernel side only has a NULL iommufd_ctx. If so, the ownership
check just fail if it uses iommufd_ctx for ownership proof.

> 
> >
> > > and with this change let's rename 'group_fds'  to 'fds'
> >
> > Sure. It would be something like below:
> >
> > struct vfio_pci_hot_reset {
> > 	__u32   argsz;
> > 	__u32   flags;
> > 	_u32   count;
> > 	union {
> > 		__s32   group_fds[0];
> > 		__s32   fds[0];
> > 	};
> > };
> >
> 
> why union? Just renaming should work. In the kernel we will first
> check whether it's group, whether it's device, then compare
> iommufd_ctx is zero length.

this is for old qemus. However, since it's just a rename perhaps
it is not needed. The layout is not changed. If qemu imports the
new header file, it needs to update the group_fds in its code as
well.

Regards,
Yi Liu
Tian, Kevin March 8, 2023, 8:14 a.m. UTC | #24
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Wednesday, March 8, 2023 4:01 PM
> 
> > From: Tian, Kevin <kevin.tian@intel.com>
> > Sent: Wednesday, March 8, 2023 3:55 PM
> >
> > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > Sent: Wednesday, March 8, 2023 3:47 PM
> > >
> > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > Sent: Wednesday, March 8, 2023 3:26 PM
> > > >
> > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > Sent: Tuesday, March 7, 2023 9:29 PM
> > > > >
> > > > > >
> > > > > > I really prefer the 'use the iommufd option' still exist, it is so
> > > > > > much cleaner and easier for the actual users of this API. We've lost
> > > > > > the point by worrying about no iommu.
> > > > >
> > > > > Hmmm, so you are suggesting to have both the device fd approach
> > > > > and the zero-length array approach, let user to select the best way
> > > > > based on their wisdom. Is it? how about something like below in the
> > > > > uapi header.
> > > > >
> > > > > /**
> > > > >  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> > > > >  *                                  struct vfio_pci_hot_reset)
> > > > >  *
> > > > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > > > >  * underlying topology, multiple devices may be affected in the reset.
> > > > >  * The affected devices may have been opened by the user or by
> > other
> > > > >  * users or not opened yet.  Only when all the affected devices are
> > > > >  * either opened by the current user or not opened by any user,
> > should
> > > > >  * the reset request be allowed.  Otherwise, this request is expected
> > > > >  * to return error. group_fds array can accept either group fds or
> > > > >  * device fds.  Users using iommufd (valid fd), could also passing a
> > > > >  * zero-length group_fds array to indicate using the bound
> > iommufd_ctx
> > > > >  * for ownership check to the affected devices that are opened.
> > > > >  *
> > > > >  * Return: 0 on success, -errno on failure.
> > > > >  */
> > > > > struct vfio_pci_hot_reset {
> > > > >         __u32   argsz;
> > > > >         __u32   flags;
> > > > >         __u32   count;
> > > > >         __s32   group_fds[];
> > > > > };
> > > > >
> > > >
> > > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > > >  * underlying topology, multiple devices can be affected in the reset
> > > >  * while some might be opened by another user. To avoid interference
> > > >  * the calling user must ensure all affected devices, if opened, are
> > > >  * owned by itself.
> > > >  *
> > > >  * The ownership can be proved in three ways:
> > > >  *   - An array of group fds
> > > >  *   - An array of device fds
> > > >  *   - A zero-length array
> > > >  *
> > > Thanks.
> > > >  * In the last case all affected devices which are opened by this user
> > must
> > > >  * have been bound to a same iommufd_ctx.
> > >
> > > I think we only allow it when this iommufd_ctx is valid. Is it? To
> > > user, it means device should be bound to a positive iommufd.
> >
> > I didn't get it. Do we have a iommufd_ctx created but marked as
> > invalid?
> 
> I mean iommufd_ctx==NULL. If a negative iommufd is provided,
> then kernel side only has a NULL iommufd_ctx. If so, the ownership
> check just fail if it uses iommufd_ctx for ownership proof.

it's fine. iommufd_ctx check doesn't work with noiommu.

User should use device fd if involving noiommu.
Yi Liu March 8, 2023, 8:15 a.m. UTC | #25
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Wednesday, March 8, 2023 4:14 PM
> 
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Wednesday, March 8, 2023 4:01 PM
> >
> > > From: Tian, Kevin <kevin.tian@intel.com>
> > > Sent: Wednesday, March 8, 2023 3:55 PM
> > >
> > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > Sent: Wednesday, March 8, 2023 3:47 PM
> > > >
> > > > > From: Tian, Kevin <kevin.tian@intel.com>
> > > > > Sent: Wednesday, March 8, 2023 3:26 PM
> > > > >
> > > > > > From: Liu, Yi L <yi.l.liu@intel.com>
> > > > > > Sent: Tuesday, March 7, 2023 9:29 PM
> > > > > >
> > > > > > >
> > > > > > > I really prefer the 'use the iommufd option' still exist, it is so
> > > > > > > much cleaner and easier for the actual users of this API. We've
> lost
> > > > > > > the point by worrying about no iommu.
> > > > > >
> > > > > > Hmmm, so you are suggesting to have both the device fd approach
> > > > > > and the zero-length array approach, let user to select the best way
> > > > > > based on their wisdom. Is it? how about something like below in
> the
> > > > > > uapi header.
> > > > > >
> > > > > > /**
> > > > > >  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE +
> 13,
> > > > > >  *                                  struct vfio_pci_hot_reset)
> > > > > >  *
> > > > > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > > > > >  * underlying topology, multiple devices may be affected in the
> reset.
> > > > > >  * The affected devices may have been opened by the user or by
> > > other
> > > > > >  * users or not opened yet.  Only when all the affected devices are
> > > > > >  * either opened by the current user or not opened by any user,
> > > should
> > > > > >  * the reset request be allowed.  Otherwise, this request is
> expected
> > > > > >  * to return error. group_fds array can accept either group fds or
> > > > > >  * device fds.  Users using iommufd (valid fd), could also passing a
> > > > > >  * zero-length group_fds array to indicate using the bound
> > > iommufd_ctx
> > > > > >  * for ownership check to the affected devices that are opened.
> > > > > >  *
> > > > > >  * Return: 0 on success, -errno on failure.
> > > > > >  */
> > > > > > struct vfio_pci_hot_reset {
> > > > > >         __u32   argsz;
> > > > > >         __u32   flags;
> > > > > >         __u32   count;
> > > > > >         __s32   group_fds[];
> > > > > > };
> > > > > >
> > > > >
> > > > >  * Userspace requests hot reset for the devices it uses.  Due to the
> > > > >  * underlying topology, multiple devices can be affected in the reset
> > > > >  * while some might be opened by another user. To avoid
> interference
> > > > >  * the calling user must ensure all affected devices, if opened, are
> > > > >  * owned by itself.
> > > > >  *
> > > > >  * The ownership can be proved in three ways:
> > > > >  *   - An array of group fds
> > > > >  *   - An array of device fds
> > > > >  *   - A zero-length array
> > > > >  *
> > > > Thanks.
> > > > >  * In the last case all affected devices which are opened by this user
> > > must
> > > > >  * have been bound to a same iommufd_ctx.
> > > >
> > > > I think we only allow it when this iommufd_ctx is valid. Is it? To
> > > > user, it means device should be bound to a positive iommufd.
> > >
> > > I didn't get it. Do we have a iommufd_ctx created but marked as
> > > invalid?
> >
> > I mean iommufd_ctx==NULL. If a negative iommufd is provided,
> > then kernel side only has a NULL iommufd_ctx. If so, the ownership
> > check just fail if it uses iommufd_ctx for ownership proof.
> 
> it's fine. iommufd_ctx check doesn't work with noiommu.
> 
> User should use device fd if involving noiommu.

Yes, this is my point. This zero-length array approach is only
available for devices that are bound to positive iommufd.
Jason Gunthorpe March 8, 2023, 3:08 p.m. UTC | #26
On Wed, Mar 08, 2023 at 07:26:08AM +0000, Tian, Kevin wrote:
>  * Userspace requests hot reset for the devices it uses.  Due to the
>  * underlying topology, multiple devices can be affected in the reset
>  * while some might be opened by another user. To avoid interference
>  * the calling user must ensure all affected devices, if opened, are
>  * owned by itself.
>  *
>  * The ownership can be proved in three ways:
>  *   - An array of group fds
>  *   - An array of device fds
>  *   - A zero-length array
>  *
>  * In the last case all affected devices which are opened by this user must
>  * have been bound to a same iommufd_ctx.
> 
> and with this change let's rename 'group_fds'  to 'fds'

Looks right

Jason
diff mbox series

Patch

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1bf54beeaef2..e0ebe55b4df0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -27,11 +27,13 @@ 
 #include <linux/vgaarb.h>
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
+#include <linux/iommufd.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
 
 #include "vfio_pci_priv.h"
+#include "../vfio.h"
 
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
 #define DRIVER_DESC "core driver for VFIO based PCI devices"
@@ -180,7 +182,8 @@  static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
 struct vfio_pci_group_info;
 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups);
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx);
 
 /*
  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
@@ -1255,29 +1258,17 @@  static int vfio_pci_ioctl_get_pci_hot_reset_info(
 	return ret;
 }
 
-static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
-					struct vfio_pci_hot_reset __user *arg)
+static int
+vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_hot_reset *hdr,
+				    bool slot,
+				    struct vfio_pci_hot_reset __user *arg)
 {
-	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
-	struct vfio_pci_hot_reset hdr;
 	int32_t *group_fds;
 	struct file **files;
 	struct vfio_pci_group_info info;
-	bool slot = false;
 	int file_idx, count = 0, ret = 0;
 
-	if (copy_from_user(&hdr, arg, minsz))
-		return -EFAULT;
-
-	if (hdr.argsz < minsz || hdr.flags)
-		return -EINVAL;
-
-	/* Can we do a slot or bus reset or neither? */
-	if (!pci_probe_reset_slot(vdev->pdev->slot))
-		slot = true;
-	else if (pci_probe_reset_bus(vdev->pdev->bus))
-		return -ENODEV;
-
 	/*
 	 * We can't let userspace give us an arbitrarily large buffer to copy,
 	 * so verify how many we think there could be.  Note groups can have
@@ -1289,11 +1280,11 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 		return ret;
 
 	/* Somewhere between 1 and count is OK */
-	if (!hdr.count || hdr.count > count)
+	if (hdr->count > count)
 		return -EINVAL;
 
-	group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
-	files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
+	group_fds = kcalloc(hdr->count, sizeof(*group_fds), GFP_KERNEL);
+	files = kcalloc(hdr->count, sizeof(*files), GFP_KERNEL);
 	if (!group_fds || !files) {
 		kfree(group_fds);
 		kfree(files);
@@ -1301,7 +1292,7 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 	}
 
 	if (copy_from_user(group_fds, arg->group_fds,
-			   hdr.count * sizeof(*group_fds))) {
+			   hdr->count * sizeof(*group_fds))) {
 		kfree(group_fds);
 		kfree(files);
 		return -EFAULT;
@@ -1311,7 +1302,7 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 	 * Get the group file for each fd to ensure the group held across
 	 * the reset
 	 */
-	for (file_idx = 0; file_idx < hdr.count; file_idx++) {
+	for (file_idx = 0; file_idx < hdr->count; file_idx++) {
 		struct file *file = fget(group_fds[file_idx]);
 
 		if (!file) {
@@ -1335,10 +1326,10 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 	if (ret)
 		goto hot_reset_release;
 
-	info.count = hdr.count;
+	info.count = hdr->count;
 	info.files = files;
 
-	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
+	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
 
 hot_reset_release:
 	for (file_idx--; file_idx >= 0; file_idx--)
@@ -1348,6 +1339,36 @@  static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
 	return ret;
 }
 
+static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
+					struct vfio_pci_hot_reset __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
+	struct vfio_pci_hot_reset hdr;
+	struct iommufd_ctx *iommufd;
+	bool slot = false;
+
+	if (copy_from_user(&hdr, arg, minsz))
+		return -EFAULT;
+
+	if (hdr.argsz < minsz || hdr.flags)
+		return -EINVAL;
+
+	/* Can we do a slot or bus reset or neither? */
+	if (!pci_probe_reset_slot(vdev->pdev->slot))
+		slot = true;
+	else if (pci_probe_reset_bus(vdev->pdev->bus))
+		return -ENODEV;
+
+	if (!hdr.count)
+		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, &hdr, slot, arg);
+
+	iommufd = vfio_device_iommufd(&vdev->vdev);
+	if (!iommufd)
+		return -EINVAL;
+
+	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, iommufd);
+}
+
 static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
 				    struct vfio_device_ioeventfd __user *arg)
 {
@@ -2317,6 +2338,9 @@  static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
 {
 	unsigned int i;
 
+	if (!groups)
+		return false;
+
 	for (i = 0; i < groups->count; i++)
 		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
 			return true;
@@ -2392,13 +2416,25 @@  static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
 	return ret;
 }
 
+static bool vfio_dev_in_iommufd_ctx(struct vfio_pci_core_device *vdev,
+				    struct iommufd_ctx *iommufd_ctx)
+{
+	struct iommufd_ctx *iommufd = vfio_device_iommufd(&vdev->vdev);
+
+	if (!iommufd)
+		return false;
+
+	return iommufd == iommufd_ctx;
+}
+
 /*
  * We need to get memory_lock for each device, but devices can share mmap_lock,
  * therefore we need to zap and hold the vma_lock for each device, and only then
  * get each memory_lock.
  */
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups)
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx)
 {
 	struct vfio_pci_core_device *cur_mem;
 	struct vfio_pci_core_device *cur_vma;
@@ -2429,10 +2465,27 @@  static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 
 	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
 		/*
-		 * Test whether all the affected devices are contained by the
-		 * set of groups provided by the user.
+		 * Test whether all the affected devices can be reset by the
+		 * user.  The affected devices may already been opened or not
+		 * yet.
+		 *
+		 * For the devices not opened yet, user can reset them. The
+		 * reason is that the hot reset is done under the protection
+		 * of the dev_set->lock, and device open is also under this
+		 * lock.  During the hot reset, such devices can not be opened
+		 * by other users.
+		 *
+		 * For the devices that have been opened, needs to check the
+		 * ownership.  If the user provides a set of group fds, the
+		 * ownership check is done by checking if all the opened
+		 * devices are contained by the groups.  If the user provides
+		 * a zero-length fd array, the ownerhsip check is done by
+		 * checking if all the opened devices are bound to the same
+		 * iommufd_ctx.
 		 */
-		if (!vfio_dev_in_groups(cur_vma, groups)) {
+		if (cur_vma->vdev.open_count &&
+		    !vfio_dev_in_groups(cur_vma, groups) &&
+		    !vfio_dev_in_iommufd_ctx(cur_vma, iommufd_ctx)) {
 			ret = -EINVAL;
 			goto err_undo;
 		}
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 2e3cb284711d..64e862a02dad 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -225,6 +225,11 @@  static inline void vfio_container_cleanup(void)
 #if IS_ENABLED(CONFIG_IOMMUFD)
 int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx);
 void vfio_iommufd_unbind(struct vfio_device *device);
+static inline struct iommufd_ctx *
+vfio_device_iommufd(struct vfio_device *device)
+{
+	return device->iommufd_ictx;
+}
 #else
 static inline int vfio_iommufd_bind(struct vfio_device *device,
 				    struct iommufd_ctx *ictx)
@@ -235,6 +240,12 @@  static inline int vfio_iommufd_bind(struct vfio_device *device,
 static inline void vfio_iommufd_unbind(struct vfio_device *device)
 {
 }
+
+static inline struct iommufd_ctx *
+vfio_device_iommufd(struct vfio_device *device)
+{
+	return NULL;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_VFIO_VIRQFD)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0552e8dcf0cb..4bf11ee8de53 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -673,6 +673,22 @@  struct vfio_pci_hot_reset_info {
  * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
  *				    struct vfio_pci_hot_reset)
  *
+ * Userspace requests hot reset for the devices it uses.  Due to the
+ * underlying topology, multiple devices may be affected in the reset.
+ * The affected devices may have been opened by the user or by other
+ * users or not opened yet.  Only when all the affected devices are
+ * either opened by the current user or not opened by any user, should
+ * the reset request be allowed.  Otherwise, this request is expected
+ * to return error.
+ *
+ * If the user uses group and container interface, it should pass down
+ * a set of group fds for ownership check.  If the user uses iommufd, it
+ * should pass down a zero-length group_fds array to indicate the kernel
+ * to use the bound iommufd for the ownership check.  User that uses the
+ * vfio iommufd compatible mode can also pass down a zero-length group_fds
+ * array as this mode uses iommufd in kernel, and there is no reason to
+ * forbide it.
+ *
  * Return: 0 on success, -errno on failure.
  */
 struct vfio_pci_hot_reset {