Message ID | 20230602121515.79374-9-yi.l.liu@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Enhance vfio PCI hot reset for vfio cdev device | expand |
On Fri, 2 Jun 2023 05:15:14 -0700 Yi Liu <yi.l.liu@intel.com> wrote: > This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx > of the cdev device to check the ownership of the other affected devices. > > When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed > device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate > the values returned are IOMMUFD devids rather than group IDs as used when > accessing vfio devices through the conventional vfio group interface. > Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported > in this mode if all of the devices affected by the hot-reset are owned by > either virtue of being directly bound to the same iommufd context as the > calling device, or implicitly owned via a shared IOMMU group. > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > Suggested-by: Alex Williamson <alex.williamson@redhat.com> > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > --- > drivers/vfio/iommufd.c | 49 +++++++++++++++++++++++++++++++ > drivers/vfio/pci/vfio_pci_core.c | 47 +++++++++++++++++++++++++----- > include/linux/vfio.h | 16 ++++++++++ > include/uapi/linux/vfio.h | 50 +++++++++++++++++++++++++++++++- > 4 files changed, 154 insertions(+), 8 deletions(-) > > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c > index 88b00c501015..a04f3a493437 100644 > --- a/drivers/vfio/iommufd.c > +++ b/drivers/vfio/iommufd.c > @@ -66,6 +66,55 @@ void vfio_iommufd_unbind(struct vfio_device *vdev) > vdev->ops->unbind_iommufd(vdev); > } > > +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev) > +{ > + if (vdev->iommufd_device) > + return iommufd_device_to_ictx(vdev->iommufd_device); > + return NULL; > +} > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx); > + > +static int vfio_iommufd_device_id(struct vfio_device *vdev) > +{ > + if (vdev->iommufd_device) > + return iommufd_device_to_id(vdev->iommufd_device); > + return -EINVAL; If this is actually reachable, it allows us to return -EINVAL as a devid in the reset-info ioctl, which is not a defined value. Should this return VFIO_PCI_DEVID_NOT_OWNED or do you want to catch the errno value in the caller? Thanks, Alex > +} > + > +/* > + * Return devid for a device which is affected by hot-reset. > + * - valid devid > 0 for the device that is bound to the input > + * iommufd_ctx. > + * - devid == VFIO_PCI_DEVID_OWNED for the device that has not > + * been bound to any iommufd_ctx but other device within its > + * group has been bound to the input iommufd_ctx. > + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. device > + * is bound to other iommufd_ctx etc. > + */ > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > + struct iommufd_ctx *ictx) > +{ > + struct iommu_group *group; > + int devid; > + > + if (vfio_iommufd_device_ictx(vdev) == ictx) > + return vfio_iommufd_device_id(vdev); > + > + group = iommu_group_get(vdev->dev); > + if (!group) > + return VFIO_PCI_DEVID_NOT_OWNED; > + > + if (iommufd_ctx_has_group(ictx, group)) > + devid = VFIO_PCI_DEVID_OWNED; > + else > + devid = VFIO_PCI_DEVID_NOT_OWNED; > + > + iommu_group_put(group); > + > + return devid; > +} > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid); > + > /* > * The physical standard ops mean that the iommufd_device is bound to the > * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index 3a2f67675036..a615a223cdef 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -27,6 +27,7 @@ > #include <linux/vgaarb.h> > #include <linux/nospec.h> > #include <linux/sched/mm.h> > +#include <linux/iommufd.h> > #if IS_ENABLED(CONFIG_EEH) > #include <asm/eeh.h> > #endif > @@ -776,26 +777,49 @@ struct vfio_pci_fill_info { > int max; > int cur; > struct vfio_pci_dependent_device *devices; > + struct vfio_device *vdev; > + u32 flags; > }; > > static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > { > struct vfio_pci_fill_info *fill = data; > - struct iommu_group *iommu_group; > > if (fill->cur == fill->max) > return -EAGAIN; /* Something changed, try again */ > > - iommu_group = iommu_group_get(&pdev->dev); > - if (!iommu_group) > - return -EPERM; /* Cannot reset non-isolated devices */ > + if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { > + struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); > + struct vfio_device_set *dev_set = fill->vdev->dev_set; > + struct vfio_device *vdev; > > - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); > + /* > + * hot-reset requires all affected devices be represented in > + * the dev_set. > + */ > + vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); > + if (!vdev) > + fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED; > + else > + fill->devices[fill->cur].devid = > + vfio_iommufd_device_hot_reset_devid(vdev, iommufd); > + /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ > + if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED) > + fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; > + } else { > + struct iommu_group *iommu_group; > + > + iommu_group = iommu_group_get(&pdev->dev); > + if (!iommu_group) > + return -EPERM; /* Cannot reset non-isolated devices */ > + > + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); > + iommu_group_put(iommu_group); > + } > fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); > fill->devices[fill->cur].bus = pdev->bus->number; > fill->devices[fill->cur].devfn = pdev->devfn; > fill->cur++; > - iommu_group_put(iommu_group); > return 0; > } > > @@ -1229,17 +1253,26 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > return -ENOMEM; > > fill.devices = devices; > + fill.vdev = &vdev->vdev; > + > + if (vfio_device_cdev_opened(&vdev->vdev)) > + fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID | > + VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; > > + mutex_lock(&vdev->vdev.dev_set->lock); > ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, > &fill, slot); > + mutex_unlock(&vdev->vdev.dev_set->lock); > > /* > * If a device was removed between counting and filling, we may come up > * short of fill.max. If a device was added, we'll have a return of > * -EAGAIN above. > */ > - if (!ret) > + if (!ret) { > hdr.count = fill.cur; > + hdr.flags = fill.flags; > + } > > reset_info_exit: > if (copy_to_user(arg, &hdr, minsz)) > diff --git a/include/linux/vfio.h b/include/linux/vfio.h > index ee120d2d530b..382a7b119c7c 100644 > --- a/include/linux/vfio.h > +++ b/include/linux/vfio.h > @@ -114,6 +114,9 @@ struct vfio_device_ops { > }; > > #if IS_ENABLED(CONFIG_IOMMUFD) > +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev); > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > + struct iommufd_ctx *ictx); > int vfio_iommufd_physical_bind(struct vfio_device *vdev, > struct iommufd_ctx *ictx, u32 *out_device_id); > void vfio_iommufd_physical_unbind(struct vfio_device *vdev); > @@ -123,6 +126,19 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev, > void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); > int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); > #else > +static inline struct iommufd_ctx * > +vfio_iommufd_device_ictx(struct vfio_device *vdev) > +{ > + return NULL; > +} > + > +static inline int > +vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > + struct iommufd_ctx *ictx) > +{ > + return VFIO_PCI_DEVID_NOT_OWNED; > +} > + > #define vfio_iommufd_physical_bind \ > ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ > u32 *out_device_id)) NULL) > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 0552e8dcf0cb..70cc31e6b1ce 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -650,11 +650,57 @@ enum { > * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, > * struct vfio_pci_hot_reset_info) > * > + * This command is used to query the affected devices in the hot reset for > + * a given device. > + * > + * This command always reports the segment, bus, and devfn information for > + * each affected device, and selectively reports the group_id or devid per > + * the way how the calling device is opened. > + * > + * - If the calling device is opened via the traditional group/container > + * API, group_id is reported. User should check if it has owned all > + * the affected devices and provides a set of group fds to prove the > + * ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl. > + * > + * - If the calling device is opened as a cdev, devid is reported. > + * Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this > + * data type. All the affected devices should be represented in > + * the dev_set, ex. bound to a vfio driver, and also be owned by > + * this interface which is determined by the following conditions: > + * 1) Has a valid devid within the iommufd_ctx of the calling device. > + * Ownership cannot be determined across separate iommufd_ctx and > + * the cdev calling conventions do not support a proof-of-ownership > + * model as provided in the legacy group interface. In this case > + * valid devid with value greater than zero is provided in the return > + * structure. > + * 2) Does not have a valid devid within the iommufd_ctx of the calling > + * device, but belongs to the same IOMMU group as the calling device > + * or another opened device that has a valid devid within the > + * iommufd_ctx of the calling device. This provides implicit ownership > + * for devices within the same DMA isolation context. In this case > + * the devid value of VFIO_PCI_DEVID_OWNED is provided in the return > + * structure. > + * > + * A devid value of VFIO_PCI_DEVID_NOT_OWNED is provided in the return > + * structure for affected devices where device is NOT represented in the > + * dev_set or ownership is not available. Such devices prevent the use > + * of VFIO_DEVICE_PCI_HOT_RESET ioctl outside of the proof-of-ownership > + * calling conventions (ie. via legacy group accessed devices). Flag > + * VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the > + * affected devices are represented in the dev_set and also owned by > + * the user. This flag is available only when > + * flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved. > + * > * Return: 0 on success, -errno on failure: > * -enospc = insufficient buffer, -enodev = unsupported for device. > */ > struct vfio_pci_dependent_device { > - __u32 group_id; > + union { > + __u32 group_id; > + __u32 devid; > +#define VFIO_PCI_DEVID_OWNED 0 > +#define VFIO_PCI_DEVID_NOT_OWNED -1 > + }; > __u16 segment; > __u8 bus; > __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ > @@ -663,6 +709,8 @@ struct vfio_pci_dependent_device { > struct vfio_pci_hot_reset_info { > __u32 argsz; > __u32 flags; > +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID (1 << 0) > +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED (1 << 1) > __u32 count; > struct vfio_pci_dependent_device devices[]; > };
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Friday, June 9, 2023 6:27 AM > > On Fri, 2 Jun 2023 05:15:14 -0700 > Yi Liu <yi.l.liu@intel.com> wrote: > > > This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx > > of the cdev device to check the ownership of the other affected devices. > > > > When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed > > device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate > > the values returned are IOMMUFD devids rather than group IDs as used when > > accessing vfio devices through the conventional vfio group interface. > > Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported > > in this mode if all of the devices affected by the hot-reset are owned by > > either virtue of being directly bound to the same iommufd context as the > > calling device, or implicitly owned via a shared IOMMU group. > > > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > > Suggested-by: Alex Williamson <alex.williamson@redhat.com> > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > > --- > > drivers/vfio/iommufd.c | 49 +++++++++++++++++++++++++++++++ > > drivers/vfio/pci/vfio_pci_core.c | 47 +++++++++++++++++++++++++----- > > include/linux/vfio.h | 16 ++++++++++ > > include/uapi/linux/vfio.h | 50 +++++++++++++++++++++++++++++++- > > 4 files changed, 154 insertions(+), 8 deletions(-) > > > > diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c > > index 88b00c501015..a04f3a493437 100644 > > --- a/drivers/vfio/iommufd.c > > +++ b/drivers/vfio/iommufd.c > > @@ -66,6 +66,55 @@ void vfio_iommufd_unbind(struct vfio_device *vdev) > > vdev->ops->unbind_iommufd(vdev); > > } > > > > +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev) > > +{ > > + if (vdev->iommufd_device) > > + return iommufd_device_to_ictx(vdev->iommufd_device); > > + return NULL; > > +} > > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx); > > + > > +static int vfio_iommufd_device_id(struct vfio_device *vdev) > > +{ > > + if (vdev->iommufd_device) > > + return iommufd_device_to_id(vdev->iommufd_device); > > + return -EINVAL; > > If this is actually reachable, it allows us to return -EINVAL as a > devid in the reset-info ioctl, which is not a defined value. Should > this return VFIO_PCI_DEVID_NOT_OWNED or do you want to catch the errno > value in the caller? Thanks, This error can be reached if user invokes _INFO or HOT_RESET on an emulated device or a physical device that has not been bound to iommufd. Both should be considered as not-owned. So return VFIO_PCI_DEVID_NOT_OWNED makes more sense. Regards, Yi Liu
On Fri, Jun 02, 2023 at 05:15:14AM -0700, Yi Liu wrote: > +/* > + * Return devid for a device which is affected by hot-reset. > + * - valid devid > 0 for the device that is bound to the input > + * iommufd_ctx. > + * - devid == VFIO_PCI_DEVID_OWNED for the device that has not > + * been bound to any iommufd_ctx but other device within its > + * group has been bound to the input iommufd_ctx. > + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. device > + * is bound to other iommufd_ctx etc. > + */ > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > + struct iommufd_ctx *ictx) > +{ > + struct iommu_group *group; > + int devid; > + > + if (vfio_iommufd_device_ictx(vdev) == ictx) > + return vfio_iommufd_device_id(vdev); > + > + group = iommu_group_get(vdev->dev); > + if (!group) > + return VFIO_PCI_DEVID_NOT_OWNED; > + > + if (iommufd_ctx_has_group(ictx, group)) > + devid = VFIO_PCI_DEVID_OWNED; > + else > + devid = VFIO_PCI_DEVID_NOT_OWNED; > + > + iommu_group_put(group); > + > + return devid; > +} > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid); This function really should not be in the core iommufd.c file - it is a purely vfio-pci function - why did you have to place it here? Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Tuesday, June 13, 2023 7:47 PM > > On Fri, Jun 02, 2023 at 05:15:14AM -0700, Yi Liu wrote: > > +/* > > + * Return devid for a device which is affected by hot-reset. > > + * - valid devid > 0 for the device that is bound to the input > > + * iommufd_ctx. > > + * - devid == VFIO_PCI_DEVID_OWNED for the device that has not > > + * been bound to any iommufd_ctx but other device within its > > + * group has been bound to the input iommufd_ctx. > > + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. device > > + * is bound to other iommufd_ctx etc. > > + */ > > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > > + struct iommufd_ctx *ictx) > > +{ > > + struct iommu_group *group; > > + int devid; > > + > > + if (vfio_iommufd_device_ictx(vdev) == ictx) > > + return vfio_iommufd_device_id(vdev); > > + > > + group = iommu_group_get(vdev->dev); > > + if (!group) > > + return VFIO_PCI_DEVID_NOT_OWNED; > > + > > + if (iommufd_ctx_has_group(ictx, group)) > > + devid = VFIO_PCI_DEVID_OWNED; > > + else > > + devid = VFIO_PCI_DEVID_NOT_OWNED; > > + > > + iommu_group_put(group); > > + > > + return devid; > > +} > > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid); > > This function really should not be in the core iommufd.c file - it is > a purely vfio-pci function - why did you have to place it here? Put it here can avoid calling iommufd_ctx_has_group() in vfio-pci, which requires to import IOMMUFD_NS. If this reason is not so strong I can move it back to vfio-pci code. Regards, Yi Liu
On Tue, 13 Jun 2023 12:50:43 +0000 "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Tuesday, June 13, 2023 7:47 PM > > > > On Fri, Jun 02, 2023 at 05:15:14AM -0700, Yi Liu wrote: > > > +/* > > > + * Return devid for a device which is affected by hot-reset. > > > + * - valid devid > 0 for the device that is bound to the input > > > + * iommufd_ctx. > > > + * - devid == VFIO_PCI_DEVID_OWNED for the device that has not > > > + * been bound to any iommufd_ctx but other device within its > > > + * group has been bound to the input iommufd_ctx. > > > + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. device > > > + * is bound to other iommufd_ctx etc. > > > + */ > > > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > > > + struct iommufd_ctx *ictx) > > > +{ > > > + struct iommu_group *group; > > > + int devid; > > > + > > > + if (vfio_iommufd_device_ictx(vdev) == ictx) > > > + return vfio_iommufd_device_id(vdev); > > > + > > > + group = iommu_group_get(vdev->dev); > > > + if (!group) > > > + return VFIO_PCI_DEVID_NOT_OWNED; > > > + > > > + if (iommufd_ctx_has_group(ictx, group)) > > > + devid = VFIO_PCI_DEVID_OWNED; > > > + else > > > + devid = VFIO_PCI_DEVID_NOT_OWNED; > > > + > > > + iommu_group_put(group); > > > + > > > + return devid; > > > +} > > > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid); > > > > This function really should not be in the core iommufd.c file - it is > > a purely vfio-pci function - why did you have to place it here? > > Put it here can avoid calling iommufd_ctx_has_group() in vfio-pci, > which requires to import IOMMUFD_NS. If this reason is not so > strong I can move it back to vfio-pci code. The PCI-isms here are the name of the function and the return value, otherwise this is simply a "give me the devid for this device in this context". The function name is trivial to change and we can define the internal errno usage such that the vfio-pci-core code can interpret the correct uAPI value. For example, -EXDEV ("Cross-device link") could maybe be interpreted as owned and any other errno is not-owned, -ENODEV maybe being the default. Errno values are often contentious, are there other suggestions? Thanks, Alex
On Tue, Jun 13, 2023 at 08:32:29AM -0600, Alex Williamson wrote: > On Tue, 13 Jun 2023 12:50:43 +0000 > "Liu, Yi L" <yi.l.liu@intel.com> wrote: > > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > Sent: Tuesday, June 13, 2023 7:47 PM > > > > > > On Fri, Jun 02, 2023 at 05:15:14AM -0700, Yi Liu wrote: > > > > +/* > > > > + * Return devid for a device which is affected by hot-reset. > > > > + * - valid devid > 0 for the device that is bound to the input > > > > + * iommufd_ctx. > > > > + * - devid == VFIO_PCI_DEVID_OWNED for the device that has not > > > > + * been bound to any iommufd_ctx but other device within its > > > > + * group has been bound to the input iommufd_ctx. > > > > + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. device > > > > + * is bound to other iommufd_ctx etc. > > > > + */ > > > > +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, > > > > + struct iommufd_ctx *ictx) > > > > +{ > > > > + struct iommu_group *group; > > > > + int devid; > > > > + > > > > + if (vfio_iommufd_device_ictx(vdev) == ictx) > > > > + return vfio_iommufd_device_id(vdev); > > > > + > > > > + group = iommu_group_get(vdev->dev); > > > > + if (!group) > > > > + return VFIO_PCI_DEVID_NOT_OWNED; > > > > + > > > > + if (iommufd_ctx_has_group(ictx, group)) > > > > + devid = VFIO_PCI_DEVID_OWNED; > > > > + else > > > > + devid = VFIO_PCI_DEVID_NOT_OWNED; > > > > + > > > > + iommu_group_put(group); > > > > + > > > > + return devid; > > > > +} > > > > +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid); > > > > > > This function really should not be in the core iommufd.c file - it is > > > a purely vfio-pci function - why did you have to place it here? > > > > Put it here can avoid calling iommufd_ctx_has_group() in vfio-pci, > > which requires to import IOMMUFD_NS. If this reason is not so > > strong I can move it back to vfio-pci code. > > The PCI-isms here are the name of the function and the return value, > otherwise this is simply a "give me the devid for this device in this > context". The function name is trivial to change and we can define the > internal errno usage such that the vfio-pci-core code can interpret the > correct uAPI value. For example, -EXDEV ("Cross-device link") could > maybe be interpreted as owned and any other errno is not-owned, -ENODEV > maybe being the default. Yeah, this approach seems logical If the function is called vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) Then maybe ENOENT = device is owned but there is no Id ENODEV = device is not owned EXDEV is good too, nice symmetry with ENODEV - it doesn't really matter since there is only one caller and there is no embedded errno propogation. Jason
On Fri, Jun 02, 2023 at 05:15:14AM -0700, Yi Liu wrote: > This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx > of the cdev device to check the ownership of the other affected devices. > > When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed > device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate > the values returned are IOMMUFD devids rather than group IDs as used when > accessing vfio devices through the conventional vfio group interface. > Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported > in this mode if all of the devices affected by the hot-reset are owned by > either virtue of being directly bound to the same iommufd context as the > calling device, or implicitly owned via a shared IOMMU group. > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > Suggested-by: Alex Williamson <alex.williamson@redhat.com> > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > --- > drivers/vfio/iommufd.c | 49 +++++++++++++++++++++++++++++++ > drivers/vfio/pci/vfio_pci_core.c | 47 +++++++++++++++++++++++++----- > include/linux/vfio.h | 16 ++++++++++ > include/uapi/linux/vfio.h | 50 +++++++++++++++++++++++++++++++- > 4 files changed, 154 insertions(+), 8 deletions(-) This could use some more fiddling, like we could copy each vfio_pci_dependent_device to user memory inside the loop instead of allocating an array. Add another patch with something like this in it: diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index b0eadafcbcf502..516e0fda74bec9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -775,19 +775,23 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - int max; - int cur; - struct vfio_pci_dependent_device *devices; + struct vfio_pci_dependent_device __user *devices; + struct vfio_pci_dependent_device __user *devices_end; struct vfio_device *vdev; u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { + struct vfio_pci_dependent_device info = { + .segment = pci_domain_nr(pdev->bus), + .bus = pdev->bus->number, + .devfn = pdev->devfn, + }; struct vfio_pci_fill_info *fill = data; - if (fill->cur == fill->max) - return -EAGAIN; /* Something changed, try again */ + if (fill->devices_end >= fill->devices) + return -ENOSPC; if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); @@ -800,12 +804,12 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) */ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); if (!vdev) - fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED; + info.devid = VFIO_PCI_DEVID_NOT_OWNED; else - fill->devices[fill->cur].devid = - vfio_iommufd_device_hot_reset_devid(vdev, iommufd); + info.devid = vfio_iommufd_device_hot_reset_devid( + vdev, iommufd); /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ - if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED) + if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; } else { struct iommu_group *iommu_group; @@ -814,13 +818,13 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) if (!iommu_group) return -EPERM; /* Cannot reset non-isolated devices */ - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + info.group_id = iommu_group_id(iommu_group); iommu_group_put(iommu_group); } - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); - fill->devices[fill->cur].bus = pdev->bus->number; - fill->devices[fill->cur].devfn = pdev->devfn; - fill->cur++; + + if (copy_to_user(fill->devices, &info, sizeof(info))) + return -EFAULT; + fill->devices++; return 0; } @@ -1212,8 +1216,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; + struct vfio_pci_fill_info fill = {}; bool slot = false; int ret = 0; @@ -1231,29 +1234,9 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; - - WARN_ON(!fill.max); /* Should always be at least one */ - - /* - * If there's enough space, fill it now, otherwise return -ENOSPC and - * the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } - - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - fill.devices = devices; + fill.devices = arg->devices; + fill.devices_end = arg->devices + + (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); fill.vdev = &vdev->vdev; if (vfio_device_cdev_opened(&vdev->vdev)) @@ -1264,29 +1247,14 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, &fill, slot); mutex_unlock(&vdev->vdev.dev_set->lock); + if (ret) + return ret; - /* - * If a device was removed between counting and filling, we may come up - * short of fill.max. If a device was added, we'll have a return of - * -EAGAIN above. - */ - if (!ret) { - hdr.count = fill.cur; - hdr.flags = fill.flags; - } - -reset_info_exit: + hdr.count = fill.devices - arg->devices; + hdr.flags = fill.flags; if (copy_to_user(arg, &hdr, minsz)) ret = -EFAULT; - - if (!ret) { - if (copy_to_user(&arg->devices, devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } - - kfree(devices); - return ret; + return 0; } static int
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Wednesday, June 14, 2023 2:23 AM > > On Fri, Jun 02, 2023 at 05:15:14AM -0700, Yi Liu wrote: > > This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx > > of the cdev device to check the ownership of the other affected devices. > > > > When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed > > device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate > > the values returned are IOMMUFD devids rather than group IDs as used when > > accessing vfio devices through the conventional vfio group interface. > > Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported > > in this mode if all of the devices affected by the hot-reset are owned by > > either virtue of being directly bound to the same iommufd context as the > > calling device, or implicitly owned via a shared IOMMU group. > > > > Suggested-by: Jason Gunthorpe <jgg@nvidia.com> > > Suggested-by: Alex Williamson <alex.williamson@redhat.com> > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > > --- > > drivers/vfio/iommufd.c | 49 +++++++++++++++++++++++++++++++ > > drivers/vfio/pci/vfio_pci_core.c | 47 +++++++++++++++++++++++++----- > > include/linux/vfio.h | 16 ++++++++++ > > include/uapi/linux/vfio.h | 50 +++++++++++++++++++++++++++++++- > > 4 files changed, 154 insertions(+), 8 deletions(-) > > This could use some more fiddling, like we could copy each > vfio_pci_dependent_device to user memory inside the loop instead of > allocating an array. I understand the motivation. But have some concerns. Please check inline. > Add another patch with something like this in it: > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index b0eadafcbcf502..516e0fda74bec9 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -775,19 +775,23 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void > *data) > } > > struct vfio_pci_fill_info { > - int max; > - int cur; > - struct vfio_pci_dependent_device *devices; > + struct vfio_pci_dependent_device __user *devices; > + struct vfio_pci_dependent_device __user *devices_end; > struct vfio_device *vdev; > u32 flags; > }; > > static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > { > + struct vfio_pci_dependent_device info = { > + .segment = pci_domain_nr(pdev->bus), > + .bus = pdev->bus->number, > + .devfn = pdev->devfn, > + }; > struct vfio_pci_fill_info *fill = data; > > - if (fill->cur == fill->max) > - return -EAGAIN; /* Something changed, try again */ > + if (fill->devices_end >= fill->devices) > + return -ENOSPC; This should be fill->devices_end <= fill->devices. Even it's corrected. The new code does not return -EAGAIN. And if return -ENOSPC, the expected size should be returned. But I didn't see it. As the hunk below[1] is removed, seems no way to know how many memory it requires. > > if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { > struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); > @@ -800,12 +804,12 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > */ > vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); > if (!vdev) > - fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED; > + info.devid = VFIO_PCI_DEVID_NOT_OWNED; > else > - fill->devices[fill->cur].devid = > - vfio_iommufd_device_hot_reset_devid(vdev, iommufd); > + info.devid = vfio_iommufd_device_hot_reset_devid( > + vdev, iommufd); > /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ > - if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED) > + if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) > fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; > } else { > struct iommu_group *iommu_group; > @@ -814,13 +818,13 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > if (!iommu_group) > return -EPERM; /* Cannot reset non-isolated devices */ > > - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); > + info.group_id = iommu_group_id(iommu_group); > iommu_group_put(iommu_group); > } > - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); > - fill->devices[fill->cur].bus = pdev->bus->number; > - fill->devices[fill->cur].devfn = pdev->devfn; > - fill->cur++; > + > + if (copy_to_user(fill->devices, &info, sizeof(info))) > + return -EFAULT; > + fill->devices++; > return 0; > } > > @@ -1212,8 +1216,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > unsigned long minsz = > offsetofend(struct vfio_pci_hot_reset_info, count); > struct vfio_pci_hot_reset_info hdr; > - struct vfio_pci_fill_info fill = { 0 }; > - struct vfio_pci_dependent_device *devices = NULL; > + struct vfio_pci_fill_info fill = {}; > bool slot = false; > int ret = 0; > > @@ -1231,29 +1234,9 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > else if (pci_probe_reset_bus(vdev->pdev->bus)) > return -ENODEV; > > - /* How many devices are affected? */ > - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, > - &fill.max, slot); > - if (ret) > - return ret; > - > - WARN_ON(!fill.max); /* Should always be at least one */ > - > - /* > - * If there's enough space, fill it now, otherwise return -ENOSPC and > - * the number of devices affected. > - */ > - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { > - ret = -ENOSPC; > - hdr.count = fill.max; > - goto reset_info_exit; > - } [1] The loop in this hunk figures out how many devices are affected and also figures out how many memory is needs. > - > - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); > - if (!devices) > - return -ENOMEM; > - > - fill.devices = devices; > + fill.devices = arg->devices; > + fill.devices_end = arg->devices + > + (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); > fill.vdev = &vdev->vdev; > > if (vfio_device_cdev_opened(&vdev->vdev)) > @@ -1264,29 +1247,14 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, > &fill, slot); > mutex_unlock(&vdev->vdev.dev_set->lock); > + if (ret) > + return ret; > > - /* > - * If a device was removed between counting and filling, we may come up > - * short of fill.max. If a device was added, we'll have a return of > - * -EAGAIN above. > - */ > - if (!ret) { > - hdr.count = fill.cur; > - hdr.flags = fill.flags; > - } This mechanism is also removed though it may be rare. > - > -reset_info_exit: > + hdr.count = fill.devices - arg->devices; > + hdr.flags = fill.flags; > if (copy_to_user(arg, &hdr, minsz)) > ret = -EFAULT; > - > - if (!ret) { > - if (copy_to_user(&arg->devices, devices, > - hdr.count * sizeof(*devices))) > - ret = -EFAULT; > - } > - > - kfree(devices); > - return ret; > + return 0; should still return ret as "if (copy_to_user(arg, &hdr, minsz))" can fail. > } > > static int It appears to me there are subtle changes in the uapi (-ENOSPC, -EAGAIN). Though uapi header didn't document them. But per the comment in the code, it's changed. Maybe we can do it in a follow-up patch instead of part of this series. Regards, Yi Liu
On Wed, Jun 14, 2023 at 10:35:10AM +0000, Liu, Yi L wrote: > > - if (fill->cur == fill->max) > > - return -EAGAIN; /* Something changed, try again */ > > + if (fill->devices_end >= fill->devices) > > + return -ENOSPC; > > This should be fill->devices_end <= fill->devices. Yep > Even it's corrected. The > new code does not return -EAGAIN. Right, there is no EAGAIN. If the caller didn't provide enough space the previous version returned ENOSPC: > > - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { > > - ret = -ENOSPC; > > - hdr.count = fill.max; > > - goto reset_info_exit; > > - } -EAGAIN basically means the kernel internally malfunctioned - eg it allocated too little space for the actual size of devices. That is no longer possible in this version so it should never return -EAGAIN. > And if return -ENOSPC, the expected > size should be returned. But I didn't see it. As the hunk below[1] is removed, > seems no way to know how many memory it requires. Yes, I missed that, it should keep counting Like this then diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index b0eadafcbcf502..05c064896a7a94 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -775,19 +775,25 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - int max; - int cur; - struct vfio_pci_dependent_device *devices; + struct vfio_pci_dependent_device __user *devices; + struct vfio_pci_dependent_device __user *devices_end; struct vfio_device *vdev; + u32 count; u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { + struct vfio_pci_dependent_device info = { + .segment = pci_domain_nr(pdev->bus), + .bus = pdev->bus->number, + .devfn = pdev->devfn, + }; struct vfio_pci_fill_info *fill = data; - if (fill->cur == fill->max) - return -EAGAIN; /* Something changed, try again */ + fill.count++; + if (fill->devices >= fill->devices_end) + return 0; if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); @@ -800,12 +806,12 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) */ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); if (!vdev) - fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED; + info.devid = VFIO_PCI_DEVID_NOT_OWNED; else - fill->devices[fill->cur].devid = - vfio_iommufd_device_hot_reset_devid(vdev, iommufd); + info.devid = vfio_iommufd_device_hot_reset_devid( + vdev, iommufd); /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ - if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED) + if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; } else { struct iommu_group *iommu_group; @@ -814,13 +820,13 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) if (!iommu_group) return -EPERM; /* Cannot reset non-isolated devices */ - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + info.group_id = iommu_group_id(iommu_group); iommu_group_put(iommu_group); } - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); - fill->devices[fill->cur].bus = pdev->bus->number; - fill->devices[fill->cur].devfn = pdev->devfn; - fill->cur++; + + if (copy_to_user(fill->devices, &info, sizeof(info))) + return -EFAULT; + fill->devices++; return 0; } @@ -1212,8 +1218,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; + struct vfio_pci_fill_info fill = {}; bool slot = false; int ret = 0; @@ -1231,29 +1236,9 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; - - WARN_ON(!fill.max); /* Should always be at least one */ - - /* - * If there's enough space, fill it now, otherwise return -ENOSPC and - * the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } - - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - fill.devices = devices; + fill.devices = arg->devices; + fill.devices_end = arg->devices + + (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); fill.vdev = &vdev->vdev; if (vfio_device_cdev_opened(&vdev->vdev)) @@ -1264,29 +1249,17 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, &fill, slot); mutex_unlock(&vdev->vdev.dev_set->lock); + if (ret) + return ret; - /* - * If a device was removed between counting and filling, we may come up - * short of fill.max. If a device was added, we'll have a return of - * -EAGAIN above. - */ - if (!ret) { - hdr.count = fill.cur; - hdr.flags = fill.flags; - } - -reset_info_exit: + hdr.count = fill.count; + hdr.flags = fill.flags; if (copy_to_user(arg, &hdr, minsz)) - ret = -EFAULT; + return -EFAULT; - if (!ret) { - if (copy_to_user(&arg->devices, devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } - - kfree(devices); - return ret; + if (fill.count != fill.devices - arg->devices) + return -ENOSPC; + return 0; } static int
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Wednesday, June 14, 2023 8:17 PM > > On Wed, Jun 14, 2023 at 10:35:10AM +0000, Liu, Yi L wrote: > > > > - if (fill->cur == fill->max) > > > - return -EAGAIN; /* Something changed, try again */ > > > + if (fill->devices_end >= fill->devices) > > > + return -ENOSPC; > > > > This should be fill->devices_end <= fill->devices. > > Yep > > > Even it's corrected. The > > new code does not return -EAGAIN. > > Right, there is no EAGAIN. If the caller didn't provide enough space > the previous version returned ENOSPC: > > > > - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { > > > - ret = -ENOSPC; > > > - hdr.count = fill.max; > > > - goto reset_info_exit; > > > - } > > -EAGAIN basically means the kernel internally malfunctioned - eg it > allocated too little space for the actual size of devices. That is no > longer possible in this version so it should never return -EAGAIN. I still have one doubt. Per my understanding, this is to handle newly plugged devices during the info reporting path. I don’t think holding dev_set lock can prevent it. but maybe -ENOSPC is enough. @Alex, what about your opinion? > > And if return -ENOSPC, the expected > > size should be returned. But I didn't see it. As the hunk below[1] is removed, > > seems no way to know how many memory it requires. > > Yes, I missed that, it should keep counting > > Like this then > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index b0eadafcbcf502..05c064896a7a94 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -775,19 +775,25 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void > *data) > } > > struct vfio_pci_fill_info { > - int max; > - int cur; > - struct vfio_pci_dependent_device *devices; > + struct vfio_pci_dependent_device __user *devices; > + struct vfio_pci_dependent_device __user *devices_end; > struct vfio_device *vdev; > + u32 count; > u32 flags; > }; > > static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > { > + struct vfio_pci_dependent_device info = { > + .segment = pci_domain_nr(pdev->bus), > + .bus = pdev->bus->number, > + .devfn = pdev->devfn, > + }; > struct vfio_pci_fill_info *fill = data; > > - if (fill->cur == fill->max) > - return -EAGAIN; /* Something changed, try again */ > + fill.count++; > + if (fill->devices >= fill->devices_end) > + return 0; > > if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { > struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); > @@ -800,12 +806,12 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > */ > vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); > if (!vdev) > - fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED; > + info.devid = VFIO_PCI_DEVID_NOT_OWNED; > else > - fill->devices[fill->cur].devid = > - vfio_iommufd_device_hot_reset_devid(vdev, iommufd); > + info.devid = vfio_iommufd_device_hot_reset_devid( > + vdev, iommufd); > /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ > - if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED) > + if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) > fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; > } else { > struct iommu_group *iommu_group; > @@ -814,13 +820,13 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) > if (!iommu_group) > return -EPERM; /* Cannot reset non-isolated devices */ > > - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); > + info.group_id = iommu_group_id(iommu_group); > iommu_group_put(iommu_group); > } > - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); > - fill->devices[fill->cur].bus = pdev->bus->number; > - fill->devices[fill->cur].devfn = pdev->devfn; > - fill->cur++; > + > + if (copy_to_user(fill->devices, &info, sizeof(info))) > + return -EFAULT; > + fill->devices++; > return 0; > } > > @@ -1212,8 +1218,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > unsigned long minsz = > offsetofend(struct vfio_pci_hot_reset_info, count); > struct vfio_pci_hot_reset_info hdr; > - struct vfio_pci_fill_info fill = { 0 }; > - struct vfio_pci_dependent_device *devices = NULL; > + struct vfio_pci_fill_info fill = {}; > bool slot = false; > int ret = 0; > > @@ -1231,29 +1236,9 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > else if (pci_probe_reset_bus(vdev->pdev->bus)) > return -ENODEV; > > - /* How many devices are affected? */ > - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, > - &fill.max, slot); > - if (ret) > - return ret; > - > - WARN_ON(!fill.max); /* Should always be at least one */ > - > - /* > - * If there's enough space, fill it now, otherwise return -ENOSPC and > - * the number of devices affected. > - */ > - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { > - ret = -ENOSPC; > - hdr.count = fill.max; > - goto reset_info_exit; > - } > - > - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); > - if (!devices) > - return -ENOMEM; > - > - fill.devices = devices; > + fill.devices = arg->devices; > + fill.devices_end = arg->devices + > + (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); > fill.vdev = &vdev->vdev; > > if (vfio_device_cdev_opened(&vdev->vdev)) > @@ -1264,29 +1249,17 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( > ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, > &fill, slot); > mutex_unlock(&vdev->vdev.dev_set->lock); > + if (ret) > + return ret; > > - /* > - * If a device was removed between counting and filling, we may come up > - * short of fill.max. If a device was added, we'll have a return of > - * -EAGAIN above. > - */ > - if (!ret) { > - hdr.count = fill.cur; > - hdr.flags = fill.flags; > - } > - > -reset_info_exit: > + hdr.count = fill.count; > + hdr.flags = fill.flags; > if (copy_to_user(arg, &hdr, minsz)) > - ret = -EFAULT; > + return -EFAULT; > > - if (!ret) { > - if (copy_to_user(&arg->devices, devices, > - hdr.count * sizeof(*devices))) > - ret = -EFAULT; > - } > - > - kfree(devices); > - return ret; > + if (fill.count != fill.devices - arg->devices) Should be "if (fill.count != (fill.devices - arg->devices) / sizeof(arg->devices[0]))"
On Wed, Jun 14, 2023 at 01:05:45PM +0000, Liu, Yi L wrote: > > -EAGAIN basically means the kernel internally malfunctioned - eg it > > allocated too little space for the actual size of devices. That is no > > longer possible in this version so it should never return -EAGAIN. > > I still have one doubt. Per my understanding, this is to handle newly > plugged devices during the info reporting path. I don’t think holding > dev_set lock can prevent it. but maybe -ENOSPC is enough. @Alex, > what about your opinion? If the device was plug instantly before we computed the size we returned ENOSPC If it was plugged instantly after we computed the size we returned EAGAIN Here we just resolve this race consistently to always return ENOSPC, which always means we ran out of space in the user provided buffer. > > - kfree(devices); > > - return ret; > > + if (fill.count != fill.devices - arg->devices) > > Should be "if (fill.count != (fill.devices - arg->devices) / sizeof(arg->devices[0]))"
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Wednesday, June 14, 2023 9:38 PM > > On Wed, Jun 14, 2023 at 01:05:45PM +0000, Liu, Yi L wrote: > > > -EAGAIN basically means the kernel internally malfunctioned - eg it > > > allocated too little space for the actual size of devices. That is no > > > longer possible in this version so it should never return -EAGAIN. > > > > I still have one doubt. Per my understanding, this is to handle newly > > plugged devices during the info reporting path. I don’t think holding > > dev_set lock can prevent it. but maybe -ENOSPC is enough. @Alex, > > what about your opinion? > > If the device was plug instantly before we computed the size we returned > ENOSPC > > If it was plugged instantly after we computed the size we returned > EAGAIN Yes. > Here we just resolve this race consistently to always return ENOSPC, > which always means we ran out of space in the user provided buffer. This makes sense. > > > - kfree(devices); > > > - return ret; > > > + if (fill.count != fill.devices - arg->devices) > > > > Should be "if (fill.count != (fill.devices - arg->devices) / sizeof(arg->devices[0]))"
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 88b00c501015..a04f3a493437 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -66,6 +66,55 @@ void vfio_iommufd_unbind(struct vfio_device *vdev) vdev->ops->unbind_iommufd(vdev); } +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev) +{ + if (vdev->iommufd_device) + return iommufd_device_to_ictx(vdev->iommufd_device); + return NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx); + +static int vfio_iommufd_device_id(struct vfio_device *vdev) +{ + if (vdev->iommufd_device) + return iommufd_device_to_id(vdev->iommufd_device); + return -EINVAL; +} + +/* + * Return devid for a device which is affected by hot-reset. + * - valid devid > 0 for the device that is bound to the input + * iommufd_ctx. + * - devid == VFIO_PCI_DEVID_OWNED for the device that has not + * been bound to any iommufd_ctx but other device within its + * group has been bound to the input iommufd_ctx. + * - devid == VFIO_PCI_DEVID_NOT_OWNED for others. e.g. device + * is bound to other iommufd_ctx etc. + */ +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, + struct iommufd_ctx *ictx) +{ + struct iommu_group *group; + int devid; + + if (vfio_iommufd_device_ictx(vdev) == ictx) + return vfio_iommufd_device_id(vdev); + + group = iommu_group_get(vdev->dev); + if (!group) + return VFIO_PCI_DEVID_NOT_OWNED; + + if (iommufd_ctx_has_group(ictx, group)) + devid = VFIO_PCI_DEVID_OWNED; + else + devid = VFIO_PCI_DEVID_NOT_OWNED; + + iommu_group_put(group); + + return devid; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_device_hot_reset_devid); + /* * The physical standard ops mean that the iommufd_device is bound to the * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 3a2f67675036..a615a223cdef 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,7 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#include <linux/iommufd.h> #if IS_ENABLED(CONFIG_EEH) #include <asm/eeh.h> #endif @@ -776,26 +777,49 @@ struct vfio_pci_fill_info { int max; int cur; struct vfio_pci_dependent_device *devices; + struct vfio_device *vdev; + u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { struct vfio_pci_fill_info *fill = data; - struct iommu_group *iommu_group; if (fill->cur == fill->max) return -EAGAIN; /* Something changed, try again */ - iommu_group = iommu_group_get(&pdev->dev); - if (!iommu_group) - return -EPERM; /* Cannot reset non-isolated devices */ + if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { + struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); + struct vfio_device_set *dev_set = fill->vdev->dev_set; + struct vfio_device *vdev; - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + /* + * hot-reset requires all affected devices be represented in + * the dev_set. + */ + vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); + if (!vdev) + fill->devices[fill->cur].devid = VFIO_PCI_DEVID_NOT_OWNED; + else + fill->devices[fill->cur].devid = + vfio_iommufd_device_hot_reset_devid(vdev, iommufd); + /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ + if (fill->devices[fill->cur].devid == VFIO_PCI_DEVID_NOT_OWNED) + fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; + } else { + struct iommu_group *iommu_group; + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ + + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + iommu_group_put(iommu_group); + } fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); fill->devices[fill->cur].bus = pdev->bus->number; fill->devices[fill->cur].devfn = pdev->devfn; fill->cur++; - iommu_group_put(iommu_group); return 0; } @@ -1229,17 +1253,26 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( return -ENOMEM; fill.devices = devices; + fill.vdev = &vdev->vdev; + + if (vfio_device_cdev_opened(&vdev->vdev)) + fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID | + VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; + mutex_lock(&vdev->vdev.dev_set->lock); ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, &fill, slot); + mutex_unlock(&vdev->vdev.dev_set->lock); /* * If a device was removed between counting and filling, we may come up * short of fill.max. If a device was added, we'll have a return of * -EAGAIN above. */ - if (!ret) + if (!ret) { hdr.count = fill.cur; + hdr.flags = fill.flags; + } reset_info_exit: if (copy_to_user(arg, &hdr, minsz)) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ee120d2d530b..382a7b119c7c 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -114,6 +114,9 @@ struct vfio_device_ops { }; #if IS_ENABLED(CONFIG_IOMMUFD) +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev); +int vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, + struct iommufd_ctx *ictx); int vfio_iommufd_physical_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_physical_unbind(struct vfio_device *vdev); @@ -123,6 +126,19 @@ int vfio_iommufd_emulated_bind(struct vfio_device *vdev, void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); #else +static inline struct iommufd_ctx * +vfio_iommufd_device_ictx(struct vfio_device *vdev) +{ + return NULL; +} + +static inline int +vfio_iommufd_device_hot_reset_devid(struct vfio_device *vdev, + struct iommufd_ctx *ictx) +{ + return VFIO_PCI_DEVID_NOT_OWNED; +} + #define vfio_iommufd_physical_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0552e8dcf0cb..70cc31e6b1ce 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -650,11 +650,57 @@ enum { * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) * + * This command is used to query the affected devices in the hot reset for + * a given device. + * + * This command always reports the segment, bus, and devfn information for + * each affected device, and selectively reports the group_id or devid per + * the way how the calling device is opened. + * + * - If the calling device is opened via the traditional group/container + * API, group_id is reported. User should check if it has owned all + * the affected devices and provides a set of group fds to prove the + * ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl. + * + * - If the calling device is opened as a cdev, devid is reported. + * Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this + * data type. All the affected devices should be represented in + * the dev_set, ex. bound to a vfio driver, and also be owned by + * this interface which is determined by the following conditions: + * 1) Has a valid devid within the iommufd_ctx of the calling device. + * Ownership cannot be determined across separate iommufd_ctx and + * the cdev calling conventions do not support a proof-of-ownership + * model as provided in the legacy group interface. In this case + * valid devid with value greater than zero is provided in the return + * structure. + * 2) Does not have a valid devid within the iommufd_ctx of the calling + * device, but belongs to the same IOMMU group as the calling device + * or another opened device that has a valid devid within the + * iommufd_ctx of the calling device. This provides implicit ownership + * for devices within the same DMA isolation context. In this case + * the devid value of VFIO_PCI_DEVID_OWNED is provided in the return + * structure. + * + * A devid value of VFIO_PCI_DEVID_NOT_OWNED is provided in the return + * structure for affected devices where device is NOT represented in the + * dev_set or ownership is not available. Such devices prevent the use + * of VFIO_DEVICE_PCI_HOT_RESET ioctl outside of the proof-of-ownership + * calling conventions (ie. via legacy group accessed devices). Flag + * VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the + * affected devices are represented in the dev_set and also owned by + * the user. This flag is available only when + * flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved. + * * Return: 0 on success, -errno on failure: * -enospc = insufficient buffer, -enodev = unsupported for device. */ struct vfio_pci_dependent_device { - __u32 group_id; + union { + __u32 group_id; + __u32 devid; +#define VFIO_PCI_DEVID_OWNED 0 +#define VFIO_PCI_DEVID_NOT_OWNED -1 + }; __u16 segment; __u8 bus; __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ @@ -663,6 +709,8 @@ struct vfio_pci_dependent_device { struct vfio_pci_hot_reset_info { __u32 argsz; __u32 flags; +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID (1 << 0) +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED (1 << 1) __u32 count; struct vfio_pci_dependent_device devices[]; };
This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx of the cdev device to check the ownership of the other affected devices. When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate the values returned are IOMMUFD devids rather than group IDs as used when accessing vfio devices through the conventional vfio group interface. Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported in this mode if all of the devices affected by the hot-reset are owned by either virtue of being directly bound to the same iommufd context as the calling device, or implicitly owned via a shared IOMMU group. Suggested-by: Jason Gunthorpe <jgg@nvidia.com> Suggested-by: Alex Williamson <alex.williamson@redhat.com> Signed-off-by: Yi Liu <yi.l.liu@intel.com> --- drivers/vfio/iommufd.c | 49 +++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 47 +++++++++++++++++++++++++----- include/linux/vfio.h | 16 ++++++++++ include/uapi/linux/vfio.h | 50 +++++++++++++++++++++++++++++++- 4 files changed, 154 insertions(+), 8 deletions(-)