@@ -174,7 +174,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
}
struct vfio_pci_group_info;
-static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
+static void vfio_pci_core_try_reset(struct vfio_pci_core_device *vdev);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
struct vfio_pci_group_info *groups);
@@ -667,7 +667,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
out:
pci_disable_device(pdev);
- vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
+ vfio_pci_core_try_reset(vdev);
/* Put the pm-runtime usage counter acquired during enable */
if (!disable_idle_d3)
@@ -2483,14 +2483,18 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
return ret;
}
-static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
+static bool vfio_pci_core_needs_reset(struct vfio_pci_core_device *vdev)
{
+ struct vfio_device_set *dev_set = vdev->vdev.dev_set;
struct vfio_pci_core_device *cur;
bool needs_reset = false;
+ if (WARN_ON(vdev->vdev.open_count > 1))
+ return false;
+
list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
- /* No VFIO device in the set can have an open device FD */
- if (cur->vdev.open_count)
+ /* Only the VFIO device being reset can have an open FD */
+ if (cur != vdev && cur->vdev.open_count)
return false;
needs_reset |= cur->needs_reset;
}
@@ -2498,19 +2502,20 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
}
/*
- * If a bus or slot reset is available for the provided dev_set and:
+ * If a bus or slot reset is available for the provided device and:
* - All of the devices affected by that bus or slot reset are unused
* - At least one of the affected devices is marked dirty via
* needs_reset (such as by lack of FLR support)
* Then attempt to perform that bus or slot reset.
*/
-static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
+static void vfio_pci_core_try_reset(struct vfio_pci_core_device *vdev)
{
+ struct vfio_device_set *dev_set = vdev->vdev.dev_set;
struct vfio_pci_core_device *cur;
struct pci_dev *pdev;
bool reset_done = false;
- if (!vfio_pci_dev_set_needs_reset(dev_set))
+ if (!vfio_pci_core_needs_reset(vdev))
return;
pdev = vfio_pci_dev_set_resettable(dev_set);
The implementation of vfio_pci_core_disable() inspects the open_count of
every device in the device set to determine whether a reset is needed.
This count is always non-zero for the device being disabled, effectively
disabling the reset logic.

After commit 2cd8b14aaa66 ("vfio/pci: Move to the device set
infrastructure"), failure to create a new file for a device would cause
the reset to be skipped due to open_count being decremented after
calling close_device() in the error path.

After commit eadd86f835c6 ("vfio: Remove calls to
vfio_group_add_container_user()"), releasing a device would always skip
the reset due to an ordering change in vfio_device_fops_release().

Failing to reset the device leaves it in an unknown state, potentially
causing errors when it is bound to a different driver.

This issue was observed with a Radeon RX Vega 56 [1002:687f] (rev c3)
assigned to a Windows guest. After shutting down the guest, unbinding
the device from vfio-pci, and binding the device to amdgpu:

[ 548.007102] [drm:psp_hw_start [amdgpu]] *ERROR* PSP create ring failed!
[ 548.027174] [drm:psp_hw_init [amdgpu]] *ERROR* PSP firmware loading failed
[ 548.027242] [drm:amdgpu_device_fw_loading [amdgpu]] *ERROR* hw_init of IP block <psp> failed -22
[ 548.027306] amdgpu 0000:0a:00.0: amdgpu: amdgpu_device_ip_init failed
[ 548.027308] amdgpu 0000:0a:00.0: amdgpu: Fatal error during GPU init

Fixes: 2cd8b14aaa66 ("vfio/pci: Move to the device set infrastructure")
Fixes: eadd86f835c6 ("vfio: Remove calls to vfio_group_add_container_user()")
Signed-off-by: Anthony DeRossi <ajderossi@gmail.com>
---
v2 -> v3:
- Added WARN_ON()
- Revised commit message
v2: https://lore.kernel.org/kvm/20221026194245.1769-1-ajderossi@gmail.com/
v1 -> v2:
- Changed reset behavior instead of open_count ordering
- Retitled from "vfio: Decrement open_count before close_device()"
v1: https://lore.kernel.org/kvm/20221025193820.4412-1-ajderossi@gmail.com/

 drivers/vfio/pci/vfio_pci_core.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)