diff mbox series

[v5,04/14] vpci: cancel pending map/unmap on vpci removal

Message ID 20211125110251.2877218-5-andr2000@gmail.com (mailing list archive)
State New, archived
Headers show
Series PCI devices passthrough on Arm, part 3 | expand

Commit Message

Oleksandr Andrushchenko Nov. 25, 2021, 11:02 a.m. UTC
From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>

When a vPCI is removed for a PCI device it is possible that we have
scheduled a delayed work for map/unmap operations for that device.
For example, the following scenario can illustrate the problem:

pci_physdev_op
   pci_add_device
       init_bars -> modify_bars -> defer_map -> raise_softirq(SCHEDULE_SOFTIRQ)
   iommu_add_device <- FAILS
   vpci_remove_device -> xfree(pdev->vpci)

leave_hypervisor_to_guest
   vpci_process_pending: v->vpci.mem != NULL; v->vpci.pdev->vpci == NULL

For the hardware domain we continue execution as the worst that
could happen is that MMIO mappings are left in place when the
device has been deassigned.

For unprivileged domains that get a failure in the middle of a vPCI
{un}map operation we need to destroy them, as we don't know in which
state the p2m is. This can only happen in vpci_process_pending for
DomUs as they won't be allowed to call pci_add_device.

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>

---
Cc: Roger Pau Monné <roger.pau@citrix.com>
---
Since v4:
 - crash guest domain if map/unmap operation didn't succeed
 - re-work vpci cancel work to cancel work on all vCPUs
 - use new locking scheme with pdev->vpci_lock
New in v4

Fixes: 86dbcf6e30cb ("vpci: cancel pending map/unmap on vpci removal")

---

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
---
 xen/drivers/vpci/header.c | 49 ++++++++++++++++++++++++++++++---------
 xen/drivers/vpci/vpci.c   |  2 ++
 xen/include/xen/pci.h     |  5 ++++
 xen/include/xen/vpci.h    |  6 +++++
 4 files changed, 51 insertions(+), 11 deletions(-)

Comments

Roger Pau Monné Jan. 11, 2022, 4:57 p.m. UTC | #1
On Thu, Nov 25, 2021 at 01:02:41PM +0200, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> When a vPCI is removed for a PCI device it is possible that we have
> scheduled a delayed work for map/unmap operations for that device.
> For example, the following scenario can illustrate the problem:
> 
> pci_physdev_op
>    pci_add_device
>        init_bars -> modify_bars -> defer_map -> raise_softirq(SCHEDULE_SOFTIRQ)
>    iommu_add_device <- FAILS
>    vpci_remove_device -> xfree(pdev->vpci)
> 
> leave_hypervisor_to_guest
>    vpci_process_pending: v->vpci.mem != NULL; v->vpci.pdev->vpci == NULL
> 
> For the hardware domain we continue execution as the worse that
> could happen is that MMIO mappings are left in place when the
> device has been deassigned.
> 
> For unprivileged domains that get a failure in the middle of a vPCI
> {un}map operation we need to destroy them, as we don't know in which
> state the p2m is. This can only happen in vpci_process_pending for
> DomUs as they won't be allowed to call pci_add_device.
> 
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> ---
> Cc: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Since v4:
>  - crash guest domain if map/unmap operation didn't succeed
>  - re-work vpci cancel work to cancel work on all vCPUs
>  - use new locking scheme with pdev->vpci_lock
> New in v4
> 
> Fixes: 86dbcf6e30cb ("vpci: cancel pending map/unmap on vpci removal")
> 
> ---
> 
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> ---
>  xen/drivers/vpci/header.c | 49 ++++++++++++++++++++++++++++++---------
>  xen/drivers/vpci/vpci.c   |  2 ++
>  xen/include/xen/pci.h     |  5 ++++
>  xen/include/xen/vpci.h    |  6 +++++
>  4 files changed, 51 insertions(+), 11 deletions(-)
> 
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bd23c0274d48..ba333fb2f9b0 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -131,7 +131,13 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>  
>  bool vpci_process_pending(struct vcpu *v)
>  {
> -    if ( v->vpci.mem )
> +    struct pci_dev *pdev = v->vpci.pdev;
> +
> +    if ( !pdev )
> +        return false;
> +
> +    spin_lock(&pdev->vpci_lock);
> +    if ( !pdev->vpci_cancel_pending && v->vpci.mem )

Could you just check for pdev->vpci != NULL instead of having to add a
new vpci_cancel_pending field?

I also have a suggestion below which could make the code here simpler.

>      {
>          struct map_data data = {
>              .d = v->domain,
> @@ -140,32 +146,53 @@ bool vpci_process_pending(struct vcpu *v)
>          int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>  
>          if ( rc == -ERESTART )
> +        {
> +            spin_unlock(&pdev->vpci_lock);
>              return true;
> +        }
>  
> -        spin_lock(&v->vpci.pdev->vpci_lock);
> -        if ( v->vpci.pdev->vpci )
> +        if ( pdev->vpci )
>              /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(v->vpci.pdev,
> +            modify_decoding(pdev,
>                              rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
>                              !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci_lock);
>  
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
>          if ( rc )
> +        {
>              /*
>               * FIXME: in case of failure remove the device from the domain.
>               * Note that there might still be leftover mappings. While this is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> +             * safe for Dom0, for DomUs the domain needs to be killed in order
> +             * to avoid leaking stale p2m mappings on failure.
>               */
> -            vpci_remove_device(v->vpci.pdev);
> +            if ( is_hardware_domain(v->domain) )
> +                vpci_remove_device_locked(pdev);
> +            else
> +                domain_crash(v->domain);
> +        }
>      }
> +    spin_unlock(&pdev->vpci_lock);
>  
>      return false;
>  }
>  
> +void vpci_cancel_pending_locked(struct pci_dev *pdev)
> +{
> +    struct vcpu *v;
> +
> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
> +
> +    /* Cancel any pending work now on all vCPUs. */
> +    for_each_vcpu( pdev->domain, v )
> +    {
> +        if ( v->vpci.mem && (v->vpci.pdev == pdev) )

I'm unsure this is correct. You are protecting the access to
v->vpci.pdev with an expectation that v->vpci.pdev->vpci_lock is being
held.

I wonder if it would be better to just pause all the domain vCPUs and
then perform the cleaning of any pending operations. That would assure
that there are no changes to v->vpci. vpci_cancel_pending_locked
shouldn't be a frequent operation, so the overhead of pausing all
domain vCPUs here is likely fine.

Thanks, Roger.
Jan Beulich Jan. 12, 2022, 3:27 p.m. UTC | #2
On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> When a vPCI is removed for a PCI device it is possible that we have
> scheduled a delayed work for map/unmap operations for that device.
> For example, the following scenario can illustrate the problem:
> 
> pci_physdev_op
>    pci_add_device
>        init_bars -> modify_bars -> defer_map -> raise_softirq(SCHEDULE_SOFTIRQ)
>    iommu_add_device <- FAILS
>    vpci_remove_device -> xfree(pdev->vpci)
> 
> leave_hypervisor_to_guest
>    vpci_process_pending: v->vpci.mem != NULL; v->vpci.pdev->vpci == NULL
> 
> For the hardware domain we continue execution as the worse that
> could happen is that MMIO mappings are left in place when the
> device has been deassigned.
> 
> For unprivileged domains that get a failure in the middle of a vPCI
> {un}map operation we need to destroy them, as we don't know in which
> state the p2m is. This can only happen in vpci_process_pending for
> DomUs as they won't be allowed to call pci_add_device.
> 
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> 
> ---
> Cc: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Since v4:
>  - crash guest domain if map/unmap operation didn't succeed
>  - re-work vpci cancel work to cancel work on all vCPUs
>  - use new locking scheme with pdev->vpci_lock
> New in v4
> 
> Fixes: 86dbcf6e30cb ("vpci: cancel pending map/unmap on vpci removal")

What is this about?

Jan
Oleksandr Andrushchenko Jan. 28, 2022, 12:21 p.m. UTC | #3
On 12.01.22 17:27, Jan Beulich wrote:
> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>
>> When a vPCI is removed for a PCI device it is possible that we have
>> scheduled a delayed work for map/unmap operations for that device.
>> For example, the following scenario can illustrate the problem:
>>
>> pci_physdev_op
>>     pci_add_device
>>         init_bars -> modify_bars -> defer_map -> raise_softirq(SCHEDULE_SOFTIRQ)
>>     iommu_add_device <- FAILS
>>     vpci_remove_device -> xfree(pdev->vpci)
>>
>> leave_hypervisor_to_guest
>>     vpci_process_pending: v->vpci.mem != NULL; v->vpci.pdev->vpci == NULL
>>
>> For the hardware domain we continue execution as the worse that
>> could happen is that MMIO mappings are left in place when the
>> device has been deassigned.
>>
>> For unprivileged domains that get a failure in the middle of a vPCI
>> {un}map operation we need to destroy them, as we don't know in which
>> state the p2m is. This can only happen in vpci_process_pending for
>> DomUs as they won't be allowed to call pci_add_device.
>>
>> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>>
>> ---
>> Cc: Roger Pau Monné <roger.pau@citrix.com>
>> ---
>> Since v4:
>>   - crash guest domain if map/unmap operation didn't succeed
>>   - re-work vpci cancel work to cancel work on all vCPUs
>>   - use new locking scheme with pdev->vpci_lock
>> New in v4
>>
>> Fixes: 86dbcf6e30cb ("vpci: cancel pending map/unmap on vpci removal")
> What is this about?
Just a leftover after squashing WIP patches, sorry
>
> Jan
>
Thank you,
Oleksandr
Oleksandr Andrushchenko Jan. 31, 2022, 7:53 a.m. UTC | #4
I am going to postpone this patch as it does need major re-thinking
on the approach to take. This is possible as it fixes a really rare
use-case seen during development phase, so shouldn't hurt the run-time

Thank you,
Oleksandr

On 25.11.21 13:02, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>
> When a vPCI is removed for a PCI device it is possible that we have
> scheduled a delayed work for map/unmap operations for that device.
> For example, the following scenario can illustrate the problem:
>
> pci_physdev_op
>     pci_add_device
>         init_bars -> modify_bars -> defer_map -> raise_softirq(SCHEDULE_SOFTIRQ)
>     iommu_add_device <- FAILS
>     vpci_remove_device -> xfree(pdev->vpci)
>
> leave_hypervisor_to_guest
>     vpci_process_pending: v->vpci.mem != NULL; v->vpci.pdev->vpci == NULL
>
> For the hardware domain we continue execution as the worse that
> could happen is that MMIO mappings are left in place when the
> device has been deassigned.
>
> For unprivileged domains that get a failure in the middle of a vPCI
> {un}map operation we need to destroy them, as we don't know in which
> state the p2m is. This can only happen in vpci_process_pending for
> DomUs as they won't be allowed to call pci_add_device.
>
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
>
> ---
> Cc: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Since v4:
>   - crash guest domain if map/unmap operation didn't succeed
>   - re-work vpci cancel work to cancel work on all vCPUs
>   - use new locking scheme with pdev->vpci_lock
> New in v4
>
> Fixes: 86dbcf6e30cb ("vpci: cancel pending map/unmap on vpci removal")
>
> ---
>
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
> ---
>   xen/drivers/vpci/header.c | 49 ++++++++++++++++++++++++++++++---------
>   xen/drivers/vpci/vpci.c   |  2 ++
>   xen/include/xen/pci.h     |  5 ++++
>   xen/include/xen/vpci.h    |  6 +++++
>   4 files changed, 51 insertions(+), 11 deletions(-)
>
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index bd23c0274d48..ba333fb2f9b0 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -131,7 +131,13 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
>   
>   bool vpci_process_pending(struct vcpu *v)
>   {
> -    if ( v->vpci.mem )
> +    struct pci_dev *pdev = v->vpci.pdev;
> +
> +    if ( !pdev )
> +        return false;
> +
> +    spin_lock(&pdev->vpci_lock);
> +    if ( !pdev->vpci_cancel_pending && v->vpci.mem )
>       {
>           struct map_data data = {
>               .d = v->domain,
> @@ -140,32 +146,53 @@ bool vpci_process_pending(struct vcpu *v)
>           int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
>   
>           if ( rc == -ERESTART )
> +        {
> +            spin_unlock(&pdev->vpci_lock);
>               return true;
> +        }
>   
> -        spin_lock(&v->vpci.pdev->vpci_lock);
> -        if ( v->vpci.pdev->vpci )
> +        if ( pdev->vpci )
>               /* Disable memory decoding unconditionally on failure. */
> -            modify_decoding(v->vpci.pdev,
> +            modify_decoding(pdev,
>                               rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
>                               !rc && v->vpci.rom_only);
> -        spin_unlock(&v->vpci.pdev->vpci_lock);
>   
> -        rangeset_destroy(v->vpci.mem);
> -        v->vpci.mem = NULL;
>           if ( rc )
> +        {
>               /*
>                * FIXME: in case of failure remove the device from the domain.
>                * Note that there might still be leftover mappings. While this is
> -             * safe for Dom0, for DomUs the domain will likely need to be
> -             * killed in order to avoid leaking stale p2m mappings on
> -             * failure.
> +             * safe for Dom0, for DomUs the domain needs to be killed in order
> +             * to avoid leaking stale p2m mappings on failure.
>                */
> -            vpci_remove_device(v->vpci.pdev);
> +            if ( is_hardware_domain(v->domain) )
> +                vpci_remove_device_locked(pdev);
> +            else
> +                domain_crash(v->domain);
> +        }
>       }
> +    spin_unlock(&pdev->vpci_lock);
>   
>       return false;
>   }
>   
> +void vpci_cancel_pending_locked(struct pci_dev *pdev)
> +{
> +    struct vcpu *v;
> +
> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
> +
> +    /* Cancel any pending work now on all vCPUs. */
> +    for_each_vcpu( pdev->domain, v )
> +    {
> +        if ( v->vpci.mem && (v->vpci.pdev == pdev) )
> +        {
> +            rangeset_destroy(v->vpci.mem);
> +            v->vpci.mem = NULL;
> +        }
> +    }
> +}
> +
>   static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
>                               struct rangeset *mem, uint16_t cmd)
>   {
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index ceaac4516ff8..37103e207635 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -54,7 +54,9 @@ void vpci_remove_device_locked(struct pci_dev *pdev)
>   {
>       ASSERT(spin_is_locked(&pdev->vpci_lock));
>   
> +    pdev->vpci_cancel_pending = true;
>       vpci_remove_device_handlers_locked(pdev);
> +    vpci_cancel_pending_locked(pdev);
>       xfree(pdev->vpci->msix);
>       xfree(pdev->vpci->msi);
>       xfree(pdev->vpci);
> diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
> index 3f60d6c6c6dd..52d302ac5f35 100644
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -135,6 +135,11 @@ struct pci_dev {
>   
>       /* Data for vPCI. */
>       spinlock_t vpci_lock;
> +    /*
> +     * Set if PCI device is being removed now and we need to cancel any
> +     * pending map/unmap operations.
> +     */
> +    bool vpci_cancel_pending;
>       struct vpci *vpci;
>   };
>   
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index 8b22bdef11d0..cfff87e5801e 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -57,6 +57,7 @@ uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
>    * should not run.
>    */
>   bool __must_check vpci_process_pending(struct vcpu *v);
> +void vpci_cancel_pending_locked(struct pci_dev *pdev);
>   
>   struct vpci {
>       /* List of vPCI handlers for a device. */
> @@ -253,6 +254,11 @@ static inline bool __must_check vpci_process_pending(struct vcpu *v)
>       ASSERT_UNREACHABLE();
>       return false;
>   }
> +
> +static inline void vpci_cancel_pending_locked(struct pci_dev *pdev)
> +{
> +    ASSERT_UNREACHABLE();
> +}
>   #endif
>   
>   #endif
diff mbox series

Patch

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index bd23c0274d48..ba333fb2f9b0 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -131,7 +131,13 @@  static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
 
 bool vpci_process_pending(struct vcpu *v)
 {
-    if ( v->vpci.mem )
+    struct pci_dev *pdev = v->vpci.pdev;
+
+    if ( !pdev )
+        return false;
+
+    spin_lock(&pdev->vpci_lock);
+    if ( !pdev->vpci_cancel_pending && v->vpci.mem )
     {
         struct map_data data = {
             .d = v->domain,
@@ -140,32 +146,53 @@  bool vpci_process_pending(struct vcpu *v)
         int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
 
         if ( rc == -ERESTART )
+        {
+            spin_unlock(&pdev->vpci_lock);
             return true;
+        }
 
-        spin_lock(&v->vpci.pdev->vpci_lock);
-        if ( v->vpci.pdev->vpci )
+        if ( pdev->vpci )
             /* Disable memory decoding unconditionally on failure. */
-            modify_decoding(v->vpci.pdev,
+            modify_decoding(pdev,
                             rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
                             !rc && v->vpci.rom_only);
-        spin_unlock(&v->vpci.pdev->vpci_lock);
 
-        rangeset_destroy(v->vpci.mem);
-        v->vpci.mem = NULL;
         if ( rc )
+        {
             /*
              * FIXME: in case of failure remove the device from the domain.
              * Note that there might still be leftover mappings. While this is
-             * safe for Dom0, for DomUs the domain will likely need to be
-             * killed in order to avoid leaking stale p2m mappings on
-             * failure.
+             * safe for Dom0, for DomUs the domain needs to be killed in order
+             * to avoid leaking stale p2m mappings on failure.
              */
-            vpci_remove_device(v->vpci.pdev);
+            if ( is_hardware_domain(v->domain) )
+                vpci_remove_device_locked(pdev);
+            else
+                domain_crash(v->domain);
+        }
     }
+    spin_unlock(&pdev->vpci_lock);
 
     return false;
 }
 
+void vpci_cancel_pending_locked(struct pci_dev *pdev)
+{
+    struct vcpu *v;
+
+    ASSERT(spin_is_locked(&pdev->vpci_lock));
+
+    /* Cancel any pending work now on all vCPUs. */
+    for_each_vcpu( pdev->domain, v )
+    {
+        if ( v->vpci.mem && (v->vpci.pdev == pdev) )
+        {
+            rangeset_destroy(v->vpci.mem);
+            v->vpci.mem = NULL;
+        }
+    }
+}
+
 static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
                             struct rangeset *mem, uint16_t cmd)
 {
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index ceaac4516ff8..37103e207635 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -54,7 +54,9 @@  void vpci_remove_device_locked(struct pci_dev *pdev)
 {
     ASSERT(spin_is_locked(&pdev->vpci_lock));
 
+    pdev->vpci_cancel_pending = true;
     vpci_remove_device_handlers_locked(pdev);
+    vpci_cancel_pending_locked(pdev);
     xfree(pdev->vpci->msix);
     xfree(pdev->vpci->msi);
     xfree(pdev->vpci);
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 3f60d6c6c6dd..52d302ac5f35 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -135,6 +135,11 @@  struct pci_dev {
 
     /* Data for vPCI. */
     spinlock_t vpci_lock;
+    /*
+     * Set if PCI device is being removed now and we need to cancel any
+     * pending map/unmap operations.
+     */
+    bool vpci_cancel_pending;
     struct vpci *vpci;
 };
 
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 8b22bdef11d0..cfff87e5801e 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -57,6 +57,7 @@  uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
  * should not run.
  */
 bool __must_check vpci_process_pending(struct vcpu *v);
+void vpci_cancel_pending_locked(struct pci_dev *pdev);
 
 struct vpci {
     /* List of vPCI handlers for a device. */
@@ -253,6 +254,11 @@  static inline bool __must_check vpci_process_pending(struct vcpu *v)
     ASSERT_UNREACHABLE();
     return false;
 }
+
+static inline void vpci_cancel_pending_locked(struct pci_dev *pdev)
+{
+    ASSERT_UNREACHABLE();
+}
 #endif
 
 #endif