diff mbox series

[RFC,01/10] xen: pci: add per-domain pci list lock

Message ID 20220831141040.13231-2-volodymyr_babchuk@epam.com (mailing list archive)
State New, archived
Headers show
Series Rework PCI locking | expand

Commit Message

Volodymyr Babchuk Aug. 31, 2022, 2:10 p.m. UTC
domain->pdevs_lock protects access to domain->pdev_list.
As this, it should be used when we are adding, removing on enumerating
PCI devices assigned to a domain.

This enables more granular locking instead of one huge pcidevs_lock that
locks entire PCI subsystem. Please note that pcidevs_lock() is still
used, we are going to remove it in subsequent patches.

Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
---
 xen/common/domain.c                         |  1 +
 xen/drivers/passthrough/amd/iommu_cmd.c     |  4 ++-
 xen/drivers/passthrough/amd/pci_amd_iommu.c |  7 ++++-
 xen/drivers/passthrough/pci.c               | 29 ++++++++++++++++++++-
 xen/drivers/passthrough/vtd/iommu.c         |  9 +++++--
 xen/drivers/vpci/header.c                   |  3 +++
 xen/drivers/vpci/msi.c                      |  7 ++++-
 xen/drivers/vpci/vpci.c                     |  4 +--
 xen/include/xen/pci.h                       |  2 +-
 xen/include/xen/sched.h                     |  1 +
 10 files changed, 58 insertions(+), 9 deletions(-)

Comments

Stefano Stabellini Jan. 26, 2023, 11:18 p.m. UTC | #1
On Wed, 31 Aug 2022, Volodymyr Babchuk wrote:
> domain->pdevs_lock protects access to domain->pdev_list.
> As this, it should be used when we are adding, removing on enumerating
> PCI devices assigned to a domain.
> 
> This enables more granular locking instead of one huge pcidevs_lock that
> locks entire PCI subsystem. Please note that pcidevs_lock() is still
> used, we are going to remove it in subsequent patches.
> 
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>

I reviewed the patch, and made sure to pay extra attention to:
- error paths
- missing locks
- lock ordering
- interruptions

Here is what I found:


1) iommu.c:reassign_device_ownership and pci_amd_iommu.c:reassign_device
Both functions without any pdevs_lock locking do:
list_move(&pdev->domain_list, &target->pdev_list);

It seems to be it would need pdevs_lock. Maybe we need to change
list_move into list_del (protected by the pdevs_lock of the old domain)
and list_add (protected by the pdev_lock of the new domain).


2) has_arch_pdevs
has_arch_pdevs is implemented as list_empty and needs locking as well,
however no domain->pdevs_lock are added to protect has_arch_pdevs in
this patch. I think we need pdevs_lock around has_arch_pdevs.


Two more comments below about lock inversion and taking the same lock
twice



> ---
>  xen/common/domain.c                         |  1 +
>  xen/drivers/passthrough/amd/iommu_cmd.c     |  4 ++-
>  xen/drivers/passthrough/amd/pci_amd_iommu.c |  7 ++++-
>  xen/drivers/passthrough/pci.c               | 29 ++++++++++++++++++++-
>  xen/drivers/passthrough/vtd/iommu.c         |  9 +++++--
>  xen/drivers/vpci/header.c                   |  3 +++
>  xen/drivers/vpci/msi.c                      |  7 ++++-
>  xen/drivers/vpci/vpci.c                     |  4 +--
>  xen/include/xen/pci.h                       |  2 +-
>  xen/include/xen/sched.h                     |  1 +
>  10 files changed, 58 insertions(+), 9 deletions(-)
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7062393e37..4611141b87 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -618,6 +618,7 @@ struct domain *domain_create(domid_t domid,
>  
>  #ifdef CONFIG_HAS_PCI
>      INIT_LIST_HEAD(&d->pdev_list);
> +    spin_lock_init(&d->pdevs_lock);
>  #endif
>  
>      /* All error paths can depend on the above setup. */
> diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c
> index 40ddf366bb..47c45398d4 100644
> --- a/xen/drivers/passthrough/amd/iommu_cmd.c
> +++ b/xen/drivers/passthrough/amd/iommu_cmd.c
> @@ -308,11 +308,12 @@ void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev,
>      flush_command_buffer(iommu, iommu_dev_iotlb_timeout);
>  }
>  
> -static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
> +static void amd_iommu_flush_all_iotlbs(struct domain *d, daddr_t daddr,
>                                         unsigned int order)
>  {
>      struct pci_dev *pdev;
>  
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev( d, pdev )
>      {
>          u8 devfn = pdev->devfn;
> @@ -323,6 +324,7 @@ static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
>          } while ( devfn != pdev->devfn &&
>                    PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
>      }
> +    spin_unlock(&d->pdevs_lock);
>  }
>  
>  /* Flush iommu cache after p2m changes. */
> diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> index 4ba8e764b2..64c016491d 100644
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -96,20 +96,25 @@ static int __must_check allocate_domain_resources(struct domain *d)
>      return rc;
>  }
>  
> -static bool any_pdev_behind_iommu(const struct domain *d,
> +static bool any_pdev_behind_iommu(struct domain *d,
>                                    const struct pci_dev *exclude,
>                                    const struct amd_iommu *iommu)
>  {
>      const struct pci_dev *pdev;
>  
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev ( d, pdev )
>      {
>          if ( pdev == exclude )
>              continue;
>  
>          if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu )
> +	{
> +	    spin_unlock(&d->pdevs_lock);
>              return true;
> +	}

code style: tabs instead of spaces


>      }
> +    spin_unlock(&d->pdevs_lock);
>  
>      return false;
>  }
> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
> index cdaf5c247f..4366f8f965 100644
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -523,7 +523,9 @@ static void __init _pci_hide_device(struct pci_dev *pdev)
>      if ( pdev->domain )
>          return;
>      pdev->domain = dom_xen;
> +    spin_lock(&dom_xen->pdevs_lock);
>      list_add(&pdev->domain_list, &dom_xen->pdev_list);
> +    spin_unlock(&dom_xen->pdevs_lock);
>  }
>  
>  int __init pci_hide_device(unsigned int seg, unsigned int bus,
> @@ -595,7 +597,7 @@ struct pci_dev *pci_get_real_pdev(pci_sbdf_t sbdf)
>      return pdev;
>  }
>  
> -struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf)
> +struct pci_dev *pci_get_pdev(struct domain *d, pci_sbdf_t sbdf)
>  {
>      struct pci_dev *pdev;
>  
> @@ -620,9 +622,16 @@ struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf)
>                  return pdev;
>      }
>      else
> +    {
> +        spin_lock(&d->pdevs_lock);
>          list_for_each_entry ( pdev, &d->pdev_list, domain_list )
>              if ( pdev->sbdf.bdf == sbdf.bdf )
> +            {
> +                spin_unlock(&d->pdevs_lock);
>                  return pdev;
> +            }
> +        spin_unlock(&d->pdevs_lock);
> +    }
>  
>      return NULL;
>  }
> @@ -817,7 +826,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>      if ( !pdev->domain )
>      {
>          pdev->domain = hardware_domain;
> +        spin_lock(&hardware_domain->pdevs_lock);
>          list_add(&pdev->domain_list, &hardware_domain->pdev_list);
> +        spin_unlock(&hardware_domain->pdevs_lock);
>  
>          /*
>           * For devices not discovered by Xen during boot, add vPCI handlers
> @@ -827,7 +838,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>          if ( ret )
>          {
>              printk(XENLOG_ERR "Setup of vPCI failed: %d\n", ret);
> +            spin_lock(&pdev->domain->pdevs_lock);
>              list_del(&pdev->domain_list);
> +            spin_unlock(&pdev->domain->pdevs_lock);
>              pdev->domain = NULL;
>              goto out;
>          }
> @@ -835,7 +848,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>          if ( ret )
>          {
>              vpci_remove_device(pdev);
> +            spin_lock(&pdev->domain->pdevs_lock);
>              list_del(&pdev->domain_list);
> +            spin_unlock(&pdev->domain->pdevs_lock);
>              pdev->domain = NULL;
>              goto out;
>          }
> @@ -885,7 +900,11 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
>              pci_cleanup_msi(pdev);
>              ret = iommu_remove_device(pdev);
>              if ( pdev->domain )
> +            {
> +                spin_lock(&pdev->domain->pdevs_lock);
>                  list_del(&pdev->domain_list);
> +                spin_unlock(&pdev->domain->pdevs_lock);
> +            }
>              printk(XENLOG_DEBUG "PCI remove device %pp\n", &pdev->sbdf);
>              free_pdev(pseg, pdev);
>              break;
> @@ -967,12 +986,14 @@ int pci_release_devices(struct domain *d)
>          pcidevs_unlock();
>          return ret;
>      }
> +    spin_lock(&d->pdevs_lock);
>      list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
>      {
>          bus = pdev->bus;
>          devfn = pdev->devfn;
>          ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;

This causes pdevs_lock to be taken twice. deassign_device also takes a
pdevs_lock.  Probably we need to change all the
spin_lock(&d->pdevs_lock) into spin_lock_recursive.



>      }
> +    spin_unlock(&d->pdevs_lock);
>      pcidevs_unlock();
>  
>      return ret;
> @@ -1194,7 +1215,9 @@ static int __hwdom_init cf_check _setup_hwdom_pci_devices(
>              if ( !pdev->domain )
>              {
>                  pdev->domain = ctxt->d;
> +                spin_lock(&pdev->domain->pdevs_lock);
>                  list_add(&pdev->domain_list, &ctxt->d->pdev_list);
> +                spin_unlock(&pdev->domain->pdevs_lock);
>                  setup_one_hwdom_device(ctxt, pdev);
>              }
>              else if ( pdev->domain == dom_xen )
> @@ -1556,6 +1579,7 @@ static int iommu_get_device_group(
>          return group_id;
>  
>      pcidevs_lock();
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev( d, pdev )
>      {
>          unsigned int b = pdev->bus;
> @@ -1571,6 +1595,7 @@ static int iommu_get_device_group(
>          if ( sdev_id < 0 )
>          {
>              pcidevs_unlock();
> +            spin_unlock(&d->pdevs_lock);

lock inversion


>              return sdev_id;
>          }
>  
> @@ -1581,6 +1606,7 @@ static int iommu_get_device_group(
>              if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
>              {
>                  pcidevs_unlock();
> +                spin_unlock(&d->pdevs_lock);

lock inversion


>                  return -EFAULT;
>              }
>              i++;
> @@ -1588,6 +1614,7 @@ static int iommu_get_device_group(
>      }
>  
>      pcidevs_unlock();
> +    spin_unlock(&d->pdevs_lock);

lock inversion


>      return i;
>  }
> diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
> index 62e143125d..fff1442265 100644
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -183,12 +183,13 @@ static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu)
>      }
>  }
>  
> -static bool any_pdev_behind_iommu(const struct domain *d,
> +static bool any_pdev_behind_iommu(struct domain *d,
>                                    const struct pci_dev *exclude,
>                                    const struct vtd_iommu *iommu)
>  {
>      const struct pci_dev *pdev;
>  
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev ( d, pdev )
>      {
>          const struct acpi_drhd_unit *drhd;
> @@ -198,8 +199,12 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>  
>          drhd = acpi_find_matched_drhd_unit(pdev);
>          if ( drhd && drhd->iommu == iommu )
> +        {
> +            spin_unlock(&d->pdevs_lock);
>              return true;
> +        }
>      }
> +    spin_unlock(&d->pdevs_lock);
>  
>      return false;
>  }
> @@ -208,7 +213,7 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>   * If no other devices under the same iommu owned by this domain,
>   * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap.
>   */
> -static void check_cleanup_domid_map(const struct domain *d,
> +static void check_cleanup_domid_map(struct domain *d,
>                                      const struct pci_dev *exclude,
>                                      struct vtd_iommu *iommu)
>  {
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index a1c928a0d2..a59aa7ad0b 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -267,6 +267,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>       * Check for overlaps with other BARs. Note that only BARs that are
>       * currently mapped (enabled) are checked for overlaps.
>       */
> +    spin_lock(&pdev->domain->pdevs_lock);
>      for_each_pdev ( pdev->domain, tmp )
>      {
>          if ( tmp == pdev )
> @@ -306,11 +307,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
>                  printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>                         start, end, rc);
>                  rangeset_destroy(mem);
> +                spin_unlock( &pdev->domain->pdevs_lock);
>                  return rc;
>              }
>          }
>      }
>  
> +    spin_unlock( &pdev->domain->pdevs_lock);
>      ASSERT(dev);
>  
>      if ( system_state < SYS_STATE_active )
> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> index 8f2b59e61a..8969c335b0 100644
> --- a/xen/drivers/vpci/msi.c
> +++ b/xen/drivers/vpci/msi.c
> @@ -265,7 +265,7 @@ REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
>  
>  void vpci_dump_msi(void)
>  {
> -    const struct domain *d;
> +    struct domain *d;
>  
>      rcu_read_lock(&domlist_read_lock);
>      for_each_domain ( d )
> @@ -277,6 +277,9 @@ void vpci_dump_msi(void)
>  
>          printk("vPCI MSI/MSI-X d%d\n", d->domain_id);
>  
> +        if ( !spin_trylock(&d->pdevs_lock) )
> +            continue;
> +
>          for_each_pdev ( d, pdev )
>          {
>              const struct vpci_msi *msi;
> @@ -326,6 +329,8 @@ void vpci_dump_msi(void)
>              spin_unlock(&pdev->vpci->lock);
>              process_pending_softirqs();
>          }
> +        spin_unlock(&d->pdevs_lock);
> +
>      }
>      rcu_read_unlock(&domlist_read_lock);
>  }
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 3467c0de86..7d1f9fd438 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -312,7 +312,7 @@ static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
>  
>  uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>  {
> -    const struct domain *d = current->domain;
> +    struct domain *d = current->domain;
>      const struct pci_dev *pdev;
>      const struct vpci_register *r;
>      unsigned int data_offset = 0;
> @@ -415,7 +415,7 @@ static void vpci_write_helper(const struct pci_dev *pdev,
>  void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>                  uint32_t data)
>  {
> -    const struct domain *d = current->domain;
> +    struct domain *d = current->domain;
>      const struct pci_dev *pdev;
>      const struct vpci_register *r;
>      unsigned int data_offset = 0;
> diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
> index 5975ca2f30..19047b4b20 100644
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -177,7 +177,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>  int pci_remove_device(u16 seg, u8 bus, u8 devfn);
>  int pci_ro_device(int seg, int bus, int devfn);
>  int pci_hide_device(unsigned int seg, unsigned int bus, unsigned int devfn);
> -struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf);
> +struct pci_dev *pci_get_pdev(struct domain *d, pci_sbdf_t sbdf);
>  struct pci_dev *pci_get_real_pdev(pci_sbdf_t sbdf);
>  void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
>  
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index 1cf629e7ec..0775228ba9 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -457,6 +457,7 @@ struct domain
>  
>  #ifdef CONFIG_HAS_PCI
>      struct list_head pdev_list;
> +    spinlock_t pdevs_lock;

I think it would be better called "pdev_lock" but OK either way


>  #endif
>  
>  #ifdef CONFIG_HAS_PASSTHROUGH
> -- 
> 2.36.1
>
Jan Beulich Jan. 27, 2023, 8:01 a.m. UTC | #2
On 27.01.2023 00:18, Stefano Stabellini wrote:
> On Wed, 31 Aug 2022, Volodymyr Babchuk wrote:
>> @@ -1571,6 +1595,7 @@ static int iommu_get_device_group(
>>          if ( sdev_id < 0 )
>>          {
>>              pcidevs_unlock();
>> +            spin_unlock(&d->pdevs_lock);
> 
> lock inversion
> 
> 
>>              return sdev_id;
>>          }
>>  
>> @@ -1581,6 +1606,7 @@ static int iommu_get_device_group(
>>              if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
>>              {
>>                  pcidevs_unlock();
>> +                spin_unlock(&d->pdevs_lock);
> 
> lock inversion
> 
> 
>>                  return -EFAULT;
>>              }
>>              i++;
>> @@ -1588,6 +1614,7 @@ static int iommu_get_device_group(
>>      }
>>  
>>      pcidevs_unlock();
>> +    spin_unlock(&d->pdevs_lock);
> 
> lock inversion

While from a cosmetic perspective I of course agree that releasing locks
would better be done the opposite order of acquiring whenever possible,
I'd like to point out that lock release alone is never subject to "lock
order" issues. We do have a couple of cases (I think) where we actually
do so because otherwise respective code would end up uglier, or because
we want to limit locking regions as much as possible (I'm sorry, I don't
have an example to hand).

>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -457,6 +457,7 @@ struct domain
>>  
>>  #ifdef CONFIG_HAS_PCI
>>      struct list_head pdev_list;
>> +    spinlock_t pdevs_lock;
> 
> I think it would be better called "pdev_lock" but OK either way

I'm of two minds here: On one hand the lock is to guard all devices, so
plural looks applicable. Otoh the list head field uses singular as well.

Jan
Volodymyr Babchuk Feb. 14, 2023, 11:38 p.m. UTC | #3
Hello Stefano,

Stefano Stabellini <sstabellini@kernel.org> writes:

> On Wed, 31 Aug 2022, Volodymyr Babchuk wrote:
>> domain->pdevs_lock protects access to domain->pdev_list.
>> As this, it should be used when we are adding, removing on enumerating
>> PCI devices assigned to a domain.
>> 
>> This enables more granular locking instead of one huge pcidevs_lock that
>> locks entire PCI subsystem. Please note that pcidevs_lock() is still
>> used, we are going to remove it in subsequent patches.
>> 
>> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
>
> I reviewed the patch, and made sure to pay extra attention to:

Thank you for doing this.

> - error paths
> - missing locks
> - lock ordering
> - interruptions

> Here is what I found:
>
>
> 1) iommu.c:reassign_device_ownership and pci_amd_iommu.c:reassign_device
> Both functions without any pdevs_lock locking do:
> list_move(&pdev->domain_list, &target->pdev_list);
>
> It seems to be it would need pdevs_lock. Maybe we need to change
> list_move into list_del (protected by the pdevs_lock of the old domain)
> and list_add (protected by the pdev_lock of the new domain).

Yes, I did as you suggested. But this leads to another potential
issue. I'll describe it below, in deassign_device() part.

[...]

>> +    spin_lock(&d->pdevs_lock);
>>      list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
>>      {
>>          bus = pdev->bus;
>>          devfn = pdev->devfn;
>>          ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;
>
> This causes pdevs_lock to be taken twice. deassign_device also takes a
> pdevs_lock.  Probably we need to change all the
> spin_lock(&d->pdevs_lock) into spin_lock_recursive.

You are right, I missed that deassign_device() causes
iommu*_reassign_device() call. But there lies the issue: with recursive
locks, reassign_device() will not be able to unlock source->pdevs_lock,
but will try to take target->pdevs_lock also. This potentially might
lead to deadlock, if another call to reassign_device() moves some other
pdev in the opposite way at the same time. This is why I want to avoid
recursive spinlocks if possible.

So, I am thinking: why does IOMMU code move a pdev across domains in the
first place? We are making IOMMU driver responsible of managing domain's
pdevs, which does not look right, as this is the responsibility of pci.c

I want to propose another approach: implement deassign_device() function
in IOMMU drivers. Then, instead of calling to reassign_device() we might
do the following:

1. deassign_device()
2. remove pdev from source->pdev_list
3. add pdef to target->pdev_list
4. assign_device()


[...]

>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index 1cf629e7ec..0775228ba9 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -457,6 +457,7 @@ struct domain
>>  
>>  #ifdef CONFIG_HAS_PCI
>>      struct list_head pdev_list;
>> +    spinlock_t pdevs_lock;
>
> I think it would be better called "pdev_lock" but OK either way

As Jan pointed out, we are locking devices as in plural. On other hand, I can rename
pdev_list to pdevs_lists to make this consistent.

>
>
>>  #endif
>>  
>>  #ifdef CONFIG_HAS_PASSTHROUGH
>> -- 
>> 2.36.1
>>
Jan Beulich Feb. 15, 2023, 9:06 a.m. UTC | #4
On 15.02.2023 00:38, Volodymyr Babchuk wrote:
> Stefano Stabellini <sstabellini@kernel.org> writes:
>> On Wed, 31 Aug 2022, Volodymyr Babchuk wrote:
>> 1) iommu.c:reassign_device_ownership and pci_amd_iommu.c:reassign_device
>> Both functions without any pdevs_lock locking do:
>> list_move(&pdev->domain_list, &target->pdev_list);
>>
>> It seems to be it would need pdevs_lock. Maybe we need to change
>> list_move into list_del (protected by the pdevs_lock of the old domain)
>> and list_add (protected by the pdev_lock of the new domain).
> 
> Yes, I did as you suggested. But this leads to another potential
> issue. I'll describe it below, in deassign_device() part.
> 
> [...]
> 
>>> +    spin_lock(&d->pdevs_lock);
>>>      list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
>>>      {
>>>          bus = pdev->bus;
>>>          devfn = pdev->devfn;
>>>          ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;
>>
>> This causes pdevs_lock to be taken twice. deassign_device also takes a
>> pdevs_lock.  Probably we need to change all the
>> spin_lock(&d->pdevs_lock) into spin_lock_recursive.
> 
> You are right, I missed that deassign_device() causes
> iommu*_reassign_device() call. But there lies the issue: with recursive
> locks, reassign_device() will not be able to unlock source->pdevs_lock,
> but will try to take target->pdevs_lock also. This potentially might
> lead to deadlock, if another call to reassign_device() moves some other
> pdev in the opposite way at the same time. This is why I want to avoid
> recursive spinlocks if possible.
> 
> So, I am thinking: why does IOMMU code move a pdev across domains in the
> first place? We are making IOMMU driver responsible of managing domain's
> pdevs, which does not look right, as this is the responsibility of pci.c

The boundary between what is PCI and what is IOMMU is pretty fuzzy, I'm
afraid. After all pass-through (with IOMMU) is all about PCI devices (on
x86 at least). Despite the filename being pci.c, much what's there is
actually IOMMU code. Specifically deassign_device() is the vendor-
independent IOMMU part of the operation; moving that ...

> I want to propose another approach: implement deassign_device() function
> in IOMMU drivers.

... into vendor-specific code would mean code duplication, which ought
to be avoided as much as possible.

> Then, instead of calling to reassign_device() we might
> do the following:
> 
> 1. deassign_device()
> 2. remove pdev from source->pdev_list
> 3. add pdef to target->pdev_list
> 4. assign_device()

I'm not sure such ordering would end up correct. It may need to be

1. remove pdev from source->pdev_list
2. deassign_device()
3. assign_device()
4. add pdev to target->pdev_list (or back to source upon failure)

which still may be troublesome: The present placement of the moving is, in
particular, ensuring that ownership (expressed by pdev->domain) changes at
the same time as list membership. With things properly locked it _may_ be
okay to separate (in time) these two steps, but that'll require quite a bit
of care (read: justification that it's correct/safe).

And of course you could then also ask why it's low level driver code
changing pdev->domain. I don't see how you would move that to generic code,
as the field shouldn't be left stale for an extended period of time, nor
can it sensibly be transiently set to e.g. NULL.

Additionally deassign_device() is misnamed - it's really re-assigning
the device (as you can see from it invoking
hd->platform_ops->reassign_device()). You cannot split de-assign from
assign: The device always should be assigned _somewhere_ (DOM_IO if
nothing else; see XSA-400), except late enough during hot-unplug. But that
would be taken care of by the alternative ordering above (combining 2 and
3 into a single step).

Jan
diff mbox series

Patch

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 7062393e37..4611141b87 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -618,6 +618,7 @@  struct domain *domain_create(domid_t domid,
 
 #ifdef CONFIG_HAS_PCI
     INIT_LIST_HEAD(&d->pdev_list);
+    spin_lock_init(&d->pdevs_lock);
 #endif
 
     /* All error paths can depend on the above setup. */
diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c
index 40ddf366bb..47c45398d4 100644
--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -308,11 +308,12 @@  void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev,
     flush_command_buffer(iommu, iommu_dev_iotlb_timeout);
 }
 
-static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
+static void amd_iommu_flush_all_iotlbs(struct domain *d, daddr_t daddr,
                                        unsigned int order)
 {
     struct pci_dev *pdev;
 
+    spin_lock(&d->pdevs_lock);
     for_each_pdev( d, pdev )
     {
         u8 devfn = pdev->devfn;
@@ -323,6 +324,7 @@  static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
         } while ( devfn != pdev->devfn &&
                   PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
     }
+    spin_unlock(&d->pdevs_lock);
 }
 
 /* Flush iommu cache after p2m changes. */
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 4ba8e764b2..64c016491d 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -96,20 +96,25 @@  static int __must_check allocate_domain_resources(struct domain *d)
     return rc;
 }
 
-static bool any_pdev_behind_iommu(const struct domain *d,
+static bool any_pdev_behind_iommu(struct domain *d,
                                   const struct pci_dev *exclude,
                                   const struct amd_iommu *iommu)
 {
     const struct pci_dev *pdev;
 
+    spin_lock(&d->pdevs_lock);
     for_each_pdev ( d, pdev )
     {
         if ( pdev == exclude )
             continue;
 
         if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu )
+	{
+	    spin_unlock(&d->pdevs_lock);
             return true;
+	}
     }
+    spin_unlock(&d->pdevs_lock);
 
     return false;
 }
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index cdaf5c247f..4366f8f965 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -523,7 +523,9 @@  static void __init _pci_hide_device(struct pci_dev *pdev)
     if ( pdev->domain )
         return;
     pdev->domain = dom_xen;
+    spin_lock(&dom_xen->pdevs_lock);
     list_add(&pdev->domain_list, &dom_xen->pdev_list);
+    spin_unlock(&dom_xen->pdevs_lock);
 }
 
 int __init pci_hide_device(unsigned int seg, unsigned int bus,
@@ -595,7 +597,7 @@  struct pci_dev *pci_get_real_pdev(pci_sbdf_t sbdf)
     return pdev;
 }
 
-struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf)
+struct pci_dev *pci_get_pdev(struct domain *d, pci_sbdf_t sbdf)
 {
     struct pci_dev *pdev;
 
@@ -620,9 +622,16 @@  struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf)
                 return pdev;
     }
     else
+    {
+        spin_lock(&d->pdevs_lock);
         list_for_each_entry ( pdev, &d->pdev_list, domain_list )
             if ( pdev->sbdf.bdf == sbdf.bdf )
+            {
+                spin_unlock(&d->pdevs_lock);
                 return pdev;
+            }
+        spin_unlock(&d->pdevs_lock);
+    }
 
     return NULL;
 }
@@ -817,7 +826,9 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
     if ( !pdev->domain )
     {
         pdev->domain = hardware_domain;
+        spin_lock(&hardware_domain->pdevs_lock);
         list_add(&pdev->domain_list, &hardware_domain->pdev_list);
+        spin_unlock(&hardware_domain->pdevs_lock);
 
         /*
          * For devices not discovered by Xen during boot, add vPCI handlers
@@ -827,7 +838,9 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
         if ( ret )
         {
             printk(XENLOG_ERR "Setup of vPCI failed: %d\n", ret);
+            spin_lock(&pdev->domain->pdevs_lock);
             list_del(&pdev->domain_list);
+            spin_unlock(&pdev->domain->pdevs_lock);
             pdev->domain = NULL;
             goto out;
         }
@@ -835,7 +848,9 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
         if ( ret )
         {
             vpci_remove_device(pdev);
+            spin_lock(&pdev->domain->pdevs_lock);
             list_del(&pdev->domain_list);
+            spin_unlock(&pdev->domain->pdevs_lock);
             pdev->domain = NULL;
             goto out;
         }
@@ -885,7 +900,11 @@  int pci_remove_device(u16 seg, u8 bus, u8 devfn)
             pci_cleanup_msi(pdev);
             ret = iommu_remove_device(pdev);
             if ( pdev->domain )
+            {
+                spin_lock(&pdev->domain->pdevs_lock);
                 list_del(&pdev->domain_list);
+                spin_unlock(&pdev->domain->pdevs_lock);
+            }
             printk(XENLOG_DEBUG "PCI remove device %pp\n", &pdev->sbdf);
             free_pdev(pseg, pdev);
             break;
@@ -967,12 +986,14 @@  int pci_release_devices(struct domain *d)
         pcidevs_unlock();
         return ret;
     }
+    spin_lock(&d->pdevs_lock);
     list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
     {
         bus = pdev->bus;
         devfn = pdev->devfn;
         ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;
     }
+    spin_unlock(&d->pdevs_lock);
     pcidevs_unlock();
 
     return ret;
@@ -1194,7 +1215,9 @@  static int __hwdom_init cf_check _setup_hwdom_pci_devices(
             if ( !pdev->domain )
             {
                 pdev->domain = ctxt->d;
+                spin_lock(&pdev->domain->pdevs_lock);
                 list_add(&pdev->domain_list, &ctxt->d->pdev_list);
+                spin_unlock(&pdev->domain->pdevs_lock);
                 setup_one_hwdom_device(ctxt, pdev);
             }
             else if ( pdev->domain == dom_xen )
@@ -1556,6 +1579,7 @@  static int iommu_get_device_group(
         return group_id;
 
     pcidevs_lock();
+    spin_lock(&d->pdevs_lock);
     for_each_pdev( d, pdev )
     {
         unsigned int b = pdev->bus;
@@ -1571,6 +1595,7 @@  static int iommu_get_device_group(
         if ( sdev_id < 0 )
         {
             pcidevs_unlock();
+            spin_unlock(&d->pdevs_lock);
             return sdev_id;
         }
 
@@ -1581,6 +1606,7 @@  static int iommu_get_device_group(
             if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
             {
                 pcidevs_unlock();
+                spin_unlock(&d->pdevs_lock);
                 return -EFAULT;
             }
             i++;
@@ -1588,6 +1614,7 @@  static int iommu_get_device_group(
     }
 
     pcidevs_unlock();
+    spin_unlock(&d->pdevs_lock);
 
     return i;
 }
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 62e143125d..fff1442265 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -183,12 +183,13 @@  static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu)
     }
 }
 
-static bool any_pdev_behind_iommu(const struct domain *d,
+static bool any_pdev_behind_iommu(struct domain *d,
                                   const struct pci_dev *exclude,
                                   const struct vtd_iommu *iommu)
 {
     const struct pci_dev *pdev;
 
+    spin_lock(&d->pdevs_lock);
     for_each_pdev ( d, pdev )
     {
         const struct acpi_drhd_unit *drhd;
@@ -198,8 +199,12 @@  static bool any_pdev_behind_iommu(const struct domain *d,
 
         drhd = acpi_find_matched_drhd_unit(pdev);
         if ( drhd && drhd->iommu == iommu )
+        {
+            spin_unlock(&d->pdevs_lock);
             return true;
+        }
     }
+    spin_unlock(&d->pdevs_lock);
 
     return false;
 }
@@ -208,7 +213,7 @@  static bool any_pdev_behind_iommu(const struct domain *d,
  * If no other devices under the same iommu owned by this domain,
  * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap.
  */
-static void check_cleanup_domid_map(const struct domain *d,
+static void check_cleanup_domid_map(struct domain *d,
                                     const struct pci_dev *exclude,
                                     struct vtd_iommu *iommu)
 {
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index a1c928a0d2..a59aa7ad0b 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -267,6 +267,7 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
      * Check for overlaps with other BARs. Note that only BARs that are
      * currently mapped (enabled) are checked for overlaps.
      */
+    spin_lock(&pdev->domain->pdevs_lock);
     for_each_pdev ( pdev->domain, tmp )
     {
         if ( tmp == pdev )
@@ -306,11 +307,13 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                        start, end, rc);
                 rangeset_destroy(mem);
+                spin_unlock( &pdev->domain->pdevs_lock);
                 return rc;
             }
         }
     }
 
+    spin_unlock( &pdev->domain->pdevs_lock);
     ASSERT(dev);
 
     if ( system_state < SYS_STATE_active )
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 8f2b59e61a..8969c335b0 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -265,7 +265,7 @@  REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
 
 void vpci_dump_msi(void)
 {
-    const struct domain *d;
+    struct domain *d;
 
     rcu_read_lock(&domlist_read_lock);
     for_each_domain ( d )
@@ -277,6 +277,9 @@  void vpci_dump_msi(void)
 
         printk("vPCI MSI/MSI-X d%d\n", d->domain_id);
 
+        if ( !spin_trylock(&d->pdevs_lock) )
+            continue;
+
         for_each_pdev ( d, pdev )
         {
             const struct vpci_msi *msi;
@@ -326,6 +329,8 @@  void vpci_dump_msi(void)
             spin_unlock(&pdev->vpci->lock);
             process_pending_softirqs();
         }
+        spin_unlock(&d->pdevs_lock);
+
     }
     rcu_read_unlock(&domlist_read_lock);
 }
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 3467c0de86..7d1f9fd438 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -312,7 +312,7 @@  static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
 
 uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
 {
-    const struct domain *d = current->domain;
+    struct domain *d = current->domain;
     const struct pci_dev *pdev;
     const struct vpci_register *r;
     unsigned int data_offset = 0;
@@ -415,7 +415,7 @@  static void vpci_write_helper(const struct pci_dev *pdev,
 void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
                 uint32_t data)
 {
-    const struct domain *d = current->domain;
+    struct domain *d = current->domain;
     const struct pci_dev *pdev;
     const struct vpci_register *r;
     unsigned int data_offset = 0;
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 5975ca2f30..19047b4b20 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -177,7 +177,7 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
 int pci_remove_device(u16 seg, u8 bus, u8 devfn);
 int pci_ro_device(int seg, int bus, int devfn);
 int pci_hide_device(unsigned int seg, unsigned int bus, unsigned int devfn);
-struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf);
+struct pci_dev *pci_get_pdev(struct domain *d, pci_sbdf_t sbdf);
 struct pci_dev *pci_get_real_pdev(pci_sbdf_t sbdf);
 void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1cf629e7ec..0775228ba9 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -457,6 +457,7 @@  struct domain
 
 #ifdef CONFIG_HAS_PCI
     struct list_head pdev_list;
+    spinlock_t pdevs_lock;
 #endif
 
 #ifdef CONFIG_HAS_PASSTHROUGH