
[RFC] pci: introduce per-domain PCI rwlock

Message ID 20230711004537.888185-1-volodymyr_babchuk@epam.com (mailing list archive)
State New, archived
Series [RFC] pci: introduce per-domain PCI rwlock

Commit Message

Volodymyr Babchuk July 11, 2023, 12:46 a.m. UTC
Add a per-domain d->pci_lock that protects access to
d->pdev_list. The purpose of this lock is to guarantee to the VPCI code
that the underlying pdev will not disappear from under its feet. Later it
will also protect the pdev->vpci structure and pdev accesses in
modify_bars().

Suggested-by: Roger Pau Monné <roger.pau@citrix.com>
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>

---

This patch should be part of the VPCI series, but I am posting it as a
single-patch RFC to discuss changes to the x86 MM and IOMMU code.

I opted to factor out part of the changes from "vpci: introduce
per-domain lock to protect vpci structure" commit to ease the review process.
---
 xen/arch/x86/hvm/hvm.c                      |  2 +
 xen/arch/x86/hvm/vmx/vmcs.c                 |  2 +
 xen/arch/x86/mm.c                           |  6 ++
 xen/arch/x86/mm/p2m-pod.c                   |  7 +++
 xen/arch/x86/mm/paging.c                    |  6 ++
 xen/common/domain.c                         |  1 +
 xen/drivers/passthrough/amd/iommu_cmd.c     |  4 +-
 xen/drivers/passthrough/amd/pci_amd_iommu.c | 15 ++++-
 xen/drivers/passthrough/pci.c               | 70 +++++++++++++++++----
 xen/drivers/passthrough/vtd/iommu.c         | 19 +++++-
 xen/include/xen/sched.h                     |  1 +
 11 files changed, 117 insertions(+), 16 deletions(-)

Comments

Jan Beulich July 11, 2023, 10:24 a.m. UTC | #1
On 11.07.2023 02:46, Volodymyr Babchuk wrote:
> Add a per-domain d->pci_lock that protects access to
> d->pdev_list. The purpose of this lock is to guarantee to the VPCI code
> that the underlying pdev will not disappear from under its feet. Later it
> will also protect the pdev->vpci structure and pdev accesses in
> modify_bars().
> 
> Suggested-by: Roger Pau Monné <roger.pau@citrix.com>
> Suggested-by: Jan Beulich <jbeulich@suse.com>
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
> 
> ---
> 
> This patch should be part of the VPCI series, but I am posting it as a
> single-patch RFC to discuss changes to the x86 MM and IOMMU code.

To aid review / judgement extending the commit message would help, to
outline around which function invocations (and for what reason) the
lock now needs to be held. Furthermore lock nesting rules want writing
down (perhaps next to the declaration of the lock). Therefore comments
below are merely preliminary and likely incomplete.
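
As a purely illustrative sketch (the actual nesting rules are yours to
establish and document), such a comment next to the declaration might
look like:

    #ifdef CONFIG_HAS_PCI
        struct list_head pdev_list;
        /*
         * Protects pdev_list (and, later, pdev->vpci).
         * Nesting (assumed): pcidevs_lock() -> d->pci_lock; never hold
         * two domains' pci_lock at the same time.
         */
        rwlock_t pci_lock;
    #endif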

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2381,12 +2381,14 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
>          }
>      }
>  
> +    read_lock(&d->pci_lock);
>      if ( ((value ^ old_value) & X86_CR0_CD) &&
>           is_iommu_enabled(d) && hvm_funcs.handle_cd &&
>           (!rangeset_is_empty(d->iomem_caps) ||
>            !rangeset_is_empty(d->arch.ioport_caps) ||
>            has_arch_pdevs(d)) )
>          alternative_vcall(hvm_funcs.handle_cd, v, value);
> +    read_unlock(&d->pci_lock);

handle_cd() is non-trivial - did you audit it for safety of
holding a lock around it?

> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -858,12 +858,15 @@ get_page_from_l1e(
>          return 0;
>      }
>  
> +    read_lock(&l1e_owner->pci_lock);
>      if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
>      {
>          gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
>                   l1f & l1_disallow_mask(l1e_owner));
> +        read_unlock(&l1e_owner->pci_lock);

In cases like this one I think you want to avoid holding the lock
across the printk(). This can easily be arranged for by latching
l1_disallow_mask()'s return value into a new local variable.
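
E.g. (sketch only; the local variable name is illustrative):

    unsigned int disallow;

    read_lock(&l1e_owner->pci_lock);
    disallow = l1_disallow_mask(l1e_owner);
    read_unlock(&l1e_owner->pci_lock);

    if ( unlikely(l1f & disallow) )
    {
        gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n", l1f & disallow);
        return -EINVAL;
    }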

> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -349,10 +349,12 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target)
>  
>      ASSERT( pod_target >= p2m->pod.count );
>  
> +    read_lock(&d->pci_lock);
>      if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>          ret = -ENOTEMPTY;
>      else
>          ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
> +    read_unlock(&d->pci_lock);

Hmm, is it necessary to hold the lock across the function call?

> --- a/xen/arch/x86/mm/paging.c
> +++ b/xen/arch/x86/mm/paging.c
> @@ -205,21 +205,27 @@ static int paging_log_dirty_enable(struct domain *d)
>  {
>      int ret;
>  
> +    read_lock(&d->pci_lock);
>      if ( has_arch_pdevs(d) )
>      {
>          /*
>           * Refuse to turn on global log-dirty mode
>           * if the domain is sharing the P2M with the IOMMU.
>           */
> +        read_unlock(&d->pci_lock);
>          return -EINVAL;
>      }
>  
>      if ( paging_mode_log_dirty(d) )
> +    {
> +        read_unlock(&d->pci_lock);
>          return -EINVAL;
> +    }
>  
>      domain_pause(d);
>      ret = d->arch.paging.log_dirty.ops->enable(d);
>      domain_unpause(d);
> +    read_unlock(&d->pci_lock);

This means a relatively long potential lock holding time. I wonder
whether lock release shouldn't be delegated to the ->enable() hook,
as it could do so immediately after setting the flag that would
then prevent assignment of devices.
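
Roughly (sketch only; assuming the hook sets the flag the way e.g.
hap_enable_log_dirty() does):

    /* In the ->enable() hook, once the flag is set: */
    paging_lock(d);
    d->arch.paging.mode |= PG_log_dirty;
    paging_unlock(d);
    /*
     * Device assignment is now refused because of the flag, so the
     * caller's d->pci_lock can be dropped here instead of being held
     * around the whole (long-running) operation.
     */
    read_unlock(&d->pci_lock);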

> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -102,6 +102,8 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>  {
>      const struct pci_dev *pdev;
>  
> +    ASSERT(rw_is_locked(&d->pci_lock));
> +
>      for_each_pdev ( d, pdev )
>      {
>          if ( pdev == exclude )
> @@ -467,17 +469,24 @@ static int cf_check reassign_device(
>  
>      if ( !QUARANTINE_SKIP(target, pdev) )
>      {
> +	read_lock(&target->pci_lock);
>          rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
>          if ( rc )
>              return rc;
> +	read_unlock(&target->pci_lock);

You need to drop the lock before the if().

Also nit: No hard tabs here please.

>      }
>      else
>          amd_iommu_disable_domain_device(source, iommu, devfn, pdev);

Related to my initial comment at the top: It wants clarifying for example
why "setup" needs to lock held, but "disable" doesn't.

>      if ( devfn == pdev->devfn && pdev->domain != target )
>      {
> -        list_move(&pdev->domain_list, &target->pdev_list);
> -        pdev->domain = target;
> +        write_lock(&pdev->domain->pci_lock);

Shorter as write_lock(&source->pci_lock)? (Also in the VT-d counterpart
then.)
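
I.e. (sketch):

    write_lock(&source->pci_lock);
    list_del(&pdev->domain_list);
    write_unlock(&source->pci_lock);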

> @@ -748,7 +750,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>      if ( !pdev->domain )
>      {
>          pdev->domain = hardware_domain;
> +        write_lock(&hardware_domain->pci_lock);
>          list_add(&pdev->domain_list, &hardware_domain->pdev_list);
> +        write_unlock(&hardware_domain->pci_lock);

What about the companion pci_remove_device()?
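
Presumably the list_del() there wants the same treatment, along the
lines of (sketch; the surrounding pci_remove_device() code is not shown
in this patch):

    write_lock(&pdev->domain->pci_lock);
    list_del(&pdev->domain_list);
    write_unlock(&pdev->domain->pci_lock);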

> @@ -887,26 +891,62 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus,
>  
>  int pci_release_devices(struct domain *d)
>  {
> -    struct pci_dev *pdev, *tmp;
> -    u8 bus, devfn;
> -    int ret;
> +    int combined_ret;
> +    LIST_HEAD(failed_pdevs);
>  
>      pcidevs_lock();
> -    ret = arch_pci_clean_pirqs(d);
> -    if ( ret )
> +    write_lock(&d->pci_lock);
> +    combined_ret = arch_pci_clean_pirqs(d);
> +    if ( combined_ret )
>      {
>          pcidevs_unlock();
> -        return ret;
> +        write_unlock(&d->pci_lock);
> +        return combined_ret;
>      }
> -    list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
> +
> +    while ( !list_empty(&d->pdev_list) )
>      {
> -        bus = pdev->bus;
> -        devfn = pdev->devfn;
> -        ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;
> +        struct pci_dev *pdev = list_first_entry(&d->pdev_list,
> +                                                struct pci_dev,
> +                                                domain_list);
> +        uint16_t seg = pdev->seg;
> +        uint8_t bus = pdev->bus;
> +        uint8_t devfn = pdev->devfn;
> +        int ret;
> +
> +        write_unlock(&d->pci_lock);
> +        ret = deassign_device(d, seg, bus, devfn);
> +        write_lock(&d->pci_lock);
> +        if ( ret )
> +        {
> +            bool still_present = false;
> +            const struct pci_dev *tmp;
> +
> +            /*
> +             * We need to check if deassign_device() left our pdev in
> +             * domain's list. As we dropped the lock, we can't be sure
> +             * that list wasn't permuted in some random way, so we
> +             * need to traverse the whole list.
> +             */
> +            for_each_pdev ( d, tmp )
> +            {
> +                if ( tmp == pdev )
> +                {
> +                    still_present = true;
> +                    break;
> +                }
> +            }
> +            if ( still_present )
> +                list_move(&pdev->domain_list, &failed_pdevs);
> +            combined_ret = ret;

Elsewhere we aim at returning the first error that was encountered, not
the last one.
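
I.e. (sketch):

    if ( !combined_ret )
        combined_ret = ret;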

> @@ -2765,6 +2767,7 @@ static int cf_check reassign_device_ownership(
>  
>      if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) )
>      {
> +        read_lock(&target->pci_lock);
>          if ( !has_arch_pdevs(target) )
>              vmx_pi_hooks_assign(target);

I'm afraid this and the unhook side locking isn't sufficient to guarantee
no races. Things still depend on the domctl and/or pcidevs lock being
held around this. At which point acquiring the lock here (and below) is
of questionable value. In any event I think this warrants code comments.

Possibly the same also applies to check_cleanup_domid_map() and friends.

> @@ -2780,21 +2783,26 @@ static int cf_check reassign_device_ownership(
>  #endif
>  
>          ret = domain_context_mapping(target, devfn, pdev);
> +        read_unlock(&target->pci_lock);

Other calls to domain_context_mapping() aren't wrapped like this. Same
for domain_context_unmap(), wrapped exactly once below.

>          if ( !ret && pdev->devfn == devfn &&
>               !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) )
>          {
>              const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
>  
> +            read_lock(&source->pci_lock);
>              if ( drhd )
>                  check_cleanup_domid_map(source, pdev, drhd->iommu);
> +            read_unlock(&source->pci_lock);

Acquiring the lock inside the if() ought to suffice here.
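
I.e. (sketch):

    if ( drhd )
    {
        read_lock(&source->pci_lock);
        check_cleanup_domid_map(source, pdev, drhd->iommu);
        read_unlock(&source->pci_lock);
    }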

Jan
Volodymyr Babchuk July 11, 2023, 6:40 p.m. UTC | #2
Hi Jan,

Jan Beulich <jbeulich@suse.com> writes:

> On 11.07.2023 02:46, Volodymyr Babchuk wrote:
>> Add a per-domain d->pci_lock that protects access to
>> d->pdev_list. The purpose of this lock is to guarantee to the VPCI code
>> that the underlying pdev will not disappear from under its feet. Later it
>> will also protect the pdev->vpci structure and pdev accesses in
>> modify_bars().
>> 
>> Suggested-by: Roger Pau Monné <roger.pau@citrix.com>
>> Suggested-by: Jan Beulich <jbeulich@suse.com>
>> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
>> 
>> ---
>> 
>> This patch should be part of the VPCI series, but I am posting it as a
>> single-patch RFC to discuss changes to the x86 MM and IOMMU code.
>
> To aid review / judgement extending the commit message would help, to
> outline around which function invocations (and for what reason) the
> lock now needs to be held. Furthermore lock nesting rules want writing
> down (perhaps next to the declaration of the lock). Therefore comments
> below are merely preliminary and likely incomplete.

I added the lock in places where the underlying code touches
d->pdev_list. My intention was to lock the parts of the code that might
depend on the list contents. This is straightforward where we are
traversing the list, but it is much more complicated (for me at least)
where the has_arch_pdevs() macro is involved. Prior to my patch, uses of
has_arch_pdevs() weren't protected by any PCI lock at all. This begs the
question: do we need to protect them now? And if so, which portion of
the code needs to be protected? I did my best to isolate the affected
parts of the code.
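
For reference, has_arch_pdevs() is, as far as I can tell, just an
emptiness check of the same list, which is why those call sites are in
question at all:

    #define has_arch_pdevs(d) (!list_empty(&(d)->pdev_list))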

>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -2381,12 +2381,14 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
>>          }
>>      }
>>  
>> +    read_lock(&d->pci_lock);
>>      if ( ((value ^ old_value) & X86_CR0_CD) &&
>>           is_iommu_enabled(d) && hvm_funcs.handle_cd &&
>>           (!rangeset_is_empty(d->iomem_caps) ||
>>            !rangeset_is_empty(d->arch.ioport_caps) ||
>>            has_arch_pdevs(d)) )
>>          alternative_vcall(hvm_funcs.handle_cd, v, value);
>> +    read_unlock(&d->pci_lock);
>
> handle_cd() is non-trivial - did you audit it for safety of
> holding a lock around it?

Well, only vmx_handle_cd() implements this call. I scanned through it
and didn't find any other PCI-related things inside. It acquires
v->arch.hvm.vmx.vmcs_lock, but I didn't find any potential for deadlocks.

On the other hand - do we really need to call it under d->pci_lock? What
bad can happen if has_arch_pdevs(d) becomes false during handle_cd()
execution?

>
>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -858,12 +858,15 @@ get_page_from_l1e(
>>          return 0;
>>      }
>>  
>> +    read_lock(&l1e_owner->pci_lock);
>>      if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
>>      {
>>          gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
>>                   l1f & l1_disallow_mask(l1e_owner));
>> +        read_unlock(&l1e_owner->pci_lock);
>
> In cases like this one I think you want to avoid holding the lock
> across the printk(). This can easily be arranged for by latching
> l1_disallow_mask()'s return value into a new local variable.

Sure, will rework.

>> --- a/xen/arch/x86/mm/p2m-pod.c
>> +++ b/xen/arch/x86/mm/p2m-pod.c
>> @@ -349,10 +349,12 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target)
>>  
>>      ASSERT( pod_target >= p2m->pod.count );
>>  
>> +    read_lock(&d->pci_lock);
>>      if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>>          ret = -ENOTEMPTY;
>>      else
>>          ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>> +    read_unlock(&d->pci_lock);
>
> Hmm, is it necessary to hold the lock across the function call?

Well, I am not sure. Would it be okay to just check has_arch_pdevs()
while holding the lock? What if its result changes the next instant?


>> --- a/xen/arch/x86/mm/paging.c
>> +++ b/xen/arch/x86/mm/paging.c
>> @@ -205,21 +205,27 @@ static int paging_log_dirty_enable(struct domain *d)
>>  {
>>      int ret;
>>  
>> +    read_lock(&d->pci_lock);
>>      if ( has_arch_pdevs(d) )
>>      {
>>          /*
>>           * Refuse to turn on global log-dirty mode
>>           * if the domain is sharing the P2M with the IOMMU.
>>           */
>> +        read_unlock(&d->pci_lock);
>>          return -EINVAL;
>>      }
>>  
>>      if ( paging_mode_log_dirty(d) )
>> +    {
>> +        read_unlock(&d->pci_lock);
>>          return -EINVAL;
>> +    }
>>  
>>      domain_pause(d);
>>      ret = d->arch.paging.log_dirty.ops->enable(d);
>>      domain_unpause(d);
>> +    read_unlock(&d->pci_lock);
>
> This means a relatively long potential lock holding time. I wonder
> whether lock release shouldn't be delegated to the ->enable() hook,
> as it could do so immediately after setting the flag that would
> then prevent assignment of devices.

To me this looks a bit fragile: we need to rely on a hook to release a
lock that wasn't acquired by said hook. But I can do this. It should be
released after setting PG_log_dirty, correct?

BTW, I can see that hap_enable_log_dirty() uses
read_atomic(&p2m->ioreq.entry_count), but p2m_entry_modify() does just
p2m->ioreq.entry_count++ and p2m->ioreq.entry_count--;
This looks inconsistent. Also, looks like hap_enable_log_dirty() does
not hold &p2m->ioreq.lock while accessing entry_count, so its value can
change right after read_atomic().


>
>> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
>> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
>> @@ -102,6 +102,8 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>>  {
>>      const struct pci_dev *pdev;
>>  
>> +    ASSERT(rw_is_locked(&d->pci_lock));
>> +
>>      for_each_pdev ( d, pdev )
>>      {
>>          if ( pdev == exclude )
>> @@ -467,17 +469,24 @@ static int cf_check reassign_device(
>>  
>>      if ( !QUARANTINE_SKIP(target, pdev) )
>>      {
>> +	read_lock(&target->pci_lock);
>>          rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
>>          if ( rc )
>>              return rc;
>> +	read_unlock(&target->pci_lock);
>
> You need to drop the lock before the if().

Yes, thanks.

>
> Also nit: No hard tabs here please.
>
>>      }
>>      else
>>          amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
>
> Related to my initial comment at the top: It wants clarifying for example
> why "setup" needs to lock held, but "disable" doesn't.
>

Because amd_iommu_disable_domain_device() does not access d->pdev_list,
while amd_iommu_setup_domain_device() does.

Anyway, I am interested in the AMD IOMMU maintainer's opinion here - what
is the correct scope for the lock?

>>      if ( devfn == pdev->devfn && pdev->domain != target )
>>      {
>> -        list_move(&pdev->domain_list, &target->pdev_list);
>> -        pdev->domain = target;
>> +        write_lock(&pdev->domain->pci_lock);
>
> Shorter as write_lock(&source->pci_lock)? (Also in the VT-d counterpart
> then.)

Ah yes, sure.

>
>> @@ -748,7 +750,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>      if ( !pdev->domain )
>>      {
>>          pdev->domain = hardware_domain;
>> +        write_lock(&hardware_domain->pci_lock);
>>          list_add(&pdev->domain_list, &hardware_domain->pdev_list);
>> +        write_unlock(&hardware_domain->pci_lock);
>
> What about the companion pci_remove_device()?

Missed this. Thanks.

[...]

>> @@ -2765,6 +2767,7 @@ static int cf_check reassign_device_ownership(
>>  
>>      if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) )
>>      {
>> +        read_lock(&target->pci_lock);
>>          if ( !has_arch_pdevs(target) )
>>              vmx_pi_hooks_assign(target);
>
> I'm afraid this and the unhook side locking isn't sufficient to guarantee
> no races. Things still depend on the domctl and/or pcidevs lock being
> held around this.

I have no intention of dropping the pcidevs lock at this time. Honestly,
I am not sure that we will be able to do this without a major rework of
the IOMMU code.

> At which point acquiring the lock here (and below) is
> of questionable value. In any event I think this warrants code comments.

Well, it would be good to take the lock for the first half of the
function, where we deal with `target`, but we are also accessing `source`
at the same time. To prevent an ABBA deadlock I opted for a number of
finer-grained lock acquisitions.

As for the "questionable value", I agree with you. But if we want to
protect/serialize access to d->pdev_list, we need to use the lock there.

> Possibly the same also applies to check_cleanup_domid_map() and friends.
>
>> @@ -2780,21 +2783,26 @@ static int cf_check reassign_device_ownership(
>>  #endif
>>  
>>          ret = domain_context_mapping(target, devfn, pdev);
>> +        read_unlock(&target->pci_lock);
>
> Other calls to domain_context_mapping() aren't wrapped like this. Same
> for domain_context_unmap(), wrapped exactly once below.
>

Will add.

>>          if ( !ret && pdev->devfn == devfn &&
>>               !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) )
>>          {
>>              const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
>>  
>> +            read_lock(&source->pci_lock);
>>              if ( drhd )
>>                  check_cleanup_domid_map(source, pdev, drhd->iommu);
>> +            read_unlock(&source->pci_lock);
>
> Acquiring the lock inside the if() ought to suffice here.
>
> Jan


Roger, what is your opinion on this? If you remember, you proposed to
extend vpci_lock to protect d->pdev_list as well, to deal with the
potential ABBA issue in modify_bars(). But as you can see, this either
leads to another ABBA in reassign_device_ownership() or (as Jan pointed
out) to the new lock having questionable value in some cases.
Jan Beulich July 12, 2023, 9:15 a.m. UTC | #3
Up front remark: I'm sorry for some possibly unhelpful replies below. I,
for one, am of the opinion that some of the issues you ask about are to
be looked into by whoever wants / needs to rework the locking model.
After all this (likely) is why nobody has dared to make an attempt before
the need became apparent.

On 11.07.2023 20:40, Volodymyr Babchuk wrote:
> Jan Beulich <jbeulich@suse.com> writes:
>> On 11.07.2023 02:46, Volodymyr Babchuk wrote:
>>> Add a per-domain d->pci_lock that protects access to
>>> d->pdev_list. The purpose of this lock is to guarantee to the VPCI code
>>> that the underlying pdev will not disappear from under its feet. Later it
>>> will also protect the pdev->vpci structure and pdev accesses in
>>> modify_bars().
>>>
>>> Suggested-by: Roger Pau Monné <roger.pau@citrix.com>
>>> Suggested-by: Jan Beulich <jbeulich@suse.com>
>>> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
>>>
>>> ---
>>>
>>> This patch should be part of the VPCI series, but I am posting it as a
>>> single-patch RFC to discuss changes to the x86 MM and IOMMU code.
>>
>> To aid review / judgement extending the commit message would help, to
>> outline around which function invocations (and for what reason) the
>> lock now needs to be held. Furthermore lock nesting rules want writing
>> down (perhaps next to the declaration of the lock). Therefore comments
>> below are merely preliminary and likely incomplete.
> 
> I added the lock in places where the underlying code touches
> d->pdev_list. My intention was to lock the parts of the code that might
> depend on the list contents. This is straightforward where we are
> traversing the list, but it is much more complicated (for me at least)
> where the has_arch_pdevs() macro is involved. Prior to my patch, uses of
> has_arch_pdevs() weren't protected by any PCI lock at all. This begs the
> question: do we need to protect them now? And if so, which portion of
> the code needs to be protected? I did my best to isolate the affected
> parts of the code.

Well, yes - these questions need answering. And since you're proposing
these code changes, your present understanding wants writing down, such
that (a) we can use that to make corrections to the (intended) model
and (b) we can match intentions with actual implementation.

>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -2381,12 +2381,14 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
>>>          }
>>>      }
>>>  
>>> +    read_lock(&d->pci_lock);
>>>      if ( ((value ^ old_value) & X86_CR0_CD) &&
>>>           is_iommu_enabled(d) && hvm_funcs.handle_cd &&
>>>           (!rangeset_is_empty(d->iomem_caps) ||
>>>            !rangeset_is_empty(d->arch.ioport_caps) ||
>>>            has_arch_pdevs(d)) )
>>>          alternative_vcall(hvm_funcs.handle_cd, v, value);
>>> +    read_unlock(&d->pci_lock);
>>
>> handle_cd() is non-trivial - did you audit it for safety of
>> holding a lock around it?
> 
> Well, only vmx_handle_cd() implements this call. I scanned through it
> and didn't find any other PCI-related things inside. It acquires
> v->arch.hvm.vmx.vmcs_lock, but I didn't find any potential for deadlocks.

What about overall lock-holding time, which may affect other CPUs and
hence other security contexts?

> On the other hand - do we really need to call it under d->pci_lock? What
> bad can happen if has_arch_pdevs(d) becomes false during handle_cd()
> execution?

Much like with log-dirty enabling, the main question is what existing
races there may be plus whether things are at least not being made worse.
(Ideally of course by introducing better locking, races would go away if
any exist.) IOW here it would certainly be better to drop the lock before
doing expensive work, but then guarantees are needed that
- the state checked can't change until after the operation is complete, or
- the state changing is benign.

>>> --- a/xen/arch/x86/mm/p2m-pod.c
>>> +++ b/xen/arch/x86/mm/p2m-pod.c
>>> @@ -349,10 +349,12 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target)
>>>  
>>>      ASSERT( pod_target >= p2m->pod.count );
>>>  
>>> +    read_lock(&d->pci_lock);
>>>      if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
>>>          ret = -ENOTEMPTY;
>>>      else
>>>          ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>>> +    read_unlock(&d->pci_lock);
>>
>> Hmm, is it necessary to hold the lock across the function call?
> 
> Well, I am not sure. Would it be okay to just check has_arch_pdevs()
> while holding the lock? What if its result changes the next instant?

PoD and pass-through are incompatible with one another (just like
global log-dirty tracking is). Therefore this and the other side
(like also above for handle_cd(), and like for log-dirty below) need
to make sure that a state change either can't occur or (not
applicable here afaict) is benign. As outlined for log-dirty in the
earlier reply, this may involve doing part of the operation under
lock, until it is safe to release the lock (and yes, below for
log-dirty you validly say this is somewhat fragile, but what do you
do).

>>> --- a/xen/arch/x86/mm/paging.c
>>> +++ b/xen/arch/x86/mm/paging.c
>>> @@ -205,21 +205,27 @@ static int paging_log_dirty_enable(struct domain *d)
>>>  {
>>>      int ret;
>>>  
>>> +    read_lock(&d->pci_lock);
>>>      if ( has_arch_pdevs(d) )
>>>      {
>>>          /*
>>>           * Refuse to turn on global log-dirty mode
>>>           * if the domain is sharing the P2M with the IOMMU.
>>>           */
>>> +        read_unlock(&d->pci_lock);
>>>          return -EINVAL;
>>>      }
>>>  
>>>      if ( paging_mode_log_dirty(d) )
>>> +    {
>>> +        read_unlock(&d->pci_lock);
>>>          return -EINVAL;
>>> +    }
>>>  
>>>      domain_pause(d);
>>>      ret = d->arch.paging.log_dirty.ops->enable(d);
>>>      domain_unpause(d);
>>> +    read_unlock(&d->pci_lock);
>>
>> This means a relatively long potential lock holding time. I wonder
>> whether lock release shouldn't be delegated to the ->enable() hook,
>> as it could do so immediately after setting the flag that would
>> then prevent assignment of devices.
> 
> To me this looks a bit fragile: we need to rely on a hook to release a
> lock that wasn't acquired by said hook. But I can do this. It should be
> released after setting PG_log_dirty, correct?

Yes (s/should/could/).

> BTW, I can see that hap_enable_log_dirty() uses
> read_atomic(&p2m->ioreq.entry_count), but p2m_entry_modify() does just
> p2m->ioreq.entry_count++ and p2m->ioreq.entry_count--;
> This looks inconsistent. Also, looks like hap_enable_log_dirty() does
> not hold &p2m->ioreq.lock while accessing entry_count, so its value can
> change right after read_atomic().

I'm afraid it you look closely you'll find many such inconsistencies.

>>> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
>>> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
>>> @@ -102,6 +102,8 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>>>  {
>>>      const struct pci_dev *pdev;
>>>  
>>> +    ASSERT(rw_is_locked(&d->pci_lock));
>>> +
>>>      for_each_pdev ( d, pdev )
>>>      {
>>>          if ( pdev == exclude )
>>> @@ -467,17 +469,24 @@ static int cf_check reassign_device(
>>>  
>>>      if ( !QUARANTINE_SKIP(target, pdev) )
>>>      {
>>> +	read_lock(&target->pci_lock);
>>>          rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
>>>          if ( rc )
>>>              return rc;
>>> +	read_unlock(&target->pci_lock);
>>
>> You need to drop the lock before the if().
> 
> Yes, thanks.
> 
>>
>> Also nit: No hard tabs here please.
>>
>>>      }
>>>      else
>>>          amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
>>
>> Related to my initial comment at the top: It wants clarifying for example
>> why "setup" needs to lock held, but "disable" doesn't.
>>
> 
> Because amd_iommu_disable_domain_device() does not access d->pdev_list,
> while amd_iommu_setup_domain_device() does.

I was guessing that might be the reason, but to be honest while looking
at the function I can't spot that access. I clearly must be overlooking
something, which may be that the access is in a called function. Yet as
soon as this isn't obvious, a code comment can make a significant
difference.

> Anyway, I am interested in the AMD IOMMU maintainer's opinion here - what
> is the correct scope for the lock?

To determine that (and to save readers like me from re-doing the work
you must have done already) is why I gave the earlier comment.

>>> @@ -2765,6 +2767,7 @@ static int cf_check reassign_device_ownership(
>>>  
>>>      if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) )
>>>      {
>>> +        read_lock(&target->pci_lock);
>>>          if ( !has_arch_pdevs(target) )
>>>              vmx_pi_hooks_assign(target);
>>
>> I'm afraid this and the unhook side locking isn't sufficient to guarantee
>> no races. Things still depend on the domctl and/or pcidevs lock being
>> held around this.
> 
> I have no intention of dropping the pcidevs lock at this time. Honestly,
> I am not sure that we will be able to do this without a major rework of
> the IOMMU code.

Of course, and my remark wasn't intended to hint in such a direction.
Instead ...

>> At which point acquiring the lock here (and below) is
>> of questionable value. In any event I think this warrants code comments.
> 
> Well, it would be good to take the lock for the first half of the
> function, where we deal with `target`, but we are also accessing `source`
> at the same time. To prevent an ABBA deadlock I opted for a number of
> finer-grained lock acquisitions.
> 
> As for the "questionable value", I agree with you. But if we want to
> protect/serialize access to d->pdev_list, we need to use the lock there.

... I did ask to assess whether acquiring the lock (without other
locking changed) is useful, and to put down the result of this in a
comment - whether or not the lock acquire is retained here. IOW in
one case the comment may say "lock not acquired here because ..."
whereas in the other case the comment might be "lock acquired here
despite ..."

Jan
Volodymyr Babchuk July 12, 2023, 11:09 a.m. UTC | #4
Hi Jan, Roger,

Jan Beulich <jbeulich@suse.com> writes:

> Up front remark: I'm sorry for some possibly unhelpful replies below. I,
> for one, am of the opinion that some of the issues you ask about are to
> be looked into by whoever wants / needs to rework the locking model.
> After all this (likely) is why nobody has dared to make an attempt before
> the need became apparent.

I have no great desire or need to rework the locking model. I was
perfectly fine with the much narrower vpci_lock. As you remember, it was
Roger who suggested extending this lock to cover the whole PCI device.

I already tried something like this as part of another patch series:
"[RFC,01/10] xen: pci: add per-domain pci list lock" [1], with the same
result: it was considered very hard to do this properly, so I dropped
that effort. I am not familiar enough with the x86-specific code as a
whole and the IOMMU drivers in particular to be 100% sure that I am
making correct changes. Without support from the x86 folks I can't do
proper patches, and it looks like the x86 folks are not interested in
this. So, this is a dead end.

Roger, in [2] I proposed another approach to fix the ABBA in modify_bars():
store a copy of the BARs in the domain structure. Taking into account that
my effort to introduce d->pci_lock has basically failed (again), I propose
to go back to d->vpci_lock + a shadow copy of the BARs in the domain
struct. What do you think? And you, Jan?

[1] https://lore.kernel.org/all/20220831141040.13231-2-volodymyr_babchuk@epam.com/
[2] https://lore.kernel.org/all/87ilbfnqmo.fsf@epam.com/
Jan Beulich July 12, 2023, 11:23 a.m. UTC | #5
On 12.07.2023 13:09, Volodymyr Babchuk wrote:
> Jan Beulich <jbeulich@suse.com> writes:
> 
>> Up front remark: I'm sorry for some possibly unhelpful replies below. I,
>> for one, am of the opinion that some of the issues you ask about are to
>> be looked into by whoever wants / needs to rework the locking model.
>> After all this (likely) is why nobody has dared to make an attempt before
>> the need became apparent.
> 
> I have no great desire or need to rework the locking model. I was
> perfectly fine with the much narrower vpci_lock. As you remember, it was
> Roger who suggested extending this lock to cover the whole PCI device.
> 
> I already tried something like this as part of another patch series:
> "[RFC,01/10] xen: pci: add per-domain pci list lock" [1], with the same
> result: it was considered very hard to do this properly, so I dropped
> that effort. I am not familiar enough with the x86-specific code as a
> whole and the IOMMU drivers in particular to be 100% sure that I am
> making correct changes. Without support from the x86 folks I can't do
> proper patches, and it looks like the x86 folks are not interested in
> this.

That's not the case, no. The problem is time: I don't have the time to
take on this effort myself. I'm willing to help where necessary, within
reasonable bounds. But I can't realistically do large parts of the
analysis that is inevitably needed. (I'm also a little sick of doing
code audits for other, unrelated reasons.) Hence that earlier "up front"
remark.

> So, this is a dead end.
> 
> Roger, in [2] I proposed another approach to fix the ABBA in modify_bars():
> store a copy of the BARs in the domain structure. Taking into account that
> my effort to introduce d->pci_lock has basically failed (again), I propose
> to go back to d->vpci_lock + a shadow copy of the BARs in the domain
> struct. What do you think? And you, Jan?

I support Roger's earlier request, and I think that doing what you
propose would move us further away from where we want to arrive at some
point.

I'm sorry that this is all pretty unpleasant.

Jan

> [1] https://lore.kernel.org/all/20220831141040.13231-2-volodymyr_babchuk@epam.com/
> [2] https://lore.kernel.org/all/87ilbfnqmo.fsf@epam.com/
>

Patch

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index a67ef79dc0..089fbe38a7 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2381,12 +2381,14 @@  int hvm_set_cr0(unsigned long value, bool may_defer)
         }
     }
 
+    read_lock(&d->pci_lock);
     if ( ((value ^ old_value) & X86_CR0_CD) &&
          is_iommu_enabled(d) && hvm_funcs.handle_cd &&
          (!rangeset_is_empty(d->iomem_caps) ||
           !rangeset_is_empty(d->arch.ioport_caps) ||
           has_arch_pdevs(d)) )
         alternative_vcall(hvm_funcs.handle_cd, v, value);
+    read_unlock(&d->pci_lock);
 
     hvm_update_cr(v, 0, value);
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index b209563625..88bbcbbd99 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1889,6 +1889,7 @@  void cf_check vmx_do_resume(void)
          *  2: execute wbinvd on all dirty pCPUs when guest wbinvd exits.
          * If VT-d engine can force snooping, we don't need to do these.
          */
+        read_lock(&v->domain->pci_lock);
         if ( has_arch_pdevs(v->domain) && !iommu_snoop
                 && !cpu_has_wbinvd_exiting )
         {
@@ -1896,6 +1897,7 @@  void cf_check vmx_do_resume(void)
             if ( cpu != -1 )
                 flush_mask(cpumask_of(cpu), FLUSH_CACHE);
         }
+        read_unlock(&v->domain->pci_lock);
 
         vmx_clear_vmcs(v);
         vmx_load_vmcs(v);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index be2b10a391..f1e882a980 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -858,12 +858,15 @@  get_page_from_l1e(
         return 0;
     }
 
+    read_lock(&l1e_owner->pci_lock);
     if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
     {
         gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
                  l1f & l1_disallow_mask(l1e_owner));
+        read_unlock(&l1e_owner->pci_lock);
         return -EINVAL;
     }
+    read_unlock(&l1e_owner->pci_lock);
 
     valid = mfn_valid(_mfn(mfn));
 
@@ -2142,12 +2145,15 @@  static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
     {
         struct page_info *page = NULL;
 
+        read_lock(&pt_dom->pci_lock);
         if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
         {
             gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
                     l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
+            read_unlock(&pt_dom->pci_lock);
             return -EINVAL;
         }
+        read_unlock(&pt_dom->pci_lock);
 
         /* Translate foreign guest address. */
         if ( cmd != MMU_PT_UPDATE_NO_TRANSLATE &&
diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
index 9969eb45fa..07e0bedad7 100644
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -349,10 +349,12 @@  p2m_pod_set_mem_target(struct domain *d, unsigned long target)
 
     ASSERT( pod_target >= p2m->pod.count );
 
+    read_lock(&d->pci_lock);
     if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
         ret = -ENOTEMPTY;
     else
         ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+    read_unlock(&d->pci_lock);
 
 out:
     pod_unlock(p2m);
@@ -1401,8 +1403,13 @@  guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
     if ( !paging_mode_translate(d) )
         return -EINVAL;
 
+    read_lock(&d->pci_lock);
     if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+    {
+        read_unlock(&d->pci_lock);
         return -ENOTEMPTY;
+    }
+    read_unlock(&d->pci_lock);
 
     do {
         rc = mark_populate_on_demand(d, gfn, chunk_order);
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 34d833251b..fb8f7ff7cf 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -205,21 +205,27 @@  static int paging_log_dirty_enable(struct domain *d)
 {
     int ret;
 
+    read_lock(&d->pci_lock);
     if ( has_arch_pdevs(d) )
     {
         /*
          * Refuse to turn on global log-dirty mode
          * if the domain is sharing the P2M with the IOMMU.
          */
+        read_unlock(&d->pci_lock);
         return -EINVAL;
     }
 
     if ( paging_mode_log_dirty(d) )
+    {
+        read_unlock(&d->pci_lock);
         return -EINVAL;
+    }
 
     domain_pause(d);
     ret = d->arch.paging.log_dirty.ops->enable(d);
     domain_unpause(d);
+    read_unlock(&d->pci_lock);
 
     return ret;
 }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index caaa402637..5d8a8836da 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -645,6 +645,7 @@  struct domain *domain_create(domid_t domid,
 
 #ifdef CONFIG_HAS_PCI
     INIT_LIST_HEAD(&d->pdev_list);
+    rwlock_init(&d->pci_lock);
 #endif
 
     /* All error paths can depend on the above setup. */
diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c
index 40ddf366bb..b67aee31f6 100644
--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -308,11 +308,12 @@  void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev,
     flush_command_buffer(iommu, iommu_dev_iotlb_timeout);
 }
 
-static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
+static void amd_iommu_flush_all_iotlbs(struct domain *d, daddr_t daddr,
                                        unsigned int order)
 {
     struct pci_dev *pdev;
 
+    read_lock(&d->pci_lock);
     for_each_pdev( d, pdev )
     {
         u8 devfn = pdev->devfn;
@@ -323,6 +324,7 @@  static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
         } while ( devfn != pdev->devfn &&
                   PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
     }
+    read_unlock(&d->pci_lock);
 }
 
 /* Flush iommu cache after p2m changes. */
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 94e3775506..8541b66a93 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -102,6 +102,8 @@  static bool any_pdev_behind_iommu(const struct domain *d,
 {
     const struct pci_dev *pdev;
 
+    ASSERT(rw_is_locked(&d->pci_lock));
+
     for_each_pdev ( d, pdev )
     {
         if ( pdev == exclude )
@@ -467,17 +469,24 @@  static int cf_check reassign_device(
 
     if ( !QUARANTINE_SKIP(target, pdev) )
     {
+	read_lock(&target->pci_lock);
         rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
         if ( rc )
             return rc;
+	read_unlock(&target->pci_lock);
     }
     else
         amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
 
     if ( devfn == pdev->devfn && pdev->domain != target )
     {
-        list_move(&pdev->domain_list, &target->pdev_list);
-        pdev->domain = target;
+        write_lock(&pdev->domain->pci_lock);
+        list_del(&pdev->domain_list);
+        write_unlock(&pdev->domain->pci_lock);
+
+        write_lock(&target->pci_lock);
+        list_add(&pdev->domain_list, &target->pdev_list);
+        write_unlock(&target->pci_lock);
     }
 
     /*
@@ -628,12 +637,14 @@  static int cf_check amd_iommu_add_device(u8 devfn, struct pci_dev *pdev)
         fresh_domid = true;
     }
 
+    read_lock(&pdev->domain->pci_lock);
     ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
     if ( ret && fresh_domid )
     {
         iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
         pdev->arch.pseudo_domid = DOMID_INVALID;
     }
+    read_unlock(&pdev->domain->pci_lock);
 
     return ret;
 }
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 07d1986d33..1831e1b0c0 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -454,7 +454,9 @@  static void __init _pci_hide_device(struct pci_dev *pdev)
     if ( pdev->domain )
         return;
     pdev->domain = dom_xen;
+    write_lock(&dom_xen->pci_lock);
     list_add(&pdev->domain_list, &dom_xen->pdev_list);
+    write_unlock(&dom_xen->pci_lock);
 }
 
 int __init pci_hide_device(unsigned int seg, unsigned int bus,
@@ -530,7 +532,7 @@  struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf)
 {
     struct pci_dev *pdev;
 
-    ASSERT(d || pcidevs_locked());
+    ASSERT((d && rw_is_locked(&d->pci_lock)) || pcidevs_locked());
 
     /*
      * The hardware domain owns the majority of the devices in the system.
@@ -748,7 +750,9 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
     if ( !pdev->domain )
     {
         pdev->domain = hardware_domain;
+        write_lock(&hardware_domain->pci_lock);
         list_add(&pdev->domain_list, &hardware_domain->pdev_list);
+        write_unlock(&hardware_domain->pci_lock);
 
         /*
          * For devices not discovered by Xen during boot, add vPCI handlers
@@ -887,26 +891,62 @@  static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus,
 
 int pci_release_devices(struct domain *d)
 {
-    struct pci_dev *pdev, *tmp;
-    u8 bus, devfn;
-    int ret;
+    int combined_ret;
+    LIST_HEAD(failed_pdevs);
 
     pcidevs_lock();
-    ret = arch_pci_clean_pirqs(d);
-    if ( ret )
+    write_lock(&d->pci_lock);
+    combined_ret = arch_pci_clean_pirqs(d);
+    if ( combined_ret )
     {
         pcidevs_unlock();
-        return ret;
+        write_unlock(&d->pci_lock);
+        return combined_ret;
     }
-    list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
+
+    while ( !list_empty(&d->pdev_list) )
     {
-        bus = pdev->bus;
-        devfn = pdev->devfn;
-        ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;
+        struct pci_dev *pdev = list_first_entry(&d->pdev_list,
+                                                struct pci_dev,
+                                                domain_list);
+        uint16_t seg = pdev->seg;
+        uint8_t bus = pdev->bus;
+        uint8_t devfn = pdev->devfn;
+        int ret;
+
+        write_unlock(&d->pci_lock);
+        ret = deassign_device(d, seg, bus, devfn);
+        write_lock(&d->pci_lock);
+        if ( ret )
+        {
+            bool still_present = false;
+            const struct pci_dev *tmp;
+
+            /*
+             * We need to check if deassign_device() left our pdev in
+             * domain's list. As we dropped the lock, we can't be sure
+             * that list wasn't permutated in some random way, so we
+             * that list wasn't permuted in some random way, so we
+             */
+            for_each_pdev ( d, tmp )
+            {
+                if ( tmp == pdev )
+                {
+                    still_present = true;
+                    break;
+                }
+            }
+            if ( still_present )
+                list_move(&pdev->domain_list, &failed_pdevs);
+            combined_ret = ret;
+        }
     }
+
+    list_splice(&failed_pdevs, &d->pdev_list);
+    write_unlock(&d->pci_lock);
     pcidevs_unlock();
 
-    return ret;
+    return combined_ret;
 }
 
 #define PCI_CLASS_BRIDGE_HOST    0x0600
@@ -1125,7 +1165,9 @@  static int __hwdom_init cf_check _setup_hwdom_pci_devices(
             if ( !pdev->domain )
             {
                 pdev->domain = ctxt->d;
+                write_lock(&ctxt->d->pci_lock);
                 list_add(&pdev->domain_list, &ctxt->d->pdev_list);
+                write_unlock(&ctxt->d->pci_lock);
                 setup_one_hwdom_device(ctxt, pdev);
             }
             else if ( pdev->domain == dom_xen )
@@ -1487,6 +1529,7 @@  static int iommu_get_device_group(
         return group_id;
 
     pcidevs_lock();
+    read_lock(&d->pci_lock);
     for_each_pdev( d, pdev )
     {
         unsigned int b = pdev->bus;
@@ -1501,6 +1544,7 @@  static int iommu_get_device_group(
         sdev_id = iommu_call(ops, get_device_group_id, seg, b, df);
         if ( sdev_id < 0 )
         {
+            read_unlock(&d->pci_lock);
             pcidevs_unlock();
             return sdev_id;
         }
@@ -1511,6 +1555,7 @@  static int iommu_get_device_group(
 
             if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
             {
+                read_unlock(&d->pci_lock);
                 pcidevs_unlock();
                 return -EFAULT;
             }
@@ -1518,6 +1563,7 @@  static int iommu_get_device_group(
         }
     }
 
+    read_unlock(&d->pci_lock);
     pcidevs_unlock();
 
     return i;
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 0e3062c820..6a36cc18fe 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -186,6 +186,8 @@  static bool any_pdev_behind_iommu(const struct domain *d,
 {
     const struct pci_dev *pdev;
 
+    ASSERT(rw_is_locked(&d->pci_lock));
+
     for_each_pdev ( d, pdev )
     {
         const struct acpi_drhd_unit *drhd;
@@ -2765,6 +2767,7 @@  static int cf_check reassign_device_ownership(
 
     if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) )
     {
+        read_lock(&target->pci_lock);
         if ( !has_arch_pdevs(target) )
             vmx_pi_hooks_assign(target);
 
@@ -2780,21 +2783,26 @@  static int cf_check reassign_device_ownership(
 #endif
 
         ret = domain_context_mapping(target, devfn, pdev);
+        read_unlock(&target->pci_lock);
 
         if ( !ret && pdev->devfn == devfn &&
              !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) )
         {
             const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
 
+            read_lock(&source->pci_lock);
             if ( drhd )
                 check_cleanup_domid_map(source, pdev, drhd->iommu);
+            read_unlock(&source->pci_lock);
         }
     }
     else
     {
         const struct acpi_drhd_unit *drhd;
 
+        read_lock(&source->pci_lock);
         drhd = domain_context_unmap(source, devfn, pdev);
+        read_unlock(&source->pci_lock);
         ret = IS_ERR(drhd) ? PTR_ERR(drhd) : 0;
     }
     if ( ret )
@@ -2806,12 +2814,21 @@  static int cf_check reassign_device_ownership(
 
     if ( devfn == pdev->devfn && pdev->domain != target )
     {
-        list_move(&pdev->domain_list, &target->pdev_list);
+        write_lock(&pdev->domain->pci_lock);
+        list_del(&pdev->domain_list);
+        write_unlock(&pdev->domain->pci_lock);
+
+        write_lock(&target->pci_lock);
+        list_add(&pdev->domain_list, &target->pdev_list);
+        write_unlock(&target->pci_lock);
+
         pdev->domain = target;
     }
 
+    read_lock(&source->pci_lock);
     if ( !has_arch_pdevs(source) )
         vmx_pi_hooks_deassign(source);
+    read_unlock(&source->pci_lock);
 
     /*
      * If the device belongs to the hardware domain, and it has RMRR, don't
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 85242a73d3..80dd150bbf 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -460,6 +460,7 @@  struct domain
 
 #ifdef CONFIG_HAS_PCI
     struct list_head pdev_list;
+    rwlock_t pci_lock;
 #endif
 
 #ifdef CONFIG_HAS_PASSTHROUGH