diff mbox series

[v6,2/3] xen/pci: introduce PF<->VF links

Message ID 20241018203913.1162962-3-stewart.hildebrand@amd.com (mailing list archive)
State Superseded
Headers show
Series xen: SR-IOV fixes | expand

Commit Message

Stewart Hildebrand Oct. 18, 2024, 8:39 p.m. UTC
Add links between a VF's struct pci_dev and its associated PF struct
pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
dropping and re-acquiring the pcidevs_lock().

During PF removal, unlink VF from PF and mark the VF broken. As before,
VFs may exist without a corresponding PF, although now only with
pdev->broken = true.

The hardware domain is expected to remove the associated VFs before
removing the PF. Print a warning in case a PF is removed with associated
VFs still present.

Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
Candidate for backport to 4.19 (the next patch depends on this one)

v5->v6:
* move printk() before ASSERT_UNREACHABLE()
* warn about PF removal with VFs still present
* clarify commit message

v4->v5:
* new patch, split from ("x86/msi: fix locking for SR-IOV devices")
* move INIT_LIST_HEAD(&pdev->vf_list); earlier
* collapse struct list_head instances
* retain error code from pci_add_device()
* unlink (and mark broken) VFs instead of removing them
* const-ify VF->PF link
---
 xen/drivers/passthrough/pci.c | 76 ++++++++++++++++++++++++++++-------
 xen/include/xen/pci.h         | 10 +++++
 2 files changed, 72 insertions(+), 14 deletions(-)

Comments

Jan Beulich Oct. 28, 2024, 5:02 p.m. UTC | #1
On 18.10.2024 22:39, Stewart Hildebrand wrote:
> Add links between a VF's struct pci_dev and its associated PF struct
> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> dropping and re-acquiring the pcidevs_lock().
> 
> During PF removal, unlink VF from PF and mark the VF broken. As before,
> VFs may exist without a corresponding PF, although now only with
> pdev->broken = true.
> 
> The hardware domain is expected to remove the associated VFs before
> removing the PF. Print a warning in case a PF is removed with associated
> VFs still present.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> Candidate for backport to 4.19 (the next patch depends on this one)
> 
> v5->v6:
> * move printk() before ASSERT_UNREACHABLE()
> * warn about PF removal with VFs still present

Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
just after an adjustment to the commit message. I'm instead actively
concerned of the resulting behavior. Question is whether we can reasonably
do something about that.

Jan
Roger Pau Monné Oct. 28, 2024, 6:41 p.m. UTC | #2
On Fri, Oct 18, 2024 at 04:39:09PM -0400, Stewart Hildebrand wrote:
> Add links between a VF's struct pci_dev and its associated PF struct
> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> dropping and re-acquiring the pcidevs_lock().
> 
> During PF removal, unlink VF from PF and mark the VF broken. As before,
> VFs may exist without a corresponding PF, although now only with
> pdev->broken = true.
> 
> The hardware domain is expected to remove the associated VFs before
> removing the PF. Print a warning in case a PF is removed with associated
> VFs still present.
> 
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> Candidate for backport to 4.19 (the next patch depends on this one)
> 
> v5->v6:
> * move printk() before ASSERT_UNREACHABLE()
> * warn about PF removal with VFs still present
> * clarify commit message
> 
> v4->v5:
> * new patch, split from ("x86/msi: fix locking for SR-IOV devices")
> * move INIT_LIST_HEAD(&pdev->vf_list); earlier
> * collapse struct list_head instances
> * retain error code from pci_add_device()
> * unlink (and mark broken) VFs instead of removing them
> * const-ify VF->PF link
> ---
>  xen/drivers/passthrough/pci.c | 76 ++++++++++++++++++++++++++++-------
>  xen/include/xen/pci.h         | 10 +++++
>  2 files changed, 72 insertions(+), 14 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
> index 74d3895e1ef6..fe31255b1207 100644
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -333,6 +333,8 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>      *((u8*) &pdev->devfn) = devfn;
>      pdev->domain = NULL;
>  
> +    INIT_LIST_HEAD(&pdev->vf_list);
> +
>      arch_pci_init_pdev(pdev);
>  
>      rc = pdev_msi_init(pdev);
> @@ -449,6 +451,10 @@ static void free_pdev(struct pci_seg *pseg, struct pci_dev *pdev)
>  
>      list_del(&pdev->alldevs_list);
>      pdev_msi_deinit(pdev);
> +
> +    if ( pdev->info.is_virtfn && pdev->virtfn.pf_pdev )

Shouldn't having pdev->info.is_virtfn set already ensure that
pdev->virtfn.pf_pdev != NULL?

> +        list_del(&pdev->vf_list);
> +
>      xfree(pdev);
>  }
>  
> @@ -656,24 +662,11 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>      unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
>      const char *type;
>      int ret;
> -    bool pf_is_extfn = false;
>  
>      if ( !info )
>          type = "device";
>      else if ( info->is_virtfn )
> -    {
> -        pcidevs_lock();
> -        pdev = pci_get_pdev(NULL,
> -                            PCI_SBDF(seg, info->physfn.bus,
> -                                     info->physfn.devfn));
> -        if ( pdev )
> -            pf_is_extfn = pdev->info.is_extfn;
> -        pcidevs_unlock();
> -        if ( !pdev )
> -            pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
> -                           NULL, node);
>          type = "virtual function";
> -    }
>      else if ( info->is_extfn )
>          type = "extended function";
>      else
> @@ -703,7 +696,44 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>           * extended function.
>           */
>          if ( pdev->info.is_virtfn )
> -            pdev->info.is_extfn = pf_is_extfn;
> +        {
> +            struct pci_dev *pf_pdev;

This could be const?

> +
> +            pf_pdev = pci_get_pdev(NULL,
> +                                   PCI_SBDF(seg, info->physfn.bus,
> +                                            info->physfn.devfn));

You can probably initialize at declaration?

> +
> +            if ( !pf_pdev )

Is this even feasible during correct operation?  IOW: shouldn't the PF
always be added first, so that SR-IOV can be enabled and the VFs added
afterwards?

I see previous code also catered for VFs being added without the PF
being present, so I assume there was some need for this.

> +            {
> +                ret = pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
> +                                     NULL, node);
> +                if ( ret )
> +                {
> +                    printk(XENLOG_WARNING "Failed to add SR-IOV device PF %pp for VF %pp\n",

Could you split this to make the line a bit shorter?

                       printk(XENLOG_WARNING
		              "Failed to add SR-IOV device PF %pp for VF %pp\n",

Same below.

> +                           &PCI_SBDF(seg, info->physfn.bus, info->physfn.devfn),
> +                           &pdev->sbdf);
> +                    free_pdev(pseg, pdev);
> +                    goto out;
> +                }
> +                pf_pdev = pci_get_pdev(NULL,
> +                                       PCI_SBDF(seg, info->physfn.bus,
> +                                                info->physfn.devfn));
> +                if ( !pf_pdev )
> +                {
> +                    printk(XENLOG_ERR "Failed to find SR-IOV device PF %pp for VF %pp\n",
> +                           &PCI_SBDF(seg, info->physfn.bus, info->physfn.devfn),
> +                           &pdev->sbdf);

If you want to add an error message here, I think it should mention
the fact this state is not expected:

"Inconsistent PCI state: failed to find newly added PF %pp for VF %pp\n"

> +                    ASSERT_UNREACHABLE();
> +                    free_pdev(pseg, pdev);
> +                    ret = -EILSEQ;
> +                    goto out;
> +                }
> +            }
> +
> +            pdev->info.is_extfn = pf_pdev->info.is_extfn;
> +            pdev->virtfn.pf_pdev = pf_pdev;
> +            list_add(&pdev->vf_list, &pf_pdev->vf_list);
> +        }
>      }
>  
>      if ( !pdev->info.is_virtfn && !pdev->vf_rlen[0] )
> @@ -821,6 +851,24 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
>      list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
>          if ( pdev->bus == bus && pdev->devfn == devfn )
>          {
> +            if ( !pdev->info.is_virtfn )

Given we have no field to mark a device as a PF, we could check that
pdev->vf_list is not empty, and by doing so the warn_stale_vfs
variable could be dropped?

if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
{
    struct pci_dev *vf_pdev;

    while ( (vf_pdev = list_first_entry_or_null(&pdev->vf_list,
                                                struct pci_dev,
						vf_list)) != NULL )
    {
        list_del(&vf_pdev->vf_list);
        vf_pdev->virtfn.pf_pdev = NULL;
        vf_pdev->broken = true;
    }

    printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
           &pdev->sbdf);
}

> +            {
> +                struct pci_dev *vf_pdev, *tmp;
> +                bool warn_stale_vfs = false;
> +
> +                list_for_each_entry_safe(vf_pdev, tmp, &pdev->vf_list, vf_list)
> +                {
> +                    list_del(&vf_pdev->vf_list);
> +                    vf_pdev->virtfn.pf_pdev = NULL;
> +                    vf_pdev->broken = true;
> +                    warn_stale_vfs = true;
> +                }
> +
> +                if ( warn_stale_vfs )
> +                    printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
> +                           &pdev->sbdf);
> +            }
> +
>              if ( pdev->domain )
>              {
>                  write_lock(&pdev->domain->pci_lock);
> diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
> index ef56e80651d6..2ea168d5f914 100644
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -153,7 +153,17 @@ struct pci_dev {
>          unsigned int count;
>  #define PT_FAULT_THRESHOLD 10
>      } fault;
> +
> +    /*
> +     * List head if info.is_virtfn == false
> +     * List entry if info.is_virtfn == true
> +     */
> +    struct list_head vf_list;
>      u64 vf_rlen[6];
> +    struct {
> +        /* Only populated for VFs (info.is_virtfn == true) */

All comments here (specially the first ones) would better use PF and
VF consistently, rather than referring to other fields in the struct.
Specially because the fields can change names and the comments would
then become stale.

> +        const struct pci_dev *pf_pdev;        /* Link from VF to PF */
> +    } virtfn;

I'm unsure you need an outer virtfn struct, as it's only one field in
this patch?  Maybe more fields gets added by further patches?

Thanks, Roger.
Stewart Hildebrand Nov. 1, 2024, 8:16 p.m. UTC | #3
+Daniel (XSM mention)

On 10/28/24 13:02, Jan Beulich wrote:
> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>> Add links between a VF's struct pci_dev and its associated PF struct
>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>> dropping and re-acquiring the pcidevs_lock().
>>
>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>> VFs may exist without a corresponding PF, although now only with
>> pdev->broken = true.
>>
>> The hardware domain is expected to remove the associated VFs before
>> removing the PF. Print a warning in case a PF is removed with associated
>> VFs still present.
>>
>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>> ---
>> Candidate for backport to 4.19 (the next patch depends on this one)
>>
>> v5->v6:
>> * move printk() before ASSERT_UNREACHABLE()
>> * warn about PF removal with VFs still present
> 
> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> just after an adjustment to the commit message. I'm instead actively
> concerned of the resulting behavior. Question is whether we can reasonably
> do something about that.
> 
> Jan

Right. My suggestion then is to go back to roughly how it was done in
v4 [0]:

* Remove the VFs right away during PF removal, so that we don't end up
with stale VFs. Regarding XSM, assume that a domain with permission to
remove the PF is also allowed to remove the VFs. We should probably also
return an error from pci_remove_device in the case of removing the PF
with VFs still present (and still perform the removals despite returning
an error). Subsequent attempts by a domain to remove the VFs would
return an error (as they have already been removed), but that's expected
since we've taken a stance that PF-then-VF removal order is invalid
anyway.

While the above is what I prefer, I just want to mention other options I
considered for the scenario of PF removal with VFs still present:

* Increase the "scariness" of the warning message added in v6.

* Return an error from pci_remove_device (while still removing only the
PF). We would be left with stale VFs in Xen. At least this would
concretely inform dom0 that Xen takes issue with the PF-then-VF removal
order. Subsequent attempts by a domain to remove VFs, however
(un)likely, would succeed.

* Return an error from pci_remove_device and keep the PF and VFs. This
is IMO the worst option because then we would have a stale PF in
addition to stale VFs.

[0] https://lore.kernel.org/xen-devel/20240827035929.118003-1-stewart.hildebrand@amd.com/T/#t
Daniel P. Smith Nov. 2, 2024, 3:18 p.m. UTC | #4
On 11/1/24 16:16, Stewart Hildebrand wrote:
> +Daniel (XSM mention)
> 
> On 10/28/24 13:02, Jan Beulich wrote:
>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>> Add links between a VF's struct pci_dev and its associated PF struct
>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>> dropping and re-acquiring the pcidevs_lock().
>>>
>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>> VFs may exist without a corresponding PF, although now only with
>>> pdev->broken = true.
>>>
>>> The hardware domain is expected to remove the associated VFs before
>>> removing the PF. Print a warning in case a PF is removed with associated
>>> VFs still present.
>>>
>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>> ---
>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>
>>> v5->v6:
>>> * move printk() before ASSERT_UNREACHABLE()
>>> * warn about PF removal with VFs still present
>>
>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>> just after an adjustment to the commit message. I'm instead actively
>> concerned of the resulting behavior. Question is whether we can reasonably
>> do something about that.
>>
>> Jan
> 
> Right. My suggestion then is to go back to roughly how it was done in
> v4 [0]:
> 
> * Remove the VFs right away during PF removal, so that we don't end up
> with stale VFs. Regarding XSM, assume that a domain with permission to
> remove the PF is also allowed to remove the VFs. We should probably also
> return an error from pci_remove_device in the case of removing the PF
> with VFs still present (and still perform the removals despite returning
> an error). Subsequent attempts by a domain to remove the VFs would
> return an error (as they have already been removed), but that's expected
> since we've taken a stance that PF-then-VF removal order is invalid
> anyway.

I am not confident this is a safe assumption. It will likely be safe for 
probably 99% of the implementations. Apologies for not following 
closely, and correct me if I am wrong here, but from a resource 
perspective each VF can appear to the system as its own unique BDF and 
so I am fairly certain it would be possible to uniquely label each VF. 
For instance in the SVP architecture, the VF may be labeled to restrict 
control to a hardware domain within a Guest Virtual Platform while the 
PF may be restricted to the Supervisor Virtual Platform. In this 
scenario, the Guest would be torn down before the Supervisor so the VF 
should get released before the PF. But it's all theoretical, so I have 
no real implementation to point at that this could be checked/confirmed.

I am only raising this for awareness and not as an objection. If people 
want to punt that theoretical use case down the road until someone 
actually attempts it, I would not be opposed.

v/r,
dps
Jan Beulich Nov. 4, 2024, 7:44 a.m. UTC | #5
On 01.11.2024 21:16, Stewart Hildebrand wrote:
> +Daniel (XSM mention)
> 
> On 10/28/24 13:02, Jan Beulich wrote:
>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>> Add links between a VF's struct pci_dev and its associated PF struct
>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>> dropping and re-acquiring the pcidevs_lock().
>>>
>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>> VFs may exist without a corresponding PF, although now only with
>>> pdev->broken = true.
>>>
>>> The hardware domain is expected to remove the associated VFs before
>>> removing the PF. Print a warning in case a PF is removed with associated
>>> VFs still present.
>>>
>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>> ---
>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>
>>> v5->v6:
>>> * move printk() before ASSERT_UNREACHABLE()
>>> * warn about PF removal with VFs still present
>>
>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>> just after an adjustment to the commit message. I'm instead actively
>> concerned of the resulting behavior. Question is whether we can reasonably
>> do something about that.
> 
> Right. My suggestion then is to go back to roughly how it was done in
> v4 [0]:
> 
> * Remove the VFs right away during PF removal, so that we don't end up
> with stale VFs. Regarding XSM, assume that a domain with permission to
> remove the PF is also allowed to remove the VFs. We should probably also
> return an error from pci_remove_device in the case of removing the PF
> with VFs still present (and still perform the removals despite returning
> an error). Subsequent attempts by a domain to remove the VFs would
> return an error (as they have already been removed), but that's expected
> since we've taken a stance that PF-then-VF removal order is invalid
> anyway.

Imo going back is not an option.

> While the above is what I prefer, I just want to mention other options I
> considered for the scenario of PF removal with VFs still present:
> 
> * Increase the "scariness" of the warning message added in v6.
> 
> * Return an error from pci_remove_device (while still removing only the
> PF). We would be left with stale VFs in Xen. At least this would
> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
> order. Subsequent attempts by a domain to remove VFs, however
> (un)likely, would succeed.

Returning an error in such a case is a possibility, but comes with the
risk of confusion. Seeing such an error, a caller may itself assume the
device still is there, and retry its (with or without having removed the
VFs) removal at a later point.

> * Return an error from pci_remove_device and keep the PF and VFs. This
> is IMO the worst option because then we would have a stale PF in
> addition to stale VFs.

Yet this would at least be self-consistent, unlike the variant above. No
matter what, any failure to remove VFs and/or PFs correctly will need to
result in there being no attempt to physically remove the device.

You didn't enumerate an option lightly mentioned before, perhaps because
of its anticipated intrusiveness: Re-associate stale VFs with their PF,
once the PF is re-reported. Problem of course is that, aiui, the VFs
could in principle re-appear at a different BDF (albeit we have other
issues with potential bus-renumbering done by Dom0), and their count
could also change.

Jan

> [0] https://lore.kernel.org/xen-devel/20240827035929.118003-1-stewart.hildebrand@amd.com/T/#t
Alejandro Vallejo Nov. 4, 2024, 11:45 a.m. UTC | #6
On Sat Nov 2, 2024 at 3:18 PM GMT, Daniel P. Smith wrote:
> On 11/1/24 16:16, Stewart Hildebrand wrote:
> > +Daniel (XSM mention)
> > 
> > On 10/28/24 13:02, Jan Beulich wrote:
> >> On 18.10.2024 22:39, Stewart Hildebrand wrote:
> >>> Add links between a VF's struct pci_dev and its associated PF struct
> >>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> >>> dropping and re-acquiring the pcidevs_lock().
> >>>
> >>> During PF removal, unlink VF from PF and mark the VF broken. As before,
> >>> VFs may exist without a corresponding PF, although now only with
> >>> pdev->broken = true.
> >>>
> >>> The hardware domain is expected to remove the associated VFs before
> >>> removing the PF. Print a warning in case a PF is removed with associated
> >>> VFs still present.
> >>>
> >>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> >>> ---
> >>> Candidate for backport to 4.19 (the next patch depends on this one)
> >>>
> >>> v5->v6:
> >>> * move printk() before ASSERT_UNREACHABLE()
> >>> * warn about PF removal with VFs still present
> >>
> >> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> >> just after an adjustment to the commit message. I'm instead actively
> >> concerned of the resulting behavior. Question is whether we can reasonably
> >> do something about that.
> >>
> >> Jan
> > 
> > Right. My suggestion then is to go back to roughly how it was done in
> > v4 [0]:
> > 
> > * Remove the VFs right away during PF removal, so that we don't end up
> > with stale VFs. Regarding XSM, assume that a domain with permission to
> > remove the PF is also allowed to remove the VFs. We should probably also
> > return an error from pci_remove_device in the case of removing the PF
> > with VFs still present (and still perform the removals despite returning
> > an error). Subsequent attempts by a domain to remove the VFs would
> > return an error (as they have already been removed), but that's expected
> > since we've taken a stance that PF-then-VF removal order is invalid
> > anyway.
>
> I am not confident this is a safe assumption. It will likely be safe for 
> probably 99% of the implementations. Apologies for not following 
> closely, and correct me if I am wrong here, but from a resource 
> perspective each VF can appear to the system as its own unique BDF and 
> so I am fairly certain it would be possible to uniquely label each VF. 
> For instance in the SVP architecture, the VF may be labeled to restrict 
> control to a hardware domain within a Guest Virtual Platform while the 
> PF may be restricted to the Supervisor Virtual Platform. In this 
> scenario, the Guest would be torn down before the Supervisor so the VF 
> should get released before the PF. But it's all theoretical, so I have 
> no real implementation to point at that this could be checked/confirmed.
>
> I am only raising this for awareness and not as an objection. If people 
> want to punt that theoretical use case down the road until someone 
> actually attempts it, I would not be opposed.

Wouldn't it stand to reason then to act conditionally on the authority of the
caller?

i.e: If the caller has the (XSM-checked) authority to remove _BOTH_ PF and
VFs, remove all. If it doesn't have authority to remove the VFs then early exit
with an error, leaving the PF behind as well.

That would do the clean thing in the common case and be consistent with the
security policy even with a conflicting policy. The semantics are somewhat more
complex, but trying to remove a PF before removing the VFs is silly and the
only sensible thing (imo) is to help out during cleanup _or_ be strict about
checking.

>
> v/r,
> dps

Cheers,
Alejandro
Roger Pau Monné Nov. 5, 2024, 9:03 a.m. UTC | #7
On Sat, Nov 02, 2024 at 11:18:24AM -0400, Daniel P. Smith wrote:
> On 11/1/24 16:16, Stewart Hildebrand wrote:
> > +Daniel (XSM mention)
> > 
> > On 10/28/24 13:02, Jan Beulich wrote:
> > > On 18.10.2024 22:39, Stewart Hildebrand wrote:
> > > > Add links between a VF's struct pci_dev and its associated PF struct
> > > > pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> > > > dropping and re-acquiring the pcidevs_lock().
> > > > 
> > > > During PF removal, unlink VF from PF and mark the VF broken. As before,
> > > > VFs may exist without a corresponding PF, although now only with
> > > > pdev->broken = true.
> > > > 
> > > > The hardware domain is expected to remove the associated VFs before
> > > > removing the PF. Print a warning in case a PF is removed with associated
> > > > VFs still present.
> > > > 
> > > > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > > > ---
> > > > Candidate for backport to 4.19 (the next patch depends on this one)
> > > > 
> > > > v5->v6:
> > > > * move printk() before ASSERT_UNREACHABLE()
> > > > * warn about PF removal with VFs still present
> > > 
> > > Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> > > just after an adjustment to the commit message. I'm instead actively
> > > concerned of the resulting behavior. Question is whether we can reasonably
> > > do something about that.
> > > 
> > > Jan
> > 
> > Right. My suggestion then is to go back to roughly how it was done in
> > v4 [0]:
> > 
> > * Remove the VFs right away during PF removal, so that we don't end up
> > with stale VFs. Regarding XSM, assume that a domain with permission to
> > remove the PF is also allowed to remove the VFs. We should probably also
> > return an error from pci_remove_device in the case of removing the PF
> > with VFs still present (and still perform the removals despite returning
> > an error). Subsequent attempts by a domain to remove the VFs would
> > return an error (as they have already been removed), but that's expected
> > since we've taken a stance that PF-then-VF removal order is invalid
> > anyway.
> 
> I am not confident this is a safe assumption. It will likely be safe for
> probably 99% of the implementations. Apologies for not following closely,
> and correct me if I am wrong here, but from a resource perspective each VF
> can appear to the system as its own unique BDF and so I am fairly certain it
> would be possible to uniquely label each VF. For instance in the SVP
> architecture, the VF may be labeled to restrict control to a hardware domain
> within a Guest Virtual Platform while the PF may be restricted to the
> Supervisor Virtual Platform. In this scenario, the Guest would be torn down
> before the Supervisor so the VF should get released before the PF.

The VF getting released before the PF is what we would usually expect?

I'm a bit confused because a guest being torn down doesn't imply that
the device is removed from Xen (iow: a call to pci_remove_device()).
Removing a device is hot-unplugging the PCI device from Xen, not
deassigning it from a guest.

I would also be uneasy with assigning a PF to a non-privileged domain,
specially if VFs from that same device are being assigned to trusted
domains.

My assumption is that you generally want the PFs assigned to the
hardware domain, and the VFs assigned to any other domains (trusted or
not).

Thanks, Roger.
Roger Pau Monné Nov. 5, 2024, 9:08 a.m. UTC | #8
On Mon, Nov 04, 2024 at 11:45:05AM +0000, Alejandro Vallejo wrote:
> On Sat Nov 2, 2024 at 3:18 PM GMT, Daniel P. Smith wrote:
> > On 11/1/24 16:16, Stewart Hildebrand wrote:
> > > +Daniel (XSM mention)
> > > 
> > > On 10/28/24 13:02, Jan Beulich wrote:
> > >> On 18.10.2024 22:39, Stewart Hildebrand wrote:
> > >>> Add links between a VF's struct pci_dev and its associated PF struct
> > >>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> > >>> dropping and re-acquiring the pcidevs_lock().
> > >>>
> > >>> During PF removal, unlink VF from PF and mark the VF broken. As before,
> > >>> VFs may exist without a corresponding PF, although now only with
> > >>> pdev->broken = true.
> > >>>
> > >>> The hardware domain is expected to remove the associated VFs before
> > >>> removing the PF. Print a warning in case a PF is removed with associated
> > >>> VFs still present.
> > >>>
> > >>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> > >>> ---
> > >>> Candidate for backport to 4.19 (the next patch depends on this one)
> > >>>
> > >>> v5->v6:
> > >>> * move printk() before ASSERT_UNREACHABLE()
> > >>> * warn about PF removal with VFs still present
> > >>
> > >> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> > >> just after an adjustment to the commit message. I'm instead actively
> > >> concerned of the resulting behavior. Question is whether we can reasonably
> > >> do something about that.
> > >>
> > >> Jan
> > > 
> > > Right. My suggestion then is to go back to roughly how it was done in
> > > v4 [0]:
> > > 
> > > * Remove the VFs right away during PF removal, so that we don't end up
> > > with stale VFs. Regarding XSM, assume that a domain with permission to
> > > remove the PF is also allowed to remove the VFs. We should probably also
> > > return an error from pci_remove_device in the case of removing the PF
> > > with VFs still present (and still perform the removals despite returning
> > > an error). Subsequent attempts by a domain to remove the VFs would
> > > return an error (as they have already been removed), but that's expected
> > > since we've taken a stance that PF-then-VF removal order is invalid
> > > anyway.
> >
> > I am not confident this is a safe assumption. It will likely be safe for 
> > probably 99% of the implementations. Apologies for not following 
> > closely, and correct me if I am wrong here, but from a resource 
> > perspective each VF can appear to the system as its own unique BDF and 
> > so I am fairly certain it would be possible to uniquely label each VF. 
> > For instance in the SVP architecture, the VF may be labeled to restrict 
> > control to a hardware domain within a Guest Virtual Platform while the 
> > PF may be restricted to the Supervisor Virtual Platform. In this 
> > scenario, the Guest would be torn down before the Supervisor so the VF 
> > should get released before the PF. But it's all theoretical, so I have 
> > no real implementation to point at that this could be checked/confirmed.
> >
> > I am only raising this for awareness and not as an objection. If people 
> > want to punt that theoretical use case down the road until someone 
> > actually attempts it, I would not be opposed.
> 
> Wouldn't it stand to reason then to act conditionally on the authority of the
> caller?
> 
> i.e: If the caller has the (XSM-checked) authority to remove _BOTH_ PF and
> VFs, remove all. If it doesn't have authority to remove the VFs then early exit
> with an error, leaving the PF behind as well.

I'm unsure if it makes sense to have an entity that's allowed to issue
a pci_remove_device() against a PF, but not against the VFs of the
device.

The owner of the PF should be capable of disabling SR-IOV, at which
point all the VFs disappear from the PCI config space.  If such entity
is capable of controlling the SR-IOV capability, it should also be
able to issue pci_remove_device() calls against the VFs.

Thanks, Roger.
Daniel P. Smith Nov. 6, 2024, 4:20 p.m. UTC | #9
On 11/5/24 04:08, Roger Pau Monné wrote:
> On Mon, Nov 04, 2024 at 11:45:05AM +0000, Alejandro Vallejo wrote:
>> On Sat Nov 2, 2024 at 3:18 PM GMT, Daniel P. Smith wrote:
>>> On 11/1/24 16:16, Stewart Hildebrand wrote:
>>>> +Daniel (XSM mention)
>>>>
>>>> On 10/28/24 13:02, Jan Beulich wrote:
>>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>>>> dropping and re-acquiring the pcidevs_lock().
>>>>>>
>>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>>>> VFs may exist without a corresponding PF, although now only with
>>>>>> pdev->broken = true.
>>>>>>
>>>>>> The hardware domain is expected to remove the associated VFs before
>>>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>>>> VFs still present.
>>>>>>
>>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>>> ---
>>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>>>
>>>>>> v5->v6:
>>>>>> * move printk() before ASSERT_UNREACHABLE()
>>>>>> * warn about PF removal with VFs still present
>>>>>
>>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>>>> just after an adjustment to the commit message. I'm instead actively
>>>>> concerned of the resulting behavior. Question is whether we can reasonably
>>>>> do something about that.
>>>>>
>>>>> Jan
>>>>
>>>> Right. My suggestion then is to go back to roughly how it was done in
>>>> v4 [0]:
>>>>
>>>> * Remove the VFs right away during PF removal, so that we don't end up
>>>> with stale VFs. Regarding XSM, assume that a domain with permission to
>>>> remove the PF is also allowed to remove the VFs. We should probably also
>>>> return an error from pci_remove_device in the case of removing the PF
>>>> with VFs still present (and still perform the removals despite returning
>>>> an error). Subsequent attempts by a domain to remove the VFs would
>>>> return an error (as they have already been removed), but that's expected
>>>> since we've taken a stance that PF-then-VF removal order is invalid
>>>> anyway.
>>>
>>> I am not confident this is a safe assumption. It will likely be safe for
>>> probably 99% of the implementations. Apologies for not following
>>> closely, and correct me if I am wrong here, but from a resource
>>> perspective each VF can appear to the system as its own unique BDF and
>>> so I am fairly certain it would be possible to uniquely label each VF.
>>> For instance in the SVP architecture, the VF may be labeled to restrict
>>> control to a hardware domain within a Guest Virtual Platform while the
>>> PF may be restricted to the Supervisor Virtual Platform. In this
>>> scenario, the Guest would be torn down before the Supervisor so the VF
>>> should get released before the PF. But it's all theoretical, so I have
>>> no real implementation to point at that this could be checked/confirmed.
>>>
>>> I am only raising this for awareness and not as an objection. If people
>>> want to punt that theoretical use case down the road until someone
>>> actually attempts it, I would not be opposed.
>>
>> Wouldn't it stand to reason then to act conditionally on the authority of the
>> caller?
>>
>> i.e: If the caller has the (XSM-checked) authority to remove _BOTH_ PF and
>> VFs, remove all. If it doesn't have authority to remove the VFs then early exit
>> with an error, leaving the PF behind as well.
> 
> I'm unsure if it makes sense to have an entity that's allowed to issue
> a pci_remove_device() against a PF, but not against the VFs of the
> device.

Apologies for not fully defining how SVP would work. The Supervisor is 
the one of the few domains considered running at the higher trust level.

When I was referring to restricting the VF, for instance, that a VF of 
L:M:N may be assigned label gvp1_pci_t that only allows guest VP 1 
access, while VF X:Y:Z would be labeled gvp2_pci_t that grants guest VP 
2 access only.

At the same time, the Supervisor would be allowed in the policy to 
remove all VFs, in the example above it would have access to 
gvp1/2_pci_t labeled devices. In theory, it would have attempted to have 
the Guest VP tear down (or unplug the VF in a hotplug scenario) the 
virtual device. But in the end, it can't rely on the Guest VP to 
properly shutdown/unplug, and must be able to properly manage the system.

> The owner of the PF should be capable of disabling SR-IOV, at which
> point all the VFs disappear from the PCI config space.  If such entity
> is capable of controlling the SR-IOV capability, it should also be
> able to issue pci_remove_device() calls against the VFs.

Correct, as I mentioned above.

v/r,
dps
Daniel P. Smith Nov. 6, 2024, 4:31 p.m. UTC | #10
On 11/5/24 04:03, Roger Pau Monné wrote:
> On Sat, Nov 02, 2024 at 11:18:24AM -0400, Daniel P. Smith wrote:
>> On 11/1/24 16:16, Stewart Hildebrand wrote:
>>> +Daniel (XSM mention)
>>>
>>> On 10/28/24 13:02, Jan Beulich wrote:
>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>>> dropping and re-acquiring the pcidevs_lock().
>>>>>
>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>>> VFs may exist without a corresponding PF, although now only with
>>>>> pdev->broken = true.
>>>>>
>>>>> The hardware domain is expected to remove the associated VFs before
>>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>>> VFs still present.
>>>>>
>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>> ---
>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>>
>>>>> v5->v6:
>>>>> * move printk() before ASSERT_UNREACHABLE()
>>>>> * warn about PF removal with VFs still present
>>>>
>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>>> just after an adjustment to the commit message. I'm instead actively
>>>> concerned of the resulting behavior. Question is whether we can reasonably
>>>> do something about that.
>>>>
>>>> Jan
>>>
>>> Right. My suggestion then is to go back to roughly how it was done in
>>> v4 [0]:
>>>
>>> * Remove the VFs right away during PF removal, so that we don't end up
>>> with stale VFs. Regarding XSM, assume that a domain with permission to
>>> remove the PF is also allowed to remove the VFs. We should probably also
>>> return an error from pci_remove_device in the case of removing the PF
>>> with VFs still present (and still perform the removals despite returning
>>> an error). Subsequent attempts by a domain to remove the VFs would
>>> return an error (as they have already been removed), but that's expected
>>> since we've taken a stance that PF-then-VF removal order is invalid
>>> anyway.
>>
>> I am not confident this is a safe assumption. It will likely be safe for
>> probably 99% of the implementations. Apologies for not following closely,
>> and correct me if I am wrong here, but from a resource perspective each VF
>> can appear to the system as its own unique BDF and so I am fairly certain it
>> would be possible to uniquely label each VF. For instance in the SVP
>> architecture, the VF may be labeled to restrict control to a hardware domain
>> within a Guest Virtual Platform while the PF may be restricted to the
>> Supervisor Virtual Platform. In this scenario, the Guest would be torn down
>> before the Supervisor so the VF should get released before the PF.
> 
> The VF getting released before the PF is what we would usually expect?
> 
> I'm a bit confused because a guest being torn down doesn't imply that
> the device is removed from Xen (iow: a call to pci_remove_device()).
> Removing a device is hot-unplugging the PCI device from Xen, not
> deassinging from a guest.

I was providing a use-case that was crafted, just not implemented. I 
have not looked at SRIOV in some time and not at the level necessary to 
determine this, but I would be uneasy thinking a VF could just be 
released from a guest to be reassigned either to the host hardware 
domain, aka Supervisor VP, or another guest. Perhaps a reset of the VF 
is enough or it might need more, I have not looked at the details at 
this point since we are still ages away from Xen being exactly capable 
of really doing a true implementation of the SVP architecture.

> I would also be uneasy with assigning a PF to a non-privileged domain,
> specially if VFs from that same device are being assigned to trusted
> domains.

As mentioned in the other response, the PF is in a higher trusted domain 
level than where the VFs would be assigned.

> My assumption is that you generally want the PFs assigned to the
> hardware domain, and the VFs assigned to any other domains (trusted or
> not).

In the SVP architecture, there is more than one "hardware domain".

v/r,
dps
Daniel P. Smith Nov. 6, 2024, 5:04 p.m. UTC | #11
On 11/4/24 06:45, Alejandro Vallejo wrote:
> On Sat Nov 2, 2024 at 3:18 PM GMT, Daniel P. Smith wrote:
>> On 11/1/24 16:16, Stewart Hildebrand wrote:
>>> +Daniel (XSM mention)
>>>
>>> On 10/28/24 13:02, Jan Beulich wrote:
>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>>> dropping and re-acquiring the pcidevs_lock().
>>>>>
>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>>> VFs may exist without a corresponding PF, although now only with
>>>>> pdev->broken = true.
>>>>>
>>>>> The hardware domain is expected to remove the associated VFs before
>>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>>> VFs still present.
>>>>>
>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>> ---
>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>>
>>>>> v5->v6:
>>>>> * move printk() before ASSERT_UNREACHABLE()
>>>>> * warn about PF removal with VFs still present
>>>>
>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>>> just after an adjustment to the commit message. I'm instead actively
>>>> concerned of the resulting behavior. Question is whether we can reasonably
>>>> do something about that.
>>>>
>>>> Jan
>>>
>>> Right. My suggestion then is to go back to roughly how it was done in
>>> v4 [0]:
>>>
>>> * Remove the VFs right away during PF removal, so that we don't end up
>>> with stale VFs. Regarding XSM, assume that a domain with permission to
>>> remove the PF is also allowed to remove the VFs. We should probably also
>>> return an error from pci_remove_device in the case of removing the PF
>>> with VFs still present (and still perform the removals despite returning
>>> an error). Subsequent attempts by a domain to remove the VFs would
>>> return an error (as they have already been removed), but that's expected
>>> since we've taken a stance that PF-then-VF removal order is invalid
>>> anyway.
>>
>> I am not confident this is a safe assumption. It will likely be safe for
>> probably 99% of the implementations. Apologies for not following
>> closely, and correct me if I am wrong here, but from a resource
>> perspective each VF can appear to the system as its own unique BDF and
>> so I am fairly certain it would be possible to uniquely label each VF.
>> For instance in the SVP architecture, the VF may be labeled to restrict
>> control to a hardware domain within a Guest Virtual Platform while the
>> PF may be restricted to the Supervisor Virtual Platform. In this
>> scenario, the Guest would be torn down before the Supervisor so the VF
>> should get released before the PF. But it's all theoretical, so I have
>> no real implementation to point at that this could be checked/confirmed.
>>
>> I am only raising this for awareness and not as an objection. If people
>> want to punt that theoretical use case down the road until someone
>> actually attempts it, I would not be opposed.
> 
> Wouldn't it stand to reason then to act conditionally on the authority of the
> caller?

Correct, and as I hopefully clarified in response to Roger, like 
everything with virtualization, it's turtles all the way down. Try 
having this model in mind,

   +----------------+  +-------------------------------+
   | Supervisor VP  |  | Guest VP                      |
   |                |  |                               |
   | +------------+ |  | +------------+ +------------+ |
   | |    HWDOM   | |  | |    HWDOM   | |    Guest   | |
   | |  +------+  | |  | |  +------+  | |  +------+  | |
   | |  |      |  | |  | |  |      |  | |  |      |  | |
   | |  |  PF  |  | |  | |  |  VF  +--+-+-->  PCI |  | |
   | |  |      |  | |  | |  |      |  | |  |      |  | |
   | |  +------+  | |  | |  +------+  | |  +------+  | |
   | +------------+ |  | +------------+ +------------+ |
   +----------------+  +-------------------------------+


> i.e: If the caller has the (XSM-checked) authority to remove _BOTH_ PF and
> VFs, remove all. If it doesn't have authority to remove the VFs then early exit
> with an error, leaving the PF behind as well.

Yes, I would agree that is reasonable. Please, I am not trying to make 
this complicated, but just trying to give a more advanced (nested?) 
model to consider. Not something that we have to figure out all the 
details of how to make it all work. I am just saying, consider that a VF 
may have a different label than the PF which allows another domain to be 
able to take actions on the device as if the domain fully owns that 
virtual PCI device. A code comment around the xsm check(s) could state 
that it is expected that a system with access to the PF label must have 
access to all VF labels and the dummy policy should already be wired up 
to only allow the hwdom to do it.

> That would do the clean thing in the common case and be consistent with the
> security policy even with a conflicting policy. The semantics are somewhat more
> complex, but trying to remove a PF before removing the VFs is silly and the
> only sensible thing (imo) is to help out during cleanup _or_ be strict about
> checking.

Yes, and I was not trying to suggest the PF would be removed before the 
VFs. Apologies if it sounds as though I am repeating myself; all that 
I was attempting to say is that the VFs should be able to be checked 
separately from the PF. This would allow a domain that was granted 
ownership of a VF, to be able to unplug the VF and only that VF from the 
system. I would leave it to the PCI logic to manage the sequencing of 
the PF and VF operations. In such a scenario, I would expect the hwdom 
to have the facilities to monitor/check the status of the VF to manage 
the case that a guest unplugged it.

v/r,
dps
Stewart Hildebrand Nov. 7, 2024, 2:32 p.m. UTC | #12
On 11/4/24 02:44, Jan Beulich wrote:
> On 01.11.2024 21:16, Stewart Hildebrand wrote:
>> +Daniel (XSM mention)
>>
>> On 10/28/24 13:02, Jan Beulich wrote:
>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>> dropping and re-acquiring the pcidevs_lock().
>>>>
>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>> VFs may exist without a corresponding PF, although now only with
>>>> pdev->broken = true.
>>>>
>>>> The hardware domain is expected to remove the associated VFs before
>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>> VFs still present.
>>>>
>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>> ---
>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>
>>>> v5->v6:
>>>> * move printk() before ASSERT_UNREACHABLE()
>>>> * warn about PF removal with VFs still present
>>>
>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>> just after an adjustment to the commit message. I'm instead actively
>>> concerned of the resulting behavior. Question is whether we can reasonably
>>> do something about that.
>>
>> Right. My suggestion then is to go back to roughly how it was done in
>> v4 [0]:
>>
>> * Remove the VFs right away during PF removal, so that we don't end up
>> with stale VFs. Regarding XSM, assume that a domain with permission to
>> remove the PF is also allowed to remove the VFs. We should probably also
>> return an error from pci_remove_device in the case of removing the PF
>> with VFs still present (and still perform the removals despite returning
>> an error). Subsequent attempts by a domain to remove the VFs would
>> return an error (as they have already been removed), but that's expected
>> since we've taken a stance that PF-then-VF removal order is invalid
>> anyway.
> 
> Imo going back is not an option.
> 
>> While the above is what I prefer, I just want to mention other options I
>> considered for the scenario of PF removal with VFs still present:
>>
>> * Increase the "scariness" of the warning message added in v6.
>>
>> * Return an error from pci_remove_device (while still removing only the
>> PF). We would be left with stale VFs in Xen. At least this would
>> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
>> order. Subsequent attempts by a domain to remove VFs, however
>> (un)likely, would succeed.
> 
> Returning an error in such a case is a possibility, but comes with the
> risk of confusion. Seeing such an error, a caller may itself assume the
> device still is there, and retry its (with or without having removed the
> VFs) removal at a later point.
> 
>> * Return an error from pci_remove_device and keep the PF and VFs. This
>> is IMO the worst option because then we would have a stale PF in
>> addition to stale VFs.
> 
> Yet this would at least be self-consistent, unlike the variant above. No
> matter what, any failure to remove VFs and/or PFs correctly will need to
> result in there being no attempt to physically remove the device.
> 
> You didn't enumerate an option lightly mentioned before, perhaps because
> of its anticipated intrusiveness: Re-associate stale VFs with their PF,
> once the PF is re-reported.

I'll plan on this approach for v7.

> Problem of course is that, aiui, the VFs
> could in principle re-appear at a different BDF (albeit we have other
> issues with potential bus-renumbering done by Dom0), and their count
> could also change.
> 
> Jan
> 
>> [0] https://lore.kernel.org/xen-devel/20240827035929.118003-1-stewart.hildebrand@amd.com/T/#t
>
Alejandro Vallejo Nov. 8, 2024, 12:42 p.m. UTC | #13
On Mon Nov 4, 2024 at 7:44 AM GMT, Jan Beulich wrote:
> On 01.11.2024 21:16, Stewart Hildebrand wrote:
> > +Daniel (XSM mention)
> > 
> > On 10/28/24 13:02, Jan Beulich wrote:
> >> On 18.10.2024 22:39, Stewart Hildebrand wrote:
> >>> Add links between a VF's struct pci_dev and its associated PF struct
> >>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> >>> dropping and re-acquiring the pcidevs_lock().
> >>>
> >>> During PF removal, unlink VF from PF and mark the VF broken. As before,
> >>> VFs may exist without a corresponding PF, although now only with
> >>> pdev->broken = true.
> >>>
> >>> The hardware domain is expected to remove the associated VFs before
> >>> removing the PF. Print a warning in case a PF is removed with associated
> >>> VFs still present.
> >>>
> >>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> >>> ---
> >>> Candidate for backport to 4.19 (the next patch depends on this one)
> >>>
> >>> v5->v6:
> >>> * move printk() before ASSERT_UNREACHABLE()
> >>> * warn about PF removal with VFs still present
> >>
> >> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> >> just after an adjustment to the commit message. I'm instead actively
> >> concerned of the resulting behavior. Question is whether we can reasonably
> >> do something about that.
> > 
> > Right. My suggestion then is to go back to roughly how it was done in
> > v4 [0]:
> > 
> > * Remove the VFs right away during PF removal, so that we don't end up
> > with stale VFs. Regarding XSM, assume that a domain with permission to
> > remove the PF is also allowed to remove the VFs. We should probably also
> > return an error from pci_remove_device in the case of removing the PF
> > with VFs still present (and still perform the removals despite returning
> > an error). Subsequent attempts by a domain to remove the VFs would
> > return an error (as they have already been removed), but that's expected
> > since we've taken a stance that PF-then-VF removal order is invalid
> > anyway.
>
> Imo going back is not an option.
>
> > While the above is what I prefer, I just want to mention other options I
> > considered for the scenario of PF removal with VFs still present:
> > 
> > * Increase the "scariness" of the warning message added in v6.
> > 
> > * Return an error from pci_remove_device (while still removing only the
> > PF). We would be left with stale VFs in Xen. At least this would
> > concretely inform dom0 that Xen takes issue with the PF-then-VF removal
> > order. Subsequent attempts by a domain to remove VFs, however
> > (un)likely, would succeed.
>
> Returning an error in such a case is a possibility, but comes with the
> risk of confusion. Seeing such an error, a caller may itself assume the
> device still is there, and retry its (with or without having removed the
> VFs) removal at a later point.
>
> > * Return an error from pci_remove_device and keep the PF and VFs. This
> > is IMO the worst option because then we would have a stale PF in
> > addition to stale VFs.
>
> Yet this would at least be self-consistent, unlike the variant above. No
> matter what, any failure to remove VFs and/or PFs correctly will need to
> result in there being no attempt to physically remove the device.
>
> You didn't enumerate an option lightly mentioned before, perhaps because
> of its anticipated intrusiveness: Re-associate stale VFs with their PF,
> once the PF is re-reported. Problem of course is that, aiui, the VFs
> could in principle re-appear at a different BDF (albeit we have other
> issues with potential bus-renumbering done by Dom0), and their count
> could also change.

Are you enumerating it for completeness or suggesting it should be done?

Maybe I'm missing something here (and please, do tell me what if so), but why
would this option be desirable at all? What would benefit from such semantics
(as opposed to any of the others)? It would break the lifetime dependency
between PF and VFs, and that doesn't strike me as a feature. It also turns
kernel bugs into fine implementations by making promises about how state is
persisted, but the consequences of that appear to be too far reaching to know
for sure it's 100% ok.

From afar, it sounds like trying to turn a bug into a feature. And that cannot
always be done sanely. But again, maybe I might very well be missing
something...

>
> Jan
>
> > [0] https://lore.kernel.org/xen-devel/20240827035929.118003-1-stewart.hildebrand@amd.com/T/#t

Cheers,
Alejandro
Jan Beulich Nov. 8, 2024, 1:17 p.m. UTC | #14
On 08.11.2024 13:42, Alejandro Vallejo wrote:
> On Mon Nov 4, 2024 at 7:44 AM GMT, Jan Beulich wrote:
>> On 01.11.2024 21:16, Stewart Hildebrand wrote:
>>> +Daniel (XSM mention)
>>>
>>> On 10/28/24 13:02, Jan Beulich wrote:
>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>>> dropping and re-acquiring the pcidevs_lock().
>>>>>
>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>>> VFs may exist without a corresponding PF, although now only with
>>>>> pdev->broken = true.
>>>>>
>>>>> The hardware domain is expected to remove the associated VFs before
>>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>>> VFs still present.
>>>>>
>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>> ---
>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>>
>>>>> v5->v6:
>>>>> * move printk() before ASSERT_UNREACHABLE()
>>>>> * warn about PF removal with VFs still present
>>>>
>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>>> just after an adjustment to the commit message. I'm instead actively
>>>> concerned of the resulting behavior. Question is whether we can reasonably
>>>> do something about that.
>>>
>>> Right. My suggestion then is to go back to roughly how it was done in
>>> v4 [0]:
>>>
>>> * Remove the VFs right away during PF removal, so that we don't end up
>>> with stale VFs. Regarding XSM, assume that a domain with permission to
>>> remove the PF is also allowed to remove the VFs. We should probably also
>>> return an error from pci_remove_device in the case of removing the PF
>>> with VFs still present (and still perform the removals despite returning
>>> an error). Subsequent attempts by a domain to remove the VFs would
>>> return an error (as they have already been removed), but that's expected
>>> since we've taken a stance that PF-then-VF removal order is invalid
>>> anyway.
>>
>> Imo going back is not an option.
>>
>>> While the above is what I prefer, I just want to mention other options I
>>> considered for the scenario of PF removal with VFs still present:
>>>
>>> * Increase the "scariness" of the warning message added in v6.
>>>
>>> * Return an error from pci_remove_device (while still removing only the
>>> PF). We would be left with stale VFs in Xen. At least this would
>>> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
>>> order. Subsequent attempts by a domain to remove VFs, however
>>> (un)likely, would succeed.
>>
>> Returning an error in such a case is a possibility, but comes with the
>> risk of confusion. Seeing such an error, a caller may itself assume the
>> device still is there, and retry its (with or without having removed the
>> VFs) removal at a later point.
>>
>>> * Return an error from pci_remove_device and keep the PF and VFs. This
>>> is IMO the worst option because then we would have a stale PF in
>>> addition to stale VFs.
>>
>> Yet this would at least be self-consistent, unlike the variant above. No
>> matter what, any failure to remove VFs and/or PFs correctly will need to
>> result in there being no attempt to physically remove the device.
>>
>> You didn't enumerate an option lightly mentioned before, perhaps because
>> of its anticipated intrusiveness: Re-associate stale VFs with their PF,
>> once the PF is re-reported. Problem of course is that, aiui, the VFs
>> could in principle re-appear at a different BDF (albeit we have other
>> issues with potential bus-renumbering done by Dom0), and their count
>> could also change.
> 
> Are you enumerating it for completeness or suggesting it should be done?

I was meaning to suggest that it should at least be considered.

> Maybe I'm missing something here (and please, do tell me what if so), but why
> would this option be desirable at all? What would benefit from such semantics
> (as opposed to any of the others)? It would break the lifetime dependency
> between PF and VFs, and that doesn't strike me as a feature. It also turns
> kernel bugs into a fine implementations by making promises about how state is
> persisted, but the consequences of that appear to be too far reaching to know
> for sure it's 100% ok.
> 
> From afar, it sounds like trying to turn a bug into a feature. And that cannot
> always be done sanely. But again, maybe I might very well be missing
> something...

My main point is that the other suggested options have weaknesses, too.
Leaving stale VFs around forever isn't, imo, any better than trying to
reuse their structs.

Jan
Roger Pau Monné Nov. 8, 2024, 3:19 p.m. UTC | #15
On Fri, Nov 08, 2024 at 02:17:40PM +0100, Jan Beulich wrote:
> On 08.11.2024 13:42, Alejandro Vallejo wrote:
> > On Mon Nov 4, 2024 at 7:44 AM GMT, Jan Beulich wrote:
> >> On 01.11.2024 21:16, Stewart Hildebrand wrote:
> >>> +Daniel (XSM mention)
> >>>
> >>> On 10/28/24 13:02, Jan Beulich wrote:
> >>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
> >>>>> Add links between a VF's struct pci_dev and its associated PF struct
> >>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> >>>>> dropping and re-acquiring the pcidevs_lock().
> >>>>>
> >>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
> >>>>> VFs may exist without a corresponding PF, although now only with
> >>>>> pdev->broken = true.
> >>>>>
> >>>>> The hardware domain is expected to remove the associated VFs before
> >>>>> removing the PF. Print a warning in case a PF is removed with associated
> >>>>> VFs still present.
> >>>>>
> >>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> >>>>> ---
> >>>>> Candidate for backport to 4.19 (the next patch depends on this one)
> >>>>>
> >>>>> v5->v6:
> >>>>> * move printk() before ASSERT_UNREACHABLE()
> >>>>> * warn about PF removal with VFs still present
> >>>>
> >>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> >>>> just after an adjustment to the commit message. I'm instead actively
> >>>> concerned of the resulting behavior. Question is whether we can reasonably
> >>>> do something about that.
> >>>
> >>> Right. My suggestion then is to go back to roughly how it was done in
> >>> v4 [0]:
> >>>
> >>> * Remove the VFs right away during PF removal, so that we don't end up
> >>> with stale VFs. Regarding XSM, assume that a domain with permission to
> >>> remove the PF is also allowed to remove the VFs. We should probably also
> >>> return an error from pci_remove_device in the case of removing the PF
> >>> with VFs still present (and still perform the removals despite returning
> >>> an error). Subsequent attempts by a domain to remove the VFs would
> >>> return an error (as they have already been removed), but that's expected
> >>> since we've taken a stance that PF-then-VF removal order is invalid
> >>> anyway.
> >>
> >> Imo going back is not an option.
> >>
> >>> While the above is what I prefer, I just want to mention other options I
> >>> considered for the scenario of PF removal with VFs still present:
> >>>
> >>> * Increase the "scariness" of the warning message added in v6.
> >>>
> >>> * Return an error from pci_remove_device (while still removing only the
> >>> PF). We would be left with stale VFs in Xen. At least this would
> >>> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
> >>> order. Subsequent attempts by a domain to remove VFs, however
> >>> (un)likely, would succeed.
> >>
> >> Returning an error in such a case is a possibility, but comes with the
> >> risk of confusion. Seeing such an error, a caller may itself assume the
> >> device still is there, and retry its (with or without having removed the
> >> VFs) removal at a later point.
> >>
> >>> * Return an error from pci_remove_device and keep the PF and VFs. This
> >>> is IMO the worst option because then we would have a stale PF in
> >>> addition to stale VFs.

I'm thinking probably this is the least bad option, and just force the
owner of the PF to ensure there are no VFs left when removing the PF.

What sense does it make anyway to allow removing a PF with VFs still
present?  Not sure exactly what the owner of the PF will do before
calling pci_remove_device(), but it would seem to me the device should
be ready for unplug (so SR-IOV disabled).  Calling pci_remove_device()
with VFs still active points to an error to do proper cleanup by the
owner of the PF.

Returning error from pci_remove_device() and doing nothing would seem
fine to me.  There should be no stale PF or VFs in that case, as the
caller has been notified the device has failed to be removed, so
should treat the device as still present.

Thanks, Roger.
Stewart Hildebrand Nov. 8, 2024, 4:29 p.m. UTC | #16
On 11/8/24 10:19, Roger Pau Monné wrote:
> On Fri, Nov 08, 2024 at 02:17:40PM +0100, Jan Beulich wrote:
>> On 08.11.2024 13:42, Alejandro Vallejo wrote:
>>> On Mon Nov 4, 2024 at 7:44 AM GMT, Jan Beulich wrote:
>>>> On 01.11.2024 21:16, Stewart Hildebrand wrote:
>>>>> +Daniel (XSM mention)
>>>>>
>>>>> On 10/28/24 13:02, Jan Beulich wrote:
>>>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>>>>> dropping and re-acquiring the pcidevs_lock().
>>>>>>>
>>>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>>>>> VFs may exist without a corresponding PF, although now only with
>>>>>>> pdev->broken = true.
>>>>>>>
>>>>>>> The hardware domain is expected to remove the associated VFs before
>>>>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>>>>> VFs still present.
>>>>>>>
>>>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>>>> ---
>>>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>>>>
>>>>>>> v5->v6:
>>>>>>> * move printk() before ASSERT_UNREACHABLE()
>>>>>>> * warn about PF removal with VFs still present
>>>>>>
>>>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>>>>> just after an adjustment to the commit message. I'm instead actively
>>>>>> concerned of the resulting behavior. Question is whether we can reasonably
>>>>>> do something about that.
>>>>>
>>>>> Right. My suggestion then is to go back to roughly how it was done in
>>>>> v4 [0]:
>>>>>
>>>>> * Remove the VFs right away during PF removal, so that we don't end up
>>>>> with stale VFs. Regarding XSM, assume that a domain with permission to
>>>>> remove the PF is also allowed to remove the VFs. We should probably also
>>>>> return an error from pci_remove_device in the case of removing the PF
>>>>> with VFs still present (and still perform the removals despite returning
>>>>> an error). Subsequent attempts by a domain to remove the VFs would
>>>>> return an error (as they have already been removed), but that's expected
>>>>> since we've taken a stance that PF-then-VF removal order is invalid
>>>>> anyway.
>>>>
>>>> Imo going back is not an option.
>>>>
>>>>> While the above is what I prefer, I just want to mention other options I
>>>>> considered for the scenario of PF removal with VFs still present:
>>>>>
>>>>> * Increase the "scariness" of the warning message added in v6.
>>>>>
>>>>> * Return an error from pci_remove_device (while still removing only the
>>>>> PF). We would be left with stale VFs in Xen. At least this would
>>>>> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
>>>>> order. Subsequent attempts by a domain to remove VFs, however
>>>>> (un)likely, would succeed.
>>>>
>>>> Returning an error in such a case is a possibility, but comes with the
>>>> risk of confusion. Seeing such an error, a caller may itself assume the
>>>> device still is there, and retry its (with or without having removed the
>>>> VFs) removal at a later point.
>>>>
>>>>> * Return an error from pci_remove_device and keep the PF and VFs. This
>>>>> is IMO the worst option because then we would have a stale PF in
>>>>> addition to stale VFs.
> 
> I'm thinking probably this is the least bad option, and just force the
> owner of the PF to ensure there are no VFs left when removing the PF.
> 
> What sense does it make anyway to allow removing a PF with VFs still
> present?  Not sure exactly what the owner of the PF will do before
> calling pci_remove_device(), but it would seem to me the device should
> be ready for unplug (so SR-IOV disabled).  Calling pci_remove_device()
> with VFs still active points to an error to do proper cleanup by the
> owner of the PF.

In normal, correct operation, right. The PF driver is indeed expected to
disable SR-IOV (i.e. remove VFs) during its removal, prior to calling
PHYSDEVOP_pci_device_remove for the PF.

> Returning error from pci_remove_device() and doing nothing would seem
> fine to me.  There should be no stale PF or VFs in that case, as the
> caller has been notified the device has failed to be removed, so
> should treat the device as still present.

But software has no way to guarantee there won't be a physical device
removal.

In test scenario #2 described in the first patch [1], the PF (the whole
device, actually) has already been physically unplugged, and dom0
invokes PHYSDEVOP_pci_device_remove to inform Xen about it.

[1] https://lore.kernel.org/xen-devel/20241018203913.1162962-2-stewart.hildebrand@amd.com/

That said, test scenario #2 would only happen when a buggy PF driver
failed to properly clean up the VFs before the PF. But the point is that
returning an error does not guarantee there won't be a stale pdev in
case of a buggy dom0.

I guess as long as we trust the owner of the PF, this approach is fine.
Jan Beulich Nov. 11, 2024, 6:40 a.m. UTC | #17
On 08.11.2024 17:29, Stewart Hildebrand wrote:
> On 11/8/24 10:19, Roger Pau Monné wrote:
>> On Fri, Nov 08, 2024 at 02:17:40PM +0100, Jan Beulich wrote:
>>> On 08.11.2024 13:42, Alejandro Vallejo wrote:
>>>> On Mon Nov 4, 2024 at 7:44 AM GMT, Jan Beulich wrote:
>>>>> On 01.11.2024 21:16, Stewart Hildebrand wrote:
>>>>>> +Daniel (XSM mention)
>>>>>>
>>>>>> On 10/28/24 13:02, Jan Beulich wrote:
>>>>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
>>>>>>>> Add links between a VF's struct pci_dev and its associated PF struct
>>>>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>>>>>>>> dropping and re-acquiring the pcidevs_lock().
>>>>>>>>
>>>>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>>>>>>>> VFs may exist without a corresponding PF, although now only with
>>>>>>>> pdev->broken = true.
>>>>>>>>
>>>>>>>> The hardware domain is expected to remove the associated VFs before
>>>>>>>> removing the PF. Print a warning in case a PF is removed with associated
>>>>>>>> VFs still present.
>>>>>>>>
>>>>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>>>>>>>> ---
>>>>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
>>>>>>>>
>>>>>>>> v5->v6:
>>>>>>>> * move printk() before ASSERT_UNREACHABLE()
>>>>>>>> * warn about PF removal with VFs still present
>>>>>>>
>>>>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
>>>>>>> just after an adjustment to the commit message. I'm instead actively
>>>>>>> concerned of the resulting behavior. Question is whether we can reasonably
>>>>>>> do something about that.
>>>>>>
>>>>>> Right. My suggestion then is to go back to roughly how it was done in
>>>>>> v4 [0]:
>>>>>>
>>>>>> * Remove the VFs right away during PF removal, so that we don't end up
>>>>>> with stale VFs. Regarding XSM, assume that a domain with permission to
>>>>>> remove the PF is also allowed to remove the VFs. We should probably also
>>>>>> return an error from pci_remove_device in the case of removing the PF
>>>>>> with VFs still present (and still perform the removals despite returning
>>>>>> an error). Subsequent attempts by a domain to remove the VFs would
>>>>>> return an error (as they have already been removed), but that's expected
>>>>>> since we've taken a stance that PF-then-VF removal order is invalid
>>>>>> anyway.
>>>>>
>>>>> Imo going back is not an option.
>>>>>
>>>>>> While the above is what I prefer, I just want to mention other options I
>>>>>> considered for the scenario of PF removal with VFs still present:
>>>>>>
>>>>>> * Increase the "scariness" of the warning message added in v6.
>>>>>>
>>>>>> * Return an error from pci_remove_device (while still removing only the
>>>>>> PF). We would be left with stale VFs in Xen. At least this would
>>>>>> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
>>>>>> order. Subsequent attempts by a domain to remove VFs, however
>>>>>> (un)likely, would succeed.
>>>>>
>>>>> Returning an error in such a case is a possibility, but comes with the
>>>>> risk of confusion. Seeing such an error, a caller may itself assume the
>>>>> device still is there, and retry its (with or without having removed the
>>>>> VFs) removal at a later point.
>>>>>
>>>>>> * Return an error from pci_remove_device and keep the PF and VFs. This
>>>>>> is IMO the worst option because then we would have a stale PF in
>>>>>> addition to stale VFs.
>>
>> I'm thinking probably this is the least bad option, and just force the
>> owner of the PF to ensure there are no VFs left when removing the PF.
>>
>> What sense does it make anyway to allow removing a PF with VFs still
>> present?  Not sure exactly what the owner of the PF will do before
>> calling pci_remove_device(), but it would seem to me the device should
>> be ready for unplug (so SR-IOV disabled).  Calling pci_remove_device()
>> with VFs still active points to an error to do proper cleanup by the
>> owner of the PF.
> 
> In normal, correct operation, right. The PF driver is indeed expected to
> disable SR-IOV (i.e. remove VFs) during its removal, prior to calling
> PHYSDEVOP_pci_device_remove for the PF.
> 
>> Returning error from pci_remove_device() and doing nothing would seem
>> fine to me.  There should be no stale PF or VFs in that case, as the
>> caller has been notified the device has failed to be removed, so
>> should treat the device as still present.

Imo really that's another case that would best be addressed by proper
ref-counting. Each VF would hold a ref to its PF, and hence the PF would
go away when the last VF is removed, or when PF removal is (properly)
last. Just that this likely is too complex a change to be warranted for
the purpose here.

> But software has no way to guarantee there won't be a physical device
> removal.
> 
> In test scenario #2 described in the first patch [1], the PF (the whole
> device, actually) has already been physically unplugged, and dom0
> invokes PHYSDEVOP_pci_device_remove to inform Xen about it.

I don't think that's how it's supposed to work. Physical removal should
occur only after software has done all "soft removal". I'd view the
notification to Xen as part of that.

Jan

> [1] https://lore.kernel.org/xen-devel/20241018203913.1162962-2-stewart.hildebrand@amd.com/
> 
> That said, test scenario #2 would only happen when a buggy PF driver
> failed to properly clean up the VFs before the PF. But the point is that
> returning an error does not guarantee there won't be a stale pdev in
> case of a buggy dom0.
> 
> I guess as long as we trust the owner of the PF, this approach is fine.
Stewart Hildebrand Nov. 11, 2024, 8:07 p.m. UTC | #18
On 10/28/24 14:41, Roger Pau Monné wrote:
> On Fri, Oct 18, 2024 at 04:39:09PM -0400, Stewart Hildebrand wrote:
>> Add links between a VF's struct pci_dev and its associated PF struct
>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
>> dropping and re-acquiring the pcidevs_lock().
>>
>> During PF removal, unlink VF from PF and mark the VF broken. As before,
>> VFs may exist without a corresponding PF, although now only with
>> pdev->broken = true.
>>
>> The hardware domain is expected to remove the associated VFs before
>> removing the PF. Print a warning in case a PF is removed with associated
>> VFs still present.
>>
>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
>> ---
>> Candidate for backport to 4.19 (the next patch depends on this one)
>>
>> v5->v6:
>> * move printk() before ASSERT_UNREACHABLE()
>> * warn about PF removal with VFs still present
>> * clarify commit message
>>
>> v4->v5:
>> * new patch, split from ("x86/msi: fix locking for SR-IOV devices")
>> * move INIT_LIST_HEAD(&pdev->vf_list); earlier
>> * collapse struct list_head instances
>> * retain error code from pci_add_device()
>> * unlink (and mark broken) VFs instead of removing them
>> * const-ify VF->PF link
>> ---
>>  xen/drivers/passthrough/pci.c | 76 ++++++++++++++++++++++++++++-------
>>  xen/include/xen/pci.h         | 10 +++++
>>  2 files changed, 72 insertions(+), 14 deletions(-)
>>
>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>> index 74d3895e1ef6..fe31255b1207 100644
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -333,6 +333,8 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>>      *((u8*) &pdev->devfn) = devfn;
>>      pdev->domain = NULL;
>>  
>> +    INIT_LIST_HEAD(&pdev->vf_list);
>> +
>>      arch_pci_init_pdev(pdev);
>>  
>>      rc = pdev_msi_init(pdev);
>> @@ -449,6 +451,10 @@ static void free_pdev(struct pci_seg *pseg, struct pci_dev *pdev)
>>  
>>      list_del(&pdev->alldevs_list);
>>      pdev_msi_deinit(pdev);
>> +
>> +    if ( pdev->info.is_virtfn && pdev->virtfn.pf_pdev )
> 
> Shouldn't having pdev->info.is_virtfn set already ensure that
> pdev->virtfn.pf_pdev != NULL?

In the current rev, the possibility exists, however unlikely, that a
*buggy* dom0 may remove the PF before removing the VFs. In this case a
VF would exist without a corresponding PF, and thus pdev->virtfn.pf_pdev
is NULL.

For the next rev, you're right, it'll be back to the situation where a
VF can only exist with an associated PF.

>> +        list_del(&pdev->vf_list);
>> +
>>      xfree(pdev);
>>  }
>>  
>> @@ -656,24 +662,11 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>      unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
>>      const char *type;
>>      int ret;
>> -    bool pf_is_extfn = false;
>>  
>>      if ( !info )
>>          type = "device";
>>      else if ( info->is_virtfn )
>> -    {
>> -        pcidevs_lock();
>> -        pdev = pci_get_pdev(NULL,
>> -                            PCI_SBDF(seg, info->physfn.bus,
>> -                                     info->physfn.devfn));
>> -        if ( pdev )
>> -            pf_is_extfn = pdev->info.is_extfn;
>> -        pcidevs_unlock();
>> -        if ( !pdev )
>> -            pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
>> -                           NULL, node);
>>          type = "virtual function";
>> -    }
>>      else if ( info->is_extfn )
>>          type = "extended function";
>>      else
>> @@ -703,7 +696,44 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>           * extended function.
>>           */
>>          if ( pdev->info.is_virtfn )
>> -            pdev->info.is_extfn = pf_is_extfn;
>> +        {
>> +            struct pci_dev *pf_pdev;
> 
> This could be const?

No, as we are doing this below:
    list_add(&pdev->vf_list, &pf_pdev->vf_list);

>> +
>> +            pf_pdev = pci_get_pdev(NULL,
>> +                                   PCI_SBDF(seg, info->physfn.bus,
>> +                                            info->physfn.devfn));
> 
> You can probably initialize at declaration?

OK

>> +
>> +            if ( !pf_pdev )
> 
> Is this even feasible during correct operation?

No, I don't think so.

> IOW: shouldn't the PF
> always be added first, so that SR-IOV can be enabled and the VFs added
> afterwards?

Yes, I think you're right.

> I see previous code also catered for VFs being added without the PF
> being present, so I assume there was some need for this.

This is exactly the source of the confusion on my part. In the removal
path, the consensus seems to be that removing a PF with VFs still
present indicates an error. Then shouldn't the opposite also be true?
Adding a VF without a PF should also indicate an error.

I see the PF-adding logic was added in 942a6f1376d8 ("x86/PCI-MSI:
properly determine VF BAR values"). Searching the mailing list archives
didn't reveal much about it [0].

[0] https://lore.kernel.org/xen-devel/4E3FC6E102000078000501CA@nat28.tlf.novell.com/

The only time I've observed this path being taken is by manually
calling PHYSDEVOP_pci_device_add. I've resorted to calling
PHYSDEVOP_pci_device_{remove,add} from userspace in order to test this,
because the Linux kernel doesn't behave this way.

I can't think of a good rationale for catering to VFs being added
without a PF, so I'll turn it into an error for the next rev.

>> +            {
>> +                ret = pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
>> +                                     NULL, node);
>> +                if ( ret )
>> +                {
>> +                    printk(XENLOG_WARNING "Failed to add SR-IOV device PF %pp for VF %pp\n",
> 
> Could you split this to make the line a bit shorter?
> 
>                        printk(XENLOG_WARNING
> 		              "Failed to add SR-IOV device PF %pp for VF %pp\n",
> 
> Same below.
> 
>> +                           &PCI_SBDF(seg, info->physfn.bus, info->physfn.devfn),
>> +                           &pdev->sbdf);
>> +                    free_pdev(pseg, pdev);
>> +                    goto out;
>> +                }
>> +                pf_pdev = pci_get_pdev(NULL,
>> +                                       PCI_SBDF(seg, info->physfn.bus,
>> +                                                info->physfn.devfn));
>> +                if ( !pf_pdev )
>> +                {
>> +                    printk(XENLOG_ERR "Failed to find SR-IOV device PF %pp for VF %pp\n",
>> +                           &PCI_SBDF(seg, info->physfn.bus, info->physfn.devfn),
>> +                           &pdev->sbdf);
> 
> If you want to add an error message here, I think it should mention
> the fact this state is not expected:
> 
> "Inconsistent PCI state: failed to find newly added PF %pp for VF %pp\n"
> 
>> +                    ASSERT_UNREACHABLE();
>> +                    free_pdev(pseg, pdev);
>> +                    ret = -EILSEQ;
>> +                    goto out;
>> +                }
>> +            }
>> +
>> +            pdev->info.is_extfn = pf_pdev->info.is_extfn;
>> +            pdev->virtfn.pf_pdev = pf_pdev;
>> +            list_add(&pdev->vf_list, &pf_pdev->vf_list);
>> +        }
>>      }
>>  
>>      if ( !pdev->info.is_virtfn && !pdev->vf_rlen[0] )
>> @@ -821,6 +851,24 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
>>      list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
>>          if ( pdev->bus == bus && pdev->devfn == devfn )
>>          {
>> +            if ( !pdev->info.is_virtfn )
> 
> Given we have no field to mark a device as a PF, we could check that
> pdev->vf_list is not empty, and by doing so the warn_stale_vfs
> variable could be dropped?
> 
> if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
> {
>     struct pci_dev *vf_pdev;
> 
>     while ( (vf_pdev = list_first_entry_or_null(&pdev->vf_list,
>                                                 struct pci_dev,
> 						vf_list)) != NULL )
>     {
>         list_del(&vf_pdev->vf_list);
>         vf_pdev->virtfn.pf_pdev = NULL;
>         vf_pdev->broken = true;
>     }
> 
>     printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
>            &pdev->sbdf);
> }

Yeah. Given that the consensus is leaning toward keeping the PF and
returning an error, here's my suggestion:

    if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
    {
        struct pci_dev *vf_pdev;

        list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
            vf_pdev->broken = true;

        pdev->broken = true;

        printk(XENLOG_WARNING
               "Attempted to remove PCI SR-IOV PF %pp with VFs still present\n",
               &pdev->sbdf);

        ret = -EBUSY;
        break;
    }

>> +            {
>> +                struct pci_dev *vf_pdev, *tmp;
>> +                bool warn_stale_vfs = false;
>> +
>> +                list_for_each_entry_safe(vf_pdev, tmp, &pdev->vf_list, vf_list)
>> +                {
>> +                    list_del(&vf_pdev->vf_list);
>> +                    vf_pdev->virtfn.pf_pdev = NULL;
>> +                    vf_pdev->broken = true;
>> +                    warn_stale_vfs = true;
>> +                }
>> +
>> +                if ( warn_stale_vfs )
>> +                    printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
>> +                           &pdev->sbdf);
>> +            }
>> +
>>              if ( pdev->domain )
>>              {
>>                  write_lock(&pdev->domain->pci_lock);
>> diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
>> index ef56e80651d6..2ea168d5f914 100644
>> --- a/xen/include/xen/pci.h
>> +++ b/xen/include/xen/pci.h
>> @@ -153,7 +153,17 @@ struct pci_dev {
>>          unsigned int count;
>>  #define PT_FAULT_THRESHOLD 10
>>      } fault;
>> +
>> +    /*
>> +     * List head if info.is_virtfn == false
>> +     * List entry if info.is_virtfn == true
>> +     */
>> +    struct list_head vf_list;
>>      u64 vf_rlen[6];
>> +    struct {
>> +        /* Only populated for VFs (info.is_virtfn == true) */
> 
> All comments here (specially the first ones) would better use PF and
> VF consistently, rather than referring to other fields in the struct.
> Specially because the fields can change names and the comments would
> then become stale.

OK

>> +        const struct pci_dev *pf_pdev;        /* Link from VF to PF */
>> +    } virtfn;
> 
> I'm unsure you need an outer virtfn struct, as it's only one field in
> this patch?  Maybe more fields gets added by further patches?

Right. There are no more fields to be added, so there's no need. It was
leftover from a previous rev when vf_list was split.

> 
> Thanks, Roger.
Roger Pau Monné Nov. 12, 2024, 9:02 a.m. UTC | #19
On Mon, Nov 11, 2024 at 03:07:28PM -0500, Stewart Hildebrand wrote:
> On 10/28/24 14:41, Roger Pau Monné wrote:
> > On Fri, Oct 18, 2024 at 04:39:09PM -0400, Stewart Hildebrand wrote:
> > IOW: shouldn't the PF
> > always be added first, so that SR-IOV can be enabled and the VFs added
> > afterwards?
> 
> Yes, I think you're right.
> 
> > I see previous code also catered for VFs being added without the PF
> > being present, so I assume there was some need for this.
> 
> This is exactly the source of the confusion on my part. In the removal
> path, the consensus seems to be that removing a PF with VFs still
> present indicates an error. Then shouldn't the opposite also be true?
> Adding a VF without a PF should also indicate an error.
> 
> I see the PF-adding logic was added in 942a6f1376d8 ("x86/PCI-MSI:
> properly determine VF BAR values"). Searching the mailing list archives
> didn't reveal much about it [0].
> 
> [0] https://lore.kernel.org/xen-devel/4E3FC6E102000078000501CA@nat28.tlf.novell.com/
> 
> The only time I've observed this path being taken is by manually
> calling PHYSDEVOP_pci_device_add. I've resorted to calling
> PHYSDEVOP_pci_device_{remove,add} from userspace in order to test this,
> because the Linux kernel doesn't behave this way.
> 
> I can't think of a good rationale for catering to VFs being added
> without a PF, so I'll turn it into an error for the next rev.

Maybe there's a case for a device to be discovered with SR-IOV already
enabled (i.e. when booting in kexec-like environments), but then I
would still expect the OS to first add the PF and afterwards the VFs.
Otherwise I'm not even sure whether the OS would be capable of
identifying the VFs as such.

> >> +                    ASSERT_UNREACHABLE();
> >> +                    free_pdev(pseg, pdev);
> >> +                    ret = -EILSEQ;
> >> +                    goto out;
> >> +                }
> >> +            }
> >> +
> >> +            pdev->info.is_extfn = pf_pdev->info.is_extfn;
> >> +            pdev->virtfn.pf_pdev = pf_pdev;
> >> +            list_add(&pdev->vf_list, &pf_pdev->vf_list);
> >> +        }
> >>      }
> >>  
> >>      if ( !pdev->info.is_virtfn && !pdev->vf_rlen[0] )
> >> @@ -821,6 +851,24 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
> >>      list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
> >>          if ( pdev->bus == bus && pdev->devfn == devfn )
> >>          {
> >> +            if ( !pdev->info.is_virtfn )
> > 
> > Given we have no field to mark a device as a PF, we could check that
> > pdev->vf_list is not empty, and by doing so the warn_stale_vfs
> > variable could be dropped?
> > 
> > if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
> > {
> >     struct pci_dev *vf_pdev;
> > 
> >     while ( (vf_pdev = list_first_entry_or_null(&pdev->vf_list,
> >                                                 struct pci_dev,
> > 						vf_list)) != NULL )
> >     {
> >         list_del(&vf_pdev->vf_list);
> >         vf_pdev->virtfn.pf_pdev = NULL;
> >         vf_pdev->broken = true;
> >     }
> > 
> >     printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
> >            &pdev->sbdf);
> > }
> 
> Yeah. Given that the consensus is leaning toward keeping the PF and
> returning an error, here's my suggestion:
> 
>     if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
>     {
>         struct pci_dev *vf_pdev;
> 
>         list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
>             vf_pdev->broken = true;
> 
>         pdev->broken = true;

Do you need to mark the devices as broken?  My expectation would be
that returning -EBUSY here should prevent the device from being
removed, and hence there would be no breakage, just failure to fulfill
the (possible) hot-unplug request.

Thanks, Roger.
Roger Pau Monné Nov. 12, 2024, 9:05 a.m. UTC | #20
On Mon, Nov 11, 2024 at 07:40:14AM +0100, Jan Beulich wrote:
> On 08.11.2024 17:29, Stewart Hildebrand wrote:
> > On 11/8/24 10:19, Roger Pau Monné wrote:
> >> On Fri, Nov 08, 2024 at 02:17:40PM +0100, Jan Beulich wrote:
> >>> On 08.11.2024 13:42, Alejandro Vallejo wrote:
> >>>> On Mon Nov 4, 2024 at 7:44 AM GMT, Jan Beulich wrote:
> >>>>> On 01.11.2024 21:16, Stewart Hildebrand wrote:
> >>>>>> +Daniel (XSM mention)
> >>>>>>
> >>>>>> On 10/28/24 13:02, Jan Beulich wrote:
> >>>>>>> On 18.10.2024 22:39, Stewart Hildebrand wrote:
> >>>>>>>> Add links between a VF's struct pci_dev and its associated PF struct
> >>>>>>>> pci_dev. Move the calls to pci_get_pdev()/pci_add_device() down to avoid
> >>>>>>>> dropping and re-acquiring the pcidevs_lock().
> >>>>>>>>
> >>>>>>>> During PF removal, unlink VF from PF and mark the VF broken. As before,
> >>>>>>>> VFs may exist without a corresponding PF, although now only with
> >>>>>>>> pdev->broken = true.
> >>>>>>>>
> >>>>>>>> The hardware domain is expected to remove the associated VFs before
> >>>>>>>> removing the PF. Print a warning in case a PF is removed with associated
> >>>>>>>> VFs still present.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> >>>>>>>> ---
> >>>>>>>> Candidate for backport to 4.19 (the next patch depends on this one)
> >>>>>>>>
> >>>>>>>> v5->v6:
> >>>>>>>> * move printk() before ASSERT_UNREACHABLE()
> >>>>>>>> * warn about PF removal with VFs still present
> >>>>>>>
> >>>>>>> Hmm, maybe I didn't make this clear enough when commenting on v5: I wasn't
> >>>>>>> just after an adjustment to the commit message. I'm instead actively
> >>>>>>> concerned of the resulting behavior. Question is whether we can reasonably
> >>>>>>> do something about that.
> >>>>>>
> >>>>>> Right. My suggestion then is to go back to roughly how it was done in
> >>>>>> v4 [0]:
> >>>>>>
> >>>>>> * Remove the VFs right away during PF removal, so that we don't end up
> >>>>>> with stale VFs. Regarding XSM, assume that a domain with permission to
> >>>>>> remove the PF is also allowed to remove the VFs. We should probably also
> >>>>>> return an error from pci_remove_device in the case of removing the PF
> >>>>>> with VFs still present (and still perform the removals despite returning
> >>>>>> an error). Subsequent attempts by a domain to remove the VFs would
> >>>>>> return an error (as they have already been removed), but that's expected
> >>>>>> since we've taken a stance that PF-then-VF removal order is invalid
> >>>>>> anyway.
> >>>>>
> >>>>> Imo going back is not an option.
> >>>>>
> >>>>>> While the above is what I prefer, I just want to mention other options I
> >>>>>> considered for the scenario of PF removal with VFs still present:
> >>>>>>
> >>>>>> * Increase the "scariness" of the warning message added in v6.
> >>>>>>
> >>>>>> * Return an error from pci_remove_device (while still removing only the
> >>>>>> PF). We would be left with stale VFs in Xen. At least this would
> >>>>>> concretely inform dom0 that Xen takes issue with the PF-then-VF removal
> >>>>>> order. Subsequent attempts by a domain to remove VFs, however
> >>>>>> (un)likely, would succeed.
> >>>>>
> >>>>> Returning an error in such a case is a possibility, but comes with the
> >>>>> risk of confusion. Seeing such an error, a caller may itself assume the
> >>>>> device still is there, and retry its (with or without having removed the
> >>>>> VFs) removal at a later point.
> >>>>>
> >>>>>> * Return an error from pci_remove_device and keep the PF and VFs. This
> >>>>>> is IMO the worst option because then we would have a stale PF in
> >>>>>> addition to stale VFs.
> >>
> >> I'm thinking probably this is the least bad option, and just force the
> >> owner of the PF to ensure there are no VFs left when removing the PF.
> >>
> >> What sense does it make anyway to allow removing a PF with VFs still
> >> present?  Not sure exactly what the owner of the PF will do before
> >> calling pci_remove_device(), but it would seem to me the device should
> >> be ready for unplug (so SR-IOV disabled).  Calling pci_remove_device()
> >> with VFs still active points to an error to do proper cleanup by the
> >> owner of the PF.
> > 
> > In normal, correct operation, right. The PF driver is indeed expected to
> > disable SR-IOV (i.e. remove VFs) during its removal, prior to calling
> > PHYSDEVOP_pci_device_remove for the PF.
> > 
> >> Returning error from pci_remove_device() and doing nothing would seem
> >> fine to me.  There should be no stale PF or VFs in that case, as the
> >> caller has been notified the device has failed to be removed, so
> >> should treat the device as still present.
> 
> Imo really that's another case that would best be addressed by proper
> ref-counting. Each VF would hold a ref to its PF, and hence the PF would
> go away when the last VF is removed, or when PF removal is (properly)
> last. Just that this likely is too complex a change to be warranted for
> the purpose here.
> 
> > But software has no way to guarantee there won't be a physical device
> > removal.
> > 
> > In test scenario #2 described in the first patch [1], the PF (the whole
> > device, actually) has already been physically unplugged, and dom0
> > invokes PHYSDEVOP_pci_device_remove to inform Xen about it.
> 
> I don't think that's how it's supposed to work. Physical removal should
> occur only after software has done all "soft removal". I'd view the
> notification to Xen as part of that.

That would be my expectation also, albeit IIRC we had other instances
where the calling of the removal hypercall was not done in a vetoing
manner by Linux, so it was mostly a notification that the device was
going away, rather than a request for permission to removal.

Thanks, Roger.
Jan Beulich Nov. 12, 2024, 9:39 a.m. UTC | #21
On 12.11.2024 10:02, Roger Pau Monné wrote:
> On Mon, Nov 11, 2024 at 03:07:28PM -0500, Stewart Hildebrand wrote:
>> On 10/28/24 14:41, Roger Pau Monné wrote:
>>> if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
>>> {
>>>     struct pci_dev *vf_pdev;
>>>
>>>     while ( (vf_pdev = list_first_entry_or_null(&pdev->vf_list,
>>>                                                 struct pci_dev,
>>> 						vf_list)) != NULL )
>>>     {
>>>         list_del(&vf_pdev->vf_list);
>>>         vf_pdev->virtfn.pf_pdev = NULL;
>>>         vf_pdev->broken = true;
>>>     }
>>>
>>>     printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
>>>            &pdev->sbdf);
>>> }
>>
>> Yeah. Given that the consensus is leaning toward keeping the PF and
>> returning an error, here's my suggestion:
>>
>>     if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
>>     {
>>         struct pci_dev *vf_pdev;
>>
>>         list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
>>             vf_pdev->broken = true;
>>
>>         pdev->broken = true;
> 
> Do you need to mark the devices as broken?  My expectation would be
> that returning -EBUSY here should prevent the device from being
> removed, and hence there would be no breakage, just failure to fulfill
> the (possible) hot-unplug request.

That very much depends on Dom0 kernels then actually respecting the error,
and not considering the underlying hypercall a mere notification.

Jan
Jan Beulich Nov. 12, 2024, 9:42 a.m. UTC | #22
On 11.11.2024 21:07, Stewart Hildebrand wrote:
> On 10/28/24 14:41, Roger Pau Monné wrote:
>> I see previous code also catered for VFs being added without the PF
>> being present, so I assume there was some need for this.
> 
> This is exactly the source of the confusion on my part. In the removal
> path, the consensus seems to be that removing a PF with VFs still
> present indicates an error. Then shouldn't the opposite also be true?
> Adding a VF without a PF should also indicate an error.
> 
> I see the PF-adding logic was added in 942a6f1376d8 ("x86/PCI-MSI:
> properly determine VF BAR values"). Searching the mailing list archives
> didn't reveal much about it [0].
> 
> [0] https://lore.kernel.org/xen-devel/4E3FC6E102000078000501CA@nat28.tlf.novell.com/
> 
> The only time I've observed this path being taken is by manually
> calling PHYSDEVOP_pci_device_add. I've resorted to calling
> PHYSDEVOP_pci_device_{remove,add} from userspace in order to test this,
> because the Linux kernel doesn't behave this way.

The goal was to avoid returning an error when we don't strictly need to.
With the overall adjustment ...

> I can't think of a good rationale for catering to VFs being added
> without a PF, so I'll turn it into an error for the next rev.

... changing this may indeed result in better overall consistency.

Jan
Stewart Hildebrand Nov. 12, 2024, 5:41 p.m. UTC | #23
On 11/12/24 04:39, Jan Beulich wrote:
> On 12.11.2024 10:02, Roger Pau Monné wrote:
>> On Mon, Nov 11, 2024 at 03:07:28PM -0500, Stewart Hildebrand wrote:
>>> On 10/28/24 14:41, Roger Pau Monné wrote:
>>>> if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
>>>> {
>>>>     struct pci_dev *vf_pdev;
>>>>
>>>>     while ( (vf_pdev = list_first_entry_or_null(&pdev->vf_list,
>>>>                                                 struct pci_dev,
>>>> 						vf_list)) != NULL )
>>>>     {
>>>>         list_del(&vf_pdev->vf_list);
>>>>         vf_pdev->virtfn.pf_pdev = NULL;
>>>>         vf_pdev->broken = true;
>>>>     }
>>>>
>>>>     printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
>>>>            &pdev->sbdf);
>>>> }
>>>
>>> Yeah. Given that the consensus is leaning toward keeping the PF and
>>> returning an error, here's my suggestion:
>>>
>>>     if ( !pdev->info.is_virtfn && !list_empty(&pdev->vf_list) )
>>>     {
>>>         struct pci_dev *vf_pdev;
>>>
>>>         list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
>>>             vf_pdev->broken = true;
>>>
>>>         pdev->broken = true;
>>
>> Do you need to mark the devices as broken?  My expectation would be
>> that returning -EBUSY here should prevent the device from being
>> removed, and hence there would be no breakage, just failure to fulfill
>> the (possible) hot-unplug request.
> 
> That very much depends on Dom0 kernels then actually respecting the error,
> and not considering the underlying hypercall a mere notification.

All dom0 Linux does is print a warning:

# echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs
# echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/remove
[   56.738750]  0000:01:00.0: driver left SR-IOV enabled after remove
(XEN) Attempted to remove PCI SR-IOV PF 0000:01:00.0 with VFs still present
[   56.749904] pci 0000:01:00.0: Failed to delete - passthrough or MSI/MSI-X might fail!
# echo $?
0

Subsequently, lspci reveals no entry for 0000:01:00.0. I think it's
appropriate to mark them broken.
diff mbox series

Patch

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 74d3895e1ef6..fe31255b1207 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -333,6 +333,8 @@  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
     *((u8*) &pdev->devfn) = devfn;
     pdev->domain = NULL;
 
+    INIT_LIST_HEAD(&pdev->vf_list);
+
     arch_pci_init_pdev(pdev);
 
     rc = pdev_msi_init(pdev);
@@ -449,6 +451,10 @@  static void free_pdev(struct pci_seg *pseg, struct pci_dev *pdev)
 
     list_del(&pdev->alldevs_list);
     pdev_msi_deinit(pdev);
+
+    if ( pdev->info.is_virtfn && pdev->virtfn.pf_pdev )
+        list_del(&pdev->vf_list);
+
     xfree(pdev);
 }
 
@@ -656,24 +662,11 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
     unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
     const char *type;
     int ret;
-    bool pf_is_extfn = false;
 
     if ( !info )
         type = "device";
     else if ( info->is_virtfn )
-    {
-        pcidevs_lock();
-        pdev = pci_get_pdev(NULL,
-                            PCI_SBDF(seg, info->physfn.bus,
-                                     info->physfn.devfn));
-        if ( pdev )
-            pf_is_extfn = pdev->info.is_extfn;
-        pcidevs_unlock();
-        if ( !pdev )
-            pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
-                           NULL, node);
         type = "virtual function";
-    }
     else if ( info->is_extfn )
         type = "extended function";
     else
@@ -703,7 +696,44 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
          * extended function.
          */
         if ( pdev->info.is_virtfn )
-            pdev->info.is_extfn = pf_is_extfn;
+        {
+            struct pci_dev *pf_pdev;
+
+            pf_pdev = pci_get_pdev(NULL,
+                                   PCI_SBDF(seg, info->physfn.bus,
+                                            info->physfn.devfn));
+
+            if ( !pf_pdev )
+            {
+                ret = pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
+                                     NULL, node);
+                if ( ret )
+                {
+                    printk(XENLOG_WARNING "Failed to add SR-IOV device PF %pp for VF %pp\n",
+                           &PCI_SBDF(seg, info->physfn.bus, info->physfn.devfn),
+                           &pdev->sbdf);
+                    free_pdev(pseg, pdev);
+                    goto out;
+                }
+                pf_pdev = pci_get_pdev(NULL,
+                                       PCI_SBDF(seg, info->physfn.bus,
+                                                info->physfn.devfn));
+                if ( !pf_pdev )
+                {
+                    printk(XENLOG_ERR "Failed to find SR-IOV device PF %pp for VF %pp\n",
+                           &PCI_SBDF(seg, info->physfn.bus, info->physfn.devfn),
+                           &pdev->sbdf);
+                    ASSERT_UNREACHABLE();
+                    free_pdev(pseg, pdev);
+                    ret = -EILSEQ;
+                    goto out;
+                }
+            }
+
+            pdev->info.is_extfn = pf_pdev->info.is_extfn;
+            pdev->virtfn.pf_pdev = pf_pdev;
+            list_add(&pdev->vf_list, &pf_pdev->vf_list);
+        }
     }
 
     if ( !pdev->info.is_virtfn && !pdev->vf_rlen[0] )
@@ -821,6 +851,24 @@  int pci_remove_device(u16 seg, u8 bus, u8 devfn)
     list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
         if ( pdev->bus == bus && pdev->devfn == devfn )
         {
+            if ( !pdev->info.is_virtfn )
+            {
+                struct pci_dev *vf_pdev, *tmp;
+                bool warn_stale_vfs = false;
+
+                list_for_each_entry_safe(vf_pdev, tmp, &pdev->vf_list, vf_list)
+                {
+                    list_del(&vf_pdev->vf_list);
+                    vf_pdev->virtfn.pf_pdev = NULL;
+                    vf_pdev->broken = true;
+                    warn_stale_vfs = true;
+                }
+
+                if ( warn_stale_vfs )
+                    printk(XENLOG_WARNING "PCI SR-IOV PF %pp removed with VFs still present\n",
+                           &pdev->sbdf);
+            }
+
             if ( pdev->domain )
             {
                 write_lock(&pdev->domain->pci_lock);
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index ef56e80651d6..2ea168d5f914 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -153,7 +153,17 @@  struct pci_dev {
         unsigned int count;
 #define PT_FAULT_THRESHOLD 10
     } fault;
+
+    /*
+     * List head if info.is_virtfn == false
+     * List entry if info.is_virtfn == true
+     */
+    struct list_head vf_list;
     u64 vf_rlen[6];
+    struct {
+        /* Only populated for VFs (info.is_virtfn == true) */
+        const struct pci_dev *pf_pdev;        /* Link from VF to PF */
+    } virtfn;
 
     /* Data for vPCI. */
     struct vpci *vpci;