diff mbox series

[for-9.2,v11,08/11] pcie_sriov: Remove num_vfs from PCIESriovPF

Message ID 20240802-reuse-v11-8-fb83bb8c19fb@daynix.com (mailing list archive)
State New, archived
Headers show
Series hw/pci: SR-IOV related fixes and improvements | expand

Commit Message

Akihiko Odaki Aug. 2, 2024, 5:17 a.m. UTC
num_vfs is not migrated so use PCI_SRIOV_CTRL_VFE and PCI_SRIOV_NUM_VF
instead.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
---
 include/hw/pci/pcie_sriov.h |  1 -
 hw/pci/pcie_sriov.c         | 28 ++++++++++++++++++++--------
 hw/pci/trace-events         |  2 +-
 3 files changed, 21 insertions(+), 10 deletions(-)

Comments

Michael S. Tsirkin Aug. 2, 2024, 12:58 p.m. UTC | #1
On Fri, Aug 02, 2024 at 02:17:58PM +0900, Akihiko Odaki wrote:
> num_vfs is not migrated so use PCI_SRIOV_CTRL_VFE and PCI_SRIOV_NUM_VF
> instead.
> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
>  include/hw/pci/pcie_sriov.h |  1 -
>  hw/pci/pcie_sriov.c         | 28 ++++++++++++++++++++--------
>  hw/pci/trace-events         |  2 +-
>  3 files changed, 21 insertions(+), 10 deletions(-)
> 
> diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
> index 70649236c18a..5148c5b77dd1 100644
> --- a/include/hw/pci/pcie_sriov.h
> +++ b/include/hw/pci/pcie_sriov.h
> @@ -16,7 +16,6 @@
>  #include "hw/pci/pci.h"
>  
>  typedef struct PCIESriovPF {
> -    uint16_t num_vfs;   /* Number of virtual functions created */
>      uint8_t vf_bar_type[PCI_NUM_REGIONS];   /* Store type for each VF bar */
>      PCIDevice **vf;     /* Pointer to an array of num_vfs VF devices */
>  } PCIESriovPF;
> diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
> index 9bd7f8acc3f4..fae6acea4acb 100644
> --- a/hw/pci/pcie_sriov.c
> +++ b/hw/pci/pcie_sriov.c
> @@ -57,7 +57,6 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>      pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
>                          offset, PCI_EXT_CAP_SRIOV_SIZEOF);
>      dev->exp.sriov_cap = offset;
> -    dev->exp.sriov_pf.num_vfs = 0;
>      dev->exp.sriov_pf.vf = NULL;
>  
>      pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
> @@ -186,6 +185,12 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
>      }
>  }
>  
> +static void clear_ctrl_vfe(PCIDevice *dev)
> +{
> +    uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;

space here, after definition

> +    pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
> +}
> +

Pls use pci_word_test_and_clear_mask


>  static void register_vfs(PCIDevice *dev)
>  {
>      uint16_t num_vfs;
> @@ -195,6 +200,7 @@ static void register_vfs(PCIDevice *dev)
>      assert(sriov_cap > 0);
>      num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
>      if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
> +        clear_ctrl_vfe(dev);
>          return;
>      }
>  
> @@ -203,20 +209,18 @@ static void register_vfs(PCIDevice *dev)
>      for (i = 0; i < num_vfs; i++) {
>          pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
>      }
> -    dev->exp.sriov_pf.num_vfs = num_vfs;
>  }
>  
>  static void unregister_vfs(PCIDevice *dev)
>  {
> -    uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
>      uint16_t i;
> +    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
>  
>      trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
> -                               PCI_FUNC(dev->devfn), num_vfs);
> -    for (i = 0; i < num_vfs; i++) {
> +                               PCI_FUNC(dev->devfn));
> +    for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {

Why PCI_SRIOV_TOTAL_VF not PCI_SRIOV_NUM_VF/pcie_sriov_num_vfs?


>          pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
>      }
> -    dev->exp.sriov_pf.num_vfs = 0;
>  }
>  
>  void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
> @@ -242,6 +246,9 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
>          } else {
>              unregister_vfs(dev);
>          }
> +    } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) {
> +        clear_ctrl_vfe(dev);
> +        unregister_vfs(dev);

So any write into PCI_SRIOV_NUM_VF automatically clears VFE?

Yes writing into PCI_SRIOV_NUM_VF should not happen when VFE
is set, but spec does not say we need to clear it automatically.
Why come up with random rules? just don't special case it,
whatever happens, let it happen.

And what does this change have to do with getting rid of
num_vfs?

>      }
>  }
>  
> @@ -304,7 +311,7 @@ PCIDevice *pcie_sriov_get_pf(PCIDevice *dev)
>  PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
>  {
>      assert(!pci_is_vf(dev));
> -    if (n < dev->exp.sriov_pf.num_vfs) {
> +    if (n < pcie_sriov_num_vfs(dev)) {
>          return dev->exp.sriov_pf.vf[n];
>      }
>      return NULL;
> @@ -312,5 +319,10 @@ PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
>  
>  uint16_t pcie_sriov_num_vfs(PCIDevice *dev)
>  {
> -    return dev->exp.sriov_pf.num_vfs;
> +    uint16_t sriov_cap = dev->exp.sriov_cap;
> +    uint8_t *cfg = dev->config + sriov_cap;
> +
> +    return sriov_cap &&
> +           (pci_get_word(cfg + PCI_SRIOV_CTRL) & PCI_SRIOV_CTRL_VFE) ?
> +           pci_get_word(cfg + PCI_SRIOV_NUM_VF) : 0;
>  }
> diff --git a/hw/pci/trace-events b/hw/pci/trace-events
> index 19643aa8c6b0..e98f575a9d19 100644
> --- a/hw/pci/trace-events
> +++ b/hw/pci/trace-events
> @@ -14,7 +14,7 @@ msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d mask
>  
>  # hw/pci/pcie_sriov.c
>  sriov_register_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: creating %d vf devs"
> -sriov_unregister_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: Unregistering %d vf devs"
> +sriov_unregister_vfs(const char *name, int slot, int function) "%s %02x:%x: Unregistering vf devs"
>  sriov_config_write(const char *name, int slot, int fun, uint32_t offset, uint32_t val, uint32_t len) "%s %02x:%x: sriov offset 0x%x val 0x%x len %d"
>  
>  # pcie.c
> 
> -- 
> 2.45.2
Akihiko Odaki Aug. 2, 2024, 3:38 p.m. UTC | #2
On 2024/08/02 21:58, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2024 at 02:17:58PM +0900, Akihiko Odaki wrote:
>> num_vfs is not migrated so use PCI_SRIOV_CTRL_VFE and PCI_SRIOV_NUM_VF
>> instead.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>> ---
>>   include/hw/pci/pcie_sriov.h |  1 -
>>   hw/pci/pcie_sriov.c         | 28 ++++++++++++++++++++--------
>>   hw/pci/trace-events         |  2 +-
>>   3 files changed, 21 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
>> index 70649236c18a..5148c5b77dd1 100644
>> --- a/include/hw/pci/pcie_sriov.h
>> +++ b/include/hw/pci/pcie_sriov.h
>> @@ -16,7 +16,6 @@
>>   #include "hw/pci/pci.h"
>>   
>>   typedef struct PCIESriovPF {
>> -    uint16_t num_vfs;   /* Number of virtual functions created */
>>       uint8_t vf_bar_type[PCI_NUM_REGIONS];   /* Store type for each VF bar */
>>       PCIDevice **vf;     /* Pointer to an array of num_vfs VF devices */
>>   } PCIESriovPF;
>> diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
>> index 9bd7f8acc3f4..fae6acea4acb 100644
>> --- a/hw/pci/pcie_sriov.c
>> +++ b/hw/pci/pcie_sriov.c
>> @@ -57,7 +57,6 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>>       pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
>>                           offset, PCI_EXT_CAP_SRIOV_SIZEOF);
>>       dev->exp.sriov_cap = offset;
>> -    dev->exp.sriov_pf.num_vfs = 0;
>>       dev->exp.sriov_pf.vf = NULL;
>>   
>>       pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
>> @@ -186,6 +185,12 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
>>       }
>>   }
>>   
>> +static void clear_ctrl_vfe(PCIDevice *dev)
>> +{
>> +    uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
> 
> space here, after definition
> 
>> +    pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
>> +}
>> +
> 
> Pls use pci_word_test_and_clear_mask

That sounds good. I'll do so with the next version.

> 
> 
>>   static void register_vfs(PCIDevice *dev)
>>   {
>>       uint16_t num_vfs;
>> @@ -195,6 +200,7 @@ static void register_vfs(PCIDevice *dev)
>>       assert(sriov_cap > 0);
>>       num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
>>       if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
>> +        clear_ctrl_vfe(dev);
>>           return;
>>       }
>>   
>> @@ -203,20 +209,18 @@ static void register_vfs(PCIDevice *dev)
>>       for (i = 0; i < num_vfs; i++) {
>>           pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
>>       }
>> -    dev->exp.sriov_pf.num_vfs = num_vfs;
>>   }
>>   
>>   static void unregister_vfs(PCIDevice *dev)
>>   {
>> -    uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
>>       uint16_t i;
>> +    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
>>   
>>       trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
>> -                               PCI_FUNC(dev->devfn), num_vfs);
>> -    for (i = 0; i < num_vfs; i++) {
>> +                               PCI_FUNC(dev->devfn));
>> +    for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
> 
> Why PCI_SRIOV_TOTAL_VF not PCI_SRIOV_NUM_VF/pcie_sriov_num_vfs?

Because PCI_SRIOV_NUM_VF is overwritten when unregister_vfs() is called.

> 
> 
>>           pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
>>       }
>> -    dev->exp.sriov_pf.num_vfs = 0;
>>   }
>>   
>>   void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
>> @@ -242,6 +246,9 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
>>           } else {
>>               unregister_vfs(dev);
>>           }
>> +    } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) {
>> +        clear_ctrl_vfe(dev);
>> +        unregister_vfs(dev);
> 
> So any write into PCI_SRIOV_NUM_VF automatically clears VFE?
> 
> Yes writing into PCI_SRIOV_NUM_VF should not happen when VFE
> is set, but spec does not say we need to clear it automatically.
> Why come up with random rules? just don't special case it,
> whatever happens, let it happen.
> 
> And what does this change have to do with getting rid of
> num_vfs?

Keeping VFs working requires to know the number of VFs, but we do no 
longer know it because PCI_SRIOV_NUM_VF is overwritten. This disables 
all VFs instead of trying to keep VFs alive.

Regards,
Akihiko Odaki
Michael S. Tsirkin Aug. 2, 2024, 4:52 p.m. UTC | #3
On Sat, Aug 03, 2024 at 12:38:10AM +0900, Akihiko Odaki wrote:
> On 2024/08/02 21:58, Michael S. Tsirkin wrote:
> > On Fri, Aug 02, 2024 at 02:17:58PM +0900, Akihiko Odaki wrote:
> > > num_vfs is not migrated so use PCI_SRIOV_CTRL_VFE and PCI_SRIOV_NUM_VF
> > > instead.
> > > 
> > > Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> > > ---
> > >   include/hw/pci/pcie_sriov.h |  1 -
> > >   hw/pci/pcie_sriov.c         | 28 ++++++++++++++++++++--------
> > >   hw/pci/trace-events         |  2 +-
> > >   3 files changed, 21 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
> > > index 70649236c18a..5148c5b77dd1 100644
> > > --- a/include/hw/pci/pcie_sriov.h
> > > +++ b/include/hw/pci/pcie_sriov.h
> > > @@ -16,7 +16,6 @@
> > >   #include "hw/pci/pci.h"
> > >   typedef struct PCIESriovPF {
> > > -    uint16_t num_vfs;   /* Number of virtual functions created */
> > >       uint8_t vf_bar_type[PCI_NUM_REGIONS];   /* Store type for each VF bar */
> > >       PCIDevice **vf;     /* Pointer to an array of num_vfs VF devices */
> > >   } PCIESriovPF;
> > > diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
> > > index 9bd7f8acc3f4..fae6acea4acb 100644
> > > --- a/hw/pci/pcie_sriov.c
> > > +++ b/hw/pci/pcie_sriov.c
> > > @@ -57,7 +57,6 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
> > >       pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
> > >                           offset, PCI_EXT_CAP_SRIOV_SIZEOF);
> > >       dev->exp.sriov_cap = offset;
> > > -    dev->exp.sriov_pf.num_vfs = 0;
> > >       dev->exp.sriov_pf.vf = NULL;
> > >       pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
> > > @@ -186,6 +185,12 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
> > >       }
> > >   }
> > > +static void clear_ctrl_vfe(PCIDevice *dev)
> > > +{
> > > +    uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
> > 
> > space here, after definition
> > 
> > > +    pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
> > > +}
> > > +
> > 
> > Pls use pci_word_test_and_clear_mask
> 
> That sounds good. I'll do so with the next version.
> 
> > 
> > 
> > >   static void register_vfs(PCIDevice *dev)
> > >   {
> > >       uint16_t num_vfs;
> > > @@ -195,6 +200,7 @@ static void register_vfs(PCIDevice *dev)
> > >       assert(sriov_cap > 0);
> > >       num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
> > >       if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
> > > +        clear_ctrl_vfe(dev);
> > >           return;
> > >       }
> > > @@ -203,20 +209,18 @@ static void register_vfs(PCIDevice *dev)
> > >       for (i = 0; i < num_vfs; i++) {
> > >           pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
> > >       }
> > > -    dev->exp.sriov_pf.num_vfs = num_vfs;
> > >   }
> > >   static void unregister_vfs(PCIDevice *dev)
> > >   {
> > > -    uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
> > >       uint16_t i;
> > > +    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
> > >       trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
> > > -                               PCI_FUNC(dev->devfn), num_vfs);
> > > -    for (i = 0; i < num_vfs; i++) {
> > > +                               PCI_FUNC(dev->devfn));
> > > +    for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
> > 
> > Why PCI_SRIOV_TOTAL_VF not PCI_SRIOV_NUM_VF/pcie_sriov_num_vfs?
> 
> Because PCI_SRIOV_NUM_VF is overwritten when unregister_vfs() is called.


maybe this function should get the range of VFs to unregister, then.

> > 
> > 
> > >           pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
> > >       }
> > > -    dev->exp.sriov_pf.num_vfs = 0;
> > >   }
> > >   void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
> > > @@ -242,6 +246,9 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
> > >           } else {
> > >               unregister_vfs(dev);
> > >           }
> > > +    } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) {
> > > +        clear_ctrl_vfe(dev);
> > > +        unregister_vfs(dev);
> > 
> > So any write into PCI_SRIOV_NUM_VF automatically clears VFE?
> > 
> > Yes writing into PCI_SRIOV_NUM_VF should not happen when VFE
> > is set, but spec does not say we need to clear it automatically.
> > Why come up with random rules? just don't special case it,
> > whatever happens, let it happen.
> > 
> > And what does this change have to do with getting rid of
> > num_vfs?
> 
> Keeping VFs working requires to know the number of VFs, but we do no longer
> know it because PCI_SRIOV_NUM_VF is overwritten. This disables all VFs
> instead of trying to keep VFs alive.
> 
> Regards,
> Akihiko Odaki

However, we then get into a situation where VFE is set but
PCI_SRIOV_NUM_VF no longer reflects the # of registered VFs.
Given you removed num_vfs which was exactly
the # of registered VFs, it is hard to say if that will lead to
confusion now or later.
Akihiko Odaki Aug. 4, 2024, 9:11 a.m. UTC | #4
On 2024/08/03 1:52, Michael S. Tsirkin wrote:
> On Sat, Aug 03, 2024 at 12:38:10AM +0900, Akihiko Odaki wrote:
>> On 2024/08/02 21:58, Michael S. Tsirkin wrote:
>>> On Fri, Aug 02, 2024 at 02:17:58PM +0900, Akihiko Odaki wrote:
>>>> num_vfs is not migrated so use PCI_SRIOV_CTRL_VFE and PCI_SRIOV_NUM_VF
>>>> instead.
>>>>
>>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>>>> ---
>>>>    include/hw/pci/pcie_sriov.h |  1 -
>>>>    hw/pci/pcie_sriov.c         | 28 ++++++++++++++++++++--------
>>>>    hw/pci/trace-events         |  2 +-
>>>>    3 files changed, 21 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
>>>> index 70649236c18a..5148c5b77dd1 100644
>>>> --- a/include/hw/pci/pcie_sriov.h
>>>> +++ b/include/hw/pci/pcie_sriov.h
>>>> @@ -16,7 +16,6 @@
>>>>    #include "hw/pci/pci.h"
>>>>    typedef struct PCIESriovPF {
>>>> -    uint16_t num_vfs;   /* Number of virtual functions created */
>>>>        uint8_t vf_bar_type[PCI_NUM_REGIONS];   /* Store type for each VF bar */
>>>>        PCIDevice **vf;     /* Pointer to an array of num_vfs VF devices */
>>>>    } PCIESriovPF;
>>>> diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
>>>> index 9bd7f8acc3f4..fae6acea4acb 100644
>>>> --- a/hw/pci/pcie_sriov.c
>>>> +++ b/hw/pci/pcie_sriov.c
>>>> @@ -57,7 +57,6 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>>>>        pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
>>>>                            offset, PCI_EXT_CAP_SRIOV_SIZEOF);
>>>>        dev->exp.sriov_cap = offset;
>>>> -    dev->exp.sriov_pf.num_vfs = 0;
>>>>        dev->exp.sriov_pf.vf = NULL;
>>>>        pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
>>>> @@ -186,6 +185,12 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
>>>>        }
>>>>    }
>>>> +static void clear_ctrl_vfe(PCIDevice *dev)
>>>> +{
>>>> +    uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
>>>
>>> space here, after definition
>>>
>>>> +    pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
>>>> +}
>>>> +
>>>
>>> Pls use pci_word_test_and_clear_mask
>>
>> That sounds good. I'll do so with the next version.
>>
>>>
>>>
>>>>    static void register_vfs(PCIDevice *dev)
>>>>    {
>>>>        uint16_t num_vfs;
>>>> @@ -195,6 +200,7 @@ static void register_vfs(PCIDevice *dev)
>>>>        assert(sriov_cap > 0);
>>>>        num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
>>>>        if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
>>>> +        clear_ctrl_vfe(dev);
>>>>            return;
>>>>        }
>>>> @@ -203,20 +209,18 @@ static void register_vfs(PCIDevice *dev)
>>>>        for (i = 0; i < num_vfs; i++) {
>>>>            pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
>>>>        }
>>>> -    dev->exp.sriov_pf.num_vfs = num_vfs;
>>>>    }
>>>>    static void unregister_vfs(PCIDevice *dev)
>>>>    {
>>>> -    uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
>>>>        uint16_t i;
>>>> +    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
>>>>        trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
>>>> -                               PCI_FUNC(dev->devfn), num_vfs);
>>>> -    for (i = 0; i < num_vfs; i++) {
>>>> +                               PCI_FUNC(dev->devfn));
>>>> +    for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
>>>
>>> Why PCI_SRIOV_TOTAL_VF not PCI_SRIOV_NUM_VF/pcie_sriov_num_vfs?
>>
>> Because PCI_SRIOV_NUM_VF is overwritten when unregister_vfs() is called.
> 
> 
> maybe this function should get the range of VFs to unregister, then.

PCI_SRIOV_TOTAL_VF gives always a valid value, and it is the only value 
known to valid when PCI_SRIOV_CTRL gets written with a value clearing 
PCI_SRIOV_CTRL_VFE.

> 
>>>
>>>
>>>>            pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
>>>>        }
>>>> -    dev->exp.sriov_pf.num_vfs = 0;
>>>>    }
>>>>    void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
>>>> @@ -242,6 +246,9 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
>>>>            } else {
>>>>                unregister_vfs(dev);
>>>>            }
>>>> +    } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) {
>>>> +        clear_ctrl_vfe(dev);
>>>> +        unregister_vfs(dev);
>>>
>>> So any write into PCI_SRIOV_NUM_VF automatically clears VFE?
>>>
>>> Yes writing into PCI_SRIOV_NUM_VF should not happen when VFE
>>> is set, but spec does not say we need to clear it automatically.
>>> Why come up with random rules? just don't special case it,
>>> whatever happens, let it happen.
>>>
>>> And what does this change have to do with getting rid of
>>> num_vfs?
>>
>> Keeping VFs working requires to know the number of VFs, but we do no longer
>> know it because PCI_SRIOV_NUM_VF is overwritten. This disables all VFs
>> instead of trying to keep VFs alive.
>>
>> Regards,
>> Akihiko Odaki
> 
> However, we then get into a situation where VFE is set but
> PCI_SRIOV_NUM_VF no longer reflects the # of registered VFs.
> Given you removed num_vfs which was exactly
> the # of registered VFs, it is hard to say if that will lead to
> confusion now or later.

I masked writes to PCI_SRIOV_NUM_VF when PCI_SRIOV_CTRL_VFE is set in v12.

Regards,
Akihiko Odaki
diff mbox series

Patch

diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
index 70649236c18a..5148c5b77dd1 100644
--- a/include/hw/pci/pcie_sriov.h
+++ b/include/hw/pci/pcie_sriov.h
@@ -16,7 +16,6 @@ 
 #include "hw/pci/pci.h"
 
 typedef struct PCIESriovPF {
-    uint16_t num_vfs;   /* Number of virtual functions created */
     uint8_t vf_bar_type[PCI_NUM_REGIONS];   /* Store type for each VF bar */
     PCIDevice **vf;     /* Pointer to an array of num_vfs VF devices */
 } PCIESriovPF;
diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
index 9bd7f8acc3f4..fae6acea4acb 100644
--- a/hw/pci/pcie_sriov.c
+++ b/hw/pci/pcie_sriov.c
@@ -57,7 +57,6 @@  bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
     pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
                         offset, PCI_EXT_CAP_SRIOV_SIZEOF);
     dev->exp.sriov_cap = offset;
-    dev->exp.sriov_pf.num_vfs = 0;
     dev->exp.sriov_pf.vf = NULL;
 
     pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
@@ -186,6 +185,12 @@  void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
     }
 }
 
+static void clear_ctrl_vfe(PCIDevice *dev)
+{
+    uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
+    pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
+}
+
 static void register_vfs(PCIDevice *dev)
 {
     uint16_t num_vfs;
@@ -195,6 +200,7 @@  static void register_vfs(PCIDevice *dev)
     assert(sriov_cap > 0);
     num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
     if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
+        clear_ctrl_vfe(dev);
         return;
     }
 
@@ -203,20 +209,18 @@  static void register_vfs(PCIDevice *dev)
     for (i = 0; i < num_vfs; i++) {
         pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
     }
-    dev->exp.sriov_pf.num_vfs = num_vfs;
 }
 
 static void unregister_vfs(PCIDevice *dev)
 {
-    uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
     uint16_t i;
+    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
 
     trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
-                               PCI_FUNC(dev->devfn), num_vfs);
-    for (i = 0; i < num_vfs; i++) {
+                               PCI_FUNC(dev->devfn));
+    for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
         pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
     }
-    dev->exp.sriov_pf.num_vfs = 0;
 }
 
 void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
@@ -242,6 +246,9 @@  void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
         } else {
             unregister_vfs(dev);
         }
+    } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) {
+        clear_ctrl_vfe(dev);
+        unregister_vfs(dev);
     }
 }
 
@@ -304,7 +311,7 @@  PCIDevice *pcie_sriov_get_pf(PCIDevice *dev)
 PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
 {
     assert(!pci_is_vf(dev));
-    if (n < dev->exp.sriov_pf.num_vfs) {
+    if (n < pcie_sriov_num_vfs(dev)) {
         return dev->exp.sriov_pf.vf[n];
     }
     return NULL;
@@ -312,5 +319,10 @@  PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
 
 uint16_t pcie_sriov_num_vfs(PCIDevice *dev)
 {
-    return dev->exp.sriov_pf.num_vfs;
+    uint16_t sriov_cap = dev->exp.sriov_cap;
+    uint8_t *cfg = dev->config + sriov_cap;
+
+    return sriov_cap &&
+           (pci_get_word(cfg + PCI_SRIOV_CTRL) & PCI_SRIOV_CTRL_VFE) ?
+           pci_get_word(cfg + PCI_SRIOV_NUM_VF) : 0;
 }
diff --git a/hw/pci/trace-events b/hw/pci/trace-events
index 19643aa8c6b0..e98f575a9d19 100644
--- a/hw/pci/trace-events
+++ b/hw/pci/trace-events
@@ -14,7 +14,7 @@  msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d mask
 
 # hw/pci/pcie_sriov.c
 sriov_register_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: creating %d vf devs"
-sriov_unregister_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: Unregistering %d vf devs"
+sriov_unregister_vfs(const char *name, int slot, int function) "%s %02x:%x: Unregistering vf devs"
 sriov_config_write(const char *name, int slot, int fun, uint32_t offset, uint32_t val, uint32_t len) "%s %02x:%x: sriov offset 0x%x val 0x%x len %d"
 
 # pcie.c