Message ID | 1414942894-17034-9-git-send-email-weiyang@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | Bjorn Helgaas |
Headers | show |
On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: > From: Gavin Shan <gwshan@linux.vnet.ibm.com> > > pci_dn is the extension of PCI device node and it's created from > device node. Unfortunately, VFs that are enabled dynamically by > PF's driver and they don't have corresponding device nodes, and > pci_dn. The patch refactors pci_dn to support VFs: > > * pci_dn is organized as a hierarchy tree. VF's pci_dn is put > to the child list of pci_dn of PF's bridge. pci_dn of other > device put to the child list of pci_dn of its upstream bridge. > > * VF's pci_dn is expected to be created dynamically when applying > final fixup to PF. VF's pci_dn will be destroyed when releasing > PF's pci_dev instance. pci_dn of other device is still created > from device node as before. > > * For one particular PCI device (VF or not), its pci_dn can be > found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), > or parent's list. The fast path (fetching pci_dn through PCI > device instance) is populated during early fixup time. > > Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> > --- > ... > +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) > +{ > +#ifdef CONFIG_PCI_IOV > + struct pci_dn *parent, *pdn; > + int i; > + > + /* Only support IOV for now */ > + if (!pdev->is_physfn) > + return pci_get_pdn(pdev); > + > + /* Check if VFs have been populated */ > + pdn = pci_get_pdn(pdev); > + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) > + return NULL; > + > + pdn->flags |= PCI_DN_FLAG_IOV_VF; > + parent = pci_bus_to_pdn(pdev->bus); > + if (!parent) > + return NULL; > + > + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { > + pdn = add_one_dev_pci_info(parent, NULL, > + pci_iov_virtfn_bus(pdev, i), > + pci_iov_virtfn_devfn(pdev, i)); I'm not sure this makes sense, but I certainly don't know this code, so maybe I'm missing something. pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride and First VF Offset in the SR-IOV capability by sriov_init(), which is called before add_dev_pci_info(): pci_scan_child_bus pci_scan_slot pci_scan_single_device pci_device_add pci_init_capabilities pci_iov_init(PF) sriov_init(PF, pos) pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) iov->offset = offset iov->stride = stride pci_bus_add_devices pci_bus_add_device pci_fixup_device(pci_fixup_final) add_dev_pci_info pci_iov_virtfn_bus return ... + sriov->offset + (sriov->stride * id) ... But both First VF Offset and VF Stride change when ARI Capable Hierarchy or NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in sriov_init() above. We will change NumVFs to something different when a driver calls pci_enable_sriov(): pci_enable_sriov sriov_enable pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) Now First VF Offset and VF Stride have changed from what they were when we called pci_iov_virtfn_bus() above. > + if (!pdn) { > + pr_warn("%s: Cannot create firmware data " > + "for VF#%d of %s\n", > + __func__, i, pci_name(pdev)); > + return NULL; > + } > + } > +#endif > + > + return pci_get_pdn(pdev); > +} > ... > +static void pci_dev_pdn_create(struct pci_dev *pdev) > +{ > + add_dev_pci_info(pdev); > +} > +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create); There are no other callers of add_dev_pci_info(), so it seems pointless to declare it in arch/powerpc/include/asm/pci-bridge.h and add this wrapper around it. > + > +static void pci_dev_pdn_setup(struct pci_dev *pdev) > +{ > + struct pci_dn *pdn; > + > + if (pdev->dev.archdata.firmware_data) > + return; > + > + /* Setup the fast path */ > + pdn = pci_get_pdn(pdev); > + pdev->dev.archdata.firmware_data = pdn; > +} > +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup); > -- > 1.7.9.5 > -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote: >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: >> From: Gavin Shan <gwshan@linux.vnet.ibm.com> >> >> pci_dn is the extension of PCI device node and it's created from >> device node. Unfortunately, VFs that are enabled dynamically by >> PF's driver and they don't have corresponding device nodes, and >> pci_dn. The patch refactors pci_dn to support VFs: >> >> * pci_dn is organized as a hierarchy tree. VF's pci_dn is put >> to the child list of pci_dn of PF's bridge. pci_dn of other >> device put to the child list of pci_dn of its upstream bridge. >> >> * VF's pci_dn is expected to be created dynamically when applying >> final fixup to PF. VF's pci_dn will be destroyed when releasing >> PF's pci_dev instance. pci_dn of other device is still created >> from device node as before. >> >> * For one particular PCI device (VF or not), its pci_dn can be >> found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), >> or parent's list. The fast path (fetching pci_dn through PCI >> device instance) is populated during early fixup time. >> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> >> --- >> ... > >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) >> +{ >> +#ifdef CONFIG_PCI_IOV >> + struct pci_dn *parent, *pdn; >> + int i; >> + >> + /* Only support IOV for now */ >> + if (!pdev->is_physfn) >> + return pci_get_pdn(pdev); >> + >> + /* Check if VFs have been populated */ >> + pdn = pci_get_pdn(pdev); >> + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) >> + return NULL; >> + >> + pdn->flags |= PCI_DN_FLAG_IOV_VF; >> + parent = pci_bus_to_pdn(pdev->bus); >> + if (!parent) >> + return NULL; >> + >> + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { >> + pdn = add_one_dev_pci_info(parent, NULL, >> + pci_iov_virtfn_bus(pdev, i), >> + pci_iov_virtfn_devfn(pdev, i)); > >I'm not sure this makes sense, but I certainly don't know this code, so >maybe I'm missing something. > For ARI, Richard had some patches to fix the issue from firmware side. >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on >pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride >and First VF Offset in the SR-IOV capability by sriov_init(), which is >called before add_dev_pci_info(): > > pci_scan_child_bus > pci_scan_slot > pci_scan_single_device > pci_device_add > pci_init_capabilities > pci_iov_init(PF) > sriov_init(PF, pos) > pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) > pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) > pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) > iov->offset = offset > iov->stride = stride > > pci_bus_add_devices > pci_bus_add_device > pci_fixup_device(pci_fixup_final) > add_dev_pci_info > pci_iov_virtfn_bus > return ... + sriov->offset + (sriov->stride * id) ... > >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in >sriov_init() above. We will change NumVFs to something different when a >driver calls pci_enable_sriov(): > > pci_enable_sriov > sriov_enable > pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) > >Now First VF Offset and VF Stride have changed from what they were when we >called pci_iov_virtfn_bus() above. > It's the case we missed: First VF Offset and VF Stride can change when PF's number of VFs is changed. It means the BDFN (Bus/Device/Function number) for one VF can't be determined until PF's number of VFs is populated and updated to HW (before calling to virtfn_add()). The dynamically created pci_dn is used in PCI config accessors currently. That means we have to get it ready before first PCI config request to the VF in pci_setup_device(). In the code of old revision, we had some weak function called in pci_alloc_dev(), which gave platform chance to create pci_dn. I think we have to switch back to the old way in order to fix the problem you catched. However, the old way is implemented with cost of more weak function, which you're probably unhappy to see. sriov_enable() virtfn_add() virtfn_add_bus() pci_alloc_dev() pci_setup_device() >> + if (!pdn) { >> + pr_warn("%s: Cannot create firmware data " >> + "for VF#%d of %s\n", >> + __func__, i, pci_name(pdev)); >> + return NULL; >> + } >> + } >> +#endif >> + >> + return pci_get_pdn(pdev); >> +} >> ... > >> +static void pci_dev_pdn_create(struct pci_dev *pdev) >> +{ >> + add_dev_pci_info(pdev); >> +} >> +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create); > >There are no other callers of add_dev_pci_info(), so it seems pointless to >declare it in arch/powerpc/include/asm/pci-bridge.h and add this wrapper >around it. > Yep and will fix. Thanks, Gavin >> + >> +static void pci_dev_pdn_setup(struct pci_dev *pdev) >> +{ >> + struct pci_dn *pdn; >> + >> + if (pdev->dev.archdata.firmware_data) >> + return; >> + >> + /* Setup the fast path */ >> + pdn = pci_get_pdn(pdev); >> + pdev->dev.archdata.firmware_data = pdn; >> +} >> +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup); >> -- >> 1.7.9.5 >> > -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote: >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: >> From: Gavin Shan <gwshan@linux.vnet.ibm.com> >> >> pci_dn is the extension of PCI device node and it's created from >> device node. Unfortunately, VFs that are enabled dynamically by >> PF's driver and they don't have corresponding device nodes, and >> pci_dn. The patch refactors pci_dn to support VFs: >> >> * pci_dn is organized as a hierarchy tree. VF's pci_dn is put >> to the child list of pci_dn of PF's bridge. pci_dn of other >> device put to the child list of pci_dn of its upstream bridge. >> >> * VF's pci_dn is expected to be created dynamically when applying >> final fixup to PF. VF's pci_dn will be destroyed when releasing >> PF's pci_dev instance. pci_dn of other device is still created >> from device node as before. >> >> * For one particular PCI device (VF or not), its pci_dn can be >> found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), >> or parent's list. The fast path (fetching pci_dn through PCI >> device instance) is populated during early fixup time. >> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> >> --- >> ... > >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) >> +{ >> +#ifdef CONFIG_PCI_IOV >> + struct pci_dn *parent, *pdn; >> + int i; >> + >> + /* Only support IOV for now */ >> + if (!pdev->is_physfn) >> + return pci_get_pdn(pdev); >> + >> + /* Check if VFs have been populated */ >> + pdn = pci_get_pdn(pdev); >> + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) >> + return NULL; >> + >> + pdn->flags |= PCI_DN_FLAG_IOV_VF; >> + parent = pci_bus_to_pdn(pdev->bus); >> + if (!parent) >> + return NULL; >> + >> + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { >> + pdn = add_one_dev_pci_info(parent, NULL, >> + pci_iov_virtfn_bus(pdev, i), >> + pci_iov_virtfn_devfn(pdev, i)); > >I'm not sure this makes sense, but I certainly don't know this code, so >maybe I'm missing something. > >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on >pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride >and First VF Offset in the SR-IOV capability by sriov_init(), which is >called before add_dev_pci_info(): > > pci_scan_child_bus > pci_scan_slot > pci_scan_single_device > pci_device_add > pci_init_capabilities > pci_iov_init(PF) > sriov_init(PF, pos) > pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) > pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) > pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) > iov->offset = offset > iov->stride = stride > > pci_bus_add_devices > pci_bus_add_device > pci_fixup_device(pci_fixup_final) > add_dev_pci_info > pci_iov_virtfn_bus > return ... + sriov->offset + (sriov->stride * id) ... > >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in >sriov_init() above. We will change NumVFs to something different when a >driver calls pci_enable_sriov(): > > pci_enable_sriov > sriov_enable > pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) > >Now First VF Offset and VF Stride have changed from what they were when we >called pci_iov_virtfn_bus() above. > Oops, I see the ARI would affect those value, while missed the NumVFs also would. Let's look at the problem one by one. 1. The ARI capability. =============================================================================== The kernel initialize the capability like this: pci_init_capabilities() pci_configure_ari() pci_iov_init() iov->offset = offset iov->stride = stride When offset/stride is retrieved at this point, the ARI capability is taken into consideration. 2. The PF's NumVFs field =============================================================================== 2.1 Potential problem in current code =============================================================================== First, is current pci code has some potential problem? sriov_enable() pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); iov->offset = offset; iov->stride = stride; pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); virtfn_add() ... virtfn->devfn = pci_iov_virtfn_devfn(dev, id); The sriov_enable() retrieve the offset/stride then write the NumVFs. According to the SPEC, at this moment the offset/stride may change. While I don't see some code to retrieve and store those value again. And these fields will be used in virtfn_add(). If my understanding is correct, I suggest to move the retrieve and store operation after NumVFs is written. 2.2 The IOV bus range may not be correct in pci_scan_child_bus()? =============================================================================== In current pci core, when enumerating the pci tree, we do like this: pci_scan_child_bus() pci_scan_slot() pci_scan_single_device() pci_device_add() pci_init_capabilities() pci_iov_init() max += pci_iov_bus_range(bus); busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1); max = pci_scan_bridge(bus, dev, max, pass); From this point, we see pci core reserve some bus range for VFs. This calculation is based on the offset/stride at this moment. And do the enumeration with the new bus number. sriov_enable() could be called several times from driver to enable SRIOV, and with different nr_virtfn. If each time the NumVFs written, the offset/stride will change. This means we may try to address an extra bus we didn't reserve? Or this means it is out of control? Do I miss something? 2.3 How can I reserve bus range in FW? =============================================================================== This question comes from the previous one. Based on my understanding, current pci core will rely on the bus number in HW if pcibios_assign_all_busses() is not set. If we want to support those VFs sits on different bus with PF, we need to reserve bus range and write the correct secondary/subordinate in bridge. Otherwise, those VFs on different bus may not be addressed. Currently I am writing the code in FW to reserve the range with the same mechanism in pci core. While as you mentioned the offset/stride may change after sriov_enable(), I am confused whether this is the correct way. 2.4 The potential problem for [Patch 08/18] =============================================================================== According to the SPEC, the offset/stride will change after each sriov_enable(). This means the bus/devfn will change after each sriov_enable(). My current thought is to fix it up in virtfn_add(). If the total VF number will not change, we could create those pci_dn at the beginning and fix the bus/devfn at each time the VF is truely created.
On Thu, Nov 20, 2014 at 12:02:13PM +1100, Gavin Shan wrote: >On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote: >>On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: >>> From: Gavin Shan <gwshan@linux.vnet.ibm.com> >>> >>> pci_dn is the extension of PCI device node and it's created from >>> device node. Unfortunately, VFs that are enabled dynamically by >>> PF's driver and they don't have corresponding device nodes, and >>> pci_dn. The patch refactors pci_dn to support VFs: >>> >>> * pci_dn is organized as a hierarchy tree. VF's pci_dn is put >>> to the child list of pci_dn of PF's bridge. pci_dn of other >>> device put to the child list of pci_dn of its upstream bridge. >>> >>> * VF's pci_dn is expected to be created dynamically when applying >>> final fixup to PF. VF's pci_dn will be destroyed when releasing >>> PF's pci_dev instance. pci_dn of other device is still created >>> from device node as before. >>> >>> * For one particular PCI device (VF or not), its pci_dn can be >>> found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), >>> or parent's list. The fast path (fetching pci_dn through PCI >>> device instance) is populated during early fixup time. >>> >>> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> >>> --- >>> ... >> >>> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) >>> +{ >>> +#ifdef CONFIG_PCI_IOV >>> + struct pci_dn *parent, *pdn; >>> + int i; >>> + >>> + /* Only support IOV for now */ >>> + if (!pdev->is_physfn) >>> + return pci_get_pdn(pdev); >>> + >>> + /* Check if VFs have been populated */ >>> + pdn = pci_get_pdn(pdev); >>> + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) >>> + return NULL; >>> + >>> + pdn->flags |= PCI_DN_FLAG_IOV_VF; >>> + parent = pci_bus_to_pdn(pdev->bus); >>> + if (!parent) >>> + return NULL; >>> + >>> + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { >>> + pdn = add_one_dev_pci_info(parent, NULL, >>> + pci_iov_virtfn_bus(pdev, i), >>> + pci_iov_virtfn_devfn(pdev, i)); >> >>I'm not sure this makes sense, but I certainly don't know this code, so >>maybe I'm missing something. >> > >For ARI, Richard had some patches to fix the issue from firmware side. > >>pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on >>pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride >>and First VF Offset in the SR-IOV capability by sriov_init(), which is >>called before add_dev_pci_info(): >> >> pci_scan_child_bus >> pci_scan_slot >> pci_scan_single_device >> pci_device_add >> pci_init_capabilities >> pci_iov_init(PF) >> sriov_init(PF, pos) >> pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) >> pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) >> pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) >> iov->offset = offset >> iov->stride = stride >> >> pci_bus_add_devices >> pci_bus_add_device >> pci_fixup_device(pci_fixup_final) >> add_dev_pci_info >> pci_iov_virtfn_bus >> return ... + sriov->offset + (sriov->stride * id) ... >> >>But both First VF Offset and VF Stride change when ARI Capable Hierarchy or >>NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in >>sriov_init() above. We will change NumVFs to something different when a >>driver calls pci_enable_sriov(): >> >> pci_enable_sriov >> sriov_enable >> pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) >> >>Now First VF Offset and VF Stride have changed from what they were when we >>called pci_iov_virtfn_bus() above. >> > >It's the case we missed: First VF Offset and VF Stride can change when >PF's number of VFs is changed. It means the BDFN (Bus/Device/Function >number) for one VF can't be determined until PF's number of VFs is >populated and updated to HW (before calling to virtfn_add()). > >The dynamically created pci_dn is used in PCI config accessors currently. >That means we have to get it ready before first PCI config request to the >VF in pci_setup_device(). In the code of old revision, we had some weak >function called in pci_alloc_dev(), which gave platform chance to create >pci_dn. I think we have to switch back to the old way in order to fix >the problem you catched. However, the old way is implemented with cost >of more weak function, which you're probably unhappy to see. > > sriov_enable() > virtfn_add() > virtfn_add_bus() > pci_alloc_dev() > pci_setup_device() Ok, sounds my solution in previous reply can't work. We need the pci_dn ready before access the configuration space of VFs.
On Thu, Nov 20, 2014 at 03:20:57PM +0800, Wei Yang wrote: > On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote: > >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: > >> From: Gavin Shan <gwshan@linux.vnet.ibm.com> > >> > >> pci_dn is the extension of PCI device node and it's created from > >> device node. Unfortunately, VFs that are enabled dynamically by > >> PF's driver and they don't have corresponding device nodes, and > >> pci_dn. The patch refactors pci_dn to support VFs: > >> > >> * pci_dn is organized as a hierarchy tree. VF's pci_dn is put > >> to the child list of pci_dn of PF's bridge. pci_dn of other > >> device put to the child list of pci_dn of its upstream bridge. > >> > >> * VF's pci_dn is expected to be created dynamically when applying > >> final fixup to PF. VF's pci_dn will be destroyed when releasing > >> PF's pci_dev instance. pci_dn of other device is still created > >> from device node as before. > >> > >> * For one particular PCI device (VF or not), its pci_dn can be > >> found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), > >> or parent's list. The fast path (fetching pci_dn through PCI > >> device instance) is populated during early fixup time. > >> > >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> > >> --- > >> ... > > > >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) > >> +{ > >> +#ifdef CONFIG_PCI_IOV > >> + struct pci_dn *parent, *pdn; > >> + int i; > >> + > >> + /* Only support IOV for now */ > >> + if (!pdev->is_physfn) > >> + return pci_get_pdn(pdev); > >> + > >> + /* Check if VFs have been populated */ > >> + pdn = pci_get_pdn(pdev); > >> + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) > >> + return NULL; > >> + > >> + pdn->flags |= PCI_DN_FLAG_IOV_VF; > >> + parent = pci_bus_to_pdn(pdev->bus); > >> + if (!parent) > >> + return NULL; > >> + > >> + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { > >> + pdn = add_one_dev_pci_info(parent, NULL, > >> + pci_iov_virtfn_bus(pdev, i), > >> + pci_iov_virtfn_devfn(pdev, i)); > > > >I'm not sure this makes sense, but I certainly don't know this code, so > >maybe I'm missing something. > > > >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on > >pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride > >and First VF Offset in the SR-IOV capability by sriov_init(), which is > >called before add_dev_pci_info(): > > > > pci_scan_child_bus > > pci_scan_slot > > pci_scan_single_device > > pci_device_add > > pci_init_capabilities > > pci_iov_init(PF) > > sriov_init(PF, pos) > > pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) > > pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) > > pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) > > iov->offset = offset > > iov->stride = stride > > > > pci_bus_add_devices > > pci_bus_add_device > > pci_fixup_device(pci_fixup_final) > > add_dev_pci_info > > pci_iov_virtfn_bus > > return ... + sriov->offset + (sriov->stride * id) ... > > > >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or > >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in > >sriov_init() above. We will change NumVFs to something different when a > >driver calls pci_enable_sriov(): > > > > pci_enable_sriov > > sriov_enable > > pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) > > > >Now First VF Offset and VF Stride have changed from what they were when we > >called pci_iov_virtfn_bus() above. > > > > Oops, I see the ARI would affect those value, while missed the NumVFs also > would. > > Let's look at the problem one by one. > > 1. The ARI capability. > =============================================================================== > The kernel initialize the capability like this: > > pci_init_capabilities() > pci_configure_ari() > pci_iov_init() > iov->offset = offset > iov->stride = stride > > When offset/stride is retrieved at this point, the ARI capability is taken > into consideration. PCI_SRIOV_CTRL_ARI is currently only changed at the time we enumerate the PF, so I don't think this is really a problem. > 2. The PF's NumVFs field > =============================================================================== > 2.1 Potential problem in current code > =============================================================================== > First, is current pci code has some potential problem? > > sriov_enable() > pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); > pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); > iov->offset = offset; > iov->stride = stride; > pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); > virtfn_add() > ... > virtfn->devfn = pci_iov_virtfn_devfn(dev, id); > > The sriov_enable() retrieve the offset/stride then write the NumVFs. According > to the SPEC, at this moment the offset/stride may change. While I don't see > some code to retrieve and store those value again. And these fields will be > used in virtfn_add(). > > If my understanding is correct, I suggest to move the retrieve and store > operation after NumVFs is written. Yep, it looks like the existing code has similar problems. We might want to add a simple function that writes PCI_SRIOV_NUM_VF, then reads PCI_SRIOV_VF_OFFSET and PCI_SRIOV_VF_STRIDE and refreshes the cached values in dev->sriov. Then we'd at least know that virtfn_bus() and virtfn_devfn() return values that are correct until the next NumVFs change. > 2.2 The IOV bus range may not be correct in pci_scan_child_bus()? > =============================================================================== > In current pci core, when enumerating the pci tree, we do like this: > > pci_scan_child_bus() > pci_scan_slot() > pci_scan_single_device() > pci_device_add() > pci_init_capabilities() > pci_iov_init() > max += pci_iov_bus_range(bus); > busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1); > max = pci_scan_bridge(bus, dev, max, pass); > > From this point, we see pci core reserve some bus range for VFs. This > calculation is based on the offset/stride at this moment. And do the > enumeration with the new bus number. > > sriov_enable() could be called several times from driver to enable SRIOV, and > with different nr_virtfn. If each time the NumVFs written, the offset/stride > will change. This means we may try to address an extra bus we didn't reserve? > Or this means it is out of control? This looks like a problem, too. I don't have a good suggestion for fixing it. > 2.3 How can I reserve bus range in FW? > =============================================================================== > This question comes from the previous one. > > Based on my understanding, current pci core will rely on the bus number in HW > if pcibios_assign_all_busses() is not set. If we want to support those VFs > sits on different bus with PF, we need to reserve bus range and write the > correct secondary/subordinate in bridge. Otherwise, those VFs on different bus > may not be addressed. > > Currently I am writing the code in FW to reserve the range with the same > mechanism in pci core. While as you mentioned the offset/stride may change > after sriov_enable(), I am confused whether this is the correct way. If your firmware knows something about the device and can compute the number of buses it will likely need, it can set up bridges with appropriate bus number ranges, and Linux will generally leave those alone. I don't know the best way to figure out the number of buses, though. It seems like you almost need to experimentally set NumVFs and read the resulting offset/stride, because I think it's really up to the device to decide how to number the VFs. Maybe pci_iov_bus_range() needs to do something similar. > 2.4 The potential problem for [Patch 08/18] > =============================================================================== > According to the SPEC, the offset/stride will change after each > sriov_enable(). This means the bus/devfn will change after each > sriov_enable(). > > My current thought is to fix it up in virtfn_add(). If the total VF number > will not change, we could create those pci_dn at the beginning and fix the > bus/devfn at each time the VF is truely created. By "fix it up," I assume you mean call an arch function that does the pci_pdn setup you need. It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(), sriov_enable(), sriov_disable(), and sriov_restore_state(). From a hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set, so it might be nice to have this setup connected to that. Bjorn -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Nov 20, 2014 at 12:05:41PM -0700, Bjorn Helgaas wrote: >On Thu, Nov 20, 2014 at 03:20:57PM +0800, Wei Yang wrote: >> On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote: >> >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: >> >> From: Gavin Shan <gwshan@linux.vnet.ibm.com> >> >> >> >> pci_dn is the extension of PCI device node and it's created from >> >> device node. Unfortunately, VFs that are enabled dynamically by >> >> PF's driver and they don't have corresponding device nodes, and >> >> pci_dn. The patch refactors pci_dn to support VFs: >> >> >> >> * pci_dn is organized as a hierarchy tree. VF's pci_dn is put >> >> to the child list of pci_dn of PF's bridge. pci_dn of other >> >> device put to the child list of pci_dn of its upstream bridge. >> >> >> >> * VF's pci_dn is expected to be created dynamically when applying >> >> final fixup to PF. VF's pci_dn will be destroyed when releasing >> >> PF's pci_dev instance. pci_dn of other device is still created >> >> from device node as before. >> >> >> >> * For one particular PCI device (VF or not), its pci_dn can be >> >> found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), >> >> or parent's list. The fast path (fetching pci_dn through PCI >> >> device instance) is populated during early fixup time. >> >> >> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> >> >> --- >> >> ... >> > >> >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) >> >> +{ >> >> +#ifdef CONFIG_PCI_IOV >> >> + struct pci_dn *parent, *pdn; >> >> + int i; >> >> + >> >> + /* Only support IOV for now */ >> >> + if (!pdev->is_physfn) >> >> + return pci_get_pdn(pdev); >> >> + >> >> + /* Check if VFs have been populated */ >> >> + pdn = pci_get_pdn(pdev); >> >> + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) >> >> + return NULL; >> >> + >> >> + pdn->flags |= PCI_DN_FLAG_IOV_VF; >> >> + parent = pci_bus_to_pdn(pdev->bus); >> >> + if (!parent) >> >> + return NULL; >> >> + >> >> + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { >> >> + pdn = add_one_dev_pci_info(parent, NULL, >> >> + pci_iov_virtfn_bus(pdev, i), >> >> + pci_iov_virtfn_devfn(pdev, i)); >> > >> >I'm not sure this makes sense, but I certainly don't know this code, so >> >maybe I'm missing something. >> > >> >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on >> >pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride >> >and First VF Offset in the SR-IOV capability by sriov_init(), which is >> >called before add_dev_pci_info(): >> > >> > pci_scan_child_bus >> > pci_scan_slot >> > pci_scan_single_device >> > pci_device_add >> > pci_init_capabilities >> > pci_iov_init(PF) >> > sriov_init(PF, pos) >> > pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) >> > pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) >> > pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) >> > iov->offset = offset >> > iov->stride = stride >> > >> > pci_bus_add_devices >> > pci_bus_add_device >> > pci_fixup_device(pci_fixup_final) >> > add_dev_pci_info >> > pci_iov_virtfn_bus >> > return ... + sriov->offset + (sriov->stride * id) ... >> > >> >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or >> >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in >> >sriov_init() above. We will change NumVFs to something different when a >> >driver calls pci_enable_sriov(): >> > >> > pci_enable_sriov >> > sriov_enable >> > pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) >> > >> >Now First VF Offset and VF Stride have changed from what they were when we >> >called pci_iov_virtfn_bus() above. >> > >> >> Oops, I see the ARI would affect those value, while missed the NumVFs also >> would. >> >> Let's look at the problem one by one. >> >> 1. The ARI capability. >> =============================================================================== >> The kernel initialize the capability like this: >> >> pci_init_capabilities() >> pci_configure_ari() >> pci_iov_init() >> iov->offset = offset >> iov->stride = stride >> >> When offset/stride is retrieved at this point, the ARI capability is taken >> into consideration. > >PCI_SRIOV_CTRL_ARI is currently only changed at the time we enumerate the >PF, so I don't think this is really a problem. > >> 2. The PF's NumVFs field >> =============================================================================== >> 2.1 Potential problem in current code >> =============================================================================== >> First, is current pci code has some potential problem? >> >> sriov_enable() >> pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); >> pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); >> iov->offset = offset; >> iov->stride = stride; >> pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); >> virtfn_add() >> ... >> virtfn->devfn = pci_iov_virtfn_devfn(dev, id); >> >> The sriov_enable() retrieve the offset/stride then write the NumVFs. According >> to the SPEC, at this moment the offset/stride may change. While I don't see >> some code to retrieve and store those value again. And these fields will be >> used in virtfn_add(). >> >> If my understanding is correct, I suggest to move the retrieve and store >> operation after NumVFs is written. > >Yep, it looks like the existing code has similar problems. We might want >to add a simple function that writes PCI_SRIOV_NUM_VF, then reads >PCI_SRIOV_VF_OFFSET and PCI_SRIOV_VF_STRIDE and refreshes the cached values >in dev->sriov. > >Then we'd at least know that virtfn_bus() and virtfn_devfn() return values >that are correct until the next NumVFs change. > >> 2.2 The IOV bus range may not be correct in pci_scan_child_bus()? >> =============================================================================== >> In current pci core, when enumerating the pci tree, we do like this: >> >> pci_scan_child_bus() >> pci_scan_slot() >> pci_scan_single_device() >> pci_device_add() >> pci_init_capabilities() >> pci_iov_init() >> max += pci_iov_bus_range(bus); >> busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1); >> max = pci_scan_bridge(bus, dev, max, pass); >> >> From this point, we see pci core reserve some bus range for VFs. This >> calculation is based on the offset/stride at this moment. And do the >> enumeration with the new bus number. >> >> sriov_enable() could be called several times from driver to enable SRIOV, and >> with different nr_virtfn. If each time the NumVFs written, the offset/stride >> will change. This means we may try to address an extra bus we didn't reserve? >> Or this means it is out of control? > >This looks like a problem, too. I don't have a good suggestion for fixing >it. > >> 2.3 How can I reserve bus range in FW? >> =============================================================================== >> This question comes from the previous one. >> >> Based on my understanding, current pci core will rely on the bus number in HW >> if pcibios_assign_all_busses() is not set. If we want to support those VFs >> sits on different bus with PF, we need to reserve bus range and write the >> correct secondary/subordinate in bridge. Otherwise, those VFs on different bus >> may not be addressed. >> >> Currently I am writing the code in FW to reserve the range with the same >> mechanism in pci core. While as you mentioned the offset/stride may change >> after sriov_enable(), I am confused whether this is the correct way. > >If your firmware knows something about the device and can compute the >number of buses it will likely need, it can set up bridges with appropriate >bus number ranges, and Linux will generally leave those alone. > >I don't know the best way to figure out the number of buses, though. It >seems like you almost need to experimentally set NumVFs and read the >resulting offset/stride, because I think it's really up to the device to >decide how to number the VFs. Maybe pci_iov_bus_range() needs to do >something similar. > Yep, it's the reasonable way to probe maximal number of buses for the upstream bridge of PF. I guess Richard need implement similar thing in firmware. >> 2.4 The potential problem for [Patch 08/18] >> =============================================================================== >> According to the SPEC, the offset/stride will change after each >> sriov_enable(). This means the bus/devfn will change after each >> sriov_enable(). >> >> My current thought is to fix it up in virtfn_add(). If the total VF number >> will not change, we could create those pci_dn at the beginning and fix the >> bus/devfn at each time the VF is truely created. > >By "fix it up," I assume you mean call an arch function that does the >pci_pdn setup you need. > >It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or >at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(), >sriov_enable(), sriov_disable(), and sriov_restore_state(). From a >hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set, >so it might be nice to have this setup connected to that. > Yes, Both ways can fix the issue. For couple reasons, I want add weak pcibios_virtfn_add(), which is called in virtfn_add() if you agree. - PCI_SRIOV_CTRL_VFE might be set somewhere except the functions you pointed. Set/clear PCI_SRIOV_CTRL_VFE will invoke background work to check pci_dn and add/remove accordingly. It would be overhead which we can avoid. - We plan to support EEH for VFs in future. virtfn_add() way matches with current EEH implementation better. EEH device and PE are created based on (struct pci_dev), and EEH devices and PE can be destroied in time in pcibios_release_device(), which is invoked by virtfn_remove(). So we only need one weak function. In contrast, we have to create EEH device and PE for VFs a bit early before any VFs are instantiated, and destroy them a bit late after all VFs are offline. Thanks, Gavin >Bjorn > -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Nov 20, 2014 at 12:05:41PM -0700, Bjorn Helgaas wrote: >On Thu, Nov 20, 2014 at 03:20:57PM +0800, Wei Yang wrote: >> On Wed, Nov 19, 2014 at 04:30:24PM -0700, Bjorn Helgaas wrote: >> >On Sun, Nov 02, 2014 at 11:41:24PM +0800, Wei Yang wrote: >> >> From: Gavin Shan <gwshan@linux.vnet.ibm.com> >> >> >> >> pci_dn is the extension of PCI device node and it's created from >> >> device node. Unfortunately, VFs that are enabled dynamically by >> >> PF's driver and they don't have corresponding device nodes, and >> >> pci_dn. The patch refactors pci_dn to support VFs: >> >> >> >> * pci_dn is organized as a hierarchy tree. VF's pci_dn is put >> >> to the child list of pci_dn of PF's bridge. pci_dn of other >> >> device put to the child list of pci_dn of its upstream bridge. >> >> >> >> * VF's pci_dn is expected to be created dynamically when applying >> >> final fixup to PF. VF's pci_dn will be destroyed when releasing >> >> PF's pci_dev instance. pci_dn of other device is still created >> >> from device node as before. >> >> >> >> * For one particular PCI device (VF or not), its pci_dn can be >> >> found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), >> >> or parent's list. The fast path (fetching pci_dn through PCI >> >> device instance) is populated during early fixup time. >> >> >> >> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> >> >> --- >> >> ... >> > >> >> +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) >> >> +{ >> >> +#ifdef CONFIG_PCI_IOV >> >> + struct pci_dn *parent, *pdn; >> >> + int i; >> >> + >> >> + /* Only support IOV for now */ >> >> + if (!pdev->is_physfn) >> >> + return pci_get_pdn(pdev); >> >> + >> >> + /* Check if VFs have been populated */ >> >> + pdn = pci_get_pdn(pdev); >> >> + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) >> >> + return NULL; >> >> + >> >> + pdn->flags |= PCI_DN_FLAG_IOV_VF; >> >> + parent = pci_bus_to_pdn(pdev->bus); >> >> + if (!parent) >> >> + return NULL; >> >> + >> >> + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { >> >> + pdn = add_one_dev_pci_info(parent, NULL, >> >> + pci_iov_virtfn_bus(pdev, i), >> >> + pci_iov_virtfn_devfn(pdev, i)); >> > >> >I'm not sure this makes sense, but I certainly don't know this code, so >> >maybe I'm missing something. >> > >> >pci_iov_virtfn_bus() and pci_iov_virtfn_devfn() depend on >> >pdev->sriov->stride and pdev->sriov->offset. These are read from VF Stride >> >and First VF Offset in the SR-IOV capability by sriov_init(), which is >> >called before add_dev_pci_info(): >> > >> > pci_scan_child_bus >> > pci_scan_slot >> > pci_scan_single_device >> > pci_device_add >> > pci_init_capabilities >> > pci_iov_init(PF) >> > sriov_init(PF, pos) >> > pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0) >> > pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset) >> > pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride) >> > iov->offset = offset >> > iov->stride = stride >> > >> > pci_bus_add_devices >> > pci_bus_add_device >> > pci_fixup_device(pci_fixup_final) >> > add_dev_pci_info >> > pci_iov_virtfn_bus >> > return ... + sriov->offset + (sriov->stride * id) ... >> > >> >But both First VF Offset and VF Stride change when ARI Capable Hierarchy or >> >NumVFs changes (SR-IOV spec sec 3.3.9, 3.3.10). We set NumVFs to zero in >> >sriov_init() above. We will change NumVFs to something different when a >> >driver calls pci_enable_sriov(): >> > >> > pci_enable_sriov >> > sriov_enable >> > pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn) >> > >> >Now First VF Offset and VF Stride have changed from what they were when we >> >called pci_iov_virtfn_bus() above. >> > >> >> Oops, I see the ARI would affect those value, while missed the NumVFs also >> would. >> >> Let's look at the problem one by one. >> >> 1. The ARI capability. >> =============================================================================== >> The kernel initialize the capability like this: >> >> pci_init_capabilities() >> pci_configure_ari() >> pci_iov_init() >> iov->offset = offset >> iov->stride = stride >> >> When offset/stride is retrieved at this point, the ARI capability is taken >> into consideration. > >PCI_SRIOV_CTRL_ARI is currently only changed at the time we enumerate the >PF, so I don't think this is really a problem. > >> 2. The PF's NumVFs field >> =============================================================================== >> 2.1 Potential problem in current code >> =============================================================================== >> First, is current pci code has some potential problem? >> >> sriov_enable() >> pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset); >> pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride); >> iov->offset = offset; >> iov->stride = stride; >> pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); >> virtfn_add() >> ... >> virtfn->devfn = pci_iov_virtfn_devfn(dev, id); >> >> The sriov_enable() retrieve the offset/stride then write the NumVFs. According >> to the SPEC, at this moment the offset/stride may change. While I don't see >> some code to retrieve and store those value again. And these fields will be >> used in virtfn_add(). >> >> If my understanding is correct, I suggest to move the retrieve and store >> operation after NumVFs is written. > >Yep, it looks like the existing code has similar problems. We might want >to add a simple function that writes PCI_SRIOV_NUM_VF, then reads >PCI_SRIOV_VF_OFFSET and PCI_SRIOV_VF_STRIDE and refreshes the cached values >in dev->sriov. > >Then we'd at least know that virtfn_bus() and virtfn_devfn() return values >that are correct until the next NumVFs change. > Ok, I will write a function to wrap it. >> 2.2 The IOV bus range may not be correct in pci_scan_child_bus()? >> =============================================================================== >> In current pci core, when enumerating the pci tree, we do like this: >> >> pci_scan_child_bus() >> pci_scan_slot() >> pci_scan_single_device() >> pci_device_add() >> pci_init_capabilities() >> pci_iov_init() >> max += pci_iov_bus_range(bus); >> busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1); >> max = pci_scan_bridge(bus, dev, max, pass); >> >> From this point, we see pci core reserve some bus range for VFs. This >> calculation is based on the offset/stride at this moment. And do the >> enumeration with the new bus number. >> >> sriov_enable() could be called several times from driver to enable SRIOV, and >> with different nr_virtfn. If each time the NumVFs written, the offset/stride >> will change. This means we may try to address an extra bus we didn't reserve? >> Or this means it is out of control? > >This looks like a problem, too. I don't have a good suggestion for fixing >it. How about enumerating all possible NumVFs and select the maximum? I am not sure what will happen if the FW sets a different number? So FW should listen to kernel, right? I could write a code with this logic and test, while I am afraid this will break some platfrom in some case. > >> 2.3 How can I reserve bus range in FW? >> =============================================================================== >> This question comes from the previous one. >> >> Based on my understanding, current pci core will rely on the bus number in HW >> if pcibios_assign_all_busses() is not set. If we want to support those VFs >> sits on different bus with PF, we need to reserve bus range and write the >> correct secondary/subordinate in bridge. Otherwise, those VFs on different bus >> may not be addressed. >> >> Currently I am writing the code in FW to reserve the range with the same >> mechanism in pci core. While as you mentioned the offset/stride may change >> after sriov_enable(), I am confused whether this is the correct way. > >If your firmware knows something about the device and can compute the >number of buses it will likely need, it can set up bridges with appropriate >bus number ranges, and Linux will generally leave those alone. > Yep, this is what I am trying to do. >I don't know the best way to figure out the number of buses, though. It >seems like you almost need to experimentally set NumVFs and read the >resulting offset/stride, because I think it's really up to the device to >decide how to number the VFs. Maybe pci_iov_bus_range() needs to do >something similar. Got it, I will add this logic. > >> 2.4 The potential problem for [Patch 08/18] >> =============================================================================== >> According to the SPEC, the offset/stride will change after each >> sriov_enable(). This means the bus/devfn will change after each >> sriov_enable(). >> >> My current thought is to fix it up in virtfn_add(). If the total VF number >> will not change, we could create those pci_dn at the beginning and fix the >> bus/devfn at each time the VF is truely created. > >By "fix it up," I assume you mean call an arch function that does the >pci_pdn setup you need. > >It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or >at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(), >sriov_enable(), sriov_disable(), and sriov_restore_state(). From a >hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set, >so it might be nice to have this setup connected to that. If my understanding is correct, we could wrap up the configuration write/read on PCI_SRIOV_CTRL and when it involves PCI_SRIOV_CTRL_VFE, do the fix up? > >Bjorn
On Fri, Nov 21, 2014 at 11:04:11AM +1100, Gavin Shan wrote: >>> 2.4 The potential problem for [Patch 08/18] >>> =============================================================================== >>> According to the SPEC, the offset/stride will change after each >>> sriov_enable(). This means the bus/devfn will change after each >>> sriov_enable(). >>> >>> My current thought is to fix it up in virtfn_add(). If the total VF number >>> will not change, we could create those pci_dn at the beginning and fix the >>> bus/devfn at each time the VF is truely created. >> >>By "fix it up," I assume you mean call an arch function that does the >>pci_pdn setup you need. >> >>It sounds reasonable to do this either in virtfn_add()/virtfn_remove() or >>at the points where we write PCI_SRIOV_CTRL_VFE, i.e., in sriov_init(), >>sriov_enable(), sriov_disable(), and sriov_restore_state(). From a >>hardware point of view, the VFs exist whenever PCI_SRIOV_CTRL_VFE is set, >>so it might be nice to have this setup connected to that. >> > >Yes, Both ways can fix the issue. For couple reasons, I want add weak >pcibios_virtfn_add(), which is called in virtfn_add() if you agree. > >- PCI_SRIOV_CTRL_VFE might be set somewhere except the functions you pointed. > Set/clear PCI_SRIOV_CTRL_VFE will invoke background work to check pci_dn > and add/remove accordingly. It would be overhead which we can avoid. >- We plan to support EEH for VFs in future. virtfn_add() way matches with > current EEH implementation better. EEH device and PE are created based > on (struct pci_dev), and EEH devices and PE can be destroied in time in > pcibios_release_device(), which is invoked by virtfn_remove(). So we only > need one weak function. In contrast, we have to create EEH device and PE > for VFs a bit early before any VFs are instantiated, and destroy them a > bit late after all VFs are offline. > Bjorn, Since my mail box got some problem in the last few days, I am not sure you agree with Gavin's proposal or not? And if you agree to enumerate the maximum VF bus range, I will add this logic in the next versin. >Thanks, >Gavin > >>Bjorn >> > >-- >To unsubscribe from this list: send the line "unsubscribe linux-pci" in >the body of a message to majordomo@vger.kernel.org >More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 38faede..29992cd 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -34,6 +34,9 @@ struct dev_archdata { #ifdef CONFIG_SWIOTLB dma_addr_t max_direct_dma_addr; #endif +#ifdef CONFIG_PPC64 + void *firmware_data; +#endif #ifdef CONFIG_EEH struct eeh_dev *edev; #endif diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 4ca90a3..757d7bb 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -89,6 +89,7 @@ struct pci_controller { #ifdef CONFIG_PPC64 unsigned long buid; + void *firmware_data; #endif /* CONFIG_PPC64 */ void *private_data; @@ -150,9 +151,13 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) struct iommu_table; struct pci_dn { + int flags; +#define PCI_DN_FLAG_IOV_VF 0x01 + int busno; /* pci bus number */ int devfn; /* pci device and function number */ + struct pci_dn *parent; struct pci_controller *phb; /* for pci devices */ struct iommu_table *iommu_table; /* for phb's or bridges */ struct device_node *node; /* back-pointer to the device_node */ @@ -169,14 +174,19 @@ struct pci_dn { #ifdef CONFIG_PPC_POWERNV int pe_number; #endif + struct list_head child_list; + struct list_head list; }; /* Get the pointer to a device_node's pci_dn */ #define PCI_DN(dn) ((struct pci_dn *) (dn)->data) +extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, + int devfn); extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); - -extern void * update_dn_pci_info(struct device_node *dn, void *data); +extern struct pci_dn *add_dev_pci_info(struct pci_dev *pdev); +extern void remove_dev_pci_info(struct pci_dev *pdev); +extern void *update_dn_pci_info(struct device_node *dn, void *data); static inline int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 5b78917..af60efe 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -30,6 +30,9 @@ void pcibios_release_device(struct pci_dev *dev) { eeh_remove_device(dev); + + /* Release firmware data */ + remove_dev_pci_info(dev); } /** diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 1f61fab..fa966ae 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -32,12 +32,222 @@ #include <asm/ppc-pci.h> #include <asm/firmware.h> +/* + * The function is used to find the firmware data of one + * specific PCI device, which is attached to the indicated + * PCI bus. For VFs, their firmware data is linked to that + * one of PF's bridge. For other devices, their firmware + * data is linked to that of their bridge. + */ +static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus) +{ + struct pci_bus *pbus; + struct device_node *dn; + struct pci_dn *pdn; + + /* + * We probably have virtual bus which doesn't + * have associated bridge. + */ + pbus = bus; + while (pbus) { + if (pci_is_root_bus(pbus) || pbus->self) + break; + + pbus = pbus->parent; + } + + /* + * Except virtual bus, all PCI buses should + * have device nodes. + */ + dn = pci_bus_to_OF_node(pbus); + pdn = dn ? PCI_DN(dn) : NULL; + + return pdn; +} + +struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, + int devfn) +{ + struct device_node *dn = NULL; + struct pci_dn *parent, *pdn; + struct pci_dev *pdev = NULL; + + /* Fast path: fetch from PCI device */ + list_for_each_entry(pdev, &bus->devices, bus_list) { + if (pdev->devfn == devfn) { + if (pdev->dev.archdata.firmware_data) + return pdev->dev.archdata.firmware_data; + + dn = pci_device_to_OF_node(pdev); + break; + } + } + + /* Fast path: fetch from device node */ + pdn = dn ? PCI_DN(dn) : NULL; + if (pdn) + return pdn; + + /* Slow path: fetch from firmware data hierarchy */ + parent = pci_bus_to_pdn(bus); + if (!parent) + return NULL; + + list_for_each_entry(pdn, &parent->child_list, list) { + if (pdn->busno == bus->number && + pdn->devfn == devfn) + return pdn; + } + + return NULL; +} + struct pci_dn *pci_get_pdn(struct pci_dev *pdev) { - struct device_node *dn = pci_device_to_OF_node(pdev); - if (!dn) + struct device_node *dn; + struct pci_dn *parent, *pdn; + + /* Search device directly */ + if (pdev->dev.archdata.firmware_data) + return pdev->dev.archdata.firmware_data; + + /* Check device node */ + dn = pci_device_to_OF_node(pdev); + pdn = dn ? PCI_DN(dn) : NULL; + if (pdn) + return pdn; + + /* + * VFs don't have device nodes. We hook their + * firmware data to PF's bridge. + */ + parent = pci_bus_to_pdn(pdev->bus); + if (!parent) return NULL; - return PCI_DN(dn); + + list_for_each_entry(pdn, &parent->child_list, list) { + if (pdn->busno == pdev->bus->number && + pdn->devfn == pdev->devfn) + return pdn; + } + + return NULL; +} + +static struct pci_dn *add_one_dev_pci_info(struct pci_dn *parent, + struct pci_dev *pdev, + int busno, int devfn) +{ + struct pci_dn *pdn; + + /* Except PHB, we always have parent firmware data */ + if (!parent) + return NULL; + + pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); + if (!pdn) { + pr_warn("%s: Out of memory !\n", __func__); + return NULL; + } + + pdn->phb = parent->phb; + pdn->parent = parent; + pdn->busno = busno; + pdn->devfn = devfn; +#ifdef CONFIG_PPC_POWERNV + pdn->pe_number = IODA_INVALID_PE; +#endif + INIT_LIST_HEAD(&pdn->child_list); + INIT_LIST_HEAD(&pdn->list); + list_add_tail(&pdn->list, &parent->child_list); + + /* + * If we already have PCI device instance, lets + * bind them. + */ + if (pdev) + pdev->dev.archdata.firmware_data = pdn; + + return pdn; +} + +struct pci_dn *add_dev_pci_info(struct pci_dev *pdev) +{ +#ifdef CONFIG_PCI_IOV + struct pci_dn *parent, *pdn; + int i; + + /* Only support IOV for now */ + if (!pdev->is_physfn) + return pci_get_pdn(pdev); + + /* Check if VFs have been populated */ + pdn = pci_get_pdn(pdev); + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) + return NULL; + + pdn->flags |= PCI_DN_FLAG_IOV_VF; + parent = pci_bus_to_pdn(pdev->bus); + if (!parent) + return NULL; + + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { + pdn = add_one_dev_pci_info(parent, NULL, + pci_iov_virtfn_bus(pdev, i), + pci_iov_virtfn_devfn(pdev, i)); + if (!pdn) { + pr_warn("%s: Cannot create firmware data " + "for VF#%d of %s\n", + __func__, i, pci_name(pdev)); + return NULL; + } + } +#endif + + return pci_get_pdn(pdev); +} + +void remove_dev_pci_info(struct pci_dev *pdev) +{ +#ifdef CONFIG_PCI_IOV + struct pci_dn *parent; + struct pci_dn *pdn, *tmp; + int i; + + /* Only support IOV PF for now */ + if (!pdev->is_physfn) + return; + + /* Check if VFs have been populated */ + pdn = pci_get_pdn(pdev); + if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF)) + return; + + pdn->flags &= ~PCI_DN_FLAG_IOV_VF; + parent = pci_bus_to_pdn(pdev->bus); + if (!parent) + return; + + /* + * We might introduce flag to pci_dn in future + * so that we can release VF's firmware data in + * a batch mode. + */ + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { + list_for_each_entry_safe(pdn, tmp, + &parent->child_list, list) { + if (pdn->busno != pci_iov_virtfn_bus(pdev, i) || + pdn->devfn != pci_iov_virtfn_devfn(pdev, i)) + continue; + + if (!list_empty(&pdn->list)) + list_del(&pdn->list); + kfree(pdn); + } + } +#endif } /* @@ -49,6 +259,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data) struct pci_controller *phb = data; const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); const __be32 *regs; + struct device_node *parent; struct pci_dn *pdn; pdn = zalloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL); @@ -70,6 +281,15 @@ void *update_dn_pci_info(struct device_node *dn, void *data) } pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1); + + /* Attach to parent node */ + INIT_LIST_HEAD(&pdn->child_list); + INIT_LIST_HEAD(&pdn->list); + parent = of_get_parent(dn); + pdn->parent = parent ? PCI_DN(parent) : NULL; + if (pdn->parent) + list_add_tail(&pdn->list, &pdn->parent->child_list); + return NULL; } @@ -150,6 +370,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) if (pdn) { pdn->devfn = pdn->busno = -1; pdn->phb = phb; + phb->firmware_data = pdn; } /* Update dn->phb ptrs for new phb and children devices */ @@ -173,3 +394,22 @@ void __init pci_devs_phb_init(void) list_for_each_entry_safe(phb, tmp, &hose_list, list_node) pci_devs_phb_init_dynamic(phb); } + +static void pci_dev_pdn_create(struct pci_dev *pdev) +{ + add_dev_pci_info(pdev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create); + +static void pci_dev_pdn_setup(struct pci_dev *pdev) +{ + struct pci_dn *pdn; + + if (pdev->dev.archdata.firmware_data) + return; + + /* Setup the fast path */ + pdn = pci_get_pdn(pdev); + pdev->dev.archdata.firmware_data = pdn; +} +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup);