Message ID | 20170928125838.11887-4-thomas.petazzoni@free-electrons.com (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | Bjorn Helgaas |
Headers | show |
On Thu, Sep 28, 2017 at 02:58:34PM +0200, Thomas Petazzoni wrote: > From: Victor Gu <xigu@marvell.com> > > Since the Aardvark does not implement a PCIe root bus, What exactly do you mean by "does not implement a PCIe root bus"? I assume there is still a hierarchy of PCI buses, and I assume the hierarchy has a top-most ("root") bus. Maybe there's no Root Port? There are other systems that don't have Root Ports, and we've made changes to accommodate that, e.g., http://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=1b8a6079015f If we can make the generic code work for whatever the Aardvark topology is, that would be better than adding Aardvark-specific code here. > the Linux PCIe > subsystem will not align the MAX payload size between the host and the > device. This patch ensures that the host and device have the same MAX > payload size, fixing a number of problems with various PCIe devices. > > This is part of fixing bug > https://bugzilla.kernel.org/show_bug.cgi?id=196339, this commit was > reported as the user to be important to get a Intel 7260 mini-PCIe > WiFi card working. > > Fixes: Fixes: 8c39d710363c1 ("PCI: aardvark: Add Aardvark PCI host controller driver") > Signed-off-by: Victor Gu <xigu@marvell.com> > Reviewed-by: Evan Wang <xswang@marvell.com> > Reviewed-by: Nadav Haklai <nadavh@marvell.com> > [Thomas: tweak commit log.] > Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> > --- > drivers/pci/host/pci-aardvark.c | 60 ++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 59 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c > index af7a9c4a61a4..c8a97bad6c4c 100644 > --- a/drivers/pci/host/pci-aardvark.c > +++ b/drivers/pci/host/pci-aardvark.c > @@ -30,8 +30,10 @@ > #define PCIE_CORE_DEV_CTRL_STATS_REG 0xc8 > #define PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE (0 << 4) > #define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT 5 > +#define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ 0x2 > #define PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE (0 << 11) > #define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT 12 > +#define PCIE_CORE_MPS_UNIT_BYTE 128 > #define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 > #define PCIE_CORE_LINK_L0S_ENTRY BIT(0) > #define PCIE_CORE_LINK_TRAINING BIT(5) > @@ -297,7 +299,8 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) > > /* Set PCIe Device Control and Status 1 PF0 register */ > reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE | > - (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | > + (PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ << > + PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | > PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE | > PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT; > advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); > @@ -879,6 +882,58 @@ static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie) > return err; > } > > +static int advk_pcie_find_smpss(struct pci_dev *dev, void *data) > +{ > + u8 *smpss = data; > + > + if (!dev) > + return 0; > + > + if (!pci_is_pcie(dev)) > + return 0; > + > + if (*smpss > dev->pcie_mpss) > + *smpss = dev->pcie_mpss; > + > + return 0; > +} > + > +static int advk_pcie_bus_configure_mps(struct pci_dev *dev, void *data) > +{ > + int mps; > + > + if (!dev) > + return 0; > + > + if (!pci_is_pcie(dev)) > + return 0; > + > + mps = PCIE_CORE_MPS_UNIT_BYTE << *(u8 *)data; > + pcie_set_mps(dev, mps); > + > + return 0; > +} > + > +static void advk_pcie_configure_mps(struct pci_bus *bus, struct advk_pcie *pcie) > +{ > + u8 smpss = PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ; > + u32 reg; > + > + /* Find the minimal supported MAX payload size */ > + advk_pcie_find_smpss(bus->self, &smpss); > + pci_walk_bus(bus, advk_pcie_find_smpss, &smpss); > + > + /* Configure RC MAX payload size */ > + reg = advk_readl(pcie, PCIE_CORE_DEV_CTRL_STATS_REG); > + reg &= ~PCI_EXP_DEVCTL_PAYLOAD; > + reg |= smpss << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT; > + advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); > + > + /* Configure device MAX payload size */ > + advk_pcie_bus_configure_mps(bus->self, &smpss); > + pci_walk_bus(bus, advk_pcie_bus_configure_mps, &smpss); > +} > + > static int advk_pcie_probe(struct platform_device *pdev) > { > struct device *dev = &pdev->dev; > @@ -950,6 +1005,9 @@ static int advk_pcie_probe(struct platform_device *pdev) > list_for_each_entry(child, &bus->children, node) > pcie_bus_configure_settings(child); > > + /* Configure the MAX pay load size */ > + advk_pcie_configure_mps(bus, pcie); > + > pci_bus_add_devices(bus); > return 0; > } > -- > 2.13.5 >
Hello Bjorn, On Thu, 5 Oct 2017 12:31:02 -0500, Bjorn Helgaas wrote: > On Thu, Sep 28, 2017 at 02:58:34PM +0200, Thomas Petazzoni wrote: > > From: Victor Gu <xigu@marvell.com> > > > > Since the Aardvark does not implement a PCIe root bus, > > What exactly do you mean by "does not implement a PCIe root bus"? I > assume there is still a hierarchy of PCI buses, and I assume the > hierarchy has a top-most ("root") bus. > > Maybe there's no Root Port? There are other systems that don't have > Root Ports, and we've made changes to accommodate that, e.g., > > http://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=1b8a6079015f I'm trying to get back (finally) to this topic. Unfortunately, your branch has been rebased, and this commit no longer exists. Do you have an updated pointer about what you suggest to use for systems that don't have Root Ports ? Thanks! Thomas
[+cc Lorenzo, who maintains this area now] On Tue, Jan 09, 2018 at 04:39:18PM +0100, Thomas Petazzoni wrote: > Hello Bjorn, > > On Thu, 5 Oct 2017 12:31:02 -0500, Bjorn Helgaas wrote: > > On Thu, Sep 28, 2017 at 02:58:34PM +0200, Thomas Petazzoni wrote: > > > From: Victor Gu <xigu@marvell.com> > > > > > > Since the Aardvark does not implement a PCIe root bus, > > > > What exactly do you mean by "does not implement a PCIe root bus"? I > > assume there is still a hierarchy of PCI buses, and I assume the > > hierarchy has a top-most ("root") bus. > > > > Maybe there's no Root Port? There are other systems that don't have > > Root Ports, and we've made changes to accommodate that, e.g., > > > > http://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=1b8a6079015f > > I'm trying to get back (finally) to this topic. Unfortunately, your > branch has been rebased, and this commit no longer exists. Do you have > an updated pointer about what you suggest to use for systems that don't > have Root Ports ? Sorry, about that; here's the upstream commit, FWIW: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ee8bdfb6568d If the OS sees no Root Port (I haven't seen the full lspci or kernel enumeration log, so I don't know what the topology actually is), I assume you probably have some Endpoints that have valid Link Capabilities, Control, and Status registers. Those refer to the downstream end of the Link, and the Root Port would normally have corresponding registers that refer to the upstream end. The lack of the Root Port means we can't do any management of those top-level Links, so no ASPM, no MPS, no link width/speed management, etc. I see that advk_pcie_probe() calls pcie_bus_configure_settings() like all other drivers, and ideally we would try to make that work just like it does on other platforms. The code is: pci_scan_root_bus_bridge(bridge); bus = bridge->bus; list_for_each_entry(child, &bus->children, node) pcie_bus_configure_settings(child); This MPS setting is all strictly in the PCIe domain (it's not in the Aardvark domain and shouldn't have any Aardvark dependencies), so I would expect the core code to just work, modulo some possible confusion if it expects to find a Root Port but doesn't. Can you collect "lspci -vv" output and details about what currently goes wrong? Then we'd have something more concrete to talk about. Bjorn
Hello, On Tue, 9 Jan 2018 16:14:36 -0600, Bjorn Helgaas wrote: > > I'm trying to get back (finally) to this topic. Unfortunately, your > > branch has been rebased, and this commit no longer exists. Do you have > > an updated pointer about what you suggest to use for systems that don't > > have Root Ports ? > > Sorry, about that; here's the upstream commit, FWIW: > > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ee8bdfb6568d Thanks. I don't see how this commit can fix our problem though, see below. > If the OS sees no Root Port (I haven't seen the full lspci or kernel > enumeration log, so I don't know what the topology actually is), I > assume you probably have some Endpoints that have valid Link > Capabilities, Control, and Status registers. Those refer to the > downstream end of the Link, and the Root Port would normally have > corresponding registers that refer to the upstream end. > > The lack of the Root Port means we can't do any management of those > top-level Links, so no ASPM, no MPS, no link width/speed management, > etc. > > I see that advk_pcie_probe() calls pcie_bus_configure_settings() like > all other drivers, and ideally we would try to make that work just > like it does on other platforms. The code is: > > pci_scan_root_bus_bridge(bridge); > bus = bridge->bus; > list_for_each_entry(child, &bus->children, node) > pcie_bus_configure_settings(child); > > This MPS setting is all strictly in the PCIe domain (it's not in the > Aardvark domain and shouldn't have any Aardvark dependencies), so I > would expect the core code to just work, modulo some possible > confusion if it expects to find a Root Port but doesn't. > > Can you collect "lspci -vv" output and details about what currently > goes wrong? Then we'd have something more concrete to talk about. With an E1000E PCIe NIC connected, the entire lspci -vvv output is: # lspci -vv 00:00.0 Ethernet controller: Intel Corporation 82572EI Gigabit Ethernet Controller (Copper) (rev 06) Subsystem: Intel Corporation PRO/1000 PT Server Adapter Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+ Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx- Latency: 0, Cache Line Size: 32 bytes Interrupt: pin A routed to IRQ 40 Region 0: Memory at e8000000 (32-bit, non-prefetchable) [size=128K] Region 1: Memory at e8020000 (32-bit, non-prefetchable) [size=128K] Region 2: I/O ports at 1000 [disabled] [size=32] Expansion ROM at e8040000 [disabled] [size=128K] Capabilities: [c8] Power Management version 2 Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold+) Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=1 PME- Capabilities: [d0] MSI: Enable+ Count=1/1 Maskable- 64bit+ Address: 000000001d1f6f68 Data: 0028 Capabilities: [e0] Express (v1) Endpoint, MSI 00 DevCap: MaxPayload 256 bytes, PhantFunc 0, Latency L0s <512ns, L1 <64us ExtTag- AttnBtn- AttnInd- PwrInd- RBE- FLReset- SlotPowerLimit 0.000W DevCtl: Report errors: Correctable+ Non-Fatal+ Fatal+ Unsupported+ RlxdOrd+ ExtTag- PhantFunc- AuxPwr- NoSnoop+ MaxPayload 256 bytes, MaxReadReq 512 bytes DevSta: CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend- LnkCap: Port #0, Speed 2.5GT/s, Width x1, ASPM L0s, Exit Latency L0s <4us ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp- LnkCtl: ASPM Disabled; RCB 64 bytes Disabled- CommClk- ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt- LnkSta: Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt- Capabilities: [100 v1] Advanced Error Reporting UESta: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol- UEMsk: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol- UESvrt: DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol- CESta: RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr- CEMsk: RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr- AERCap: First Error Pointer: 00, ECRCGenCap- ECRCGenEn- ECRCChkCap- ECRCChkEn- MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap- HeaderLog: 00000000 00000000 00000000 00000000 Capabilities: [140 v1] Device Serial Number 00-1b-21-ff-ff-c1-c4-fe Kernel driver in use: e1000e I.e, there is no Root Port. Therefore, I don't see how the kernel can know what is the maximum allowed payload size of the PCIe controller, nor how to adjust the payload size to use. Same for the L0s configuration. This is why we need those changes, one to update the PCIe controller MPS according to the Maximum Payload Size acceptable by the endpoint, and one to disable L0s entirely to avoid issues with non-L0s compliant devices. Does that make more sense ? Best regards, Thomas
On Fri, Jan 12, 2018 at 11:14:48AM +0100, Thomas Petazzoni wrote: > Hello, > > On Tue, 9 Jan 2018 16:14:36 -0600, Bjorn Helgaas wrote: > > > > I'm trying to get back (finally) to this topic. Unfortunately, your > > > branch has been rebased, and this commit no longer exists. Do you have > > > an updated pointer about what you suggest to use for systems that don't > > > have Root Ports ? > > > > Sorry, about that; here's the upstream commit, FWIW: > > > > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ee8bdfb6568d > > Thanks. I don't see how this commit can fix our problem though, see below. Sorry, I didn't mean that commit would fix your problem. It's just an example of another case where generic code incorrectly assumed a Root Port would always be present. > > If the OS sees no Root Port (I haven't seen the full lspci or kernel > > enumeration log, so I don't know what the topology actually is), I > > assume you probably have some Endpoints that have valid Link > > Capabilities, Control, and Status registers. Those refer to the > > downstream end of the Link, and the Root Port would normally have > > corresponding registers that refer to the upstream end. > > > > The lack of the Root Port means we can't do any management of those > > top-level Links, so no ASPM, no MPS, no link width/speed management, > > etc. > > > > I see that advk_pcie_probe() calls pcie_bus_configure_settings() like > > all other drivers, and ideally we would try to make that work just > > like it does on other platforms. The code is: > > > > pci_scan_root_bus_bridge(bridge); > > bus = bridge->bus; > > list_for_each_entry(child, &bus->children, node) > > pcie_bus_configure_settings(child); > > > > This MPS setting is all strictly in the PCIe domain (it's not in the > > Aardvark domain and shouldn't have any Aardvark dependencies), so I > > would expect the core code to just work, modulo some possible > > confusion if it expects to find a Root Port but doesn't. > > > > Can you collect "lspci -vv" output and details about what currently > > goes wrong? Then we'd have something more concrete to talk about. > > With an E1000E PCIe NIC connected, the entire lspci -vvv output is: > > # lspci -vv > 00:00.0 Ethernet controller: Intel Corporation 82572EI Gigabit Ethernet Controller (Copper) (rev 06) > Subsystem: Intel Corporation PRO/1000 PT Server Adapter > Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+ > Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx- > Latency: 0, Cache Line Size: 32 bytes > Interrupt: pin A routed to IRQ 40 > Region 0: Memory at e8000000 (32-bit, non-prefetchable) [size=128K] > Region 1: Memory at e8020000 (32-bit, non-prefetchable) [size=128K] > Region 2: I/O ports at 1000 [disabled] [size=32] > Expansion ROM at e8040000 [disabled] [size=128K] > Capabilities: [c8] Power Management version 2 > Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold+) > Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=1 PME- > Capabilities: [d0] MSI: Enable+ Count=1/1 Maskable- 64bit+ > Address: 000000001d1f6f68 Data: 0028 > Capabilities: [e0] Express (v1) Endpoint, MSI 00 > DevCap: MaxPayload 256 bytes, PhantFunc 0, Latency L0s <512ns, L1 <64us > ExtTag- AttnBtn- AttnInd- PwrInd- RBE- FLReset- SlotPowerLimit 0.000W > DevCtl: Report errors: Correctable+ Non-Fatal+ Fatal+ Unsupported+ > RlxdOrd+ ExtTag- PhantFunc- AuxPwr- NoSnoop+ > MaxPayload 256 bytes, MaxReadReq 512 bytes > DevSta: CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend- > LnkCap: Port #0, Speed 2.5GT/s, Width x1, ASPM L0s, Exit Latency L0s <4us > ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp- > LnkCtl: ASPM Disabled; RCB 64 bytes Disabled- CommClk- > ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt- > LnkSta: Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt- > Capabilities: [100 v1] Advanced Error Reporting > UESta: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol- > UEMsk: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol- > UESvrt: DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol- > CESta: RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr- > CEMsk: RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr- > AERCap: First Error Pointer: 00, ECRCGenCap- ECRCGenEn- ECRCChkCap- ECRCChkEn- > MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap- > HeaderLog: 00000000 00000000 00000000 00000000 > Capabilities: [140 v1] Device Serial Number 00-1b-21-ff-ff-c1-c4-fe > Kernel driver in use: e1000e > > I.e, there is no Root Port. Therefore, I don't see how the kernel > can know what is the maximum allowed payload size of the PCIe > controller, nor how to adjust the payload size to use. Same for the L0s > configuration. The Device Control MPS field defaults to 128 bytes. Generic software can only change that default if it knows that every element that might receive a packet from the device can handle it. In this case, we have no information about what the invisible Root Port can handle, so I would argue that we cannot change MPS. In the lspci above, MPS is set to 256 bytes. If that was done by firmware, it might be safe because it knows things about the Root Port that Linux doesn't. But I don't think the Linux PCI core could set it to 256. ASPM L0s is similar. We should only enable L0s if we can tell that both ends of the link support it. If there's no Root Port, we don't have any ASPM capability information for the upstream end of the link, so we shouldn't enable ASPM at all. > This is why we need those changes, one to update the PCIe controller > MPS according to the Maximum Payload Size acceptable by the endpoint, > and one to disable L0s entirely to avoid issues with non-L0s compliant > devices. The generic core code should perform minimal, guaranteed-to-work configuration using the least information and fewest assumptions possible. That may not lead to optimal performance, but it should at least be functional. This should work even if there is no Root Port. Once we have that figured out, then we can worry about whether we can or should do platform-specific tweaks to improve performance, e.g., increase MPS if we know Root Port capabilities implicitly. I had the impression that these patches were required for correct functionality, not just to improve performance. But maybe I misunderstood? Bjorn
Hello Bjorn, Thanks again for this discussion! On Fri, 12 Jan 2018 08:40:24 -0600, Bjorn Helgaas wrote: > Sorry, I didn't mean that commit would fix your problem. It's just an > example of another case where generic code incorrectly assumed a Root > Port would always be present. OK, understood. > > I.e, there is no Root Port. Therefore, I don't see how the kernel > > can know what is the maximum allowed payload size of the PCIe > > controller, nor how to adjust the payload size to use. Same for the L0s > > configuration. > > The Device Control MPS field defaults to 128 bytes. Generic software > can only change that default if it knows that every element that might > receive a packet from the device can handle it. In this case, we have > no information about what the invisible Root Port can handle, so I > would argue that we cannot change MPS. > > In the lspci above, MPS is set to 256 bytes. If that was done by > firmware, it might be safe because it knows things about the Root Port > that Linux doesn't. But I don't think the Linux PCI core could set it > to 256. So you're suspecting that the firmware/bootloader has configured the MPS on the E1000E device to 256 bytes ? Isn't it dangerous for the kernel to rely on the firmware/bootloader configuration ? Indeed, the firmware/bootloader might have configured MPS to X bytes on the endpoint, but when the kernel boots and initializes the PCIe controller, its sets the PCIe controller MPS to Y bytes, with Y > X. > ASPM L0s is similar. We should only enable L0s if we can tell that > both ends of the link support it. If there's no Root Port, we don't > have any ASPM capability information for the upstream end of the link, > so we shouldn't enable ASPM at all. Well, even without the Root Port, we are able to use the endpoint configuration space to figure out whether it supports L0s, and adjust the root complex configuration accordingly. This is what our patch is doing for MPS, and which could be done similarly for L0s, no? > > > This is why we need those changes, one to update the PCIe controller > > MPS according to the Maximum Payload Size acceptable by the endpoint, > > and one to disable L0s entirely to avoid issues with non-L0s compliant > > devices. > > The generic core code should perform minimal, guaranteed-to-work > configuration using the least information and fewest assumptions > possible. That may not lead to optimal performance, but it should at > least be functional. This should work even if there is no Root Port. > > Once we have that figured out, then we can worry about whether we can > or should do platform-specific tweaks to improve performance, e.g., > increase MPS if we know Root Port capabilities implicitly. > > I had the impression that these patches were required for correct > functionality, not just to improve performance. But maybe I > misunderstood? I don't myself have the device that wasn't working, and that this patch got to work, so I can't double check myself. However, indeed, I was told that without this fix, some devices would not work. One question: is it valid/working to have the root complex configured with MPS = 128 bytes but the endpoint configured with MPS = 256 or 512 bytes ? Or should the MPS value be strictly equal on both sides ? Depending on your answer, there are two options: - It is a valid situation to have a root complex MPS lower than the endpoint MPS. In this case, we could for now simply unconditionally set the MPS to 128 bytes in the root complex, as a fix to get all devices working. And then separately, work on improving performance by increasing the MPS according to the endpoint capabilities. - It is not valid for the root complex MPS to be different than the endpoint MPS. In this case, then I don't see how we can do things differently than the proposed patch: we have to see what the endpoint MPS is, and adjust the root complex MPS accordingly. Indeed, the bootloader/firmware might have changed the endpoint MPS so that it is no longer the default of 128 bytes. Best regards, Thomas
On Fri, Jan 12, 2018 at 04:46:44PM +0100, Thomas Petazzoni wrote: > On Fri, 12 Jan 2018 08:40:24 -0600, Bjorn Helgaas wrote: > > The Device Control MPS field defaults to 128 bytes. Generic software > > can only change that default if it knows that every element that might > > receive a packet from the device can handle it. In this case, we have > > no information about what the invisible Root Port can handle, so I > > would argue that we cannot change MPS. > > > > In the lspci above, MPS is set to 256 bytes. If that was done by > > firmware, it might be safe because it knows things about the Root Port > > that Linux doesn't. But I don't think the Linux PCI core could set it > > to 256. > > So you're suspecting that the firmware/bootloader has configured the > MPS on the E1000E device to 256 bytes ? I didn't word that very well. It looks like *something* set it to 256, but I don't know what. It's possible Linux did, but I think that would be a bug and should be fixed. We'd have to instrument the code or analyze it more closely than I have. > Isn't it dangerous for the kernel to rely on the firmware/bootloader > configuration ? Indeed, the firmware/bootloader might have configured > MPS to X bytes on the endpoint, but when the kernel boots and > initializes the PCIe controller, its sets the PCIe controller MPS to Y > bytes, with Y > X. > > > ASPM L0s is similar. We should only enable L0s if we can tell that > > both ends of the link support it. If there's no Root Port, we don't > > have any ASPM capability information for the upstream end of the link, > > so we shouldn't enable ASPM at all. > > Well, even without the Root Port, we are able to use the endpoint > configuration space to figure out whether it supports L0s, and adjust > the root complex configuration accordingly. This is what our patch is > doing for MPS, and which could be done similarly for L0s, no? Yes, this is where it would get machine-specific. I don't know how that should be structured, or even whether it's really worthwhile. I think the core should make it *work* with the least-common-denominator approach, but I'm not convinced that a lot of effort should be put into optimizing a topology that doesn't follow the spec. A driver could do this outside the core, but I think it would be better to put the effort into making the topology more standard. Why exactly *doesn't* Aardvark expose the Root Port? I assume it does actually exist, since there is actually a link leading to the slot, and there has to be *something* at the upstream end of that link. > > I had the impression that these patches were required for correct > > functionality, not just to improve performance. But maybe I > > misunderstood? > > I don't myself have the device that wasn't working, and that this patch > got to work, so I can't double check myself. However, indeed, I was > told that without this fix, some devices would not work. > > One question: is it valid/working to have the root complex configured > with MPS = 128 bytes but the endpoint configured with MPS = 256 or 512 > bytes ? Or should the MPS value be strictly equal on both sides ? Per PCIe r4.0, sec 2.2.2, a device cannot transmit a TLP with a payload larger than its MPS. A device receiving a TLP with a payload larger than its MPS must treat the TLP as malformed. I think that means MPS really should be set the same on both sides so the device can do both DMA reads and writes safely. > Depending on your answer, there are two options: > > - It is a valid situation to have a root complex MPS lower than the > endpoint MPS. In this case, we could for now simply unconditionally > set the MPS to 128 bytes in the root complex, as a fix to get all > devices working. And then separately, work on improving performance > by increasing the MPS according to the endpoint capabilities. > > - It is not valid for the root complex MPS to be different than the > endpoint MPS. In this case, then I don't see how we can do things > differently than the proposed patch: we have to see what the > endpoint MPS is, and adjust the root complex MPS accordingly. > Indeed, the bootloader/firmware might have changed the endpoint MPS > so that it is no longer the default of 128 bytes. All devices are guaranteed to support MPS = 128 bytes, so if you set the Root Port to that in the driver, we should be able to make the PCI core leave (or set, if necessary) all devices with MPS = 128. I think we should start with that first, then worry about performance optimizations separately. I guess I'm still just shaking my head over the invisible Root Port mystery. I think the other cases I know about are related to virtualization, where I can sort of understand why the Root Port is missing, but I don't think that's the situation here. Bjorn
diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c index af7a9c4a61a4..c8a97bad6c4c 100644 --- a/drivers/pci/host/pci-aardvark.c +++ b/drivers/pci/host/pci-aardvark.c @@ -30,8 +30,10 @@ #define PCIE_CORE_DEV_CTRL_STATS_REG 0xc8 #define PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE (0 << 4) #define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT 5 +#define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ 0x2 #define PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE (0 << 11) #define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT 12 +#define PCIE_CORE_MPS_UNIT_BYTE 128 #define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0 #define PCIE_CORE_LINK_L0S_ENTRY BIT(0) #define PCIE_CORE_LINK_TRAINING BIT(5) @@ -297,7 +299,8 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) /* Set PCIe Device Control and Status 1 PF0 register */ reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE | - (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | + (PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ << + PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) | PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE | PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT; advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); @@ -879,6 +882,58 @@ static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie) return err; } +static int advk_pcie_find_smpss(struct pci_dev *dev, void *data) +{ + u8 *smpss = data; + + if (!dev) + return 0; + + if (!pci_is_pcie(dev)) + return 0; + + if (*smpss > dev->pcie_mpss) + *smpss = dev->pcie_mpss; + + return 0; +} + +static int advk_pcie_bus_configure_mps(struct pci_dev *dev, void *data) +{ + int mps; + + if (!dev) + return 0; + + if (!pci_is_pcie(dev)) + return 0; + + mps = PCIE_CORE_MPS_UNIT_BYTE << *(u8 *)data; + pcie_set_mps(dev, mps); + + return 0; +} + +static void advk_pcie_configure_mps(struct pci_bus *bus, struct advk_pcie *pcie) +{ + u8 smpss = PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ; + u32 reg; + + /* Find the minimal supported MAX payload size */ + advk_pcie_find_smpss(bus->self, &smpss); + pci_walk_bus(bus, advk_pcie_find_smpss, &smpss); + + /* Configure RC MAX payload size */ + reg = advk_readl(pcie, PCIE_CORE_DEV_CTRL_STATS_REG); + reg &= ~PCI_EXP_DEVCTL_PAYLOAD; + reg |= smpss << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT; + advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG); + + /* Configure device MAX payload size */ + advk_pcie_bus_configure_mps(bus->self, &smpss); + pci_walk_bus(bus, advk_pcie_bus_configure_mps, &smpss); +} + static int advk_pcie_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -950,6 +1005,9 @@ static int advk_pcie_probe(struct platform_device *pdev) list_for_each_entry(child, &bus->children, node) pcie_bus_configure_settings(child); + /* Configure the MAX pay load size */ + advk_pcie_configure_mps(bus, pcie); + pci_bus_add_devices(bus); return 0; }