diff mbox

[v9,2/3] PCI: Add tango PCIe host bridge support

Message ID b32f9e1c-fdde-3064-f935-c817cf707534@sigmadesigns.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Marc Gonzalez June 20, 2017, 8:17 a.m. UTC
This driver is required to work around several hardware bugs
in the PCIe controller.

NB: Revision 1 does not support legacy interrupts, or IO space.

Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
---
 drivers/pci/host/Kconfig      |   8 +++
 drivers/pci/host/Makefile     |   1 +
 drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h       |   2 +
 4 files changed, 175 insertions(+)
 create mode 100644 drivers/pci/host/pcie-tango.c

Comments

Bjorn Helgaas July 2, 2017, 11:18 p.m. UTC | #1
On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
> This driver is required to work around several hardware bugs
> in the PCIe controller.
> 
> NB: Revision 1 does not support legacy interrupts, or IO space.

I had to apply these manually because of conflicts in Kconfig and
Makefile.  What are these based on?  Easiest for me is if you base
them on the current -rc1 tag.

> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> ---
>  drivers/pci/host/Kconfig      |   8 +++
>  drivers/pci/host/Makefile     |   1 +
>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/pci_ids.h       |   2 +
>  4 files changed, 175 insertions(+)
>  create mode 100644 drivers/pci/host/pcie-tango.c
> 
> diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
> index d7e7c0a827c3..5183d9095c3a 100644
> --- a/drivers/pci/host/Kconfig
> +++ b/drivers/pci/host/Kconfig
> @@ -285,6 +285,14 @@ config PCIE_ROCKCHIP
>  	  There is 1 internal PCIe port available to support GEN2 with
>  	  4 slots.
>  
> +config PCIE_TANGO
> +	bool "Tango PCIe controller"
> +	depends on ARCH_TANGO && PCI_MSI && OF
> +	select PCI_HOST_COMMON
> +	help
> +	  Say Y here to enable PCIe controller support for Sigma Designs
> +	  Tango systems, such as SMP8759 and later chips.
> +
>  config VMD
>  	depends on PCI_MSI && X86_64
>  	tristate "Intel Volume Management Device Driver"
> diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
> index 084cb4983645..fc7ea90196f3 100644
> --- a/drivers/pci/host/Makefile
> +++ b/drivers/pci/host/Makefile
> @@ -32,4 +32,5 @@ obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o
>  obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
>  obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
>  obj-$(CONFIG_PCIE_ROCKCHIP) += pcie-rockchip.o
> +obj-$(CONFIG_PCIE_TANGO) += pcie-tango.o
>  obj-$(CONFIG_VMD) += vmd.o
> diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
> new file mode 100644
> index 000000000000..67aaadcc1c5e
> --- /dev/null
> +++ b/drivers/pci/host/pcie-tango.c
> @@ -0,0 +1,164 @@
> +#include <linux/pci-ecam.h>
> +#include <linux/delay.h>
> +#include <linux/of.h>
> +
> +#define MSI_MAX 256
> +
> +#define SMP8759_MUX		0x48
> +#define SMP8759_TEST_OUT	0x74
> +
> +struct tango_pcie {
> +	void __iomem *mux;
> +};
> +
> +/*** HOST BRIDGE SUPPORT ***/
> +
> +static int smp8759_config_read(struct pci_bus *bus,
> +		unsigned int devfn, int where, int size, u32 *val)

Wrap these with as much as possible on the first line, e.g.,

  static int smp8759_config_read(struct pci_bus *bus, unsigned int devfn,
                                 int where, int size, u32 *val)


> +{
> +	int ret;
> +	struct pci_config_window *cfg = bus->sysdata;
> +	struct tango_pcie *pcie = dev_get_drvdata(cfg->parent);

Reorder as:

        struct pci_config_window *cfg = bus->sysdata;
        struct tango_pcie *pcie = dev_get_drvdata(cfg->parent);
        int ret;

> +
> +	/*
> +	 * QUIRK #1

Omit "QUIRK #1", "QUIRK #2", etc unless they're actual references to an errata
document.

> +	 * Reads in configuration space outside devfn 0 return garbage.
> +	 */
> +	if (devfn != 0)
> +		return PCIBIOS_FUNC_NOT_SUPPORTED;
> +
> +	/*
> +	 * QUIRK #2
> +	 * Unfortunately, config and mem spaces are muxed.
> +	 * Linux does not support such a setting, since drivers are free
> +	 * to access mem space directly, at any time.
> +	 * Therefore, we can only PRAY that config and mem space accesses
> +	 * NEVER occur concurrently.
> +	 */
> +	writel_relaxed(1, pcie->mux);
> +	ret = pci_generic_config_read(bus, devfn, where, size, val);
> +	writel_relaxed(0, pcie->mux);

I'm very hesitant about this.  When people stress this, we're going to
get reports of data corruption.  Even with the disclaimer below, I
don't feel good about this.  Adding the driver is an implicit claim
that we support the device, but we know it can't be made reliable.

What is the benefit of adding this driver?  How many units are in the
field?  Are you hoping to have support in distros like RHEL?  Are
these running self-built kernels straight from kernel.org?  Is it
feasible for you to distribute this driver separately from the
upstream kernel?

> +	return ret;
> +}
> +
> +static int smp8759_config_write(struct pci_bus *bus,
> +		unsigned int devfn, int where, int size, u32 val)
> +{
> +	int ret;
> +	struct pci_config_window *cfg = bus->sysdata;
> +	struct tango_pcie *pcie = dev_get_drvdata(cfg->parent);
> +
> +	writel_relaxed(1, pcie->mux);
> +	ret = pci_generic_config_write(bus, devfn, where, size, val);
> +	writel_relaxed(0, pcie->mux);
> +
> +	return ret;
> +}
> +
> +static struct pci_ecam_ops smp8759_ecam_ops = {
> +	.bus_shift	= 20,
> +	.pci_ops	= {
> +		.map_bus	= pci_ecam_map_bus,
> +		.read		= smp8759_config_read,
> +		.write		= smp8759_config_write,
> +	}
> +};
> +
> +static const struct of_device_id tango_pcie_ids[] = {
> +	{ .compatible = "sigma,smp8759-pcie" },
> +	{ /* sentinel */ },
> +};

Move table down to point where it's needed.

> +static int tango_check_pcie_link(void __iomem *test_out)

I think this is checking for link up.  Rename to tango_pcie_link_up()
to follow the convention of other drivers.  Take a struct tango_pcie *
instead of an address, if possible.

> +{
> +	int i;
> +
> +	writel_relaxed(16, test_out);
> +	for (i = 0; i < 10; ++i) {
> +		u32 ltssm_state = readl_relaxed(test_out) >> 8;
> +		if ((ltssm_state & 0x1f) == 0xf) /* L0 */
> +			return 0;
> +		usleep_range(3000, 4000);
> +	}
> +
> +	return -ENODEV;
> +}
> +
> +static int smp8759_init(struct tango_pcie *pcie, void __iomem *base)
> +{
> +	pcie->mux		= base + SMP8759_MUX;
> +
> +	return tango_check_pcie_link(base + SMP8759_TEST_OUT);
> +}
> +
> +static int tango_pcie_probe(struct platform_device *pdev)
> +{
> +	int ret = -EINVAL;
> +	void __iomem *base;
> +	struct resource *res;
> +	struct tango_pcie *pcie;
> +	struct device *dev = &pdev->dev;

Reverse declaration order.  Then they'll be declared in order of use.

> +	pr_err("MAJOR ISSUE: PCIe config and mem spaces are muxed\n");
> +	pr_err("Tainting kernel... Use driver at your own risk\n");

These should be dev_err().

> +	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
> +
> +	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
> +	if (!pcie)
> +		return -ENOMEM;
> +
> +	platform_set_drvdata(pdev, pcie);

Minor style: move this down to the end, so we're not associating an
uninitialized drvdata structure with the pdev.  Better to wait until
it's fully initialized.

> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
> +	base = devm_ioremap_resource(&pdev->dev, res);

Use "dev", not "&pdev->dev".

> +	if (IS_ERR(base))
> +		return PTR_ERR(base);
> +
> +	if (of_device_is_compatible(dev->of_node, "sigma,smp8759-pcie"))

Not necessary, since tango_pcie_ids[] only contains
"sigma,smp8759-pcie".  If/when you need different init for different
versions, you can add something like this back, and it will be obvious
why it's needed.  Right now, there's no need for this, so it looks out
of place.

> +		ret = smp8759_init(pcie, base);
> +
> +	if (ret)
> +		return ret;
> +
> +	return pci_host_common_probe(pdev, &smp8759_ecam_ops);
> +}
> +
> +static struct platform_driver tango_pcie_driver = {
> +	.probe	= tango_pcie_probe,
> +	.driver	= {
> +		.name = KBUILD_MODNAME,
> +		.of_match_table = tango_pcie_ids,

I think you need ".suppress_bind_attrs = true" here to prevent issues
when unbinding driver.  See
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a5f40e8098fe

> +	},
> +};
> +

Remove blank line.

> +builtin_platform_driver(tango_pcie_driver);
> +
> +/*
> + * QUIRK #3
> + * The root complex advertizes the wrong device class.

s/advertizes/advertises/

> + * Header Type 1 is for PCI-to-PCI bridges.
> + */
> +static void tango_fixup_class(struct pci_dev *dev)
> +{
> +	dev->class = PCI_CLASS_BRIDGE_PCI << 8;
> +}
> +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x24, tango_fixup_class);
> +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x28, tango_fixup_class);
> +
> +/*
> + * QUIRK #4
> + * The root complex exposes a "fake" BAR, which is used to filter
> + * bus-to-system accesses. Only accesses within the range defined
> + * by this BAR are forwarded to the host, others are ignored.
> + *
> + * By default, the DMA framework expects an identity mapping,
> + * and DRAM0 is mapped at 0x80000000.
> + */
> +static void tango_fixup_bar(struct pci_dev *dev)
> +{
> +	dev->non_compliant_bars = true;
> +	pci_write_config_dword(dev, PCI_BASE_ADDRESS_0, 0x80000000);
> +}
> +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x24, tango_fixup_bar);
> +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x28, tango_fixup_bar);
> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
> index f020ab4079d3..b577dbe46f8f 100644
> --- a/include/linux/pci_ids.h
> +++ b/include/linux/pci_ids.h
> @@ -1369,6 +1369,8 @@
>  #define PCI_DEVICE_ID_TTI_HPT374	0x0008
>  #define PCI_DEVICE_ID_TTI_HPT372N	0x0009	/* apparently a 372N variant? */
>  
> +#define PCI_VENDOR_ID_SIGMA		0x1105
> +
>  #define PCI_VENDOR_ID_VIA		0x1106
>  #define PCI_DEVICE_ID_VIA_8763_0	0x0198
>  #define PCI_DEVICE_ID_VIA_8380_0	0x0204
> -- 
> 2.11.0
>
Ard Biesheuvel July 3, 2017, 9:35 a.m. UTC | #2
On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
>> This driver is required to work around several hardware bugs
>> in the PCIe controller.
>>
>> NB: Revision 1 does not support legacy interrupts, or IO space.
>
> I had to apply these manually because of conflicts in Kconfig and
> Makefile.  What are these based on?  Easiest for me is if you base
> them on the current -rc1 tag.
>
>> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>> ---
>>  drivers/pci/host/Kconfig      |   8 +++
>>  drivers/pci/host/Makefile     |   1 +
>>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/pci_ids.h       |   2 +
>>  4 files changed, 175 insertions(+)
>>  create mode 100644 drivers/pci/host/pcie-tango.c
>>
[..]
>> +     /*
>> +      * QUIRK #2
>> +      * Unfortunately, config and mem spaces are muxed.
>> +      * Linux does not support such a setting, since drivers are free
>> +      * to access mem space directly, at any time.
>> +      * Therefore, we can only PRAY that config and mem space accesses
>> +      * NEVER occur concurrently.
>> +      */
>> +     writel_relaxed(1, pcie->mux);
>> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
>> +     writel_relaxed(0, pcie->mux);
>
> I'm very hesitant about this.  When people stress this, we're going to
> get reports of data corruption.  Even with the disclaimer below, I
> don't feel good about this.  Adding the driver is an implicit claim
> that we support the device, but we know it can't be made reliable.
>

I noticed that the Synopsys driver suffers from a similar issue: in
dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
to perform a config space access, and switches it back to I/O space
afterwards (unless it has more than 2 viewports, in which case it uses
dedicated windows for I/O space and config space)
Marc Gonzalez July 3, 2017, 9:54 a.m. UTC | #3
Hello Bjorn,

On 03/07/2017 01:18, Bjorn Helgaas wrote:

> On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
> 
>> This driver is required to work around several hardware bugs
>> in the PCIe controller.
>
> [ Snip style issues ]

I wish you had pointed these out in your May 23 review, or in
my March 29 version (or even just alluded to them). I would
have fixed them in time for 2.13.


>> +	/*
>> +	 * QUIRK #2
>> +	 * Unfortunately, config and mem spaces are muxed.
>> +	 * Linux does not support such a setting, since drivers are free
>> +	 * to access mem space directly, at any time.
>> +	 * Therefore, we can only PRAY that config and mem space accesses
>> +	 * NEVER occur concurrently.
>> +	 */
>> +	writel_relaxed(1, pcie->mux);
>> +	ret = pci_generic_config_read(bus, devfn, where, size, val);
>> +	writel_relaxed(0, pcie->mux);
> 
> I'm very hesitant about this.  When people stress this, we're going to
> get reports of data corruption.  Even with the disclaimer below, I
> don't feel good about this.  Adding the driver is an implicit claim
> that we support the device, but we know it can't be made reliable.

I can't say I didn't see this coming (I had taken your long silence
as a sign of your reluctance) but back in May, I thought you implied
that a warning + tainting the kernel would be sufficient.

Mark Rutland points out stop_machine. I will test this solution.
Would you find that acceptable?


> What is the benefit of adding this driver?  How many units are in the
> field?  Are you hoping to have support in distros like RHEL?  Are
> these running self-built kernels straight from kernel.org?  Is it
> feasible for you to distribute this driver separately from the
> upstream kernel?

The benefit of upstreaming (for me) is making kernel maintainers
aware that one is using specific internal APIs. Then one may be
notified when these APIs are about to change.

I'm told we have sold ~100k units. Though I don't know how many
are in the field and using PCIe.

There are no plans to use "full-blown" distros, we use embedded
oriented distros, such as buildroot.

Maintaining out-of-tree drivers is what we've been doing for
~15 years, and there are many pain-points involved. Ask Greg
what he thinks of OOT drivers.


>> +static int tango_check_pcie_link(void __iomem *test_out)
> 
> I think this is checking for link up.  Rename to tango_pcie_link_up()
> to follow the convention of other drivers.  Take a struct tango_pcie *
> instead of an address, if possible.

Anything's possible. NB: if I pass the struct, then I have to store
the address in the struct, which isn't the case now, since I never
need the address later. If you don't mind adding an unnecessary
field to the struct, I can do it. What do you say?


>> +static struct platform_driver tango_pcie_driver = {
>> +	.probe	= tango_pcie_probe,
>> +	.driver	= {
>> +		.name = KBUILD_MODNAME,
>> +		.of_match_table = tango_pcie_ids,
> 
> I think you need ".suppress_bind_attrs = true" here to prevent issues
> when unbinding driver.  See
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a5f40e8098fe

Will do. In that case, I can drop tango_pcie_remove() right?

Regards.
Bjorn Helgaas July 3, 2017, 1:27 p.m. UTC | #4
[+cc Jingoo, Joao]

On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:
> On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
> >> This driver is required to work around several hardware bugs
> >> in the PCIe controller.
> >>
> >> NB: Revision 1 does not support legacy interrupts, or IO space.
> >
> > I had to apply these manually because of conflicts in Kconfig and
> > Makefile.  What are these based on?  Easiest for me is if you base
> > them on the current -rc1 tag.
> >
> >> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> >> ---
> >>  drivers/pci/host/Kconfig      |   8 +++
> >>  drivers/pci/host/Makefile     |   1 +
> >>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
> >>  include/linux/pci_ids.h       |   2 +
> >>  4 files changed, 175 insertions(+)
> >>  create mode 100644 drivers/pci/host/pcie-tango.c
> >>
> [..]
> >> +     /*
> >> +      * QUIRK #2
> >> +      * Unfortunately, config and mem spaces are muxed.
> >> +      * Linux does not support such a setting, since drivers are free
> >> +      * to access mem space directly, at any time.
> >> +      * Therefore, we can only PRAY that config and mem space accesses
> >> +      * NEVER occur concurrently.
> >> +      */
> >> +     writel_relaxed(1, pcie->mux);
> >> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
> >> +     writel_relaxed(0, pcie->mux);
> >
> > I'm very hesitant about this.  When people stress this, we're going to
> > get reports of data corruption.  Even with the disclaimer below, I
> > don't feel good about this.  Adding the driver is an implicit claim
> > that we support the device, but we know it can't be made reliable.
> 
> I noticed that the Synopsys driver suffers from a similar issue: in
> dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
> to perform a config space access, and switches it back to I/O space
> afterwards (unless it has more than 2 viewports, in which case it uses
> dedicated windows for I/O space and config space)

That doesn't sound good.  Jingoo, Joao?  I remember some discussion
about this, but not the details.

I/O accesses use wrappers (inb(), etc), so there's at least the
possibility of a mutex to serialize them with respect to config
accesses.

Bjorn
Bjorn Helgaas July 3, 2017, 1:40 p.m. UTC | #5
On Mon, Jul 03, 2017 at 11:54:20AM +0200, Marc Gonzalez wrote:
> Hello Bjorn,
> 
> On 03/07/2017 01:18, Bjorn Helgaas wrote:
> 
> > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
> > 
> >> This driver is required to work around several hardware bugs
> >> in the PCIe controller.
> >
> > [ Snip style issues ]
> 
> I wish you had pointed these out in your May 23 review, or in
> my March 29 version (or even just alluded to them). I would
> have fixed them in time for 2.13.

They're trivial, and if they were the only issues I would have just
done them myself while merging.

> >> +	/*
> >> +	 * QUIRK #2
> >> +	 * Unfortunately, config and mem spaces are muxed.
> >> +	 * Linux does not support such a setting, since drivers are free
> >> +	 * to access mem space directly, at any time.
> >> +	 * Therefore, we can only PRAY that config and mem space accesses
> >> +	 * NEVER occur concurrently.
> >> +	 */
> >> +	writel_relaxed(1, pcie->mux);
> >> +	ret = pci_generic_config_read(bus, devfn, where, size, val);
> >> +	writel_relaxed(0, pcie->mux);
> > 
> > I'm very hesitant about this.  When people stress this, we're going to
> > get reports of data corruption.  Even with the disclaimer below, I
> > don't feel good about this.  Adding the driver is an implicit claim
> > that we support the device, but we know it can't be made reliable.
> 
> I can't say I didn't see this coming (I had taken your long silence
> as a sign of your reluctance) but back in May, I thought you implied
> that a warning + tainting the kernel would be sufficient.
> 
> Mark Rutland points out stop_machine. I will test this solution.
> Would you find that acceptable?

Sounds possible, though I can't say for sure without seeing the code.

The problem is serializing vs. memory accesses, since they don't use
any wrappers.  However, they are ioremapped(), so it's at least
conceivable that another solution would be to use VM to trap those
accesses.  I'm not a VM person, so I don't know whether that's
feasible in Linux.

> > What is the benefit of adding this driver?  How many units are in the
> > field?  Are you hoping to have support in distros like RHEL?  Are
> > these running self-built kernels straight from kernel.org?  Is it
> > feasible for you to distribute this driver separately from the
> > upstream kernel?
> 
> The benefit of upstreaming (for me) is making kernel maintainers
> aware that one is using specific internal APIs. Then one may be
> notified when these APIs are about to change.
> 
> I'm told we have sold ~100k units. Though I don't know how many
> are in the field and using PCIe.
> 
> There are no plans to use "full-blown" distros, we use embedded
> oriented distros, such as buildroot.
> 
> Maintaining out-of-tree drivers is what we've been doing for
> ~15 years, and there are many pain-points involved. Ask Greg
> what he thinks of OOT drivers.
> 
> 
> >> +static int tango_check_pcie_link(void __iomem *test_out)
> > 
> > I think this is checking for link up.  Rename to tango_pcie_link_up()
> > to follow the convention of other drivers.  Take a struct tango_pcie *
> > instead of an address, if possible.
> 
> Anything's possible. NB: if I pass the struct, then I have to store
> the address in the struct, which isn't the case now, since I never
> need the address later. If you don't mind adding an unnecessary
> field to the struct, I can do it. What do you say?

The benefit of following the same formula as other drivers is pretty
large.  Most drivers save the equivalent of "base" in the struct.  If
you did that, you wouldn't need an extra pointer; you would just use
"base + SMP8759_MUX" in the config accessors and "base +
SMP8759_TEST_OUT" in tango_pcie_link_up().

> >> +static struct platform_driver tango_pcie_driver = {
> >> +	.probe	= tango_pcie_probe,
> >> +	.driver	= {
> >> +		.name = KBUILD_MODNAME,
> >> +		.of_match_table = tango_pcie_ids,
> > 
> > I think you need ".suppress_bind_attrs = true" here to prevent issues
> > when unbinding driver.  See
> > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a5f40e8098fe
> 
> Will do. In that case, I can drop tango_pcie_remove() right?

I think so; I don't think there would be any way to remove the driver.

Bjorn
Marc Gonzalez July 3, 2017, 2:34 p.m. UTC | #6
Bjorn Helgaas wrote:

> Marc Gonzalez wrote:
>
>> On 03/07/2017 01:18, Bjorn Helgaas wrote:
>>
>>> On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
>>>
>>>> +static int tango_check_pcie_link(void __iomem *test_out)
>>>
>>> I think this is checking for link up.  Rename to tango_pcie_link_up()
>>> to follow the convention of other drivers.  Take a struct tango_pcie *
>>> instead of an address, if possible.
>>
>> Anything's possible. NB: if I pass the struct, then I have to store
>> the address in the struct, which isn't the case now, since I never
>> need the address later. If you don't mind adding an unnecessary
>> field to the struct, I can do it. What do you say?
> 
> The benefit of following the same formula as other drivers is pretty
> large.  Most drivers save the equivalent of "base" in the struct.  If
> you did that, you wouldn't need an extra pointer; you would just use
> "base + SMP8759_MUX" in the config accessors and "base + SMP8759_TEST_OUT"
> in tango_pcie_link_up().

The problem is that TEST_OUT is at 0x74 on SMP8759, but at 0x138
on my other chip. In fact, all registers have been "reshuffled",
and none have the same offsets on the two chips.

My solution was to define specific registers in the struct.

In my [RFC PATCH v0.2] posted March 23, I tried illustrating
the issue:

+static const struct of_device_id tango_pcie_ids[] = {
+	{ .compatible = "sigma,smp8759-pcie" },
+	{ .compatible = "sigma,rev2-pcie" },
+	{ /* sentinel */ },
+};
+
+static void smp8759_init(struct tango_pcie *pcie, void __iomem *base)
+{
+	pcie->mux		= base + 0x48;
+	pcie->msi_status	= base + 0x80;
+	pcie->msi_mask		= base + 0xa0;
+	pcie->msi_doorbell	= 0xa0000000 + 0x2e07c;
+}
+
+static void rev2_init(struct tango_pcie *pcie, void __iomem *base)
+{
+	void __iomem *misc_irq	= base + 0x40;
+	void __iomem *doorbell	= base + 0x8c;
+
+	pcie->mux		= base + 0x2c;
+	pcie->msi_status	= base + 0x4c;
+	pcie->msi_mask		= base + 0x6c;
+	pcie->msi_doorbell	= 0x80000000;
+
+	writel(lower_32_bits(pcie->msi_doorbell), doorbell + 0);
+	writel(upper_32_bits(pcie->msi_doorbell), doorbell + 4);
+
+	/* Enable legacy PCI interrupts */
+	writel(BIT(15), misc_irq);
+	writel(0xf << 4, misc_irq + 4);
+}


Do you agree that the 'base + OFFSET' idiom does not work in
this specific situation? Would you handle it differently?

Regards.
Marc Gonzalez July 3, 2017, 3:30 p.m. UTC | #7
On 03/07/2017 15:13, Marc Gonzalez wrote:

> On 03/07/2017 11:54, Marc Gonzalez wrote:
> 
>> Mark Rutland points out stop_machine. I will test this solution.
>
> I must be misunderstanding some of the requirements for
> calling stop_machine() because my kernel panics, after
> many warnings.

Enabling several kernel debugging features:

+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
+CONFIG_PROVE_LOCKING=y

And at the end of smp8759_config_read:

	printk("in_atomic_preempt_off = %d\n", in_atomic_preempt_off());
	stop_machine(do_nothing, NULL, NULL);
	panic("STOP HERE FOR NOW\n");

The kernel outputs:

[    1.022725] in_atomic_preempt_off = 0
[    1.026568] BUG: scheduling while atomic: swapper/0/1/0x00000002
[    1.032625] 5 locks held by swapper/0/1:
[    1.036575]  #0:  (&dev->mutex){......}, at: [<c038c684>] __driver_attach+0x50/0xd0
[    1.044319]  #1:  (&dev->mutex){......}, at: [<c038c694>] __driver_attach+0x60/0xd0
[    1.052050]  #2:  (pci_lock){+.+...}, at: [<c03309d8>] pci_bus_read_config_dword+0x44/0x94
[    1.060398]  #3:  (cpu_hotplug.dep_map){++++++}, at: [<c0119db0>] get_online_cpus+0x2c/0xa0
[    1.068843]  #4:  (stop_cpus_mutex){+.+...}, at: [<c01a1184>] stop_cpus+0x20/0x48
[    1.076404] Modules linked in:
[    1.079483] Preemption disabled at:[    1.082820] [<  (null)>]   (null)
[    1.086165] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.23-1-rc4 #13
[    1.092723] Hardware name: Sigma Tango DT
[    1.096765] [<c010f600>] (unwind_backtrace) from [<c010b568>] (show_stack+0x10/0x14)
[    1.104558] [<c010b568>] (show_stack) from [<c0304830>] (dump_stack+0x98/0xc4)
[    1.111823] [<c0304830>] (dump_stack) from [<c013eee0>] (__schedule_bug+0x94/0xec)
[    1.119436] [<c013eee0>] (__schedule_bug) from [<c0516364>] (__schedule+0x4a4/0x5f0)
[    1.127220] [<c0516364>] (__schedule) from [<c05164fc>] (schedule+0x4c/0xac)
[    1.134308] [<c05164fc>] (schedule) from [<c051b010>] (schedule_timeout+0x1f4/0x30c)
[    1.142093] [<c051b010>] (schedule_timeout) from [<c051702c>] (wait_for_common+0x8c/0x13c)
[    1.150401] [<c051702c>] (wait_for_common) from [<c05170ec>] (wait_for_completion+0x10/0x14)
[    1.158886] [<c05170ec>] (wait_for_completion) from [<c01a0d74>] (__stop_cpus+0x50/0x64)
[    1.167021] [<c01a0d74>] (__stop_cpus) from [<c01a1194>] (stop_cpus+0x30/0x48)
[    1.174282] [<c01a1194>] (stop_cpus) from [<c01a1230>] (stop_machine+0x84/0x118)
[    1.181719] [<c01a1230>] (stop_machine) from [<c034c070>] (smp8759_config_read+0x84/0x90)
[    1.189942] [<c034c070>] (smp8759_config_read) from [<c0330a00>] (pci_bus_read_config_dword+0x6c/0x94)
[    1.199301] [<c0330a00>] (pci_bus_read_config_dword) from [<c0332920>] (pci_bus_read_dev_vendor_id+0x24/0xe8)
[    1.209270] [<c0332920>] (pci_bus_read_dev_vendor_id) from [<c033413c>] (pci_scan_single_device+0x40/0xb0)
[    1.218977] [<c033413c>] (pci_scan_single_device) from [<c0334204>] (pci_scan_slot+0x58/0x100)
[    1.227636] [<c0334204>] (pci_scan_slot) from [<c033511c>] (pci_scan_child_bus+0x20/0xf8)
[    1.235858] [<c033511c>] (pci_scan_child_bus) from [<c03353ec>] (pci_scan_root_bus_msi+0xcc/0xd8)
[    1.244779] [<c03353ec>] (pci_scan_root_bus_msi) from [<c0335410>] (pci_scan_root_bus+0x18/0x20)
[    1.253612] [<c0335410>] (pci_scan_root_bus) from [<c034bc5c>] (pci_host_common_probe+0xc8/0x314)
[    1.262533] [<c034bc5c>] (pci_host_common_probe) from [<c034c444>] (tango_pcie_probe+0x148/0x350)
[    1.271455] [<c034c444>] (tango_pcie_probe) from [<c038dbc8>] (platform_drv_probe+0x34/0x6c)
[    1.279939] [<c038dbc8>] (platform_drv_probe) from [<c038c5a8>] (really_probe+0x1c4/0x250)
[    1.288248] [<c038c5a8>] (really_probe) from [<c038c700>] (__driver_attach+0xcc/0xd0)
[    1.296121] [<c038c700>] (__driver_attach) from [<c038aa50>] (bus_for_each_dev+0x68/0x9c)
[    1.304342] [<c038aa50>] (bus_for_each_dev) from [<c038bfac>] (driver_attach+0x1c/0x20)
[    1.312389] [<c038bfac>] (driver_attach) from [<c038bb5c>] (bus_add_driver+0x108/0x214)
[    1.320436] [<c038bb5c>] (bus_add_driver) from [<c038ccb0>] (driver_register+0x78/0xf4)
[    1.328483] [<c038ccb0>] (driver_register) from [<c038db8c>] (__platform_driver_register+0x40/0x48)
[    1.337583] [<c038db8c>] (__platform_driver_register) from [<c0816fc4>] (tango_pcie_driver_init+0x14/0x18)
[    1.347291] [<c0816fc4>] (tango_pcie_driver_init) from [<c01017dc>] (do_one_initcall+0x44/0x174)
[    1.356128] [<c01017dc>] (do_one_initcall) from [<c0800dcc>] (kernel_init_freeable+0x154/0x1e0)
[    1.364875] [<c0800dcc>] (kernel_init_freeable) from [<c0515544>] (kernel_init+0x8/0x10c)
[    1.373097] [<c0515544>] (kernel_init) from [<c01077d0>] (ret_from_fork+0x14/0x24)
[    1.380799] Kernel panic - not syncing: STOP HERE FOR NOW
[    1.380799] 
[    1.387714] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G        W       4.9.23-1-rc4 #13
[    1.395495] Hardware name: Sigma Tango DT
[    1.399529] [<c010f600>] (unwind_backtrace) from [<c010b568>] (show_stack+0x10/0x14)
[    1.407317] [<c010b568>] (show_stack) from [<c0304830>] (dump_stack+0x98/0xc4)
[    1.414582] [<c0304830>] (dump_stack) from [<c01a3bac>] (panic+0xe0/0x258)
[    1.421494] [<c01a3bac>] (panic) from [<c034c07c>] (tango_check_pcie_link+0x0/0x48)
[    1.429192] [<c034c07c>] (tango_check_pcie_link) from [<c034bfec>] (smp8759_config_read+0x0/0x90)
[    1.438114] [<c034bfec>] (smp8759_config_read) from [<00241105>] (0x241105)
[    1.445191] CPU1: stopping
[    1.447910] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G        W       4.9.23-1-rc4 #13
[    1.455690] Hardware name: Sigma Tango DT
[    1.459724] [<c010f600>] (unwind_backtrace) from [<c010b568>] (show_stack+0x10/0x14)
[    1.467512] [<c010b568>] (show_stack) from [<c0304830>] (dump_stack+0x98/0xc4)
[    1.474774] [<c0304830>] (dump_stack) from [<c010e410>] (handle_IPI+0x19c/0x1b0)
[    1.482210] [<c010e410>] (handle_IPI) from [<c010151c>] (gic_handle_irq+0x88/0x8c)
[    1.489819] [<c010151c>] (gic_handle_irq) from [<c010c0b0>] (__irq_svc+0x70/0xb0)
[    1.497339] Exception stack(0xdf46ff80 to 0xdf46ffc8)
[    1.502415] ff80: 00000001 00000001 00000000 df468180 df46e000 c1004be4 c1004c48 00000002
[    1.510636] ffa0: c100f990 413fc090 00000000 00000000 00000000 df46ffd0 c0165ee0 c0108240
[    1.518853] ffc0: 20000113 ffffffff
[    1.522358] [<c010c0b0>] (__irq_svc) from [<c0108240>] (arch_cpu_idle+0x24/0x3c)
[    1.529795] [<c0108240>] (arch_cpu_idle) from [<c051b540>] (default_idle_call+0x20/0x30)
[    1.537934] [<c051b540>] (default_idle_call) from [<c015d6cc>] (cpu_startup_entry+0xd0/0x150)
[    1.546504] [<c015d6cc>] (cpu_startup_entry) from [<c010e048>] (secondary_start_kernel+0x158/0x164)
[    1.555599] [<c010e048>] (secondary_start_kernel) from [<801015ac>] (0x801015ac)
[    1.563063] ---[ end Kernel panic - not syncing: STOP HERE FOR NOW


The panic call stack looks suspicious.

smp8759_config_read() never calls tango_check_pcie_link().

(I compile with -fno-optimize-sibling-calls to get more accurate
call stacks.)

in_atomic_preempt_off is false before calling stop_machine()

I suppose I'm doing something wrong, but I'm not sure what yet.

Regards.
Russell King (Oracle) July 3, 2017, 6:11 p.m. UTC | #8
On Mon, Jul 03, 2017 at 08:40:31AM -0500, Bjorn Helgaas wrote:
> The problem is serializing vs. memory accesses, since they don't use
> any wrappers.  However, they are ioremapped(), so it's at least
> conceivable that another solution would be to use VM to trap those
> accesses.  I'm not a VM person, so I don't know whether that's
> feasible in Linux.

Bjorn,

You're forgetting that MMIO (iow, memory returned by ioremap()) must
be accessed through the appropriate accessors, and must not be
directly dereferenced in C.  (We do have buggy drivers that do that
but they are buggy, and in many cases are getting attention to fix
that.)

However, adding a spinlock into them is really not nice, because it
adds extra overhead that's only necessary for rare cases like Sigma
Designs - especially when you consider that these accessors are used
for all MMIO accesses, not just PCI.  It would effectively mean that
we end up serialising all MMIO accesses throughout the kernel when
Sigma Designs SoCs are enabled, destroying some of the SMP benefit.

I don't think we can sanely use the MMU to trap those accesses either,
that would mean sending IPIs to tell other CPUs to do something, and
waiting for them to respond - which can deadlock if we're already in
an IRQ-protected region (iirc, config accesses are made with IRQs
off.)

I don't think there's an easy solution to this problem - and I'm not
sure that stop_machine() can be made to work in this path (which
needs a process context).  I have a suspicion that the Sigma Designs
PCI implementation is just soo insane that it's never going to work
reliably in a multi-SoC kernel without introducing severe performance
issues for everyone else.
Ard Biesheuvel July 3, 2017, 6:44 p.m. UTC | #9
On 3 July 2017 at 19:11, Russell King - ARM Linux <linux@armlinux.org.uk> wrote:
> On Mon, Jul 03, 2017 at 08:40:31AM -0500, Bjorn Helgaas wrote:
>> The problem is serializing vs. memory accesses, since they don't use
>> any wrappers.  However, they are ioremapped(), so it's at least
>> conceivable that another solution would be to use VM to trap those
>> accesses.  I'm not a VM person, so I don't know whether that's
>> feasible in Linux.
>
> Bjorn,
>
> You're forgetting that MMIO (iow, memory returned by ioremap()) must
> be accessed through the appropriate accessors, and must not be
> directly dereferenced in C.  (We do have buggy drivers that do that
> but they are buggy, and in many cases are getting attention to fix
> that.)
>
> However, adding a spinlock into them is really not nice, because it
> adds extra overhead that's only necessary for rare cases like Sigma
> Designs - especially when you consider that these accessors are used
> for all MMIO accesses, not just PCI.  It would effectively mean that
> we end up serialising all MMIO accesses throughout the kernel when
> Sigma Designs SoCs are enabled, destroying some of the SMP benefit.
>
> I don't think we can sanely use the MMU to trap those accesses either,
> that would mean sending IPIs to tell other CPUs to do something, and
> waiting for them to respond - which can deadlock if we're already in
> an IRQ-protected region (iirc, config accesses are made with IRQs
> off.)
>
> I don't think there's an easy solution to this problem - and I'm not
> sure that stop_machine() can be made to work in this path (which
> needs a process context).  I have a suspicion that the Sigma Designs
> PCI implementation is just soo insane that it's never going to work
> reliably in a multi-SoC kernel without introducing severe performance
> issues for everyone else.
>

I suppose we could perhaps use per-cpu spinlocks? That would put the
complexity in the Sigma config space accessors, i.e., to take each
lock before proceeding with reprogramming the outbound window, and
other implementations wouldn't have to care. However, I do agree with
Russell that having this complexity in the first place is hard to
justify if the only implementation that requires it is a wacky design
that needs lots of other quirks to operate somewhat sanely to begin
with.
Jisheng Zhang July 4, 2017, 6:58 a.m. UTC | #10
On Mon, 3 Jul 2017 08:27:04 -0500 wrote:

> [+cc Jingoo, Joao]
> 
> On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:
> > On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:  
> > > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:  
> > >> This driver is required to work around several hardware bugs
> > >> in the PCIe controller.
> > >>
> > >> NB: Revision 1 does not support legacy interrupts, or IO space.  
> > >
> > > I had to apply these manually because of conflicts in Kconfig and
> > > Makefile.  What are these based on?  Easiest for me is if you base
> > > them on the current -rc1 tag.
> > >  
> > >> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> > >> ---
> > >>  drivers/pci/host/Kconfig      |   8 +++
> > >>  drivers/pci/host/Makefile     |   1 +
> > >>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
> > >>  include/linux/pci_ids.h       |   2 +
> > >>  4 files changed, 175 insertions(+)
> > >>  create mode 100644 drivers/pci/host/pcie-tango.c
> > >>  
> > [..]  
> > >> +     /*
> > >> +      * QUIRK #2
> > >> +      * Unfortunately, config and mem spaces are muxed.
> > >> +      * Linux does not support such a setting, since drivers are free
> > >> +      * to access mem space directly, at any time.
> > >> +      * Therefore, we can only PRAY that config and mem space accesses
> > >> +      * NEVER occur concurrently.
> > >> +      */
> > >> +     writel_relaxed(1, pcie->mux);
> > >> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
> > >> +     writel_relaxed(0, pcie->mux);  
> > >
> > > I'm very hesitant about this.  When people stress this, we're going to
> > > get reports of data corruption.  Even with the disclaimer below, I
> > > don't feel good about this.  Adding the driver is an implicit claim
> > > that we support the device, but we know it can't be made reliable.  
> > 
> > I noticed that the Synopsys driver suffers from a similar issue: in
> > dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
> > to perform a config space access, and switches it back to I/O space
> > afterwards (unless it has more than 2 viewports, in which case it uses
> > dedicated windows for I/O space and config space)  
> 
> That doesn't sound good.  Jingoo, Joao?  I remember some discussion
> about this, but not the details.
> 
> I/O accesses use wrappers (inb(), etc), so there's at least the
> possibility of a mutex to serialize them with respect to config
> accesses.
> 

IIRC, for 2 viewports, we don't need to worry about the config space
access, because config space access is serialized by pci_lock; We
do have race between config space and io space. But the accessing config
space and io space at the same time is rare. And the PCIe EPs which
has io space are rare too, supporting these EPs are not the potential
target of those platforms with 2 viewports.

Thanks,
Jisheng
Peter Zijlstra July 4, 2017, 7:09 a.m. UTC | #11
On Mon, Jul 03, 2017 at 05:30:28PM +0200, Marc Gonzalez wrote:

> And at the end of smp8759_config_read:
> 
> 	printk("in_atomic_preempt_off = %d\n", in_atomic_preempt_off());

That's confused...

> 	stop_machine(do_nothing, NULL, NULL);
> 	panic("STOP HERE FOR NOW\n");
> 
> The kernel outputs:
> 
> [    1.022725] in_atomic_preempt_off = 0
> [    1.026568] BUG: scheduling while atomic: swapper/0/1/0x00000002
> [    1.032625] 5 locks held by swapper/0/1:
> [    1.036575]  #0:  (&dev->mutex){......}, at: [<c038c684>] __driver_attach+0x50/0xd0
> [    1.044319]  #1:  (&dev->mutex){......}, at: [<c038c694>] __driver_attach+0x60/0xd0
> [    1.052050]  #2:  (pci_lock){+.+...}, at: [<c03309d8>] pci_bus_read_config_dword+0x44/0x94

This is a raw_spinlock_t, that disables preemption

> [    1.060398]  #3:  (cpu_hotplug.dep_map){++++++}, at: [<c0119db0>] get_online_cpus+0x2c/0xa0
> [    1.068843]  #4:  (stop_cpus_mutex){+.+...}, at: [<c01a1184>] stop_cpus+0x20/0x48

> [    1.076404] Modules linked in:
> [    1.079483] Preemption disabled at:[    1.082820] [<  (null)>]   (null)
> [    1.086165] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.23-1-rc4 #13
> [    1.092723] Hardware name: Sigma Tango DT
> [    1.096765] [<c010f600>] (unwind_backtrace) from [<c010b568>] (show_stack+0x10/0x14)
> [    1.104558] [<c010b568>] (show_stack) from [<c0304830>] (dump_stack+0x98/0xc4)
> [    1.111823] [<c0304830>] (dump_stack) from [<c013eee0>] (__schedule_bug+0x94/0xec)
> [    1.119436] [<c013eee0>] (__schedule_bug) from [<c0516364>] (__schedule+0x4a4/0x5f0)
> [    1.127220] [<c0516364>] (__schedule) from [<c05164fc>] (schedule+0x4c/0xac)


And here you try and schedule with preemption disabled.

> [    1.134308] [<c05164fc>] (schedule) from [<c051b010>] (schedule_timeout+0x1f4/0x30c)
> [    1.142093] [<c051b010>] (schedule_timeout) from [<c051702c>] (wait_for_common+0x8c/0x13c)
> [    1.150401] [<c051702c>] (wait_for_common) from [<c05170ec>] (wait_for_completion+0x10/0x14)
> [    1.158886] [<c05170ec>] (wait_for_completion) from [<c01a0d74>] (__stop_cpus+0x50/0x64)
> [    1.167021] [<c01a0d74>] (__stop_cpus) from [<c01a1194>] (stop_cpus+0x30/0x48)
> [    1.174282] [<c01a1194>] (stop_cpus) from [<c01a1230>] (stop_machine+0x84/0x118)
> [    1.181719] [<c01a1230>] (stop_machine) from [<c034c070>] (smp8759_config_read+0x84/0x90)
> [    1.189942] [<c034c070>] (smp8759_config_read) from [<c0330a00>] (pci_bus_read_config_dword+0x6c/0x94)
> [    1.199301] [<c0330a00>] (pci_bus_read_config_dword) from [<c0332920>] (pci_bus_read_dev_vendor_id+0x24/0xe8)
> [    1.209270] [<c0332920>] (pci_bus_read_dev_vendor_id) from [<c033413c>] (pci_scan_single_device+0x40/0xb0)
> [    1.218977] [<c033413c>] (pci_scan_single_device) from [<c0334204>] (pci_scan_slot+0x58/0x100)
> [    1.227636] [<c0334204>] (pci_scan_slot) from [<c033511c>] (pci_scan_child_bus+0x20/0xf8)
> [    1.235858] [<c033511c>] (pci_scan_child_bus) from [<c03353ec>] (pci_scan_root_bus_msi+0xcc/0xd8)
> [    1.244779] [<c03353ec>] (pci_scan_root_bus_msi) from [<c0335410>] (pci_scan_root_bus+0x18/0x20)
> [    1.253612] [<c0335410>] (pci_scan_root_bus) from [<c034bc5c>] (pci_host_common_probe+0xc8/0x314)
> [    1.262533] [<c034bc5c>] (pci_host_common_probe) from [<c034c444>] (tango_pcie_probe+0x148/0x350)
> [    1.271455] [<c034c444>] (tango_pcie_probe) from [<c038dbc8>] (platform_drv_probe+0x34/0x6c)

> The panic call stack looks suspicious.
> 
> smp8759_config_read() never calls tango_check_pcie_link().
> 
> (I compile with -fno-optimize-sibling-calls to get more accurate
> call stacks.)
> 
> in_atomic_preempt_off is false before calling stop_machine()

Yes, but that's a pointless statement.

> I suppose I'm doing something wrong, but I'm not sure what yet.

Using stop_machine() is per definition doing it wrong ;-) But see above.
Jisheng Zhang July 4, 2017, 7:16 a.m. UTC | #12
On Tue, 4 Jul 2017 14:58:40 +0800 Jisheng Zhang wrote:

> On Mon, 3 Jul 2017 08:27:04 -0500 wrote:
> 
> > [+cc Jingoo, Joao]
> > 
> > On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:  
> > > On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:    
> > > > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:    
> > > >> This driver is required to work around several hardware bugs
> > > >> in the PCIe controller.
> > > >>
> > > >> NB: Revision 1 does not support legacy interrupts, or IO space.    
> > > >
> > > > I had to apply these manually because of conflicts in Kconfig and
> > > > Makefile.  What are these based on?  Easiest for me is if you base
> > > > them on the current -rc1 tag.
> > > >    
> > > >> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> > > >> ---
> > > >>  drivers/pci/host/Kconfig      |   8 +++
> > > >>  drivers/pci/host/Makefile     |   1 +
> > > >>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
> > > >>  include/linux/pci_ids.h       |   2 +
> > > >>  4 files changed, 175 insertions(+)
> > > >>  create mode 100644 drivers/pci/host/pcie-tango.c
> > > >>    
> > > [..]    
> > > >> +     /*
> > > >> +      * QUIRK #2
> > > >> +      * Unfortunately, config and mem spaces are muxed.
> > > >> +      * Linux does not support such a setting, since drivers are free
> > > >> +      * to access mem space directly, at any time.
> > > >> +      * Therefore, we can only PRAY that config and mem space accesses
> > > >> +      * NEVER occur concurrently.
> > > >> +      */
> > > >> +     writel_relaxed(1, pcie->mux);
> > > >> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
> > > >> +     writel_relaxed(0, pcie->mux);    
> > > >
> > > > I'm very hesitant about this.  When people stress this, we're going to
> > > > get reports of data corruption.  Even with the disclaimer below, I
> > > > don't feel good about this.  Adding the driver is an implicit claim
> > > > that we support the device, but we know it can't be made reliable.    
> > > 
> > > I noticed that the Synopsys driver suffers from a similar issue: in
> > > dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
> > > to perform a config space access, and switches it back to I/O space
> > > afterwards (unless it has more than 2 viewports, in which case it uses
> > > dedicated windows for I/O space and config space)    
> > 
> > That doesn't sound good.  Jingoo, Joao?  I remember some discussion
> > about this, but not the details.
> > 
> > I/O accesses use wrappers (inb(), etc), so there's at least the
> > possibility of a mutex to serialize them with respect to config
> > accesses.
> >   
> 
> IIRC, for 2 viewports, we don't need to worry about the config space
> access, because config space access is serialized by pci_lock; We
> do have race between config space and io space. But the accessing config
> space and io space at the same time is rare. And the PCIe EPs which
> has io space are rare too, supporting these EPs are not the potential
> target of those platforms with 2 viewports.
> 

PS: I think most platforms choose 2 pcie designware viewports just because
it's the default setting. And I have send a feature request to ASIC people
to increase the viewports to 3 for future marvell berlin SoCs.
Ard Biesheuvel July 4, 2017, 8:02 a.m. UTC | #13
On 4 July 2017 at 07:58, Jisheng Zhang <jszhang@marvell.com> wrote:
> On Mon, 3 Jul 2017 08:27:04 -0500 wrote:
>
>> [+cc Jingoo, Joao]
>>
>> On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:
>> > On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:
>> > > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
>> > >> This driver is required to work around several hardware bugs
>> > >> in the PCIe controller.
>> > >>
>> > >> NB: Revision 1 does not support legacy interrupts, or IO space.
>> > >
>> > > I had to apply these manually because of conflicts in Kconfig and
>> > > Makefile.  What are these based on?  Easiest for me is if you base
>> > > them on the current -rc1 tag.
>> > >
>> > >> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>> > >> ---
>> > >>  drivers/pci/host/Kconfig      |   8 +++
>> > >>  drivers/pci/host/Makefile     |   1 +
>> > >>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
>> > >>  include/linux/pci_ids.h       |   2 +
>> > >>  4 files changed, 175 insertions(+)
>> > >>  create mode 100644 drivers/pci/host/pcie-tango.c
>> > >>
>> > [..]
>> > >> +     /*
>> > >> +      * QUIRK #2
>> > >> +      * Unfortunately, config and mem spaces are muxed.
>> > >> +      * Linux does not support such a setting, since drivers are free
>> > >> +      * to access mem space directly, at any time.
>> > >> +      * Therefore, we can only PRAY that config and mem space accesses
>> > >> +      * NEVER occur concurrently.
>> > >> +      */
>> > >> +     writel_relaxed(1, pcie->mux);
>> > >> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
>> > >> +     writel_relaxed(0, pcie->mux);
>> > >
>> > > I'm very hesitant about this.  When people stress this, we're going to
>> > > get reports of data corruption.  Even with the disclaimer below, I
>> > > don't feel good about this.  Adding the driver is an implicit claim
>> > > that we support the device, but we know it can't be made reliable.
>> >
>> > I noticed that the Synopsys driver suffers from a similar issue: in
>> > dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
>> > to perform a config space access, and switches it back to I/O space
>> > afterwards (unless it has more than 2 viewports, in which case it uses
>> > dedicated windows for I/O space and config space)
>>
>> That doesn't sound good.  Jingoo, Joao?  I remember some discussion
>> about this, but not the details.
>>
>> I/O accesses use wrappers (inb(), etc), so there's at least the
>> possibility of a mutex to serialize them with respect to config
>> accesses.
>>
>
> IIRC, for 2 viewports, we don't need to worry about the config space
> access, because config space access is serialized by pci_lock; We
> do have race between config space and io space. But the accessing config
> space and io space at the same time is rare.

Being 'rare' is not sufficient, unfortunately. In the current
situation, I/O space accesses may occur when the outbound window is
directed to the config space of a potentially completely unrelated
device. This is bad.

> And the PCIe EPs which
> has io space are rare too, supporting these EPs are not the potential
> target of those platforms with 2 viewports.
>

I am not sure EP mode is relevant here. What I do know is that boards
like the Marvell 8040 based MacchiatoBin uses this IP in RC mode, and
exposes config, MMIO and IO space windows using only 2 viewports. Note
that this is essentially a bug in the DT description, given that its
version of the IP supports 8 viewports. But the driver needs to be
fixed as well.
Jisheng Zhang July 4, 2017, 8:19 a.m. UTC | #14
On Tue, 4 Jul 2017 09:02:06 +0100 Ard Biesheuvel wrote:

> On 4 July 2017 at 07:58, Jisheng Zhang <jszhang@marvell.com> wrote:
> > On Mon, 3 Jul 2017 08:27:04 -0500 wrote:
> >  
> >> [+cc Jingoo, Joao]
> >>
> >> On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:  
> >> > On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:  
> >> > > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:  
> >> > >> This driver is required to work around several hardware bugs
> >> > >> in the PCIe controller.
> >> > >>
> >> > >> NB: Revision 1 does not support legacy interrupts, or IO space.  
> >> > >
> >> > > I had to apply these manually because of conflicts in Kconfig and
> >> > > Makefile.  What are these based on?  Easiest for me is if you base
> >> > > them on the current -rc1 tag.
> >> > >  
> >> > >> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
> >> > >> ---
> >> > >>  drivers/pci/host/Kconfig      |   8 +++
> >> > >>  drivers/pci/host/Makefile     |   1 +
> >> > >>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
> >> > >>  include/linux/pci_ids.h       |   2 +
> >> > >>  4 files changed, 175 insertions(+)
> >> > >>  create mode 100644 drivers/pci/host/pcie-tango.c
> >> > >>  
> >> > [..]  
> >> > >> +     /*
> >> > >> +      * QUIRK #2
> >> > >> +      * Unfortunately, config and mem spaces are muxed.
> >> > >> +      * Linux does not support such a setting, since drivers are free
> >> > >> +      * to access mem space directly, at any time.
> >> > >> +      * Therefore, we can only PRAY that config and mem space accesses
> >> > >> +      * NEVER occur concurrently.
> >> > >> +      */
> >> > >> +     writel_relaxed(1, pcie->mux);
> >> > >> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
> >> > >> +     writel_relaxed(0, pcie->mux);  
> >> > >
> >> > > I'm very hesitant about this.  When people stress this, we're going to
> >> > > get reports of data corruption.  Even with the disclaimer below, I
> >> > > don't feel good about this.  Adding the driver is an implicit claim
> >> > > that we support the device, but we know it can't be made reliable.  
> >> >
> >> > I noticed that the Synopsys driver suffers from a similar issue: in
> >> > dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
> >> > to perform a config space access, and switches it back to I/O space
> >> > afterwards (unless it has more than 2 viewports, in which case it uses
> >> > dedicated windows for I/O space and config space)  
> >>
> >> That doesn't sound good.  Jingoo, Joao?  I remember some discussion
> >> about this, but not the details.
> >>
> >> I/O accesses use wrappers (inb(), etc), so there's at least the
> >> possibility of a mutex to serialize them with respect to config
> >> accesses.
> >>  
> >
> > IIRC, for 2 viewports, we don't need to worry about the config space
> > access, because config space access is serialized by pci_lock; We
> > do have race between config space and io space. But the accessing config
> > space and io space at the same time is rare.  
> 
> Being 'rare' is not sufficient, unfortunately. In the current
> situation, I/O space accesses may occur when the outbound window is
> directed to the config space of a potentially completely unrelated
> device. This is bad.

Yep, I agree with you.

> 
> > And the PCIe EPs which
> > has io space are rare too, supporting these EPs are not the potential
> > target of those platforms with 2 viewports.
> >  
> 
> I am not sure EP mode is relevant here. What I do know is that boards

I mean those PCIe EP cards which have IO space, but that doesn't matter.

> like the Marvell 8040 based MacchiatoBin uses this IP in RC mode, and
> exposes config, MMIO and IO space windows using only 2 viewports. Note
> that this is essentially a bug in the DT description, given that its
> version of the IP supports 8 viewports. But the driver needs to be
> fixed as well.

To fix for 2 viewports situation, we need to serialize access of the io
and config space. In internal repo, we can achieve it by modifying the
io access helper functions such as inl/outl, but this won't be accepted
by the mainline IMHO. Except fixing the HW, any elegant solution?

Suggestions are appreciated.

Thanks,
Jisheng
Ard Biesheuvel July 4, 2017, 9:38 a.m. UTC | #15
On 4 July 2017 at 09:19, Jisheng Zhang <jszhang@marvell.com> wrote:
> On Tue, 4 Jul 2017 09:02:06 +0100 Ard Biesheuvel wrote:
>
>> On 4 July 2017 at 07:58, Jisheng Zhang <jszhang@marvell.com> wrote:
>> > On Mon, 3 Jul 2017 08:27:04 -0500 wrote:
>> >
>> >> [+cc Jingoo, Joao]
>> >>
>> >> On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:
>> >> > On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:
>> >> > > On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
>> >> > >> This driver is required to work around several hardware bugs
>> >> > >> in the PCIe controller.
>> >> > >>
>> >> > >> NB: Revision 1 does not support legacy interrupts, or IO space.
>> >> > >
>> >> > > I had to apply these manually because of conflicts in Kconfig and
>> >> > > Makefile.  What are these based on?  Easiest for me is if you base
>> >> > > them on the current -rc1 tag.
>> >> > >
>> >> > >> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>> >> > >> ---
>> >> > >>  drivers/pci/host/Kconfig      |   8 +++
>> >> > >>  drivers/pci/host/Makefile     |   1 +
>> >> > >>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
>> >> > >>  include/linux/pci_ids.h       |   2 +
>> >> > >>  4 files changed, 175 insertions(+)
>> >> > >>  create mode 100644 drivers/pci/host/pcie-tango.c
>> >> > >>
>> >> > [..]
>> >> > >> +     /*
>> >> > >> +      * QUIRK #2
>> >> > >> +      * Unfortunately, config and mem spaces are muxed.
>> >> > >> +      * Linux does not support such a setting, since drivers are free
>> >> > >> +      * to access mem space directly, at any time.
>> >> > >> +      * Therefore, we can only PRAY that config and mem space accesses
>> >> > >> +      * NEVER occur concurrently.
>> >> > >> +      */
>> >> > >> +     writel_relaxed(1, pcie->mux);
>> >> > >> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
>> >> > >> +     writel_relaxed(0, pcie->mux);
>> >> > >
>> >> > > I'm very hesitant about this.  When people stress this, we're going to
>> >> > > get reports of data corruption.  Even with the disclaimer below, I
>> >> > > don't feel good about this.  Adding the driver is an implicit claim
>> >> > > that we support the device, but we know it can't be made reliable.
>> >> >
>> >> > I noticed that the Synopsys driver suffers from a similar issue: in
>> >> > dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
>> >> > to perform a config space access, and switches it back to I/O space
>> >> > afterwards (unless it has more than 2 viewports, in which case it uses
>> >> > dedicated windows for I/O space and config space)
>> >>
>> >> That doesn't sound good.  Jingoo, Joao?  I remember some discussion
>> >> about this, but not the details.
>> >>
>> >> I/O accesses use wrappers (inb(), etc), so there's at least the
>> >> possibility of a mutex to serialize them with respect to config
>> >> accesses.
>> >>
>> >
>> > IIRC, for 2 viewports, we don't need to worry about the config space
>> > access, because config space access is serialized by pci_lock; We
>> > do have race between config space and io space. But the accessing config
>> > space and io space at the same time is rare.
>>
>> Being 'rare' is not sufficient, unfortunately. In the current
>> situation, I/O space accesses may occur when the outbound window is
>> directed to the config space of a potentially completely unrelated
>> device. This is bad.
>
> Yep, I agree with you.
>
>>
>> > And the PCIe EPs which
>> > has io space are rare too, supporting these EPs are not the potential
>> > target of those platforms with 2 viewports.
>> >
>>
>> I am not sure EP mode is relevant here. What I do know is that boards
>
> I mean those PCIe EP cards which have IO space, but that doesn't matter.
>

Ah, ok. But if such EP cards are so rare, why expose I/O space at all
if we cannot do it safely?

>> like the Marvell 8040 based MacchiatoBin uses this IP in RC mode, and
>> exposes config, MMIO and IO space windows using only 2 viewports. Note
>> that this is essentially a bug in the DT description, given that its
>> version of the IP supports 8 viewports. But the driver needs to be
>> fixed as well.
>
> To fix for 2 viewports situation, we need to serialize access of the io
> and config space. In internal repo, we can achieve it by modifying the
> io access helper functions such as inl/outl, but this won't be accepted
> by the mainline IMHO. Except fixing the HW, any elegant solution?
>
> Suggestions are appreciated.
>

I think the safe and upstreamable approach is to disable the I/O
window completely if num-viewports <= 2.
Mason July 4, 2017, 1:08 p.m. UTC | #16
On 04/07/2017 09:09, Peter Zijlstra wrote:

> On Mon, Jul 03, 2017 at 05:30:28PM +0200, Marc Gonzalez wrote:
> 
>> And at the end of smp8759_config_read:
>>
>> 	printk("in_atomic_preempt_off = %d\n", in_atomic_preempt_off());
> 
> That's confused...

That much is certain. I am indeed grasping at straws.

I grepped "scheduling while atomic", found __schedule_bug()
in kernel/sched/core.c and saw

	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) {
		pr_err("Preemption disabled at:");
		print_ip_sym(preempt_disable_ip);
		pr_cont("\n");
	}

I thought printing the value of in_atomic_preempt_off()
in the callback would indicate whether preemption had
already been turned off at that point.

It doesn't work like that?

BTW, why didn't print_ip_sym(preempt_disable_ip); say
where preemption had been disabled?


>> [    1.026568] BUG: scheduling while atomic: swapper/0/1/0x00000002
>> [    1.032625] 5 locks held by swapper/0/1:
>> [    1.036575]  #0:  (&dev->mutex){......}, at: [<c038c684>] __driver_attach+0x50/0xd0
>> [    1.044319]  #1:  (&dev->mutex){......}, at: [<c038c694>] __driver_attach+0x60/0xd0
>> [    1.052050]  #2:  (pci_lock){+.+...}, at: [<c03309d8>] pci_bus_read_config_dword+0x44/0x94
> 
> This is a raw_spinlock_t, that disables preemption

drivers/pci/access.c

/*
 * This interrupt-safe spinlock protects all accesses to PCI
 * configuration space.
 */
DEFINE_RAW_SPINLOCK(pci_lock);


	raw_spin_lock_irqsave(&pci_lock, flags);
	res = bus->ops->read(bus, devfn, pos, len, &data);


IIUC, it's not possible to call stop_machine() while holding
a raw spinlock? What about regular spinlocks? IIUC, in RT,
regular spinlocks may sleep?

I didn't find "preempt" or "schedul" in the spinlock doc.
https://www.kernel.org/doc/Documentation/locking/spinlocks.txt


> Using stop_machine() is per definition doing it wrong ;-)

Here's the high-level view. My HW is borked and muxes
config space and mem space. So I need a way to freeze
the entire system, make the config space access, and
then return the system to normal. (AFAICT, config space
accesses are rare, so if I kill performance for these
accesses, the system might remain usable.)

Is there a way to do this? Mark suggested stop_machine
but it seems using it in my situation is not quite
straight-forward.

Regards.
Peter Zijlstra July 4, 2017, 2:27 p.m. UTC | #17
On Tue, Jul 04, 2017 at 03:08:43PM +0200, Mason wrote:
> On 04/07/2017 09:09, Peter Zijlstra wrote:
> 
> > On Mon, Jul 03, 2017 at 05:30:28PM +0200, Marc Gonzalez wrote:
> > 
> >> And at the end of smp8759_config_read:
> >>
> >> 	printk("in_atomic_preempt_off = %d\n", in_atomic_preempt_off());
> > 
> > That's confused...
> 
> That much is certain. I am indeed grasping at straws.
> 
> I grepped "scheduling while atomic", found __schedule_bug()
> in kernel/sched/core.c and saw
> 
> 	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) {
> 		pr_err("Preemption disabled at:");
> 		print_ip_sym(preempt_disable_ip);
> 		pr_cont("\n");
> 	}
> 
> I thought printing the value of in_atomic_preempt_off()
> in the callback would indicate whether preemption had
> already been turned off at that point.

in_atomic_preempt_off() is a 'weird' construct. It basically checks to
see if preempt_count != 1. So calling it while holding a single preempt,
will return false (like in your case).

> It doesn't work like that?

Not really, you want to just print preempt_count.

> BTW, why didn't print_ip_sym(preempt_disable_ip); say
> where preemption had been disabled?

It does, but it might be non-obvious. We only store the first 0->!0
transition IP in there.

So if you have chains like:

	preempt_disable();		// 0 -> 1
	spin_lock();			// 1 -> 2
	preempt_enable();		// 2 -> 1
	preempt_disable();		// 1 -> 2
	spin_unlock();			// 2 -> 1

	mutex_lock();
	  might_sleep();		*SPLAT* report the IP of 0 -> 1

	preempt_enable():		// 1 -> 0


That IP might not even be part of the current callchain.

> >> [    1.026568] BUG: scheduling while atomic: swapper/0/1/0x00000002
> >> [    1.032625] 5 locks held by swapper/0/1:
> >> [    1.036575]  #0:  (&dev->mutex){......}, at: [<c038c684>] __driver_attach+0x50/0xd0
> >> [    1.044319]  #1:  (&dev->mutex){......}, at: [<c038c694>] __driver_attach+0x60/0xd0
> >> [    1.052050]  #2:  (pci_lock){+.+...}, at: [<c03309d8>] pci_bus_read_config_dword+0x44/0x94
> > 
> > This is a raw_spinlock_t, that disables preemption
> 
> drivers/pci/access.c
> 
> /*
>  * This interrupt-safe spinlock protects all accesses to PCI
>  * configuration space.
>  */
> DEFINE_RAW_SPINLOCK(pci_lock);
> 
> 
> 	raw_spin_lock_irqsave(&pci_lock, flags);
> 	res = bus->ops->read(bus, devfn, pos, len, &data);
> 
> 
> IIUC, it's not possible to call stop_machine() while holding
> a raw spinlock?

Correct. You cannot call blocking primitives while holding a spinlock.

> What about regular spinlocks? IIUC, in RT,
> regular spinlocks may sleep?

Still not, your code should not depend on CONFIG_PREEMPT*.

> I didn't find "preempt" or "schedul" in the spinlock doc.
> https://www.kernel.org/doc/Documentation/locking/spinlocks.txt

Correct... as per usual the documentation is somewhat terse.

But once you think about it, its 'obvious' a spinlock needs to disable
preemption. If you don't you get into heaps of trouble (one of the
reasons userspace spinlocks are an absolute trainwreck).

> > Using stop_machine() is per definition doing it wrong ;-)
> 
> Here's the high-level view. My HW is borked and muxes
> config space and mem space. So I need a way to freeze
> the entire system, make the config space access, and
> then return the system to normal. (AFAICT, config space
> accesses are rare, so if I kill performance for these
> accesses, the system might remain usable.)
> 
> Is there a way to do this? Mark suggested stop_machine
> but it seems using it in my situation is not quite
> straight-forward.

*groan*... so yeah, broken hardware demands crazy stuff... stop machine
is tricky here because I'm not sure we can demand all PCI accessors to
allow sleeping.

And given that PCI lock is irqsave, we can't even assume IRQs are
enabled.

Does your platform have NMIs? If so, you can do yuck things like the
kgdb sync. NMI IPI all other CPUs and have them spin-wait on your state.

Then be careful not to deadlock when two CPUs do that concurrently.
Bjorn Helgaas July 4, 2017, 3:15 p.m. UTC | #18
On Mon, Jul 03, 2017 at 07:11:28PM +0100, Russell King - ARM Linux wrote:
> On Mon, Jul 03, 2017 at 08:40:31AM -0500, Bjorn Helgaas wrote:
> > The problem is serializing vs. memory accesses, since they don't use
> > any wrappers.  However, they are ioremapped(), so it's at least
> > conceivable that another solution would be to use VM to trap those
> > accesses.  I'm not a VM person, so I don't know whether that's
> > feasible in Linux.
> 
> Bjorn,
> 
> You're forgetting that MMIO (iow, memory returned by ioremap()) must
> be accessed through the appropriate accessors, and must not be
> directly dereferenced in C.  (We do have buggy drivers that do that
> but they are buggy, and in many cases are getting attention to fix
> that.)

Oh, you're right, thank you!  I guess you're referring to readb()
and friends.  I haven't found an actual prohibition on directly
dereferencing addresses returned from ioremap(), but
Documentation/driver-api/device-io.rst is clear that they're
suitable for passing to readb(), etc.

I recently told someone else my mistaken idea that ioremap() must
return a valid virtual address.  I wish I remembered who it was, so I
could correct that.  Documentation/DMA-API-HOWTO.txt also suggests
that ioremap() returns a virtual address -- I think I wrote that, and
maybe that virtual address reference should be tweaked a bit.

Another wrinkle is that the pci_mmap_resource() interface is exposed
via sysfs and allows direct userspace mmap of PCI MMIO resources.  In
that case, there is no accessor available.  I wonder if we need some
way to disable this mmap when readb() is non-trivial.

Bjorn
Mason July 4, 2017, 3:18 p.m. UTC | #19
On 04/07/2017 16:27, Peter Zijlstra wrote:

> Mason wrote:
>
>> 	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) {
>> 		pr_err("Preemption disabled at:");
>> 		print_ip_sym(preempt_disable_ip);
>> 		pr_cont("\n");
>> 	}
>>
>> BTW, why didn't print_ip_sym(preempt_disable_ip); say
>> where preemption had been disabled?
> 
> It does, but it might be non-obvious. We only store the first 0->!0
> transition IP in there.

The output was:

[    1.079483] Preemption disabled at:[    1.082820] [<  (null)>]   (null)

so preempt_disable_ip was NULL, right?

Has it been already clobbered?


>> Here's the high-level view. My HW is borked and muxes
>> config space and mem space. So I need a way to freeze
>> the entire system, make the config space access, and
>> then return the system to normal. (AFAICT, config space
>> accesses are rare, so if I kill performance for these
>> accesses, the system might remain usable.)
>>
>> Is there a way to do this? Mark suggested stop_machine
>> but it seems using it in my situation is not quite
>> straight-forward.
> 
> *groan*... so yeah, broken hardware demands crazy stuff... stop machine
> is tricky here because I'm not sure we can demand all PCI accessors to
> allow sleeping.
> 
> And given that PCI lock is irqsave, we can't even assume IRQs are
> enabled.
> 
> Does your platform have NMIs? If so, you can do yuck things like the
> kgdb sync. NMI IPI all other CPUs and have them spin-wait on your state.
> 
> Then be careful not to deadlock when two CPUs do that concurrently.

My platform is arch/arm/mach-tango (Cortex A9, ARMv7-A)

This looks similar to what you described:
https://www.linaro.org/blog/core-dump/debugging-arm-kernels-using-nmifiq/
(I CCed Daniel Thompson)

Quote article:

> Note: On ARMv7-A devices that have security extensions (TrustZone)
> FIQ can only be used by the kernel if it is possible to run Linux in
> secure mode. It is therefore not possible to exploit FIQ for
> debugging and run a secure monitor simultaneously. At the end of this
> blog post we will discuss potential future work to mitigate this
> problem.

On my platform, Linux runs in non-secure mode...

Sounds like I don't have many options left for this driver :-(

Regards.
Bjorn Helgaas July 4, 2017, 3:58 p.m. UTC | #20
On Mon, Jul 03, 2017 at 04:34:29PM +0200, Marc Gonzalez wrote:
> Bjorn Helgaas wrote:
> 
> > Marc Gonzalez wrote:
> >
> >> On 03/07/2017 01:18, Bjorn Helgaas wrote:
> >>
> >>> On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
> >>>
> >>>> +static int tango_check_pcie_link(void __iomem *test_out)
> >>>
> >>> I think this is checking for link up.  Rename to tango_pcie_link_up()
> >>> to follow the convention of other drivers.  Take a struct tango_pcie *
> >>> instead of an address, if possible.
> >>
> >> Anything's possible. NB: if I pass the struct, then I have to store
> >> the address in the struct, which isn't the case now, since I never
> >> need the address later. If you don't mind adding an unnecessary
> >> field to the struct, I can do it. What do you say?
> > 
> > The benefit of following the same formula as other drivers is pretty
> > large.  Most drivers save the equivalent of "base" in the struct.  If
> > you did that, you wouldn't need an extra pointer; you would just use
> > "base + SMP8759_MUX" in the config accessors and "base + SMP8759_TEST_OUT"
> > in tango_pcie_link_up().
> 
> The problem is that TEST_OUT is at 0x74 on SMP8759, but at 0x138
> on my other chip. In fact, all registers have been "reshuffled",
> and none have the same offsets on the two chips.
> 
> My solution was to define specific registers in the struct.
> 
> In my [RFC PATCH v0.2] posted March 23, I tried illustrating
> the issue:
> 
> +static const struct of_device_id tango_pcie_ids[] = {
> +	{ .compatible = "sigma,smp8759-pcie" },
> +	{ .compatible = "sigma,rev2-pcie" },
> +	{ /* sentinel */ },
> +};
> +
> +static void smp8759_init(struct tango_pcie *pcie, void __iomem *base)
> +{
> +	pcie->mux		= base + 0x48;
> +	pcie->msi_status	= base + 0x80;
> +	pcie->msi_mask		= base + 0xa0;
> +	pcie->msi_doorbell	= 0xa0000000 + 0x2e07c;
> +}
> +
> +static void rev2_init(struct tango_pcie *pcie, void __iomem *base)
> +{
> +	void __iomem *misc_irq	= base + 0x40;
> +	void __iomem *doorbell	= base + 0x8c;
> +
> +	pcie->mux		= base + 0x2c;
> +	pcie->msi_status	= base + 0x4c;
> +	pcie->msi_mask		= base + 0x6c;
> +	pcie->msi_doorbell	= 0x80000000;
> +
> +	writel(lower_32_bits(pcie->msi_doorbell), doorbell + 0);
> +	writel(upper_32_bits(pcie->msi_doorbell), doorbell + 4);
> +
> +	/* Enable legacy PCI interrupts */
> +	writel(BIT(15), misc_irq);
> +	writel(0xf << 4, misc_irq + 4);
> +}
> 
> 
> Do you agree that the 'base + OFFSET' idiom does not work in
> this specific situation? Would you handle it differently?

It's definitely a hassle to support chips with different register
layouts.  Your hardware guys are really making your life hard :)

drivers/pci/host/pcie-iproc.c is one strategy.  It has
iproc_pcie_reg_paxb[] and iproc_pcie_reg_paxb_v2[] with register
offsets for different chip versions.

It saves a table of register offsets per controller, which is similar
to saving a set of pointers per controller.  Saving the pointers as
you suggest above is marginally more storage but probably easier to
read.

If the chips are fundamentally different, i.e., if they *operate*
differently in addition to having a different register layout, you
could make two separate drivers.

Bjorn
Russell King (Oracle) July 4, 2017, 6:17 p.m. UTC | #21
On Tue, Jul 04, 2017 at 10:15:02AM -0500, Bjorn Helgaas wrote:
> On Mon, Jul 03, 2017 at 07:11:28PM +0100, Russell King - ARM Linux wrote:
> > On Mon, Jul 03, 2017 at 08:40:31AM -0500, Bjorn Helgaas wrote:
> > > The problem is serializing vs. memory accesses, since they don't use
> > > any wrappers.  However, they are ioremapped(), so it's at least
> > > conceivable that another solution would be to use VM to trap those
> > > accesses.  I'm not a VM person, so I don't know whether that's
> > > feasible in Linux.
> > 
> > Bjorn,
> > 
> > You're forgetting that MMIO (iow, memory returned by ioremap()) must
> > be accessed through the appropriate accessors, and must not be
> > directly dereferenced in C.  (We do have buggy drivers that do that
> > but they are buggy, and in many cases are getting attention to fix
> > that.)
> 
> Oh, you're right, thank you!  I guess you're referring to readb()
> and friends.  I haven't found an actual prohibition on directly
> dereferencing addresses returned from ioremap(), but
> Documentation/driver-api/device-io.rst is clear that they're
> suitable for passing to readb(), etc.

There was a strong suggestion years ago that what is returned from
ioremap() is a cookie that must not be dereferenced by drivers, and
that there was a suggestion that having ioremap() return the virtual
address with an offset (which read*() and friends would undo) would
be a good idea.  However, even back then, we had some cases where
drivers would directly dereference the pointer.  We have sparse today
which helps point these places out (provided drivers stay away from
__force, but unfortunately, I think we've ended up with people who
think that silencing sparse warnings with __force is more preferable
than leaving them there to point out where things are actually wrong.)

So, imho, unfortunately sparse has lost its usefulness in this regard.

> I recently told someone else my mistaken idea that ioremap() must
> return a valid virtual address.  I wish I remembered who it was, so I
> could correct that.  Documentation/DMA-API-HOWTO.txt also suggests
> that ioremap() returns a virtual address -- I think I wrote that, and
> maybe that virtual address reference should be tweaked a bit.

For most implementations, ioremap() does indeed return a virtual address,
but that was never how the API was defined in the first place - it was
always referred to as returning a cookie.

> Another wrinkle is that the pci_mmap_resource() interface is exposed
> via sysfs and allows direct userspace mmap of PCI MMIO resources.  In
> that case, there is no accessor available.  I wonder if we need some
> way to disable this mmap when readb() is non-trivial.

Hmm, no comment, except that while the PCI MMIO space is available to
userspace, and userspace is capable of running that thread on any CPU,
PCI MMIO space can't be switched to config space.

That's another nail in this coffin...
Mason July 4, 2017, 11:42 p.m. UTC | #22
On 04/07/2017 17:58, Bjorn Helgaas wrote:

> It's definitely a hassle to support chips with different register
> layouts.  Your hardware guys are really making your life hard :)

Now where did I put my foam bat...

> If the chips are fundamentally different, i.e., if they *operate*
> differently in addition to having a different register layout, you
> could make two separate drivers.

It's the exact same underlying IP. Revision 2 is only a
bug fix rev. IIUC, some of the fixes lead to adding a
register here, removing a register there... and I don't
think the HW dev ever considered the pain of supporting
both revs within a single driver.

This dual support explains some of the peculiarities
you noted in my submission.

Regards.
Mason July 4, 2017, 11:59 p.m. UTC | #23
On 03/07/2017 20:11, Russell King - ARM Linux wrote:

> I don't think there's an easy solution to this problem - and I'm not
> sure that stop_machine() can be made to work in this path (which
> needs a process context).  I have a suspicion that the Sigma Designs
> PCI implementation is just soo insane that it's never going to work
> reliably in a multi-SoC kernel without introducing severe performance
> issues for everyone else.

If I remember correctly, this is the second HW block from
tango that has been deemed "too insane for Linux".

The first one was the DMA engine, which doesn't interrupt
when a transfer is done, but when a new transfer may be
programmed. (Though there is a simple work-around for
this one, if we give up command pipelining.)

Do larger SoC vendors have HW devs working closely with
Linux devs, to avoid these design bloopers?

Regards.
Greg Kroah-Hartman July 5, 2017, 5:21 a.m. UTC | #24
On Wed, Jul 05, 2017 at 01:59:55AM +0200, Mason wrote:
> Do larger SoC vendors have HW devs working closely with
> Linux devs, to avoid these design bloopers?

Yes they generally do, as they have learned from their prior mistakes :)

good luck!

greg k-h
Mark Brown July 5, 2017, 12:33 p.m. UTC | #25
On Wed, Jul 05, 2017 at 01:59:55AM +0200, Mason wrote:

> Do larger SoC vendors have HW devs working closely with
> Linux devs, to avoid these design bloopers?

Yes, or just internal software teams in general - hardware designs tend
to be a lot more usable if there's good dialogue with users about what's
useful and what isn't.  It's going to happen at some point anyway when
people can't get the chip working well.
Joao Pinto July 5, 2017, 1:53 p.m. UTC | #26
Hi to all,

Às 10:38 AM de 7/4/2017, Ard Biesheuvel escreveu:
> On 4 July 2017 at 09:19, Jisheng Zhang <jszhang@marvell.com> wrote:
>> On Tue, 4 Jul 2017 09:02:06 +0100 Ard Biesheuvel wrote:
>>
>>> On 4 July 2017 at 07:58, Jisheng Zhang <jszhang@marvell.com> wrote:
>>>> On Mon, 3 Jul 2017 08:27:04 -0500 wrote:
>>>>
>>>>> [+cc Jingoo, Joao]
>>>>>
>>>>> On Mon, Jul 03, 2017 at 10:35:50AM +0100, Ard Biesheuvel wrote:
>>>>>> On 3 July 2017 at 00:18, Bjorn Helgaas <helgaas@kernel.org> wrote:
>>>>>>> On Tue, Jun 20, 2017 at 10:17:40AM +0200, Marc Gonzalez wrote:
>>>>>>>> This driver is required to work around several hardware bugs
>>>>>>>> in the PCIe controller.
>>>>>>>>
>>>>>>>> NB: Revision 1 does not support legacy interrupts, or IO space.
>>>>>>>
>>>>>>> I had to apply these manually because of conflicts in Kconfig and
>>>>>>> Makefile.  What are these based on?  Easiest for me is if you base
>>>>>>> them on the current -rc1 tag.
>>>>>>>
>>>>>>>> Signed-off-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
>>>>>>>> ---
>>>>>>>>  drivers/pci/host/Kconfig      |   8 +++
>>>>>>>>  drivers/pci/host/Makefile     |   1 +
>>>>>>>>  drivers/pci/host/pcie-tango.c | 164 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>  include/linux/pci_ids.h       |   2 +
>>>>>>>>  4 files changed, 175 insertions(+)
>>>>>>>>  create mode 100644 drivers/pci/host/pcie-tango.c
>>>>>>>>
>>>>>> [..]
>>>>>>>> +     /*
>>>>>>>> +      * QUIRK #2
>>>>>>>> +      * Unfortunately, config and mem spaces are muxed.
>>>>>>>> +      * Linux does not support such a setting, since drivers are free
>>>>>>>> +      * to access mem space directly, at any time.
>>>>>>>> +      * Therefore, we can only PRAY that config and mem space accesses
>>>>>>>> +      * NEVER occur concurrently.
>>>>>>>> +      */
>>>>>>>> +     writel_relaxed(1, pcie->mux);
>>>>>>>> +     ret = pci_generic_config_read(bus, devfn, where, size, val);
>>>>>>>> +     writel_relaxed(0, pcie->mux);
>>>>>>>
>>>>>>> I'm very hesitant about this.  When people stress this, we're going to
>>>>>>> get reports of data corruption.  Even with the disclaimer below, I
>>>>>>> don't feel good about this.  Adding the driver is an implicit claim
>>>>>>> that we support the device, but we know it can't be made reliable.
>>>>>>
>>>>>> I noticed that the Synopsys driver suffers from a similar issue: in
>>>>>> dw_pcie_rd_other_conf(), it happily reprograms the outbound I/O window
>>>>>> to perform a config space access, and switches it back to I/O space
>>>>>> afterwards (unless it has more than 2 viewports, in which case it uses
>>>>>> dedicated windows for I/O space and config space)
>>>>>
>>>>> That doesn't sound good.  Jingoo, Joao?  I remember some discussion
>>>>> about this, but not the details.
>>>>>
>>>>> I/O accesses use wrappers (inb(), etc), so there's at least the
>>>>> possibility of a mutex to serialize them with respect to config
>>>>> accesses.
>>>>>
>>>>
>>>> IIRC, for 2 viewports, we don't need to worry about the config space
>>>> access, because config space access is serialized by pci_lock; We
>>>> do have race between config space and io space. But the accessing config
>>>> space and io space at the same time is rare.
>>>
>>> Being 'rare' is not sufficient, unfortunately. In the current
>>> situation, I/O space accesses may occur when the outbound window is
>>> directed to the config space of a potentially completely unrelated
>>> device. This is bad.
>>
>> Yep, I agree with you.
>>
>>>
>>>> And the PCIe EPs which
>>>> has io space are rare too, supporting these EPs are not the potential
>>>> target of those platforms with 2 viewports.
>>>>
>>>
>>> I am not sure EP mode is relevant here. What I do know is that boards
>>
>> I mean those PCIe EP cards which have IO space, but that doesn't matter.
>>
> 
> Ah, ok. But if such EP cards are so rare, why expose I/O space at all
> if we cannot do it safely?
> 
>>> like the Marvell 8040 based MacchiatoBin uses this IP in RC mode, and
>>> exposes config, MMIO and IO space windows using only 2 viewports. Note
>>> that this is essentially a bug in the DT description, given that its
>>> version of the IP supports 8 viewports. But the driver needs to be
>>> fixed as well.
>>
>> To fix for 2 viewports situation, we need to serialize access of the io
>> and config space. In internal repo, we can achieve it by modifying the
>> io access helper functions such as inl/outl, but this won't be accepted
>> by the mainline IMHO. Except fixing the HW, any elegant solution?
>>
>> Suggestions are appreciated.
>>
> 
> I think the safe and upstreamable approach is to disable the I/O
> window completely if num-viewports <= 2.
> 

I think that the critical case is when we are using a single ATU region, and
that has to managed by software. The hardware is not capable of managing racing
conditions, like programming the ATU while still transfering data, so problems
can ocurre for sure. In this case we should be able to add to the driver a
simple lock to signal that the ATU is being used. If the ATU was being used we
could make a spinlock for example. Would this acceptable in your opinion? The
problem is when we are transfering data. Are we able to know that data is being
processed?

When using a single region, when we access the config mem, we have to stop all
data transfers, reprogram the ATU to handle the config access, do the config
read/write, put the region "back" and restart the data transfer.

When using 2 or more regions, we can separate the scopes and the data transfers
does not need to be stopped. The race problem is harder to ocurre.

Thanks,
Joao
diff mbox

Patch

diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index d7e7c0a827c3..5183d9095c3a 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -285,6 +285,14 @@  config PCIE_ROCKCHIP
 	  There is 1 internal PCIe port available to support GEN2 with
 	  4 slots.
 
+config PCIE_TANGO
+	bool "Tango PCIe controller"
+	depends on ARCH_TANGO && PCI_MSI && OF
+	select PCI_HOST_COMMON
+	help
+	  Say Y here to enable PCIe controller support for Sigma Designs
+	  Tango systems, such as SMP8759 and later chips.
+
 config VMD
 	depends on PCI_MSI && X86_64
 	tristate "Intel Volume Management Device Driver"
diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
index 084cb4983645..fc7ea90196f3 100644
--- a/drivers/pci/host/Makefile
+++ b/drivers/pci/host/Makefile
@@ -32,4 +32,5 @@  obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o
 obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
 obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
 obj-$(CONFIG_PCIE_ROCKCHIP) += pcie-rockchip.o
+obj-$(CONFIG_PCIE_TANGO) += pcie-tango.o
 obj-$(CONFIG_VMD) += vmd.o
diff --git a/drivers/pci/host/pcie-tango.c b/drivers/pci/host/pcie-tango.c
new file mode 100644
index 000000000000..67aaadcc1c5e
--- /dev/null
+++ b/drivers/pci/host/pcie-tango.c
@@ -0,0 +1,164 @@ 
+#include <linux/pci-ecam.h>
+#include <linux/delay.h>
+#include <linux/of.h>
+
+#define MSI_MAX 256
+
+#define SMP8759_MUX		0x48
+#define SMP8759_TEST_OUT	0x74
+
+struct tango_pcie {
+	void __iomem *mux;
+};
+
+/*** HOST BRIDGE SUPPORT ***/
+
+static int smp8759_config_read(struct pci_bus *bus,
+		unsigned int devfn, int where, int size, u32 *val)
+{
+	int ret;
+	struct pci_config_window *cfg = bus->sysdata;
+	struct tango_pcie *pcie = dev_get_drvdata(cfg->parent);
+
+	/*
+	 * QUIRK #1
+	 * Reads in configuration space outside devfn 0 return garbage.
+	 */
+	if (devfn != 0)
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+
+	/*
+	 * QUIRK #2
+	 * Unfortunately, config and mem spaces are muxed.
+	 * Linux does not support such a setting, since drivers are free
+	 * to access mem space directly, at any time.
+	 * Therefore, we can only PRAY that config and mem space accesses
+	 * NEVER occur concurrently.
+	 */
+	writel_relaxed(1, pcie->mux);
+	ret = pci_generic_config_read(bus, devfn, where, size, val);
+	writel_relaxed(0, pcie->mux);
+
+	return ret;
+}
+
+static int smp8759_config_write(struct pci_bus *bus,
+		unsigned int devfn, int where, int size, u32 val)
+{
+	int ret;
+	struct pci_config_window *cfg = bus->sysdata;
+	struct tango_pcie *pcie = dev_get_drvdata(cfg->parent);
+
+	writel_relaxed(1, pcie->mux);
+	ret = pci_generic_config_write(bus, devfn, where, size, val);
+	writel_relaxed(0, pcie->mux);
+
+	return ret;
+}
+
+static struct pci_ecam_ops smp8759_ecam_ops = {
+	.bus_shift	= 20,
+	.pci_ops	= {
+		.map_bus	= pci_ecam_map_bus,
+		.read		= smp8759_config_read,
+		.write		= smp8759_config_write,
+	}
+};
+
+static const struct of_device_id tango_pcie_ids[] = {
+	{ .compatible = "sigma,smp8759-pcie" },
+	{ /* sentinel */ },
+};
+
+static int tango_check_pcie_link(void __iomem *test_out)
+{
+	int i;
+
+	writel_relaxed(16, test_out);
+	for (i = 0; i < 10; ++i) {
+		u32 ltssm_state = readl_relaxed(test_out) >> 8;
+		if ((ltssm_state & 0x1f) == 0xf) /* L0 */
+			return 0;
+		usleep_range(3000, 4000);
+	}
+
+	return -ENODEV;
+}
+
+static int smp8759_init(struct tango_pcie *pcie, void __iomem *base)
+{
+	pcie->mux		= base + SMP8759_MUX;
+
+	return tango_check_pcie_link(base + SMP8759_TEST_OUT);
+}
+
+static int tango_pcie_probe(struct platform_device *pdev)
+{
+	int ret = -EINVAL;
+	void __iomem *base;
+	struct resource *res;
+	struct tango_pcie *pcie;
+	struct device *dev = &pdev->dev;
+
+	pr_err("MAJOR ISSUE: PCIe config and mem spaces are muxed\n");
+	pr_err("Tainting kernel... Use driver at your own risk\n");
+	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
+	if (!pcie)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, pcie);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	if (of_device_is_compatible(dev->of_node, "sigma,smp8759-pcie"))
+		ret = smp8759_init(pcie, base);
+
+	if (ret)
+		return ret;
+
+	return pci_host_common_probe(pdev, &smp8759_ecam_ops);
+}
+
+static struct platform_driver tango_pcie_driver = {
+	.probe	= tango_pcie_probe,
+	.driver	= {
+		.name = KBUILD_MODNAME,
+		.of_match_table = tango_pcie_ids,
+	},
+};
+
+builtin_platform_driver(tango_pcie_driver);
+
+/*
+ * QUIRK #3
+ * The root complex advertizes the wrong device class.
+ * Header Type 1 is for PCI-to-PCI bridges.
+ */
+static void tango_fixup_class(struct pci_dev *dev)
+{
+	dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x24, tango_fixup_class);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x28, tango_fixup_class);
+
+/*
+ * QUIRK #4
+ * The root complex exposes a "fake" BAR, which is used to filter
+ * bus-to-system accesses. Only accesses within the range defined
+ * by this BAR are forwarded to the host, others are ignored.
+ *
+ * By default, the DMA framework expects an identity mapping,
+ * and DRAM0 is mapped at 0x80000000.
+ */
+static void tango_fixup_bar(struct pci_dev *dev)
+{
+	dev->non_compliant_bars = true;
+	pci_write_config_dword(dev, PCI_BASE_ADDRESS_0, 0x80000000);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x24, tango_fixup_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIGMA, 0x28, tango_fixup_bar);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index f020ab4079d3..b577dbe46f8f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1369,6 +1369,8 @@ 
 #define PCI_DEVICE_ID_TTI_HPT374	0x0008
 #define PCI_DEVICE_ID_TTI_HPT372N	0x0009	/* apparently a 372N variant? */
 
+#define PCI_VENDOR_ID_SIGMA		0x1105
+
 #define PCI_VENDOR_ID_VIA		0x1106
 #define PCI_DEVICE_ID_VIA_8763_0	0x0198
 #define PCI_DEVICE_ID_VIA_8380_0	0x0204