diff mbox series

PCI: Add sysfs attribute for disabling PCIe link to downstream component

Message ID 20190529104942.74991-1-mika.westerberg@linux.intel.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show
Series PCI: Add sysfs attribute for disabling PCIe link to downstream component | expand

Commit Message

Mika Westerberg May 29, 2019, 10:49 a.m. UTC
PCIe root and downstream ports have link control register that can be
used disable the link from software. This can be useful for instance
when performing "software" hotplug on systems that do not support real
PCIe/ACPI hotplug.

For example when used with FPGA card we can burn a new FPGA image
without need to reboot the system.

First we remove the FGPA device from Linux PCI stack:

  # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove

Then we disable the link:

  # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable

By doing this we prevent the kernel from accessing the hardware while we
burn the new FPGA image. Once the new FPGA is burned we can re-enable
the link and rescan the new and possibly different device:

  # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
  # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-bus-pci |  8 +++
 drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
 2 files changed, 72 insertions(+), 1 deletion(-)

Comments

Mika Westerberg July 3, 2019, 12:30 p.m. UTC | #1
On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> PCIe root and downstream ports have link control register that can be
> used disable the link from software. This can be useful for instance
> when performing "software" hotplug on systems that do not support real
> PCIe/ACPI hotplug.
> 
> For example when used with FPGA card we can burn a new FPGA image
> without need to reboot the system.
> 
> First we remove the FGPA device from Linux PCI stack:
> 
>   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> 
> Then we disable the link:
> 
>   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> 
> By doing this we prevent the kernel from accessing the hardware while we
> burn the new FPGA image. Once the new FPGA is burned we can re-enable
> the link and rescan the new and possibly different device:
> 
>   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
>   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> 
> Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>

Hi,

Any comments on this?
Bjorn Helgaas July 3, 2019, 1:39 p.m. UTC | #2
On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> PCIe root and downstream ports have link control register that can be
> used disable the link from software. This can be useful for instance
> when performing "software" hotplug on systems that do not support real
> PCIe/ACPI hotplug.
> 
> For example when used with FPGA card we can burn a new FPGA image
> without need to reboot the system.
> 
> First we remove the FGPA device from Linux PCI stack:
> 
>   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> 
> Then we disable the link:
> 
>   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> 
> By doing this we prevent the kernel from accessing the hardware while we
> burn the new FPGA image. 

What is the case where the kernel accesses the hardware?  You've
already done the remove, so the pci_dev is gone.  Is this to protect
against another user doing a rescan?  Or is there some spurious event
during the FPGA update that causes an interrupt that causes pciehp to
rescan?  Something else?

I guess this particular FPGA update must be done via some side channel
(not the PCIe link)?  I assume there are other FPGA arrangements where
the update *would* be done via the PCIe link, and we would just do a
reset to make the update take effect.

> Once the new FPGA is burned we can re-enable
> the link and rescan the new and possibly different device:
> 
>   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
>   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> 
> Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> ---
>  Documentation/ABI/testing/sysfs-bus-pci |  8 +++
>  drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
>  2 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> index 8bfee557e50e..c93d6b9ab580 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -324,6 +324,14 @@ Description:
>  		This is similar to /sys/bus/pci/drivers_autoprobe, but
>  		affects only the VFs associated with a specific PF.
>  
> +What:		/sys/bus/pci/devices/.../link_disable
> +Date:		September 2019
> +Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
> +Description:
> +		PCIe root and downstream ports have this attribute. Writing
> +		1 causes the link to downstream component be disabled.
> +		Re-enabling the link happens by writing 0 instead.
> +
>  What:		/sys/bus/pci/devices/.../p2pmem/size
>  Date:		November 2017
>  Contact:	Logan Gunthorpe <logang@deltatee.com>
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 6d27475e39b2..dfcd21745192 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -218,6 +218,56 @@ static ssize_t current_link_width_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(current_link_width);
>  
> +static ssize_t link_disable_show(struct device *dev,
> +				 struct device_attribute *attr, char *buf)
> +{
> +	struct pci_dev *pci_dev = to_pci_dev(dev);
> +	u16 linkctl;
> +	int ret;
> +
> +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> +	if (ret)
> +		return -EINVAL;
> +
> +	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
> +}
> +
> +static ssize_t link_disable_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{
> +	struct pci_dev *pci_dev = to_pci_dev(dev);
> +	u16 linkctl;
> +	bool disable;
> +	int ret;
> +
> +	ret = kstrtobool(buf, &disable);
> +	if (ret)
> +		return ret;
> +
> +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> +	if (ret)
> +		return -EINVAL;
> +
> +	if (disable) {
> +		if (linkctl & PCI_EXP_LNKCTL_LD)
> +			goto out;
> +		linkctl |= PCI_EXP_LNKCTL_LD;
> +	} else {
> +		if (!(linkctl & PCI_EXP_LNKCTL_LD))
> +			goto out;
> +		linkctl &= ~PCI_EXP_LNKCTL_LD;
> +	}
> +
> +	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
> +	if (ret)
> +		return ret;
> +
> +out:
> +	return count;
> +}
> +static DEVICE_ATTR_RW(link_disable);
> +
>  static ssize_t secondary_bus_number_show(struct device *dev,
>  					 struct device_attribute *attr,
>  					 char *buf)
> @@ -785,6 +835,7 @@ static struct attribute *pcie_dev_attrs[] = {
>  	&dev_attr_current_link_width.attr,
>  	&dev_attr_max_link_width.attr,
>  	&dev_attr_max_link_speed.attr,
> +	&dev_attr_link_disable.attr,
>  	NULL,
>  };
>  
> @@ -1656,8 +1707,20 @@ static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
>  	struct device *dev = kobj_to_dev(kobj);
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  
> -	if (pci_is_pcie(pdev))
> +	if (pci_is_pcie(pdev)) {
> +		if (a == &dev_attr_link_disable.attr) {
> +			switch (pci_pcie_type(pdev)) {
> +			case PCI_EXP_TYPE_ROOT_PORT:
> +			case PCI_EXP_TYPE_DOWNSTREAM:

This is actually not completely reliable because there are weird
systems that don't identify upstream/downstream ports correctly, e.g.,
see d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track
downstream PCIe links") and c8fc9339409d ("PCI/ASPM: Use
dev->has_secondary_link to find downstream links").

I think I suggested the dev->has_secondary_link approach, but I now
think that was a mistake because it means we have to remember to look
at has_secondary_link instead of doing the obvious thing like your
code.

set_pcie_port_type() detects those unusual topologies, and I think it
would probably be better for it to just change the cached caps reg
used by pci_pcie_type() so checking for PCI_EXP_TYPE_DOWNSTREAM does
the right thing.

> +				break;
> +
> +			default:
> +				return 0;
> +			}
> +		}
> +
>  		return a->mode;
> +	}
>  
>  	return 0;
>  }
> -- 
> 2.20.1
>
Mika Westerberg July 3, 2019, 3:03 p.m. UTC | #3
On Wed, Jul 03, 2019 at 08:39:53AM -0500, Bjorn Helgaas wrote:
> On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> > PCIe root and downstream ports have link control register that can be
> > used disable the link from software. This can be useful for instance
> > when performing "software" hotplug on systems that do not support real
> > PCIe/ACPI hotplug.
> > 
> > For example when used with FPGA card we can burn a new FPGA image
> > without need to reboot the system.
> > 
> > First we remove the FGPA device from Linux PCI stack:
> > 
> >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> > 
> > Then we disable the link:
> > 
> >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > 
> > By doing this we prevent the kernel from accessing the hardware while we
> > burn the new FPGA image. 
> 
> What is the case where the kernel accesses the hardware?  You've
> already done the remove, so the pci_dev is gone.  Is this to protect
> against another user doing a rescan?  Or is there some spurious event
> during the FPGA update that causes an interrupt that causes pciehp to
> rescan?  Something else?

Protect against another user doing rescan.

> I guess this particular FPGA update must be done via some side channel
> (not the PCIe link)?  I assume there are other FPGA arrangements where
> the update *would* be done via the PCIe link, and we would just do a
> reset to make the update take effect.

In this setup the FPGA is programmed using side channel. I haven't seen
the actual system but I think it is some sort of FPGA programmer
connected to another system.

> > Once the new FPGA is burned we can re-enable
> > the link and rescan the new and possibly different device:
> > 
> >   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> > 
> > Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > ---
> >  Documentation/ABI/testing/sysfs-bus-pci |  8 +++
> >  drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
> >  2 files changed, 72 insertions(+), 1 deletion(-)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > index 8bfee557e50e..c93d6b9ab580 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > @@ -324,6 +324,14 @@ Description:
> >  		This is similar to /sys/bus/pci/drivers_autoprobe, but
> >  		affects only the VFs associated with a specific PF.
> >  
> > +What:		/sys/bus/pci/devices/.../link_disable
> > +Date:		September 2019
> > +Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
> > +Description:
> > +		PCIe root and downstream ports have this attribute. Writing
> > +		1 causes the link to downstream component be disabled.
> > +		Re-enabling the link happens by writing 0 instead.
> > +
> >  What:		/sys/bus/pci/devices/.../p2pmem/size
> >  Date:		November 2017
> >  Contact:	Logan Gunthorpe <logang@deltatee.com>
> > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> > index 6d27475e39b2..dfcd21745192 100644
> > --- a/drivers/pci/pci-sysfs.c
> > +++ b/drivers/pci/pci-sysfs.c
> > @@ -218,6 +218,56 @@ static ssize_t current_link_width_show(struct device *dev,
> >  }
> >  static DEVICE_ATTR_RO(current_link_width);
> >  
> > +static ssize_t link_disable_show(struct device *dev,
> > +				 struct device_attribute *attr, char *buf)
> > +{
> > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > +	u16 linkctl;
> > +	int ret;
> > +
> > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > +	if (ret)
> > +		return -EINVAL;
> > +
> > +	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
> > +}
> > +
> > +static ssize_t link_disable_store(struct device *dev,
> > +				  struct device_attribute *attr,
> > +				  const char *buf, size_t count)
> > +{
> > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > +	u16 linkctl;
> > +	bool disable;
> > +	int ret;
> > +
> > +	ret = kstrtobool(buf, &disable);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > +	if (ret)
> > +		return -EINVAL;
> > +
> > +	if (disable) {
> > +		if (linkctl & PCI_EXP_LNKCTL_LD)
> > +			goto out;
> > +		linkctl |= PCI_EXP_LNKCTL_LD;
> > +	} else {
> > +		if (!(linkctl & PCI_EXP_LNKCTL_LD))
> > +			goto out;
> > +		linkctl &= ~PCI_EXP_LNKCTL_LD;
> > +	}
> > +
> > +	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
> > +	if (ret)
> > +		return ret;
> > +
> > +out:
> > +	return count;
> > +}
> > +static DEVICE_ATTR_RW(link_disable);
> > +
> >  static ssize_t secondary_bus_number_show(struct device *dev,
> >  					 struct device_attribute *attr,
> >  					 char *buf)
> > @@ -785,6 +835,7 @@ static struct attribute *pcie_dev_attrs[] = {
> >  	&dev_attr_current_link_width.attr,
> >  	&dev_attr_max_link_width.attr,
> >  	&dev_attr_max_link_speed.attr,
> > +	&dev_attr_link_disable.attr,
> >  	NULL,
> >  };
> >  
> > @@ -1656,8 +1707,20 @@ static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
> >  	struct device *dev = kobj_to_dev(kobj);
> >  	struct pci_dev *pdev = to_pci_dev(dev);
> >  
> > -	if (pci_is_pcie(pdev))
> > +	if (pci_is_pcie(pdev)) {
> > +		if (a == &dev_attr_link_disable.attr) {
> > +			switch (pci_pcie_type(pdev)) {
> > +			case PCI_EXP_TYPE_ROOT_PORT:
> > +			case PCI_EXP_TYPE_DOWNSTREAM:
> 
> This is actually not completely reliable because there are weird
> systems that don't identify upstream/downstream ports correctly, e.g.,
> see d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track
> downstream PCIe links") and c8fc9339409d ("PCI/ASPM: Use
> dev->has_secondary_link to find downstream links").

D'oh!

It never came to my mind that using pci_pcie_type() would not be
reliable. Thanks for pointing it out.

> I think I suggested the dev->has_secondary_link approach, but I now
> think that was a mistake because it means we have to remember to look
> at has_secondary_link instead of doing the obvious thing like your
> code.
> 
> set_pcie_port_type() detects those unusual topologies, and I think it
> would probably be better for it to just change the cached caps reg
> used by pci_pcie_type() so checking for PCI_EXP_TYPE_DOWNSTREAM does
> the right thing.

You mean modify set_pcie_port_type() to correct the type if it finds:

  type == PCI_EXP_TYPE_UPSTREAM && !pdev->has_secondary_link => type = PCI_EXP_TYPE_DOWNSTREAM

or

  type == PCI_EXP_TYPE_DOWNSTREAM && pdev->has_secondary_link => type = PCI_EXP_TYPE_UPSTREAM

? Assuming my understanding of pdev->has_secondary_link is correct.
Bjorn Helgaas Aug. 1, 2019, 9:53 p.m. UTC | #4
[+cc FPGA folks, just FYI; I'm pretty sure PCI could do a much better
job supporting FPGAs, so any input is welcome!]

On Wed, Jul 03, 2019 at 06:03:41PM +0300, Mika Westerberg wrote:
> On Wed, Jul 03, 2019 at 08:39:53AM -0500, Bjorn Helgaas wrote:
> > On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> > > PCIe root and downstream ports have link control register that can be
> > > used disable the link from software. This can be useful for instance
> > > when performing "software" hotplug on systems that do not support real
> > > PCIe/ACPI hotplug.
> > > 
> > > For example when used with FPGA card we can burn a new FPGA image
> > > without need to reboot the system.
> > > 
> > > First we remove the FGPA device from Linux PCI stack:
> > > 
> > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> > > 
> > > Then we disable the link:
> > > 
> > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > 
> > > By doing this we prevent the kernel from accessing the hardware while we
> > > burn the new FPGA image. 
> > 
> > What is the case where the kernel accesses the hardware?  You've
> > already done the remove, so the pci_dev is gone.  Is this to protect
> > against another user doing a rescan?  Or is there some spurious event
> > during the FPGA update that causes an interrupt that causes pciehp to
> > rescan?  Something else?
> 
> Protect against another user doing rescan.

I'm not 100% sure this is enough of an issue to warrant a new sysfs
file.  The file is visible all the time to everybody, but it only
protects root from shooting him/herself in the foot.

> > I guess this particular FPGA update must be done via some side channel
> > (not the PCIe link)?  I assume there are other FPGA arrangements where
> > the update *would* be done via the PCIe link, and we would just do a
> > reset to make the update take effect.
> 
> In this setup the FPGA is programmed using side channel. I haven't seen
> the actual system but I think it is some sort of FPGA programmer
> connected to another system.
> 
> > > Once the new FPGA is burned we can re-enable
> > > the link and rescan the new and possibly different device:
> > > 
> > >   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> > > 
> > > Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > ---
> > >  Documentation/ABI/testing/sysfs-bus-pci |  8 +++
> > >  drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
> > >  2 files changed, 72 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > index 8bfee557e50e..c93d6b9ab580 100644
> > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > @@ -324,6 +324,14 @@ Description:
> > >  		This is similar to /sys/bus/pci/drivers_autoprobe, but
> > >  		affects only the VFs associated with a specific PF.
> > >  
> > > +What:		/sys/bus/pci/devices/.../link_disable
> > > +Date:		September 2019
> > > +Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
> > > +Description:
> > > +		PCIe root and downstream ports have this attribute. Writing
> > > +		1 causes the link to downstream component be disabled.
> > > +		Re-enabling the link happens by writing 0 instead.
> > > +
> > >  What:		/sys/bus/pci/devices/.../p2pmem/size
> > >  Date:		November 2017
> > >  Contact:	Logan Gunthorpe <logang@deltatee.com>
> > > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> > > index 6d27475e39b2..dfcd21745192 100644
> > > --- a/drivers/pci/pci-sysfs.c
> > > +++ b/drivers/pci/pci-sysfs.c
> > > @@ -218,6 +218,56 @@ static ssize_t current_link_width_show(struct device *dev,
> > >  }
> > >  static DEVICE_ATTR_RO(current_link_width);
> > >  
> > > +static ssize_t link_disable_show(struct device *dev,
> > > +				 struct device_attribute *attr, char *buf)
> > > +{
> > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > +	u16 linkctl;
> > > +	int ret;
> > > +
> > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > +	if (ret)
> > > +		return -EINVAL;
> > > +
> > > +	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
> > > +}
> > > +
> > > +static ssize_t link_disable_store(struct device *dev,
> > > +				  struct device_attribute *attr,
> > > +				  const char *buf, size_t count)
> > > +{
> > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > +	u16 linkctl;
> > > +	bool disable;
> > > +	int ret;
> > > +
> > > +	ret = kstrtobool(buf, &disable);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > +	if (ret)
> > > +		return -EINVAL;
> > > +
> > > +	if (disable) {
> > > +		if (linkctl & PCI_EXP_LNKCTL_LD)
> > > +			goto out;
> > > +		linkctl |= PCI_EXP_LNKCTL_LD;
> > > +	} else {
> > > +		if (!(linkctl & PCI_EXP_LNKCTL_LD))
> > > +			goto out;
> > > +		linkctl &= ~PCI_EXP_LNKCTL_LD;
> > > +	}
> > > +
> > > +	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +out:
> > > +	return count;
> > > +}
> > > +static DEVICE_ATTR_RW(link_disable);
> > > +
> > >  static ssize_t secondary_bus_number_show(struct device *dev,
> > >  					 struct device_attribute *attr,
> > >  					 char *buf)
> > > @@ -785,6 +835,7 @@ static struct attribute *pcie_dev_attrs[] = {
> > >  	&dev_attr_current_link_width.attr,
> > >  	&dev_attr_max_link_width.attr,
> > >  	&dev_attr_max_link_speed.attr,
> > > +	&dev_attr_link_disable.attr,
> > >  	NULL,
> > >  };
> > >  
> > > @@ -1656,8 +1707,20 @@ static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
> > >  	struct device *dev = kobj_to_dev(kobj);
> > >  	struct pci_dev *pdev = to_pci_dev(dev);
> > >  
> > > -	if (pci_is_pcie(pdev))
> > > +	if (pci_is_pcie(pdev)) {
> > > +		if (a == &dev_attr_link_disable.attr) {
> > > +			switch (pci_pcie_type(pdev)) {
> > > +			case PCI_EXP_TYPE_ROOT_PORT:
> > > +			case PCI_EXP_TYPE_DOWNSTREAM:
> > 
> > This is actually not completely reliable because there are weird
> > systems that don't identify upstream/downstream ports correctly, e.g.,
> > see d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track
> > downstream PCIe links") and c8fc9339409d ("PCI/ASPM: Use
> > dev->has_secondary_link to find downstream links").
> 
> D'oh!
> 
> It never came to my mind that using pci_pcie_type() would not be
> reliable. Thanks for pointing it out.
> 
> > I think I suggested the dev->has_secondary_link approach, but I now
> > think that was a mistake because it means we have to remember to look
> > at has_secondary_link instead of doing the obvious thing like your
> > code.
> > 
> > set_pcie_port_type() detects those unusual topologies, and I think it
> > would probably be better for it to just change the cached caps reg
> > used by pci_pcie_type() so checking for PCI_EXP_TYPE_DOWNSTREAM does
> > the right thing.
> 
> You mean modify set_pcie_port_type() to correct the type if it finds:
> 
>   type == PCI_EXP_TYPE_UPSTREAM && !pdev->has_secondary_link => type = PCI_EXP_TYPE_DOWNSTREAM
> 
> or
> 
>   type == PCI_EXP_TYPE_DOWNSTREAM && pdev->has_secondary_link => type = PCI_EXP_TYPE_UPSTREAM
> 
> ? Assuming my understanding of pdev->has_secondary_link is correct.

I was hoping we could get rid of "has_secondary_link" completely if we
corrected the type, but I'm not sure that's possible.

Bjorn
Wu, Hao Aug. 4, 2019, 11:51 a.m. UTC | #5
On Thu, Aug 01, 2019 at 04:53:39PM -0500, Bjorn Helgaas wrote:
> [+cc FPGA folks, just FYI; I'm pretty sure PCI could do a much better
> job supporting FPGAs, so any input is welcome!]
> 
> On Wed, Jul 03, 2019 at 06:03:41PM +0300, Mika Westerberg wrote:
> > On Wed, Jul 03, 2019 at 08:39:53AM -0500, Bjorn Helgaas wrote:
> > > On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> > > > PCIe root and downstream ports have link control register that can be
> > > > used disable the link from software. This can be useful for instance
> > > > when performing "software" hotplug on systems that do not support real
> > > > PCIe/ACPI hotplug.
> > > > 
> > > > For example when used with FPGA card we can burn a new FPGA image
> > > > without need to reboot the system.
> > > > 
> > > > First we remove the FGPA device from Linux PCI stack:
> > > > 
> > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> > > > 
> > > > Then we disable the link:
> > > > 
> > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > > 
> > > > By doing this we prevent the kernel from accessing the hardware while we
> > > > burn the new FPGA image. 
> > > 
> > > What is the case where the kernel accesses the hardware?  You've
> > > already done the remove, so the pci_dev is gone.  Is this to protect
> > > against another user doing a rescan?  Or is there some spurious event
> > > during the FPGA update that causes an interrupt that causes pciehp to
> > > rescan?  Something else?
> > 
> > Protect against another user doing rescan.
> 
> I'm not 100% sure this is enough of an issue to warrant a new sysfs
> file.  The file is visible all the time to everybody, but it only
> protects root from shooting him/herself in the foot.
> 
> > > I guess this particular FPGA update must be done via some side channel
> > > (not the PCIe link)?  I assume there are other FPGA arrangements where
> > > the update *would* be done via the PCIe link, and we would just do a
> > > reset to make the update take effect.
> > 
> > In this setup the FPGA is programmed using side channel. I haven't seen
> > the actual system but I think it is some sort of FPGA programmer
> > connected to another system.

One example is, FPGA image is stored in flash on the PCIe card, and the
board management controller loads the image from flash to FPGA during
power up or any time requested by software/user.

Actually I used several PCIe FPGA cards, they can do partial reconfiguration
with inband method, but they are all using side channel to do full FPGA
reconfiguration. Partial reconfiguration doesn't touch the PCIe logics
inside FPGA, but full reconfiguration case really does. So we do need some
protection for full reconfiguration case from PCIe device level.

Actually we are using the similar method as above to remove/rescan pcie dev
to make sure nobody see it during full reconfiguration. For sure we need
to make sure nobody does the rescan at the same time, and another thing is
we have to mask AER on the root port, otherwise it's easy to see system
hang/reboot. I just did some qucik tests, with this patch, after link_disable
I don't need to mask AER any more, so this patch does help on this case,
and I should be able to try more next week.

Thanks a lot for adding us to this thread. :)

Hao

> > 
> > > > Once the new FPGA is burned we can re-enable
> > > > the link and rescan the new and possibly different device:
> > > > 
> > > >   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> > > > 
> > > > Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > ---
> > > >  Documentation/ABI/testing/sysfs-bus-pci |  8 +++
> > > >  drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
> > > >  2 files changed, 72 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > index 8bfee557e50e..c93d6b9ab580 100644
> > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > @@ -324,6 +324,14 @@ Description:
> > > >  		This is similar to /sys/bus/pci/drivers_autoprobe, but
> > > >  		affects only the VFs associated with a specific PF.
> > > >  
> > > > +What:		/sys/bus/pci/devices/.../link_disable
> > > > +Date:		September 2019
> > > > +Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > +Description:
> > > > +		PCIe root and downstream ports have this attribute. Writing
> > > > +		1 causes the link to downstream component be disabled.
> > > > +		Re-enabling the link happens by writing 0 instead.
> > > > +
> > > >  What:		/sys/bus/pci/devices/.../p2pmem/size
> > > >  Date:		November 2017
> > > >  Contact:	Logan Gunthorpe <logang@deltatee.com>
> > > > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> > > > index 6d27475e39b2..dfcd21745192 100644
> > > > --- a/drivers/pci/pci-sysfs.c
> > > > +++ b/drivers/pci/pci-sysfs.c
> > > > @@ -218,6 +218,56 @@ static ssize_t current_link_width_show(struct device *dev,
> > > >  }
> > > >  static DEVICE_ATTR_RO(current_link_width);
> > > >  
> > > > +static ssize_t link_disable_show(struct device *dev,
> > > > +				 struct device_attribute *attr, char *buf)
> > > > +{
> > > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > > +	u16 linkctl;
> > > > +	int ret;
> > > > +
> > > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > > +	if (ret)
> > > > +		return -EINVAL;
> > > > +
> > > > +	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
> > > > +}
> > > > +
> > > > +static ssize_t link_disable_store(struct device *dev,
> > > > +				  struct device_attribute *attr,
> > > > +				  const char *buf, size_t count)
> > > > +{
> > > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > > +	u16 linkctl;
> > > > +	bool disable;
> > > > +	int ret;
> > > > +
> > > > +	ret = kstrtobool(buf, &disable);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > > +	if (ret)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (disable) {
> > > > +		if (linkctl & PCI_EXP_LNKCTL_LD)
> > > > +			goto out;
> > > > +		linkctl |= PCI_EXP_LNKCTL_LD;
> > > > +	} else {
> > > > +		if (!(linkctl & PCI_EXP_LNKCTL_LD))
> > > > +			goto out;
> > > > +		linkctl &= ~PCI_EXP_LNKCTL_LD;
> > > > +	}
> > > > +
> > > > +	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +out:
> > > > +	return count;
> > > > +}
> > > > +static DEVICE_ATTR_RW(link_disable);
> > > > +
> > > >  static ssize_t secondary_bus_number_show(struct device *dev,
> > > >  					 struct device_attribute *attr,
> > > >  					 char *buf)
> > > > @@ -785,6 +835,7 @@ static struct attribute *pcie_dev_attrs[] = {
> > > >  	&dev_attr_current_link_width.attr,
> > > >  	&dev_attr_max_link_width.attr,
> > > >  	&dev_attr_max_link_speed.attr,
> > > > +	&dev_attr_link_disable.attr,
> > > >  	NULL,
> > > >  };
> > > >  
> > > > @@ -1656,8 +1707,20 @@ static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
> > > >  	struct device *dev = kobj_to_dev(kobj);
> > > >  	struct pci_dev *pdev = to_pci_dev(dev);
> > > >  
> > > > -	if (pci_is_pcie(pdev))
> > > > +	if (pci_is_pcie(pdev)) {
> > > > +		if (a == &dev_attr_link_disable.attr) {
> > > > +			switch (pci_pcie_type(pdev)) {
> > > > +			case PCI_EXP_TYPE_ROOT_PORT:
> > > > +			case PCI_EXP_TYPE_DOWNSTREAM:
> > > 
> > > This is actually not completely reliable because there are weird
> > > systems that don't identify upstream/downstream ports correctly, e.g.,
> > > see d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track
> > > downstream PCIe links") and c8fc9339409d ("PCI/ASPM: Use
> > > dev->has_secondary_link to find downstream links").
> > 
> > D'oh!
> > 
> > It never came to my mind that using pci_pcie_type() would not be
> > reliable. Thanks for pointing it out.
> > 
> > > I think I suggested the dev->has_secondary_link approach, but I now
> > > think that was a mistake because it means we have to remember to look
> > > at has_secondary_link instead of doing the obvious thing like your
> > > code.
> > > 
> > > set_pcie_port_type() detects those unusual topologies, and I think it
> > > would probably be better for it to just change the cached caps reg
> > > used by pci_pcie_type() so checking for PCI_EXP_TYPE_DOWNSTREAM does
> > > the right thing.
> > 
> > You mean modify set_pcie_port_type() to correct the type if it finds:
> > 
> >   type == PCI_EXP_TYPE_UPSTREAM && !pdev->has_secondary_link => type = PCI_EXP_TYPE_DOWNSTREAM
> > 
> > or
> > 
> >   type == PCI_EXP_TYPE_DOWNSTREAM && pdev->has_secondary_link => type = PCI_EXP_TYPE_UPSTREAM
> > 
> > ? Assuming my understanding of pdev->has_secondary_link is correct.
> 
> I was hoping we could get rid of "has_secondary_link" completely if we
> corrected the type, but I'm not sure that's possible.
> 
> Bjorn
Mika Westerberg Aug. 6, 2019, 10:12 a.m. UTC | #6
On Thu, Aug 01, 2019 at 04:53:39PM -0500, Bjorn Helgaas wrote:
> [+cc FPGA folks, just FYI; I'm pretty sure PCI could do a much better
> job supporting FPGAs, so any input is welcome!]
> 
> On Wed, Jul 03, 2019 at 06:03:41PM +0300, Mika Westerberg wrote:
> > On Wed, Jul 03, 2019 at 08:39:53AM -0500, Bjorn Helgaas wrote:
> > > On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> > > > PCIe root and downstream ports have link control register that can be
> > > > used disable the link from software. This can be useful for instance
> > > > when performing "software" hotplug on systems that do not support real
> > > > PCIe/ACPI hotplug.
> > > > 
> > > > For example when used with FPGA card we can burn a new FPGA image
> > > > without need to reboot the system.
> > > > 
> > > > First we remove the FGPA device from Linux PCI stack:
> > > > 
> > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> > > > 
> > > > Then we disable the link:
> > > > 
> > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > > 
> > > > By doing this we prevent the kernel from accessing the hardware while we
> > > > burn the new FPGA image. 
> > > 
> > > What is the case where the kernel accesses the hardware?  You've
> > > already done the remove, so the pci_dev is gone.  Is this to protect
> > > against another user doing a rescan?  Or is there some spurious event
> > > during the FPGA update that causes an interrupt that causes pciehp to
> > > rescan?  Something else?
> > 
> > Protect against another user doing rescan.
> 
> I'm not 100% sure this is enough of an issue to warrant a new sysfs
> file.  The file is visible all the time to everybody, but it only
> protects root from shooting him/herself in the foot.

Well, only root can do rescan so in that sense it should be enough ;-)

> > > I guess this particular FPGA update must be done via some side channel
> > > (not the PCIe link)?  I assume there are other FPGA arrangements where
> > > the update *would* be done via the PCIe link, and we would just do a
> > > reset to make the update take effect.
> > 
> > In this setup the FPGA is programmed using side channel. I haven't seen
> > the actual system but I think it is some sort of FPGA programmer
> > connected to another system.
> > 
> > > > Once the new FPGA is burned we can re-enable
> > > > the link and rescan the new and possibly different device:
> > > > 
> > > >   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> > > > 
> > > > Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > ---
> > > >  Documentation/ABI/testing/sysfs-bus-pci |  8 +++
> > > >  drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
> > > >  2 files changed, 72 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > index 8bfee557e50e..c93d6b9ab580 100644
> > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > @@ -324,6 +324,14 @@ Description:
> > > >  		This is similar to /sys/bus/pci/drivers_autoprobe, but
> > > >  		affects only the VFs associated with a specific PF.
> > > >  
> > > > +What:		/sys/bus/pci/devices/.../link_disable
> > > > +Date:		September 2019
> > > > +Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > +Description:
> > > > +		PCIe root and downstream ports have this attribute. Writing
> > > > +		1 causes the link to downstream component be disabled.
> > > > +		Re-enabling the link happens by writing 0 instead.
> > > > +
> > > >  What:		/sys/bus/pci/devices/.../p2pmem/size
> > > >  Date:		November 2017
> > > >  Contact:	Logan Gunthorpe <logang@deltatee.com>
> > > > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> > > > index 6d27475e39b2..dfcd21745192 100644
> > > > --- a/drivers/pci/pci-sysfs.c
> > > > +++ b/drivers/pci/pci-sysfs.c
> > > > @@ -218,6 +218,56 @@ static ssize_t current_link_width_show(struct device *dev,
> > > >  }
> > > >  static DEVICE_ATTR_RO(current_link_width);
> > > >  
> > > > +static ssize_t link_disable_show(struct device *dev,
> > > > +				 struct device_attribute *attr, char *buf)
> > > > +{
> > > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > > +	u16 linkctl;
> > > > +	int ret;
> > > > +
> > > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > > +	if (ret)
> > > > +		return -EINVAL;
> > > > +
> > > > +	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
> > > > +}
> > > > +
> > > > +static ssize_t link_disable_store(struct device *dev,
> > > > +				  struct device_attribute *attr,
> > > > +				  const char *buf, size_t count)
> > > > +{
> > > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > > +	u16 linkctl;
> > > > +	bool disable;
> > > > +	int ret;
> > > > +
> > > > +	ret = kstrtobool(buf, &disable);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > > +	if (ret)
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (disable) {
> > > > +		if (linkctl & PCI_EXP_LNKCTL_LD)
> > > > +			goto out;
> > > > +		linkctl |= PCI_EXP_LNKCTL_LD;
> > > > +	} else {
> > > > +		if (!(linkctl & PCI_EXP_LNKCTL_LD))
> > > > +			goto out;
> > > > +		linkctl &= ~PCI_EXP_LNKCTL_LD;
> > > > +	}
> > > > +
> > > > +	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +out:
> > > > +	return count;
> > > > +}
> > > > +static DEVICE_ATTR_RW(link_disable);
> > > > +
> > > >  static ssize_t secondary_bus_number_show(struct device *dev,
> > > >  					 struct device_attribute *attr,
> > > >  					 char *buf)
> > > > @@ -785,6 +835,7 @@ static struct attribute *pcie_dev_attrs[] = {
> > > >  	&dev_attr_current_link_width.attr,
> > > >  	&dev_attr_max_link_width.attr,
> > > >  	&dev_attr_max_link_speed.attr,
> > > > +	&dev_attr_link_disable.attr,
> > > >  	NULL,
> > > >  };
> > > >  
> > > > @@ -1656,8 +1707,20 @@ static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
> > > >  	struct device *dev = kobj_to_dev(kobj);
> > > >  	struct pci_dev *pdev = to_pci_dev(dev);
> > > >  
> > > > -	if (pci_is_pcie(pdev))
> > > > +	if (pci_is_pcie(pdev)) {
> > > > +		if (a == &dev_attr_link_disable.attr) {
> > > > +			switch (pci_pcie_type(pdev)) {
> > > > +			case PCI_EXP_TYPE_ROOT_PORT:
> > > > +			case PCI_EXP_TYPE_DOWNSTREAM:
> > > 
> > > This is actually not completely reliable because there are weird
> > > systems that don't identify upstream/downstream ports correctly, e.g.,
> > > see d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track
> > > downstream PCIe links") and c8fc9339409d ("PCI/ASPM: Use
> > > dev->has_secondary_link to find downstream links").
> > 
> > D'oh!
> > 
> > It never came to my mind that using pci_pcie_type() would not be
> > reliable. Thanks for pointing it out.
> > 
> > > I think I suggested the dev->has_secondary_link approach, but I now
> > > think that was a mistake because it means we have to remember to look
> > > at has_secondary_link instead of doing the obvious thing like your
> > > code.
> > > 
> > > set_pcie_port_type() detects those unusual topologies, and I think it
> > > would probably be better for it to just change the cached caps reg
> > > used by pci_pcie_type() so checking for PCI_EXP_TYPE_DOWNSTREAM does
> > > the right thing.
> > 
> > You mean modify set_pcie_port_type() to correct the type if it finds:
> > 
> >   type == PCI_EXP_TYPE_UPSTREAM && !pdev->has_secondary_link => type = PCI_EXP_TYPE_DOWNSTREAM
> > 
> > or
> > 
> >   type == PCI_EXP_TYPE_DOWNSTREAM && pdev->has_secondary_link => type = PCI_EXP_TYPE_UPSTREAM
> > 
> > ? Assuming my understanding of pdev->has_secondary_link is correct.
> 
> I was hoping we could get rid of "has_secondary_link" completely if we
> corrected the type, but I'm not sure that's possible.

Right, it looks like we need some sort of flag there anyway.
Bjorn Helgaas Aug. 19, 2019, 11:52 p.m. UTC | #7
On Tue, Aug 06, 2019 at 01:12:30PM +0300, Mika Westerberg wrote:
> On Thu, Aug 01, 2019 at 04:53:39PM -0500, Bjorn Helgaas wrote:
> > [+cc FPGA folks, just FYI; I'm pretty sure PCI could do a much better
> > job supporting FPGAs, so any input is welcome!]
> > 
> > On Wed, Jul 03, 2019 at 06:03:41PM +0300, Mika Westerberg wrote:
> > > On Wed, Jul 03, 2019 at 08:39:53AM -0500, Bjorn Helgaas wrote:
> > > > On Wed, May 29, 2019 at 01:49:42PM +0300, Mika Westerberg wrote:
> > > > > PCIe root and downstream ports have link control register that can be
> > > > > used disable the link from software. This can be useful for instance
> > > > > when performing "software" hotplug on systems that do not support real
> > > > > PCIe/ACPI hotplug.
> > > > > 
> > > > > For example when used with FPGA card we can burn a new FPGA image
> > > > > without need to reboot the system.
> > > > > 
> > > > > First we remove the FGPA device from Linux PCI stack:
> > > > > 
> > > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/0000:02:00.0/remove
> > > > > 
> > > > > Then we disable the link:
> > > > > 
> > > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > > > 
> > > > > By doing this we prevent the kernel from accessing the hardware while we
> > > > > burn the new FPGA image. 
> > > > 
> > > > What is the case where the kernel accesses the hardware?  You've
> > > > already done the remove, so the pci_dev is gone.  Is this to protect
> > > > against another user doing a rescan?  Or is there some spurious event
> > > > during the FPGA update that causes an interrupt that causes pciehp to
> > > > rescan?  Something else?
> > > 
> > > Protect against another user doing rescan.
> > 
> > I'm not 100% sure this is enough of an issue to warrant a new sysfs
> > file.  The file is visible all the time to everybody, but it only
> > protects root from shooting him/herself in the foot.
> 
> Well, only root can do rescan so in that sense it should be enough ;-)
> 
> > > > I guess this particular FPGA update must be done via some side channel
> > > > (not the PCIe link)?  I assume there are other FPGA arrangements where
> > > > the update *would* be done via the PCIe link, and we would just do a
> > > > reset to make the update take effect.
> > > 
> > > In this setup the FPGA is programmed using side channel. I haven't seen
> > > the actual system but I think it is some sort of FPGA programmer
> > > connected to another system.
> > > 
> > > > > Once the new FPGA is burned we can re-enable
> > > > > the link and rescan the new and possibly different device:
> > > > > 
> > > > >   # echo 0 > /sys/bus/pci/devices/0000:00:01.1/link_disable
> > > > >   # echo 1 > /sys/bus/pci/devices/0000:00:01.1/rescan
> > > > > 
> > > > > Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > > ---
> > > > >  Documentation/ABI/testing/sysfs-bus-pci |  8 +++
> > > > >  drivers/pci/pci-sysfs.c                 | 65 ++++++++++++++++++++++++-
> > > > >  2 files changed, 72 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > index 8bfee557e50e..c93d6b9ab580 100644
> > > > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > > > @@ -324,6 +324,14 @@ Description:
> > > > >  		This is similar to /sys/bus/pci/drivers_autoprobe, but
> > > > >  		affects only the VFs associated with a specific PF.
> > > > >  
> > > > > +What:		/sys/bus/pci/devices/.../link_disable
> > > > > +Date:		September 2019
> > > > > +Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
> > > > > +Description:
> > > > > +		PCIe root and downstream ports have this attribute. Writing
> > > > > +		1 causes the link to downstream component be disabled.
> > > > > +		Re-enabling the link happens by writing 0 instead.
> > > > > +
> > > > >  What:		/sys/bus/pci/devices/.../p2pmem/size
> > > > >  Date:		November 2017
> > > > >  Contact:	Logan Gunthorpe <logang@deltatee.com>
> > > > > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> > > > > index 6d27475e39b2..dfcd21745192 100644
> > > > > --- a/drivers/pci/pci-sysfs.c
> > > > > +++ b/drivers/pci/pci-sysfs.c
> > > > > @@ -218,6 +218,56 @@ static ssize_t current_link_width_show(struct device *dev,
> > > > >  }
> > > > >  static DEVICE_ATTR_RO(current_link_width);
> > > > >  
> > > > > +static ssize_t link_disable_show(struct device *dev,
> > > > > +				 struct device_attribute *attr, char *buf)
> > > > > +{
> > > > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > > > +	u16 linkctl;
> > > > > +	int ret;
> > > > > +
> > > > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > > > +	if (ret)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
> > > > > +}
> > > > > +
> > > > > +static ssize_t link_disable_store(struct device *dev,
> > > > > +				  struct device_attribute *attr,
> > > > > +				  const char *buf, size_t count)
> > > > > +{
> > > > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > > > +	u16 linkctl;
> > > > > +	bool disable;
> > > > > +	int ret;
> > > > > +
> > > > > +	ret = kstrtobool(buf, &disable);
> > > > > +	if (ret)
> > > > > +		return ret;
> > > > > +
> > > > > +	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
> > > > > +	if (ret)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (disable) {
> > > > > +		if (linkctl & PCI_EXP_LNKCTL_LD)
> > > > > +			goto out;
> > > > > +		linkctl |= PCI_EXP_LNKCTL_LD;
> > > > > +	} else {
> > > > > +		if (!(linkctl & PCI_EXP_LNKCTL_LD))
> > > > > +			goto out;
> > > > > +		linkctl &= ~PCI_EXP_LNKCTL_LD;
> > > > > +	}
> > > > > +
> > > > > +	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
> > > > > +	if (ret)
> > > > > +		return ret;
> > > > > +
> > > > > +out:
> > > > > +	return count;
> > > > > +}
> > > > > +static DEVICE_ATTR_RW(link_disable);
> > > > > +
> > > > >  static ssize_t secondary_bus_number_show(struct device *dev,
> > > > >  					 struct device_attribute *attr,
> > > > >  					 char *buf)
> > > > > @@ -785,6 +835,7 @@ static struct attribute *pcie_dev_attrs[] = {
> > > > >  	&dev_attr_current_link_width.attr,
> > > > >  	&dev_attr_max_link_width.attr,
> > > > >  	&dev_attr_max_link_speed.attr,
> > > > > +	&dev_attr_link_disable.attr,
> > > > >  	NULL,
> > > > >  };
> > > > >  
> > > > > @@ -1656,8 +1707,20 @@ static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
> > > > >  	struct device *dev = kobj_to_dev(kobj);
> > > > >  	struct pci_dev *pdev = to_pci_dev(dev);
> > > > >  
> > > > > -	if (pci_is_pcie(pdev))
> > > > > +	if (pci_is_pcie(pdev)) {
> > > > > +		if (a == &dev_attr_link_disable.attr) {
> > > > > +			switch (pci_pcie_type(pdev)) {
> > > > > +			case PCI_EXP_TYPE_ROOT_PORT:
> > > > > +			case PCI_EXP_TYPE_DOWNSTREAM:
> > > > 
> > > > This is actually not completely reliable because there are weird
> > > > systems that don't identify upstream/downstream ports correctly, e.g.,
> > > > see d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track
> > > > downstream PCIe links") and c8fc9339409d ("PCI/ASPM: Use
> > > > dev->has_secondary_link to find downstream links").
> > > 
> > > D'oh!
> > > 
> > > It never came to my mind that using pci_pcie_type() would not be
> > > reliable. Thanks for pointing it out.
> > > 
> > > > I think I suggested the dev->has_secondary_link approach, but I now
> > > > think that was a mistake because it means we have to remember to look
> > > > at has_secondary_link instead of doing the obvious thing like your
> > > > code.
> > > > 
> > > > set_pcie_port_type() detects those unusual topologies, and I think it
> > > > would probably be better for it to just change the cached caps reg
> > > > used by pci_pcie_type() so checking for PCI_EXP_TYPE_DOWNSTREAM does
> > > > the right thing.
> > > 
> > > You mean modify set_pcie_port_type() to correct the type if it finds:
> > > 
> > >   type == PCI_EXP_TYPE_UPSTREAM && !pdev->has_secondary_link => type = PCI_EXP_TYPE_DOWNSTREAM
> > > 
> > > or
> > > 
> > >   type == PCI_EXP_TYPE_DOWNSTREAM && pdev->has_secondary_link => type = PCI_EXP_TYPE_UPSTREAM
> > > 
> > > ? Assuming my understanding of pdev->has_secondary_link is correct.
> > 
> > I was hoping we could get rid of "has_secondary_link" completely if we
> > corrected the type, but I'm not sure that's possible.
> 
> Right, it looks like we need some sort of flag there anyway.

Does this mean you're looking at getting rid of "has_secondary_link",
you think it's impossible, or you think it's not worth trying?

I'm pretty sure we could get rid of it by looking upstream, but I
haven't actually tried it.

Bjorn
Mika Westerberg Aug. 20, 2019, 9:58 a.m. UTC | #8
On Mon, Aug 19, 2019 at 06:52:45PM -0500, Bjorn Helgaas wrote:
> > Right, it looks like we need some sort of flag there anyway.
> 
> Does this mean you're looking at getting rid of "has_secondary_link",
> you think it's impossible, or you think it's not worth trying?

I was of thinking that we need some flag anyway for the downstream port
(such as has_secondary_link) that tells us the which side of the port
the link is.

> I'm pretty sure we could get rid of it by looking upstream, but I
> haven't actually tried it.

So if we are downstream port, look at the parent and if it is also
downstream port (or root port) we change the type to upstream port
accordingly? That might work.

Another option may be to just add a quirk for these ports.

Only concern for both is that we have functions that rely on the type
such as pcie_capability_read_word() so if we change the type do we end
up breaking something? I did not check too closely, though.

I'm willing to cook a patch that fixes this once we have some consensus
what it should do ;-)
Bjorn Helgaas Aug. 20, 2019, 2:17 p.m. UTC | #9
On Tue, Aug 20, 2019 at 12:58:20PM +0300, Mika Westerberg wrote:
> On Mon, Aug 19, 2019 at 06:52:45PM -0500, Bjorn Helgaas wrote:
> > > Right, it looks like we need some sort of flag there anyway.
> > 
> > Does this mean you're looking at getting rid of "has_secondary_link",
> > you think it's impossible, or you think it's not worth trying?
> 
> I was of thinking that we need some flag anyway for the downstream port
> (such as has_secondary_link) that tells us the which side of the port
> the link is.
> 
> > I'm pretty sure we could get rid of it by looking upstream, but I
> > haven't actually tried it.
> 
> So if we are downstream port, look at the parent and if it is also
> downstream port (or root port) we change the type to upstream port
> accordingly? That might work.

If we see a type of PCI_EXP_TYPE_ROOT_PORT or
PCI_EXP_TYPE_PCIE_BRIDGE, I think we have to assume that's accurate
(which we already do today -- for those types, we assume the device
has a secondary link).

For a device that claims to be PCI_EXP_TYPE_DOWNSTREAM, if a parent
device exists and is a Downstream Port (Root Port, Switch Downstream
Port, and I suppose a PCI-to-PCIe bridge (this is basically
pcie_downstream_port()), this device must actually be acting as a
PCI_EXP_TYPE_UPSTREAM device.

If a device claiming to be PCI_EXP_TYPE_UPSTREAM has a parent that is
PCI_EXP_TYPE_UPSTREAM, this device must actually be a
PCI_EXP_TYPE_DOWNSTREAM port.

For PCI_EXP_TYPE_DOWNSTREAM and PCI_EXP_TYPE_UPSTREAM devices that
don't have parents, we just have to assume they advertise the correct
type (as we do today).  There are sparc and virtualization configs
like this.

> Another option may be to just add a quirk for these ports.

I don't really like the quirk approach because then we have to rely on
user reports of something being broken.

> Only concern for both is that we have functions that rely on the type
> such as pcie_capability_read_word() so if we change the type do we end
> up breaking something? I did not check too closely, though.

I don't think we'll break anything that's not already broken because
the type will reflect exactly what has_secondary_link now tells us.
In fact, we might *fix* some things, e.g., pcie_capability_read_word()
should work better if we fix the type that pcie_downstream_port()
checks.

> I'm willing to cook a patch that fixes this once we have some consensus
> what it should do ;-)
Mika Westerberg Aug. 21, 2019, 7:28 a.m. UTC | #10
On Tue, Aug 20, 2019 at 09:17:17AM -0500, Bjorn Helgaas wrote:
> On Tue, Aug 20, 2019 at 12:58:20PM +0300, Mika Westerberg wrote:
> > On Mon, Aug 19, 2019 at 06:52:45PM -0500, Bjorn Helgaas wrote:
> > > > Right, it looks like we need some sort of flag there anyway.
> > > 
> > > Does this mean you're looking at getting rid of "has_secondary_link",
> > > you think it's impossible, or you think it's not worth trying?
> > 
> > I was of thinking that we need some flag anyway for the downstream port
> > (such as has_secondary_link) that tells us the which side of the port
> > the link is.
> > 
> > > I'm pretty sure we could get rid of it by looking upstream, but I
> > > haven't actually tried it.
> > 
> > So if we are downstream port, look at the parent and if it is also
> > downstream port (or root port) we change the type to upstream port
> > accordingly? That might work.
> 
> If we see a type of PCI_EXP_TYPE_ROOT_PORT or
> PCI_EXP_TYPE_PCIE_BRIDGE, I think we have to assume that's accurate
> (which we already do today -- for those types, we assume the device
> has a secondary link).
> 
> For a device that claims to be PCI_EXP_TYPE_DOWNSTREAM, if a parent
> device exists and is a Downstream Port (Root Port, Switch Downstream
> Port, and I suppose a PCI-to-PCIe bridge (this is basically
> pcie_downstream_port()), this device must actually be acting as a
> PCI_EXP_TYPE_UPSTREAM device.
> 
> If a device claiming to be PCI_EXP_TYPE_UPSTREAM has a parent that is
> PCI_EXP_TYPE_UPSTREAM, this device must actually be a
> PCI_EXP_TYPE_DOWNSTREAM port.
> 
> For PCI_EXP_TYPE_DOWNSTREAM and PCI_EXP_TYPE_UPSTREAM devices that
> don't have parents, we just have to assume they advertise the correct
> type (as we do today).  There are sparc and virtualization configs
> like this.

OK, thanks for the details. I'll try to make patch based on the above.

> > Another option may be to just add a quirk for these ports.
> 
> I don't really like the quirk approach because then we have to rely on
> user reports of something being broken.
> 
> > Only concern for both is that we have functions that rely on the type
> > such as pcie_capability_read_word() so if we change the type do we end
> > up breaking something? I did not check too closely, though.
> 
> I don't think we'll break anything that's not already broken because
> the type will reflect exactly what has_secondary_link now tells us.
> In fact, we might *fix* some things, e.g., pcie_capability_read_word()
> should work better if we fix the type that pcie_downstream_port()
> checks.

Fair enough :)
Mika Westerberg Aug. 21, 2019, 2:37 p.m. UTC | #11
On Wed, Aug 21, 2019 at 10:28:37AM +0300, Mika Westerberg wrote:
> > If we see a type of PCI_EXP_TYPE_ROOT_PORT or
> > PCI_EXP_TYPE_PCIE_BRIDGE, I think we have to assume that's accurate
> > (which we already do today -- for those types, we assume the device
> > has a secondary link).
> > 
> > For a device that claims to be PCI_EXP_TYPE_DOWNSTREAM, if a parent
> > device exists and is a Downstream Port (Root Port, Switch Downstream
> > Port, and I suppose a PCI-to-PCIe bridge (this is basically
> > pcie_downstream_port()), this device must actually be acting as a
> > PCI_EXP_TYPE_UPSTREAM device.
> > 
> > If a device claiming to be PCI_EXP_TYPE_UPSTREAM has a parent that is
> > PCI_EXP_TYPE_UPSTREAM, this device must actually be a
> > PCI_EXP_TYPE_DOWNSTREAM port.
> > 
> > For PCI_EXP_TYPE_DOWNSTREAM and PCI_EXP_TYPE_UPSTREAM devices that
> > don't have parents, we just have to assume they advertise the correct
> > type (as we do today).  There are sparc and virtualization configs
> > like this.
> 
> OK, thanks for the details. I'll try to make patch based on the above.

Something like the below patch? Only compile tested for now but I will
split it into a proper patch series and give it some testing if this is
what you were after.

diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 544922f097c0..2fccb5762c76 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -336,15 +336,6 @@ static inline int pcie_cap_version(const struct pci_dev *dev)
 	return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS;
 }
 
-static bool pcie_downstream_port(const struct pci_dev *dev)
-{
-	int type = pci_pcie_type(dev);
-
-	return type == PCI_EXP_TYPE_ROOT_PORT ||
-	       type == PCI_EXP_TYPE_DOWNSTREAM ||
-	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
-}
-
 bool pcie_cap_has_lnkctl(const struct pci_dev *dev)
 {
 	int type = pci_pcie_type(dev);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 9ac50710f1d4..3c0672f1dfe7 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3577,7 +3577,7 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask)
 		}
 
 		/* Ensure upstream ports don't block AtomicOps on egress */
-		if (!bridge->has_secondary_link) {
+		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) {
 			pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2,
 						   &ctl2);
 			if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9a83fcf612ca..ae8d839dca4f 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -118,6 +118,15 @@ static inline bool pci_power_manageable(struct pci_dev *pci_dev)
 	return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3;
 }
 
+static inline bool pcie_downstream_port(const struct pci_dev *dev)
+{
+	int type = pci_pcie_type(dev);
+
+	return type == PCI_EXP_TYPE_ROOT_PORT ||
+	       type == PCI_EXP_TYPE_DOWNSTREAM ||
+	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
+}
+
 int pci_vpd_init(struct pci_dev *dev);
 void pci_vpd_release(struct pci_dev *dev);
 void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev);
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 464f8f92653f..db2d40e44c08 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -904,6 +904,7 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev)
 {
 	struct pcie_link_state *link;
 	int blacklist = !!pcie_aspm_sanity_check(pdev);
+	int type = pci_pcie_type(pdev);
 
 	if (!aspm_support_enabled)
 		return;
@@ -913,15 +914,14 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev)
 
 	/*
 	 * We allocate pcie_link_state for the component on the upstream
-	 * end of a Link, so there's nothing to do unless this device has a
-	 * Link on its secondary side.
+	 * end of a Link, so there's nothing to do unless this device is
+	 * downstream or root port.
 	 */
-	if (!pdev->has_secondary_link)
+	if (type != PCI_EXP_TYPE_ROOT_PORT && type != PCI_EXP_TYPE_DOWNSTREAM)
 		return;
 
 	/* VIA has a strange chipset, root port is under a bridge */
-	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT &&
-	    pdev->bus->self)
+	if (type == PCI_EXP_TYPE_ROOT_PORT && pdev->bus->self)
 		return;
 
 	down_read(&pci_bus_sem);
@@ -1070,7 +1070,8 @@ static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
 	if (!pci_is_pcie(pdev))
 		return 0;
 
-	if (pdev->has_secondary_link)
+	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT ||
+	    pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)
 		parent = pdev;
 	if (!parent || !parent->link_state)
 		return -EINVAL;
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 773197a12568..b0e6048a9208 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -166,7 +166,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
 	driver = pcie_port_find_service(dev, service);
 	if (driver && driver->reset_link) {
 		status = driver->reset_link(dev);
-	} else if (dev->has_secondary_link) {
+	} else if (pcie_downstream_port(dev)) {
 		status = default_reset_link(dev);
 	} else {
 		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a3c7338fad86..983a5612c548 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1431,26 +1431,40 @@ void set_pcie_port_type(struct pci_dev *pdev)
 	pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
 	pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
 
+	parent = pci_upstream_bridge(pdev);
+	if (!parent)
+		return;
+
 	/*
-	 * A Root Port or a PCI-to-PCIe bridge is always the upstream end
-	 * of a Link.  No PCIe component has two Links.  Two Links are
-	 * connected by a Switch that has a Port on each Link and internal
-	 * logic to connect the two Ports.
+	 * Some systems do not identify their upstream/downstream ports
+	 * correctly so detect impossible configurations here and correct
+	 * the port type accordingly.
 	 */
 	type = pci_pcie_type(pdev);
-	if (type == PCI_EXP_TYPE_ROOT_PORT ||
-	    type == PCI_EXP_TYPE_PCIE_BRIDGE)
-		pdev->has_secondary_link = 1;
-	else if (type == PCI_EXP_TYPE_UPSTREAM ||
-		 type == PCI_EXP_TYPE_DOWNSTREAM) {
-		parent = pci_upstream_bridge(pdev);
-
+	if (type == PCI_EXP_TYPE_DOWNSTREAM) {
 		/*
-		 * Usually there's an upstream device (Root Port or Switch
-		 * Downstream Port), but we can't assume one exists.
+		 * If pdev claims to be downstream port but the parent
+		 * device is also downstream port assume pdev is actually
+		 * upstream port.
 		 */
-		if (parent && !parent->has_secondary_link)
-			pdev->has_secondary_link = 1;
+		if (pcie_downstream_port(parent)) {
+			dev_info(&pdev->dev,
+				"claims to be downstream port but is acting as upstream port, correcting type\n");
+			pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
+			pdev->pcie_flags_reg |= PCI_EXP_TYPE_UPSTREAM;
+		}
+	} else if (type == PCI_EXP_TYPE_UPSTREAM) {
+		/*
+		 * If pdev claims to be upstream port but the parent
+		 * device is also upstream port assume pdev is actually
+		 * downstream port.
+		 */
+		if (pci_pcie_type(parent) == PCI_EXP_TYPE_UPSTREAM) {
+			dev_info(&pdev->dev,
+				"claims to be upstream port but is acting as downstream port, correcting type\n");
+			pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
+			pdev->pcie_flags_reg |= PCI_EXP_TYPE_DOWNSTREAM;
+		}
 	}
 }
 
@@ -2764,12 +2778,8 @@ static int only_one_child(struct pci_bus *bus)
 	 * A PCIe Downstream Port normally leads to a Link with only Device
 	 * 0 on it (PCIe spec r3.1, sec 7.3.1).  As an optimization, scan
 	 * only for Device 0 in that situation.
-	 *
-	 * Checking has_secondary_link is a hack to identify Downstream
-	 * Ports because sometimes Switches are configured such that the
-	 * PCIe Port Type labels are backwards.
 	 */
-	if (bridge && pci_is_pcie(bridge) && bridge->has_secondary_link)
+	if (bridge && pci_is_pcie(bridge) && pcie_downstream_port(bridge))
 		return 1;
 
 	return 0;
diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c
index 5acd9c02683a..9ae9fb9339e8 100644
--- a/drivers/pci/vc.c
+++ b/drivers/pci/vc.c
@@ -13,6 +13,8 @@
 #include <linux/pci_regs.h>
 #include <linux/types.h>
 
+#include "pci.h"
+
 /**
  * pci_vc_save_restore_dwords - Save or restore a series of dwords
  * @dev: device
@@ -105,7 +107,7 @@ static void pci_vc_enable(struct pci_dev *dev, int pos, int res)
 	struct pci_dev *link = NULL;
 
 	/* Enable VCs from the downstream device */
-	if (!dev->has_secondary_link)
+	if (!pci_is_pcie(dev) || !pcie_downstream_port(dev))
 		return;
 
 	ctrl_pos = pos + PCI_VC_RES_CTRL + (res * PCI_CAP_VC_PER_VC_SIZEOF);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 82e4cd1b7ac3..2f8990246316 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -418,7 +418,6 @@ struct pci_dev {
 	unsigned int	broken_intx_masking:1;	/* INTx masking can't be used */
 	unsigned int	io_window_1k:1;		/* Intel bridge 1K I/O windows */
 	unsigned int	irq_managed:1;
-	unsigned int	has_secondary_link:1;
 	unsigned int	non_compliant_bars:1;	/* Broken BARs; ignore them */
 	unsigned int	is_probed:1;		/* Device probing in progress */
 	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
Bjorn Helgaas Aug. 21, 2019, 7:01 p.m. UTC | #12
On Wed, Aug 21, 2019 at 05:37:51PM +0300, Mika Westerberg wrote:
> On Wed, Aug 21, 2019 at 10:28:37AM +0300, Mika Westerberg wrote:
> > > If we see a type of PCI_EXP_TYPE_ROOT_PORT or
> > > PCI_EXP_TYPE_PCIE_BRIDGE, I think we have to assume that's accurate
> > > (which we already do today -- for those types, we assume the device
> > > has a secondary link).
> > > 
> > > For a device that claims to be PCI_EXP_TYPE_DOWNSTREAM, if a parent
> > > device exists and is a Downstream Port (Root Port, Switch Downstream
> > > Port, and I suppose a PCI-to-PCIe bridge (this is basically
> > > pcie_downstream_port()), this device must actually be acting as a
> > > PCI_EXP_TYPE_UPSTREAM device.
> > > 
> > > If a device claiming to be PCI_EXP_TYPE_UPSTREAM has a parent that is
> > > PCI_EXP_TYPE_UPSTREAM, this device must actually be a
> > > PCI_EXP_TYPE_DOWNSTREAM port.
> > > 
> > > For PCI_EXP_TYPE_DOWNSTREAM and PCI_EXP_TYPE_UPSTREAM devices that
> > > don't have parents, we just have to assume they advertise the correct
> > > type (as we do today).  There are sparc and virtualization configs
> > > like this.
> > 
> > OK, thanks for the details. I'll try to make patch based on the above.
> 
> Something like the below patch? Only compile tested for now but I will
> split it into a proper patch series and give it some testing if this is
> what you were after.

This is great, thanks a lot for cleaning this up!  I feel bad for
having introduced the has_secondary_link hack in the first place, so
I'm really glad you're fixing it up.

A couple minor nits below.

> diff --git a/drivers/pci/access.c b/drivers/pci/access.c
> index 544922f097c0..2fccb5762c76 100644
> --- a/drivers/pci/access.c
> +++ b/drivers/pci/access.c
> @@ -336,15 +336,6 @@ static inline int pcie_cap_version(const struct pci_dev *dev)
>  	return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS;
>  }
>  
> -static bool pcie_downstream_port(const struct pci_dev *dev)
> -{
> -	int type = pci_pcie_type(dev);
> -
> -	return type == PCI_EXP_TYPE_ROOT_PORT ||
> -	       type == PCI_EXP_TYPE_DOWNSTREAM ||
> -	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
> -}
> -
>  bool pcie_cap_has_lnkctl(const struct pci_dev *dev)
>  {
>  	int type = pci_pcie_type(dev);
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 9ac50710f1d4..3c0672f1dfe7 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -3577,7 +3577,7 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask)
>  		}
>  
>  		/* Ensure upstream ports don't block AtomicOps on egress */
> -		if (!bridge->has_secondary_link) {
> +		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) {
>  			pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2,
>  						   &ctl2);
>  			if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK)
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 9a83fcf612ca..ae8d839dca4f 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -118,6 +118,15 @@ static inline bool pci_power_manageable(struct pci_dev *pci_dev)
>  	return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3;
>  }
>  
> +static inline bool pcie_downstream_port(const struct pci_dev *dev)
> +{
> +	int type = pci_pcie_type(dev);
> +
> +	return type == PCI_EXP_TYPE_ROOT_PORT ||
> +	       type == PCI_EXP_TYPE_DOWNSTREAM ||
> +	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
> +}
> +
>  int pci_vpd_init(struct pci_dev *dev);
>  void pci_vpd_release(struct pci_dev *dev);
>  void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev);
> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index 464f8f92653f..db2d40e44c08 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -904,6 +904,7 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev)
>  {
>  	struct pcie_link_state *link;
>  	int blacklist = !!pcie_aspm_sanity_check(pdev);
> +	int type = pci_pcie_type(pdev);
>  
>  	if (!aspm_support_enabled)
>  		return;
> @@ -913,15 +914,14 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev)
>  
>  	/*
>  	 * We allocate pcie_link_state for the component on the upstream
> -	 * end of a Link, so there's nothing to do unless this device has a
> -	 * Link on its secondary side.
> +	 * end of a Link, so there's nothing to do unless this device is
> +	 * downstream or root port.
>  	 */
> -	if (!pdev->has_secondary_link)
> +	if (type != PCI_EXP_TYPE_ROOT_PORT && type != PCI_EXP_TYPE_DOWNSTREAM)

I think this should use pcie_downstream_port(), since
PCI_EXP_TYPE_PCIE_BRIDGE is a Downstream Port, and
set_pcie_port_type() did set dev->has_secondary_link for PCIE_BRIDGE.

>  		return;
>  
>  	/* VIA has a strange chipset, root port is under a bridge */
> -	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT &&
> -	    pdev->bus->self)
> +	if (type == PCI_EXP_TYPE_ROOT_PORT && pdev->bus->self)
>  		return;
>  
>  	down_read(&pci_bus_sem);
> @@ -1070,7 +1070,8 @@ static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
>  	if (!pci_is_pcie(pdev))
>  		return 0;
>  
> -	if (pdev->has_secondary_link)
> +	if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT ||
> +	    pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)

Same here.

>  		parent = pdev;
>  	if (!parent || !parent->link_state)
>  		return -EINVAL;
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index 773197a12568..b0e6048a9208 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -166,7 +166,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
>  	driver = pcie_port_find_service(dev, service);
>  	if (driver && driver->reset_link) {
>  		status = driver->reset_link(dev);
> -	} else if (dev->has_secondary_link) {
> +	} else if (pcie_downstream_port(dev)) {
>  		status = default_reset_link(dev);
>  	} else {
>  		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index a3c7338fad86..983a5612c548 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -1431,26 +1431,40 @@ void set_pcie_port_type(struct pci_dev *pdev)
>  	pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
>  	pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
>  
> +	parent = pci_upstream_bridge(pdev);
> +	if (!parent)
> +		return;
> +
>  	/*
> -	 * A Root Port or a PCI-to-PCIe bridge is always the upstream end
> -	 * of a Link.  No PCIe component has two Links.  Two Links are
> -	 * connected by a Switch that has a Port on each Link and internal
> -	 * logic to connect the two Ports.
> +	 * Some systems do not identify their upstream/downstream ports
> +	 * correctly so detect impossible configurations here and correct
> +	 * the port type accordingly.
>  	 */
>  	type = pci_pcie_type(pdev);
> -	if (type == PCI_EXP_TYPE_ROOT_PORT ||
> -	    type == PCI_EXP_TYPE_PCIE_BRIDGE)
> -		pdev->has_secondary_link = 1;
> -	else if (type == PCI_EXP_TYPE_UPSTREAM ||
> -		 type == PCI_EXP_TYPE_DOWNSTREAM) {
> -		parent = pci_upstream_bridge(pdev);
> -
> +	if (type == PCI_EXP_TYPE_DOWNSTREAM) {
>  		/*
> -		 * Usually there's an upstream device (Root Port or Switch
> -		 * Downstream Port), but we can't assume one exists.
> +		 * If pdev claims to be downstream port but the parent
> +		 * device is also downstream port assume pdev is actually
> +		 * upstream port.
>  		 */
> -		if (parent && !parent->has_secondary_link)
> -			pdev->has_secondary_link = 1;
> +		if (pcie_downstream_port(parent)) {
> +			dev_info(&pdev->dev,
> +				"claims to be downstream port but is acting as upstream port, correcting type\n");

You can use pci_info() here.  Since the text wraps anyway, I'd just
remove the line break and put it all on one line.

> +			pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
> +			pdev->pcie_flags_reg |= PCI_EXP_TYPE_UPSTREAM;
> +		}
> +	} else if (type == PCI_EXP_TYPE_UPSTREAM) {
> +		/*
> +		 * If pdev claims to be upstream port but the parent
> +		 * device is also upstream port assume pdev is actually
> +		 * downstream port.
> +		 */
> +		if (pci_pcie_type(parent) == PCI_EXP_TYPE_UPSTREAM) {
> +			dev_info(&pdev->dev,
> +				"claims to be upstream port but is acting as downstream port, correcting type\n");

And here.

> +			pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
> +			pdev->pcie_flags_reg |= PCI_EXP_TYPE_DOWNSTREAM;
> +		}
>  	}
>  }
>  
> @@ -2764,12 +2778,8 @@ static int only_one_child(struct pci_bus *bus)
>  	 * A PCIe Downstream Port normally leads to a Link with only Device
>  	 * 0 on it (PCIe spec r3.1, sec 7.3.1).  As an optimization, scan
>  	 * only for Device 0 in that situation.
> -	 *
> -	 * Checking has_secondary_link is a hack to identify Downstream
> -	 * Ports because sometimes Switches are configured such that the
> -	 * PCIe Port Type labels are backwards.
>  	 */
> -	if (bridge && pci_is_pcie(bridge) && bridge->has_secondary_link)
> +	if (bridge && pci_is_pcie(bridge) && pcie_downstream_port(bridge))
>  		return 1;
>  
>  	return 0;
> diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c
> index 5acd9c02683a..9ae9fb9339e8 100644
> --- a/drivers/pci/vc.c
> +++ b/drivers/pci/vc.c
> @@ -13,6 +13,8 @@
>  #include <linux/pci_regs.h>
>  #include <linux/types.h>
>  
> +#include "pci.h"
> +
>  /**
>   * pci_vc_save_restore_dwords - Save or restore a series of dwords
>   * @dev: device
> @@ -105,7 +107,7 @@ static void pci_vc_enable(struct pci_dev *dev, int pos, int res)
>  	struct pci_dev *link = NULL;
>  
>  	/* Enable VCs from the downstream device */
> -	if (!dev->has_secondary_link)
> +	if (!pci_is_pcie(dev) || !pcie_downstream_port(dev))
>  		return;
>  
>  	ctrl_pos = pos + PCI_VC_RES_CTRL + (res * PCI_CAP_VC_PER_VC_SIZEOF);
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 82e4cd1b7ac3..2f8990246316 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -418,7 +418,6 @@ struct pci_dev {
>  	unsigned int	broken_intx_masking:1;	/* INTx masking can't be used */
>  	unsigned int	io_window_1k:1;		/* Intel bridge 1K I/O windows */
>  	unsigned int	irq_managed:1;
> -	unsigned int	has_secondary_link:1;
>  	unsigned int	non_compliant_bars:1;	/* Broken BARs; ignore them */
>  	unsigned int	is_probed:1;		/* Device probing in progress */
>  	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 8bfee557e50e..c93d6b9ab580 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -324,6 +324,14 @@  Description:
 		This is similar to /sys/bus/pci/drivers_autoprobe, but
 		affects only the VFs associated with a specific PF.
 
+What:		/sys/bus/pci/devices/.../link_disable
+Date:		September 2019
+Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
+Description:
+		PCIe root and downstream ports have this attribute. Writing
+		1 causes the link to downstream component be disabled.
+		Re-enabling the link happens by writing 0 instead.
+
 What:		/sys/bus/pci/devices/.../p2pmem/size
 Date:		November 2017
 Contact:	Logan Gunthorpe <logang@deltatee.com>
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 6d27475e39b2..dfcd21745192 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -218,6 +218,56 @@  static ssize_t current_link_width_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(current_link_width);
 
+static ssize_t link_disable_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	u16 linkctl;
+	int ret;
+
+	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
+	if (ret)
+		return -EINVAL;
+
+	return sprintf(buf, "%d\n", !!(linkctl & PCI_EXP_LNKCTL_LD));
+}
+
+static ssize_t link_disable_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	u16 linkctl;
+	bool disable;
+	int ret;
+
+	ret = kstrtobool(buf, &disable);
+	if (ret)
+		return ret;
+
+	ret = pcie_capability_read_word(pci_dev, PCI_EXP_LNKCTL, &linkctl);
+	if (ret)
+		return -EINVAL;
+
+	if (disable) {
+		if (linkctl & PCI_EXP_LNKCTL_LD)
+			goto out;
+		linkctl |= PCI_EXP_LNKCTL_LD;
+	} else {
+		if (!(linkctl & PCI_EXP_LNKCTL_LD))
+			goto out;
+		linkctl &= ~PCI_EXP_LNKCTL_LD;
+	}
+
+	ret = pcie_capability_write_word(pci_dev, PCI_EXP_LNKCTL, linkctl);
+	if (ret)
+		return ret;
+
+out:
+	return count;
+}
+static DEVICE_ATTR_RW(link_disable);
+
 static ssize_t secondary_bus_number_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -785,6 +835,7 @@  static struct attribute *pcie_dev_attrs[] = {
 	&dev_attr_current_link_width.attr,
 	&dev_attr_max_link_width.attr,
 	&dev_attr_max_link_speed.attr,
+	&dev_attr_link_disable.attr,
 	NULL,
 };
 
@@ -1656,8 +1707,20 @@  static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct pci_dev *pdev = to_pci_dev(dev);
 
-	if (pci_is_pcie(pdev))
+	if (pci_is_pcie(pdev)) {
+		if (a == &dev_attr_link_disable.attr) {
+			switch (pci_pcie_type(pdev)) {
+			case PCI_EXP_TYPE_ROOT_PORT:
+			case PCI_EXP_TYPE_DOWNSTREAM:
+				break;
+
+			default:
+				return 0;
+			}
+		}
+
 		return a->mode;
+	}
 
 	return 0;
 }