diff mbox

[v2,2/5] x86/PCI: Support additional MMIO range capabilities

Message ID 20140419025323.2408.88764.stgit@amt.stowe (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Myron Stowe April 19, 2014, 2:53 a.m. UTC
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

This patch adds supports for additional MMIO ranges (16 ranges).  Also,
each MMIO base/limit can now support up to 48-bit MMIO addresses.
However, this requires initializing the ECS sooner since the new registers
are in the ECS ranges.

This applies for AMD Family 15h and later.

Reference: Advanced Micro Devices (AMD), "BIOS and Kernel Developer's
Guide (BKDG) for AMD Family 15h Models 00h-0fh Processors."  Section 3.4
Device [1F:18]h Function 1 Configuration Registers;
  D18F1x[1CC:180,BC:80] MMIO Base/Limit,
  D18F1x[D8,D0,C8,C0] IO-Space Base,
  D18F1x[DC,D4,CC,C4] IO-Space Limit,
42301 Rev 3.12 - October 11, 2012.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
---

 arch/x86/pci/amd_bus.c |  126 +++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 103 insertions(+), 23 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Borislav Petkov April 19, 2014, 1:52 p.m. UTC | #1
On Fri, Apr 18, 2014 at 08:53:23PM -0600, Myron Stowe wrote:
> From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> 
> This patch adds supports for additional MMIO ranges (16 ranges).  Also,
> each MMIO base/limit can now support up to 48-bit MMIO addresses.
> However, this requires initializing the ECS sooner since the new registers
> are in the ECS ranges.
> 
> This applies for AMD Family 15h and later.
> 
> Reference: Advanced Micro Devices (AMD), "BIOS and Kernel Developer's
> Guide (BKDG) for AMD Family 15h Models 00h-0fh Processors."  Section 3.4
> Device [1F:18]h Function 1 Configuration Registers;
>   D18F1x[1CC:180,BC:80] MMIO Base/Limit,
>   D18F1x[D8,D0,C8,C0] IO-Space Base,
>   D18F1x[DC,D4,CC,C4] IO-Space Limit,
> 42301 Rev 3.12 - October 11, 2012.
> 
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
> Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
> ---
> 
>  arch/x86/pci/amd_bus.c |  126 +++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 103 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
> index c8cd75c..1b2895e 100644
> --- a/arch/x86/pci/amd_bus.c
> +++ b/arch/x86/pci/amd_bus.c
> @@ -13,11 +13,30 @@
>  
>  #define AMD_NB_F0_NODE_ID			0x60
>  #define AMD_NB_F0_UNIT_ID			0x64
> +#define AMD_NB_F1_MMIO_BASE_REG			0x80
> +#define AMD_NB_F1_MMIO_LIMIT_REG		0x84
> +#define AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG	0x180
> +#define AMD_NB_F1_IO_BASE_ADDR_REG		0xc0
> +#define AMD_NB_F1_IO_LIMIT_ADDR_REG		0xc4
>  #define AMD_NB_F1_CONFIG_MAP_REG		0xe0
>  
>  #define RANGE_NUM				16
> +#define AMD_NB_F1_MMIO_RANGES			16
> +#define AMD_NB_F1_IOPORT_RANGES			4
>  #define AMD_NB_F1_CONFIG_MAP_RANGES		4
>  
> +#define AMD_PCIE_CF8(bus, dev, fn, reg) \
> +			(0x80000000 | \
> +			((reg & 0xF00) << 16) | \
> +			((bus & 0xF) << 16) | \
> +			((dev & 0x1F) << 11) | \
> +			((fn & 0x7) << 8) | \
> +			((reg & 0xFC)))
> +
> +static bool amd_early_ecs_enabled;
> +
> +static int __init pci_io_ecs_init(u8 bus, u8 slot);

Please move the function above its caller instead of adding a forward
declaration.

> +
>  /*
>   * This discovers the pcibus <-> node mapping on AMD K8, Family 10h, 12h,
>   * 14h, 15h, and 16h processor based systems.  This also gets peer root bus
> @@ -39,6 +58,20 @@ static struct amd_hostbridge hb_probes[] __initdata = {
>  	{ 0xff, 0   , PCI_DEVICE_ID_AMD_10H_NB_HT },
>  };
>  
> +/* This version of read_pci_config allows reading registers in the ECS area */
> +static inline int _amd_read_pci_config(u8 bus, u8 slot, u8 fn, u32 offset)
> +{
> +	u32 value;
> +
> +	if ((!amd_early_ecs_enabled) && (offset > 0xFF))
> +		return -1;
> +
> +	outl(AMD_PCIE_CF8(bus, slot, fn, offset), 0xcf8);
> +	value = inl(0xcfc);
> +
> +	return value;
> +
>  static struct pci_root_info __init *find_pci_root_info(int node, int link)
>  {
>  	struct pci_root_info *info;
> @@ -48,6 +81,9 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
>  		if (info->node == node && info->link == link)
>  			return info;
>  
> +	pr_warn("AMD-Bus: WARNING: Failed to find root info for node %#x, link %#x\n",

Prefixes like "AMD-Bus" are done using pr_fmt.

> +		node, link);
> +
>  	return NULL;
>  }
>  
> @@ -118,6 +154,12 @@ static int __init early_fill_mp_bus_info(void)
>  
>  	pr_debug("Found AMD hostbridge at %x:%x.0\n", bus, slot);
>  
> +	/* We enable ECS mode prior to probing MMIO as MMIO related registers
> +	 * are in the ECS area.
> +	 */
> +	pci_io_ecs_init(bus, slot);

Well, if enabling the ECS failed somehow, you probably want to save
yourself the _amd_read_pci_config() calls further down.

Which means:

* you should propagate an error code from that pci_io_ecs_init() function

* you can carve out the huge loop iterating over AMD_NB_F1_MMIO_RANGES
along with the pci ECS enablement call to pci_io_ecs_init into a
separate function and thus slim down that huge early_fill_mp_bus_info()
monster a bit.

> +
> +	found = false;
>  	for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
>  		int min_bus;
>  		int max_bus;
> @@ -128,6 +170,7 @@ static int __init early_fill_mp_bus_info(void)
>  		if ((reg & 7) != 3)
>  			continue;
>  
> +		found = true;
>  		min_bus = (reg >> 16) & 0xff;
>  		max_bus = (reg >> 24) & 0xff;
>  		node = (reg >> 4) & 0x07;
> @@ -136,6 +179,13 @@ static int __init early_fill_mp_bus_info(void)
>  		info = alloc_pci_root_info(min_bus, max_bus, node, link);
>  	}
>  
> +	if (!found) {
> +		/* In case there is no AMDNB_F1_CONFIG_MAP_REGs,
> +		 * we just use default to bus 0, node 0 link 0)
> +		 */
> +		info = alloc_pci_root_info(0, 0, 0, 0);
> +	}

No need for curly braces around a single statement - simply put the
comment before the if (...).

> +
>  	/* get the default node and link for left over res */
>  	reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
>  	def_node = (reg >> 8) & 0x07;
> @@ -144,14 +194,17 @@ static int __init early_fill_mp_bus_info(void)
>  
>  	memset(range, 0, sizeof(range));
>  	add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
> +
>  	/* io port resource */
> -	for (i = 0; i < 4; i++) {
> -		reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
> +	for (i = 0; i < AMD_NB_F1_IOPORT_RANGES; i++) {
> +		reg = read_pci_config(bus, slot, 1,
> +				AMD_NB_F1_IO_BASE_ADDR_REG + (i << 3));
>  		if (!(reg & 3))
>  			continue;
>  
>  		start = reg & 0xfff000;
> -		reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3));
> +		reg = read_pci_config(bus, slot, 1,
> +				AMD_NB_F1_IO_LIMIT_ADDR_REG + (i << 3));
>  		node = reg & 0x07;
>  		link = (reg >> 4) & 0x03;
>  		end = (reg & 0xfff000) | 0xfff;
> @@ -169,6 +222,7 @@ static int __init early_fill_mp_bus_info(void)
>  		update_res(info, start, end, IORESOURCE_IO, 1);
>  		subtract_range(range, RANGE_NUM, start, end + 1);
>  	}
> +
>  	/* add left over io port range to def node/link, [0, 0xffff] */
>  	/* find the position */
>  	info = find_pci_root_info(def_node, def_link);
> @@ -211,22 +265,46 @@ static int __init early_fill_mp_bus_info(void)
>  	}
>  
>  	/* mmio resource */
> -	for (i = 0; i < 8; i++) {
> -		reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3));
> +	for (i = 0; i < AMD_NB_F1_MMIO_RANGES; i++) {
> +		u64 tmp;
> +		u32 base = AMD_NB_F1_MMIO_BASE_REG + (i << 3);
> +		u32 limit = AMD_NB_F1_MMIO_LIMIT_REG + (i << 3);
> +		u32 base_limit_hi = AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG + (i << 2);
> +
> +		if (i >= 12) {
> +			/* Range 12 base/limit starts at 0x2A0 */
> +			base += 0x1c0;
> +			limit += 0x1c0;
> +			/* Range 12 base/limit hi starts at 0x2C0 */
> +			base_limit_hi += 0x110;
> +		} else if (i >= 8) {
> +			/* Range 8 base/limit starts at 0x1A0 */
> +			base += 0xe0;
> +			limit += 0xe0;
> +			/* Range 12 base/limit hi starts at 0x1C0 */
> +			base_limit_hi += 0x20;
> +		}
> +
> +		/* Base lo */
> +		reg = _amd_read_pci_config(bus, slot, 1, base);
>  		if (!(reg & 3))
>  			continue;
>  
> -		start = reg & 0xffffff00; /* 39:16 on 31:8*/
> -		start <<= 8;
> -		reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
> +		start = (reg & 0xffffff00UL) << 8;	/* 39:16 on 31:8 */
> +
> +		/* Limit lo */
> +		reg = _amd_read_pci_config(bus, slot, 1, limit);
>  		node = reg & 0x07;
>  		link = (reg >> 4) & 0x03;
> -		end = (reg & 0xffffff00);
> -		end <<= 8;
> -		end |= 0xffff;
> +		end = (reg & 0xffffff00UL) << 8;	/* 39:16 on 31:8 */
> +		end |= 0xffffUL;
>  
> -		info = find_pci_root_info(node, link);
> +		/* Base/Limit hi */
> +		tmp = _amd_read_pci_config(bus, slot, 1, base_limit_hi);
> +		start |= ((tmp & 0xffUL) << 40);
> +		end |= ((tmp & (0xffUL << 16)) << 24);
>  
> +		info = find_pci_root_info(node, link);
>  		if (!info)
>  			continue;
>  
> @@ -354,20 +432,24 @@ static struct notifier_block amd_cpu_notifier = {
>  	.notifier_call	= amd_cpu_notify,
>  };
>  
> -static void __init pci_enable_pci_io_ecs(void)
> +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
>  {
>  #ifdef CONFIG_AMD_NB
>  	unsigned int i, n;
> +	u8 limit;
>  
>  	for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
> -		u8 bus = amd_nb_bus_dev_ranges[i].bus;
> -		u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
> -		u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
> +		/* Try matching for the bus range */
> +		if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
> +		    (slot != amd_nb_bus_dev_ranges[i].dev_base))
> +			continue;
> +
> +		limit = amd_nb_bus_dev_ranges[i].dev_limit;
>  
> +		/* Setup all northbridges within the range */
>  		for (; slot < limit; ++slot) {
>  			u32 val = read_pci_config(bus, slot, 3, 0);
> -
> -			if (!early_is_amd_nb(val))
> +			if (!val)
>  				continue;
>  
>  			val = read_pci_config(bus, slot, 3, 0x8c);
> @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
>  				val |= ENABLE_CF8_EXT_CFG >> 32;

What a fun shifting!

Maybe you should do

#define ENABLE_CF8_EXT_CFG	BIT(46 - 32)

to show exactly what you mean and how the bit is defined in MSR NB_CFG1
and also show how the high 32-bits are mapped into F3x8c, while at it.

And then you can drop the shifting at the call site.


>  				write_pci_config(bus, slot, 3, 0x8c, val);
>  			}
> +			amd_early_ecs_enabled = true;

You probably should check whether the PCI config write stuck before
setting this.

>  			++n;
>  		}
>  	}
>  #endif
>  }
>  
> -static int __init pci_io_ecs_init(void)
> +static int __init pci_io_ecs_init(u8 bus, u8 slot)
>  {
>  	int cpu;
>  
> @@ -389,9 +472,7 @@ static int __init pci_io_ecs_init(void)
>          if (boot_cpu_data.x86 < 0x10)
>  		return 0;
>  
> -	/* Try the PCI method first. */
> -	if (early_pci_allowed())
> -		pci_enable_pci_io_ecs();

Where did the if-check go?

> +	pci_enable_pci_io_ecs(bus, slot);
>  
>  	cpu_notifier_register_begin();
>  	for_each_online_cpu(cpu)
> @@ -411,7 +492,6 @@ static int __init amd_postcore_init(void)
>  		return 0;
>  
>  	early_fill_mp_bus_info();
> -	pci_io_ecs_init();
>  
>  	return 0;
>  }
>
Borislav Petkov April 20, 2014, 7:59 a.m. UTC | #2
Drop Andreas' old email address from CC as it keeps bouncing.

On Sat, Apr 19, 2014 at 03:52:20PM +0200, Borislav Petkov wrote:
> > -static void __init pci_enable_pci_io_ecs(void)
> > +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
> >  {
> >  #ifdef CONFIG_AMD_NB
> >  	unsigned int i, n;
> > +	u8 limit;
> >  
> >  	for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
> > -		u8 bus = amd_nb_bus_dev_ranges[i].bus;
> > -		u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
> > -		u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
> > +		/* Try matching for the bus range */
> > +		if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
> > +		    (slot != amd_nb_bus_dev_ranges[i].dev_base))
> > +			continue;
> > +
> > +		limit = amd_nb_bus_dev_ranges[i].dev_limit;
> >  
> > +		/* Setup all northbridges within the range */
> >  		for (; slot < limit; ++slot) {
> >  			u32 val = read_pci_config(bus, slot, 3, 0);
> > -
> > -			if (!early_is_amd_nb(val))
> > +			if (!val)
> >  				continue;
> >  
> >  			val = read_pci_config(bus, slot, 3, 0x8c);
> > @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
> >  				val |= ENABLE_CF8_EXT_CFG >> 32;
> 
> What a fun shifting!
> 
> Maybe you should do
> 
> #define ENABLE_CF8_EXT_CFG	BIT(46 - 32)
> 
> to show exactly what you mean and how the bit is defined in MSR NB_CFG1
> and also show how the high 32-bits are mapped into F3x8c, while at it.
> 
> And then you can drop the shifting at the call site.

Ok, I see another fun with this ECS enabling:

There's a enable_pci_io_ecs() which enables ECS through the NB_CFG MSR
which is called as part of the notifier *and* there's a PCI write to
that same bit in pci_enable_pci_io_ecs() which iterates over all NBs.

So, AFAICT, we do it twice and the second time is not needed. Which
means, you probably can drop pci_enable_pci_io_ecs() completely and use
solely the notifier?

Yes, no?
Myron Stowe April 25, 2014, 10:24 p.m. UTC | #3
On Sun, Apr 20, 2014 at 1:59 AM, Borislav Petkov <bp@suse.de> wrote:
> Drop Andreas' old email address from CC as it keeps bouncing.
>
> On Sat, Apr 19, 2014 at 03:52:20PM +0200, Borislav Petkov wrote:
>> > -static void __init pci_enable_pci_io_ecs(void)
>> > +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
>> >  {
>> >  #ifdef CONFIG_AMD_NB
>> >     unsigned int i, n;
>> > +   u8 limit;
>> >
>> >     for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
>> > -           u8 bus = amd_nb_bus_dev_ranges[i].bus;
>> > -           u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
>> > -           u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
>> > +           /* Try matching for the bus range */
>> > +           if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
>> > +               (slot != amd_nb_bus_dev_ranges[i].dev_base))
>> > +                   continue;
>> > +
>> > +           limit = amd_nb_bus_dev_ranges[i].dev_limit;
>> >
>> > +           /* Setup all northbridges within the range */
>> >             for (; slot < limit; ++slot) {
>> >                     u32 val = read_pci_config(bus, slot, 3, 0);
>> > -
>> > -                   if (!early_is_amd_nb(val))
>> > +                   if (!val)
>> >                             continue;
>> >
>> >                     val = read_pci_config(bus, slot, 3, 0x8c);
>> > @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
>> >                             val |= ENABLE_CF8_EXT_CFG >> 32;
>>
>> What a fun shifting!
>>
>> Maybe you should do
>>
>> #define ENABLE_CF8_EXT_CFG    BIT(46 - 32)
>>
>> to show exactly what you mean and how the bit is defined in MSR NB_CFG1
>> and also show how the high 32-bits are mapped into F3x8c, while at it.
>>
>> And then you can drop the shifting at the call site.
>
> Ok, I see another fun with this ECS enabling:
>
> There's a enable_pci_io_ecs() which enables ECS through the NB_CFG MSR
> which is called as part of the notifier *and* there's a PCI write to
> that same bit in pci_enable_pci_io_ecs() which iterates over all NBs.
>
> So, AFAICT, we do it twice and the second time is not needed. Which
> means, you probably can drop pci_enable_pci_io_ecs() completely and use
> solely the notifier?

It does look as if there is some duplication with respect to setting
MSR_AMD64_NB_CFG's (which is aliased at D18F3x8c [1])
ENABLE_CF8_EXT_CFG enable bit but there are at least a couple of
differences.

enable_pci_io_ecs() only sets the bit on one NB whereas
pci_enable_pci_io_ecs iterates over all the NBs (as you mentioned
above).  The other difference has something to do with Xen; see the
origin of pci_enable_pci_io_ecs - commit 24d9b70b8.

>
> Yes, no?

Suravee, Kim - either of you want to chime in here?


[1] Read-write; Per-node. MSRC001_001F[31:0] is an alias of D18F3x88.
MSRC001_001F[63:32] is an alias of
D18F3x8C.

Myron
>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov April 26, 2014, 9:10 a.m. UTC | #4
+ Robert.

On Fri, Apr 25, 2014 at 04:24:31PM -0600, Myron Stowe wrote:
> On Sun, Apr 20, 2014 at 1:59 AM, Borislav Petkov <bp@suse.de> wrote:
> > Drop Andreas' old email address from CC as it keeps bouncing.
> >
> > On Sat, Apr 19, 2014 at 03:52:20PM +0200, Borislav Petkov wrote:
> >> > -static void __init pci_enable_pci_io_ecs(void)
> >> > +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
> >> >  {
> >> >  #ifdef CONFIG_AMD_NB
> >> >     unsigned int i, n;
> >> > +   u8 limit;
> >> >
> >> >     for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
> >> > -           u8 bus = amd_nb_bus_dev_ranges[i].bus;
> >> > -           u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
> >> > -           u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
> >> > +           /* Try matching for the bus range */
> >> > +           if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
> >> > +               (slot != amd_nb_bus_dev_ranges[i].dev_base))
> >> > +                   continue;
> >> > +
> >> > +           limit = amd_nb_bus_dev_ranges[i].dev_limit;
> >> >
> >> > +           /* Setup all northbridges within the range */
> >> >             for (; slot < limit; ++slot) {
> >> >                     u32 val = read_pci_config(bus, slot, 3, 0);
> >> > -
> >> > -                   if (!early_is_amd_nb(val))
> >> > +                   if (!val)
> >> >                             continue;
> >> >
> >> >                     val = read_pci_config(bus, slot, 3, 0x8c);
> >> > @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
> >> >                             val |= ENABLE_CF8_EXT_CFG >> 32;
> >>
> >> What a fun shifting!
> >>
> >> Maybe you should do
> >>
> >> #define ENABLE_CF8_EXT_CFG    BIT(46 - 32)
> >>
> >> to show exactly what you mean and how the bit is defined in MSR NB_CFG1
> >> and also show how the high 32-bits are mapped into F3x8c, while at it.
> >>
> >> And then you can drop the shifting at the call site.
> >
> > Ok, I see another fun with this ECS enabling:
> >
> > There's a enable_pci_io_ecs() which enables ECS through the NB_CFG MSR
> > which is called as part of the notifier *and* there's a PCI write to
> > that same bit in pci_enable_pci_io_ecs() which iterates over all NBs.
> >
> > So, AFAICT, we do it twice and the second time is not needed. Which
> > means, you probably can drop pci_enable_pci_io_ecs() completely and use
> > solely the notifier?
> 
> It does look as if there is some duplication with respect to setting
> MSR_AMD64_NB_CFG's (which is aliased at D18F3x8c [1])
> ENABLE_CF8_EXT_CFG enable bit but there are at least a couple of
> differences.
> 
> enable_pci_io_ecs() only sets the bit on one NB whereas
> pci_enable_pci_io_ecs iterates over all the NBs (as you mentioned
> above).  The other difference has something to do with Xen; see the
> origin of pci_enable_pci_io_ecs - commit 24d9b70b8.

Of course it is xen - what else?! We do have to carry special code in
baremetal just for it because it is special and we all can't seem to get
enough of its crap.

Oh well, I guess we should at least comment this and refer to 24d9b70b8
so that the explanation is right there, in the code.

Thanks.
Bjorn Helgaas April 28, 2014, 8:50 p.m. UTC | #5
[+cc Jan (24d9b70b8 author), Yinghai]

On Sat, Apr 26, 2014 at 3:10 AM, Borislav Petkov <bp@suse.de> wrote:
> + Robert.
>
> On Fri, Apr 25, 2014 at 04:24:31PM -0600, Myron Stowe wrote:
>> On Sun, Apr 20, 2014 at 1:59 AM, Borislav Petkov <bp@suse.de> wrote:
>> > Drop Andreas' old email address from CC as it keeps bouncing.
>> >
>> > On Sat, Apr 19, 2014 at 03:52:20PM +0200, Borislav Petkov wrote:
>> >> > -static void __init pci_enable_pci_io_ecs(void)
>> >> > +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
>> >> >  {
>> >> >  #ifdef CONFIG_AMD_NB
>> >> >     unsigned int i, n;
>> >> > +   u8 limit;
>> >> >
>> >> >     for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
>> >> > -           u8 bus = amd_nb_bus_dev_ranges[i].bus;
>> >> > -           u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
>> >> > -           u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
>> >> > +           /* Try matching for the bus range */
>> >> > +           if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
>> >> > +               (slot != amd_nb_bus_dev_ranges[i].dev_base))
>> >> > +                   continue;
>> >> > +
>> >> > +           limit = amd_nb_bus_dev_ranges[i].dev_limit;
>> >> >
>> >> > +           /* Setup all northbridges within the range */
>> >> >             for (; slot < limit; ++slot) {
>> >> >                     u32 val = read_pci_config(bus, slot, 3, 0);
>> >> > -
>> >> > -                   if (!early_is_amd_nb(val))
>> >> > +                   if (!val)
>> >> >                             continue;
>> >> >
>> >> >                     val = read_pci_config(bus, slot, 3, 0x8c);
>> >> > @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
>> >> >                             val |= ENABLE_CF8_EXT_CFG >> 32;
>> >>
>> >> What a fun shifting!
>> >>
>> >> Maybe you should do
>> >>
>> >> #define ENABLE_CF8_EXT_CFG    BIT(46 - 32)
>> >>
>> >> to show exactly what you mean and how the bit is defined in MSR NB_CFG1
>> >> and also show how the high 32-bits are mapped into F3x8c, while at it.
>> >>
>> >> And then you can drop the shifting at the call site.
>> >
>> > Ok, I see another fun with this ECS enabling:
>> >
>> > There's a enable_pci_io_ecs() which enables ECS through the NB_CFG MSR
>> > which is called as part of the notifier *and* there's a PCI write to
>> > that same bit in pci_enable_pci_io_ecs() which iterates over all NBs.
>> >
>> > So, AFAICT, we do it twice and the second time is not needed. Which
>> > means, you probably can drop pci_enable_pci_io_ecs() completely and use
>> > solely the notifier?
>>
>> It does look as if there is some duplication with respect to setting
>> MSR_AMD64_NB_CFG's (which is aliased at D18F3x8c [1])
>> ENABLE_CF8_EXT_CFG enable bit but there are at least a couple of
>> differences.
>>
>> enable_pci_io_ecs() only sets the bit on one NB whereas
>> pci_enable_pci_io_ecs iterates over all the NBs (as you mentioned
>> above).  The other difference has something to do with Xen; see the
>> origin of pci_enable_pci_io_ecs - commit 24d9b70b8.
>
> Of course it is xen - what else?! We do have to carry special code in
> baremetal just for it because it is special and we all can't seem to get
> enough of its crap.
>
> Oh well, I guess we should at least comment this and refer to 24d9b70b8
> so that the explanation is right there, in the code.

This is probably obvious, but my interest here is to (1) make sure all
systems in the field run well (so we need quirks to work around BIOS
and other issues), and (2) eliminate the need for kernel changes to
support future systems.  So far we seem to be concentrating on (1) and
neglecting (2), which means we're always reacting to things that are
broken.

This I/O ECS thing seems likely to cause future problems.  My
understanding (based on sec 2.8 of [1]) is that enable_pci_io_ecs()
and pci_enable_pci_io_ecs() are there to enable access to extended
config space (offsets 256-4095) via the 0xcf8/0xcfc I/O ports.

Per sec 4.1.1 of [2], we should be using ECAM (the memory-mapped
enhanced configuration mechanism, i.e., MMCONFIG) to access extended
config space, and the BIOS should supply an MCFG table.

So why do we need to enable I/O access to ECS on AMD chips at all?  Is
this a workaround for a broken BIOS that doesn't supply an MCFG table?

From reading the path below, I think raw_pci_read() will use
pci_direct_conf1 for (domain 0 [cfg 0-255]).  For everything else, it
will use (a) pci_mmcfg if there's a valid MCFG or (b) pci_direct_conf1
if there's no MCFG and this is an AMD >= fam10h CPU, i.e.,
PCI_HAS_IO_ECS is set.

    pci_arch_init
      type = pci_direct_probe
      pci_mmcfg_early_init
        __pci_mmcfg_init
          pci_mmcfg_arch_init
            raw_pci_ext_ops = &pci_mmcfg
      pci_direct_init
        if (type == 1)
          raw_pci_ops = &pci_direct_conf1
          if (raw_pci_ext_ops)
            return
          if (!pci_probe & PCI_HAS_IO_ECS)
            return
          raw_pci_ext_ops = &pci_direct_conf1

I think we should try to get rid of amd_bus.c, e.g., only run
amd_postcore_init() for BIOS dates < 2015.  It looks like a crutch
that is perpetuating buggy BIOSes and costing us maintenance effort.
We don't need anything similar for Intel CPUs, and I don't see a
compelling reason why we need it for AMD.

Bjorn

[1] BIOS and Kernel Developer's Guide for AMD Family 15h Models
00h-0Fh Processors Rev 3.14 (document number 42301)
[2] PCI Firmware Specification, Rev 3.0, June 20, 2005
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Myron Stowe April 28, 2014, 9:19 p.m. UTC | #6
On Sat, Apr 19, 2014 at 7:52 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Fri, Apr 18, 2014 at 08:53:23PM -0600, Myron Stowe wrote:
>> From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
>>
>> This patch adds supports for additional MMIO ranges (16 ranges).  Also,
>> each MMIO base/limit can now support up to 48-bit MMIO addresses.
>> However, this requires initializing the ECS sooner since the new registers
>> are in the ECS ranges.
>>
>> This applies for AMD Family 15h and later.
>>
>> Reference: Advanced Micro Devices (AMD), "BIOS and Kernel Developer's
>> Guide (BKDG) for AMD Family 15h Models 00h-0fh Processors."  Section 3.4
>> Device [1F:18]h Function 1 Configuration Registers;
>>   D18F1x[1CC:180,BC:80] MMIO Base/Limit,
>>   D18F1x[D8,D0,C8,C0] IO-Space Base,
>>   D18F1x[DC,D4,CC,C4] IO-Space Limit,
>> 42301 Rev 3.12 - October 11, 2012.
>>
>> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
>> Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
>> Tested-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
>> ---
>>
>>  arch/x86/pci/amd_bus.c |  126 +++++++++++++++++++++++++++++++++++++++---------
>>  1 files changed, 103 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
>> index c8cd75c..1b2895e 100644
>> --- a/arch/x86/pci/amd_bus.c
>> +++ b/arch/x86/pci/amd_bus.c
>> @@ -13,11 +13,30 @@
>>
>>  #define AMD_NB_F0_NODE_ID                    0x60
>>  #define AMD_NB_F0_UNIT_ID                    0x64
>> +#define AMD_NB_F1_MMIO_BASE_REG                      0x80
>> +#define AMD_NB_F1_MMIO_LIMIT_REG             0x84
>> +#define AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG     0x180
>> +#define AMD_NB_F1_IO_BASE_ADDR_REG           0xc0
>> +#define AMD_NB_F1_IO_LIMIT_ADDR_REG          0xc4
>>  #define AMD_NB_F1_CONFIG_MAP_REG             0xe0
>>
>>  #define RANGE_NUM                            16
>> +#define AMD_NB_F1_MMIO_RANGES                        16
>> +#define AMD_NB_F1_IOPORT_RANGES                      4
>>  #define AMD_NB_F1_CONFIG_MAP_RANGES          4
>>
>> +#define AMD_PCIE_CF8(bus, dev, fn, reg) \
>> +                     (0x80000000 | \
>> +                     ((reg & 0xF00) << 16) | \
>> +                     ((bus & 0xF) << 16) | \
>> +                     ((dev & 0x1F) << 11) | \
>> +                     ((fn & 0x7) << 8) | \
>> +                     ((reg & 0xFC)))
>> +
>> +static bool amd_early_ecs_enabled;
>> +
>> +static int __init pci_io_ecs_init(u8 bus, u8 slot);
>
> Please move the function above its caller instead of adding a forward
> declaration.
>

Patch 3/5 cleans this up

>> +
>>  /*
>>   * This discovers the pcibus <-> node mapping on AMD K8, Family 10h, 12h,
>>   * 14h, 15h, and 16h processor based systems.  This also gets peer root bus
>> @@ -39,6 +58,20 @@ static struct amd_hostbridge hb_probes[] __initdata = {
>>       { 0xff, 0   , PCI_DEVICE_ID_AMD_10H_NB_HT },
>>  };
>>
>> +/* This version of read_pci_config allows reading registers in the ECS area */
>> +static inline int _amd_read_pci_config(u8 bus, u8 slot, u8 fn, u32 offset)
>> +{
>> +     u32 value;
>> +
>> +     if ((!amd_early_ecs_enabled) && (offset > 0xFF))
>> +             return -1;
>> +
>> +     outl(AMD_PCIE_CF8(bus, slot, fn, offset), 0xcf8);
>> +     value = inl(0xcfc);
>> +
>> +     return value;
>> +
>>  static struct pci_root_info __init *find_pci_root_info(int node, int link)
>>  {
>>       struct pci_root_info *info;
>> @@ -48,6 +81,9 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
>>               if (info->node == node && info->link == link)
>>                       return info;
>>
>> +     pr_warn("AMD-Bus: WARNING: Failed to find root info for node %#x, link %#x\n",
>
> Prefixes like "AMD-Bus" are done using pr_fmt.
>

Ok, I incorporated pr_fmt changes into patch 3/5

>> +             node, link);
>> +
>>       return NULL;
>>  }
>>
>> @@ -118,6 +154,12 @@ static int __init early_fill_mp_bus_info(void)
>>
>>       pr_debug("Found AMD hostbridge at %x:%x.0\n", bus, slot);
>>
>> +     /* We enable ECS mode prior to probing MMIO as MMIO related registers
>> +      * are in the ECS area.
>> +      */
>> +     pci_io_ecs_init(bus, slot);
>
> Well, if enabling the ECS failed somehow, you probably want to save
> yourself the _amd_read_pci_config() calls further down.
>
> Which means:
>
> * you should propagate an error code from that pci_io_ecs_init() function
>
> * you can carve out the huge loop iterating over AMD_NB_F1_MMIO_RANGES
> along with the pci ECS enablement call to pci_io_ecs_init into a
> separate function and thus slim down that huge early_fill_mp_bus_info()
> monster a bit.
>

Yes but this was not trivial since most, if not all, usages ignore any
return value.

I'm wanting to focus on just fixing the issue for the most part and
making as few changes as possible.  This is primarly due to:
  o  The lack of engagement by the AMD engineers (Suravee posted these
changes but has since been unresponsive),
  o  As Bjorn just mentioned, we would like to get rid of this entire
file (as we did in a similar manner with intel_bus) as it is just an
"endless treadmill" that needs attention with every new HW release,
  o  I want my name to be minimally related to this  ;)

>> +
>> +     found = false;
>>       for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
>>               int min_bus;
>>               int max_bus;
>> @@ -128,6 +170,7 @@ static int __init early_fill_mp_bus_info(void)
>>               if ((reg & 7) != 3)
>>                       continue;
>>
>> +             found = true;
>>               min_bus = (reg >> 16) & 0xff;
>>               max_bus = (reg >> 24) & 0xff;
>>               node = (reg >> 4) & 0x07;
>> @@ -136,6 +179,13 @@ static int __init early_fill_mp_bus_info(void)
>>               info = alloc_pci_root_info(min_bus, max_bus, node, link);
>>       }
>>
>> +     if (!found) {
>> +             /* In case there is no AMDNB_F1_CONFIG_MAP_REGs,
>> +              * we just use default to bus 0, node 0 link 0)
>> +              */
>> +             info = alloc_pci_root_info(0, 0, 0, 0);
>> +     }
>
> No need for curly braces around a single statement - simply put the
> comment before the if (...).
>

Done

>> +
>>       /* get the default node and link for left over res */
>>       reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
>>       def_node = (reg >> 8) & 0x07;
>> @@ -144,14 +194,17 @@ static int __init early_fill_mp_bus_info(void)
>>
>>       memset(range, 0, sizeof(range));
>>       add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
>> +
>>       /* io port resource */
>> -     for (i = 0; i < 4; i++) {
>> -             reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
>> +     for (i = 0; i < AMD_NB_F1_IOPORT_RANGES; i++) {
>> +             reg = read_pci_config(bus, slot, 1,
>> +                             AMD_NB_F1_IO_BASE_ADDR_REG + (i << 3));
>>               if (!(reg & 3))
>>                       continue;
>>
>>               start = reg & 0xfff000;
>> -             reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3));
>> +             reg = read_pci_config(bus, slot, 1,
>> +                             AMD_NB_F1_IO_LIMIT_ADDR_REG + (i << 3));
>>               node = reg & 0x07;
>>               link = (reg >> 4) & 0x03;
>>               end = (reg & 0xfff000) | 0xfff;
>> @@ -169,6 +222,7 @@ static int __init early_fill_mp_bus_info(void)
>>               update_res(info, start, end, IORESOURCE_IO, 1);
>>               subtract_range(range, RANGE_NUM, start, end + 1);
>>       }
>> +
>>       /* add left over io port range to def node/link, [0, 0xffff] */
>>       /* find the position */
>>       info = find_pci_root_info(def_node, def_link);
>> @@ -211,22 +265,46 @@ static int __init early_fill_mp_bus_info(void)
>>       }
>>
>>       /* mmio resource */
>> -     for (i = 0; i < 8; i++) {
>> -             reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3));
>> +     for (i = 0; i < AMD_NB_F1_MMIO_RANGES; i++) {
>> +             u64 tmp;
>> +             u32 base = AMD_NB_F1_MMIO_BASE_REG + (i << 3);
>> +             u32 limit = AMD_NB_F1_MMIO_LIMIT_REG + (i << 3);
>> +             u32 base_limit_hi = AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG + (i << 2);
>> +
>> +             if (i >= 12) {
>> +                     /* Range 12 base/limit starts at 0x2A0 */
>> +                     base += 0x1c0;
>> +                     limit += 0x1c0;
>> +                     /* Range 12 base/limit hi starts at 0x2C0 */
>> +                     base_limit_hi += 0x110;
>> +             } else if (i >= 8) {
>> +                     /* Range 8 base/limit starts at 0x1A0 */
>> +                     base += 0xe0;
>> +                     limit += 0xe0;
>> +                     /* Range 12 base/limit hi starts at 0x1C0 */
>> +                     base_limit_hi += 0x20;
>> +             }
>> +
>> +             /* Base lo */
>> +             reg = _amd_read_pci_config(bus, slot, 1, base);
>>               if (!(reg & 3))
>>                       continue;
>>
>> -             start = reg & 0xffffff00; /* 39:16 on 31:8*/
>> -             start <<= 8;
>> -             reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
>> +             start = (reg & 0xffffff00UL) << 8;      /* 39:16 on 31:8 */
>> +
>> +             /* Limit lo */
>> +             reg = _amd_read_pci_config(bus, slot, 1, limit);
>>               node = reg & 0x07;
>>               link = (reg >> 4) & 0x03;
>> -             end = (reg & 0xffffff00);
>> -             end <<= 8;
>> -             end |= 0xffff;
>> +             end = (reg & 0xffffff00UL) << 8;        /* 39:16 on 31:8 */
>> +             end |= 0xffffUL;
>>
>> -             info = find_pci_root_info(node, link);
>> +             /* Base/Limit hi */
>> +             tmp = _amd_read_pci_config(bus, slot, 1, base_limit_hi);
>> +             start |= ((tmp & 0xffUL) << 40);
>> +             end |= ((tmp & (0xffUL << 16)) << 24);
>>
>> +             info = find_pci_root_info(node, link);
>>               if (!info)
>>                       continue;
>>
>> @@ -354,20 +432,24 @@ static struct notifier_block amd_cpu_notifier = {
>>       .notifier_call  = amd_cpu_notify,
>>  };
>>
>> -static void __init pci_enable_pci_io_ecs(void)
>> +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
>>  {
>>  #ifdef CONFIG_AMD_NB
>>       unsigned int i, n;
>> +     u8 limit;
>>
>>       for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
>> -             u8 bus = amd_nb_bus_dev_ranges[i].bus;
>> -             u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
>> -             u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
>> +             /* Try matching for the bus range */
>> +             if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
>> +                 (slot != amd_nb_bus_dev_ranges[i].dev_base))
>> +                     continue;
>> +
>> +             limit = amd_nb_bus_dev_ranges[i].dev_limit;
>>
>> +             /* Setup all northbridges within the range */
>>               for (; slot < limit; ++slot) {
>>                       u32 val = read_pci_config(bus, slot, 3, 0);
>> -
>> -                     if (!early_is_amd_nb(val))
>> +                     if (!val)
>>                               continue;
>>
>>                       val = read_pci_config(bus, slot, 3, 0x8c);
>> @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
>>                               val |= ENABLE_CF8_EXT_CFG >> 32;
>
> What a fun shifting!
>
> Maybe you should do
>
> #define ENABLE_CF8_EXT_CFG      BIT(46 - 32)
>
> to show exactly what you mean and how the bit is defined in MSR NB_CFG1
> and also show how the high 32-bits are mapped into F3x8c, while at it.
>
> And then you can drop the shifting at the call site.
>
>

Yes, this confused me at first also so I added an additional ENABLE_CF8_EXT_CFG

>>                               write_pci_config(bus, slot, 3, 0x8c, val);
>>                       }
>> +                     amd_early_ecs_enabled = true;
>
> You probably should check whether the PCI config write stuck before
> setting this.
>
>>                       ++n;
>>               }
>>       }
>>  #endif
>>  }
>>
>> -static int __init pci_io_ecs_init(void)
>> +static int __init pci_io_ecs_init(u8 bus, u8 slot)
>>  {
>>       int cpu;
>>
>> @@ -389,9 +472,7 @@ static int __init pci_io_ecs_init(void)
>>          if (boot_cpu_data.x86 < 0x10)
>>               return 0;
>>
>> -     /* Try the PCI method first. */
>> -     if (early_pci_allowed())
>> -             pci_enable_pci_io_ecs();
>
> Where did the if-check go?
>

Good catch.  I assume Suravee didn't drop this on purpose?

>> +     pci_enable_pci_io_ecs(bus, slot);
>>
>>       cpu_notifier_register_begin();
>>       for_each_online_cpu(cpu)
>> @@ -411,7 +492,6 @@ static int __init amd_postcore_init(void)
>>               return 0;
>>
>>       early_fill_mp_bus_info();
>> -     pci_io_ecs_init();
>>
>>       return 0;
>>  }
>>
>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov April 28, 2014, 9:40 p.m. UTC | #7
On Mon, Apr 28, 2014 at 02:50:29PM -0600, Bjorn Helgaas wrote:
> This I/O ECS thing seems likely to cause future problems.  My
> understanding (based on sec 2.8 of [1]) is that enable_pci_io_ecs()
> and pci_enable_pci_io_ecs() are there to enable access to extended
> config space (offsets 256-4095) via the 0xcf8/0xcfc I/O ports.
> 
> Per sec 4.1.1 of [2], we should be using ECAM (the memory-mapped
> enhanced configuration mechanism, i.e., MMCONFIG) to access extended
> config space, and the BIOS should supply an MCFG table.
> 
> So why do we need to enable I/O access to ECS on AMD chips at all?  Is
> this a workaround for a broken BIOS that doesn't supply an MCFG table?

That's a good question. 831d991821dae doesn't say but I have a hunch
Andreas and/or Robert should know. I seem to vaguely remember it
might've been because of a missing MCFG but have flushed it out of the
cache long time ago.

Let's ask them.

Andreas, Robert, guys, do you remember why we did the PCI IO ECS access?
B0rked BIOSes?

Thanks.
Suravee Suthikulpanit April 29, 2014, 2:47 a.m. UTC | #8
On 4/28/2014 4:19 PM, Myron Stowe wrote:
> On Sat, Apr 19, 2014 at 7:52 AM, Borislav Petkov <bp@alien8.de> wrote:
>> On Fri, Apr 18, 2014 at 08:53:23PM -0600, Myron Stowe wrote:
>>> From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

.......

>>> @@ -118,6 +154,12 @@ static int __init early_fill_mp_bus_info(void)
>>>
>>>        pr_debug("Found AMD hostbridge at %x:%x.0\n", bus, slot);
>>>
>>> +     /* We enable ECS mode prior to probing MMIO as MMIO related registers
>>> +      * are in the ECS area.
>>> +      */
>>> +     pci_io_ecs_init(bus, slot);
>>
>> Well, if enabling the ECS failed somehow, you probably want to save
>> yourself the _amd_read_pci_config() calls further down.
>>
>> Which means:
>>
>> * you should propagate an error code from that pci_io_ecs_init() function
>>
>> * you can carve out the huge loop iterating over AMD_NB_F1_MMIO_RANGES
>> along with the pci ECS enablement call to pci_io_ecs_init into a
>> separate function and thus slim down that huge early_fill_mp_bus_info()
>> monster a bit.
>>
>
> Yes but this was not trivial since most, if not all, usages ignore any
> return value.
>
> I'm wanting to focus on just fixing the issue for the most part and
> making as few changes as possible.  This is primarly due to:
>    o  The lack of engagement by the AMD engineers (Suravee posted these
> changes but has since been unresponsive),
>    o  As Bjorn just mentioned, we would like to get rid of this entire
> file (as we did in a similar manner with intel_bus) as it is just an
> "endless treadmill" that needs attention with every new HW release,
>    o  I want my name to be minimally related to this  ;)
>
[Suravee] Sorry for late reply and did not follow up on this patch in a 
timely manner.  Also, thank you Myron for helping to follow up with the 
patch set.

Regarding the error code from pci_io_ecs_init(), it is currently always 
return 0 (success).  I am not sure what error code should I 
check/propagate.  Let me know if I am missing something here.

......

>>> -static int __init pci_io_ecs_init(void)
>>> +static int __init pci_io_ecs_init(u8 bus, u8 slot)
>>>   {
>>>        int cpu;
>>>
>>> @@ -389,9 +472,7 @@ static int __init pci_io_ecs_init(void)
>>>           if (boot_cpu_data.x86 < 0x10)
>>>                return 0;
>>>
>>> -     /* Try the PCI method first. */
>>> -     if (early_pci_allowed())
>>> -             pci_enable_pci_io_ecs();
>>
>> Where did the if-check go?
>>
>
> Good catch.  I assume Suravee didn't drop this on purpose?

[SURAVEE] I dropped the "early_pci_allowed()" here since I have moved 
the calling of "pci_io_ecs_init()" into the "early_fill_mp_bus_info()" 
since I would like to enable PCI ecs access at earlier stage. The 
"early_fill_mp_bus_info() has already check for "early_pci_allowed()" 
already at the beginning of the function. So, this should not be needed 
here.

Suravee


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suravee Suthikulpanit April 29, 2014, 3:04 a.m. UTC | #9
On 4/25/2014 5:24 PM, Myron Stowe wrote:
> On Sun, Apr 20, 2014 at 1:59 AM, Borislav Petkov <bp@suse.de> wrote:
>> Drop Andreas' old email address from CC as it keeps bouncing.
>>
>> On Sat, Apr 19, 2014 at 03:52:20PM +0200, Borislav Petkov wrote:
>>>> -static void __init pci_enable_pci_io_ecs(void)
>>>> +static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
>>>>   {
>>>>   #ifdef CONFIG_AMD_NB
>>>>      unsigned int i, n;
>>>> +   u8 limit;
>>>>
>>>>      for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
>>>> -           u8 bus = amd_nb_bus_dev_ranges[i].bus;
>>>> -           u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
>>>> -           u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
>>>> +           /* Try matching for the bus range */
>>>> +           if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
>>>> +               (slot != amd_nb_bus_dev_ranges[i].dev_base))
>>>> +                   continue;
>>>> +
>>>> +           limit = amd_nb_bus_dev_ranges[i].dev_limit;
>>>>
>>>> +           /* Setup all northbridges within the range */
>>>>              for (; slot < limit; ++slot) {
>>>>                      u32 val = read_pci_config(bus, slot, 3, 0);
>>>> -
>>>> -                   if (!early_is_amd_nb(val))
>>>> +                   if (!val)
>>>>                              continue;
>>>>
>>>>                      val = read_pci_config(bus, slot, 3, 0x8c);
>>>> @@ -375,13 +457,14 @@ static void __init pci_enable_pci_io_ecs(void)
>>>>                              val |= ENABLE_CF8_EXT_CFG >> 32;
>>>
>>> What a fun shifting!
>>>
>>> Maybe you should do
>>>
>>> #define ENABLE_CF8_EXT_CFG    BIT(46 - 32)
>>>
>>> to show exactly what you mean and how the bit is defined in MSR NB_CFG1
>>> and also show how the high 32-bits are mapped into F3x8c, while at it.
>>>
>>> And then you can drop the shifting at the call site.
>>
>> Ok, I see another fun with this ECS enabling:
>>
>> There's a enable_pci_io_ecs() which enables ECS through the NB_CFG MSR
>> which is called as part of the notifier *and* there's a PCI write to
>> that same bit in pci_enable_pci_io_ecs() which iterates over all NBs.
>>
>> So, AFAICT, we do it twice and the second time is not needed. Which
>> means, you probably can drop pci_enable_pci_io_ecs() completely and use
>> solely the notifier?
>
> It does look as if there is some duplication with respect to setting
> MSR_AMD64_NB_CFG's (which is aliased at D18F3x8c [1])
> ENABLE_CF8_EXT_CFG enable bit but there are at least a couple of
> differences.
>
> enable_pci_io_ecs() only sets the bit on one NB whereas
> pci_enable_pci_io_ecs iterates over all the NBs (as you mentioned
> above).  The other difference has something to do with Xen; see the
> origin of pci_enable_pci_io_ecs - commit 24d9b70b8.
>
>>
>> Yes, no?
>
> Suravee, Kim - either of you want to chime in here?

I believe the notifier is mainly for the cores which are initially 
offline, and then brought up online afterward.

Suravee

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Beulich April 29, 2014, 7:06 a.m. UTC | #10
>>> On 28.04.14 at 22:50, <bhelgaas@google.com> wrote:
>>> > There's a enable_pci_io_ecs() which enables ECS through the NB_CFG MSR
>>> > which is called as part of the notifier *and* there's a PCI write to
>>> > that same bit in pci_enable_pci_io_ecs() which iterates over all NBs.
>>> >
>>> > So, AFAICT, we do it twice and the second time is not needed. Which
>>> > means, you probably can drop pci_enable_pci_io_ecs() completely and use
>>> > solely the notifier?
>>>
>>> It does look as if there is some duplication with respect to setting
>>> MSR_AMD64_NB_CFG's (which is aliased at D18F3x8c [1])
>>> ENABLE_CF8_EXT_CFG enable bit but there are at least a couple of
>>> differences.
>>>
>>> enable_pci_io_ecs() only sets the bit on one NB whereas
>>> pci_enable_pci_io_ecs iterates over all the NBs (as you mentioned
>>> above).  The other difference has something to do with Xen; see the
>>> origin of pci_enable_pci_io_ecs - commit 24d9b70b8.

As stated there - Xen Dom0 is running on virtual CPUs, i.e. which
physical CPU the code gets executed on is unknown, and hence the
MSR based approach just does not reliably work there.

Of course, with (since then) grown needs to access extended config
space in Xen itself, this enabling could be moved into the hypervisor.
Back then the hypervisor didn't have much need for this, so it seemed
more reasonable (albeit I'm rather certain Borislav is going to disagree
with this) to augment the Dom0 side code.

> This is probably obvious, but my interest here is to (1) make sure all
> systems in the field run well (so we need quirks to work around BIOS
> and other issues), and (2) eliminate the need for kernel changes to
> support future systems.  So far we seem to be concentrating on (1) and
> neglecting (2), which means we're always reacting to things that are
> broken.
> 
> This I/O ECS thing seems likely to cause future problems.  My
> understanding (based on sec 2.8 of [1]) is that enable_pci_io_ecs()
> and pci_enable_pci_io_ecs() are there to enable access to extended
> config space (offsets 256-4095) via the 0xcf8/0xcfc I/O ports.
> 
> Per sec 4.1.1 of [2], we should be using ECAM (the memory-mapped
> enhanced configuration mechanism, i.e., MMCONFIG) to access extended
> config space, and the BIOS should supply an MCFG table.
> 
> So why do we need to enable I/O access to ECS on AMD chips at all?  Is
> this a workaround for a broken BIOS that doesn't supply an MCFG table?

Yes. Rather than leaving such systems entirely without access to
extended config space, it seemed (and still seems to me) better to
at least have this fallback in place. If after this discussion you decide
to drop this code here, it would be nice if you could let us know up
front, so we can decide whether to add the code to the hypervisor
instead or - just like on Intel systems - rely on MCFG being properly
exposed by the firmware.

Jan

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Herrmann April 29, 2014, 7:33 a.m. UTC | #11
On Mon, Apr 28, 2014 at 11:40:36PM +0200, Borislav Petkov wrote:
> On Mon, Apr 28, 2014 at 02:50:29PM -0600, Bjorn Helgaas wrote:
> > This I/O ECS thing seems likely to cause future problems.  My
> > understanding (based on sec 2.8 of [1]) is that enable_pci_io_ecs()
> > and pci_enable_pci_io_ecs() are there to enable access to extended
> > config space (offsets 256-4095) via the 0xcf8/0xcfc I/O ports.
> > 
> > Per sec 4.1.1 of [2], we should be using ECAM (the memory-mapped
> > enhanced configuration mechanism, i.e., MMCONFIG) to access extended
> > config space, and the BIOS should supply an MCFG table.
> > 
> > So why do we need to enable I/O access to ECS on AMD chips at all?  Is
> > this a workaround for a broken BIOS that doesn't supply an MCFG table?
> 
> That's a good question. 831d991821dae doesn't say but I have a hunch
> Andreas and/or Robert should know. I seem to vaguely remember it
> might've been because of a missing MCFG but have flushed it out of the
> cache long time ago.
> 
> Let's ask them.
> 
> Andreas, Robert, guys, do you remember why we did the PCI IO ECS access?
> B0rked BIOSes?


I am sure, it's because some server systems had MMIO ECS access not
enabled in BIOS. I can't remember which systems were affected.


Andreas
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov April 29, 2014, 10:20 a.m. UTC | #12
On Tue, Apr 29, 2014 at 09:33:09AM +0200, Andreas Herrmann wrote:
> I am sure, it's because some server systems had MMIO ECS access not
> enabled in BIOS. I can't remember which systems were affected.

Ok, now AMD people: what's the story with IO ECS, can we assume that on
everything after F10h, BIOS has a sensible MCFG and we can limit this to
F10h only? I like Bjorn's idea but we need to make sure a working MCFG
is ubiquitous.

Which begs the real question: Suravee, why are you even touching IO ECS
provided F15h and later have a MCFG? Or, do they?
Robert Richter April 29, 2014, 11:19 a.m. UTC | #13
On 29.04.14 09:33:09, Andreas Herrmann wrote:
> On Mon, Apr 28, 2014 at 11:40:36PM +0200, Borislav Petkov wrote:
> > On Mon, Apr 28, 2014 at 02:50:29PM -0600, Bjorn Helgaas wrote:
> > > This I/O ECS thing seems likely to cause future problems.  My
> > > understanding (based on sec 2.8 of [1]) is that enable_pci_io_ecs()
> > > and pci_enable_pci_io_ecs() are there to enable access to extended
> > > config space (offsets 256-4095) via the 0xcf8/0xcfc I/O ports.
> > > 
> > > Per sec 4.1.1 of [2], we should be using ECAM (the memory-mapped
> > > enhanced configuration mechanism, i.e., MMCONFIG) to access extended
> > > config space, and the BIOS should supply an MCFG table.

This is due to the non-atomic access to 0xcf8/0xcfc.

IIRC, another problem with io cfg space access is that it is bound to
register rax or so. The kernel implementation may not change here.

Otherall mmconfig access is much better implemented and thus
recommended to use by the os, while io cfg is used mainly by the bios.

> > > So why do we need to enable I/O access to ECS on AMD chips at all?  Is
> > > this a workaround for a broken BIOS that doesn't supply an MCFG table?
> > 
> > That's a good question. 831d991821dae doesn't say but I have a hunch
> > Andreas and/or Robert should know. I seem to vaguely remember it
> > might've been because of a missing MCFG but have flushed it out of the
> > cache long time ago.
> > 
> > Let's ask them.
> > 
> > Andreas, Robert, guys, do you remember why we did the PCI IO ECS access?
> > B0rked BIOSes?
> 
> 
> I am sure, it's because some server systems had MMIO ECS access not
> enabled in BIOS. I can't remember which systems were affected.

Yes, mmio ecs was not enabled on some systems by the bios. We needed
to use cf8/cfc io access.

If MMCONFIG is available it is the default, cf8/cfc is only used
otherwise. Also, some kernel configs may disable MMCONFIG.

PCI ECS was especially needed to enable the IBS interrupt for hw
profiling.

-Robert
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steffen Persvold April 29, 2014, 1:07 p.m. UTC | #14
On 29 Apr 2014, at 3:20 , Borislav Petkov <bp@suse.de> wrote:

> On Tue, Apr 29, 2014 at 09:33:09AM +0200, Andreas Herrmann wrote:
>> I am sure, it's because some server systems had MMIO ECS access not
>> enabled in BIOS. I can't remember which systems were affected.
> 
> Ok, now AMD people: what's the story with IO ECS, can we assume that on
> everything after F10h, BIOS has a sensible MCFG and we can limit this to
> F10h only? I like Bjorn's idea but we need to make sure a working MCFG
> is ubiquitous.
> 
> Which begs the real question: Suravee, why are you even touching IO ECS
> provided F15h and later have a MCFG? Or, do they?
> 

Our experience with this is that Fam10h and later have a very well working MCFG setup, earlier generations not so much (hence IO ECS was needed).

Cheers,
Steffen

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suravee Suthikulpanit April 29, 2014, 3:16 p.m. UTC | #15
On 4/29/2014 5:20 AM, Borislav Petkov wrote:
> On Tue, Apr 29, 2014 at 09:33:09AM +0200, Andreas Herrmann wrote:
>> I am sure, it's because some server systems had MMIO ECS access not
>> enabled in BIOS. I can't remember which systems were affected.
>
If you are referring to accessing PCI ECS ranges via 0xCF8, then yes, 
BIOS disable this as described below in the BKDG.

"The BIOS may use either configuration space access mechanism during 
boot. Before booting the OS, BIOS must disable IO access to ECS, enable 
MMIO configuration and build an ACPI defined MCFG table. BIOS ACPI code 
must use MMIO to access configuration space."

> Ok, now AMD people: what's the story with IO ECS, can we assume that on
> everything after F10h, BIOS has a sensible MCFG and we can limit this to
> F10h only? I like Bjorn's idea but we need to make sure a working MCFG
> is ubiquitous.
>
> Which begs the real question: Suravee, why are you even touching IO ECS
> provided F15h and later have a MCFG? Or, do they?
>

As I was trying to generalize the logic inside amd_bus.c, which seems to 
be used mainly as a fallback mechanism, I tried to maintain the existing 
code, which does many things:
     1. Setup numa_node information (if PXM doesn't exist)
     2. Probe NB for MMIO resources (if MCFG doesn't exist)
     3. Probe NB for IO resources
     4. Setup IO ECS

In the new code, the IO ECS was needed to retrieve the 
AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG (offset 0x180) during the early 
initialization as part of (2) logic. However, this register exists only 
on the newer systems.  However, as you mentioned, for (2) we can assume 
that the MCFG exists for most of the systems (family10h and later), and 
should be used instead.

The main purpose of this patch set is mainly to deal with the the node 
information (1).  So, we might need to split these all up and handle 
them separately as needed where (2) and (3) will be used as fallback for 
older systems where MCFG does not exist. I am not sure if where we need (4).

Suravee

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Richter April 29, 2014, 5:17 p.m. UTC | #16
In addition to Boris I have the following:

On 18.04.14 20:53:23, Myron Stowe wrote:
> +#define AMD_PCIE_CF8(bus, dev, fn, reg) \
> +			(0x80000000 | \
> +			((reg & 0xF00) << 16) | \
> +			((bus & 0xF) << 16) | \
> +			((dev & 0x1F) << 11) | \
> +			((fn & 0x7) << 8) | \
> +			((reg & 0xFC)))

Make this a static function.

> +/* This version of read_pci_config allows reading registers in the ECS area */
> +static inline int _amd_read_pci_config(u8 bus, u8 slot, u8 fn, u32 offset)

No need to inline this, no need for a single tailing underscore.

> +{
> +	u32 value;
> +
> +	if ((!amd_early_ecs_enabled) && (offset > 0xFF))
> +		return -1;
> +
> +	outl(AMD_PCIE_CF8(bus, slot, fn, offset), 0xcf8);
> +	value = inl(0xcfc);
> +
> +	return value;

This design is broken. The return type does not match the functions
return value and you can't do error checking as -1 can be a value too.

Why reinventing this new instead of trying to reuse functions in
arch/x86/pci/direct.c/early.c or so?

Would an access via mmconfig space at this early stage possible? If so
this code could be skipped as we wouln't need it for fam10h (no early
access needed) nor fam15h or newer (mmconfig available).

> +}
> +
>  static struct pci_root_info __init *find_pci_root_info(int node, int link)
>  {
>  	struct pci_root_info *info;
> @@ -48,6 +81,9 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
>  		if (info->node == node && info->link == link)
>  			return info;
>  
> +	pr_warn("AMD-Bus: WARNING: Failed to find root info for node %#x, link %#x\n",
> +		node, link);

As there might be (virtual) systems where this case is valid, I
wouldn't print anything here. We should only print things that exist
or error/warnings.

-Robert

> +
>  	return NULL;
>  }
>  
> @@ -118,6 +154,12 @@ static int __init early_fill_mp_bus_info(void)
>  
>  	pr_debug("Found AMD hostbridge at %x:%x.0\n", bus, slot);
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov April 29, 2014, 7:14 p.m. UTC | #17
On Tue, Apr 29, 2014 at 10:16:57AM -0500, Suravee Suthikulanit wrote:
> In the new code, the IO ECS was needed to retrieve the
> AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG (offset 0x180) during the early
> initialization as part of (2) logic. However, this register exists only on
> the newer systems.  However, as you mentioned, for (2) we can assume that
> the MCFG exists for most of the systems (family10h and later), and should be
> used instead.
> 
> The main purpose of this patch set is mainly to deal with the the node
> information (1).  So, we might need to split these all up and handle them
> separately as needed where (2) and (3) will be used as fallback for older
> systems where MCFG does not exist.

So sounds to me like we want to get rid of the whole IO ECS deal
altogether then.

Now, I'm wondering whether we should kill it completely since I don't
think anyone cares about numa node info being correct on K8, or? I'm
specifically turning to our numascale friends who love to have a lot of
nodes. :-)

Daniel, Steffen?
Myron Stowe April 29, 2014, 9:40 p.m. UTC | #18
On Tue, Apr 29, 2014 at 1:14 PM, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Apr 29, 2014 at 10:16:57AM -0500, Suravee Suthikulanit wrote:
>> In the new code, the IO ECS was needed to retrieve the
>> AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG (offset 0x180) during the early
>> initialization as part of (2) logic. However, this register exists only on
>> the newer systems.  However, as you mentioned, for (2) we can assume that
>> the MCFG exists for most of the systems (family10h and later), and should be
>> used instead.
>>
>> The main purpose of this patch set is mainly to deal with the the node
>> information (1).  So, we might need to split these all up and handle them
>> separately as needed where (2) and (3) will be used as fallback for older
>> systems where MCFG does not exist.
>
> So sounds to me like we want to get rid of the whole IO ECS deal
> altogether then.
>
> Now, I'm wondering whether we should kill it completely since I don't
> think anyone cares about numa node info being correct on K8, or? I'm
> specifically turning to our numascale friends who love to have a lot of
> nodes. :-)

I think we need to be careful here as there are two unrelated topics
being discussed together.  What started this whole thread was the need
for sysfs related numa_node information with respect to PCI devices
(1).  Without patch 1, platforms with newer AMD CPUs end up having
'-1' numa_node values for all PCI devices.

IO ECS has no bearing on patch 1, it only comes into play with patch 2
which is concerned with MMIO resource information when MCFG doesn't
exist.  For the particular issue I'm trying to get resolved, patch 2
is not needed.  However, since we have expended time and effort on
this subject, perhaps we should get this cleaned up while it has our
attention.

I'm all for deleting as much of amd_bus.c as possible due to its
"perennial maintenance headache".  The obvious choices seem to be all,
or some combination, of:
  o removing IO ECS logic,
  o removing IO/MMIO logic (assuming MCFG issues were long enough ago
to no longer be a concern),
  o start deprecating amd_bus.c by adding logic to skip if BIOS >= 2015

Myron

>
> Daniel, Steffen?
>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Richter April 30, 2014, 6:41 a.m. UTC | #19
On 18.04.14 20:53:23, Myron Stowe wrote:
> From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>

> -		start = reg & 0xffffff00; /* 39:16 on 31:8*/
> -		start <<= 8;
> -		reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
> +		start = (reg & 0xffffff00UL) << 8;	/* 39:16 on 31:8 */

I think this is not save anymore with respect to u32/u64 casting. The
original was.

-Robert
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Richter April 30, 2014, 7 a.m. UTC | #20
On 29.04.14 15:40:28, Myron Stowe wrote:
> On Tue, Apr 29, 2014 at 1:14 PM, Borislav Petkov <bp@alien8.de> wrote:
> > So sounds to me like we want to get rid of the whole IO ECS deal
> > altogether then.
> >
> > Now, I'm wondering whether we should kill it completely since I don't
> > think anyone cares about numa node info being correct on K8, or? I'm
> > specifically turning to our numascale friends who love to have a lot of
> > nodes. :-)

Maybe I did get you wrong, but IO ECS was introduced with fam10h and
is not related to k8.

> I think we need to be careful here as there are two unrelated topics
> being discussed together.  What started this whole thread was the need
> for sysfs related numa_node information with respect to PCI devices
> (1).  Without patch 1, platforms with newer AMD CPUs end up having
> '-1' numa_node values for all PCI devices.
> 
> IO ECS has no bearing on patch 1, it only comes into play with patch 2
> which is concerned with MMIO resource information when MCFG doesn't
> exist.  For the particular issue I'm trying to get resolved, patch 2
> is not needed.  However, since we have expended time and effort on
> this subject, perhaps we should get this cleaned up while it has our
> attention.
> 
> I'm all for deleting as much of amd_bus.c as possible due to its
> "perennial maintenance headache".  The obvious choices seem to be all,
> or some combination, of:
>   o removing IO ECS logic,
>   o removing IO/MMIO logic (assuming MCFG issues were long enough ago
> to no longer be a concern),
>   o start deprecating amd_bus.c by adding logic to skip if BIOS >= 2015

I don't see any reason for big changes actually. Just bind the IO ECS
logic to fam10h (either with fam check or pci device depending on the
implementation, xen's flavor would be pci). This is something stricter
than 'if BIOS >= 2015'. It leaves code as it is which is maintainable.

You implement the new logic for for newer families. No need for one
implementation that fits all.

-Robert
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suravee Suthikulpanit April 30, 2014, 7:50 a.m. UTC | #21
On 04/30/2014 02:00 AM, Robert Richter wrote:
> On 29.04.14 15:40:28, Myron Stowe wrote:
>> On Tue, Apr 29, 2014 at 1:14 PM, Borislav Petkov <bp@alien8.de> wrote:
>>> So sounds to me like we want to get rid of the whole IO ECS deal
>>> altogether then.
>>>
>>> Now, I'm wondering whether we should kill it completely since I don't
>>> think anyone cares about numa node info being correct on K8, or? I'm
>>> specifically turning to our numascale friends who love to have a lot of
>>> nodes. :-)
>
> Maybe I did get you wrong, but IO ECS was introduced with fam10h and
> is not related to k8.
>
>> I think we need to be careful here as there are two unrelated topics
>> being discussed together.  What started this whole thread was the need
>> for sysfs related numa_node information with respect to PCI devices
>> (1).  Without patch 1, platforms with newer AMD CPUs end up having
>> '-1' numa_node values for all PCI devices.
>>
>> IO ECS has no bearing on patch 1, it only comes into play with patch 2
>> which is concerned with MMIO resource information when MCFG doesn't
>> exist.  For the particular issue I'm trying to get resolved, patch 2
>> is not needed.  However, since we have expended time and effort on
>> this subject, perhaps we should get this cleaned up while it has our
>> attention.
>>
>> I'm all for deleting as much of amd_bus.c as possible due to its
>> "perennial maintenance headache".  The obvious choices seem to be all,
>> or some combination, of:
>>    o removing IO ECS logic,
>>    o removing IO/MMIO logic (assuming MCFG issues were long enough ago
>> to no longer be a concern),
>>    o start deprecating amd_bus.c by adding logic to skip if BIOS >= 2015
>
> I don't see any reason for big changes actually. Just bind the IO ECS
> logic to fam10h (either with fam check or pci device depending on the
> implementation, xen's flavor would be pci). This is something stricter
> than 'if BIOS >= 2015'. It leaves code as it is which is maintainable.
>
> You implement the new logic for for newer families. No need for one
> implementation that fits all.
>
> -Robert
>
Actually, if ECS is needed by IBS, then wouldn't this still be needed 
for every family since 10h and later (except family11h).

Suravee
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Richter April 30, 2014, 9:51 a.m. UTC | #22
On 30.04.14 02:50:21, Suravee Suthikulpanit wrote:
> Actually, if ECS is needed by IBS, then wouldn't this still be needed for
> every family since 10h and later (except family11h).

Yes, but you have stated that pci mmconfig should work on families >
10h. Thus, ECS would work there out-of-the-box (at least after the
system brought pci up, ibs is initialized after pci setup).

-Robert
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Myron Stowe April 30, 2014, 11:03 p.m. UTC | #23
On Wed, Apr 30, 2014 at 1:00 AM, Robert Richter <rric@kernel.org> wrote:
> On 29.04.14 15:40:28, Myron Stowe wrote:
>> On Tue, Apr 29, 2014 at 1:14 PM, Borislav Petkov <bp@alien8.de> wrote:
>> > So sounds to me like we want to get rid of the whole IO ECS deal
>> > altogether then.
>> >
>> > Now, I'm wondering whether we should kill it completely since I don't
>> > think anyone cares about numa node info being correct on K8, or? I'm
>> > specifically turning to our numascale friends who love to have a lot of
>> > nodes. :-)
>
> Maybe I did get you wrong, but IO ECS was introduced with fam10h and
> is not related to k8.
>
>> I think we need to be careful here as there are two unrelated topics
>> being discussed together.  What started this whole thread was the need
>> for sysfs related numa_node information with respect to PCI devices
>> (1).  Without patch 1, platforms with newer AMD CPUs end up having
>> '-1' numa_node values for all PCI devices.
>>
>> IO ECS has no bearing on patch 1, it only comes into play with patch 2
>> which is concerned with MMIO resource information when MCFG doesn't
>> exist.  For the particular issue I'm trying to get resolved, patch 2
>> is not needed.  However, since we have expended time and effort on
>> this subject, perhaps we should get this cleaned up while it has our
>> attention.
>>
>> I'm all for deleting as much of amd_bus.c as possible due to its
>> "perennial maintenance headache".  The obvious choices seem to be all,
>> or some combination, of:
>>   o removing IO ECS logic,
>>   o removing IO/MMIO logic (assuming MCFG issues were long enough ago
>> to no longer be a concern),
>>   o start deprecating amd_bus.c by adding logic to skip if BIOS >= 2015
>
> I don't see any reason for big changes actually. Just bind the IO ECS
> logic to fam10h (either with fam check or pci device depending on the
> implementation, xen's flavor would be pci). This is something stricter
> than 'if BIOS >= 2015'. It leaves code as it is which is maintainable.
>
> You implement the new logic for for newer families. No need for one
> implementation that fits all.

I wasn't explicit enough with respect to "deleting as much of
amd_bus.c as possible ..." so I'll try again.

Earlier in this thread - https://lkml.org/lkml/2014/4/28/524 - Bjorn
expressed the desire to "eliminate the need for kernel changes to
support future systems.  So far we seem to be concentrating on (1) and
neglecting (2), which means we're always reacting to things that are
broken.  ...  I think we should try to get rid of amd_bus.c ...".

Then, again in this thread - https://lkml.org/lkml/2014/4/29/360 -
Suravee noted: "... the existing code, which does many things:
  1. Setup numa_node information (if PXM doesn't exist)
  2. Probe NB for MMIO resources (if MCFG doesn't exist)
  3. Probe NB for IO resources
  4. Setup IO ECS

So let's walk through these.  (1) was put in place to "snoop out, from
the HW" numa_node information.  It is "snooped" and cached.  Then,
later in booting, if the platform does not supply an ACPI _PXM method
corresponding to the hostbridge *and* we are on a AMD based platform,
the "snooped" numa_node information is retrieved and used.  There are
two issues with this approach.  First, "The node numbers used by Linux
are logical and there's no reason they need to be identical to
settings in the CPU registers.  So if we got some node information in
the normal way (from _PXM, SLIT, SRAT, etc.) and some from your patch,
there's no reason to believe they would be compatible." [1].  Second,
there is a architectural agnostic way to get this information; the
ACPI _PXM method.  Looking at numerous 'acpidump' captures, the vast
majority of platform BIOS' are not implementing _PXM methods
corresponding to hostbridges - we need to try and correct this and get
away from this current, error prone, fall-back mechanism (again: see
[1]).

(2) and (3) were put in place for similar reasons but with respect to
MCFG - during its early phases, it was either buggy or BIOS' were not
supplying ACPI MCFG tables.  This was long enough ago that I expect we
are well past those issues with new systems today.  MCFG, _CBA, and
_CRS are again architectural agnostic ways to get MMCONFIG and
resource (I/O Port, and MMIO) information.  With respect to (2) and
(3) we were in a similar situation with Intel based systems and for a
brief period of time had 'intel_bus.c'.  We were encountering the same
"perennial maintenance headache" issues with 'intel_bus.c' and finally
with Bjorn's efforts in implementing _CRS as the default for platforms
with BIOS >= 2008 [2] we were able to obviate 'intel_bus.c' completely
- something we should be similarly striving for here with amd_bus.c.

(4) is a little more interesting.  It seems to be related to Xen, non
MMIO based ECS enabled platforms, and IBS.  Xen has indicated that
they can "decide whether to add the code to the hypervisor instead or
- just like on Intel systems - rely on MCFG being properly
exposed by the firmware." [3].  Again, I expect we are past the early
implementations of platforms that don't have MMIO based ECS enabled.
That leaves IBS which I'm completely unfamiliar with [4].

With the possible exception of (4), there should be ACPI based
architectural agnostic ways to get the information being discussed
here.  MCFG, _CBA, and _CRS are mature and provide solutions to (2)
and (3).  There are platforms in the field, the vast majority
actually, that still do not implement _PXM methods corresponding to
hostbridges (1).  Patch 1 of this series provides a fall-back for that
situation for AMD based platforms only; albeit a solution with
problems itself as expressed above.  For (1), the proper solution is
to get platform BIOS' to implement _PXM methods.

As a result, it seems like we should be pursuing an avenue to move us
out of the current "perennial maintenance headache" design that
amd_bus.c presents.  As such, I'm going to start working on an
additional patch to this series that only runs 'amd_postcore_init()'
for BIOS dates < 2015.

[1] https://lkml.org/lkml/2014/3/17/390
[2] Kernel commit 7bc5e3f  "x86/PCI: use host bridge _CRS info by
default on 2008 and newer machines"
[3] https://lkml.org/lkml/2014/4/29/66
[4]  https://lkml.org/lkml/2014/4/30/153 - "ECS would work there
out-of-the-box (at least after the system brought pci up, ibs is
initialized after pci setup)."

Myron


>
> -Robert
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index c8cd75c..1b2895e 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -13,11 +13,30 @@ 
 
 #define AMD_NB_F0_NODE_ID			0x60
 #define AMD_NB_F0_UNIT_ID			0x64
+#define AMD_NB_F1_MMIO_BASE_REG			0x80
+#define AMD_NB_F1_MMIO_LIMIT_REG		0x84
+#define AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG	0x180
+#define AMD_NB_F1_IO_BASE_ADDR_REG		0xc0
+#define AMD_NB_F1_IO_LIMIT_ADDR_REG		0xc4
 #define AMD_NB_F1_CONFIG_MAP_REG		0xe0
 
 #define RANGE_NUM				16
+#define AMD_NB_F1_MMIO_RANGES			16
+#define AMD_NB_F1_IOPORT_RANGES			4
 #define AMD_NB_F1_CONFIG_MAP_RANGES		4
 
+#define AMD_PCIE_CF8(bus, dev, fn, reg) \
+			(0x80000000 | \
+			((reg & 0xF00) << 16) | \
+			((bus & 0xF) << 16) | \
+			((dev & 0x1F) << 11) | \
+			((fn & 0x7) << 8) | \
+			((reg & 0xFC)))
+
+static bool amd_early_ecs_enabled;
+
+static int __init pci_io_ecs_init(u8 bus, u8 slot);
+
 /*
  * This discovers the pcibus <-> node mapping on AMD K8, Family 10h, 12h,
  * 14h, 15h, and 16h processor based systems.  This also gets peer root bus
@@ -39,6 +58,20 @@  static struct amd_hostbridge hb_probes[] __initdata = {
 	{ 0xff, 0   , PCI_DEVICE_ID_AMD_10H_NB_HT },
 };
 
+/* This version of read_pci_config allows reading registers in the ECS area */
+static inline int _amd_read_pci_config(u8 bus, u8 slot, u8 fn, u32 offset)
+{
+	u32 value;
+
+	if ((!amd_early_ecs_enabled) && (offset > 0xFF))
+		return -1;
+
+	outl(AMD_PCIE_CF8(bus, slot, fn, offset), 0xcf8);
+	value = inl(0xcfc);
+
+	return value;
+}
+
 static struct pci_root_info __init *find_pci_root_info(int node, int link)
 {
 	struct pci_root_info *info;
@@ -48,6 +81,9 @@  static struct pci_root_info __init *find_pci_root_info(int node, int link)
 		if (info->node == node && info->link == link)
 			return info;
 
+	pr_warn("AMD-Bus: WARNING: Failed to find root info for node %#x, link %#x\n",
+		node, link);
+
 	return NULL;
 }
 
@@ -118,6 +154,12 @@  static int __init early_fill_mp_bus_info(void)
 
 	pr_debug("Found AMD hostbridge at %x:%x.0\n", bus, slot);
 
+	/* We enable ECS mode prior to probing MMIO as MMIO related registers
+	 * are in the ECS area.
+	 */
+	pci_io_ecs_init(bus, slot);
+
+	found = false;
 	for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
 		int min_bus;
 		int max_bus;
@@ -128,6 +170,7 @@  static int __init early_fill_mp_bus_info(void)
 		if ((reg & 7) != 3)
 			continue;
 
+		found = true;
 		min_bus = (reg >> 16) & 0xff;
 		max_bus = (reg >> 24) & 0xff;
 		node = (reg >> 4) & 0x07;
@@ -136,6 +179,13 @@  static int __init early_fill_mp_bus_info(void)
 		info = alloc_pci_root_info(min_bus, max_bus, node, link);
 	}
 
+	if (!found) {
+		/* In case there is no AMDNB_F1_CONFIG_MAP_REGs,
+		 * we just use default to bus 0, node 0 link 0)
+		 */
+		info = alloc_pci_root_info(0, 0, 0, 0);
+	}
+
 	/* get the default node and link for left over res */
 	reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
 	def_node = (reg >> 8) & 0x07;
@@ -144,14 +194,17 @@  static int __init early_fill_mp_bus_info(void)
 
 	memset(range, 0, sizeof(range));
 	add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
+
 	/* io port resource */
-	for (i = 0; i < 4; i++) {
-		reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
+	for (i = 0; i < AMD_NB_F1_IOPORT_RANGES; i++) {
+		reg = read_pci_config(bus, slot, 1,
+				AMD_NB_F1_IO_BASE_ADDR_REG + (i << 3));
 		if (!(reg & 3))
 			continue;
 
 		start = reg & 0xfff000;
-		reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3));
+		reg = read_pci_config(bus, slot, 1,
+				AMD_NB_F1_IO_LIMIT_ADDR_REG + (i << 3));
 		node = reg & 0x07;
 		link = (reg >> 4) & 0x03;
 		end = (reg & 0xfff000) | 0xfff;
@@ -169,6 +222,7 @@  static int __init early_fill_mp_bus_info(void)
 		update_res(info, start, end, IORESOURCE_IO, 1);
 		subtract_range(range, RANGE_NUM, start, end + 1);
 	}
+
 	/* add left over io port range to def node/link, [0, 0xffff] */
 	/* find the position */
 	info = find_pci_root_info(def_node, def_link);
@@ -211,22 +265,46 @@  static int __init early_fill_mp_bus_info(void)
 	}
 
 	/* mmio resource */
-	for (i = 0; i < 8; i++) {
-		reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3));
+	for (i = 0; i < AMD_NB_F1_MMIO_RANGES; i++) {
+		u64 tmp;
+		u32 base = AMD_NB_F1_MMIO_BASE_REG + (i << 3);
+		u32 limit = AMD_NB_F1_MMIO_LIMIT_REG + (i << 3);
+		u32 base_limit_hi = AMD_NB_F1_MMIO_BASE_LIMIT_HI_REG + (i << 2);
+
+		if (i >= 12) {
+			/* Range 12 base/limit starts at 0x2A0 */
+			base += 0x1c0;
+			limit += 0x1c0;
+			/* Range 12 base/limit hi starts at 0x2C0 */
+			base_limit_hi += 0x110;
+		} else if (i >= 8) {
+			/* Range 8 base/limit starts at 0x1A0 */
+			base += 0xe0;
+			limit += 0xe0;
+			/* Range 12 base/limit hi starts at 0x1C0 */
+			base_limit_hi += 0x20;
+		}
+
+		/* Base lo */
+		reg = _amd_read_pci_config(bus, slot, 1, base);
 		if (!(reg & 3))
 			continue;
 
-		start = reg & 0xffffff00; /* 39:16 on 31:8*/
-		start <<= 8;
-		reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
+		start = (reg & 0xffffff00UL) << 8;	/* 39:16 on 31:8 */
+
+		/* Limit lo */
+		reg = _amd_read_pci_config(bus, slot, 1, limit);
 		node = reg & 0x07;
 		link = (reg >> 4) & 0x03;
-		end = (reg & 0xffffff00);
-		end <<= 8;
-		end |= 0xffff;
+		end = (reg & 0xffffff00UL) << 8;	/* 39:16 on 31:8 */
+		end |= 0xffffUL;
 
-		info = find_pci_root_info(node, link);
+		/* Base/Limit hi */
+		tmp = _amd_read_pci_config(bus, slot, 1, base_limit_hi);
+		start |= ((tmp & 0xffUL) << 40);
+		end |= ((tmp & (0xffUL << 16)) << 24);
 
+		info = find_pci_root_info(node, link);
 		if (!info)
 			continue;
 
@@ -354,20 +432,24 @@  static struct notifier_block amd_cpu_notifier = {
 	.notifier_call	= amd_cpu_notify,
 };
 
-static void __init pci_enable_pci_io_ecs(void)
+static void __init pci_enable_pci_io_ecs(u8 bus, u8 slot)
 {
 #ifdef CONFIG_AMD_NB
 	unsigned int i, n;
+	u8 limit;
 
 	for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
-		u8 bus = amd_nb_bus_dev_ranges[i].bus;
-		u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
-		u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
+		/* Try matching for the bus range */
+		if ((bus != amd_nb_bus_dev_ranges[i].bus) ||
+		    (slot != amd_nb_bus_dev_ranges[i].dev_base))
+			continue;
+
+		limit = amd_nb_bus_dev_ranges[i].dev_limit;
 
+		/* Setup all northbridges within the range */
 		for (; slot < limit; ++slot) {
 			u32 val = read_pci_config(bus, slot, 3, 0);
-
-			if (!early_is_amd_nb(val))
+			if (!val)
 				continue;
 
 			val = read_pci_config(bus, slot, 3, 0x8c);
@@ -375,13 +457,14 @@  static void __init pci_enable_pci_io_ecs(void)
 				val |= ENABLE_CF8_EXT_CFG >> 32;
 				write_pci_config(bus, slot, 3, 0x8c, val);
 			}
+			amd_early_ecs_enabled = true;
 			++n;
 		}
 	}
 #endif
 }
 
-static int __init pci_io_ecs_init(void)
+static int __init pci_io_ecs_init(u8 bus, u8 slot)
 {
 	int cpu;
 
@@ -389,9 +472,7 @@  static int __init pci_io_ecs_init(void)
         if (boot_cpu_data.x86 < 0x10)
 		return 0;
 
-	/* Try the PCI method first. */
-	if (early_pci_allowed())
-		pci_enable_pci_io_ecs();
+	pci_enable_pci_io_ecs(bus, slot);
 
 	cpu_notifier_register_begin();
 	for_each_online_cpu(cpu)
@@ -411,7 +492,6 @@  static int __init amd_postcore_init(void)
 		return 0;
 
 	early_fill_mp_bus_info();
-	pci_io_ecs_init();
 
 	return 0;
 }