diff mbox series

irqchip/gic-v3-its: Add early memory allocation errata

Message ID 20180912095232.2110-1-matthias.bgg@kernel.org (mailing list archive)
State New, archived
Headers show
Series irqchip/gic-v3-its: Add early memory allocation errata | expand

Commit Message

Matthias Brugger Sept. 12, 2018, 9:52 a.m. UTC
From: Matthias Brugger <mbrugger@suse.com>

Some hardware does not implement two-level page tables so that
the amount of contigious memory needed by the baser is bigger
then the zone order. This is a known problem on Cavium Thunderx
with 4K page size.

We fix this by adding an errata which allocates the memory early
in the boot cycle, using the memblock allocator.

Signed-off-by: Matthias Brugger <mbrugger@suse.com>
---
 arch/arm64/Kconfig               | 12 ++++++++
 arch/arm64/include/asm/cpucaps.h |  3 +-
 arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
 drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
 4 files changed, 79 insertions(+), 19 deletions(-)

Comments

Matthias Brugger Oct. 4, 2018, 10:11 p.m. UTC | #1
Friendly reminder, if anyone has any comment on the patch :)

On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
> From: Matthias Brugger <mbrugger@suse.com>
> 
> Some hardware does not implement two-level page tables so that
> the amount of contigious memory needed by the baser is bigger
> then the zone order. This is a known problem on Cavium Thunderx
> with 4K page size.
> 
> We fix this by adding an errata which allocates the memory early
> in the boot cycle, using the memblock allocator.
> 
> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
> ---
>  arch/arm64/Kconfig               | 12 ++++++++
>  arch/arm64/include/asm/cpucaps.h |  3 +-
>  arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
>  drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
>  4 files changed, 79 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1b1a0e95c751..dfd9fe08f0b2 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
>  
>  	  If unsure, say Y.
>  
> +config CAVIUM_ALLOC_ITS_TABLE_EARLY
> +	bool "Cavium Thunderx: Allocate the its table early"
> +	default y
> +	depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
> +	depends on ARM_GIC_V3_ITS
> +	help
> +	  Cavium Thunderx needs to allocate 16MB of ITS translation table.
> +	  This can be bigger as MAX_ZONE_ORDER and need therefore be done
> +	  via the memblock allocator.
> +
> +	  If unsure, say Y.
> +
>  endmenu
>  
>  
> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
> index ae1f70450fb2..c98be4809b7f 100644
> --- a/arch/arm64/include/asm/cpucaps.h
> +++ b/arch/arm64/include/asm/cpucaps.h
> @@ -51,7 +51,8 @@
>  #define ARM64_SSBD				30
>  #define ARM64_MISMATCHED_CACHE_TYPE		31
>  #define ARM64_HAS_STAGE2_FWB			32
> +#define ARM64_WORKAROUND_CAVIUM_ITS_TABLE	33
>  
> -#define ARM64_NCAPS				33
> +#define ARM64_NCAPS				34
>  
>  #endif /* __ASM_CPUCAPS_H */
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index dec10898d688..7908f8fa3ba8 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -411,6 +411,29 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
>  }
>  #endif	/* CONFIG_ARM64_SSBD */
>  
> +#ifdef CONFIG_CAVIUM_ALLOC_ITS_TABLE_EARLY
> +#include <linux/bootmem.h>
> +extern void *its_base;
> +
> +/*
> + * Hardware that doesn't use two-level page table and exceedes
> + * the maximum order of pages that can be allocated by the buddy
> + * allocator. Try to use the memblock allocator instead.
> + * This has been observed on Cavium Thunderx machines with 4K
> + * page size.
> + */
> +static bool __init its_early_alloc(const struct arm64_cpu_capabilities *cap,
> +		       int scope)
> +{
> +	/* We need to allocate the table only once */
> +	if (scope & ARM64_CPUCAP_SCOPE_BOOT_CPU && !its_base)
> +		its_base = (void *)memblock_virt_alloc_nopanic(16 * SZ_1M,
> +					64 * SZ_1K);
> +
> +	return true;
> +}
> +#endif /* CONFIG_CAVIUM_ALLOC_ITS_TABLE_EARLY */
> +
>  #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max)	\
>  	.matches = is_affected_midr_range,			\
>  	.midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max)
> @@ -679,6 +702,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
>  		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
>  		.matches = has_ssbd_mitigation,
>  	},
> +#endif
> +#ifdef CONFIG_CAVIUM_ALLOC_ITS_TABLE_EARLY
> +	{
> +		/* Cavium ThunderX, pass 1.x - 2.1 */
> +		.desc = "Cavium alloc ITS table early",
> +		.capability = ARM64_WORKAROUND_CAVIUM_ITS_TABLE,
> +		.type = ARM64_CPUCAP_SCOPE_BOOT_CPU,
> +		.matches = its_early_alloc,
> +		.midr_range = MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
> +	},
>  #endif
>  	{
>  	}
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index c2df341ff6fa..b78546740a0d 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -87,6 +87,8 @@ struct its_baser {
>  	u32		psz;
>  };
>  
> +void *its_base;
> +
>  struct its_device;
>  
>  /*
> @@ -1666,7 +1668,7 @@ static void its_write_baser(struct its_node *its, struct its_baser *baser,
>  	baser->val = its_read_baser(its, baser);
>  }
>  
> -static int its_setup_baser(struct its_node *its, struct its_baser *baser,
> +static int __init its_setup_baser(struct its_node *its, struct its_baser *baser,
>  			   u64 cache, u64 shr, u32 psz, u32 order,
>  			   bool indirect)
>  {
> @@ -1675,7 +1677,6 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  	u64 type = GITS_BASER_TYPE(val);
>  	u64 baser_phys, tmp;
>  	u32 alloc_pages;
> -	void *base;
>  
>  retry_alloc_baser:
>  	alloc_pages = (PAGE_ORDER_TO_SIZE(order) / psz);
> @@ -1687,11 +1688,22 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		order = get_order(GITS_BASER_PAGES_MAX * psz);
>  	}
>  
> -	base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
> -	if (!base)
> -		return -ENOMEM;
> +	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_ITS_TABLE)) {
> +		if (!its_base) {
> +			pr_warn("ITS@%pa: %s Allocation using memblock failed %pS\n",
> +				&its->phys_base, its_base_type_string[type],
> +				its_base);
> +			return -ENOMEM;
> +		}
> +
> +	} else {
> +		its_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
> +							order);
> +		if (!its_base)
> +			return -ENOMEM;
> +	}
>  
> -	baser_phys = virt_to_phys(base);
> +	baser_phys = virt_to_phys(its_base);
>  
>  	/* Check if the physical address of the memory is above 48bits */
>  	if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && (baser_phys >> 48)) {
> @@ -1699,7 +1711,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		/* 52bit PA is supported only when PageSize=64K */
>  		if (psz != SZ_64K) {
>  			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
> -			free_pages((unsigned long)base, order);
> +			free_pages((unsigned long)its_base, order);
>  			return -ENXIO;
>  		}
>  
> @@ -1744,7 +1756,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		shr = tmp & GITS_BASER_SHAREABILITY_MASK;
>  		if (!shr) {
>  			cache = GITS_BASER_nC;
> -			gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order));
> +			gic_flush_dcache_to_poc(its_base, PAGE_ORDER_TO_SIZE(order));
>  		}
>  		goto retry_baser;
>  	}
> @@ -1755,7 +1767,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		 * size and retry. If we reach 4K, then
>  		 * something is horribly wrong...
>  		 */
> -		free_pages((unsigned long)base, order);
> +		free_pages((unsigned long)its_base, order);
>  		baser->base = NULL;
>  
>  		switch (psz) {
> @@ -1772,19 +1784,19 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
>  		       &its->phys_base, its_base_type_string[type],
>  		       val, tmp);
> -		free_pages((unsigned long)base, order);
> +		free_pages((unsigned long)its_base, order);
>  		return -ENXIO;
>  	}
>  
>  	baser->order = order;
> -	baser->base = base;
> +	baser->base = its_base;
>  	baser->psz = psz;
>  	tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz;
>  
>  	pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n",
>  		&its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp),
>  		its_base_type_string[type],
> -		(unsigned long)virt_to_phys(base),
> +		(unsigned long)virt_to_phys(its_base),
>  		indirect ? "indirect" : "flat", (int)esz,
>  		psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT);
>  
> @@ -1832,12 +1844,14 @@ static bool its_parse_indirect_baser(struct its_node *its,
>  	 * feature is not supported by hardware.
>  	 */
>  	new_order = max_t(u32, get_order(esz << ids), new_order);
> -	if (new_order >= MAX_ORDER) {
> -		new_order = MAX_ORDER - 1;
> -		ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
> -		pr_warn("ITS@%pa: %s Table too large, reduce ids %u->%u\n",
> -			&its->phys_base, its_base_type_string[type],
> -			its->device_ids, ids);
> +	if (!cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_ITS_TABLE)) {
> +		if (new_order >= MAX_ORDER) {
> +			new_order = MAX_ORDER - 1;
> +			ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
> +			pr_warn("ITS@%pa: %s Table too large, reduce ids %u->%u\n",
> +				&its->phys_base, its_base_type_string[type],
> +				its->device_ids, ids);
> +		}
>  	}
>  
>  	*order = new_order;
>
Marc Zyngier Oct. 5, 2018, 10:55 a.m. UTC | #2
Hi Matthias,

On 04/10/18 23:11, Matthias Brugger wrote:
> Friendly reminder, if anyone has any comment on the patch :)
> 
> On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
>> From: Matthias Brugger <mbrugger@suse.com>
>>
>> Some hardware does not implement two-level page tables so that
>> the amount of contigious memory needed by the baser is bigger
>> then the zone order. This is a known problem on Cavium Thunderx
>> with 4K page size.
>>
>> We fix this by adding an errata which allocates the memory early
>> in the boot cycle, using the memblock allocator.
>>
>> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
>> ---
>>   arch/arm64/Kconfig               | 12 ++++++++
>>   arch/arm64/include/asm/cpucaps.h |  3 +-
>>   arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
>>   drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
>>   4 files changed, 79 insertions(+), 19 deletions(-)

My only comment would be to state how much I dislike both the HW and the 
patch... ;-) The idea that we have some erratum that depends on the page 
size doesn't feel good at all.

>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 1b1a0e95c751..dfd9fe08f0b2 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
>>   
>>   	  If unsure, say Y.
>>   
>> +config CAVIUM_ALLOC_ITS_TABLE_EARLY
>> +	bool "Cavium Thunderx: Allocate the its table early"
>> +	default y
>> +	depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13

Here's a though: Why don't we ensure that FORCE_MAX_ZONEORDER is such as 
we could always allocate the same amount of memory, no matter what the 
page size is? That, or bump FORCE_MAX_ZONEORDER to 13 if the kernel 
includes support for TX1.

Any of this of course requires buy-in from the arm64 maintainers, as 
this is quite a departure from the way things work so far.

Thanks,
	
	M.
Matthias Brugger Oct. 5, 2018, 12:33 p.m. UTC | #3
On 05/10/2018 12:55, Marc Zyngier wrote:
> Hi Matthias,
> 
> On 04/10/18 23:11, Matthias Brugger wrote:
>> Friendly reminder, if anyone has any comment on the patch :)
>>
>> On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
>>> From: Matthias Brugger <mbrugger@suse.com>
>>>
>>> Some hardware does not implement two-level page tables so that
>>> the amount of contigious memory needed by the baser is bigger
>>> then the zone order. This is a known problem on Cavium Thunderx
>>> with 4K page size.
>>>
>>> We fix this by adding an errata which allocates the memory early
>>> in the boot cycle, using the memblock allocator.
>>>
>>> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
>>> ---
>>>   arch/arm64/Kconfig               | 12 ++++++++
>>>   arch/arm64/include/asm/cpucaps.h |  3 +-
>>>   arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
>>>   drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
>>>   4 files changed, 79 insertions(+), 19 deletions(-)
> 
> My only comment would be to state how much I dislike both the HW and the
> patch... ;-) The idea that we have some erratum that depends on the page size
> doesn't feel good at all.
> 

Well ugly HW needs ugly patches ;-)

>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index 1b1a0e95c751..dfd9fe08f0b2 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
>>>           If unsure, say Y.
>>>   +config CAVIUM_ALLOC_ITS_TABLE_EARLY
>>> +    bool "Cavium Thunderx: Allocate the its table early"
>>> +    default y
>>> +    depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
> 
> Here's a though: Why don't we ensure that FORCE_MAX_ZONEORDER is such as we
> could always allocate the same amount of memory, no matter what the page size
> is? That, or bump FORCE_MAX_ZONEORDER to 13 if the kernel includes support for TX1.
> 

Bumping FORCE_MAX_ZONEORDER when TX1 is supported was proposed here:
https://patchwork.kernel.org/patch/6322281/

To bring in some more history, the CMA approach ended with this discussion:
https://patchwork.kernel.org/patch/9888041/

> Any of this of course requires buy-in from the arm64 maintainers, as this is
> quite a departure from the way things work so far.
> 

With my distribution head on, I would prefer a solution that does not change
FORCE_MAX_ZONEORDER. That's how I came to the idea providing a third solution to
the same problem :)

Regards,
Matthias
Marc Zyngier Oct. 5, 2018, 1:42 p.m. UTC | #4
On 05/10/18 13:33, Matthias Brugger wrote:
> 
> 
> On 05/10/2018 12:55, Marc Zyngier wrote:
>> Hi Matthias,
>>
>> On 04/10/18 23:11, Matthias Brugger wrote:
>>> Friendly reminder, if anyone has any comment on the patch :)
>>>
>>> On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
>>>> From: Matthias Brugger <mbrugger@suse.com>
>>>>
>>>> Some hardware does not implement two-level page tables so that
>>>> the amount of contigious memory needed by the baser is bigger
>>>> then the zone order. This is a known problem on Cavium Thunderx
>>>> with 4K page size.
>>>>
>>>> We fix this by adding an errata which allocates the memory early
>>>> in the boot cycle, using the memblock allocator.
>>>>
>>>> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
>>>> ---
>>>>    arch/arm64/Kconfig               | 12 ++++++++
>>>>    arch/arm64/include/asm/cpucaps.h |  3 +-
>>>>    arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
>>>>    drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
>>>>    4 files changed, 79 insertions(+), 19 deletions(-)
>>
>> My only comment would be to state how much I dislike both the HW and the
>> patch... ;-) The idea that we have some erratum that depends on the page size
>> doesn't feel good at all.
>>
> 
> Well ugly HW needs ugly patches ;-)
> 
>>>>
>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>>> index 1b1a0e95c751..dfd9fe08f0b2 100644
>>>> --- a/arch/arm64/Kconfig
>>>> +++ b/arch/arm64/Kconfig
>>>> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
>>>>            If unsure, say Y.
>>>>    +config CAVIUM_ALLOC_ITS_TABLE_EARLY
>>>> +    bool "Cavium Thunderx: Allocate the its table early"
>>>> +    default y
>>>> +    depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
>>
>> Here's a though: Why don't we ensure that FORCE_MAX_ZONEORDER is such as we
>> could always allocate the same amount of memory, no matter what the page size
>> is? That, or bump FORCE_MAX_ZONEORDER to 13 if the kernel includes support for TX1.
>>
> 
> Bumping FORCE_MAX_ZONEORDER when TX1 is supported was proposed here:
> https://patchwork.kernel.org/patch/6322281/
> 
> To bring in some more history, the CMA approach ended with this discussion:
> https://patchwork.kernel.org/patch/9888041/
> 
>> Any of this of course requires buy-in from the arm64 maintainers, as this is
>> quite a departure from the way things work so far.
>>
> 
> With my distribution head on, I would prefer a solution that does not change
> FORCE_MAX_ZONEORDER. That's how I came to the idea providing a third solution to
> the same problem :)

Why is that a problem? What impact does this have on your favourite distro?

Thanks,

	M.
Matthias Brugger Oct. 5, 2018, 2:13 p.m. UTC | #5
On 05/10/2018 15:42, Marc Zyngier wrote:
> On 05/10/18 13:33, Matthias Brugger wrote:
>>
>>
>> On 05/10/2018 12:55, Marc Zyngier wrote:
>>> Hi Matthias,
>>>
>>> On 04/10/18 23:11, Matthias Brugger wrote:
>>>> Friendly reminder, if anyone has any comment on the patch :)
>>>>
>>>> On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
>>>>> From: Matthias Brugger <mbrugger@suse.com>
>>>>>
>>>>> Some hardware does not implement two-level page tables so that
>>>>> the amount of contigious memory needed by the baser is bigger
>>>>> then the zone order. This is a known problem on Cavium Thunderx
>>>>> with 4K page size.
>>>>>
>>>>> We fix this by adding an errata which allocates the memory early
>>>>> in the boot cycle, using the memblock allocator.
>>>>>
>>>>> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
>>>>> ---
>>>>>    arch/arm64/Kconfig               | 12 ++++++++
>>>>>    arch/arm64/include/asm/cpucaps.h |  3 +-
>>>>>    arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
>>>>>    drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
>>>>>    4 files changed, 79 insertions(+), 19 deletions(-)
>>>
>>> My only comment would be to state how much I dislike both the HW and the
>>> patch... ;-) The idea that we have some erratum that depends on the page size
>>> doesn't feel good at all.
>>>
>>
>> Well ugly HW needs ugly patches ;-)
>>
>>>>>
>>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>>>> index 1b1a0e95c751..dfd9fe08f0b2 100644
>>>>> --- a/arch/arm64/Kconfig
>>>>> +++ b/arch/arm64/Kconfig
>>>>> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
>>>>>            If unsure, say Y.
>>>>>    +config CAVIUM_ALLOC_ITS_TABLE_EARLY
>>>>> +    bool "Cavium Thunderx: Allocate the its table early"
>>>>> +    default y
>>>>> +    depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
>>>
>>> Here's a though: Why don't we ensure that FORCE_MAX_ZONEORDER is such as we
>>> could always allocate the same amount of memory, no matter what the page size
>>> is? That, or bump FORCE_MAX_ZONEORDER to 13 if the kernel includes support
>>> for TX1.
>>>
>>
>> Bumping FORCE_MAX_ZONEORDER when TX1 is supported was proposed here:
>> https://patchwork.kernel.org/patch/6322281/
>>
>> To bring in some more history, the CMA approach ended with this discussion:
>> https://patchwork.kernel.org/patch/9888041/
>>
>>> Any of this of course requires buy-in from the arm64 maintainers, as this is
>>> quite a departure from the way things work so far.
>>>
>>
>> With my distribution head on, I would prefer a solution that does not change
>> FORCE_MAX_ZONEORDER. That's how I came to the idea providing a third solution to
>> the same problem :)
> 
> Why is that a problem? What impact does this have on your favourite distro?
> 

The impact is on changing FORCE_MAX_ZONEORDER on an already released kernel will
break Kernel ABI and with that all external modules. I know that's nothing
upstream cares too much about, but the distros do :)
Richter, Robert Oct. 5, 2018, 3:15 p.m. UTC | #6
On 05.10.18 16:13:48, Matthias Brugger wrote:
> On 05/10/2018 15:42, Marc Zyngier wrote:
> > On 05/10/18 13:33, Matthias Brugger wrote:

> >> With my distribution head on, I would prefer a solution that does not change
> >> FORCE_MAX_ZONEORDER. That's how I came to the idea providing a third solution to
> >> the same problem :)
> >
> > Why is that a problem? What impact does this have on your favourite distro?
> >
> 
> The impact is on changing FORCE_MAX_ZONEORDER on an already released kernel will
> break Kernel ABI and with that all external modules. I know that's nothing
> upstream cares too much about, but the distros do :)

Wrt upstream, there is no way to do this change without patching the
kernel. Thus, it is impossible to run an unpatched 4k page size kernel
on ThunderX.

-Robert
Marc Zyngier Oct. 5, 2018, 3:17 p.m. UTC | #7
On Fri, 05 Oct 2018 15:13:48 +0100,
Matthias Brugger <matthias.bgg@gmail.com> wrote:
> 
> 
> 
> On 05/10/2018 15:42, Marc Zyngier wrote:
> > On 05/10/18 13:33, Matthias Brugger wrote:
> >>
> >>
> >> On 05/10/2018 12:55, Marc Zyngier wrote:
> >>> Hi Matthias,
> >>>
> >>> On 04/10/18 23:11, Matthias Brugger wrote:
> >>>> Friendly reminder, if anyone has any comment on the patch :)
> >>>>
> >>>> On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
> >>>>> From: Matthias Brugger <mbrugger@suse.com>
> >>>>>
> >>>>> Some hardware does not implement two-level page tables so that
> >>>>> the amount of contigious memory needed by the baser is bigger
> >>>>> then the zone order. This is a known problem on Cavium Thunderx
> >>>>> with 4K page size.
> >>>>>
> >>>>> We fix this by adding an errata which allocates the memory early
> >>>>> in the boot cycle, using the memblock allocator.
> >>>>>
> >>>>> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
> >>>>> ---
> >>>>>    arch/arm64/Kconfig               | 12 ++++++++
> >>>>>    arch/arm64/include/asm/cpucaps.h |  3 +-
> >>>>>    arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
> >>>>>    drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
> >>>>>    4 files changed, 79 insertions(+), 19 deletions(-)
> >>>
> >>> My only comment would be to state how much I dislike both the HW and the
> >>> patch... ;-) The idea that we have some erratum that depends on the page size
> >>> doesn't feel good at all.
> >>>
> >>
> >> Well ugly HW needs ugly patches ;-)
> >>
> >>>>>
> >>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> >>>>> index 1b1a0e95c751..dfd9fe08f0b2 100644
> >>>>> --- a/arch/arm64/Kconfig
> >>>>> +++ b/arch/arm64/Kconfig
> >>>>> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
> >>>>>            If unsure, say Y.
> >>>>>    +config CAVIUM_ALLOC_ITS_TABLE_EARLY
> >>>>> +    bool "Cavium Thunderx: Allocate the its table early"
> >>>>> +    default y
> >>>>> +    depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
> >>>
> >>> Here's a though: Why don't we ensure that FORCE_MAX_ZONEORDER is such as we
> >>> could always allocate the same amount of memory, no matter what the page size
> >>> is? That, or bump FORCE_MAX_ZONEORDER to 13 if the kernel includes support
> >>> for TX1.
> >>>
> >>
> >> Bumping FORCE_MAX_ZONEORDER when TX1 is supported was proposed here:
> >> https://patchwork.kernel.org/patch/6322281/
> >>
> >> To bring in some more history, the CMA approach ended with this discussion:
> >> https://patchwork.kernel.org/patch/9888041/
> >>
> >>> Any of this of course requires buy-in from the arm64 maintainers, as this is
> >>> quite a departure from the way things work so far.
> >>>
> >>
> >> With my distribution head on, I would prefer a solution that does not change
> >> FORCE_MAX_ZONEORDER. That's how I came to the idea providing a third solution to
> >> the same problem :)
> > 
> > Why is that a problem? What impact does this have on your favourite distro?
> > 
> 
> The impact is on changing FORCE_MAX_ZONEORDER on an already released
> kernel will break Kernel ABI and with that all external modules. I
> know that's nothing upstream cares too much about, but the distros
> do :)

Unfortunately, that's something you're bringing upon yourself, and I'm
afraid I can't really take this into account. You could always bump
that ABI if you really want to support this platform as, at the end of
the day, this is something you're in control of.

But I'd really like to hear what Catalin or Will think of this (Will
wasn't massively impressed by this 3 years ago, and I wonder if his
approach has changed since).

Thanks,

	M.
Will Deacon Oct. 10, 2018, 5:08 p.m. UTC | #8
On Fri, Oct 05, 2018 at 04:17:30PM +0100, Marc Zyngier wrote:
> On Fri, 05 Oct 2018 15:13:48 +0100,
> Matthias Brugger <matthias.bgg@gmail.com> wrote:
> > 
> > 
> > 
> > On 05/10/2018 15:42, Marc Zyngier wrote:
> > > On 05/10/18 13:33, Matthias Brugger wrote:
> > >>
> > >>
> > >> On 05/10/2018 12:55, Marc Zyngier wrote:
> > >>> Hi Matthias,
> > >>>
> > >>> On 04/10/18 23:11, Matthias Brugger wrote:
> > >>>> Friendly reminder, if anyone has any comment on the patch :)
> > >>>>
> > >>>> On 9/12/18 11:52 AM, matthias.bgg@kernel.org wrote:
> > >>>>> From: Matthias Brugger <mbrugger@suse.com>
> > >>>>>
> > >>>>> Some hardware does not implement two-level page tables so that
> > >>>>> the amount of contigious memory needed by the baser is bigger
> > >>>>> then the zone order. This is a known problem on Cavium Thunderx
> > >>>>> with 4K page size.
> > >>>>>
> > >>>>> We fix this by adding an errata which allocates the memory early
> > >>>>> in the boot cycle, using the memblock allocator.
> > >>>>>
> > >>>>> Signed-off-by: Matthias Brugger <mbrugger@suse.com>
> > >>>>> ---
> > >>>>>    arch/arm64/Kconfig               | 12 ++++++++
> > >>>>>    arch/arm64/include/asm/cpucaps.h |  3 +-
> > >>>>>    arch/arm64/kernel/cpu_errata.c   | 33 +++++++++++++++++++++
> > >>>>>    drivers/irqchip/irq-gic-v3-its.c | 50 ++++++++++++++++++++------------
> > >>>>>    4 files changed, 79 insertions(+), 19 deletions(-)
> > >>>
> > >>> My only comment would be to state how much I dislike both the HW and the
> > >>> patch... ;-) The idea that we have some erratum that depends on the page size
> > >>> doesn't feel good at all.
> > >>>
> > >>
> > >> Well ugly HW needs ugly patches ;-)
> > >>
> > >>>>>
> > >>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > >>>>> index 1b1a0e95c751..dfd9fe08f0b2 100644
> > >>>>> --- a/arch/arm64/Kconfig
> > >>>>> +++ b/arch/arm64/Kconfig
> > >>>>> @@ -597,6 +597,18 @@ config QCOM_FALKOR_ERRATUM_E1041
> > >>>>>            If unsure, say Y.
> > >>>>>    +config CAVIUM_ALLOC_ITS_TABLE_EARLY
> > >>>>> +    bool "Cavium Thunderx: Allocate the its table early"
> > >>>>> +    default y
> > >>>>> +    depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
> > >>>
> > >>> Here's a though: Why don't we ensure that FORCE_MAX_ZONEORDER is such as we
> > >>> could always allocate the same amount of memory, no matter what the page size
> > >>> is? That, or bump FORCE_MAX_ZONEORDER to 13 if the kernel includes support
> > >>> for TX1.
> > >>>
> > >>
> > >> Bumping FORCE_MAX_ZONEORDER when TX1 is supported was proposed here:
> > >> https://patchwork.kernel.org/patch/6322281/
> > >>
> > >> To bring in some more history, the CMA approach ended with this discussion:
> > >> https://patchwork.kernel.org/patch/9888041/
> > >>
> > >>> Any of this of course requires buy-in from the arm64 maintainers, as this is
> > >>> quite a departure from the way things work so far.
> > >>>
> > >>
> > >> With my distribution head on, I would prefer a solution that does not change
> > >> FORCE_MAX_ZONEORDER. That's how I came to the idea providing a third solution to
> > >> the same problem :)
> > > 
> > > Why is that a problem? What impact does this have on your favourite distro?
> > > 
> > 
> > The impact is on changing FORCE_MAX_ZONEORDER on an already released
> > kernel will break Kernel ABI and with that all external modules. I
> > know that's nothing upstream cares too much about, but the distros
> > do :)
> 
> Unfortunately, that's something you're bringing upon yourself, and I'm
> afraid I can't really take this into account. You could always bump
> that ABI if you really want to support this platform as, at the end of
> the day, this is something you're in control of.
> 
> But I'd really like to hear what Catalin or Will think of this (Will
> wasn't massively impressed by this 3 years ago, and I wonder if his
> approach has changed since).

I don't see anything that changes my opinion here, and the reality is
that bumping FORCE_MAX_ZONEORDER doesn't guarantee anything about the
allocation succeeding. I'm also hesitant to punish other platforms
(including TX2!) because of this TX1 "feature".

One thing I'm unsure about is why the CMA approach failed; the link
above is a complain about the use of subsys_initcall() afaict.

Will
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1b1a0e95c751..dfd9fe08f0b2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -597,6 +597,18 @@  config QCOM_FALKOR_ERRATUM_E1041
 
 	  If unsure, say Y.
 
+config CAVIUM_ALLOC_ITS_TABLE_EARLY
+	bool "Cavium Thunderx: Allocate the its table early"
+	default y
+	depends on ARM64_4K_PAGES && FORCE_MAX_ZONEORDER < 13
+	depends on ARM_GIC_V3_ITS
+	help
+	  Cavium Thunderx needs to allocate 16MB of ITS translation table.
+	  This can be bigger as MAX_ZONE_ORDER and need therefore be done
+	  via the memblock allocator.
+
+	  If unsure, say Y.
+
 endmenu
 
 
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ae1f70450fb2..c98be4809b7f 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -51,7 +51,8 @@ 
 #define ARM64_SSBD				30
 #define ARM64_MISMATCHED_CACHE_TYPE		31
 #define ARM64_HAS_STAGE2_FWB			32
+#define ARM64_WORKAROUND_CAVIUM_ITS_TABLE	33
 
-#define ARM64_NCAPS				33
+#define ARM64_NCAPS				34
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index dec10898d688..7908f8fa3ba8 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -411,6 +411,29 @@  static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
 }
 #endif	/* CONFIG_ARM64_SSBD */
 
+#ifdef CONFIG_CAVIUM_ALLOC_ITS_TABLE_EARLY
+#include <linux/bootmem.h>
+extern void *its_base;
+
+/*
+ * Hardware that doesn't use two-level page table and exceedes
+ * the maximum order of pages that can be allocated by the buddy
+ * allocator. Try to use the memblock allocator instead.
+ * This has been observed on Cavium Thunderx machines with 4K
+ * page size.
+ */
+static bool __init its_early_alloc(const struct arm64_cpu_capabilities *cap,
+		       int scope)
+{
+	/* We need to allocate the table only once */
+	if (scope & ARM64_CPUCAP_SCOPE_BOOT_CPU && !its_base)
+		its_base = (void *)memblock_virt_alloc_nopanic(16 * SZ_1M,
+					64 * SZ_1K);
+
+	return true;
+}
+#endif /* CONFIG_CAVIUM_ALLOC_ITS_TABLE_EARLY */
+
 #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max)	\
 	.matches = is_affected_midr_range,			\
 	.midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max)
@@ -679,6 +702,16 @@  const struct arm64_cpu_capabilities arm64_errata[] = {
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = has_ssbd_mitigation,
 	},
+#endif
+#ifdef CONFIG_CAVIUM_ALLOC_ITS_TABLE_EARLY
+	{
+		/* Cavium ThunderX, pass 1.x - 2.1 */
+		.desc = "Cavium alloc ITS table early",
+		.capability = ARM64_WORKAROUND_CAVIUM_ITS_TABLE,
+		.type = ARM64_CPUCAP_SCOPE_BOOT_CPU,
+		.matches = its_early_alloc,
+		.midr_range = MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
+	},
 #endif
 	{
 	}
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index c2df341ff6fa..b78546740a0d 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -87,6 +87,8 @@  struct its_baser {
 	u32		psz;
 };
 
+void *its_base;
+
 struct its_device;
 
 /*
@@ -1666,7 +1668,7 @@  static void its_write_baser(struct its_node *its, struct its_baser *baser,
 	baser->val = its_read_baser(its, baser);
 }
 
-static int its_setup_baser(struct its_node *its, struct its_baser *baser,
+static int __init its_setup_baser(struct its_node *its, struct its_baser *baser,
 			   u64 cache, u64 shr, u32 psz, u32 order,
 			   bool indirect)
 {
@@ -1675,7 +1677,6 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 	u64 type = GITS_BASER_TYPE(val);
 	u64 baser_phys, tmp;
 	u32 alloc_pages;
-	void *base;
 
 retry_alloc_baser:
 	alloc_pages = (PAGE_ORDER_TO_SIZE(order) / psz);
@@ -1687,11 +1688,22 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		order = get_order(GITS_BASER_PAGES_MAX * psz);
 	}
 
-	base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!base)
-		return -ENOMEM;
+	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_ITS_TABLE)) {
+		if (!its_base) {
+			pr_warn("ITS@%pa: %s Allocation using memblock failed %pS\n",
+				&its->phys_base, its_base_type_string[type],
+				its_base);
+			return -ENOMEM;
+		}
+
+	} else {
+		its_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+							order);
+		if (!its_base)
+			return -ENOMEM;
+	}
 
-	baser_phys = virt_to_phys(base);
+	baser_phys = virt_to_phys(its_base);
 
 	/* Check if the physical address of the memory is above 48bits */
 	if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && (baser_phys >> 48)) {
@@ -1699,7 +1711,7 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		/* 52bit PA is supported only when PageSize=64K */
 		if (psz != SZ_64K) {
 			pr_err("ITS: no 52bit PA support when psz=%d\n", psz);
-			free_pages((unsigned long)base, order);
+			free_pages((unsigned long)its_base, order);
 			return -ENXIO;
 		}
 
@@ -1744,7 +1756,7 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		shr = tmp & GITS_BASER_SHAREABILITY_MASK;
 		if (!shr) {
 			cache = GITS_BASER_nC;
-			gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order));
+			gic_flush_dcache_to_poc(its_base, PAGE_ORDER_TO_SIZE(order));
 		}
 		goto retry_baser;
 	}
@@ -1755,7 +1767,7 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		 * size and retry. If we reach 4K, then
 		 * something is horribly wrong...
 		 */
-		free_pages((unsigned long)base, order);
+		free_pages((unsigned long)its_base, order);
 		baser->base = NULL;
 
 		switch (psz) {
@@ -1772,19 +1784,19 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n",
 		       &its->phys_base, its_base_type_string[type],
 		       val, tmp);
-		free_pages((unsigned long)base, order);
+		free_pages((unsigned long)its_base, order);
 		return -ENXIO;
 	}
 
 	baser->order = order;
-	baser->base = base;
+	baser->base = its_base;
 	baser->psz = psz;
 	tmp = indirect ? GITS_LVL1_ENTRY_SIZE : esz;
 
 	pr_info("ITS@%pa: allocated %d %s @%lx (%s, esz %d, psz %dK, shr %d)\n",
 		&its->phys_base, (int)(PAGE_ORDER_TO_SIZE(order) / (int)tmp),
 		its_base_type_string[type],
-		(unsigned long)virt_to_phys(base),
+		(unsigned long)virt_to_phys(its_base),
 		indirect ? "indirect" : "flat", (int)esz,
 		psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT);
 
@@ -1832,12 +1844,14 @@  static bool its_parse_indirect_baser(struct its_node *its,
 	 * feature is not supported by hardware.
 	 */
 	new_order = max_t(u32, get_order(esz << ids), new_order);
-	if (new_order >= MAX_ORDER) {
-		new_order = MAX_ORDER - 1;
-		ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
-		pr_warn("ITS@%pa: %s Table too large, reduce ids %u->%u\n",
-			&its->phys_base, its_base_type_string[type],
-			its->device_ids, ids);
+	if (!cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_ITS_TABLE)) {
+		if (new_order >= MAX_ORDER) {
+			new_order = MAX_ORDER - 1;
+			ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
+			pr_warn("ITS@%pa: %s Table too large, reduce ids %u->%u\n",
+				&its->phys_base, its_base_type_string[type],
+				its->device_ids, ids);
+		}
 	}
 
 	*order = new_order;