arm64: errata: add detection for AMEVCNTR01 incrementing incorrectly

Message ID 20220607125340.13635-1-ionela.voinescu@arm.com (mailing list archive)
State New, archived
Series arm64: errata: add detection for AMEVCNTR01 incrementing incorrectly

Commit Message

Ionela Voinescu June 7, 2022, 12:53 p.m. UTC
The AMU counter AMEVCNTR01 (constant counter) should increment at the same
rate as the system counter. On affected Cortex-A510 cores, AMEVCNTR01
increments incorrectly giving a significantly higher output value. This
results in inaccurate task scheduler utilization tracking and incorrect
feedback on CPU frequency.

Work around this problem in the arm64 topology code by always returning 0
when reading the affected counter. This prevents all users of this
counter from using it, either for frequency invariance or as the FFH
reference counter. The effect is the same as firmware disabling the
affected counters.

Details on how the two features are affected by this erratum:

 - AMU counters will not be used for frequency invariance for affected
   CPUs, or for CPUs in the same cpufreq policy. AMUs can still be used
   for frequency invariance for unaffected CPUs in the system. Although
   unlikely, if no alternative method can be found to support frequency
   invariance for the affected CPUs (either cpufreq based or based on
   platform counters), frequency invariance will be disabled. Please
   check the chapter on frequency invariance in
   Documentation/scheduler/sched-capacity.rst for details of its effect.

 - Given that FFH can be used to fetch either the core or the constant
   counter values, restrictions are lifted regarding any of these counters
   returning a valid (!0) value. Therefore FFH is considered supported
   if there is at least one CPU that supports AMUs, independent of any
   counters being enabled or affected by this erratum.

The above is achieved by adding a new erratum: ARM64_ERRATUM_2457168.

Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: James Morse <james.morse@arm.com>
---

Hi,


This patch is based on the information in the A510 Errata Notice,
version 13.0 at [1], and applies on top of v5.19-rc1.

[1] https://developer.arm.com/documentation/SDEN2397589/1300/?lang=en

Thanks,
Ionela.

 Documentation/arm64/silicon-errata.rst |  2 ++
 arch/arm64/Kconfig                     | 18 ++++++++++++++++++
 arch/arm64/include/asm/cpufeature.h    |  5 +++++
 arch/arm64/kernel/cpufeature.c         | 13 +++++++++++++
 arch/arm64/kernel/topology.c           | 10 ++++++++--
 5 files changed, 46 insertions(+), 2 deletions(-)

Comments

Catalin Marinas June 10, 2022, 4:47 p.m. UTC | #1
On Tue, Jun 07, 2022 at 01:53:40PM +0100, Ionela Voinescu wrote:
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 14a8f3d93add..80e0c700cecf 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -881,11 +881,16 @@ static inline bool cpu_has_pan(void)
>  #ifdef CONFIG_ARM64_AMU_EXTN
>  /* Check whether the cpu supports the Activity Monitors Unit (AMU) */
>  extern bool cpu_has_amu_feat(int cpu);
> +extern bool cpu_has_broken_amu_constcnt(void);
>  #else
>  static inline bool cpu_has_amu_feat(int cpu)
>  {
>  	return false;
>  }
> +static inline bool cpu_has_broken_amu_constcnt(void)
> +{
> +	return false;
> +}
>  #endif
>  
>  /* Get a cpu that supports the Activity Monitors Unit (AMU) */
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 42ea2bd856c6..b9e4b2bd2c63 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1791,6 +1791,19 @@ int get_cpu_with_amu_feat(void)
>  	return cpumask_any(&amu_cpus);
>  }
>  
> +bool cpu_has_broken_amu_constcnt(void)
> +{
> +	/* List of CPUs which have broken AMEVCNTR01 (constant counter) */
> +	static const struct midr_range cpus[] = {
> +#ifdef CONFIG_ARM64_ERRATUM_2457168
> +		MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
> +#endif
> +		{},
> +	};
> +
> +	return is_midr_in_range(read_cpuid_id(), cpus);
> +}

I'd rather not have this in cpufeature.c as it's not really a feature.
We have some precedent with checking errata in cpufeature.c but IIRC we
did that only to check whether to enable a feature or not in that file
(DBM).

> +
>  static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap)
>  {
>  	if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) {
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 9ab78ad826e2..d4b0b0a40515 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -127,7 +127,8 @@ int __init parse_acpi_topology(void)
>  
>  #ifdef CONFIG_ARM64_AMU_EXTN
>  #define read_corecnt()	read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0)
> -#define read_constcnt()	read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)
> +#define read_constcnt()	(cpu_has_broken_amu_constcnt() ? 0UL : \
> +			read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0))

How often is this called? You end up reading the cpuid, comparing the
range on each call. I guess you can't use a cpucap in the arm64_errata[]
array as you want a per-CPU check? Does it matter if we return 0UL for
all CPUs if one is affected?
Ionela Voinescu June 14, 2022, 1:42 p.m. UTC | #2
Hi Catalin,

Thank you for the review!

On Friday 10 Jun 2022 at 17:47:12 (+0100), Catalin Marinas wrote:
> On Tue, Jun 07, 2022 at 01:53:40PM +0100, Ionela Voinescu wrote:
> > diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> > index 14a8f3d93add..80e0c700cecf 100644
> > --- a/arch/arm64/include/asm/cpufeature.h
> > +++ b/arch/arm64/include/asm/cpufeature.h
> > @@ -881,11 +881,16 @@ static inline bool cpu_has_pan(void)
> >  #ifdef CONFIG_ARM64_AMU_EXTN
> >  /* Check whether the cpu supports the Activity Monitors Unit (AMU) */
> >  extern bool cpu_has_amu_feat(int cpu);
> > +extern bool cpu_has_broken_amu_constcnt(void);
> >  #else
> >  static inline bool cpu_has_amu_feat(int cpu)
> >  {
> >  	return false;
> >  }
> > +static inline bool cpu_has_broken_amu_constcnt(void)
> > +{
> > +	return false;
> > +}
> >  #endif
> >  
> >  /* Get a cpu that supports the Activity Monitors Unit (AMU) */
> > diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> > index 42ea2bd856c6..b9e4b2bd2c63 100644
> > --- a/arch/arm64/kernel/cpufeature.c
> > +++ b/arch/arm64/kernel/cpufeature.c
> > @@ -1791,6 +1791,19 @@ int get_cpu_with_amu_feat(void)
> >  	return cpumask_any(&amu_cpus);
> >  }
> >  
> > +bool cpu_has_broken_amu_constcnt(void)
> > +{
> > +	/* List of CPUs which have broken AMEVCNTR01 (constant counter) */
> > +	static const struct midr_range cpus[] = {
> > +#ifdef CONFIG_ARM64_ERRATUM_2457168
> > +		MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
> > +#endif
> > +		{},
> > +	};
> > +
> > +	return is_midr_in_range(read_cpuid_id(), cpus);
> > +}
> 
> I'd rather not have this in cpufeature.c as it's not really a feature.
> We have some precedent with checking errata in cpufeature.c but IIRC we
> did that only to check whether to enable a feature or not in that file
> (DBM).
> 

If it's okay with you I can move this to cpu_errata.c:arm64_errata[], but
the type of the capability would have to be
ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE. I see there are other workarounds
like this so I hope it's not a problem.
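
For illustration, a rough sketch of what such an entry in
cpu_errata.c:arm64_errata[] could look like (not part of this patch;
ARM64_WORKAROUND_2457168 is an assumed capability name that would need
to be added to the arm64 cpucaps list):

#ifdef CONFIG_ARM64_ERRATUM_2457168
	{
		.desc = "ARM erratum 2457168",
		/* assumed capability name, needs a matching cpucaps entry */
		.capability = ARM64_WORKAROUND_2457168,
		/* Cortex-A510 r0p0 .. r1p1 */
		ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
		/* weak, so unaffected late CPUs can still be brought online */
		.type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
	},
#endif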

> > +
> >  static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap)
> >  {
> >  	if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) {
> > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> > index 9ab78ad826e2..d4b0b0a40515 100644
> > --- a/arch/arm64/kernel/topology.c
> > +++ b/arch/arm64/kernel/topology.c
> > @@ -127,7 +127,8 @@ int __init parse_acpi_topology(void)
> >  
> >  #ifdef CONFIG_ARM64_AMU_EXTN
> >  #define read_corecnt()	read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0)
> > -#define read_constcnt()	read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)
> > +#define read_constcnt()	(cpu_has_broken_amu_constcnt() ? 0UL : \
> > +			read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0))
> 
> How often is this called? You end up reading the cpuid, comparing the
> range on each call. I guess you can't use a cpucap in the arm64_errata[]
> array as you want a per-CPU check? Does it matter if we return 0UL for
> all CPUs if one is affected?
> 

Yes, ideally we only want to disable the use of the constant counter for
the affected CPUs. In that case an alternative method (usually cpufreq)
can be used for FIE on the affected CPUs, while the other CPUs can still
use AMUs. Given that the bigger CPUs usually end up throttled, it would be
useful to maintain the use of AMUs for them even if we have affected
A510s in the system.

Also, I wanted to avoid disabling the feature altogether (by not setting
amu_cpus) as only one counter is affected, not all. But that would be the
simpler option: it would also remove the need for FFH changes, and we
would end up calling this only once for each CPU, in cpu_amu_enable() -
so no additional function would be needed, and functionality would be
unchanged, since all use cases for AMUs so far are tied to the use of the
constant counter. But we'd need to change how we handle this erratum in
the future when we add use cases for other counters.

So we do end up calling this function on the tick for CPUs that are not
affected, which is not ideal.

But I have a few ideas about how to make it nicer - for example, clearing
arch_const_cycles_prev before freq_counters_valid() runs, so that the use
of counters for FIE is disabled by checking for affected CPUs only once.
Handling FFH will be trickier, but let me see if I can do a better job
in v2.
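
Roughly, the first idea could look something like this in topology.c
(just a sketch, not tested: it assumes the MIDR check becomes a per-CPU
capability, here called ARM64_WORKAROUND_2457168, and that this runs
once on each CPU after the AMU reference counters are first sampled):

/* Sketch only: run on the affected CPU itself, once per CPU. */
static void amu_constcnt_erratum_fixup(void)
{
	if (!this_cpu_has_cap(ARM64_WORKAROUND_2457168))
		return;

	/*
	 * A zero snapshot makes freq_counters_valid() fail for this CPU,
	 * so it is never added to the FIE cpumask and the tick handler
	 * never reads the broken counter - no per-tick MIDR check needed.
	 */
	this_cpu_write(arch_const_cycles_prev, 0);
}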

Thanks,
Ionela.

> -- 
> Catalin
Catalin Marinas June 17, 2022, 6:24 p.m. UTC | #3
On Tue, Jun 14, 2022 at 02:42:58PM +0100, Ionela Voinescu wrote:
> On Friday 10 Jun 2022 at 17:47:12 (+0100), Catalin Marinas wrote:
> > On Tue, Jun 07, 2022 at 01:53:40PM +0100, Ionela Voinescu wrote:
> > > diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> > > index 42ea2bd856c6..b9e4b2bd2c63 100644
> > > --- a/arch/arm64/kernel/cpufeature.c
> > > +++ b/arch/arm64/kernel/cpufeature.c
> > > @@ -1791,6 +1791,19 @@ int get_cpu_with_amu_feat(void)
> > >  	return cpumask_any(&amu_cpus);
> > >  }
> > >  
> > > +bool cpu_has_broken_amu_constcnt(void)
> > > +{
> > > +	/* List of CPUs which have broken AMEVCNTR01 (constant counter) */
> > > +	static const struct midr_range cpus[] = {
> > > +#ifdef CONFIG_ARM64_ERRATUM_2457168
> > > +		MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
> > > +#endif
> > > +		{},
> > > +	};
> > > +
> > > +	return is_midr_in_range(read_cpuid_id(), cpus);
> > > +}
> > 
> > I'd rather not have this in cpufeature.c as it's not really a feature.
> > We have some precedent with checking errata in cpufeature.c but IIRC we
> > did that only to check whether to enable a feature or not in that file
> > (DBM).
> 
> If it's okay with you I can move this to cpu_errata.c:arm64_errata[], but
> the type of the capability would have to be
> ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE. I see there are other workarounds
> like this so I hope it's not a problem.

I think this should work. If you want to make a per-CPU decision,
instead of checking cpus_have_const_cap(), use this_cpu_has_cap(). It
would read the actual CPU regs pretty much like your
cpu_has_broken_amu_constcnt(), but at least it is more unified with the
errata framework.
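
Something along these lines (just a sketch; ARM64_WORKAROUND_2457168
stands in for whatever the new capability ends up being called):

#define read_constcnt()						\
	(this_cpu_has_cap(ARM64_WORKAROUND_2457168) ? 0UL :	\
	 read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0))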

Patch

diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index d27db84d585e..d9aff50c26cd 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -52,6 +52,8 @@  stable kernels.
 | Allwinner      | A64/R18         | UNKNOWN1        | SUN50I_ERRATUM_UNKNOWN1     |
 +----------------+-----------------+-----------------+-----------------------------+
 +----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A510     | #2457168        | ARM64_ERRATUM_2457168       |
++----------------+-----------------+-----------------+-----------------------------+
 | ARM            | Cortex-A510     | #2064142        | ARM64_ERRATUM_2064142       |
 +----------------+-----------------+-----------------+-----------------------------+
 | ARM            | Cortex-A510     | #2038923        | ARM64_ERRATUM_2038923       |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..a7bab0312261 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -880,6 +880,24 @@  config ARM64_ERRATUM_1902691
 
 	  If unsure, say Y.
 
+config ARM64_ERRATUM_2457168
+	bool "Cortex-A510: 2457168: workaround for AMEVCNTR01 incrementing incorrectly"
+	depends on ARM64_AMU_EXTN
+	default y
+	help
+	  This option adds the workaround for ARM Cortex-A510 erratum 2457168.
+
+	  The AMU counter AMEVCNTR01 (constant counter) should increment at the same rate
+	  as the system counter. On affected Cortex-A510 cores AMEVCNTR01 increments
+	  incorrectly giving a significantly higher output value.
+
+	  Work around this problem in the arm64 topology code by always returning 0 when
+	  reading the affected counter. This will disable all users of this counter from
+	  using it. This effect is the same as firmware disabling affected counters.
+
+	  If unsure, say Y.
+
+
 config CAVIUM_ERRATUM_22375
 	bool "Cavium erratum 22375, 24313"
 	default y
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 14a8f3d93add..80e0c700cecf 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -881,11 +881,16 @@  static inline bool cpu_has_pan(void)
 #ifdef CONFIG_ARM64_AMU_EXTN
 /* Check whether the cpu supports the Activity Monitors Unit (AMU) */
 extern bool cpu_has_amu_feat(int cpu);
+extern bool cpu_has_broken_amu_constcnt(void);
 #else
 static inline bool cpu_has_amu_feat(int cpu)
 {
 	return false;
 }
+static inline bool cpu_has_broken_amu_constcnt(void)
+{
+	return false;
+}
 #endif
 
 /* Get a cpu that supports the Activity Monitors Unit (AMU) */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 42ea2bd856c6..b9e4b2bd2c63 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1791,6 +1791,19 @@  int get_cpu_with_amu_feat(void)
 	return cpumask_any(&amu_cpus);
 }
 
+bool cpu_has_broken_amu_constcnt(void)
+{
+	/* List of CPUs which have broken AMEVCNTR01 (constant counter) */
+	static const struct midr_range cpus[] = {
+#ifdef CONFIG_ARM64_ERRATUM_2457168
+		MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
+#endif
+		{},
+	};
+
+	return is_midr_in_range(read_cpuid_id(), cpus);
+}
+
 static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap)
 {
 	if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) {
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 9ab78ad826e2..d4b0b0a40515 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -127,7 +127,8 @@  int __init parse_acpi_topology(void)
 
 #ifdef CONFIG_ARM64_AMU_EXTN
 #define read_corecnt()	read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0)
-#define read_constcnt()	read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)
+#define read_constcnt()	(cpu_has_broken_amu_constcnt() ? 0UL : \
+			read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0))
 #else
 #define read_corecnt()	(0UL)
 #define read_constcnt()	(0UL)
@@ -342,7 +343,12 @@  int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val)
  */
 bool cpc_ffh_supported(void)
 {
-	return freq_counters_valid(get_cpu_with_amu_feat());
+	int cpu = get_cpu_with_amu_feat();
+
+	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
+		return false;
+
+	return true;
 }
 
 int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val)