diff mbox series

[v4,5/6] perf vendor events arm64: Update stall_slot workaround for N2 r0p3

Message ID 20230807142138.288713-6-james.clark@arm.com (mailing list archive)
State New, archived
Headers show
Series perf vendor events arm64: Update N2 and V2 metrics and events using Arm telemetry repo | expand

Commit Message

James Clark Aug. 7, 2023, 2:20 p.m. UTC
N2 r0p3 doesn't require the workaround [1], so gating on (#slots - 5) no
longer works because all N2s have 5 slots. Add a new expression builtin
that identifies the need for the workaround correctly.

[1]: https://gitlab.arm.com/telemetry-solution/telemetry-solution/-/blob/main/data/pmu/cpu/neoverse/neoverse-n2-r0p3.json
Signed-off-by: James Clark <james.clark@arm.com>
---
 tools/perf/arch/arm64/util/pmu.c              | 21 +++++++++++++++++++
 .../arm64/arm/neoverse-n2-v2/metrics.json     |  8 +++----
 tools/perf/util/expr.c                        |  4 ++++
 tools/perf/util/pmu.c                         |  6 ++++++
 tools/perf/util/pmu.h                         |  1 +
 5 files changed, 36 insertions(+), 4 deletions(-)

Comments

John Garry Aug. 8, 2023, 10:18 a.m. UTC | #1
On 07/08/2023 15:20, James Clark wrote:

Hi James,

Thanks for the effort in doing this.

> N2 r0p3 doesn't require the workaround [1], so gating on (#slots - 5) no
> longer works because all N2s have 5 slots. Add a new expression builtin
> that identifies the need for the workaround correctly.
> 
> [1]: https://urldefense.com/v3/__https://gitlab.arm.com/telemetry-solution/telemetry-solution/-/blob/main/data/pmu/cpu/neoverse/neoverse-n2-r0p3.json__;!!ACWV5N9M2RV99hQ!Nx1xgWXXS9GBasNpOKQXHWKe8VwpRB0h8lAfOmwUmkkOQTjFqn2NswO8ti8vTcblUfAYN9NAtxqAh-sf0TEOvQ$
> Signed-off-by: James Clark <james.clark@arm.com>
> ---
>   tools/perf/arch/arm64/util/pmu.c              | 21 +++++++++++++++++++
>   .../arm64/arm/neoverse-n2-v2/metrics.json     |  8 +++----
>   tools/perf/util/expr.c                        |  4 ++++
>   tools/perf/util/pmu.c                         |  6 ++++++
>   tools/perf/util/pmu.h                         |  1 +
>   5 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
> index 561de0cb6b95..30e2385a83cf 100644
> --- a/tools/perf/arch/arm64/util/pmu.c
> +++ b/tools/perf/arch/arm64/util/pmu.c
> @@ -2,6 +2,7 @@
>   
>   #include <internal/cpumap.h>
>   #include "../../../util/cpumap.h"
> +#include "../../../util/header.h"
>   #include "../../../util/pmu.h"
>   #include "../../../util/pmus.h"
>   #include <api/fs/fs.h>
> @@ -62,3 +63,23 @@ double perf_pmu__cpu_slots_per_cycle(void)
>   
>   	return slots ? (double)slots : NAN;
>   }
> +
> +double perf_pmu__no_stall_errata(void)

While I like the approach of encoding the per-CPU support in the metric 
expression, I find that literal "no stall errata" is vague and could 
apply to any "stall errata" for any SoC for any architecture.

If we were going to do it this way, then we would need a more specific 
name, like perf_pmu__no_stall_errata_arm64_errata123456(), but that 
should not be in the core perf code.

Could we instead add a function to check cpuid and have something like 
this in the JSON:

"MetricExpr": "(op_retired / op_spec) * (1 - (stall_slot if 
(cpuid_less_than(410fd493)) else (stall_slot - cpu_cycles)) / (#slots * 
cpu_cycles))"

I'm currently figuring out how cpuid_less_than() would be implemented 
(I'm no great python wrangler), but it would be along the lines of what 
Ian added for "has_event" in 
https://lore.kernel.org/linux-perf-users/20230623151016.4193660-1-irogers@google.com/

Thanks,
John
James Clark Aug. 9, 2023, 1:06 p.m. UTC | #2
On 08/08/2023 11:18, John Garry wrote:
> On 07/08/2023 15:20, James Clark wrote:
> 
> Hi James,
> 
> Thanks for the effort in doing this.
> 
>> N2 r0p3 doesn't require the workaround [1], so gating on (#slots - 5) no
>> longer works because all N2s have 5 slots. Add a new expression builtin
>> that identifies the need for the workaround correctly.
>>
>> [1]:
>> https://urldefense.com/v3/__https://gitlab.arm.com/telemetry-solution/telemetry-solution/-/blob/main/data/pmu/cpu/neoverse/neoverse-n2-r0p3.json__;!!ACWV5N9M2RV99hQ!Nx1xgWXXS9GBasNpOKQXHWKe8VwpRB0h8lAfOmwUmkkOQTjFqn2NswO8ti8vTcblUfAYN9NAtxqAh-sf0TEOvQ$
>> Signed-off-by: James Clark <james.clark@arm.com>
>> ---
>>   tools/perf/arch/arm64/util/pmu.c              | 21 +++++++++++++++++++
>>   .../arm64/arm/neoverse-n2-v2/metrics.json     |  8 +++----
>>   tools/perf/util/expr.c                        |  4 ++++
>>   tools/perf/util/pmu.c                         |  6 ++++++
>>   tools/perf/util/pmu.h                         |  1 +
>>   5 files changed, 36 insertions(+), 4 deletions(-)
>>
>> diff --git a/tools/perf/arch/arm64/util/pmu.c
>> b/tools/perf/arch/arm64/util/pmu.c
>> index 561de0cb6b95..30e2385a83cf 100644
>> --- a/tools/perf/arch/arm64/util/pmu.c
>> +++ b/tools/perf/arch/arm64/util/pmu.c
>> @@ -2,6 +2,7 @@
>>     #include <internal/cpumap.h>
>>   #include "../../../util/cpumap.h"
>> +#include "../../../util/header.h"
>>   #include "../../../util/pmu.h"
>>   #include "../../../util/pmus.h"
>>   #include <api/fs/fs.h>
>> @@ -62,3 +63,23 @@ double perf_pmu__cpu_slots_per_cycle(void)
>>         return slots ? (double)slots : NAN;
>>   }
>> +
>> +double perf_pmu__no_stall_errata(void)
> 
> While I like the approach of encoding the per-CPU support in the metric
> expression, I find that literal "no stall errata" is vague and could
> apply to any "stall errata" for any SoC for any architecture.
> 
> If we were going to do it this way, then we would need a more specific
> name, like perf_pmu__no_stall_errata_arm64_errata123456(), but that
> should not be in the core perf code.
> 
> Could we instead add a function to check cpuid and have something like
> this in the JSON:
> 
> "MetricExpr": "(op_retired / op_spec) * (1 - (stall_slot if
> (cpuid_less_than(410fd493)) else (stall_slot - cpu_cycles)) / (#slots *
> cpu_cycles))"
> 
> I'm currently figuring out how cpuid_less_than() would be implemented
> (I'm no great python wrangler), but it would be along the lines of what
> Ian added for "has_event" in
> https://lore.kernel.org/linux-perf-users/20230623151016.4193660-1-irogers@google.com/
> 
> Thanks,
> John

Yeah it looks like it could be done that way. Also, the way I added it,
it doesn't have access to the PMU type, it just does a generic
pmu__find_core_pmu() so won't work very well for heterogeneous systems.

If we're going to do a deeper modification of the expression parser like
with has_event() it might be possible to pass in the actual CPU ID that
the metric is running on which would be better.

I'll have a look.

James
John Garry Aug. 9, 2023, 1:54 p.m. UTC | #3
On 09/08/2023 14:06, James Clark wrote:
>> "MetricExpr": "(op_retired / op_spec) * (1 - (stall_slot if
>> (cpuid_less_than(410fd493)) else (stall_slot - cpu_cycles)) / (#slots *
>> cpu_cycles))"
>>
>> I'm currently figuring out how cpuid_less_than() would be implemented
>> (I'm no great python wrangler), but it would be along the lines of what
>> Ian added for "has_event" in
>> https://urldefense.com/v3/__https://lore.kernel.org/linux-perf-users/20230623151016.4193660-1-irogers@google.com/__;!!ACWV5N9M2RV99hQ!PlOppEWtIj9jDW2Zlon0zRZVpzPTzPvm5Ho5NnRIN0vD78iFcEzMEAtsrW_MrRPiW84XhWpbhc3seQcmLu-BfQ$  
>>
>> Thanks,
>> John
> Yeah it looks like it could be done that way. Also, the way I added it,
> it doesn't have access to the PMU type, it just does a generic
> pmu__find_core_pmu() so won't work very well for heterogeneous systems.

I haven't been keeping a close eye on the hybrid PMU support, but AFAIK 
metrics for hybrid arm64 system, i.e. bL, aren't supported - maybe that 
has changed. The gating for bL support was in pmu__find_core_pmu() 
returning NULL for a hybrid system.

> 
> If we're going to do a deeper modification of the expression parser like
> with has_event() it might be possible to pass in the actual CPU ID that
> the metric is running on which would be better.
> 
> I'll have a look.

Thanks. I was playing with this yesterday, but I was making slow 
progress. I was essentially following the has_event example, but the 
argument type causes an issue, in that has_event expected an event name, 
while we want to pass a hex string.

If you could check this then that would be great.

Thanks,
John
James Clark Aug. 9, 2023, 3:14 p.m. UTC | #4
On 09/08/2023 14:54, John Garry wrote:
> On 09/08/2023 14:06, James Clark wrote:
>>> "MetricExpr": "(op_retired / op_spec) * (1 - (stall_slot if
>>> (cpuid_less_than(410fd493)) else (stall_slot - cpu_cycles)) / (#slots *
>>> cpu_cycles))"
>>>
>>> I'm currently figuring out how cpuid_less_than() would be implemented
>>> (I'm no great python wrangler), but it would be along the lines of what
>>> Ian added for "has_event" in
>>> https://urldefense.com/v3/__https://lore.kernel.org/linux-perf-users/20230623151016.4193660-1-irogers@google.com/__;!!ACWV5N9M2RV99hQ!PlOppEWtIj9jDW2Zlon0zRZVpzPTzPvm5Ho5NnRIN0vD78iFcEzMEAtsrW_MrRPiW84XhWpbhc3seQcmLu-BfQ$ 
>>> Thanks,
>>> John
>> Yeah it looks like it could be done that way. Also, the way I added it,
>> it doesn't have access to the PMU type, it just does a generic
>> pmu__find_core_pmu() so won't work very well for heterogeneous systems.
> 
> I haven't been keeping a close eye on the hybrid PMU support, but AFAIK
> metrics for hybrid arm64 system, i.e. bL, aren't supported - maybe that
> has changed. The gating for bL support was in pmu__find_core_pmu()
> returning NULL for a hybrid system.
> 

Yes we're not currently publishing any metrics for hybrid systems, so in
this case for N2 and V2 it's not needed. But it would be good to try to
future proof it if possible. Although I don't know how the metrics
currently react on hybrid systems, it's also something I have to take a
look at.


>>
>> If we're going to do a deeper modification of the expression parser like
>> with has_event() it might be possible to pass in the actual CPU ID that
>> the metric is running on which would be better.
>>
>> I'll have a look.
> 
> Thanks. I was playing with this yesterday, but I was making slow
> progress. I was essentially following the has_event example, but the
> argument type causes an issue, in that has_event expected an event name,
> while we want to pass a hex string.
> 
> If you could check this then that would be great.
> 
> Thanks,
> John
>
diff mbox series

Patch

diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
index 561de0cb6b95..30e2385a83cf 100644
--- a/tools/perf/arch/arm64/util/pmu.c
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -2,6 +2,7 @@ 
 
 #include <internal/cpumap.h>
 #include "../../../util/cpumap.h"
+#include "../../../util/header.h"
 #include "../../../util/pmu.h"
 #include "../../../util/pmus.h"
 #include <api/fs/fs.h>
@@ -62,3 +63,23 @@  double perf_pmu__cpu_slots_per_cycle(void)
 
 	return slots ? (double)slots : NAN;
 }
+
+double perf_pmu__no_stall_errata(void)
+{
+	struct perf_pmu *pmu = pmu__find_core_pmu();
+	char *cpuid = perf_pmu__getcpuid(pmu);
+	bool n2_r0p3_plus;
+	bool not_n2;
+
+	if (!cpuid)
+		return NAN;
+
+	/* N2 r0p3+ doesn't need CPU_CYCLES to be subtracted from slots. */
+	n2_r0p3_plus = !strcmp_cpuid_str("0x00000000410fd493", cpuid);
+
+	/* Anything other than N2 doesn't need the workaround either */
+	not_n2 = strcmp_cpuid_str("0x00000000410fd490", cpuid);
+
+	free(cpuid);
+	return n2_r0p3_plus || not_n2;
+}
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/metrics.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/metrics.json
index 8ad15b726dca..9b912a9427f6 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/metrics.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/metrics.json
@@ -1,15 +1,15 @@ 
 [
     {
         "ArchStdEvent": "FRONTEND_BOUND",
-        "MetricExpr": "((stall_slot_frontend) if (#slots - 5) else (stall_slot_frontend - cpu_cycles)) / (#slots * cpu_cycles)"
+        "MetricExpr": "((stall_slot_frontend) if (#no_stall_errata) else (stall_slot_frontend - cpu_cycles)) / (#slots * cpu_cycles)"
     },
     {
         "ArchStdEvent": "BAD_SPECULATION",
-        "MetricExpr": "(1 - op_retired / op_spec) * (1 - (stall_slot if (#slots - 5) else (stall_slot - cpu_cycles)) / (#slots * cpu_cycles))"
+        "MetricExpr": "(1 - op_retired / op_spec) * (1 - (stall_slot if (#no_stall_errata) else (stall_slot - cpu_cycles)) / (#slots * cpu_cycles))"
     },
     {
         "ArchStdEvent": "RETIRING",
-        "MetricExpr": "(op_retired / op_spec) * (1 - (stall_slot if (#slots - 5) else (stall_slot - cpu_cycles)) / (#slots * cpu_cycles))"
+        "MetricExpr": "(op_retired / op_spec) * (1 - (stall_slot if (#no_stall_errata) else (stall_slot - cpu_cycles)) / (#slots * cpu_cycles))"
     },
     {
         "ArchStdEvent": "BACKEND_BOUND"
@@ -201,7 +201,7 @@ 
         "ScaleUnit": "100%"
     },
     {
-        "MetricExpr": "OP_RETIRED / OP_SPEC * (1 - (STALL_SLOT if (#slots - 5) else (STALL_SLOT - CPU_CYCLES)) / (#slots * CPU_CYCLES))",
+        "MetricExpr": "OP_RETIRED / OP_SPEC * (1 - (STALL_SLOT if (#no_stall_errata) else (STALL_SLOT - CPU_CYCLES)) / (#slots * CPU_CYCLES))",
         "BriefDescription": "The truly effective ratio of micro-operations executed by the CPU, which means that misprediction and stall are not included",
         "MetricGroup": "PEutilization",
         "MetricName": "cpu_utilization",
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 7410a165f68b..3bae19785796 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -465,6 +465,10 @@  double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx
 		result = perf_pmu__cpu_slots_per_cycle();
 		goto out;
 	}
+	if (!strcmp("#no_stall_errata", literal)) {
+		result = perf_pmu__no_stall_errata();
+		goto out;
+	}
 	if (!strcmp("#has_pmem", literal)) {
 		result = has_pmem();
 		goto out;
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index b6654b9f55d2..d37dc7202ddb 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1779,3 +1779,9 @@  void perf_pmu__delete(struct perf_pmu *pmu)
 	zfree(&pmu->alias_name);
 	free(pmu);
 }
+
+__weak double perf_pmu__no_stall_errata(void)
+{
+	/* Only exists on Arm */
+	return NAN;
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 203b92860e3c..57c002308f9a 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -288,5 +288,6 @@  int perf_pmu__pathname_fd(int dirfd, const char *pmu_name, const char *filename,
 struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name);
 struct perf_pmu *perf_pmu__create_placeholder_core_pmu(struct list_head *core_pmus);
 void perf_pmu__delete(struct perf_pmu *pmu);
+double perf_pmu__no_stall_errata(void);
 
 #endif /* __PMU_H */