Message ID | 1673017529-1429208-2-git-send-email-renyu.zj@linux.alibaba.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add metrics for neoverse-n2-v2 | expand |
On 06/01/2023 15:05, Jing Zhang wrote: > The metrics of topdown L1 are from ARM sbsa7.0 platform design doc[0], > D37-38, which are standard. So put them in the common file sbsa.json of > arm64, so that other cores besides n2/v2 can also be reused. > > Slots may be different in each architecture, so added "#slots" literal > to get different constant for each architecture. > > The value of slots comes from the register PMMIR_EL1, which I can read > in /sys/bus/event_source/device/armv8_pmuv3_*/caps/slots. PMMIR_EL1.SLOT > might read as zero if the STALL_SLOT event is not implemented or the PMU > version is lower than ID_AA64DFR0_EL1_PMUVer_V3P4. > > [0] https://urldefense.com/v3/__https://documentation-service.arm.com/static/60250c7395978b529036da86?token=__;!!ACWV5N9M2RV99hQ!J5JW3y6GhaJUqLfbEAzWIy4GJOhUkHQN4D5hEv3Outpzd54fN1Nt4LNKGnuRtMAepS_Nit-KLSUW98tVfFR0TmMVGQ$ > > Signed-off-by: Jing Zhang <renyu.zj@linux.alibaba.com> > Acked-by: Ian Rogers <irogers@google.com> hmmm... you have made significant changes in this version (compared to previous), so I would not have picked up this tag. That's just my opinion. As for the patchset org, I'd move the JSON change here into patch #2, and make this patch purely about add "slots" literal support for arm64. 
> --- > tools/perf/arch/arm64/util/pmu.c | 22 ++++++++++++++++++++++ > tools/perf/pmu-events/arch/arm64/sbsa.json | 30 ++++++++++++++++++++++++++++++ > tools/perf/pmu-events/jevents.py | 2 ++ > tools/perf/util/expr.c | 5 +++++ > tools/perf/util/pmu.c | 5 +++++ > tools/perf/util/pmu.h | 1 + > 6 files changed, 65 insertions(+) > create mode 100644 tools/perf/pmu-events/arch/arm64/sbsa.json > > diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c > index 477e513..227dadb 100644 > --- a/tools/perf/arch/arm64/util/pmu.c > +++ b/tools/perf/arch/arm64/util/pmu.c > @@ -3,6 +3,7 @@ > #include <internal/cpumap.h> > #include "../../../util/cpumap.h" > #include "../../../util/pmu.h" > +#include <api/fs/fs.h> > > const struct pmu_events_table *pmu_events_table__find(void) > { > @@ -24,3 +25,24 @@ const struct pmu_events_table *pmu_events_table__find(void) > > return NULL; > } > + > +int perf_pmu__get_slots(void) > +{ > + char path[PATH_MAX]; > + unsigned long long slots = 0; > + struct perf_pmu *pmu = NULL; > + > + while ((pmu = perf_pmu__scan(pmu)) != NULL) { > + if (is_pmu_core(pmu->name)) > + break; > + } There is a lot in common with arm64's pmu_events_table__find() - can you factor it out? I also prefer how we check for homogeneous CPUs in pmu_events_table__find() (which you should do, also). > + if (pmu) { > + scnprintf(path, PATH_MAX, > + EVENT_SOURCE_DEVICE_PATH "%s/caps/slots", pmu->name); > + /* The value of slots is not greater than INT_MAX, but sysfs__read_int > + * can't read value with 0x prefix, so use sysfs__read_ull instead. 
> + */ > + sysfs__read_ull(path, &slots); > + } > + return (int)slots; > +} > diff --git a/tools/perf/pmu-events/arch/arm64/sbsa.json b/tools/perf/pmu-events/arch/arm64/sbsa.json > new file mode 100644 > index 0000000..f678c37e > --- /dev/null > +++ b/tools/perf/pmu-events/arch/arm64/sbsa.json > @@ -0,0 +1,30 @@ > +[ > + { > + "MetricExpr": "stall_slot_frontend / (#slots * cpu_cycles)", > + "BriefDescription": "Frontend bound L1 topdown metric", > + "MetricGroup": "TopdownL1", > + "MetricName": "frontend_bound", > + "ScaleUnit": "100%" > + }, > + { > + "MetricExpr": "(1 - op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", > + "BriefDescription": "Bad speculation L1 topdown metric", > + "MetricGroup": "TopdownL1", > + "MetricName": "bad_speculation", > + "ScaleUnit": "100%" > + }, > + { > + "MetricExpr": "(op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", > + "BriefDescription": "Retiring L1 topdown metric", > + "MetricGroup": "TopdownL1", > + "MetricName": "retiring", > + "ScaleUnit": "100%" > + }, > + { > + "MetricExpr": "stall_slot_backend / (#slots * cpu_cycles)", > + "BriefDescription": "Backend Bound L1 topdown metric", > + "MetricGroup": "TopdownL1", > + "MetricName": "backend_bound", > + "ScaleUnit": "100%" > + } > +] > diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py > index 4c398e0..0416b74 100755 > --- a/tools/perf/pmu-events/jevents.py > +++ b/tools/perf/pmu-events/jevents.py > @@ -358,6 +358,8 @@ def preprocess_arch_std_files(archpath: str) -> None: > for event in read_json_events(item.path, topic=''): > if event.name: > _arch_std_events[event.name.lower()] = event > + if event.metric_name: > + _arch_std_events[event.metric_name.lower()] = event > > > def print_events_table_prefix(tblname: str) -> None: > diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c > index 00dcde3..3d67707 100644 > --- a/tools/perf/util/expr.c > +++ b/tools/perf/util/expr.c > @@ -19,6 +19,7 @@ > #include 
<linux/zalloc.h> > #include <ctype.h> > #include <math.h> > +#include "pmu.h" > > #ifdef PARSER_DEBUG > extern int expr_debug; > @@ -448,6 +449,10 @@ double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx > result = topology->core_cpus_lists; > goto out; > } > + if (!strcmp("#slots", literal)) { > + result = perf_pmu__get_slots(); > + goto out; > + } > > pr_err("Unrecognized literal '%s'", literal); > out: > diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c > index 2bdeb89..d4cace2 100644 > --- a/tools/perf/util/pmu.c > +++ b/tools/perf/util/pmu.c > @@ -1993,3 +1993,8 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, > *ucpus_ptr = unmatched_cpus; > return 0; > } > + > +int __weak perf_pmu__get_slots(void) > +{ > + return 0; should this be NAN? > +} > diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h > index 69ca000..a2f7df8 100644 > --- a/tools/perf/util/pmu.h > +++ b/tools/perf/util/pmu.h > @@ -259,4 +259,5 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, > > char *pmu_find_real_name(const char *name); > char *pmu_find_alias_name(const char *name); > +int perf_pmu__get_slots(void); I think that this name is a bit too vague. Maybe perf_pmu__cpu_cycles_per_slot() could be better. > #endif /* __PMU_H */ Thanks, John
在 2023/1/6 下午11:59, John Garry 写道: > On 06/01/2023 15:05, Jing Zhang wrote: >> The metrics of topdown L1 are from ARM sbsa7.0 platform design doc[0], >> D37-38, which are standard. So put them in the common file sbsa.json of >> arm64, so that other cores besides n2/v2 can also be reused. >> >> Slots may be different in each architecture, so added "#slots" literal >> to get different constant for each architecture. >> >> The value of slots comes from the register PMMIR_EL1, which I can read >> in /sys/bus/event_source/device/armv8_pmuv3_*/caps/slots. PMMIR_EL1.SLOT >> might read as zero if the STALL_SLOT event is not implemented or the PMU >> version is lower than ID_AA64DFR0_EL1_PMUVer_V3P4. >> >> [0] https://urldefense.com/v3/__https://documentation-service.arm.com/static/60250c7395978b529036da86?token=__;!!ACWV5N9M2RV99hQ!J5JW3y6GhaJUqLfbEAzWIy4GJOhUkHQN4D5hEv3Outpzd54fN1Nt4LNKGnuRtMAepS_Nit-KLSUW98tVfFR0TmMVGQ$ >> >> Signed-off-by: Jing Zhang <renyu.zj@linux.alibaba.com> >> Acked-by: Ian Rogers <irogers@google.com> > > hmmm... you have made significant changes in this version (compared to previous), so I would not have picked up this tag. That's just my opinion. > Thanks for pointing it out. > As for the patchset org, I'd move the JSON change here into patch #2, and make this patch purely about add "slots" literal support for arm64. > Ok, I will move the changes of sbsa.json and jevent.py to patch#2. 
>> --- >> tools/perf/arch/arm64/util/pmu.c | 22 ++++++++++++++++++++++ >> tools/perf/pmu-events/arch/arm64/sbsa.json | 30 ++++++++++++++++++++++++++++++ >> tools/perf/pmu-events/jevents.py | 2 ++ >> tools/perf/util/expr.c | 5 +++++ >> tools/perf/util/pmu.c | 5 +++++ >> tools/perf/util/pmu.h | 1 + >> 6 files changed, 65 insertions(+) >> create mode 100644 tools/perf/pmu-events/arch/arm64/sbsa.json >> >> diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c >> index 477e513..227dadb 100644 >> --- a/tools/perf/arch/arm64/util/pmu.c >> +++ b/tools/perf/arch/arm64/util/pmu.c >> @@ -3,6 +3,7 @@ >> #include <internal/cpumap.h> >> #include "../../../util/cpumap.h" >> #include "../../../util/pmu.h" >> +#include <api/fs/fs.h> >> const struct pmu_events_table *pmu_events_table__find(void) >> { >> @@ -24,3 +25,24 @@ const struct pmu_events_table *pmu_events_table__find(void) >> return NULL; >> } >> + >> +int perf_pmu__get_slots(void) >> +{ >> + char path[PATH_MAX]; >> + unsigned long long slots = 0; >> + struct perf_pmu *pmu = NULL; >> + >> + while ((pmu = perf_pmu__scan(pmu)) != NULL) { >> + if (is_pmu_core(pmu->name)) >> + break; >> + } > > There is a lot in common with arm64's pmu_events_table__find() - can you factor it out? I also prefer how we check for homogeneous CPUs in pmu_events_table__find() (which you should do, also). > I'll factor out the pmu_core__find function in tools/perf/arch/arm64/util/pmu.c: static const struct perf_pmu *pmu_core__find(void) { struct perf_pmu *pmu = NULL; while ((pmu = perf_pmu__scan(pmu))) { if (!is_pmu_core(pmu->name)) continue; /* * The cpumap should cover all CPUs. Otherwise, some CPUs may * not support some events or have different event IDs. 
*/ if (pmu->cpus->nr != cpu__max_cpu().cpu) return NULL; return pmu; } return NULL; } >> + if (pmu) { >> + scnprintf(path, PATH_MAX, >> + EVENT_SOURCE_DEVICE_PATH "%s/caps/slots", pmu->name); >> + /* The value of slots is not greater than INT_MAX, but sysfs__read_int >> + * can't read value with 0x prefix, so use sysfs__read_ull instead. >> + */ >> + sysfs__read_ull(path, &slots); >> + } >> + return (int)slots; >> +} >> diff --git a/tools/perf/pmu-events/arch/arm64/sbsa.json b/tools/perf/pmu-events/arch/arm64/sbsa.json >> new file mode 100644 >> index 0000000..f678c37e >> --- /dev/null >> +++ b/tools/perf/pmu-events/arch/arm64/sbsa.json >> @@ -0,0 +1,30 @@ >> +[ >> + { >> + "MetricExpr": "stall_slot_frontend / (#slots * cpu_cycles)", >> + "BriefDescription": "Frontend bound L1 topdown metric", >> + "MetricGroup": "TopdownL1", >> + "MetricName": "frontend_bound", >> + "ScaleUnit": "100%" >> + }, >> + { >> + "MetricExpr": "(1 - op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", >> + "BriefDescription": "Bad speculation L1 topdown metric", >> + "MetricGroup": "TopdownL1", >> + "MetricName": "bad_speculation", >> + "ScaleUnit": "100%" >> + }, >> + { >> + "MetricExpr": "(op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", >> + "BriefDescription": "Retiring L1 topdown metric", >> + "MetricGroup": "TopdownL1", >> + "MetricName": "retiring", >> + "ScaleUnit": "100%" >> + }, >> + { >> + "MetricExpr": "stall_slot_backend / (#slots * cpu_cycles)", >> + "BriefDescription": "Backend Bound L1 topdown metric", >> + "MetricGroup": "TopdownL1", >> + "MetricName": "backend_bound", >> + "ScaleUnit": "100%" >> + } >> +] >> diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py >> index 4c398e0..0416b74 100755 >> --- a/tools/perf/pmu-events/jevents.py >> +++ b/tools/perf/pmu-events/jevents.py >> @@ -358,6 +358,8 @@ def preprocess_arch_std_files(archpath: str) -> None: >> for event in read_json_events(item.path, topic=''): >> if 
event.name: >> _arch_std_events[event.name.lower()] = event >> + if event.metric_name: >> + _arch_std_events[event.metric_name.lower()] = event >> def print_events_table_prefix(tblname: str) -> None: >> diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c >> index 00dcde3..3d67707 100644 >> --- a/tools/perf/util/expr.c >> +++ b/tools/perf/util/expr.c >> @@ -19,6 +19,7 @@ >> #include <linux/zalloc.h> >> #include <ctype.h> >> #include <math.h> >> +#include "pmu.h" >> #ifdef PARSER_DEBUG >> extern int expr_debug; >> @@ -448,6 +449,10 @@ double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx >> result = topology->core_cpus_lists; >> goto out; >> } >> + if (!strcmp("#slots", literal)) { >> + result = perf_pmu__get_slots(); >> + goto out; >> + } >> pr_err("Unrecognized literal '%s'", literal); >> out: >> diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c >> index 2bdeb89..d4cace2 100644 >> --- a/tools/perf/util/pmu.c >> +++ b/tools/perf/util/pmu.c >> @@ -1993,3 +1993,8 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, >> *ucpus_ptr = unmatched_cpus; >> return 0; >> } >> + >> +int __weak perf_pmu__get_slots(void) >> +{ >> + return 0; > > should this be NAN? > Will do. >> +} >> diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h >> index 69ca000..a2f7df8 100644 >> --- a/tools/perf/util/pmu.h >> +++ b/tools/perf/util/pmu.h >> @@ -259,4 +259,5 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, >> char *pmu_find_real_name(const char *name); >> char *pmu_find_alias_name(const char *name); >> +int perf_pmu__get_slots(void); > > I think that this name is a bit too vague. Maybe perf_pmu__cpu_cycles_per_slot() could be better. > Does cpu_cycles_per_slot mean "cpu cycles per slot"? In the document, Slots mean operation width. If slots are 5, the largest value by which the STALL_SLOT PMU event may increment in one cycle is 5. 
So, maybe perf_pmu__cpu_slots_per_cycle() could be more accurate?
On 09/01/2023 02:53, Jing Zhang wrote: > I'll factor out the pmu_core__find function in tools/perf/arch/arm64/util/pmu.c: > > static const struct perf_pmu *pmu_core__find(void) maybe name as pmu_core__find_same() or similar to indicate that we're only dealing with homogeneous cores > { > struct perf_pmu *pmu = NULL; no need to init to NULL > > while ((pmu = perf_pmu__scan(pmu))) { 1x superfluous level of () > if (!is_pmu_core(pmu->name)) > continue; > > /* > * The cpumap should cover all CPUs. Otherwise, some CPUs may > * not support some events or have different event IDs. > */ > if (pmu->cpus->nr != cpu__max_cpu().cpu) > return NULL; > return pmu; > } > > return NULL; > } > ... > >>> +} >>> diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h >>> index 69ca000..a2f7df8 100644 >>> --- a/tools/perf/util/pmu.h >>> +++ b/tools/perf/util/pmu.h >>> @@ -259,4 +259,5 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, >>> char *pmu_find_real_name(const char *name); >>> char *pmu_find_alias_name(const char *name); >>> +int perf_pmu__get_slots(void); >> I think that this name is a bit too vague. Maybe perf_pmu__cpu_cycles_per_slot() could be better. >> > Does cpu_cycles_per_slot mean "cpu cycles per slot"? In the document, Slots mean operation width. > If slots are 5, the largest value by which the STALL_SLOT PMU event may increment in one cycle is 5. > So, maybe perf_pmu__cpu_slots_per_cycle() could be more accurate? ok, yes, fine. Thanks, John
在 2023/1/9 下午10:58, John Garry 写道: > On 09/01/2023 02:53, Jing Zhang wrote: >> I'll factor out the pmu_core__find function in tools/perf/arch/arm64/util/pmu.c: >> >> static const struct perf_pmu *pmu_core__find(void) > > maybe name as pmu_core__find_same() or similar to indicate that we're only dealing with homogeneous cores > >> { >> struct perf_pmu *pmu = NULL; > > no need to init to NULL > >> >> while ((pmu = perf_pmu__scan(pmu))) { > > 1x superfluous level of () > Ok, will do. Thank you!
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c index 477e513..227dadb 100644 --- a/tools/perf/arch/arm64/util/pmu.c +++ b/tools/perf/arch/arm64/util/pmu.c @@ -3,6 +3,7 @@ #include <internal/cpumap.h> #include "../../../util/cpumap.h" #include "../../../util/pmu.h" +#include <api/fs/fs.h> const struct pmu_events_table *pmu_events_table__find(void) { @@ -24,3 +25,24 @@ const struct pmu_events_table *pmu_events_table__find(void) return NULL; } + +int perf_pmu__get_slots(void) +{ + char path[PATH_MAX]; + unsigned long long slots = 0; + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (is_pmu_core(pmu->name)) + break; + } + if (pmu) { + scnprintf(path, PATH_MAX, + EVENT_SOURCE_DEVICE_PATH "%s/caps/slots", pmu->name); + /* The value of slots is not greater than INT_MAX, but sysfs__read_int + * can't read value with 0x prefix, so use sysfs__read_ull instead. + */ + sysfs__read_ull(path, &slots); + } + return (int)slots; +} diff --git a/tools/perf/pmu-events/arch/arm64/sbsa.json b/tools/perf/pmu-events/arch/arm64/sbsa.json new file mode 100644 index 0000000..f678c37e --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/sbsa.json @@ -0,0 +1,30 @@ +[ + { + "MetricExpr": "stall_slot_frontend / (#slots * cpu_cycles)", + "BriefDescription": "Frontend bound L1 topdown metric", + "MetricGroup": "TopdownL1", + "MetricName": "frontend_bound", + "ScaleUnit": "100%" + }, + { + "MetricExpr": "(1 - op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", + "BriefDescription": "Bad speculation L1 topdown metric", + "MetricGroup": "TopdownL1", + "MetricName": "bad_speculation", + "ScaleUnit": "100%" + }, + { + "MetricExpr": "(op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", + "BriefDescription": "Retiring L1 topdown metric", + "MetricGroup": "TopdownL1", + "MetricName": "retiring", + "ScaleUnit": "100%" + }, + { + "MetricExpr": "stall_slot_backend / (#slots * cpu_cycles)", + 
"BriefDescription": "Backend Bound L1 topdown metric", + "MetricGroup": "TopdownL1", + "MetricName": "backend_bound", + "ScaleUnit": "100%" + } +] diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 4c398e0..0416b74 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -358,6 +358,8 @@ def preprocess_arch_std_files(archpath: str) -> None: for event in read_json_events(item.path, topic=''): if event.name: _arch_std_events[event.name.lower()] = event + if event.metric_name: + _arch_std_events[event.metric_name.lower()] = event def print_events_table_prefix(tblname: str) -> None: diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c index 00dcde3..3d67707 100644 --- a/tools/perf/util/expr.c +++ b/tools/perf/util/expr.c @@ -19,6 +19,7 @@ #include <linux/zalloc.h> #include <ctype.h> #include <math.h> +#include "pmu.h" #ifdef PARSER_DEBUG extern int expr_debug; @@ -448,6 +449,10 @@ double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx result = topology->core_cpus_lists; goto out; } + if (!strcmp("#slots", literal)) { + result = perf_pmu__get_slots(); + goto out; + } pr_err("Unrecognized literal '%s'", literal); out: diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 2bdeb89..d4cace2 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1993,3 +1993,8 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, *ucpus_ptr = unmatched_cpus; return 0; } + +int __weak perf_pmu__get_slots(void) +{ + return 0; +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 69ca000..a2f7df8 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -259,4 +259,5 @@ int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, char *pmu_find_real_name(const char *name); char *pmu_find_alias_name(const char *name); +int perf_pmu__get_slots(void); #endif /* __PMU_H */