diff mbox series

[v4,4/4] perf tools: Support "branch-misses:pp" on arm64

Message ID 20200211140445.21986-5-james.clark@arm.com (mailing list archive)
State New, archived
Headers show
Series perf tools: Add support for some spe events and precise ip | expand

Commit Message

James Clark Feb. 11, 2020, 2:04 p.m. UTC
From: Tan Xiaojun <tanxiaojun@huawei.com>

At the suggestion of James Clark, use spe to support the precise
ip of some events. Currently its support event is:
branch-misses.

Example usage:

$ ./perf record -e branch-misses:pp dd if=/dev/zero of=/dev/null count=10000
(:p/pp/ppp is same for this case.)

$ ./perf report --stdio
("--stdio is not necessary")

--------------------------------------------------------------------
...
 # Samples: 14  of event 'branch-misses:pp'
 # Event count (approx.): 14
 #
 # Children      Self  Command  Shared Object      Symbol
 # ........  ........  .......  .................  ..........................
 #
    14.29%    14.29%  dd       [kernel.kallsyms]  [k] __arch_copy_from_user
    14.29%    14.29%  dd       libc-2.28.so       [.] _dl_addr
     7.14%     7.14%  dd       [kernel.kallsyms]  [k] __free_pages
     7.14%     7.14%  dd       [kernel.kallsyms]  [k] __pi_memcpy
     7.14%     7.14%  dd       [kernel.kallsyms]  [k] pagecache_get_page
     7.14%     7.14%  dd       [kernel.kallsyms]  [k] unmap_single_vma
     7.14%     7.14%  dd       dd                 [.] 0x00000000000025ec
     7.14%     7.14%  dd       ld-2.28.so         [.] _dl_lookup_symbol_x
     7.14%     7.14%  dd       ld-2.28.so         [.] check_match
     7.14%     7.14%  dd       libc-2.28.so       [.] __mpn_rshift
     7.14%     7.14%  dd       libc-2.28.so       [.] _nl_intern_locale_data
     7.14%     7.14%  dd       libc-2.28.so       [.] read_alias_file
...
--------------------------------------------------------------------

Signed-off-by: Tan Xiaojun <tanxiaojun@huawei.com>
Suggested-by: James Clark <James.Clark@arm.com>
Tested-by: Qi Liu <liuqi115@hisilicon.com>
Signed-off-by: James Clark <james.clark@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Tan Xiaojun <tanxiaojun@huawei.com>
Cc: Al Grant <al.grant@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/arch/arm/util/auxtrace.c | 38 +++++++++++++++++++++++++++++
 tools/perf/builtin-record.c         |  5 ++++
 tools/perf/util/arm-spe.c           |  9 +++++++
 tools/perf/util/arm-spe.h           |  3 +++
 tools/perf/util/auxtrace.h          |  6 +++++
 5 files changed, 61 insertions(+)

Comments

Adrian Hunter Feb. 17, 2020, 11:42 a.m. UTC | #1
On 11/02/20 4:04 pm, James Clark wrote:
> From: Tan Xiaojun <tanxiaojun@huawei.com>
> 
> At the suggestion of James Clark, use spe to support the precise
> ip of some events. Currently its support event is:
> branch-misses.
> 
> Example usage:
> 
> $ ./perf record -e branch-misses:pp dd if=/dev/zero of=/dev/null count=10000
> (:p/pp/ppp is same for this case.)
> 
> $ ./perf report --stdio
> ("--stdio is not necessary")
> 
> --------------------------------------------------------------------
> ...
>  # Samples: 14  of event 'branch-misses:pp'
>  # Event count (approx.): 14
>  #
>  # Children      Self  Command  Shared Object      Symbol
>  # ........  ........  .......  .................  ..........................
>  #
>     14.29%    14.29%  dd       [kernel.kallsyms]  [k] __arch_copy_from_user
>     14.29%    14.29%  dd       libc-2.28.so       [.] _dl_addr
>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] __free_pages
>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] __pi_memcpy
>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] pagecache_get_page
>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] unmap_single_vma
>      7.14%     7.14%  dd       dd                 [.] 0x00000000000025ec
>      7.14%     7.14%  dd       ld-2.28.so         [.] _dl_lookup_symbol_x
>      7.14%     7.14%  dd       ld-2.28.so         [.] check_match
>      7.14%     7.14%  dd       libc-2.28.so       [.] __mpn_rshift
>      7.14%     7.14%  dd       libc-2.28.so       [.] _nl_intern_locale_data
>      7.14%     7.14%  dd       libc-2.28.so       [.] read_alias_file
> ...
> --------------------------------------------------------------------
> 
> Signed-off-by: Tan Xiaojun <tanxiaojun@huawei.com>
> Suggested-by: James Clark <James.Clark@arm.com>
> Tested-by: Qi Liu <liuqi115@hisilicon.com>
> Signed-off-by: James Clark <james.clark@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Jiri Olsa <jolsa@redhat.com>
> Cc: Tan Xiaojun <tanxiaojun@huawei.com>
> Cc: Al Grant <al.grant@arm.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/arch/arm/util/auxtrace.c | 38 +++++++++++++++++++++++++++++
>  tools/perf/builtin-record.c         |  5 ++++
>  tools/perf/util/arm-spe.c           |  9 +++++++
>  tools/perf/util/arm-spe.h           |  3 +++
>  tools/perf/util/auxtrace.h          |  6 +++++
>  5 files changed, 61 insertions(+)
> 
> diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
> index 0a6e75b8777a..18f0ea7556e7 100644
> --- a/tools/perf/arch/arm/util/auxtrace.c
> +++ b/tools/perf/arch/arm/util/auxtrace.c
> @@ -10,11 +10,25 @@
>  
>  #include "../../util/auxtrace.h"
>  #include "../../util/debug.h"
> +#include "../../util/env.h"
>  #include "../../util/evlist.h"
>  #include "../../util/pmu.h"
>  #include "cs-etm.h"
>  #include "arm-spe.h"
>  
> +#define SPE_ATTR_TS_ENABLE		BIT(0)
> +#define SPE_ATTR_PA_ENABLE		BIT(1)
> +#define SPE_ATTR_PCT_ENABLE		BIT(2)
> +#define SPE_ATTR_JITTER			BIT(16)
> +#define SPE_ATTR_BRANCH_FILTER		BIT(32)
> +#define SPE_ATTR_LOAD_FILTER		BIT(33)
> +#define SPE_ATTR_STORE_FILTER		BIT(34)
> +
> +#define SPE_ATTR_EV_RETIRED		BIT(1)
> +#define SPE_ATTR_EV_CACHE		BIT(3)
> +#define SPE_ATTR_EV_TLB			BIT(5)
> +#define SPE_ATTR_EV_BRANCH		BIT(7)
> +
>  static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
>  {
>  	struct perf_pmu **arm_spe_pmus = NULL;
> @@ -108,3 +122,27 @@ struct auxtrace_record
>  	*err = 0;
>  	return NULL;
>  }
> +
> +void auxtrace__preprocess_evlist(struct evlist *evlist)
> +{
> +	struct evsel *evsel;
> +	struct perf_pmu *pmu;
> +
> +	evlist__for_each_entry(evlist, evsel) {
> +		/* Currently only supports precise_ip for branch-misses on arm64 */
> +		if (!strcmp(perf_env__arch(evlist->env), "arm64")

Isn't config ambiguous unless you also check type i.e.

			&& evsel->core.attr.type == PERF_TYPE_HARDWARE

> +			&& evsel->core.attr.config == PERF_COUNT_HW_BRANCH_MISSES
> +			&& evsel->core.attr.precise_ip)
> +		{
> +			pmu = perf_pmu__find("arm_spe_0");
> +			if (pmu) {

Changing the event seems a bit weird.

> +				evsel->pmu_name = pmu->name;
> +				evsel->core.attr.type = pmu->type;
> +				evsel->core.attr.config = SPE_ATTR_TS_ENABLE
> +							| SPE_ATTR_BRANCH_FILTER;
> +				evsel->core.attr.config1 = SPE_ATTR_EV_BRANCH;
> +				evsel->core.attr.precise_ip = 0;
> +			}
> +		}
> +	}
> +}
> \ No newline at end of file
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 4c301466101b..3bc61f03d572 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -2451,6 +2451,11 @@ int cmd_record(int argc, const char **argv)
>  
>  	argc = parse_options(argc, argv, record_options, record_usage,
>  			    PARSE_OPT_STOP_AT_NON_OPTION);
> +
> +	if (auxtrace__preprocess_evlist) {
> +		auxtrace__preprocess_evlist(rec->evlist);
> +	}
> +
>  	if (quiet)
>  		perf_quiet_option();
>  
> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> index 4ef22a0775a9..b21806c97dd8 100644
> --- a/tools/perf/util/arm-spe.c
> +++ b/tools/perf/util/arm-spe.c
> @@ -778,6 +778,15 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
>  	attr.sample_id_all = evsel->core.attr.sample_id_all;
>  	attr.read_format = evsel->core.attr.read_format;
>  
> +	/* If it is in the precise ip mode, there is no need to
> +	 * synthesize new events. */
> +	if (!strncmp(evsel->name, "branch-misses", 13)) {
> +		spe->sample_branch_miss = true;
> +		spe->branch_miss_id = evsel->core.id[0];
> +
> +		return 0;
> +	}
> +
>  	/* create new id val to be a fixed offset from evsel id */
>  	id = evsel->core.id[0] + 1000000000;
>  
> diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
> index 98d3235781c3..8b1fb191d03a 100644
> --- a/tools/perf/util/arm-spe.h
> +++ b/tools/perf/util/arm-spe.h
> @@ -20,6 +20,8 @@ enum {
>  union perf_event;
>  struct perf_session;
>  struct perf_pmu;
> +struct evlist;
> +struct evsel;
>  
>  struct auxtrace_record *arm_spe_recording_init(int *err,
>  					       struct perf_pmu *arm_spe_pmu);
> @@ -28,4 +30,5 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
>  				  struct perf_session *session);
>  
>  struct perf_event_attr *arm_spe_pmu_default_config(struct perf_pmu *arm_spe_pmu);
> +void arm_spe_precise_ip_support(struct evlist *evlist, struct evsel *evsel);
>  #endif
> diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
> index 80617b0d044d..4f89a3a31ab2 100644
> --- a/tools/perf/util/auxtrace.h
> +++ b/tools/perf/util/auxtrace.h
> @@ -584,6 +584,7 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session,
>  int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool);
>  void auxtrace__free_events(struct perf_session *session);
>  void auxtrace__free(struct perf_session *session);
> +void auxtrace__preprocess_evlist(struct evlist *evlist) __attribute__((weak));
>  
>  #define ITRACE_HELP \
>  "				i:	    		synthesize instructions events\n"		\
> @@ -728,6 +729,11 @@ void auxtrace__free(struct perf_session *session __maybe_unused)
>  {
>  }
>  
> +static inline
> +void auxtrace__preprocess_evlist(struct evlist *evlist __maybe_unused)
> +{
> +}
> +
>  static inline
>  int auxtrace_index__write(int fd __maybe_unused,
>  			  struct list_head *head __maybe_unused)
>
James Clark Feb. 24, 2020, 5:08 p.m. UTC | #2
Hi Adrian,

On 2/17/20 11:42 AM, Adrian Hunter wrote:
> On 11/02/20 4:04 pm, James Clark wrote:
>> From: Tan Xiaojun <tanxiaojun@huawei.com>
>>
>> At the suggestion of James Clark, use spe to support the precise
>> ip of some events. Currently its support event is:
>> branch-misses.
>>
>> Example usage:
>>
>> $ ./perf record -e branch-misses:pp dd if=/dev/zero of=/dev/null count=10000
>> (:p/pp/ppp is same for this case.)
>>
>> $ ./perf report --stdio
>> ("--stdio is not necessary")
>>
>> --------------------------------------------------------------------
>> ...
>>  # Samples: 14  of event 'branch-misses:pp'
>>  # Event count (approx.): 14
>>  #
>>  # Children      Self  Command  Shared Object      Symbol
>>  # ........  ........  .......  .................  ..........................
>>  #
>>     14.29%    14.29%  dd       [kernel.kallsyms]  [k] __arch_copy_from_user
>>     14.29%    14.29%  dd       libc-2.28.so       [.] _dl_addr
>>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] __free_pages
>>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] __pi_memcpy
>>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] pagecache_get_page
>>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] unmap_single_vma
>>      7.14%     7.14%  dd       dd                 [.] 0x00000000000025ec
>>      7.14%     7.14%  dd       ld-2.28.so         [.] _dl_lookup_symbol_x
>>      7.14%     7.14%  dd       ld-2.28.so         [.] check_match
>>      7.14%     7.14%  dd       libc-2.28.so       [.] __mpn_rshift
>>      7.14%     7.14%  dd       libc-2.28.so       [.] _nl_intern_locale_data
>>      7.14%     7.14%  dd       libc-2.28.so       [.] read_alias_file
>> ...
>> --------------------------------------------------------------------
>>
>> Signed-off-by: Tan Xiaojun <tanxiaojun@huawei.com>
>> Suggested-by: James Clark <James.Clark@arm.com>
>> Tested-by: Qi Liu <liuqi115@hisilicon.com>
>> Signed-off-by: James Clark <james.clark@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>> Cc: Jiri Olsa <jolsa@redhat.com>
>> Cc: Tan Xiaojun <tanxiaojun@huawei.com>
>> Cc: Al Grant <al.grant@arm.com>
>> Cc: Namhyung Kim <namhyung@kernel.org>
>> ---
>>  tools/perf/arch/arm/util/auxtrace.c | 38 +++++++++++++++++++++++++++++
>>  tools/perf/builtin-record.c         |  5 ++++
>>  tools/perf/util/arm-spe.c           |  9 +++++++
>>  tools/perf/util/arm-spe.h           |  3 +++
>>  tools/perf/util/auxtrace.h          |  6 +++++
>>  5 files changed, 61 insertions(+)
>>
>> diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
>> index 0a6e75b8777a..18f0ea7556e7 100644
>> --- a/tools/perf/arch/arm/util/auxtrace.c
>> +++ b/tools/perf/arch/arm/util/auxtrace.c
>> @@ -10,11 +10,25 @@
>>  
>>  #include "../../util/auxtrace.h"
>>  #include "../../util/debug.h"
>> +#include "../../util/env.h"
>>  #include "../../util/evlist.h"
>>  #include "../../util/pmu.h"
>>  #include "cs-etm.h"
>>  #include "arm-spe.h"
>>  
>> +#define SPE_ATTR_TS_ENABLE		BIT(0)
>> +#define SPE_ATTR_PA_ENABLE		BIT(1)
>> +#define SPE_ATTR_PCT_ENABLE		BIT(2)
>> +#define SPE_ATTR_JITTER			BIT(16)
>> +#define SPE_ATTR_BRANCH_FILTER		BIT(32)
>> +#define SPE_ATTR_LOAD_FILTER		BIT(33)
>> +#define SPE_ATTR_STORE_FILTER		BIT(34)
>> +
>> +#define SPE_ATTR_EV_RETIRED		BIT(1)
>> +#define SPE_ATTR_EV_CACHE		BIT(3)
>> +#define SPE_ATTR_EV_TLB			BIT(5)
>> +#define SPE_ATTR_EV_BRANCH		BIT(7)
>> +
>>  static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
>>  {
>>  	struct perf_pmu **arm_spe_pmus = NULL;
>> @@ -108,3 +122,27 @@ struct auxtrace_record
>>  	*err = 0;
>>  	return NULL;
>>  }
>> +
>> +void auxtrace__preprocess_evlist(struct evlist *evlist)
>> +{
>> +	struct evsel *evsel;
>> +	struct perf_pmu *pmu;
>> +
>> +	evlist__for_each_entry(evlist, evsel) {
>> +		/* Currently only supports precise_ip for branch-misses on arm64 */
>> +		if (!strcmp(perf_env__arch(evlist->env), "arm64")
> 
> Isn't config ambiguous unless you also check type i.e.
> 
> 			&& evsel->core.attr.type == PERF_TYPE_HARDWARE
> 

Yes you're right I will add this.

>> +			&& evsel->core.attr.config == PERF_COUNT_HW_BRANCH_MISSES
>> +			&& evsel->core.attr.precise_ip)
>> +		{
>> +			pmu = perf_pmu__find("arm_spe_0");
>> +			if (pmu) {
> 
> Changing the event seems a bit weird.
> 

This is because there is no support in the kernel for the precise_ip attribute on Arm.
SPE can give you precise ip data for the same event, but changing the event is required.
 
>> +				evsel->pmu_name = pmu->name;
>> +				evsel->core.attr.type = pmu->type;
>> +				evsel->core.attr.config = SPE_ATTR_TS_ENABLE
>> +							| SPE_ATTR_BRANCH_FILTER;
>> +				evsel->core.attr.config1 = SPE_ATTR_EV_BRANCH;
>> +				evsel->core.attr.precise_ip = 0;
>> +			}
>> +		}
>> +	}
>> +}
>> \ No newline at end of file
>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>> index 4c301466101b..3bc61f03d572 100644
>> --- a/tools/perf/builtin-record.c
>> +++ b/tools/perf/builtin-record.c
>> @@ -2451,6 +2451,11 @@ int cmd_record(int argc, const char **argv)
>>  
>>  	argc = parse_options(argc, argv, record_options, record_usage,
>>  			    PARSE_OPT_STOP_AT_NON_OPTION);
>> +
>> +	if (auxtrace__preprocess_evlist) {
>> +		auxtrace__preprocess_evlist(rec->evlist);
>> +	}
>> +
>>  	if (quiet)
>>  		perf_quiet_option();
>>  
>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>> index 4ef22a0775a9..b21806c97dd8 100644
>> --- a/tools/perf/util/arm-spe.c
>> +++ b/tools/perf/util/arm-spe.c
>> @@ -778,6 +778,15 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
>>  	attr.sample_id_all = evsel->core.attr.sample_id_all;
>>  	attr.read_format = evsel->core.attr.read_format;
>>  
>> +	/* If it is in the precise ip mode, there is no need to
>> +	 * synthesize new events. */
>> +	if (!strncmp(evsel->name, "branch-misses", 13)) {
>> +		spe->sample_branch_miss = true;
>> +		spe->branch_miss_id = evsel->core.id[0];
>> +
>> +		return 0;
>> +	}
>> +
>>  	/* create new id val to be a fixed offset from evsel id */
>>  	id = evsel->core.id[0] + 1000000000;
>>  
>> diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
>> index 98d3235781c3..8b1fb191d03a 100644
>> --- a/tools/perf/util/arm-spe.h
>> +++ b/tools/perf/util/arm-spe.h
>> @@ -20,6 +20,8 @@ enum {
>>  union perf_event;
>>  struct perf_session;
>>  struct perf_pmu;
>> +struct evlist;
>> +struct evsel;
>>  
>>  struct auxtrace_record *arm_spe_recording_init(int *err,
>>  					       struct perf_pmu *arm_spe_pmu);
>> @@ -28,4 +30,5 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
>>  				  struct perf_session *session);
>>  
>>  struct perf_event_attr *arm_spe_pmu_default_config(struct perf_pmu *arm_spe_pmu);
>> +void arm_spe_precise_ip_support(struct evlist *evlist, struct evsel *evsel);
>>  #endif
>> diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
>> index 80617b0d044d..4f89a3a31ab2 100644
>> --- a/tools/perf/util/auxtrace.h
>> +++ b/tools/perf/util/auxtrace.h
>> @@ -584,6 +584,7 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session,
>>  int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool);
>>  void auxtrace__free_events(struct perf_session *session);
>>  void auxtrace__free(struct perf_session *session);
>> +void auxtrace__preprocess_evlist(struct evlist *evlist) __attribute__((weak));
>>  
>>  #define ITRACE_HELP \
>>  "				i:	    		synthesize instructions events\n"		\
>> @@ -728,6 +729,11 @@ void auxtrace__free(struct perf_session *session __maybe_unused)
>>  {
>>  }
>>  
>> +static inline
>> +void auxtrace__preprocess_evlist(struct evlist *evlist __maybe_unused)
>> +{
>> +}
>> +
>>  static inline
>>  int auxtrace_index__write(int fd __maybe_unused,
>>  			  struct list_head *head __maybe_unused)
>>
>
Mark Rutland Feb. 28, 2020, 4:01 p.m. UTC | #3
Hi James,

On Mon, Feb 24, 2020 at 05:08:26PM +0000, James Clark wrote:
> On 2/17/20 11:42 AM, Adrian Hunter wrote:
> > On 11/02/20 4:04 pm, James Clark wrote:
> >> From: Tan Xiaojun <tanxiaojun@huawei.com>
> >>
> >> At the suggestion of James Clark, use spe to support the precise
> >> ip of some events. Currently its support event is:
> >> branch-misses.
> >>
> >> Example usage:
> >>
> >> $ ./perf record -e branch-misses:pp dd if=/dev/zero of=/dev/null count=10000
> >> (:p/pp/ppp is same for this case.)
> >>
> >> $ ./perf report --stdio
> >> ("--stdio is not necessary")
> >>
> >> --------------------------------------------------------------------
> >> ...
> >>  # Samples: 14  of event 'branch-misses:pp'
> >>  # Event count (approx.): 14
> >>  #
> >>  # Children      Self  Command  Shared Object      Symbol
> >>  # ........  ........  .......  .................  ..........................
> >>  #
> >>     14.29%    14.29%  dd       [kernel.kallsyms]  [k] __arch_copy_from_user
> >>     14.29%    14.29%  dd       libc-2.28.so       [.] _dl_addr
> >>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] __free_pages
> >>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] __pi_memcpy
> >>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] pagecache_get_page
> >>      7.14%     7.14%  dd       [kernel.kallsyms]  [k] unmap_single_vma
> >>      7.14%     7.14%  dd       dd                 [.] 0x00000000000025ec
> >>      7.14%     7.14%  dd       ld-2.28.so         [.] _dl_lookup_symbol_x
> >>      7.14%     7.14%  dd       ld-2.28.so         [.] check_match
> >>      7.14%     7.14%  dd       libc-2.28.so       [.] __mpn_rshift
> >>      7.14%     7.14%  dd       libc-2.28.so       [.] _nl_intern_locale_data
> >>      7.14%     7.14%  dd       libc-2.28.so       [.] read_alias_file
> >> ...
> >> --------------------------------------------------------------------
> >>
> >> Signed-off-by: Tan Xiaojun <tanxiaojun@huawei.com>
> >> Suggested-by: James Clark <James.Clark@arm.com>
> >> Tested-by: Qi Liu <liuqi115@hisilicon.com>
> >> Signed-off-by: James Clark <james.clark@arm.com>
> >> Cc: Will Deacon <will@kernel.org>
> >> Cc: Mark Rutland <mark.rutland@arm.com>
> >> Cc: Peter Zijlstra <peterz@infradead.org>
> >> Cc: Ingo Molnar <mingo@redhat.com>
> >> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> >> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> >> Cc: Jiri Olsa <jolsa@redhat.com>
> >> Cc: Tan Xiaojun <tanxiaojun@huawei.com>
> >> Cc: Al Grant <al.grant@arm.com>
> >> Cc: Namhyung Kim <namhyung@kernel.org>
> >> ---
> >>  tools/perf/arch/arm/util/auxtrace.c | 38 +++++++++++++++++++++++++++++
> >>  tools/perf/builtin-record.c         |  5 ++++
> >>  tools/perf/util/arm-spe.c           |  9 +++++++
> >>  tools/perf/util/arm-spe.h           |  3 +++
> >>  tools/perf/util/auxtrace.h          |  6 +++++
> >>  5 files changed, 61 insertions(+)
> >>
> >> diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
> >> index 0a6e75b8777a..18f0ea7556e7 100644
> >> --- a/tools/perf/arch/arm/util/auxtrace.c
> >> +++ b/tools/perf/arch/arm/util/auxtrace.c
> >> @@ -10,11 +10,25 @@
> >>  
> >>  #include "../../util/auxtrace.h"
> >>  #include "../../util/debug.h"
> >> +#include "../../util/env.h"
> >>  #include "../../util/evlist.h"
> >>  #include "../../util/pmu.h"
> >>  #include "cs-etm.h"
> >>  #include "arm-spe.h"
> >>  
> >> +#define SPE_ATTR_TS_ENABLE		BIT(0)
> >> +#define SPE_ATTR_PA_ENABLE		BIT(1)
> >> +#define SPE_ATTR_PCT_ENABLE		BIT(2)
> >> +#define SPE_ATTR_JITTER			BIT(16)
> >> +#define SPE_ATTR_BRANCH_FILTER		BIT(32)
> >> +#define SPE_ATTR_LOAD_FILTER		BIT(33)
> >> +#define SPE_ATTR_STORE_FILTER		BIT(34)
> >> +
> >> +#define SPE_ATTR_EV_RETIRED		BIT(1)
> >> +#define SPE_ATTR_EV_CACHE		BIT(3)
> >> +#define SPE_ATTR_EV_TLB			BIT(5)
> >> +#define SPE_ATTR_EV_BRANCH		BIT(7)
> >> +
> >>  static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
> >>  {
> >>  	struct perf_pmu **arm_spe_pmus = NULL;
> >> @@ -108,3 +122,27 @@ struct auxtrace_record
> >>  	*err = 0;
> >>  	return NULL;
> >>  }
> >> +
> >> +void auxtrace__preprocess_evlist(struct evlist *evlist)
> >> +{
> >> +	struct evsel *evsel;
> >> +	struct perf_pmu *pmu;
> >> +
> >> +	evlist__for_each_entry(evlist, evsel) {
> >> +		/* Currently only supports precise_ip for branch-misses on arm64 */
> >> +		if (!strcmp(perf_env__arch(evlist->env), "arm64")
> > 
> > Isn't config ambiguous unless you also check type i.e.
> > 
> > 			&& evsel->core.attr.type == PERF_TYPE_HARDWARE
> > 
> 
> Yes you're right I will add this.
> 
> >> +			&& evsel->core.attr.config == PERF_COUNT_HW_BRANCH_MISSES
> >> +			&& evsel->core.attr.precise_ip)
> >> +		{
> >> +			pmu = perf_pmu__find("arm_spe_0");
> >> +			if (pmu) {
> > 
> > Changing the event seems a bit weird.
> > 
> 
> This is because there is no support in the kernel for the precise_ip attribute on Arm.
> SPE can give you precise ip data for the same event, but changing the event is required.

I don't think this is the right level to override the event.

It's true that contemporary CPU PMUs can't generate synchronous events,
and hence the kernel doesn't have a precise IP, but that's not
necessarily going to be the case in future. I'd rather we didn't
silently override the event requested by the user, as I think that is
going to cause more problems for us in future.

When the SPE buffer overflows, events will be silently dropped, which I
don't believe is in keeping with the usual semantics of precise events.
Additionally, hard-coding the "arm_spe_0" name means that this will not
work reliably on big.LITTLE systems.

Instead, can we have the user explicitly request to use the SPE PMU in
this way? If the perf tool could be smart with the "_%d" suffix, and
collate PMUs differing only by that, the user would only need to do
something like:

  perf record -e arm_spe/branch-misses/pp

... which doesn't seem to onerous.

Is something like that possible?

Thanks,
Mark.
diff mbox series

Patch

diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
index 0a6e75b8777a..18f0ea7556e7 100644
--- a/tools/perf/arch/arm/util/auxtrace.c
+++ b/tools/perf/arch/arm/util/auxtrace.c
@@ -10,11 +10,25 @@ 
 
 #include "../../util/auxtrace.h"
 #include "../../util/debug.h"
+#include "../../util/env.h"
 #include "../../util/evlist.h"
 #include "../../util/pmu.h"
 #include "cs-etm.h"
 #include "arm-spe.h"
 
+#define SPE_ATTR_TS_ENABLE		BIT(0)
+#define SPE_ATTR_PA_ENABLE		BIT(1)
+#define SPE_ATTR_PCT_ENABLE		BIT(2)
+#define SPE_ATTR_JITTER			BIT(16)
+#define SPE_ATTR_BRANCH_FILTER		BIT(32)
+#define SPE_ATTR_LOAD_FILTER		BIT(33)
+#define SPE_ATTR_STORE_FILTER		BIT(34)
+
+#define SPE_ATTR_EV_RETIRED		BIT(1)
+#define SPE_ATTR_EV_CACHE		BIT(3)
+#define SPE_ATTR_EV_TLB			BIT(5)
+#define SPE_ATTR_EV_BRANCH		BIT(7)
+
 static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
 {
 	struct perf_pmu **arm_spe_pmus = NULL;
@@ -108,3 +122,27 @@  struct auxtrace_record
 	*err = 0;
 	return NULL;
 }
+
+void auxtrace__preprocess_evlist(struct evlist *evlist)
+{
+	struct evsel *evsel;
+	struct perf_pmu *pmu;
+
+	evlist__for_each_entry(evlist, evsel) {
+		/* Currently only supports precise_ip for branch-misses on arm64 */
+		if (!strcmp(perf_env__arch(evlist->env), "arm64")
+			&& evsel->core.attr.config == PERF_COUNT_HW_BRANCH_MISSES
+			&& evsel->core.attr.precise_ip)
+		{
+			pmu = perf_pmu__find("arm_spe_0");
+			if (pmu) {
+				evsel->pmu_name = pmu->name;
+				evsel->core.attr.type = pmu->type;
+				evsel->core.attr.config = SPE_ATTR_TS_ENABLE
+							| SPE_ATTR_BRANCH_FILTER;
+				evsel->core.attr.config1 = SPE_ATTR_EV_BRANCH;
+				evsel->core.attr.precise_ip = 0;
+			}
+		}
+	}
+}
\ No newline at end of file
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4c301466101b..3bc61f03d572 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2451,6 +2451,11 @@  int cmd_record(int argc, const char **argv)
 
 	argc = parse_options(argc, argv, record_options, record_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
+
+	if (auxtrace__preprocess_evlist) {
+		auxtrace__preprocess_evlist(rec->evlist);
+	}
+
 	if (quiet)
 		perf_quiet_option();
 
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 4ef22a0775a9..b21806c97dd8 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -778,6 +778,15 @@  arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
 	attr.sample_id_all = evsel->core.attr.sample_id_all;
 	attr.read_format = evsel->core.attr.read_format;
 
+	/* If it is in the precise ip mode, there is no need to
+	 * synthesize new events. */
+	if (!strncmp(evsel->name, "branch-misses", 13)) {
+		spe->sample_branch_miss = true;
+		spe->branch_miss_id = evsel->core.id[0];
+
+		return 0;
+	}
+
 	/* create new id val to be a fixed offset from evsel id */
 	id = evsel->core.id[0] + 1000000000;
 
diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h
index 98d3235781c3..8b1fb191d03a 100644
--- a/tools/perf/util/arm-spe.h
+++ b/tools/perf/util/arm-spe.h
@@ -20,6 +20,8 @@  enum {
 union perf_event;
 struct perf_session;
 struct perf_pmu;
+struct evlist;
+struct evsel;
 
 struct auxtrace_record *arm_spe_recording_init(int *err,
 					       struct perf_pmu *arm_spe_pmu);
@@ -28,4 +30,5 @@  int arm_spe_process_auxtrace_info(union perf_event *event,
 				  struct perf_session *session);
 
 struct perf_event_attr *arm_spe_pmu_default_config(struct perf_pmu *arm_spe_pmu);
+void arm_spe_precise_ip_support(struct evlist *evlist, struct evsel *evsel);
 #endif
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 80617b0d044d..4f89a3a31ab2 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -584,6 +584,7 @@  void auxtrace__dump_auxtrace_sample(struct perf_session *session,
 int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool);
 void auxtrace__free_events(struct perf_session *session);
 void auxtrace__free(struct perf_session *session);
+void auxtrace__preprocess_evlist(struct evlist *evlist) __attribute__((weak));
 
 #define ITRACE_HELP \
 "				i:	    		synthesize instructions events\n"		\
@@ -728,6 +729,11 @@  void auxtrace__free(struct perf_session *session __maybe_unused)
 {
 }
 
+static inline
+void auxtrace__preprocess_evlist(struct evlist *evlist __maybe_unused)
+{
+}
+
 static inline
 int auxtrace_index__write(int fd __maybe_unused,
 			  struct list_head *head __maybe_unused)