Message ID | 20230419172101.78638-1-gankulkarni@os.amperecomputing.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | perf cs-etm: Add support for coresight trace for any range of CPUs | expand |
On 19/04/2023 18:21, Ganapatrao Kulkarni wrote: > The current implementation supports coresight trace for a range of > CPUs, if the first CPU is CPU0. > > Adding changes to enable coresight trace for any range of CPUs by > decoding the first CPU also from the header. > Later, first CPU id is used instead of CPU0 across the decoder functions. > > Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> > --- > .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- > .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- > tools/perf/util/cs-etm.c | 62 ++++++++++++------- > 3 files changed, 42 insertions(+), 27 deletions(-) > > diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c > index 82a27ab90c8b..41ab299b643b 100644 > --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c > +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c > @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params, > } > > struct cs_etm_decoder * > -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, > +cs_etm_decoder__new(int first_decoder, int decoders, struct cs_etm_decoder_params *d_params, > struct cs_etm_trace_params t_params[]) > { > struct cs_etm_decoder *decoder; > @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, > /* init raw frame logging if required */ > cs_etm_decoder__init_raw_frame_logging(d_params, decoder); > > - for (i = 0; i < decoders; i++) { > + for (i = first_decoder; i < decoders; i++) { > ret = cs_etm_decoder__create_etm_decoder(d_params, > &t_params[i], > decoder); > diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h > index 92a855fbe5b8..b06193fc75b4 100644 > --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h > +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h > @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct cs_etm_decoder *decoder, > size_t len, size_t *consumed); > > struct cs_etm_decoder * > -cs_etm_decoder__new(int num_cpu, > +cs_etm_decoder__new(int first_decoder, > + int decoders, > struct cs_etm_decoder_params *d_params, > struct cs_etm_trace_params t_params[]); > > diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c > index 94e2d02009eb..2619513ae088 100644 > --- a/tools/perf/util/cs-etm.c > +++ b/tools/perf/util/cs-etm.c > @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { > u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ > > int num_cpu; > + int first_cpu; > + int last_cpu; > u64 latest_kernel_timestamp; > u32 auxtrace_type; > u64 branches_sample_type; > @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, > } > > static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, > - struct cs_etm_auxtrace *etm, > - int decoders) > + struct cs_etm_auxtrace *etm) > { > int i; > u32 etmidr; > u64 architecture; > > - for (i = 0; i < decoders; i++) { > + for (i = etm->first_cpu; i < etm->last_cpu; i++) { > architecture = etm->metadata[i][CS_ETM_MAGIC]; > > switch (architecture) { > @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session *session) > /* Then the RB tree itself */ > intlist__delete(traceid_list); > > - for (i = 0; i < aux->num_cpu; i++) > + for (i = aux->first_cpu; i < aux->last_cpu; i++) > zfree(&aux->metadata[i]); > > thread__zput(aux->unknown_thread); > @@ -921,7 +922,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, > * Each queue can only contain data from one CPU when unformatted, so only one decoder is > * needed. > */ > - int decoders = formatted ? etm->num_cpu : 1; > + int first_decoder = formatted ? etm->first_cpu : 0; > + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); > > etmq = zalloc(sizeof(*etmq)); > if (!etmq) > @@ -937,7 +939,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, > if (!t_params) > goto out_free; > > - if (cs_etm__init_trace_params(t_params, etm, decoders)) > + if (cs_etm__init_trace_params(t_params, etm)) > goto out_free; > > /* Set decoder parameters to decode trace packets */ > @@ -947,8 +949,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, > formatted)) > goto out_free; > > - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, > - t_params); > + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, &d_params, t_params); > > if (!etmq->decoder) > goto out_free; > @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct perf_session *session) > * Loop through the ETMs and complain if we find at least one where ts_source != 1 (virtual > * timestamps). > */ > -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) > +static bool cs_etm__has_virtual_ts(u64 **metadata, struct cs_etm_auxtrace *etm) > { > int j; > > - for (j = 0; j < num_cpu; j++) { > + for (j = etm->first_cpu; j < etm->last_cpu; j++) { > switch (metadata[j][CS_ETM_MAGIC]) { > case __perf_cs_etmv4_magic: > if (HAS_PARAM(j, ETMV4, TS_SOURCE) || metadata[j][CS_ETMV4_TS_SOURCE] != 1) > @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) > } > > /* map trace ids to correct metadata block, from information in metadata */ > -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) > +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm) > { > u64 cs_etm_magic; > + u64 **metadata = etm->metadata; > u8 trace_chan_id; > int i, err; > > - for (i = 0; i < num_cpu; i++) { > + for (i = etm->first_cpu; i < etm->last_cpu; i++) { > cs_etm_magic = metadata[i][CS_ETM_MAGIC]; > switch (cs_etm_magic) { > case __perf_cs_etmv3_magic: > @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) > * If we found AUX_HW_ID packets, then set any metadata marked as unused to the > * unused value to reduce the number of unneeded decoders created. > */ > -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) > +static int cs_etm__clear_unused_trace_ids_metadata(struct cs_etm_auxtrace *etm) > { > u64 cs_etm_magic; > + u64 **metadata = etm->metadata; > int i; > > - for (i = 0; i < num_cpu; i++) { > + for (i = etm->first_cpu; i < etm->last_cpu; i++) { > cs_etm_magic = metadata[i][CS_ETM_MAGIC]; > switch (cs_etm_magic) { > case __perf_cs_etmv3_magic: > @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, > int event_header_size = sizeof(struct perf_event_header); > int total_size = auxtrace_info->header.size; > int priv_size = 0; > - int num_cpu; > + int num_cpu, first_cpu = 0, last_cpu; > int err = 0; > int aux_hw_id_found; > int i, j; > @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, > /* First the global part */ > ptr = (u64 *) auxtrace_info->priv; > num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; > - metadata = zalloc(sizeof(*metadata) * num_cpu); > + > + /* Start parsing after the common part of the header */ > + i = CS_HEADER_VERSION_MAX; > + > + /*Get CPU id of first event */ > + first_cpu = ptr[i + CS_ETM_CPU]; > + last_cpu = first_cpu + num_cpu; > + > + if (first_cpu > cpu__max_cpu().cpu || > + last_cpu > cpu__max_cpu().cpu) > + return -EINVAL; > + > + metadata = zalloc(sizeof(*metadata) * last_cpu); Hi Ganapatrao, I think I see what the problem is, but I'm wondering if a better fix would be to further decouple the CPU ID from the index in the array. With your change it's not clear what happens with sparse recordings, for example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is some wastage in the zalloc here for example if only CPU 256 is traced then we'd still make 256 decoders but 255 of them would be unused? I tried to test sparse recordings, but your change doesn't apply to the latest coresight/next branch. I did notice that 'perf report -D' doesn't work with them on coresight/next (it just quits), but I couldn't see if that's fixed with your change. Would a better fix not be to keep the metadata loops from 0-N and instead save the CPU ID in cs_etm_decoder_params or the decoder. That way it would support both sparse and not starting from 0 cases? I think the code would be better if it's worded like "i < recorded_cpus" rather than "i < cpu" to make it clear that i isn't actually the CPU ID it's just an index. Also a new test for this scenario would probably be a good idea. Thanks James
On 20/04/2023 10:43, James Clark wrote: > > > On 19/04/2023 18:21, Ganapatrao Kulkarni wrote: >> The current implementation supports coresight trace for a range of >> CPUs, if the first CPU is CPU0. >> >> Adding changes to enable coresight trace for any range of CPUs by >> decoding the first CPU also from the header. >> Later, first CPU id is used instead of CPU0 across the decoder functions. >> >> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> >> --- >> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- >> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- >> tools/perf/util/cs-etm.c | 62 ++++++++++++------- >> 3 files changed, 42 insertions(+), 27 deletions(-) >> >> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >> index 82a27ab90c8b..41ab299b643b 100644 >> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params, >> } >> >> struct cs_etm_decoder * >> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, >> +cs_etm_decoder__new(int first_decoder, int decoders, struct cs_etm_decoder_params *d_params, >> struct cs_etm_trace_params t_params[]) >> { >> struct cs_etm_decoder *decoder; >> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, >> /* init raw frame logging if required */ >> cs_etm_decoder__init_raw_frame_logging(d_params, decoder); >> >> - for (i = 0; i < decoders; i++) { >> + for (i = first_decoder; i < decoders; i++) { >> ret = cs_etm_decoder__create_etm_decoder(d_params, >> &t_params[i], >> decoder); >> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >> index 92a855fbe5b8..b06193fc75b4 100644 >> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct cs_etm_decoder *decoder, >> size_t len, size_t *consumed); >> >> struct cs_etm_decoder * >> -cs_etm_decoder__new(int num_cpu, >> +cs_etm_decoder__new(int first_decoder, >> + int decoders, >> struct cs_etm_decoder_params *d_params, >> struct cs_etm_trace_params t_params[]); >> >> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c >> index 94e2d02009eb..2619513ae088 100644 >> --- a/tools/perf/util/cs-etm.c >> +++ b/tools/perf/util/cs-etm.c >> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { >> u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ >> >> int num_cpu; >> + int first_cpu; >> + int last_cpu; >> u64 latest_kernel_timestamp; >> u32 auxtrace_type; >> u64 branches_sample_type; >> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, >> } >> >> static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, >> - struct cs_etm_auxtrace *etm, >> - int decoders) >> + struct cs_etm_auxtrace *etm) >> { >> int i; >> u32 etmidr; >> u64 architecture; >> >> - for (i = 0; i < decoders; i++) { >> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >> architecture = etm->metadata[i][CS_ETM_MAGIC]; >> >> switch (architecture) { >> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session *session) >> /* Then the RB tree itself */ >> intlist__delete(traceid_list); >> >> - for (i = 0; i < aux->num_cpu; i++) >> + for (i = aux->first_cpu; i < aux->last_cpu; i++) >> zfree(&aux->metadata[i]); >> >> thread__zput(aux->unknown_thread); >> @@ -921,7 +922,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >> * Each queue can only contain data from one CPU when unformatted, so only one decoder is >> * needed. >> */ >> - int decoders = formatted ? etm->num_cpu : 1; >> + int first_decoder = formatted ? etm->first_cpu : 0; >> + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); >> >> etmq = zalloc(sizeof(*etmq)); >> if (!etmq) >> @@ -937,7 +939,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >> if (!t_params) >> goto out_free; >> >> - if (cs_etm__init_trace_params(t_params, etm, decoders)) >> + if (cs_etm__init_trace_params(t_params, etm)) >> goto out_free; >> >> /* Set decoder parameters to decode trace packets */ >> @@ -947,8 +949,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >> formatted)) >> goto out_free; >> >> - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, >> - t_params); >> + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, &d_params, t_params); >> >> if (!etmq->decoder) >> goto out_free; >> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct perf_session *session) >> * Loop through the ETMs and complain if we find at least one where ts_source != 1 (virtual >> * timestamps). >> */ >> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct cs_etm_auxtrace *etm) >> { >> int j; >> >> - for (j = 0; j < num_cpu; j++) { >> + for (j = etm->first_cpu; j < etm->last_cpu; j++) { >> switch (metadata[j][CS_ETM_MAGIC]) { >> case __perf_cs_etmv4_magic: >> if (HAS_PARAM(j, ETMV4, TS_SOURCE) || metadata[j][CS_ETMV4_TS_SOURCE] != 1) >> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >> } >> >> /* map trace ids to correct metadata block, from information in metadata */ >> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) >> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm) >> { >> u64 cs_etm_magic; >> + u64 **metadata = etm->metadata; >> u8 trace_chan_id; >> int i, err; >> >> - for (i = 0; i < num_cpu; i++) { >> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >> switch (cs_etm_magic) { >> case __perf_cs_etmv3_magic: >> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) >> * If we found AUX_HW_ID packets, then set any metadata marked as unused to the >> * unused value to reduce the number of unneeded decoders created. >> */ >> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) >> +static int cs_etm__clear_unused_trace_ids_metadata(struct cs_etm_auxtrace *etm) >> { >> u64 cs_etm_magic; >> + u64 **metadata = etm->metadata; >> int i; >> >> - for (i = 0; i < num_cpu; i++) { >> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >> switch (cs_etm_magic) { >> case __perf_cs_etmv3_magic: >> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, >> int event_header_size = sizeof(struct perf_event_header); >> int total_size = auxtrace_info->header.size; >> int priv_size = 0; >> - int num_cpu; >> + int num_cpu, first_cpu = 0, last_cpu; >> int err = 0; >> int aux_hw_id_found; >> int i, j; >> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, >> /* First the global part */ >> ptr = (u64 *) auxtrace_info->priv; >> num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; >> - metadata = zalloc(sizeof(*metadata) * num_cpu); >> + >> + /* Start parsing after the common part of the header */ >> + i = CS_HEADER_VERSION_MAX; >> + >> + /*Get CPU id of first event */ >> + first_cpu = ptr[i + CS_ETM_CPU]; >> + last_cpu = first_cpu + num_cpu; >> + >> + if (first_cpu > cpu__max_cpu().cpu || >> + last_cpu > cpu__max_cpu().cpu) >> + return -EINVAL; >> + >> + metadata = zalloc(sizeof(*metadata) * last_cpu); > > Hi Ganapatrao, > > I think I see what the problem is, but I'm wondering if a better fix > would be to further decouple the CPU ID from the index in the array. > > With your change it's not clear what happens with sparse recordings, for > example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is > some wastage in the zalloc here for example if only CPU 256 is traced > then we'd still make 256 decoders but 255 of them would be unused? > > I tried to test sparse recordings, but your change doesn't apply to the > latest coresight/next branch. I did notice that 'perf report -D' doesn't > work with them on coresight/next (it just quits), but I couldn't see if > that's fixed with your change. > > Would a better fix not be to keep the metadata loops from 0-N and > instead save the CPU ID in cs_etm_decoder_params or the decoder. That > way it would support both sparse and not starting from 0 cases? I think > the code would be better if it's worded like "i < recorded_cpus" rather > than "i < cpu" to make it clear that i isn't actually the CPU ID it's > just an index. > > Also a new test for this scenario would probably be a good idea. > > Thanks > James > BTW for the test, I'm currently working on something where I've added something like this to test_arm_coresight.sh: arm_cs_etm_basic_test() { echo "Recording trace with $@" perf record -o ${perfdata} "$@" -- ls > /dev/null 2>&1 perf_script_branch_samples ls && perf_report_branch_samples ls && perf_report_instruction_samples ls err=$? arm_cs_report "CoreSight basic testing with %@" $err } # Test all combinations of per-thread, system-wide and normal mode with # and without timestamps arm_cs_etm_basic_test -e cs_etm/timestamp=0/ --per-thread arm_cs_etm_basic_test -e cs_etm/timestamp=1/ --per-thread arm_cs_etm_basic_test -e cs_etm/timestamp=0/ -a arm_cs_etm_basic_test -e cs_etm/timestamp=1/ -a arm_cs_etm_basic_test -e cs_etm/timestamp=0/ arm_cs_etm_basic_test -e cs_etm/timestamp=1/ I think you should be able to add this to cover your cases as well: # Test non-zero indexed and sparse CPU lists arm_cs_etm_basic_test -e cs_etm// -C 1,2 arm_cs_etm_basic_test -e cs_etm// -C 0,2
Hi James, On 20-04-2023 03:13 pm, James Clark wrote: > > > On 19/04/2023 18:21, Ganapatrao Kulkarni wrote: >> The current implementation supports coresight trace for a range of >> CPUs, if the first CPU is CPU0. >> >> Adding changes to enable coresight trace for any range of CPUs by >> decoding the first CPU also from the header. >> Later, first CPU id is used instead of CPU0 across the decoder functions. >> >> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> >> --- >> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- >> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- >> tools/perf/util/cs-etm.c | 62 ++++++++++++------- >> 3 files changed, 42 insertions(+), 27 deletions(-) >> >> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >> index 82a27ab90c8b..41ab299b643b 100644 >> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params, >> } >> >> struct cs_etm_decoder * >> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, >> +cs_etm_decoder__new(int first_decoder, int decoders, struct cs_etm_decoder_params *d_params, >> struct cs_etm_trace_params t_params[]) >> { >> struct cs_etm_decoder *decoder; >> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, >> /* init raw frame logging if required */ >> cs_etm_decoder__init_raw_frame_logging(d_params, decoder); >> >> - for (i = 0; i < decoders; i++) { >> + for (i = first_decoder; i < decoders; i++) { >> ret = cs_etm_decoder__create_etm_decoder(d_params, >> &t_params[i], >> decoder); >> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >> index 92a855fbe5b8..b06193fc75b4 100644 >> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct cs_etm_decoder *decoder, >> size_t len, size_t *consumed); >> >> struct cs_etm_decoder * >> -cs_etm_decoder__new(int num_cpu, >> +cs_etm_decoder__new(int first_decoder, >> + int decoders, >> struct cs_etm_decoder_params *d_params, >> struct cs_etm_trace_params t_params[]); >> >> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c >> index 94e2d02009eb..2619513ae088 100644 >> --- a/tools/perf/util/cs-etm.c >> +++ b/tools/perf/util/cs-etm.c >> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { >> u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ >> >> int num_cpu; >> + int first_cpu; >> + int last_cpu; >> u64 latest_kernel_timestamp; >> u32 auxtrace_type; >> u64 branches_sample_type; >> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, >> } >> >> static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, >> - struct cs_etm_auxtrace *etm, >> - int decoders) >> + struct cs_etm_auxtrace *etm) >> { >> int i; >> u32 etmidr; >> u64 architecture; >> >> - for (i = 0; i < decoders; i++) { >> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >> architecture = etm->metadata[i][CS_ETM_MAGIC]; >> >> switch (architecture) { >> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session *session) >> /* Then the RB tree itself */ >> intlist__delete(traceid_list); >> >> - for (i = 0; i < aux->num_cpu; i++) >> + for (i = aux->first_cpu; i < aux->last_cpu; i++) >> zfree(&aux->metadata[i]); >> >> thread__zput(aux->unknown_thread); >> @@ -921,7 +922,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >> * Each queue can only contain data from one CPU when unformatted, so only one decoder is >> * needed. >> */ >> - int decoders = formatted ? etm->num_cpu : 1; >> + int first_decoder = formatted ? etm->first_cpu : 0; >> + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); >> >> etmq = zalloc(sizeof(*etmq)); >> if (!etmq) >> @@ -937,7 +939,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >> if (!t_params) >> goto out_free; >> >> - if (cs_etm__init_trace_params(t_params, etm, decoders)) >> + if (cs_etm__init_trace_params(t_params, etm)) >> goto out_free; >> >> /* Set decoder parameters to decode trace packets */ >> @@ -947,8 +949,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >> formatted)) >> goto out_free; >> >> - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, >> - t_params); >> + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, &d_params, t_params); >> >> if (!etmq->decoder) >> goto out_free; >> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct perf_session *session) >> * Loop through the ETMs and complain if we find at least one where ts_source != 1 (virtual >> * timestamps). >> */ >> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct cs_etm_auxtrace *etm) >> { >> int j; >> >> - for (j = 0; j < num_cpu; j++) { >> + for (j = etm->first_cpu; j < etm->last_cpu; j++) { >> switch (metadata[j][CS_ETM_MAGIC]) { >> case __perf_cs_etmv4_magic: >> if (HAS_PARAM(j, ETMV4, TS_SOURCE) || metadata[j][CS_ETMV4_TS_SOURCE] != 1) >> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >> } >> >> /* map trace ids to correct metadata block, from information in metadata */ >> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) >> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm) >> { >> u64 cs_etm_magic; >> + u64 **metadata = etm->metadata; >> u8 trace_chan_id; >> int i, err; >> >> - for (i = 0; i < num_cpu; i++) { >> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >> switch (cs_etm_magic) { >> case __perf_cs_etmv3_magic: >> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) >> * If we found AUX_HW_ID packets, then set any metadata marked as unused to the >> * unused value to reduce the number of unneeded decoders created. >> */ >> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) >> +static int cs_etm__clear_unused_trace_ids_metadata(struct cs_etm_auxtrace *etm) >> { >> u64 cs_etm_magic; >> + u64 **metadata = etm->metadata; >> int i; >> >> - for (i = 0; i < num_cpu; i++) { >> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >> switch (cs_etm_magic) { >> case __perf_cs_etmv3_magic: >> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, >> int event_header_size = sizeof(struct perf_event_header); >> int total_size = auxtrace_info->header.size; >> int priv_size = 0; >> - int num_cpu; >> + int num_cpu, first_cpu = 0, last_cpu; >> int err = 0; >> int aux_hw_id_found; >> int i, j; >> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, >> /* First the global part */ >> ptr = (u64 *) auxtrace_info->priv; >> num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; >> - metadata = zalloc(sizeof(*metadata) * num_cpu); >> + >> + /* Start parsing after the common part of the header */ >> + i = CS_HEADER_VERSION_MAX; >> + >> + /*Get CPU id of first event */ >> + first_cpu = ptr[i + CS_ETM_CPU]; >> + last_cpu = first_cpu + num_cpu; >> + >> + if (first_cpu > cpu__max_cpu().cpu || >> + last_cpu > cpu__max_cpu().cpu) >> + return -EINVAL; >> + >> + metadata = zalloc(sizeof(*metadata) * last_cpu); > > Hi Ganapatrao, > > I think I see what the problem is, but I'm wondering if a better fix > would be to further decouple the CPU ID from the index in the array. > > With your change it's not clear what happens with sparse recordings, for > example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is This patch fixes for any range of CPUs. Record with sparse list of CPUs will not work with current code still. > some wastage in the zalloc here for example if only CPU 256 is traced > then we'd still make 256 decoders but 255 of them would be unused? > > I tried to test sparse recordings, but your change doesn't apply to the > latest coresight/next branch. I did notice that 'perf report -D' doesn't > work with them on coresight/next (it just quits), but I couldn't see if > that's fixed with your change. My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches related to dynamic id [1] support(queued for 6.4). "perf report -D" works for me. [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html > > Would a better fix not be to keep the metadata loops from 0-N and > instead save the CPU ID in cs_etm_decoder_params or the decoder. That > way it would support both sparse and not starting from 0 cases? I think Yep, I though this initially, it got complicated due to for more for-loops. I will try again and post V2. > the code would be better if it's worded like "i < recorded_cpus" rather > than "i < cpu" to make it clear that i isn't actually the CPU ID it's > just an index. Yes, makes sense to call it "recorded_cpus". > > Also a new test for this scenario would probably be a good idea. > > Thanks > James > Thanks, Ganapat
On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: > > Hi James, > > On 20-04-2023 03:13 pm, James Clark wrote: >> >> >> On 19/04/2023 18:21, Ganapatrao Kulkarni wrote: >>> The current implementation supports coresight trace for a range of >>> CPUs, if the first CPU is CPU0. >>> >>> Adding changes to enable coresight trace for any range of CPUs by >>> decoding the first CPU also from the header. >>> Later, first CPU id is used instead of CPU0 across the decoder >>> functions. >>> >>> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> >>> --- >>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- >>> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- >>> tools/perf/util/cs-etm.c | 62 ++++++++++++------- >>> 3 files changed, 42 insertions(+), 27 deletions(-) >>> >>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>> index 82a27ab90c8b..41ab299b643b 100644 >>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct >>> cs_etm_decoder_params *d_params, >>> } >>> struct cs_etm_decoder * >>> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params >>> *d_params, >>> +cs_etm_decoder__new(int first_decoder, int decoders, struct >>> cs_etm_decoder_params *d_params, >>> struct cs_etm_trace_params t_params[]) >>> { >>> struct cs_etm_decoder *decoder; >>> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct >>> cs_etm_decoder_params *d_params, >>> /* init raw frame logging if required */ >>> cs_etm_decoder__init_raw_frame_logging(d_params, decoder); >>> - for (i = 0; i < decoders; i++) { >>> + for (i = first_decoder; i < decoders; i++) { >>> ret = cs_etm_decoder__create_etm_decoder(d_params, >>> &t_params[i], >>> decoder); >>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>> index 92a855fbe5b8..b06193fc75b4 100644 >>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct >>> cs_etm_decoder *decoder, >>> size_t len, size_t *consumed); >>> struct cs_etm_decoder * >>> -cs_etm_decoder__new(int num_cpu, >>> +cs_etm_decoder__new(int first_decoder, >>> + int decoders, >>> struct cs_etm_decoder_params *d_params, >>> struct cs_etm_trace_params t_params[]); >>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c >>> index 94e2d02009eb..2619513ae088 100644 >>> --- a/tools/perf/util/cs-etm.c >>> +++ b/tools/perf/util/cs-etm.c >>> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { >>> u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ >>> int num_cpu; >>> + int first_cpu; >>> + int last_cpu; >>> u64 latest_kernel_timestamp; >>> u32 auxtrace_type; >>> u64 branches_sample_type; >>> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct >>> cs_etm_trace_params *t_params, >>> } >>> static int cs_etm__init_trace_params(struct cs_etm_trace_params >>> *t_params, >>> - struct cs_etm_auxtrace *etm, >>> - int decoders) >>> + struct cs_etm_auxtrace *etm) >>> { >>> int i; >>> u32 etmidr; >>> u64 architecture; >>> - for (i = 0; i < decoders; i++) { >>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>> architecture = etm->metadata[i][CS_ETM_MAGIC]; >>> switch (architecture) { >>> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session >>> *session) >>> /* Then the RB tree itself */ >>> intlist__delete(traceid_list); >>> - for (i = 0; i < aux->num_cpu; i++) >>> + for (i = aux->first_cpu; i < aux->last_cpu; i++) >>> zfree(&aux->metadata[i]); >>> thread__zput(aux->unknown_thread); >>> @@ -921,7 +922,8 @@ static struct cs_etm_queue >>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>> * Each queue can only contain data from one CPU when >>> unformatted, so only one decoder is >>> * needed. >>> */ >>> - int decoders = formatted ? etm->num_cpu : 1; >>> + int first_decoder = formatted ? etm->first_cpu : 0; >>> + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); >>> etmq = zalloc(sizeof(*etmq)); >>> if (!etmq) >>> @@ -937,7 +939,7 @@ static struct cs_etm_queue >>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>> if (!t_params) >>> goto out_free; >>> - if (cs_etm__init_trace_params(t_params, etm, decoders)) >>> + if (cs_etm__init_trace_params(t_params, etm)) >>> goto out_free; >>> /* Set decoder parameters to decode trace packets */ >>> @@ -947,8 +949,7 @@ static struct cs_etm_queue >>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>> formatted)) >>> goto out_free; >>> - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, >>> - t_params); >>> + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, >>> &d_params, t_params); >>> if (!etmq->decoder) >>> goto out_free; >>> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct >>> perf_session *session) >>> * Loop through the ETMs and complain if we find at least one where >>> ts_source != 1 (virtual >>> * timestamps). >>> */ >>> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >>> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct >>> cs_etm_auxtrace *etm) >>> { >>> int j; >>> - for (j = 0; j < num_cpu; j++) { >>> + for (j = etm->first_cpu; j < etm->last_cpu; j++) { >>> switch (metadata[j][CS_ETM_MAGIC]) { >>> case __perf_cs_etmv4_magic: >>> if (HAS_PARAM(j, ETMV4, TS_SOURCE) || >>> metadata[j][CS_ETMV4_TS_SOURCE] != 1) >>> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 >>> **metadata, int num_cpu) >>> } >>> /* map trace ids to correct metadata block, from information in >>> metadata */ >>> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) >>> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm) >>> { >>> u64 cs_etm_magic; >>> + u64 **metadata = etm->metadata; >>> u8 trace_chan_id; >>> int i, err; >>> - for (i = 0; i < num_cpu; i++) { >>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >>> switch (cs_etm_magic) { >>> case __perf_cs_etmv3_magic: >>> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int >>> num_cpu, u64 **metadata) >>> * If we found AUX_HW_ID packets, then set any metadata marked as >>> unused to the >>> * unused value to reduce the number of unneeded decoders created. >>> */ >>> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 >>> **metadata) >>> +static int cs_etm__clear_unused_trace_ids_metadata(struct >>> cs_etm_auxtrace *etm) >>> { >>> u64 cs_etm_magic; >>> + u64 **metadata = etm->metadata; >>> int i; >>> - for (i = 0; i < num_cpu; i++) { >>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >>> switch (cs_etm_magic) { >>> case __perf_cs_etmv3_magic: >>> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union >>> perf_event *event, >>> int event_header_size = sizeof(struct perf_event_header); >>> int total_size = auxtrace_info->header.size; >>> int priv_size = 0; >>> - int num_cpu; >>> + int num_cpu, first_cpu = 0, last_cpu; >>> int err = 0; >>> int aux_hw_id_found; >>> int i, j; >>> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union >>> perf_event *event, >>> /* First the global part */ >>> ptr = (u64 *) auxtrace_info->priv; >>> num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; >>> - metadata = zalloc(sizeof(*metadata) * num_cpu); >>> + >>> + /* Start parsing after the common part of the header */ >>> + i = CS_HEADER_VERSION_MAX; >>> + >>> + /*Get CPU id of first event */ >>> + first_cpu = ptr[i + CS_ETM_CPU]; >>> + last_cpu = first_cpu + num_cpu; >>> + >>> + if (first_cpu > cpu__max_cpu().cpu || >>> + last_cpu > cpu__max_cpu().cpu) >>> + return -EINVAL; >>> + >>> + metadata = zalloc(sizeof(*metadata) * last_cpu); >> >> Hi Ganapatrao, >> >> I think I see what the problem is, but I'm wondering if a better fix >> would be to further decouple the CPU ID from the index in the array. >> >> With your change it's not clear what happens with sparse recordings, for >> example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is > > This patch fixes for any range of CPUs. > Record with sparse list of CPUs will not work with current code still. > Is there a major issue that means sparse can't be done? I'm thinking it would be best to fix both issues with one change while we understand this part rather than a half fix that might have do be completely re-understood and re-done later anyway. Unless there is some big blocker but I can't see it? >> some wastage in the zalloc here for example if only CPU 256 is traced >> then we'd still make 256 decoders but 255 of them would be unused? >> >> I tried to test sparse recordings, but your change doesn't apply to the >> latest coresight/next branch. I did notice that 'perf report -D' doesn't >> work with them on coresight/next (it just quits), but I couldn't see if >> that's fixed with your change. > > My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches > related to dynamic id [1] support(queued for 6.4). > > "perf report -D" works for me. I was referring to sparse CPU lists, which I think you mentioned above doesn't work even with this patch. > > [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html > It should be based on the next branch here: git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git >> >> Would a better fix not be to keep the metadata loops from 0-N and >> instead save the CPU ID in cs_etm_decoder_params or the decoder. That >> way it would support both sparse and not starting from 0 cases? I think > > Yep, I though this initially, it got complicated due to for more > for-loops. I will try again and post V2. > I can't imagine it would need any extra for loops off the top of my head. Just saving the CPU ID in a few structs and using it wherever it's needed instead of the loop index. I imagine most of the loops would actually stay the same rather than be changed like you have in V1. >> the code would be better if it's worded like "i < recorded_cpus" rather >> than "i < cpu" to make it clear that i isn't actually the CPU ID it's >> just an index. > > Yes, makes sense to call it "recorded_cpus". > >> >> Also a new test for this scenario would probably be a good idea. >> >> Thanks >> James >> > Thanks, > Ganapat
On 20-04-2023 06:00 pm, James Clark wrote: > > > On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: >> >> Hi James, >> >> On 20-04-2023 03:13 pm, James Clark wrote: >>> >>> >>> On 19/04/2023 18:21, Ganapatrao Kulkarni wrote: >>>> The current implementation supports coresight trace for a range of >>>> CPUs, if the first CPU is CPU0. >>>> >>>> Adding changes to enable coresight trace for any range of CPUs by >>>> decoding the first CPU also from the header. >>>> Later, first CPU id is used instead of CPU0 across the decoder >>>> functions. >>>> >>>> Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> >>>> --- >>>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- >>>> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- >>>> tools/perf/util/cs-etm.c | 62 ++++++++++++------- >>>> 3 files changed, 42 insertions(+), 27 deletions(-) >>>> >>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>> index 82a27ab90c8b..41ab299b643b 100644 >>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct >>>> cs_etm_decoder_params *d_params, >>>> } >>>> struct cs_etm_decoder * >>>> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params >>>> *d_params, >>>> +cs_etm_decoder__new(int first_decoder, int decoders, struct >>>> cs_etm_decoder_params *d_params, >>>> struct cs_etm_trace_params t_params[]) >>>> { >>>> struct cs_etm_decoder *decoder; >>>> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct >>>> cs_etm_decoder_params *d_params, >>>> /* init raw frame logging if required */ >>>> cs_etm_decoder__init_raw_frame_logging(d_params, decoder); >>>> - for (i = 0; i < decoders; i++) { >>>> + for (i = first_decoder; i < decoders; i++) { >>>> ret = cs_etm_decoder__create_etm_decoder(d_params, >>>> &t_params[i], >>>> decoder); >>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>> index 92a855fbe5b8..b06193fc75b4 100644 >>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct >>>> cs_etm_decoder *decoder, >>>> size_t len, size_t *consumed); >>>> struct cs_etm_decoder * >>>> -cs_etm_decoder__new(int num_cpu, >>>> +cs_etm_decoder__new(int first_decoder, >>>> + int decoders, >>>> struct cs_etm_decoder_params *d_params, >>>> struct cs_etm_trace_params t_params[]); >>>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c >>>> index 94e2d02009eb..2619513ae088 100644 >>>> --- a/tools/perf/util/cs-etm.c >>>> +++ b/tools/perf/util/cs-etm.c >>>> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { >>>> u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ >>>> int num_cpu; >>>> + int first_cpu; >>>> + int last_cpu; >>>> u64 latest_kernel_timestamp; >>>> u32 auxtrace_type; >>>> u64 branches_sample_type; >>>> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct >>>> cs_etm_trace_params *t_params, >>>> } >>>> static int cs_etm__init_trace_params(struct cs_etm_trace_params >>>> *t_params, >>>> - struct cs_etm_auxtrace *etm, >>>> - int decoders) >>>> + struct cs_etm_auxtrace *etm) >>>> { >>>> int i; >>>> u32 etmidr; >>>> u64 architecture; >>>> - for (i = 0; i < decoders; i++) { >>>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>>> architecture = etm->metadata[i][CS_ETM_MAGIC]; >>>> switch (architecture) { >>>> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session >>>> *session) >>>> /* Then the RB tree itself */ >>>> intlist__delete(traceid_list); >>>> - for (i = 0; i < aux->num_cpu; i++) >>>> + for (i = aux->first_cpu; i < aux->last_cpu; i++) >>>> zfree(&aux->metadata[i]); >>>> thread__zput(aux->unknown_thread); >>>> @@ -921,7 +922,8 @@ static struct cs_etm_queue >>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>>> * Each queue can only contain data from one CPU when >>>> unformatted, so only one decoder is >>>> * needed. >>>> */ >>>> - int decoders = formatted ? etm->num_cpu : 1; >>>> + int first_decoder = formatted ? etm->first_cpu : 0; >>>> + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); >>>> etmq = zalloc(sizeof(*etmq)); >>>> if (!etmq) >>>> @@ -937,7 +939,7 @@ static struct cs_etm_queue >>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>>> if (!t_params) >>>> goto out_free; >>>> - if (cs_etm__init_trace_params(t_params, etm, decoders)) >>>> + if (cs_etm__init_trace_params(t_params, etm)) >>>> goto out_free; >>>> /* Set decoder parameters to decode trace packets */ >>>> @@ -947,8 +949,7 @@ static struct cs_etm_queue >>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>>> formatted)) >>>> goto out_free; >>>> - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, >>>> - t_params); >>>> + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, >>>> &d_params, t_params); >>>> if (!etmq->decoder) >>>> goto out_free; >>>> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct >>>> perf_session *session) >>>> * Loop through the ETMs and complain if we find at least one where >>>> ts_source != 1 (virtual >>>> * timestamps). >>>> */ >>>> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >>>> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct >>>> cs_etm_auxtrace *etm) >>>> { >>>> int j; >>>> - for (j = 0; j < num_cpu; j++) { >>>> + for (j = etm->first_cpu; j < etm->last_cpu; j++) { >>>> switch (metadata[j][CS_ETM_MAGIC]) { >>>> case __perf_cs_etmv4_magic: >>>> if (HAS_PARAM(j, ETMV4, TS_SOURCE) || >>>> metadata[j][CS_ETMV4_TS_SOURCE] != 1) >>>> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 >>>> **metadata, int num_cpu) >>>> } >>>> /* map trace ids to correct metadata block, from information in >>>> metadata */ >>>> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) >>>> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm) >>>> { >>>> u64 cs_etm_magic; >>>> + u64 **metadata = etm->metadata; >>>> u8 trace_chan_id; >>>> int i, err; >>>> - for (i = 0; i < num_cpu; i++) { >>>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>>> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >>>> switch (cs_etm_magic) { >>>> case __perf_cs_etmv3_magic: >>>> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int >>>> num_cpu, u64 **metadata) >>>> * If we found AUX_HW_ID packets, then set any metadata marked as >>>> unused to the >>>> * unused value to reduce the number of unneeded decoders created. >>>> */ >>>> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 >>>> **metadata) >>>> +static int cs_etm__clear_unused_trace_ids_metadata(struct >>>> cs_etm_auxtrace *etm) >>>> { >>>> u64 cs_etm_magic; >>>> + u64 **metadata = etm->metadata; >>>> int i; >>>> - for (i = 0; i < num_cpu; i++) { >>>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>>> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >>>> switch (cs_etm_magic) { >>>> case __perf_cs_etmv3_magic: >>>> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union >>>> perf_event *event, >>>> int event_header_size = sizeof(struct perf_event_header); >>>> int total_size = auxtrace_info->header.size; >>>> int priv_size = 0; >>>> - int num_cpu; >>>> + int num_cpu, first_cpu = 0, last_cpu; >>>> int err = 0; >>>> int aux_hw_id_found; >>>> int i, j; >>>> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union >>>> perf_event *event, >>>> /* First the global part */ >>>> ptr = (u64 *) auxtrace_info->priv; >>>> num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; >>>> - metadata = zalloc(sizeof(*metadata) * num_cpu); >>>> + >>>> + /* Start parsing after the common part of the header */ >>>> + i = CS_HEADER_VERSION_MAX; >>>> + >>>> + /*Get CPU id of first event */ >>>> + first_cpu = ptr[i + CS_ETM_CPU]; >>>> + last_cpu = first_cpu + num_cpu; >>>> + >>>> + if (first_cpu > cpu__max_cpu().cpu || >>>> + last_cpu > cpu__max_cpu().cpu) >>>> + return -EINVAL; >>>> + >>>> + metadata = zalloc(sizeof(*metadata) * last_cpu); >>> >>> Hi Ganapatrao, >>> >>> I think I see what the problem is, but I'm wondering if a better fix >>> would be to further decouple the CPU ID from the index in the array. >>> >>> With your change it's not clear what happens with sparse recordings, for >>> example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is >> >> This patch fixes for any range of CPUs. >> Record with sparse list of CPUs will not work with current code still. >> > > Is there a major issue that means sparse can't be done? I'm thinking it > would be best to fix both issues with one change while we understand > this part rather than a half fix that might have do be completely > re-understood and re-done later anyway. Unless there is some big blocker > but I can't see it? > >>> some wastage in the zalloc here for example if only CPU 256 is traced >>> then we'd still make 256 decoders but 255 of them would be unused? >>> >>> I tried to test sparse recordings, but your change doesn't apply to the >>> latest coresight/next branch. I did notice that 'perf report -D' doesn't >>> work with them on coresight/next (it just quits), but I couldn't see if >>> that's fixed with your change. >> >> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches >> related to dynamic id [1] support(queued for 6.4). >> >> "perf report -D" works for me. > > I was referring to sparse CPU lists, which I think you mentioned above > doesn't work even with this patch. > >> >> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html >> > > It should be based on the next branch here: > git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git OK. > >>> >>> Would a better fix not be to keep the metadata loops from 0-N and >>> instead save the CPU ID in cs_etm_decoder_params or the decoder. That >>> way it would support both sparse and not starting from 0 cases? I think >> >> Yep, I though this initially, it got complicated due to for more >> for-loops. I will try again and post V2. >> > > I can't imagine it would need any extra for loops off the top of my > head. Just saving the CPU ID in a few structs and using it wherever it's > needed instead of the loop index. I imagine most of the loops would > actually stay the same rather than be changed like you have in V1. Working on V2, I will post it at the earliest. > >>> the code would be better if it's worded like "i < recorded_cpus" rather >>> than "i < cpu" to make it clear that i isn't actually the CPU ID it's >>> just an index. >> >> Yes, makes sense to call it "recorded_cpus". >> >>> >>> Also a new test for this scenario would probably be a good idea. >>> >>> Thanks >>> James >>> >> Thanks, >> Ganapat Thanks, Ganapat
On 20/04/2023 13:37, Ganapatrao Kulkarni wrote: > > > On 20-04-2023 06:00 pm, James Clark wrote: >> >> >> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: >>> ... >>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches >>> related to dynamic id [1] support(queued for 6.4). >>> >>> "perf report -D" works for me. >> >> I was referring to sparse CPU lists, which I think you mentioned above >> doesn't work even with this patch. >> >>> >>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html >>> >> >> It should be based on the next branch here: >> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git > > OK. It need not be. Since this patch is purely perf tools patch and has nothing to do with the kernel drivers, it should be beased on whatever the tip of the perf tool tree is. Otherwise we risk rebasing to that eventually. Cheers Suzuki
Hi James, On 20-04-2023 06:07 pm, Ganapatrao Kulkarni wrote: > > > On 20-04-2023 06:00 pm, James Clark wrote: >> >> >> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: >>> >>> Hi James, >>> >>> On 20-04-2023 03:13 pm, James Clark wrote: >>>> >>>> >>>> On 19/04/2023 18:21, Ganapatrao Kulkarni wrote: >>>>> The current implementation supports coresight trace for a range of >>>>> CPUs, if the first CPU is CPU0. >>>>> >>>>> Adding changes to enable coresight trace for any range of CPUs by >>>>> decoding the first CPU also from the header. >>>>> Later, first CPU id is used instead of CPU0 across the decoder >>>>> functions. >>>>> >>>>> Signed-off-by: Ganapatrao Kulkarni >>>>> <gankulkarni@os.amperecomputing.com> >>>>> --- >>>>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- >>>>> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- >>>>> tools/perf/util/cs-etm.c | 62 >>>>> ++++++++++++------- >>>>> 3 files changed, 42 insertions(+), 27 deletions(-) >>>>> >>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>>> index 82a27ab90c8b..41ab299b643b 100644 >>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c >>>>> @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct >>>>> cs_etm_decoder_params *d_params, >>>>> } >>>>> struct cs_etm_decoder * >>>>> -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params >>>>> *d_params, >>>>> +cs_etm_decoder__new(int first_decoder, int decoders, struct >>>>> cs_etm_decoder_params *d_params, >>>>> struct cs_etm_trace_params t_params[]) >>>>> { >>>>> struct cs_etm_decoder *decoder; >>>>> @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct >>>>> cs_etm_decoder_params *d_params, >>>>> /* init raw frame logging if required */ >>>>> cs_etm_decoder__init_raw_frame_logging(d_params, decoder); >>>>> - for (i = 0; i < decoders; i++) { >>>>> + for (i = first_decoder; i < decoders; i++) { >>>>> ret = cs_etm_decoder__create_etm_decoder(d_params, >>>>> &t_params[i], >>>>> decoder); >>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>>> b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>>> index 92a855fbe5b8..b06193fc75b4 100644 >>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h >>>>> @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct >>>>> cs_etm_decoder *decoder, >>>>> size_t len, size_t *consumed); >>>>> struct cs_etm_decoder * >>>>> -cs_etm_decoder__new(int num_cpu, >>>>> +cs_etm_decoder__new(int first_decoder, >>>>> + int decoders, >>>>> struct cs_etm_decoder_params *d_params, >>>>> struct cs_etm_trace_params t_params[]); >>>>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c >>>>> index 94e2d02009eb..2619513ae088 100644 >>>>> --- a/tools/perf/util/cs-etm.c >>>>> +++ b/tools/perf/util/cs-etm.c >>>>> @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { >>>>> u8 has_virtual_ts; /* Virtual/Kernel timestamps in the >>>>> trace. */ >>>>> int num_cpu; >>>>> + int first_cpu; >>>>> + int last_cpu; >>>>> u64 latest_kernel_timestamp; >>>>> u32 auxtrace_type; >>>>> u64 branches_sample_type; >>>>> @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct >>>>> cs_etm_trace_params *t_params, >>>>> } >>>>> static int cs_etm__init_trace_params(struct cs_etm_trace_params >>>>> *t_params, >>>>> - struct cs_etm_auxtrace *etm, >>>>> - int decoders) >>>>> + struct cs_etm_auxtrace *etm) >>>>> { >>>>> int i; >>>>> u32 etmidr; >>>>> u64 architecture; >>>>> - for (i = 0; i < decoders; i++) { >>>>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>>>> architecture = etm->metadata[i][CS_ETM_MAGIC]; >>>>> switch (architecture) { >>>>> @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session >>>>> *session) >>>>> /* Then the RB tree itself */ >>>>> intlist__delete(traceid_list); >>>>> - for (i = 0; i < aux->num_cpu; i++) >>>>> + for (i = aux->first_cpu; i < aux->last_cpu; i++) >>>>> zfree(&aux->metadata[i]); >>>>> thread__zput(aux->unknown_thread); >>>>> @@ -921,7 +922,8 @@ static struct cs_etm_queue >>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>>>> * Each queue can only contain data from one CPU when >>>>> unformatted, so only one decoder is >>>>> * needed. >>>>> */ >>>>> - int decoders = formatted ? etm->num_cpu : 1; >>>>> + int first_decoder = formatted ? etm->first_cpu : 0; >>>>> + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); >>>>> etmq = zalloc(sizeof(*etmq)); >>>>> if (!etmq) >>>>> @@ -937,7 +939,7 @@ static struct cs_etm_queue >>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>>>> if (!t_params) >>>>> goto out_free; >>>>> - if (cs_etm__init_trace_params(t_params, etm, decoders)) >>>>> + if (cs_etm__init_trace_params(t_params, etm)) >>>>> goto out_free; >>>>> /* Set decoder parameters to decode trace packets */ >>>>> @@ -947,8 +949,7 @@ static struct cs_etm_queue >>>>> *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, >>>>> formatted)) >>>>> goto out_free; >>>>> - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, >>>>> - t_params); >>>>> + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, >>>>> &d_params, t_params); >>>>> if (!etmq->decoder) >>>>> goto out_free; >>>>> @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct >>>>> perf_session *session) >>>>> * Loop through the ETMs and complain if we find at least one where >>>>> ts_source != 1 (virtual >>>>> * timestamps). >>>>> */ >>>>> -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) >>>>> +static bool cs_etm__has_virtual_ts(u64 **metadata, struct >>>>> cs_etm_auxtrace *etm) >>>>> { >>>>> int j; >>>>> - for (j = 0; j < num_cpu; j++) { >>>>> + for (j = etm->first_cpu; j < etm->last_cpu; j++) { >>>>> switch (metadata[j][CS_ETM_MAGIC]) { >>>>> case __perf_cs_etmv4_magic: >>>>> if (HAS_PARAM(j, ETMV4, TS_SOURCE) || >>>>> metadata[j][CS_ETMV4_TS_SOURCE] != 1) >>>>> @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 >>>>> **metadata, int num_cpu) >>>>> } >>>>> /* map trace ids to correct metadata block, from information in >>>>> metadata */ >>>>> -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 >>>>> **metadata) >>>>> +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace >>>>> *etm) >>>>> { >>>>> u64 cs_etm_magic; >>>>> + u64 **metadata = etm->metadata; >>>>> u8 trace_chan_id; >>>>> int i, err; >>>>> - for (i = 0; i < num_cpu; i++) { >>>>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>>>> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >>>>> switch (cs_etm_magic) { >>>>> case __perf_cs_etmv3_magic: >>>>> @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int >>>>> num_cpu, u64 **metadata) >>>>> * If we found AUX_HW_ID packets, then set any metadata marked as >>>>> unused to the >>>>> * unused value to reduce the number of unneeded decoders created. >>>>> */ >>>>> -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 >>>>> **metadata) >>>>> +static int cs_etm__clear_unused_trace_ids_metadata(struct >>>>> cs_etm_auxtrace *etm) >>>>> { >>>>> u64 cs_etm_magic; >>>>> + u64 **metadata = etm->metadata; >>>>> int i; >>>>> - for (i = 0; i < num_cpu; i++) { >>>>> + for (i = etm->first_cpu; i < etm->last_cpu; i++) { >>>>> cs_etm_magic = metadata[i][CS_ETM_MAGIC]; >>>>> switch (cs_etm_magic) { >>>>> case __perf_cs_etmv3_magic: >>>>> @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union >>>>> perf_event *event, >>>>> int event_header_size = sizeof(struct perf_event_header); >>>>> int total_size = auxtrace_info->header.size; >>>>> int priv_size = 0; >>>>> - int num_cpu; >>>>> + int num_cpu, first_cpu = 0, last_cpu; >>>>> int err = 0; >>>>> int aux_hw_id_found; >>>>> int i, j; >>>>> @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union >>>>> perf_event *event, >>>>> /* First the global part */ >>>>> ptr = (u64 *) auxtrace_info->priv; >>>>> num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; >>>>> - metadata = zalloc(sizeof(*metadata) * num_cpu); >>>>> + >>>>> + /* Start parsing after the common part of the header */ >>>>> + i = CS_HEADER_VERSION_MAX; >>>>> + >>>>> + /*Get CPU id of first event */ >>>>> + first_cpu = ptr[i + CS_ETM_CPU]; >>>>> + last_cpu = first_cpu + num_cpu; >>>>> + >>>>> + if (first_cpu > cpu__max_cpu().cpu || >>>>> + last_cpu > cpu__max_cpu().cpu) >>>>> + return -EINVAL; >>>>> + >>>>> + metadata = zalloc(sizeof(*metadata) * last_cpu); >>>> >>>> Hi Ganapatrao, >>>> >>>> I think I see what the problem is, but I'm wondering if a better fix >>>> would be to further decouple the CPU ID from the index in the array. >>>> >>>> With your change it's not clear what happens with sparse recordings, >>>> for >>>> example 'perf record -e cs_etm// -C 1,3,5'. And it seems like there is >>> >>> This patch fixes for any range of CPUs. >>> Record with sparse list of CPUs will not work with current code still. >>> >> >> Is there a major issue that means sparse can't be done? I'm thinking it >> would be best to fix both issues with one change while we understand >> this part rather than a half fix that might have do be completely >> re-understood and re-done later anyway. Unless there is some big blocker >> but I can't see it? >> >>>> some wastage in the zalloc here for example if only CPU 256 is traced >>>> then we'd still make 256 decoders but 255 of them would be unused? >>>> >>>> I tried to test sparse recordings, but your change doesn't apply to the >>>> latest coresight/next branch. I did notice that 'perf report -D' >>>> doesn't >>>> work with them on coresight/next (it just quits), but I couldn't see if >>>> that's fixed with your change. >>> >>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches >>> related to dynamic id [1] support(queued for 6.4). >>> >>> "perf report -D" works for me. >> >> I was referring to sparse CPU lists, which I think you mentioned above >> doesn't work even with this patch. >> >>> >>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html >>> >> >> It should be based on the next branch here: >> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git > > OK. >> >>>> >>>> Would a better fix not be to keep the metadata loops from 0-N and >>>> instead save the CPU ID in cs_etm_decoder_params or the decoder. That >>>> way it would support both sparse and not starting from 0 cases? I think >>> >>> Yep, I though this initially, it got complicated due to for more >>> for-loops. I will try again and post V2. >>> >> >> I can't imagine it would need any extra for loops off the top of my >> head. Just saving the CPU ID in a few structs and using it wherever it's >> needed instead of the loop index. I imagine most of the loops would >> actually stay the same rather than be changed like you have in V1. > > Working on V2, I will post it at the earliest. My codebase is "6.3-RC7 + Mike's patche[1] + Anshuman/Suzuki patches[3]" and below diff works for any CPU range and sparse list as well. I will send the V2 ASAP. [root@B0-213 perf]# git diff diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 94e2d02009eb..bfed0bb8be98 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -274,6 +274,22 @@ static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata) (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ }) + +static u64 *get_cpu_data(struct cs_etm_auxtrace *etm, int cpu) +{ + int i; + u64 *metadata = NULL; + + for (i = 0; i < etm->num_cpu; i++) { + if (etm->metadata[i][CS_ETM_CPU] == (u64)cpu) { + metadata = etm->metadata[i]; + break; + } + } + + return metadata; +} + /* * Handle the PERF_RECORD_AUX_OUTPUT_HW_ID event. * @@ -344,7 +360,7 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, } /* not one we've seen before - lets map it */ - cpu_data = etm->metadata[cpu]; + cpu_data = get_cpu_data(etm, cpu); err = cs_etm__map_trace_id(trace_chan_id, cpu_data); if (err) return err; I tried below commands and works well with this diff. timeout 8s ./perf record -e cs_etm// -C 0 dd if=/dev/zero of=/dev/null timeout 8s ./perf record -e cs_etm// -C 1 dd if=/dev/zero of=/dev/null timeout 8s ./perf record -e cs_etm// -C 1,3,5 dd if=/dev/zero of=/dev/null timeout 8s ./perf record -e cs_etm// -C 10,3,12 dd if=/dev/zero of=/dev/null timeout 8s ./perf record -e cs_etm//u -C 0-10 dd if=/dev/zero of=/dev/null timeout 8s ./perf record -e cs_etm//u -C 10-20 dd if=/dev/zero of=/dev/nu However I could not be able to apply to next branch of coresight repo git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git Then I looked at the code, it does not have Mike's dynamic ID patches[1] which are merged to perf/next. It seems the code is not the latest or it is next for coresight driver only. I think either patch should be rebased to most-recent kernel RC1/RCx(which I did) or perf-tools-next of [2] for perf tool. [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html [2] git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git [3] https://www.spinics.net/lists/arm-kernel/msg1059012.html >> >>>> the code would be better if it's worded like "i < recorded_cpus" rather >>>> than "i < cpu" to make it clear that i isn't actually the CPU ID it's >>>> just an index. >>> >>> Yes, makes sense to call it "recorded_cpus". >>> >>>> >>>> Also a new test for this scenario would probably be a good idea. >>>> >>>> Thanks >>>> James >>>> >>> Thanks, >>> Ganapat > > Thanks, > Ganapat Thanks, Ganapat
On 20/04/2023 14:03, Suzuki K Poulose wrote: > On 20/04/2023 13:37, Ganapatrao Kulkarni wrote: >> >> >> On 20-04-2023 06:00 pm, James Clark wrote: >>> >>> >>> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: >>>> > > ... > >>>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches >>>> related to dynamic id [1] support(queued for 6.4). >>>> >>>> "perf report -D" works for me. >>> >>> I was referring to sparse CPU lists, which I think you mentioned above >>> doesn't work even with this patch. >>> >>>> >>>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html >>>> >>> >>> It should be based on the next branch here: >>> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git >> >> OK. > > It need not be. Since this patch is purely perf tools patch and has > nothing to do with the kernel drivers, it should be beased on whatever > the tip of the perf tool tree is. Otherwise we risk rebasing to that > eventually. > > Cheers > Suzuki > Good point, sorry for the confusion! I wonder if we could have some kind of new staging branch that has both up to date perf and coresight changes at the same time? Either that would make things like this easier, or more complicated. I'm not sure. I suppose I can DIY it quite easily but then everyone would have to as well. James
Em Thu, Apr 20, 2023 at 04:44:21PM +0100, James Clark escreveu: > On 20/04/2023 14:03, Suzuki K Poulose wrote: > > On 20/04/2023 13:37, Ganapatrao Kulkarni wrote: > >> On 20-04-2023 06:00 pm, James Clark wrote: > >>> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: > >>>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches > >>>> related to dynamic id [1] support(queued for 6.4). > >>>> "perf report -D" works for me. > >>> I was referring to sparse CPU lists, which I think you mentioned above > >>> doesn't work even with this patch. > >>>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html > >>> It should be based on the next branch here: > >>> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git > >> OK. > > It need not be. Since this patch is purely perf tools patch and has > > nothing to do with the kernel drivers, it should be beased on whatever > > the tip of the perf tool tree is. Otherwise we risk rebasing to that > > eventually. > Good point, sorry for the confusion! > I wonder if we could have some kind of new staging branch that has both > up to date perf and coresight changes at the same time? Either that > would make things like this easier, or more complicated. I'm not sure. > I suppose I can DIY it quite easily but then everyone would have to as well. My two cents: It this was available together with a CI that would run 'perf test' + 'make -C tools/perf build-test' and any other set of tests, that would be great. But not having it also has an advantage: no lockstep development, tooling should gracefully work with whatever is available. I say this because it is a really common theme, even Debian had a packaging scheme that shoehorned (forcefully fused?) perf's and the kernel's version :-\ - Arnaldo
On 20/04/2023 16:44, James Clark wrote: > > > On 20/04/2023 14:03, Suzuki K Poulose wrote: >> On 20/04/2023 13:37, Ganapatrao Kulkarni wrote: >>> >>> >>> On 20-04-2023 06:00 pm, James Clark wrote: >>>> >>>> >>>> On 20/04/2023 12:47, Ganapatrao Kulkarni wrote: >>>>> >> >> ... >> >>>>> My patch is rebased on 6.3-RC7 codebase with Mike's 3 perf patches >>>>> related to dynamic id [1] support(queued for 6.4). >>>>> >>>>> "perf report -D" works for me. >>>> >>>> I was referring to sparse CPU lists, which I think you mentioned above >>>> doesn't work even with this patch. >>>> >>>>> >>>>> [1] https://www.spinics.net/lists/linux-perf-users/msg27452.html >>>>> >>>> >>>> It should be based on the next branch here: >>>> git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git >>> >>> OK. >> >> It need not be. Since this patch is purely perf tools patch and has >> nothing to do with the kernel drivers, it should be beased on whatever >> the tip of the perf tool tree is. Otherwise we risk rebasing to that >> eventually. >> >> Cheers >> Suzuki >> > > Good point, sorry for the confusion! > > I wonder if we could have some kind of new staging branch that has both > up to date perf and coresight changes at the same time? Either that > would make things like this easier, or more complicated. I'm not sure. I agree that it is complicated. :-( We could do something about this if a situation arises in the future, where the kernel and perf patches are out of sync w.r.t merging. As, such the dependency on Anshuman's series is for ACPI support which Ampere system needs. I would let this one pass, given the merge window is too close. Thanks Suzuki
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 82a27ab90c8b..41ab299b643b 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -724,7 +724,7 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params, } struct cs_etm_decoder * -cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, +cs_etm_decoder__new(int first_decoder, int decoders, struct cs_etm_decoder_params *d_params, struct cs_etm_trace_params t_params[]) { struct cs_etm_decoder *decoder; @@ -769,7 +769,7 @@ cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params, /* init raw frame logging if required */ cs_etm_decoder__init_raw_frame_logging(d_params, decoder); - for (i = 0; i < decoders; i++) { + for (i = first_decoder; i < decoders; i++) { ret = cs_etm_decoder__create_etm_decoder(d_params, &t_params[i], decoder); diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index 92a855fbe5b8..b06193fc75b4 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -90,7 +90,8 @@ int cs_etm_decoder__process_data_block(struct cs_etm_decoder *decoder, size_t len, size_t *consumed); struct cs_etm_decoder * -cs_etm_decoder__new(int num_cpu, +cs_etm_decoder__new(int first_decoder, + int decoders, struct cs_etm_decoder_params *d_params, struct cs_etm_trace_params t_params[]); diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 94e2d02009eb..2619513ae088 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -55,6 +55,8 @@ struct cs_etm_auxtrace { u8 has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ int num_cpu; + int first_cpu; + int last_cpu; u64 latest_kernel_timestamp; u32 auxtrace_type; u64 branches_sample_type; @@ -638,14 +640,13 @@ static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, } static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, - int decoders) + struct cs_etm_auxtrace *etm) { int i; u32 etmidr; u64 architecture; - for (i = 0; i < decoders; i++) { + for (i = etm->first_cpu; i < etm->last_cpu; i++) { architecture = etm->metadata[i][CS_ETM_MAGIC]; switch (architecture) { @@ -817,7 +818,7 @@ static void cs_etm__free(struct perf_session *session) /* Then the RB tree itself */ intlist__delete(traceid_list); - for (i = 0; i < aux->num_cpu; i++) + for (i = aux->first_cpu; i < aux->last_cpu; i++) zfree(&aux->metadata[i]); thread__zput(aux->unknown_thread); @@ -921,7 +922,8 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, * Each queue can only contain data from one CPU when unformatted, so only one decoder is * needed. */ - int decoders = formatted ? etm->num_cpu : 1; + int first_decoder = formatted ? etm->first_cpu : 0; + int decoders = first_decoder + (formatted ? etm->num_cpu : 1); etmq = zalloc(sizeof(*etmq)); if (!etmq) @@ -937,7 +939,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, if (!t_params) goto out_free; - if (cs_etm__init_trace_params(t_params, etm, decoders)) + if (cs_etm__init_trace_params(t_params, etm)) goto out_free; /* Set decoder parameters to decode trace packets */ @@ -947,8 +949,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, formatted)) goto out_free; - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, - t_params); + etmq->decoder = cs_etm_decoder__new(first_decoder, decoders, &d_params, t_params); if (!etmq->decoder) goto out_free; @@ -2959,11 +2960,11 @@ static int cs_etm__queue_aux_records(struct perf_session *session) * Loop through the ETMs and complain if we find at least one where ts_source != 1 (virtual * timestamps). */ -static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) +static bool cs_etm__has_virtual_ts(u64 **metadata, struct cs_etm_auxtrace *etm) { int j; - for (j = 0; j < num_cpu; j++) { + for (j = etm->first_cpu; j < etm->last_cpu; j++) { switch (metadata[j][CS_ETM_MAGIC]) { case __perf_cs_etmv4_magic: if (HAS_PARAM(j, ETMV4, TS_SOURCE) || metadata[j][CS_ETMV4_TS_SOURCE] != 1) @@ -2982,13 +2983,14 @@ static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) } /* map trace ids to correct metadata block, from information in metadata */ -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm) { u64 cs_etm_magic; + u64 **metadata = etm->metadata; u8 trace_chan_id; int i, err; - for (i = 0; i < num_cpu; i++) { + for (i = etm->first_cpu; i < etm->last_cpu; i++) { cs_etm_magic = metadata[i][CS_ETM_MAGIC]; switch (cs_etm_magic) { case __perf_cs_etmv3_magic: @@ -3015,12 +3017,13 @@ static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) * If we found AUX_HW_ID packets, then set any metadata marked as unused to the * unused value to reduce the number of unneeded decoders created. */ -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) +static int cs_etm__clear_unused_trace_ids_metadata(struct cs_etm_auxtrace *etm) { u64 cs_etm_magic; + u64 **metadata = etm->metadata; int i; - for (i = 0; i < num_cpu; i++) { + for (i = etm->first_cpu; i < etm->last_cpu; i++) { cs_etm_magic = metadata[i][CS_ETM_MAGIC]; switch (cs_etm_magic) { case __perf_cs_etmv3_magic: @@ -3049,7 +3052,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, int event_header_size = sizeof(struct perf_event_header); int total_size = auxtrace_info->header.size; int priv_size = 0; - int num_cpu; + int num_cpu, first_cpu = 0, last_cpu; int err = 0; int aux_hw_id_found; int i, j; @@ -3068,22 +3071,31 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, /* First the global part */ ptr = (u64 *) auxtrace_info->priv; num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; - metadata = zalloc(sizeof(*metadata) * num_cpu); + + /* Start parsing after the common part of the header */ + i = CS_HEADER_VERSION_MAX; + + /*Get CPU id of first event */ + first_cpu = ptr[i + CS_ETM_CPU]; + last_cpu = first_cpu + num_cpu; + + if (first_cpu > cpu__max_cpu().cpu || + last_cpu > cpu__max_cpu().cpu) + return -EINVAL; + + metadata = zalloc(sizeof(*metadata) * last_cpu); if (!metadata) { err = -ENOMEM; goto err_free_traceid_list; } - /* Start parsing after the common part of the header */ - i = CS_HEADER_VERSION_MAX; - /* * The metadata is stored in the auxtrace_info section and encodes * the configuration of the ARM embedded trace macrocell which is * required by the trace decoder to properly decode the trace due * to its highly compressed nature. */ - for (j = 0; j < num_cpu; j++) { + for (j = first_cpu; j < last_cpu; j++) { if (ptr[i] == __perf_cs_etmv3_magic) { metadata[j] = cs_etm__create_meta_blk(ptr, &i, @@ -3145,6 +3157,8 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, etm->machine = &session->machines.host; etm->num_cpu = num_cpu; + etm->first_cpu = first_cpu; + etm->last_cpu = last_cpu; etm->pmu_type = (unsigned int) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff); etm->snapshot_mode = (ptr[CS_ETM_SNAPSHOT] != 0); etm->metadata = metadata; @@ -3152,7 +3166,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, etm->timeless_decoding = cs_etm__is_timeless_decoding(etm); /* Use virtual timestamps if all ETMs report ts_source = 1 */ - etm->has_virtual_ts = cs_etm__has_virtual_ts(metadata, num_cpu); + etm->has_virtual_ts = cs_etm__has_virtual_ts(metadata, etm); if (!etm->has_virtual_ts) ui__warning("Virtual timestamps are not enabled, or not supported by the traced system.\n" @@ -3232,10 +3246,10 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, /* if HW ID found then clear any unused metadata ID values */ if (aux_hw_id_found) - err = cs_etm__clear_unused_trace_ids_metadata(num_cpu, metadata); + err = cs_etm__clear_unused_trace_ids_metadata(etm); /* otherwise, this is a file with metadata values only, map from metadata */ else - err = cs_etm__map_trace_ids_metadata(num_cpu, metadata); + err = cs_etm__map_trace_ids_metadata(etm); if (err) goto err_delete_thread; @@ -3256,7 +3270,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, zfree(&etm); err_free_metadata: /* No need to check @metadata[j], free(NULL) is supported */ - for (j = 0; j < num_cpu; j++) + for (j = first_cpu; j < last_cpu; j++) zfree(&metadata[j]); zfree(&metadata); err_free_traceid_list:
The current implementation supports coresight trace for a range of CPUs, if the first CPU is CPU0. Adding changes to enable coresight trace for any range of CPUs by decoding the first CPU also from the header. Later, first CPU id is used instead of CPU0 across the decoder functions. Signed-off-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> --- .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 +- .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 3 +- tools/perf/util/cs-etm.c | 62 ++++++++++++------- 3 files changed, 42 insertions(+), 27 deletions(-)