diff mbox series

[3/7] perf cs-etm: Save aux records in each etm queue

Message ID 20210212144513.31765-4-james.clark@arm.com (mailing list archive)
State New, archived
Headers show
Series Split Coresight decode by aux records | expand

Commit Message

James Clark Feb. 12, 2021, 2:45 p.m. UTC
The aux records will be used set the bounds of decoding in a
later commit. In the future we may also want to use the flags
of each record to control decoding.

Do these need to be saved in their entirety, or can pointers
to each record safely be saved instead for later access?

Signed-off-by: James Clark <james.clark@arm.com>
---
 tools/perf/util/cs-etm.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

Comments

Leo Yan Feb. 27, 2021, 7:10 a.m. UTC | #1
On Fri, Feb 12, 2021 at 04:45:09PM +0200, James Clark wrote:
> The aux records will be used set the bounds of decoding in a
> later commit. In the future we may also want to use the flags
> of each record to control decoding.
> 
> Do these need to be saved in their entirety, or can pointers
> to each record safely be saved instead for later access?

Rather than introudcing the perf record list, I just wander if we can
use easier method to fix this problem.  So below is the rough idea
(though I don't really verify it):

The essential information we need is what's the valid buffer length can
be used for decoding.  Though cs_etm_queue::buf_len tracks the buffer
length, but it's the buffer length is for the whole AUX buffer, and
which belongs to multiple "PERF_RECORD_AUX" events.  So we cannot decode
at once for the whole trace data in the AUX trace buffer, on the other
hand, the incoming "PERF_RECORD_AUX" event can guide the CoreSight
decoder it should decode how much buffer size.  At the end, the trace
data can be decoded step by step based on the incoming "PERF_RECORD_AUX"
events.

I'd like to propose to add a new field "cs_etm_queue::buf_rec_len", it
stands for the record length based on the RECORD_AUX event.  In
theory, this value should be always less than "cs_etm_queue::buf_len".

When every time the "PERF_RECORD_AUX" event is coming, we find out the
corresponding queue (so this can be applied for "1:1" or "N:1" models
for source and sink), and accumulate "perf_record_aux::aux_size" into
"cs_etm_queue::buf_rec_len".

At the decoder side, it decreases "etmq->buf_rec_len" until to zero for
the current round of decoding (see cs_etm__decode_data_block()).  Since
all the "PERF_RECORD_AUX" event will be processed before
"PERF_RECORD_EXIT" event, so we don't worry the tail trace data will be
ignored.

The main reason for this suggestion is it don't need to change the
significant logic in current code.  I will try to do experiment for this
idea and share back.

James, if you think I miss anything, please correct me as needed.
Thanks!

Leo

> Signed-off-by: James Clark <james.clark@arm.com>
> ---
>  tools/perf/util/cs-etm.c | 32 +++++++++++++++++++++++++++++---
>  1 file changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 8f8b448632fb..88b541b2a804 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -92,12 +92,16 @@ struct cs_etm_queue {
>  	/* Conversion between traceID and index in traceid_queues array */
>  	struct intlist *traceid_queues_list;
>  	struct cs_etm_traceid_queue **traceid_queues;
> +	int aux_record_list_len;
> +	int aux_record_list_idx;
> +	struct perf_record_aux *aux_record_list;
>  };
>  
>  /* RB tree for quick conversion between traceID and metadata pointers */
>  static struct intlist *traceid_list;
>  
> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
> +				 struct perf_record_aux *aux_record);
>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
>  					   pid_t tid);
> @@ -585,6 +589,7 @@ static void cs_etm__free_queue(void *priv)
>  
>  	cs_etm_decoder__free(etmq->decoder);
>  	cs_etm__free_traceid_queues(etmq);
> +	free(etmq->aux_record_list);
>  	free(etmq);
>  }
>  
> @@ -759,6 +764,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm)
>  	return NULL;
>  }
>  
> +static int cs_etm__save_aux_record(struct cs_etm_queue *etmq,
> +				   struct perf_record_aux *aux_record)
> +{
> +	etmq->aux_record_list = reallocarray(etmq->aux_record_list,
> +					      etmq->aux_record_list_len+1,
> +					      sizeof(*etmq->aux_record_list));
> +	if (!etmq->aux_record_list)
> +		return -ENOMEM;
> +
> +	etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
> +	return 0;
> +}
> +
>  static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
>  {
>  	int ret = 0;
> @@ -865,7 +883,7 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
>  	return 0;
>  }
>  
> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu, struct perf_record_aux *aux)
>  {
>  	int ret;
>  	if (etm->queues.new_data) {
> @@ -875,6 +893,14 @@ static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
>  			return ret;
>  	}
>  
> +	/* In timeless mode, cpu is set to -1, and a single aux buffer is filled */
> +	if (cpu < 0)
> +		cpu = 0;
> +
> +	ret = cs_etm__save_aux_record(etm->queues.queue_array[cpu].priv, aux);
> +	if (ret)
> +		return ret;
> +
>  	if (!etm->timeless_decoding)
>  		return cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
>  	else
> @@ -2357,7 +2383,7 @@ static int cs_etm__process_event(struct perf_session *session,
>  
>  	if ((timestamp || etm->timeless_decoding)
>  			&& event->header.type == PERF_RECORD_AUX) {
> -		err = cs_etm__update_queues(etm, sample->cpu);
> +		err = cs_etm__update_queues(etm, sample->cpu, &event->aux);
>  		if (err)
>  			return err;
>  	}
> -- 
> 2.28.0
>
James Clark March 1, 2021, 3:43 p.m. UTC | #2
On 27/02/2021 09:10, Leo Yan wrote:
> On Fri, Feb 12, 2021 at 04:45:09PM +0200, James Clark wrote:
>> The aux records will be used set the bounds of decoding in a
>> later commit. In the future we may also want to use the flags
>> of each record to control decoding.
>>
>> Do these need to be saved in their entirety, or can pointers
>> to each record safely be saved instead for later access?
> 
> Rather than introudcing the perf record list, I just wander if we can
> use easier method to fix this problem.  So below is the rough idea
> (though I don't really verify it):
> 
> The essential information we need is what's the valid buffer length can
> be used for decoding.  Though cs_etm_queue::buf_len tracks the buffer
> length, but it's the buffer length is for the whole AUX buffer, and
> which belongs to multiple "PERF_RECORD_AUX" events.  So we cannot decode
> at once for the whole trace data in the AUX trace buffer, on the other
> hand, the incoming "PERF_RECORD_AUX" event can guide the CoreSight
> decoder it should decode how much buffer size.  At the end, the trace
> data can be decoded step by step based on the incoming "PERF_RECORD_AUX"
> events.
> 
> I'd like to propose to add a new field "cs_etm_queue::buf_rec_len", it
> stands for the record length based on the RECORD_AUX event.  In
> theory, this value should be always less than "cs_etm_queue::buf_len".
> 
> When every time the "PERF_RECORD_AUX" event is coming, we find out the
> corresponding queue (so this can be applied for "1:1" or "N:1" models
> for source and sink), and accumulate "perf_record_aux::aux_size" into
> "cs_etm_queue::buf_rec_len".
> 
> At the decoder side, it decreases "etmq->buf_rec_len" until to zero for
> the current round of decoding (see cs_etm__decode_data_block()).  Since
> all the "PERF_RECORD_AUX" event will be processed before
> "PERF_RECORD_EXIT" event, so we don't worry the tail trace data will be
> ignored.
> 
> The main reason for this suggestion is it don't need to change the
> significant logic in current code.  I will try to do experiment for this
> idea and share back.
> 
> James, if you think I miss anything, please correct me as needed.
> Thanks!
> 

This is an interesting idea, I think we could push decoded packets into the
min heap as the aux records are received, and not do anything with them until
the end of the data is reached. That way instead of saving aux records, we'd
save the result of the decode for each aux record.

Currently each cs_etm_queue has a cs_etm_traceid_queue/cs_etm_packet_queue for each
stream, but that would have to be changed to have multiple ones because multiple
packets could be decoded to get through the whole aux record.

It would be a similarly sized change, and could also have a bigger impact on
memory. So I'm not sure if it would help to reduce the changes, but it is possible.

James

> Leo
> 
>> Signed-off-by: James Clark <james.clark@arm.com>
>> ---
>>  tools/perf/util/cs-etm.c | 32 +++++++++++++++++++++++++++++---
>>  1 file changed, 29 insertions(+), 3 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index 8f8b448632fb..88b541b2a804 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -92,12 +92,16 @@ struct cs_etm_queue {
>>  	/* Conversion between traceID and index in traceid_queues array */
>>  	struct intlist *traceid_queues_list;
>>  	struct cs_etm_traceid_queue **traceid_queues;
>> +	int aux_record_list_len;
>> +	int aux_record_list_idx;
>> +	struct perf_record_aux *aux_record_list;
>>  };
>>  
>>  /* RB tree for quick conversion between traceID and metadata pointers */
>>  static struct intlist *traceid_list;
>>  
>> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
>> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
>> +				 struct perf_record_aux *aux_record);
>>  static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
>>  static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
>>  					   pid_t tid);
>> @@ -585,6 +589,7 @@ static void cs_etm__free_queue(void *priv)
>>  
>>  	cs_etm_decoder__free(etmq->decoder);
>>  	cs_etm__free_traceid_queues(etmq);
>> +	free(etmq->aux_record_list);
>>  	free(etmq);
>>  }
>>  
>> @@ -759,6 +764,19 @@ static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm)
>>  	return NULL;
>>  }
>>  
>> +static int cs_etm__save_aux_record(struct cs_etm_queue *etmq,
>> +				   struct perf_record_aux *aux_record)
>> +{
>> +	etmq->aux_record_list = reallocarray(etmq->aux_record_list,
>> +					      etmq->aux_record_list_len+1,
>> +					      sizeof(*etmq->aux_record_list));
>> +	if (!etmq->aux_record_list)
>> +		return -ENOMEM;
>> +
>> +	etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
>> +	return 0;
>> +}
>> +
>>  static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
>>  {
>>  	int ret = 0;
>> @@ -865,7 +883,7 @@ static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
>>  	return 0;
>>  }
>>  
>> -static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
>> +static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu, struct perf_record_aux *aux)
>>  {
>>  	int ret;
>>  	if (etm->queues.new_data) {
>> @@ -875,6 +893,14 @@ static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
>>  			return ret;
>>  	}
>>  
>> +	/* In timeless mode, cpu is set to -1, and a single aux buffer is filled */
>> +	if (cpu < 0)
>> +		cpu = 0;
>> +
>> +	ret = cs_etm__save_aux_record(etm->queues.queue_array[cpu].priv, aux);
>> +	if (ret)
>> +		return ret;
>> +
>>  	if (!etm->timeless_decoding)
>>  		return cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
>>  	else
>> @@ -2357,7 +2383,7 @@ static int cs_etm__process_event(struct perf_session *session,
>>  
>>  	if ((timestamp || etm->timeless_decoding)
>>  			&& event->header.type == PERF_RECORD_AUX) {
>> -		err = cs_etm__update_queues(etm, sample->cpu);
>> +		err = cs_etm__update_queues(etm, sample->cpu, &event->aux);
>>  		if (err)
>>  			return err;
>>  	}
>> -- 
>> 2.28.0
>>
Leo Yan March 2, 2021, 12:03 p.m. UTC | #3
On Mon, Mar 01, 2021 at 05:43:43PM +0200, James Clark wrote:

[...]

> > I'd like to propose to add a new field "cs_etm_queue::buf_rec_len", it
> > stands for the record length based on the RECORD_AUX event.  In
> > theory, this value should be always less than "cs_etm_queue::buf_len".
> > 
> > When every time the "PERF_RECORD_AUX" event is coming, we find out the
> > corresponding queue (so this can be applied for "1:1" or "N:1" models
> > for source and sink), and accumulate "perf_record_aux::aux_size" into
> > "cs_etm_queue::buf_rec_len".
> > 
> > At the decoder side, it decreases "etmq->buf_rec_len" until to zero for
> > the current round of decoding (see cs_etm__decode_data_block()).  Since
> > all the "PERF_RECORD_AUX" event will be processed before
> > "PERF_RECORD_EXIT" event, so we don't worry the tail trace data will be
> > ignored.
> > 
> > The main reason for this suggestion is it don't need to change the
> > significant logic in current code.  I will try to do experiment for this
> > idea and share back.
> > 
> > James, if you think I miss anything, please correct me as needed.
> > Thanks!
> > 
> 
> This is an interesting idea, I think we could push decoded packets into the
> min heap as the aux records are received, and not do anything with them until
> the end of the data is reached. That way instead of saving aux records, we'd
> save the result of the decode for each aux record.
> 
> Currently each cs_etm_queue has a cs_etm_traceid_queue/cs_etm_packet_queue for each
> stream, but that would have to be changed to have multiple ones because multiple
> packets could be decoded to get through the whole aux record.
> 
> It would be a similarly sized change, and could also have a bigger impact on
> memory. So I'm not sure if it would help to reduce the changes, but it is possible.

Below change is still very coarse and I just did very basic testing for
it, so didn't cover all cases; so simply use it to demonstrate the basic
idea.

Before the event PERF_RECORD_AUX arrives, we don't decode any trace
data.  And after PERF_RECORD_AUX coming, the aux buffer size will be
accumulated into the queue, and decode the trace data for the queue
based on the accumulated buffer length.

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index b9c1d329a7f1..3bd5609b6de4 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -89,7 +89,7 @@ struct cs_etm_queue {
 	u8 pending_timestamp;
 	u64 offset;
 	const unsigned char *buf;
-	size_t buf_len, buf_used;
+	size_t aux_buf_len, buf_len, buf_used;
 	/* Conversion between traceID and index in traceid_queues array */
 	struct intlist *traceid_queues_list;
 	struct cs_etm_traceid_queue **traceid_queues;
@@ -1085,6 +1085,7 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
 		if (old_buffer)
 			auxtrace_buffer__drop_data(old_buffer);
 		etmq->buf_len = 0;
+		etmq->aux_buf_len = 0;
 		return 0;
 	}
 
@@ -2052,6 +2053,7 @@ static int cs_etm__decode_data_block(struct cs_etm_queue *etmq)
 	etmq->offset += processed;
 	etmq->buf_used += processed;
 	etmq->buf_len -= processed;
+	etmq->aux_buf_len -= processed;
 
 out:
 	return ret;
@@ -2177,7 +2179,7 @@ static int cs_etm__run_decoder(struct cs_etm_queue *etmq)
 			 */
 			err = cs_etm__process_traceid_queue(etmq, tidq);
 
-		} while (etmq->buf_len);
+		} while (etmq->aux_buf_len > 0);
 
 		if (err == 0)
 			/* Flush any remaining branch stack entries */
@@ -2216,6 +2218,27 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
 	return 0;
 }
 
+static void cs_etm__update_aux_buf_len(struct cs_etm_auxtrace *etm,
+				      struct perf_record_aux *aux)
+{
+	unsigned int cs_queue_nr, queue_nr;
+	struct auxtrace_queue *queue;
+	struct cs_etm_queue *etmq;
+
+	if (!etm->heap.heap_cnt)
+		return;
+
+	/* Take the entry at the top of the min heap */
+	cs_queue_nr = etm->heap.heap_array[0].queue_nr;
+	queue_nr = TO_QUEUE_NR(cs_queue_nr);
+	queue = &etm->queues.queue_array[queue_nr];
+	etmq = queue->priv;
+
+	etmq->aux_buf_len += aux->aux_size;
+	fprintf(stderr, "%s: aux_buf_len=%ld\n", __func__, etmq->aux_buf_len);
+	return;
+}
+
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm)
 {
 	int ret = 0;
@@ -2272,6 +2295,9 @@ static int cs_etm__process_queues(struct cs_etm_auxtrace *etm)
 		if (ret < 0)
 			goto out;
 
+		if (etmq->aux_buf_len <= 0)
+			goto out;
+
 		/*
 		 * No more auxtrace_buffers to process in this etmq, simply
 		 * move on to another entry in the auxtrace_heap.
@@ -2414,9 +2440,15 @@ static int cs_etm__process_event(struct perf_session *session,
 	else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
 		return cs_etm__process_switch_cpu_wide(etm, event);
 
+	fprintf(stderr, "%s: event->header.type=%d\n", __func__, event->header.type);
+
 	if (!etm->timeless_decoding &&
-	    event->header.type == PERF_RECORD_AUX)
+	    event->header.type == PERF_RECORD_AUX) {
+
+		fprintf(stderr, "%s: aux_size=%lld\n", __func__, event->aux.aux_size);
+		cs_etm__update_aux_buf_len(etm, &event->aux);
 		return cs_etm__process_queues(etm);
+	}
 
 	return 0;
 }
diff mbox series

Patch

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 8f8b448632fb..88b541b2a804 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -92,12 +92,16 @@  struct cs_etm_queue {
 	/* Conversion between traceID and index in traceid_queues array */
 	struct intlist *traceid_queues_list;
 	struct cs_etm_traceid_queue **traceid_queues;
+	int aux_record_list_len;
+	int aux_record_list_idx;
+	struct perf_record_aux *aux_record_list;
 };
 
 /* RB tree for quick conversion between traceID and metadata pointers */
 static struct intlist *traceid_list;
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu);
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu,
+				 struct perf_record_aux *aux_record);
 static int cs_etm__process_queues(struct cs_etm_auxtrace *etm);
 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
 					   pid_t tid);
@@ -585,6 +589,7 @@  static void cs_etm__free_queue(void *priv)
 
 	cs_etm_decoder__free(etmq->decoder);
 	cs_etm__free_traceid_queues(etmq);
+	free(etmq->aux_record_list);
 	free(etmq);
 }
 
@@ -759,6 +764,19 @@  static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm)
 	return NULL;
 }
 
+static int cs_etm__save_aux_record(struct cs_etm_queue *etmq,
+				   struct perf_record_aux *aux_record)
+{
+	etmq->aux_record_list = reallocarray(etmq->aux_record_list,
+					      etmq->aux_record_list_len+1,
+					      sizeof(*etmq->aux_record_list));
+	if (!etmq->aux_record_list)
+		return -ENOMEM;
+
+	etmq->aux_record_list[etmq->aux_record_list_len++] = *aux_record;
+	return 0;
+}
+
 static int cs_etm__search_first_timestamp(struct cs_etm_queue *etmq)
 {
 	int ret = 0;
@@ -865,7 +883,7 @@  static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
 	return 0;
 }
 
-static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu, struct perf_record_aux *aux)
 {
 	int ret;
 	if (etm->queues.new_data) {
@@ -875,6 +893,14 @@  static int cs_etm__update_queues(struct cs_etm_auxtrace *etm, int cpu)
 			return ret;
 	}
 
+	/* In timeless mode, cpu is set to -1, and a single aux buffer is filled */
+	if (cpu < 0)
+		cpu = 0;
+
+	ret = cs_etm__save_aux_record(etm->queues.queue_array[cpu].priv, aux);
+	if (ret)
+		return ret;
+
 	if (!etm->timeless_decoding)
 		return cs_etm__search_first_timestamp(etm->queues.queue_array[cpu].priv);
 	else
@@ -2357,7 +2383,7 @@  static int cs_etm__process_event(struct perf_session *session,
 
 	if ((timestamp || etm->timeless_decoding)
 			&& event->header.type == PERF_RECORD_AUX) {
-		err = cs_etm__update_queues(etm, sample->cpu);
+		err = cs_etm__update_queues(etm, sample->cpu, &event->aux);
 		if (err)
 			return err;
 	}