diff mbox series

[2/2] perf arm-spe: Parse more SPE fields and store source

Message ID 20220125192016.20538-3-alisaidi@amazon.com (mailing list archive)
State New, archived
Headers show
Series Allow perf scripts to process SPE raw data | expand

Commit Message

Ali Saidi Jan. 25, 2022, 7:20 p.m. UTC
Decode more SPE events and op types to allow for processing by perf
scripts. For example looking for branches which may indicate candidates
for conversion to a CSEL, store exclusives that are candidates for
conversion to LSE atomics and record the source information for memory
ops.

Signed-off-by: Ali Saidi <alisaidi@amazon.com>
---
 .../util/arm-spe-decoder/arm-spe-decoder.c     | 18 ++++++++++++++++++
 .../util/arm-spe-decoder/arm-spe-decoder.h     |  8 ++++++++
 2 files changed, 26 insertions(+)

Comments

German Gomez Jan. 28, 2022, 5:20 p.m. UTC | #1
Hi Ali,

On 25/01/2022 19:20, Ali Saidi wrote:
> Decode more SPE events and op types to allow for processing by perf
> scripts. For example looking for branches which may indicate candidates
> for conversion to a CSEL, store exclusives that are candidates for
> conversion to LSE atomics and record the source information for memory
> ops.
>
> Signed-off-by: Ali Saidi <alisaidi@amazon.com>
> ---
>  .../util/arm-spe-decoder/arm-spe-decoder.c     | 18 ++++++++++++++++++
>  .../util/arm-spe-decoder/arm-spe-decoder.h     |  8 ++++++++
>  2 files changed, 26 insertions(+)
>
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> index 5e390a1a79ab..177bac0f7128 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> @@ -191,6 +191,20 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
>  					decoder->record.op = ARM_SPE_ST;
>  				else
>  					decoder->record.op = ARM_SPE_LD;
> +				if (SPE_OP_PKT_IS_LDST_ATOMIC(payload)) {
> +					if (payload & SPE_OP_PKT_AT)
> +						decoder->record.op |= ARM_SPE_LDST_ATOMIC;

In "utils/arm-spe.c" we check "if (record->op == ARM_SPE_LD)" so this
ORing could break some of the generated samples.

> +					if (payload & SPE_OP_PKT_EXCL)
> +						decoder->record.op |= ARM_SPE_LDST_EXCL;
> +					if (payload & SPE_OP_PKT_AR)
> +						decoder->record.op |= ARM_SPE_LDST_ACQREL;
> +				}
> +			} else if (idx == SPE_OP_PKT_HDR_CLASS_BR_ERET) {
> +				decoder->record.op = ARM_SPE_BR;
> +				if (payload & SPE_OP_PKT_COND)
> +					decoder->record.op |= ARM_SPE_BR_COND;
> +				if (SPE_OP_PKT_IS_INDIRECT_BRANCH(payload))
> +					decoder->record.op |= ARM_SPE_BR_IND;
>  			}
>  			break;
>  		case ARM_SPE_EVENTS:
> @@ -218,8 +232,12 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
>  			if (payload & BIT(EV_MISPRED))
>  				decoder->record.type |= ARM_SPE_BRANCH_MISS;
>  
> +			if (payload & BIT(EV_NOT_TAKEN))
> +				decoder->record.type |= ARM_SPE_BR_NOT_TAKEN;
> +
>  			break;
>  		case ARM_SPE_DATA_SOURCE:
> +			decoder->record.source = payload;
>  			break;
>  		case ARM_SPE_BAD:
>  			break;
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> index 69b31084d6be..113e427afe99 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> @@ -22,11 +22,18 @@ enum arm_spe_sample_type {
>  	ARM_SPE_TLB_MISS	= 1 << 5,
>  	ARM_SPE_BRANCH_MISS	= 1 << 6,
>  	ARM_SPE_REMOTE_ACCESS	= 1 << 7,
> +	ARM_SPE_BR_NOT_TAKEN	= 1 << 8,

Can you rename it to ARM_SPE_BRANCH_NOT_TAKEN for consistency?

>  };
>  
>  enum arm_spe_op_type {
>  	ARM_SPE_LD		= 1 << 0,
>  	ARM_SPE_ST		= 1 << 1,
> +	ARM_SPE_LDST_EXCL	= 1 << 2,
> +	ARM_SPE_LDST_ATOMIC	= 1 << 3,
> +	ARM_SPE_LDST_ACQREL	= 1 << 4,
> +	ARM_SPE_BR		= 1 << 5,
> +	ARM_SPE_BR_COND		= 1 << 6,
> +	ARM_SPE_BR_IND		= 1 << 7,

I'm not sure if we should keep everything in one enum/bitmask. I'm also
looking at adding more of the data from the packets to the record, and
considering refactoring the record structure. I'll share here when I
have something.

Thanks,
German

>  };
>  
>  struct arm_spe_record {
> @@ -40,6 +47,7 @@ struct arm_spe_record {
>  	u64 virt_addr;
>  	u64 phys_addr;
>  	u64 context_id;
> +	u16 source;
>  };
>  
>  struct arm_spe_insn;
Ali Saidi Jan. 28, 2022, 9:02 p.m. UTC | #2
Hi German,

On 28/01/2022 19:20, German Gomez wrote:
>Hi Ali,
>
>On 25/01/2022 19:20, Ali Saidi wrote:
>> Decode more SPE events and op types to allow for processing by perf
>> scripts. For example looking for branches which may indicate candidates
>> for conversion to a CSEL, store exclusives that are candidates for
>> conversion to LSE atomics and record the source information for memory
>> ops.
>> [snip]
>> +				if (SPE_OP_PKT_IS_LDST_ATOMIC(payload)) {
>> +					if (payload & SPE_OP_PKT_AT)
>> +						decoder->record.op |= ARM_SPE_LDST_ATOMIC;
>
>In "utils/arm-spe.c" we check "if (record->op == ARM_SPE_LD)" so this
>ORing could break some of the generated samples.

Yep, you're correct. Interestingly I can only find one use of record->op using
equivalence instead of a logical and so perhaps it's best to fix this one use.

...
>> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
>> index 69b31084d6be..113e427afe99 100644
>> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
>> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
>> @@ -22,11 +22,18 @@ enum arm_spe_sample_type {
>>  	ARM_SPE_TLB_MISS	= 1 << 5,
>>  	ARM_SPE_BRANCH_MISS	= 1 << 6,
>>  	ARM_SPE_REMOTE_ACCESS	= 1 << 7,
>> +	ARM_SPE_BR_NOT_TAKEN	= 1 << 8,
>
>Can you rename it to ARM_SPE_BRANCH_NOT_TAKEN for consistency?

No problem

>
>>  };
>>  
>>  enum arm_spe_op_type {
>>  	ARM_SPE_LD		= 1 << 0,
>>  	ARM_SPE_ST		= 1 << 1,
>> +	ARM_SPE_LDST_EXCL	= 1 << 2,
>> +	ARM_SPE_LDST_ATOMIC	= 1 << 3,
>> +	ARM_SPE_LDST_ACQREL	= 1 << 4,
>> +	ARM_SPE_BR		= 1 << 5,
>> +	ARM_SPE_BR_COND		= 1 << 6,
>> +	ARM_SPE_BR_IND		= 1 << 7,
>
>I'm not sure if we should keep everything in one enum/bitmask. I'm also
>looking at adding more of the data from the packets to the record, and
>considering refactoring the record structure. I'll share here when I
>have something.

One straight forward way to do this would be to make it a u16 field that was 
SPE operation-type header and operation-type payload with some accessors instead
of trying to re-encode the operation type into a new format.

Thanks,
Ali
German Gomez Feb. 11, 2022, 4:31 p.m. UTC | #3
Hi Ali,

On 28/01/2022 21:02, Ali Saidi wrote:
> Hi German,
>
> On 28/01/2022 19:20, German Gomez wrote:
>> Hi Ali,
>>
>> [...]
>>>  };
>>>  
>>>  enum arm_spe_op_type {
>>>  	ARM_SPE_LD		= 1 << 0,
>>>  	ARM_SPE_ST		= 1 << 1,
>>> +	ARM_SPE_LDST_EXCL	= 1 << 2,
>>> +	ARM_SPE_LDST_ATOMIC	= 1 << 3,
>>> +	ARM_SPE_LDST_ACQREL	= 1 << 4,

Wondering if we can store this in perf_sample->flags. The values are
defined in "util/event.h" (PERF_IP_*). Maybe we can extend it to allow
doing "sample->flags = PERF_LDST_FLAG_LD | PERF_LDST_FLAG_ATOMIC" and
such.

@Leo do you think that could work?

>>> +	ARM_SPE_BR		= 1 << 5,
>>> +	ARM_SPE_BR_COND		= 1 << 6,
>>> +	ARM_SPE_BR_IND		= 1 << 7,

Seems like we can store BR_COND in the existing "branch-miss" event
(--itrace=b) with:

  sample->flags = PERF_IP_FLAG_BRANCH;
  sample->flags |= PERF_IP_FLAG_CONDITIONAL;
and/or
  sample->flags |= PERF_IP_FLAG_INDIRECT;

PERF_IP_FLAG_INDIRECT doesn't exist yet but we can probably add it.
Leo Yan Feb. 12, 2022, 4:19 a.m. UTC | #4
Hi German, Ali,

On Fri, Feb 11, 2022 at 04:31:40PM +0000, German Gomez wrote:
> Hi Ali,
> 
> On 28/01/2022 21:02, Ali Saidi wrote:
> > Hi German,
> >
> > On 28/01/2022 19:20, German Gomez wrote:
> >> Hi Ali,
> >>
> >> [...]
> >>>  };
> >>>  
> >>>  enum arm_spe_op_type {
> >>>  	ARM_SPE_LD		= 1 << 0,
> >>>  	ARM_SPE_ST		= 1 << 1,
> >>> +	ARM_SPE_LDST_EXCL	= 1 << 2,
> >>> +	ARM_SPE_LDST_ATOMIC	= 1 << 3,
> >>> +	ARM_SPE_LDST_ACQREL	= 1 << 4,
> 
> Wondering if we can store this in perf_sample->flags. The values are
> defined in "util/event.h" (PERF_IP_*). Maybe we can extend it to allow
> doing "sample->flags = PERF_LDST_FLAG_LD | PERF_LDST_FLAG_ATOMIC" and
> such.
> 
> @Leo do you think that could work?

Let's step back a bit and divide the decoding flow into two parts:
backend and frontend.

For the backend part, we decode the SPE hardware trace data and
generate the SPE record in the file
util/arm-spe-decoder/arm-spe-decoder.c.  As we want to support
complete operation types, we can extend arm_spe_op_type as below:

enum arm_spe_op_type {
        /* First level operation type */
	ARM_SPE_OP_OTHER        = 1 << 0,
	ARM_SPE_OP_LDST		= 1 << 1,
	ARM_SPE_OP_BRANCH_ERET  = 1 << 2,

        /* Second level operation type for OTHER */
        ARM_SPE_OP_SVE_OTHER    = 1 << 16,
        ARM_SPE_OP_SVE_FP       = 1 << 17,
        ARM_SPE_OP_SVE_PRED     = 1 << 18,

        /* Second level operation type for LDST */
        ARM_SPE_OP_LD           = 1 << 16,
        ARM_SPE_OP_ST           = 1 << 17,
        ARM_SPE_OP_ATOMIC       = 1 << 18,
        ARM_SPE_OP_EXCL         = 1 << 19,
        ARM_SPE_OP_AR           = 1 << 20,
        ARM_SPE_OP_SIMD_FP      = 1 << 21,
        ARM_SPE_OP_GP_REG       = 1 << 22,
        ARM_SPE_OP_UNSPEC_REG   = 1 << 23,
        ARM_SPE_OP_NV_SYSREG    = 1 << 24,
        ARM_SPE_OP_SVE_PRED     = 1 << 25,
        ARM_SPE_OP_SVE_SG       = 1 << 26,

        /* Second level operation type for BRANCH_ERET */
        ARM_SPE_OP_BR_COND      = 1 << 16,
        ARM_SPE_OP_BR_INDIRECT  = 1 << 17,
};

IIUC, Ali suggested to directly reuse packet format to express
operation type and don't need to redefine the operation types like
above structure arm_spe_op_type.  I personally bias to convert the raw
packet format to more readable format, a benefit is this would give
us more readable code.

For the frontend, we need to convert Arm SPE record to samples.
We can explore two fields: sample::flags and sample::data_src, for
load/store related operations, I perfer we can fill more complete
info in field sample::data_src and extend the definitions for
perf_mem_data_src; for branch operations, we can fill sample::flags.

So I am just wandering if we can set the field
sample::data_src::mem_lock for atomic operations, like:

    data_src.mem_op   = PERF_MEM_OP_LOAD;
    data_src.mem_lock = PERF_MEM_LOCK_ATOMIC;

The field "mem_lock" is only two bits, we can consider to extend the
structure with an extra filed "mem_lock_ext" if it cannot meet our
requirement.

> >>> +	ARM_SPE_BR		= 1 << 5,
> >>> +	ARM_SPE_BR_COND		= 1 << 6,
> >>> +	ARM_SPE_BR_IND		= 1 << 7,
> 
> Seems like we can store BR_COND in the existing "branch-miss" event
> (--itrace=b) with:
> 
>   sample->flags = PERF_IP_FLAG_BRANCH;
>   sample->flags |= PERF_IP_FLAG_CONDITIONAL;
> and/or
>   sample->flags |= PERF_IP_FLAG_INDIRECT;
> 
> PERF_IP_FLAG_INDIRECT doesn't exist yet but we can probably add it.

Yes, for branch samples, this makes sense for me.

Thanks,
Leo
German Gomez Feb. 21, 2022, 8:41 p.m. UTC | #5
Hi Leo, Ali,

On 12/02/2022 04:19, Leo Yan wrote:
> Hi German, Ali,
>
> On Fri, Feb 11, 2022 at 04:31:40PM +0000, German Gomez wrote:
>> Hi Ali,
>>
>> [...]
> Let's step back a bit and divide the decoding flow into two parts:
> backend and frontend.

Sorry for derailing the conversation.

(I made some additional comments on the generation of samples below)

> enum arm_spe_op_type {
>         /* First level operation type */
> 	ARM_SPE_OP_OTHER        = 1 << 0,
> 	ARM_SPE_OP_LDST		= 1 << 1,
> 	ARM_SPE_OP_BRANCH_ERET  = 1 << 2,
>
>         /* Second level operation type for OTHER */
>         ARM_SPE_OP_SVE_OTHER    = 1 << 16,
>         ARM_SPE_OP_SVE_FP       = 1 << 17,
>         ARM_SPE_OP_SVE_PRED     = 1 << 18,
>
>         /* Second level operation type for LDST */
>         ARM_SPE_OP_LD           = 1 << 16,
>         ARM_SPE_OP_ST           = 1 << 17,
>         ARM_SPE_OP_ATOMIC       = 1 << 18,
>         ARM_SPE_OP_EXCL         = 1 << 19,
>         ARM_SPE_OP_AR           = 1 << 20,
>         ARM_SPE_OP_SIMD_FP      = 1 << 21,
>         ARM_SPE_OP_GP_REG       = 1 << 22,
>         ARM_SPE_OP_UNSPEC_REG   = 1 << 23,
>         ARM_SPE_OP_NV_SYSREG    = 1 << 24,
>         ARM_SPE_OP_SVE_PRED     = 1 << 25,
>         ARM_SPE_OP_SVE_SG       = 1 << 26,
>
>         /* Second level operation type for BRANCH_ERET */
>         ARM_SPE_OP_BR_COND      = 1 << 16,
>         ARM_SPE_OP_BR_INDIRECT  = 1 << 17,
> };
>
> IIUC, Ali suggested to directly reuse packet format to express
> operation type and don't need to redefine the operation types like
> above structure arm_spe_op_type.  I personally bias to convert the raw
> packet format to more readable format, a benefit is this would give
> us more readable code.

I personally like this method as well

>
> For the frontend, we need to convert Arm SPE record to samples.
> We can explore two fields: sample::flags and sample::data_src, for
> load/store related operations, I perfer we can fill more complete
> info in field sample::data_src and extend the definitions for
> perf_mem_data_src; for branch operations, we can fill sample::flags.
>
> So I am just wandering if we can set the field
> sample::data_src::mem_lock for atomic operations, like:
>
>     data_src.mem_op   = PERF_MEM_OP_LOAD;
>     data_src.mem_lock = PERF_MEM_LOCK_ATOMIC;
>
> The field "mem_lock" is only two bits, we can consider to extend the
> structure with an extra filed "mem_lock_ext" if it cannot meet our
> requirement.

Indeed it makes more sense to use data_src as much as possible. Thanks
for pointing that out.

Some comments:

# ARM_SPE_OP_ATOMIC

  This might be a hack, but can we not represent it as both LD&SR as the
  atomic op would combine both?

  data_src.mem_op = PERF_MEM_OP_LOAD | PERF_MEM_OP_STORE;

# ARM_SPE_OP_EXCL (instructions ldxr/stxr)

  x86 doesn't seem to have similar instructions with similar semantics
  (please correct me if I'm wrong). For this arch, PERF_MEM_LOCK_LOCK
  probably suffices.

  PPC seems to have similar instructions to arm64 (lwarx/stwcx). I don't
  know if they also have instructions with same semantics as x86.

  I think it makes sense to have a PERF_MEM_LOCK_EXCL. If not, reusing
  PERF_MEM_LOCK_LOCK is the quicker alternative.

# ARM_SPE_OP_SVE_SG

  (I'm sorry if this is too far out of scope of the original patch. Let
  me know if you would prefer to discuss it on a separate channel)

  On a separate note, I'm also looking at incorporating some of the SVE
  bits in the perf samples.
 
  For this, do you think it makes sense to have two mem_* categories in
  perf_mem_data_src:

  mem_vector (2 bits)
    - simd
    - other (SVE in arm64)

  mem_src (1 bit)
    - sparse (scatter/gather loads/stores in SVE, as well as simd)

---
Thanks,
German

>>>>> +	ARM_SPE_BR		= 1 << 5,
>>>>> +	ARM_SPE_BR_COND		= 1 << 6,
>>>>> +	ARM_SPE_BR_IND		= 1 << 7,
>> Seems like we can store BR_COND in the existing "branch-miss" event
>> (--itrace=b) with:
>>
>> sample->flags = PERF_IP_FLAG_BRANCH;
>> sample->flags |= PERF_IP_FLAG_CONDITIONAL;
>> and/or
>> sample->flags |= PERF_IP_FLAG_INDIRECT;
>>
>> PERF_IP_FLAG_INDIRECT doesn't exist yet but we can probably add it.
> Yes, for branch samples, this makes sense for me.
>
> Thanks,
> Leo
German Gomez Feb. 25, 2022, 12:40 p.m. UTC | #6
On 22/02/2022 19:29, Ali Saidi wrote:
> Hi German & Yan,
>
> Sorry about the delay in responding.
>
>> Hi German, Ali,
>>
> [...]
>>>>>  };
>>>>>>  
>>>>>>  enum arm_spe_op_type {
>>>>>>  	ARM_SPE_LD		= 1 << 0,
>>>>>>  	ARM_SPE_ST		= 1 << 1,
>>>>>> +	ARM_SPE_LDST_EXCL	= 1 << 2,
>>>>>> +	ARM_SPE_LDST_ATOMIC	= 1 << 3,
>>>>>> +	ARM_SPE_LDST_ACQREL	= 1 << 4,
>>> Wondering if we can store this in perf_sample->flags. The values are
>>> defined in "util/event.h" (PERF_IP_*). Maybe we can extend it to allow
>>> doing "sample->flags = PERF_LDST_FLAG_LD | PERF_LDST_FLAG_ATOMIC" and
>>> such.
>>>
>>> @Leo do you think that could work?
>> Let's step back a bit and divide the decoding flow into two parts:
>> backend and frontend.
>>
>> For the backend part, we decode the SPE hardware trace data and
>> generate the SPE record in the file
>> util/arm-spe-decoder/arm-spe-decoder.c.  As we want to support
>> complete operation types, we can extend arm_spe_op_type as below:
>>
>> enum arm_spe_op_type {
>>        /* First level operation type */
>> 	ARM_SPE_OP_OTHER        = 1 << 0,
>> 	ARM_SPE_OP_LDST		= 1 << 1,
> [...]
>
> I'm OK with this approach, but perhaps instead the op type should
> just be the raw traces op-type and op-type-payload? Macros to decode
> this information are already present and extensively used in the text
> decoding of the packet. While it's a little bit harder than just picking
> a bit, the op_type is only used in a single place today outside of
> the existing textual script decoding and what would be this decoding.
> Do we forsee many more uses that would justify having to maintain

I wanted to include some of the sve/simd bits in the perf samples.

For that I would be using a few of these flags.

> the immediate format vs finding a way to unify arm_spe_pkt_desc_op_type
> to support both the text decoding and this?
>
> [...]
>> So I am just wandering if we can set the field
>> sample::data_src::mem_lock for atomic operations, like:
>>
>>    data_src.mem_op   = PERF_MEM_OP_LOAD;
>>    data_src.mem_lock = PERF_MEM_LOCK_ATOMIC;
>>
>> The field "mem_lock" is only two bits, we can consider to extend the
>> structure with an extra filed "mem_lock_ext" if it cannot meet our
>> requirement.
> These are for the LOCK instruction on x86. I don't know that we want to
> overload the meaning here. Minimally there is value in differentiating
> exclusives vs atomics.
>
>>>>>> +	ARM_SPE_BR		= 1 << 5,
>>>>>> +	ARM_SPE_BR_COND		= 1 << 6,
>>>>>> +	ARM_SPE_BR_IND		= 1 << 7,
>>> Seems like we can store BR_COND in the existing "branch-miss" event
>>> (--itrace=b) with:
>>>
>>> sample->flags = PERF_IP_FLAG_BRANCH;
>>> sample->flags |= PERF_IP_FLAG_CONDITIONAL;
>>> and/or
>>> sample->flags |= PERF_IP_FLAG_INDIRECT;
>>>
>>> PERF_IP_FLAG_INDIRECT doesn't exist yet but we can probably add it.
>> Yes, for branch samples, this makes sense for me.
> makes sense to me too.
>
> Ali
>
Leo Yan Feb. 27, 2022, 1:20 p.m. UTC | #7
On Mon, Feb 21, 2022 at 08:41:43PM +0000, German Gomez wrote:

[...]

> Some comments:
> 
> # ARM_SPE_OP_ATOMIC
> 
>   This might be a hack, but can we not represent it as both LD&SR as the
>   atomic op would combine both?
> 
>   data_src.mem_op = PERF_MEM_OP_LOAD | PERF_MEM_OP_STORE;

BTH, I don't understand well for this question, but let me explain a
bit:

We cannot use 'LOAD | STORE' to present the atomic operation.  Please
see Armv8 ARM section D10.2.7 Operation Type packet, it would give out
more details.  Atomic operation is an extra attribution for a load or
store operations, it could be an atomic load or store, or
load-acquire/store-release instructions, or
load-exclusive/store-exclusive instructions.

The function arm_spe_pkt_desc_op_type() in perf would also give more
info about atomic operations.

> # ARM_SPE_OP_EXCL (instructions ldxr/stxr)
> 
>   x86 doesn't seem to have similar instructions with similar semantics
>   (please correct me if I'm wrong). For this arch, PERF_MEM_LOCK_LOCK
>   probably suffices.
> 
>   PPC seems to have similar instructions to arm64 (lwarx/stwcx). I don't
>   know if they also have instructions with same semantics as x86.
> 
>   I think it makes sense to have a PERF_MEM_LOCK_EXCL. If not, reusing
>   PERF_MEM_LOCK_LOCK is the quicker alternative.

On Arm archs, I think OP_EXCL means Load-Exclusive and Store-Exclusive
instructions.  Different archs have different memory model and
different atomic instructions, e.g. on Armv7, we uses Load-Exclusive and
Store-Exclusive instructions for spinlock and on Armv8 we uses
Load-Acquire and Store-Release instructions for spinlock.

I have no any knowledge for x86 and PPC archs.  Seems to me, x86 uses
compare-and-swap instruction and PPC's lwarx/stwcx instructions "are
primitive, or simple, instructions used to perform a read-modify-write
operation to storage" [1].

So I personally think we can define PERF_MEM_LOCK_EXCL type for Arm
arches and fill into the field perf_mem_data_src::lock:

    data_src.lock = PERF_MEM_LOCK_EXCL;

... or we can consider to introduce a field perf_mem_data_src::atomic
and fill a new type PERF_MEM_ATOMIC_EXCL into this new field:

    data_src.atomic = PERF_MEM_ATOMIC_EXCL;

[1] https://www.ibm.com/docs/en/aix/7.2?topic=set-lwarx-load-word-reserve-indexed-instruction

> # ARM_SPE_OP_SVE_SG
> 
>   (I'm sorry if this is too far out of scope of the original patch. Let
>   me know if you would prefer to discuss it on a separate channel)
> 
>   On a separate note, I'm also looking at incorporating some of the SVE
>   bits in the perf samples.
>  
>   For this, do you think it makes sense to have two mem_* categories in
>   perf_mem_data_src:
> 
>   mem_vector (2 bits)
>     - simd
>     - other (SVE in arm64)

I think we can define below vector types:

    PERF_MEM_VECTOR_SIMD
    PERF_MEM_VECTOR_SVE

The tricky thing is "other"... Based on the description for "Operation
Type packet payload (Other)" in the Armv8 Arm, I think we even need to
add an extra operation type PERF_MEM_OP_OTHER and assign it to
data_src.mem_op field.

>   mem_src (1 bit)
>     - sparse (scatter/gather loads/stores in SVE, as well as simd)

How about the naming "mem_attr" for new field and define two
attributions:

    PERF_MEM_ATTR_SPARSE  -> Gather/Scatter operation
    PERF_MEM_ATTR_PRED    -> Predicated operation

Just remind, we cannot only approve within Arm related developers,
it's good to seek more wider review from other Arch developers when
you send new patch set.

Thanks,
Leo
Leo Yan Feb. 27, 2022, 1:54 p.m. UTC | #8
On Tue, Feb 22, 2022 at 07:29:43PM +0000, Ali Saidi wrote:

[...]

> >So I am just wandering if we can set the field
> >sample::data_src::mem_lock for atomic operations, like:
> >
> >    data_src.mem_op   = PERF_MEM_OP_LOAD;
> >    data_src.mem_lock = PERF_MEM_LOCK_ATOMIC;
> >
> >The field "mem_lock" is only two bits, we can consider to extend the
> >structure with an extra filed "mem_lock_ext" if it cannot meet our
> >requirement.
> 
> These are for the LOCK instruction on x86. I don't know that we want to
> overload the meaning here. Minimally there is value in differentiating
> exclusives vs atomics.

Good point.  Can we consider to add new filed data_src.atomic with
below types?

    PERF_MEM_ATOMIC_INST  -> Atomic operations
    PERF_MEM_ATOMIC_EXCL  -> Load-Exclusive/Store-Exclusive
    PERF_MEM_ATOMIC_ACQUIRE_RELEASE  ->Load-Acquire/Store-Release

Thanks,
Leo
German Gomez March 1, 2022, 10:54 a.m. UTC | #9
Hi Leo,

Thanks for taking the time, and sorry for the late reply

On 27/02/2022 13:20, Leo Yan wrote:
> On Mon, Feb 21, 2022 at 08:41:43PM +0000, German Gomez wrote:
>
> [...]
>
>> Some comments:
>>
>> # ARM_SPE_OP_ATOMIC
>>
>> This might be a hack, but can we not represent it as both LD&SR as the
>> atomic op would combine both?
>>
>> data_src.mem_op = PERF_MEM_OP_LOAD | PERF_MEM_OP_STORE;
> BTH, I don't understand well for this question, but let me explain a
> bit:
>
> We cannot use 'LOAD | STORE' to present the atomic operation.  Please
> see Armv8 ARM section D10.2.7 Operation Type packet, it would give out
> more details.  Atomic operation is an extra attribution for a load or
> store operations, it could be an atomic load or store, or
> load-acquire/store-release instructions, or
> load-exclusive/store-exclusive instructions.

I will check, thanks.

My thinking was that atomics perform some load-modify-store operation
hence why I suggested combining LOAD&STORE flags. But I admit didn't try
running the instructions myself so I didn't check the actual records.

> [...]
>
>> # ARM_SPE_OP_SVE_SG
>>
>> (I'm sorry if this is too far out of scope of the original patch. Let
>> me know if you would prefer to discuss it on a separate channel)
>>
>> On a separate note, I'm also looking at incorporating some of the SVE
>> bits in the perf samples.
>>
>> For this, do you think it makes sense to have two mem_* categories in
>> perf_mem_data_src:
>>
>> mem_vector (2 bits)
>> - simd
>> - other (SVE in arm64)
> I think we can define below vector types:
>
> PERF_MEM_VECTOR_SIMD
> PERF_MEM_VECTOR_SVE
>
> The tricky thing is "other"... Based on the description for "Operation
> Type packet payload (Other)" in the Armv8 Arm, I think we even need to
> add an extra operation type PERF_MEM_OP_OTHER and assign it to
> data_src.mem_op field.
>
>> mem_src (1 bit)
>> - sparse (scatter/gather loads/stores in SVE, as well as simd)
> How about the naming "mem_attr" for new field and define two
> attributions:
>
> PERF_MEM_ATTR_SPARSE  -> Gather/Scatter operation
> PERF_MEM_ATTR_PRED    -> Predicated operation
>
> Just remind, we cannot only approve within Arm related developers,
> it's good to seek more wider review from other Arch developers when
> you send new patch set.

Agree. On second thought, the mention of sve seems very arch-specific
for this...

Recently the idea of adding arch-specific flags to the branch entries
was mentioned in [1]. Perhaps we could suggest something similar for
this. Or leave simd/sve as a perf-tool-only feature for now.

[1] https://lore.kernel.org/all/Ygv4cmO%2Fzb3qO48q@robh.at.kernel.org/

>
> Thanks,
> Leo
diff mbox series

Patch

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 5e390a1a79ab..177bac0f7128 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -191,6 +191,20 @@  static int arm_spe_read_record(struct arm_spe_decoder *decoder)
 					decoder->record.op = ARM_SPE_ST;
 				else
 					decoder->record.op = ARM_SPE_LD;
+				if (SPE_OP_PKT_IS_LDST_ATOMIC(payload)) {
+					if (payload & SPE_OP_PKT_AT)
+						decoder->record.op |= ARM_SPE_LDST_ATOMIC;
+					if (payload & SPE_OP_PKT_EXCL)
+						decoder->record.op |= ARM_SPE_LDST_EXCL;
+					if (payload & SPE_OP_PKT_AR)
+						decoder->record.op |= ARM_SPE_LDST_ACQREL;
+				}
+			} else if (idx == SPE_OP_PKT_HDR_CLASS_BR_ERET) {
+				decoder->record.op = ARM_SPE_BR;
+				if (payload & SPE_OP_PKT_COND)
+					decoder->record.op |= ARM_SPE_BR_COND;
+				if (SPE_OP_PKT_IS_INDIRECT_BRANCH(payload))
+					decoder->record.op |= ARM_SPE_BR_IND;
 			}
 			break;
 		case ARM_SPE_EVENTS:
@@ -218,8 +232,12 @@  static int arm_spe_read_record(struct arm_spe_decoder *decoder)
 			if (payload & BIT(EV_MISPRED))
 				decoder->record.type |= ARM_SPE_BRANCH_MISS;
 
+			if (payload & BIT(EV_NOT_TAKEN))
+				decoder->record.type |= ARM_SPE_BR_NOT_TAKEN;
+
 			break;
 		case ARM_SPE_DATA_SOURCE:
+			decoder->record.source = payload;
 			break;
 		case ARM_SPE_BAD:
 			break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 69b31084d6be..113e427afe99 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -22,11 +22,18 @@  enum arm_spe_sample_type {
 	ARM_SPE_TLB_MISS	= 1 << 5,
 	ARM_SPE_BRANCH_MISS	= 1 << 6,
 	ARM_SPE_REMOTE_ACCESS	= 1 << 7,
+	ARM_SPE_BR_NOT_TAKEN	= 1 << 8,
 };
 
 enum arm_spe_op_type {
 	ARM_SPE_LD		= 1 << 0,
 	ARM_SPE_ST		= 1 << 1,
+	ARM_SPE_LDST_EXCL	= 1 << 2,
+	ARM_SPE_LDST_ATOMIC	= 1 << 3,
+	ARM_SPE_LDST_ACQREL	= 1 << 4,
+	ARM_SPE_BR		= 1 << 5,
+	ARM_SPE_BR_COND		= 1 << 6,
+	ARM_SPE_BR_IND		= 1 << 7,
 };
 
 struct arm_spe_record {
@@ -40,6 +47,7 @@  struct arm_spe_record {
 	u64 virt_addr;
 	u64 phys_addr;
 	u64 context_id;
+	u16 source;
 };
 
 struct arm_spe_insn;