diff mbox series

[v4,2/4] perf arm-spe: Use SPE data source for neoverse cores

Message ID 20220324183323.31414-3-alisaidi@amazon.com (mailing list archive)
State New, archived
Headers show
Series perf: arm-spe: Decode SPE source and use for perf c2c | expand

Commit Message

Ali Saidi March 24, 2022, 6:33 p.m. UTC
When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
the same encoding. I can't find encoding information for any other SPE
implementations to unify their choices with Arm's, so that is left for
future work.

This change populates the mem_lvl_num for Neoverse cores instead of the
deprecated mem_lvl namespace.

Signed-off-by: Ali Saidi <alisaidi@amazon.com>
Tested-by: German Gomez <german.gomez@arm.com>
Reviewed-by: German Gomez <german.gomez@arm.com>
---
 .../util/arm-spe-decoder/arm-spe-decoder.c    |   1 +
 .../util/arm-spe-decoder/arm-spe-decoder.h    |  12 ++
 tools/perf/util/arm-spe.c                     | 110 +++++++++++++++---
 3 files changed, 109 insertions(+), 14 deletions(-)

Comments

Leo Yan March 26, 2022, 1:47 p.m. UTC | #1
Hi Ali, German,

On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:

[...]

> +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> +						union perf_mem_data_src *data_src)
>  {
> -	union perf_mem_data_src	data_src = { 0 };
> +	/*
> +	 * Even though four levels of cache hierarchy are possible, no known
> +	 * production Neoverse systems currently include more than three levels
> +	 * so for the time being we assume three exist. If a production system
> +	 * is built with four the this function would have to be changed to
> +	 * detect the number of levels for reporting.
> +	 */
>  
> -	if (record->op == ARM_SPE_LD)
> -		data_src.mem_op = PERF_MEM_OP_LOAD;
> -	else
> -		data_src.mem_op = PERF_MEM_OP_STORE;

Firstly, apologize that I didn't give clear idea when Ali sent patch sets
v2 and v3.

IMHO, we need to consider two kinds of information which can guide us
for a reliable implementation.  The first thing is to summarize the data
source configuration for x86 PEBS, we can dive in more details for this
part; the second thing is we can refer to the AMBA architecture document
ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and
its sub section 'Suggested DataSource values', which would help us
much for mapping the cache topology to Arm SPE data source.

As a result, I summarized the data source configurations for PEBS and
Arm SPE Neoverse in the spreadsheet:
https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing

Please see below comments.

> +	switch (record->source) {
> +	case ARM_SPE_NV_L1D:
> +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> +		break;

I think we need to set the field 'mem_snoop' for L1 cache hit:

        data_src->mem_snoop = PERF_MEM_SNOOP_NONE;

For L1 cache hit, it doesn't involve snooping.

> +	case ARM_SPE_NV_L2:
> +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> +		break;

Ditto:

        data_src->mem_snoop = PERF_MEM_SNOOP_NONE;

> +	case ARM_SPE_NV_PEER_CORE:
> +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;

Peer core contains its local L1 cache, so I think we can set the
memory level L1 to indicate this case.

For this data source type and the types below, though they indicate
that snooping happens, it doesn't mean the data in the cache line
is in the 'modified' state.  If we set the flag PERF_MEM_SNOOP_HITM, I
personally think this will mislead users when reporting the result.

I prefer we set below fields for ARM_SPE_NV_PEER_CORE:

        data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1;
        data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
        data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;

> +		break;
> +	/*
> +	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
> +	 * transfer, so set SNOOP_HITM
> +	 */
> +	case ARM_SPE_NV_LCL_CLSTR:

For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in
the cluster level, it should happen in L2 cache:

        data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2;
        data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
        data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;

> +	case ARM_SPE_NV_PEER_CLSTR:
> +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> +		break;

This type can snoop from L1 or L2 cache in the peer cluster, so it
makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE.  But here
should use the snoop type PERF_MEM_SNOOP_HIT, so:

        data_src->mem_lvl = PERF_MEM_LVL_HIT
        data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
        data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;

> +	/*
> +	 * System cache is assumed to be L3
> +	 */
> +	case ARM_SPE_NV_SYS_CACHE:
> +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> +		break;

        data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3;
        data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
        data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;

> +	/*
> +	 * We don't know what level it hit in, except it came from the other
> +	 * socket
> +	 */
> +	case ARM_SPE_NV_REMOTE:
> +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> +		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> +		break;

The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen
in any cache levels in remote chip:

        data_src->mem_lvl = PERF_MEM_LVL_HIT;
        data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
        data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
        data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;

> +	case ARM_SPE_NV_DRAM:
> +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> +		break;

We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source:

        data_src->mem_lvl = PERF_MEM_LVL_HIT;
        data_src->mem_snoop = PERF_MEM_SNOOP_MISS;
        data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;

The rest of this patch looks good to me.

Thanks,
Leo

> +	default:
> +		break;
> +	}
> +}
>  
> +static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
> +						union perf_mem_data_src *data_src)
> +{
>  	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
> -		data_src.mem_lvl = PERF_MEM_LVL_L3;
> +		data_src->mem_lvl = PERF_MEM_LVL_L3;
>  
>  		if (record->type & ARM_SPE_LLC_MISS)
> -			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> +			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
>  		else
> -			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> +			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
>  	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
> -		data_src.mem_lvl = PERF_MEM_LVL_L1;
> +		data_src->mem_lvl = PERF_MEM_LVL_L1;
>  
>  		if (record->type & ARM_SPE_L1D_MISS)
> -			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> +			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
>  		else
> -			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> +			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
>  	}
>  
>  	if (record->type & ARM_SPE_REMOTE_ACCESS)
> -		data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
> +		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
> +}
> +
> +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
> +{
> +	union perf_mem_data_src	data_src = { 0 };
> +	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
> +
> +	if (record->op & ARM_SPE_LD)
> +		data_src.mem_op = PERF_MEM_OP_LOAD;
> +	else
> +		data_src.mem_op = PERF_MEM_OP_STORE;
> +
> +	if (is_neoverse)
> +		arm_spe__synth_data_source_neoverse(record, &data_src);
> +	else
> +		arm_spe__synth_data_source_generic(record, &data_src);
>  
>  	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
>  		data_src.mem_dtlb = PERF_MEM_TLB_WK;
> @@ -446,7 +525,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
>  	u64 data_src;
>  	int err;
>  
> -	data_src = arm_spe__synth_data_source(record);
> +	data_src = arm_spe__synth_data_source(record, spe->midr);
>  
>  	if (spe->sample_flc) {
>  		if (record->type & ARM_SPE_L1D_MISS) {
> @@ -1183,6 +1262,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
>  	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
>  	size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
>  	struct perf_record_time_conv *tc = &session->time_conv;
> +	const char *cpuid = perf_env__cpuid(session->evlist->env);
> +	u64 midr = strtol(cpuid, NULL, 16);
>  	struct arm_spe *spe;
>  	int err;
>  
> @@ -1202,6 +1283,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
>  	spe->machine = &session->machines.host; /* No kvm support */
>  	spe->auxtrace_type = auxtrace_info->type;
>  	spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
> +	spe->midr = midr;
>  
>  	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
>  
> -- 
> 2.32.0
>
Arnaldo Carvalho de Melo March 26, 2022, 1:52 p.m. UTC | #2
Em Sat, Mar 26, 2022 at 09:47:54PM +0800, Leo Yan escreveu:
> Hi Ali, German,
> 
> On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:
> 
> [...]
> 
> > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> > +						union perf_mem_data_src *data_src)
> >  {
> > -	union perf_mem_data_src	data_src = { 0 };
> > +	/*
> > +	 * Even though four levels of cache hierarchy are possible, no known
> > +	 * production Neoverse systems currently include more than three levels
> > +	 * so for the time being we assume three exist. If a production system
> > +	 * is built with four the this function would have to be changed to
> > +	 * detect the number of levels for reporting.
> > +	 */
> >  
> > -	if (record->op == ARM_SPE_LD)
> > -		data_src.mem_op = PERF_MEM_OP_LOAD;
> > -	else
> > -		data_src.mem_op = PERF_MEM_OP_STORE;
> 
> Firstly, apologize that I didn't give clear idea when Ali sent patch sets
> v2 and v3.

Ok, removing this as well.

Thanks for reviewing.

- Arnaldo
 
> IMHO, we need to consider two kinds of information which can guide us
> for a reliable implementation.  The first thing is to summarize the data
> source configuration for x86 PEBS, we can dive in more details for this
> part; the second thing is we can refer to the AMBA architecture document
> ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and
> its sub section 'Suggested DataSource values', which would help us
> much for mapping the cache topology to Arm SPE data source.
> 
> As a result, I summarized the data source configurations for PEBS and
> Arm SPE Neoverse in the spreadsheet:
> https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing
> 
> Please see below comments.
> 
> > +	switch (record->source) {
> > +	case ARM_SPE_NV_L1D:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > +		break;
> 
> I think we need to set the field 'mem_snoop' for L1 cache hit:
> 
>         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
> 
> For L1 cache hit, it doesn't involve snooping.
> 
> > +	case ARM_SPE_NV_L2:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> > +		break;
> 
> Ditto:
> 
>         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
> 
> > +	case ARM_SPE_NV_PEER_CORE:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> 
> Peer core contains its local L1 cache, so I think we can set the
> memory level L1 to indicate this case.
> 
> For this data source type and below types, though they indicate
> the snooping happens, but it doesn't mean the data in the cache line
> is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
> think this will mislead users when report the result.
> 
> I prefer we set below fields for ARM_SPE_NV_PEER_CORE:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> 
> > +		break;
> > +	/*
> > +	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
> > +	 * transfer, so set SNOOP_HITM
> > +	 */
> > +	case ARM_SPE_NV_LCL_CLSTR:
> 
> For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in
> the cluster level, it should happen in L2 cache:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> 
> > +	case ARM_SPE_NV_PEER_CLSTR:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > +		break;
> 
> This type can snoop from L1 or L2 cache in the peer cluster, so it
> makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE.  But here
> should use the snoop type PERF_MEM_SNOOP_HIT, so:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> 
> > +	/*
> > +	 * System cache is assumed to be L3
> > +	 */
> > +	case ARM_SPE_NV_SYS_CACHE:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> > +		break;
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> 
> > +	/*
> > +	 * We don't know what level it hit in, except it came from the other
> > +	 * socket
> > +	 */
> > +	case ARM_SPE_NV_REMOTE:
> > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > +		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> > +		break;
> 
> The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen
> in any cache levels in remote chip:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> 
> > +	case ARM_SPE_NV_DRAM:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> > +		break;
> 
> We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT;
>         data_src->mem_snoop = PERF_MEM_SNOOP_MISS;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> 
> The rest of this patch looks good to me.
> 
> Thanks,
> Leo
> 
> > +	default:
> > +		break;
> > +	}
> > +}
> >  
> > +static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
> > +						union perf_mem_data_src *data_src)
> > +{
> >  	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
> > -		data_src.mem_lvl = PERF_MEM_LVL_L3;
> > +		data_src->mem_lvl = PERF_MEM_LVL_L3;
> >  
> >  		if (record->type & ARM_SPE_LLC_MISS)
> > -			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> > +			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
> >  		else
> > -			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> > +			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
> >  	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
> > -		data_src.mem_lvl = PERF_MEM_LVL_L1;
> > +		data_src->mem_lvl = PERF_MEM_LVL_L1;
> >  
> >  		if (record->type & ARM_SPE_L1D_MISS)
> > -			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> > +			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
> >  		else
> > -			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> > +			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
> >  	}
> >  
> >  	if (record->type & ARM_SPE_REMOTE_ACCESS)
> > -		data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
> > +		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
> > +}
> > +
> > +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
> > +{
> > +	union perf_mem_data_src	data_src = { 0 };
> > +	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
> > +
> > +	if (record->op & ARM_SPE_LD)
> > +		data_src.mem_op = PERF_MEM_OP_LOAD;
> > +	else
> > +		data_src.mem_op = PERF_MEM_OP_STORE;
> > +
> > +	if (is_neoverse)
> > +		arm_spe__synth_data_source_neoverse(record, &data_src);
> > +	else
> > +		arm_spe__synth_data_source_generic(record, &data_src);
> >  
> >  	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
> >  		data_src.mem_dtlb = PERF_MEM_TLB_WK;
> > @@ -446,7 +525,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
> >  	u64 data_src;
> >  	int err;
> >  
> > -	data_src = arm_spe__synth_data_source(record);
> > +	data_src = arm_spe__synth_data_source(record, spe->midr);
> >  
> >  	if (spe->sample_flc) {
> >  		if (record->type & ARM_SPE_L1D_MISS) {
> > @@ -1183,6 +1262,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
> >  	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
> >  	size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
> >  	struct perf_record_time_conv *tc = &session->time_conv;
> > +	const char *cpuid = perf_env__cpuid(session->evlist->env);
> > +	u64 midr = strtol(cpuid, NULL, 16);
> >  	struct arm_spe *spe;
> >  	int err;
> >  
> > @@ -1202,6 +1283,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
> >  	spe->machine = &session->machines.host; /* No kvm support */
> >  	spe->auxtrace_type = auxtrace_info->type;
> >  	spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
> > +	spe->midr = midr;
> >  
> >  	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
> >  
> > -- 
> > 2.32.0
> >
Leo Yan March 26, 2022, 1:56 p.m. UTC | #3
On Sat, Mar 26, 2022 at 10:52:01AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Sat, Mar 26, 2022 at 09:47:54PM +0800, Leo Yan escreveu:
> > Hi Ali, German,
> > 
> > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:
> > 
> > [...]
> > 
> > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> > > +						union perf_mem_data_src *data_src)
> > >  {
> > > -	union perf_mem_data_src	data_src = { 0 };
> > > +	/*
> > > +	 * Even though four levels of cache hierarchy are possible, no known
> > > +	 * production Neoverse systems currently include more than three levels
> > > +	 * so for the time being we assume three exist. If a production system
> > > +	 * is built with four the this function would have to be changed to
> > > +	 * detect the number of levels for reporting.
> > > +	 */
> > >  
> > > -	if (record->op == ARM_SPE_LD)
> > > -		data_src.mem_op = PERF_MEM_OP_LOAD;
> > > -	else
> > > -		data_src.mem_op = PERF_MEM_OP_STORE;
> > 
> > Firstly, apologize that I didn't give clear idea when Ali sent patch sets
> > v2 and v3.
> 
> Ok, removing this as well.
> 
> Thanks for reviewing.

Thanks a lot, Arnaldo.  Yeah, it's good to give a bit more time to
dismiss the concerns in this patch.

Sorry again for the inconvenience.

Leo
Arnaldo Carvalho de Melo March 26, 2022, 2:04 p.m. UTC | #4
Em Sat, Mar 26, 2022 at 09:56:53PM +0800, Leo Yan escreveu:
> On Sat, Mar 26, 2022 at 10:52:01AM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Sat, Mar 26, 2022 at 09:47:54PM +0800, Leo Yan escreveu:
> > > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:
> > > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> > > > +						union perf_mem_data_src *data_src)
> > > >  {
> > > > -	union perf_mem_data_src	data_src = { 0 };
> > > > +	/*
> > > > +	 * Even though four levels of cache hierarchy are possible, no known
> > > > +	 * production Neoverse systems currently include more than three levels
> > > > +	 * so for the time being we assume three exist. If a production system
> > > > +	 * is built with four the this function would have to be changed to
> > > > +	 * detect the number of levels for reporting.
> > > > +	 */

> > > > -	if (record->op == ARM_SPE_LD)
> > > > -		data_src.mem_op = PERF_MEM_OP_LOAD;
> > > > -	else
> > > > -		data_src.mem_op = PERF_MEM_OP_STORE;

> > > Firstly, apologize that I didn't give clear idea when Ali sent patch sets
> > > v2 and v3.

> > Ok, removing this as well.

> > Thanks for reviewing.

> Thanks a lot, Arnaldo.  Yeah, it's good to give a bit more time to
> dismiss the concerns in this patch.

Sure, at least it was build tested on many distros/cross compilers and
this part is ok 8-)
 
> Sorry again for the inconvenience.

np.

- Arnaldo
Ali Saidi March 26, 2022, 7:43 p.m. UTC | #5
Hi Leo,
On Sat, 26 Mar 2022 21:47:54 +0800, Leo Yan wrote:
> Hi Ali, German,
> 
> On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:
> 
> [...]
> 
> > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> > +						union perf_mem_data_src *data_src)
> >  {
> > -	union perf_mem_data_src	data_src = { 0 };
> > +	/*
> > +	 * Even though four levels of cache hierarchy are possible, no known
> > +	 * production Neoverse systems currently include more than three levels
> > +	 * so for the time being we assume three exist. If a production system
> > +	 * is built with four the this function would have to be changed to
> > +	 * detect the number of levels for reporting.
> > +	 */
> >  
> > -	if (record->op == ARM_SPE_LD)
> > -		data_src.mem_op = PERF_MEM_OP_LOAD;
> > -	else
> > -		data_src.mem_op = PERF_MEM_OP_STORE;
> 
> Firstly, apologize that I didn't give clear idea when Ali sent patch sets
> v2 and v3.
> 
> IMHO, we need to consider two kinds of information which can guide us
> for a reliable implementation.  The first thing is to summarize the data
> source configuration for x86 PEBS, we can dive in more details for this
> part; the second thing is we can refer to the AMBA architecture document
> ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and
> its sub section 'Suggested DataSource values', which would help us
> much for mapping the cache topology to Arm SPE data source.
> 
> As a result, I summarized the data source configurations for PEBS and
> Arm SPE Neoverse in the spreadsheet:
> https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing

Thanks for putting this together and digging into the details, but you're making
assumptions in neoverse data sources about the core configurations that aren't
correct. The Neoverse cores all have integrated L1 and L2 caches, so if the
line is coming from a peer core we don't know which level it's actually coming
from.  Similarly, if it's coming from a local cluster, that could mean a cluster
L3, but it's not the L2. 


> Please see below comments.
> 
> > +	switch (record->source) {
> > +	case ARM_SPE_NV_L1D:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > +		break;
> 
> I think we need to set the field 'mem_snoop' for L1 cache hit:
> 
>         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
> 
> For L1 cache hit, it doesn't involve snooping.
I can't find a precise definition for SNOOP_NONE, but it seemed as though
this would be used for cases where a snoop could have occurred but didn't,
not for accesses that by definition don't snoop? I'm happy with either way;
perhaps I just read more into it. 

> > +	case ARM_SPE_NV_L2:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> > +		break;
> 
> Ditto:
> 
>         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
Same comment as above.

> > +	case ARM_SPE_NV_PEER_CORE:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> 
> Peer core contains its local L1 cache, so I think we can set the
> memory level L1 to indicate this case.
It could be either the L1 or the L2. All the neoverse cores have private L2
caches and we don't know. 

> For this data source type and below types, though they indicate
> the snooping happens, but it doesn't mean the data in the cache line
> is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
> think this will mislead users when report the result.

I'm of the opposite opinion. If the data wasn't modified, it will likely be
found in the lower-level shared cache and the transaction wouldn't require a
cache-to-cache transfer of the modified data, so the most common case when we
source a line out of another core's cache will be if it was "modifiable" in that
cache. 

> 
> I prefer we set below fields for ARM_SPE_NV_PEER_CORE:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> 
> > +		break;
> > +	/*
> > +	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
> > +	 * transfer, so set SNOOP_HITM
> > +	 */
> > +	case ARM_SPE_NV_LCL_CLSTR:
> 
> For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in
> the cluster level, it should happen in L2 cache:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;

We don't know if this is coming from the cluster cache, or the private L1 or L2
core caches. The description above about why we'll be transferring the line from
cache-to-cache applies here too. 

> > +	case ARM_SPE_NV_PEER_CLSTR:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > +		break;
> 
> This type can snoop from L1 or L2 cache in the peer cluster, so it
> makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE.  But here
> should use the snoop type PERF_MEM_SNOOP_HIT, so:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;

Given that we agreed to only focus on the three levels generally used by
the existing implementations LCL and PEER should be the same for now. 

> > +	/*
> > +	 * System cache is assumed to be L3
> > +	 */
> > +	case ARM_SPE_NV_SYS_CACHE:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> > +		break;
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;

I don't think we should set both the deprecated mem_lvl and the mem_lvl_num. 
If we're hitting in the unified L3 cache, we aren't actually snooping anything
which is why I didn't set mem_snoop here.

> > +	/*
> > +	 * We don't know what level it hit in, except it came from the other
> > +	 * socket
> > +	 */
> > +	case ARM_SPE_NV_REMOTE:
> > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > +		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> > +		break;
> 
> The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen
> in any cache levels in remote chip:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT;
>         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
>         data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;

Ok.

> 
> > +	case ARM_SPE_NV_DRAM:
> > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> > +		break;
> 
> We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source:
> 
>         data_src->mem_lvl = PERF_MEM_LVL_HIT;
>         data_src->mem_snoop = PERF_MEM_SNOOP_MISS;
>         data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> 

Ok.

Thanks,
Ali
Leo Yan March 27, 2022, 9:09 a.m. UTC | #6
On Sat, Mar 26, 2022 at 07:43:27PM +0000, Ali Saidi wrote:
> Hi Leo,
> On Sat, 26 Mar 2022 21:47:54 +0800, Leo Yan wrote:
> > Hi Ali, German,
> > 
> > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:
> > 
> > [...]
> > 
> > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> > > +						union perf_mem_data_src *data_src)
> > >  {
> > > -	union perf_mem_data_src	data_src = { 0 };
> > > +	/*
> > > +	 * Even though four levels of cache hierarchy are possible, no known
> > > +	 * production Neoverse systems currently include more than three levels
> > > +	 * so for the time being we assume three exist. If a production system
> > > +	 * is built with four the this function would have to be changed to
> > > +	 * detect the number of levels for reporting.
> > > +	 */
> > >  
> > > -	if (record->op == ARM_SPE_LD)
> > > -		data_src.mem_op = PERF_MEM_OP_LOAD;
> > > -	else
> > > -		data_src.mem_op = PERF_MEM_OP_STORE;
> > 
> > Firstly, apologize that I didn't give clear idea when Ali sent patch sets
> > v2 and v3.
> > 
> > IMHO, we need to consider two kinds of information which can guide us
> > for a reliable implementation.  The first thing is to summarize the data
> > source configuration for x86 PEBS, we can dive in more details for this
> > part; the second thing is we can refer to the AMBA architecture document
> > ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and
> > its sub section 'Suggested DataSource values', which would help us
> > much for mapping the cache topology to Arm SPE data source.
> > 
> > As a result, I summarized the data source configurations for PEBS and
> > Arm SPE Neoverse in the spreadsheet:
> > https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing
> 
> Thanks for putting this together and digging into the details, but you're making
> assumptions in neoverse data sources about the core configurations that aren't
> correct. The Neoverse cores have all have integrated L1 and L2 cache, so if the
> line is coming from a peer-core we don't know which level it's actually coming
> from.  Similarly, if it's coming from a local cluster, that could mean a cluster
> l3, but it's not the L2. 

I remembered before you have mentioned this, and yes, these concerns are
valid for me.  Please see below comments.

> > Please see below comments.
> > 
> > > +	switch (record->source) {
> > > +	case ARM_SPE_NV_L1D:
> > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > > +		break;
> > 
> > I think we need to set the field 'mem_snoop' for L1 cache hit:
> > 
> >         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
> > 
> > For L1 cache hit, it doesn't involve snooping.
>
> I can't find a precise definition for SNOOP_NONE, but it seemed as though
> this would be used for cases where a snoop could have occurred but didn't
> not for accesses that by definition don't snoop? I'm happy with either way,
> perhaps i just read more into it.

I have the same understanding with you that "this would be used for
cases where a snoop could have occurred but didn't not for accesses
that by definition don't snoop".

If we refer to PEBS's data source type 01H: "Minimal latency core cache
hit.  This request was satisfied by the L1 data cache" and x86 sets
SNOOP_NONE as the snoop type for this case.

If we connect with snooping protocol, let's use MOIS protocol as an
example (here simply use the MOIS protocol for discussion, but I
don't have any knowledge for implementation of CPUs actually), as
described in the wiki page [1], when a cache line is in Owned (O)
state or Shared (S) state, "a processor read (PrRd) does not generate
any snooped signal".  In these cases, I think we should use snoop type
PERF_MEM_SNOOP_NONE.

> > > +	case ARM_SPE_NV_L2:
> > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> > > +		break;
> > 
> > Ditto:
> > 
> >         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
> Same comment as above.
> 
> > > +	case ARM_SPE_NV_PEER_CORE:
> > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > 
> > Peer core contains its local L1 cache, so I think we can set the
> > memory level L1 to indicate this case.
> It could be either the L1 or the L2. All the neoverse cores have private L2
> caches and we don't know. 

How about setting both L1 and L2 cache together for this case?

Although 'L1 | L2' cannot tell the exact cache level, I think it's
better than using ANY_CACHE; at least this can help us to distinguish
from other data source types if we avoid using ANY_CACHE for all of
them.

> > For this data source type and below types, though they indicate
> > the snooping happens, but it doesn't mean the data in the cache line
> > is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
> > think this will mislead users when report the result.
> 
> I'm of the opposite opinion. If the data wasn't modified, it will likely be
> found in the lower-level shared cache and the transaction wouldn't require a
> cache-to-cache transfer of the modified data, so the most common case when we
> source a line out of another cores cache will be if it was "modifiable" in that
> cache. 

Let's still use MOSI protocol as example.  I think there have two
cases: on case is the peer cache line is in 'Shared' state and another
case is the peer cache line is in 'Owned' state.

Quotes the wiki page for these two cases:

"When the cache block is in the Shared (S) state and there is a
snooped bus read (BusRd) transaction, then the block stays in the same
state and generates no more transactions as all the cache blocks have
the same value including the main memory and it is only being read,
not written into."

"While in the Owner (O) state and there is a snooped read request
(BusRd), the block remains in the same state while flushing (Flush)
the data for the other processor to read from it."

It seems to me that it's reasonable to set the HITM flag when the snooping
happens while the cache line is in the Modified (M) state.

Again, my comment is based on the literal understanding; so please
correct if have any misunderstanding at here.

> > I prefer we set below fields for ARM_SPE_NV_PEER_CORE:
> > 
> >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1;
> >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > 
> > > +		break;
> > > +	/*
> > > +	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
> > > +	 * transfer, so set SNOOP_HITM
> > > +	 */
> > > +	case ARM_SPE_NV_LCL_CLSTR:
> > 
> > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in
> > the cluster level, it should happen in L2 cache:
> > 
> >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2;
> >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> 
> We don't know if this is coming from the cluster cache, or the private L1 or L2
> core caches. The description above about why we'll be transferring the line from
> cache-to-cache applies here too. 

I think a minor difference between PEER_CORE and LCL_CLSTR is:
if data source is PEER_CORE, the snooping happens on the peer core's
local cache (Core's L1 or Core's L2 when core contains L2); for the data
source LCL_CLSTR, the snooping occurs on the cluster level's cache
(cluster's L2 cache or cluster's L3 cache when cluster contains L3).

So can we set both 'L2 | L3' for LCL_CLSTR case?

> > > +	case ARM_SPE_NV_PEER_CLSTR:
> > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > > +		break;
> > 
> > This type can snoop from L1 or L2 cache in the peer cluster, so it
> > makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE.  But here
> > should use the snoop type PERF_MEM_SNOOP_HIT, so:
> > 
> >         data_src->mem_lvl = PERF_MEM_LVL_HIT
> >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> 
> Given that we agreed to only focus on the three levels generally used by
> the existing implementations LCL and PEER should be the same for now. 

For PEER_CLSTR, we don't know the snooping happening on CPU's private
cache or cluster's shared cache, this is why we should use ANY_CACHE
for cache level.

But LCL_CLSTR is different from PEER_CLSTR, LCL_CLSTR indicates the
snooping on the local cluster's share cache, we set 'L2 | L3' for it;
therefore, we can distinguish between these two cases.

> > > +	/*
> > > +	 * System cache is assumed to be L3
> > > +	 */
> > > +	case ARM_SPE_NV_SYS_CACHE:
> > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> > > +		break;
> > 
> >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3;
> >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> 
> I don't think we should set both the deprecated mem_lvl and the mem_lvl_num. 

Is this because the decoding flow has any specific requirement that
only can set mem_lvl_num and not set mem_lvl?

> If we're hitting in the unified L3 cache, we aren't actually snooping anything
> which is why I didn't set mem_snoop here.

I am a bit skeptical of this clarification.  If the system cache is in
the cache coherency domain, then snooping occurs on it; in other words,
if the system cache is connected with a bus (like CCI or CMN), and the bus
helps maintain data consistency, I prefer to set the SNOOP_HIT flag.

Could you confirm for this?

> > > +	/*
> > > +	 * We don't know what level it hit in, except it came from the other
> > > +	 * socket
> > > +	 */
> > > +	case ARM_SPE_NV_REMOTE:
> > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > +		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> > > +		break;
> > 
> > The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen
> > in any cache levels in remote chip:
> > 
> >         data_src->mem_lvl = PERF_MEM_LVL_HIT;
> >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> >         data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> 
> Ok.
> 
> > 
> > > +	case ARM_SPE_NV_DRAM:
> > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> > > +		break;
> > 
> > We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source:
> > 
> >         data_src->mem_lvl = PERF_MEM_LVL_HIT;
> >         data_src->mem_snoop = PERF_MEM_SNOOP_MISS;
> >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> > 
> 
> Ok.

Thanks a lot for your work!

Leo

[1] https://en.wikipedia.org/wiki/MOSI_protocol
Ali Saidi March 28, 2022, 3:08 a.m. UTC | #7
Hi Leo,

Thanks for the additional comments.

On Sun, 27 Mar 2022 07:09:19 +0000, Leo Yan wrote:
[snip]
> 
> > > Please see below comments.
> > > 
> > > > +	switch (record->source) {
> > > > +	case ARM_SPE_NV_L1D:
> > > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > > > +		break;
> > > 
> > > I think we need to set the field 'mem_snoop' for L1 cache hit:
> > > 
> > >         data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
> > > 
> > > For L1 cache hit, it doesn't involve snooping.
> >
> > I can't find a precise definition for SNOOP_NONE, but it seemed as though
> > this would be used for cases where a snoop could have occurred but didn't
> > not for accesses that by definition don't snoop? I'm happy with either way,
> > perhaps i just read more into it.
> 
> I have the same understanding with you that "this would be used for
> cases where a snoop could have occurred but didn't not for accesses
> that by definition don't snoop".
> 
> If we refer to PEBS's data source type 01H: "Minimal latency core cache
> hit.  This request was satisfied by the L1 data cache" and x86 sets
> SNOOP_NONE as the snoop type for this case.

I'm happy to set it if everyone believes it's the right thing.

[snip]
> > Same comment as above.
> > 
> > > > +	case ARM_SPE_NV_PEER_CORE:
> > > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > > 
> > > Peer core contains its local L1 cache, so I think we can set the
> > > memory level L1 to indicate this case.
> > It could be either the L1 or the L2. All the neoverse cores have private L2
> > caches and we don't know. 
> 
> How about set both L1 and L2 cache together for this case?
> 
> Although 'L1 | L2' cannot tell the exact cache level, I think it's
> better than use ANY_CACHE, at least this can help us to distinguish
> from other data source types if we avoid to use ANY_CACHE for all of
> them.

This seems much more confusing than the ambiguity of where the line came from
and is only possible with the deprecated mem_lvl encoding.  It will make
perf_mem__lvl_scnprintf() print the wrong thing and anyone who is trying to
attribute a line to a single cache will find that the sum of the number of hits
is greater than the number of accesses which also seems terribly confusing.

> 
> > > For this data source type and below types, though they indicate
> > > the snooping happens, but it doesn't mean the data in the cache line
> > > is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
> > > think this will mislead users when report the result.
> > 
> > I'm of the opposite opinion. If the data wasn't modified, it will likely be
> > found in the lower-level shared cache and the transaction wouldn't require a
> > cache-to-cache transfer of the modified data, so the most common case when we
> > source a line out of another cores cache will be if it was "modifiable" in that
> > cache. 
> 
> Let's still use MOSI protocol as example.  I think there have two
> cases: on case is the peer cache line is in 'Shared' state and another
> case is the peer cache line is in 'Owned' state.
> 
> Quotes the wiki page for these two cases:
> 
> "When the cache block is in the Shared (S) state and there is a
> snooped bus read (BusRd) transaction, then the block stays in the same
> state and generates no more transactions as all the cache blocks have
> the same value including the main memory and it is only being read,
> not written into."
> 
> "While in the Owner (O) state and there is a snooped read request
> (BusRd), the block remains in the same state while flushing (Flush)
> the data for the other processor to read from it."
> 
> Seems to me, it's reasonable to set HTIM flag when the snooping happens
> for the cache line line is in the Modified (M) state.
> 
> Again, my comment is based on the literal understanding; so please
> correct if have any misunderstanding at here.

Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except
where sharing patterns are detected," so if a line is shared among caches it
will likely also be in the SLC (and thus we'd get an L3 hit). 

If there is one copy of the cache line and that cache line needs to transition
from one core to another core, I don't believe it matters if it was truly
modified or not because the core already had permission to modify it and the
transaction is just as costly (ping ponging between core caches). This is the
one thing I really want to be able to uniquely identify as any cacheline doing
this that isn't a lock is a place where optimization is likely possible. 

> 
> > > I prefer we set below fields for ARM_SPE_NV_PEER_CORE:
> > > 
> > >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1;
> > >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> > >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > > 
> > > > +		break;
> > > > +	/*
> > > > +	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
> > > > +	 * transfer, so set SNOOP_HITM
> > > > +	 */
> > > > +	case ARM_SPE_NV_LCL_CLSTR:
> > > 
> > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in
> > > the cluster level, it should happen in L2 cache:
> > > 
> > >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2;
> > >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> > >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> > 
> > We don't know if this is coming from the cluster cache, or the private L1 or L2
> > core caches. The description above about why we'll be transferring the line from
> > cache-to-cache applies here too. 
> 
> I think a minor difference between PEER_CORE and LCL_CLSTR is:
> if data source is PEER_CORE, the snooping happens on the peer core's
> local cache (Core's L1 or Core's L2 when core contains L2); for the data
> source LCL_CLSTR, the snooping occurs on the cluster level's cache
> (cluster's L2 cache or cluster's L3 cache when cluster contains L3).
> 
> So can we set both 'L2 | L3' for LCL_CLSTR case?

Just as above, I believe setting two options will lead to more confusion.
Additionally, we agreed in the previous discussion that the system cache shall
be the L3, so i don't see how this helps alleviate any confusion. That said, the
systems I'm most familiar with never set LCL_CLSTR as a source. 


> > > > +	case ARM_SPE_NV_PEER_CLSTR:
> > > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > > > +		break;
> > > 
> > > This type can snoop from L1 or L2 cache in the peer cluster, so it
> > > makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE.  But here
> > > should use the snoop type PERF_MEM_SNOOP_HIT, so:
> > > 
> > >         data_src->mem_lvl = PERF_MEM_LVL_HIT
> > >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> > >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > 
> > Given that we agreed to only focus on the three levels generally used by
> > the existing implementations LCL and PEER should be the same for now. 
> 
> For PEER_CLSTR, we don't know the snooping happening on CPU's private
> cache or cluster's shared cache, this is why we should use ANY_CACHE
> for cache level.
> 
> But LCL_CLSTR is different from PEER_CLSTR, LCL_CLSTR indicates the
> snooping on the local cluster's share cache, we set 'L2 | L3' for it;
> therefore, we can distinguish between these two cases.

For the same reasons above, I believe we should only set a single value. 

> 
> > > > +	/*
> > > > +	 * System cache is assumed to be L3
> > > > +	 */
> > > > +	case ARM_SPE_NV_SYS_CACHE:
> > > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> > > > +		break;
> > > 
> > >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3;
> > >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> > >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> > 
> > I don't think we should set both the deprecated mem_lvl and the mem_lvl_num. 
> 
> Is this because the decoding flow has any specific requirement that
> only can set mem_lvl_num and not set mem_lvl?

As one example, perf_mem__lvl_scnprintf() breaks if both are set. 

> 
> > If we're hitting in the unified L3 cache, we aren't actually snooping anything
> > which is why I didn't set mem_snoop here.
> 
> I am a bit suspecious for the clarification.  If the system cache is in
> the cache conhernecy domain, then snooping occurs on it; in other words,
> if system cache is connected with a bus (like CCI or CMN), and the bus
> can help for data consistency, I prefer to set SNOOP_HIT flag.
> 
> Could you confirm for this?

Thinking about this more that seems reasonable. 

Thanks,
Ali
Leo Yan March 28, 2022, 1:05 p.m. UTC | #8
Hi Ali,

On Mon, Mar 28, 2022 at 03:08:05AM +0000, Ali Saidi wrote:

[...]

> > > > > +	case ARM_SPE_NV_PEER_CORE:
> > > > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > > > 
> > > > Peer core contains its local L1 cache, so I think we can set the
> > > > memory level L1 to indicate this case.
> > > It could be either the L1 or the L2. All the neoverse cores have private L2
> > > caches and we don't know. 
> > 
> > How about set both L1 and L2 cache together for this case?
> > 
> > Although 'L1 | L2' cannot tell the exact cache level, I think it's
> > better than use ANY_CACHE, at least this can help us to distinguish
> > from other data source types if we avoid to use ANY_CACHE for all of
> > them.
> 
> This seems much more confusing then the ambiguity of where the line came from
> and is only possible with the deprecated mem_lvl enconding.  It will make
> perf_mem__lvl_scnprintf() print the wrong thing and anyone who is trying to
> attribute a line to a single cache will find that the sum of the number of hits
> is greater than the number of accesses which also seems terribly confusing.

Understood.  I considered the potential issue for
perf_mem__lvl_scnprintf(); it actually supports printing multiple cache
levels for 'mem_lvl', composing them by joining with 'or'.  We might
need to extend it a bit for the other field, 'mem_lvlnum'.

Agreed that there would be an accuracy issue for the statistics; it's fine
for me to use ANY_CACHE in this patch set.

I still think we should consider extending the memory levels to
demonstrate a clear memory hierarchy on Arm archs.  I personally like the
definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE";
though these cache levels are not precise like L1/L2/L3 levels, they
map very well onto the cache topology on Arm archs and avoid
any confusion.  We could take this as an enhancement if you don't want
to hold up the current patch set's upstreaming.

> > > > For this data source type and below types, though they indicate
> > > > the snooping happens, but it doesn't mean the data in the cache line
> > > > is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
> > > > think this will mislead users when report the result.
> > > 
> > > I'm of the opposite opinion. If the data wasn't modified, it will likely be
> > > found in the lower-level shared cache and the transaction wouldn't require a
> > > cache-to-cache transfer of the modified data, so the most common case when we
> > > source a line out of another cores cache will be if it was "modifiable" in that
> > > cache. 
> > 
> > Let's still use MOSI protocol as example.  I think there have two
> > cases: on case is the peer cache line is in 'Shared' state and another
> > case is the peer cache line is in 'Owned' state.
> > 
> > Quotes the wiki page for these two cases:
> > 
> > "When the cache block is in the Shared (S) state and there is a
> > snooped bus read (BusRd) transaction, then the block stays in the same
> > state and generates no more transactions as all the cache blocks have
> > the same value including the main memory and it is only being read,
> > not written into."
> > 
> > "While in the Owner (O) state and there is a snooped read request
> > (BusRd), the block remains in the same state while flushing (Flush)
> > the data for the other processor to read from it."
> > 
> > Seems to me, it's reasonable to set HTIM flag when the snooping happens
> > for the cache line line is in the Modified (M) state.
> > 
> > Again, my comment is based on the literal understanding; so please
> > correct if have any misunderstanding at here.
> 
> Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except
> where sharing patterns are detected," so if a line is shared among caches it
> will likely also be in the SLC (and thus we'd get an L3 hit). 
> 
> If there is one copy of the cache line and that cache line needs to transition
> from one core to another core, I don't believe it matters if it was truly
> modified or not because the core already had permission to modify it and the
> transaction is just as costly (ping ponging between core caches). This is the
> one thing I really want to be able to uniquely identify as any cacheline doing
> this that isn't a lock is a place where optimization is likely possible. 

I understand your point that if a large number of transitions within
multiple cores hit the same cache line, it is very likely caused
by the cache line's Modified state, so we set the PERF_MEM_SNOOP_HITM flag
for easier reviewing.

Alternatively, I think it's good to pick up the patch series "perf c2c:
Sort cacheline with all loads" [1]: rather than relying on the HITM tag, the
patch series adds a new option "-d all" for perf c2c, so it displays
the suspicious false-sharing cache lines based on load/store ops and
thread info.  The main reason for holding off on that patch set is that we
could not verify it with Arm SPE at that point in time, since Arm
SPE trace data then lacked both store ops and data source packets.

I prefer to set the PERF_MEM_SNOOP_HIT flag in this patch set and we can
upstream the patch series "perf c2c: Sort cacheline with all loads"
(only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches
have been merged in the mainline kernel).

If this is fine for you, I can respin the patch series for "perf c2c".
Or any other thoughts?

[1] https://lore.kernel.org/lkml/20201213133850.10070-1-leo.yan@linaro.org/

> > > > I prefer we set below fields for ARM_SPE_NV_PEER_CORE:
> > > > 
> > > >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1;
> > > >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> > > >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> > > > 
> > > > > +		break;
> > > > > +	/*
> > > > > +	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
> > > > > +	 * transfer, so set SNOOP_HITM
> > > > > +	 */
> > > > > +	case ARM_SPE_NV_LCL_CLSTR:
> > > > 
> > > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in
> > > > the cluster level, it should happen in L2 cache:
> > > > 
> > > >         data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2;
> > > >         data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
> > > >         data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> > > 
> > > We don't know if this is coming from the cluster cache, or the private L1 or L2
> > > core caches. The description above about why we'll be transferring the line from
> > > cache-to-cache applies here too. 
> > 
> > I think a minor difference between PEER_CORE and LCL_CLSTR is:
> > if data source is PEER_CORE, the snooping happens on the peer core's
> > local cache (Core's L1 or Core's L2 when core contains L2); for the data
> > source LCL_CLSTR, the snooping occurs on the cluster level's cache
> > (cluster's L2 cache or cluster's L3 cache when cluster contains L3).
> > 
> > So can we set both 'L2 | L3' for LCL_CLSTR case?
> 
> Just as above, I believe setting two options will lead to more confusion.
> Additionally, we agreed in the previous discussion that the system cache shall
> be the L3, so i don't see how this helps alleviate any confusion. That said, the
> systems I'm most familiar with never set LCL_CLSTR as a source. 

Okay, let's rollback to PERF_MEM_LVLNUM_ANY_CACHE as cache level for
LCL_CLSTR and PEER_CLSTR.

Thanks,
Leo
Shuai Xue March 29, 2022, 1:34 p.m. UTC | #9
Hi Leo, Ali,

Thank you for your great work and valuable discussion.

在 2022/3/27 AM3:43, Ali Saidi 写道:> Hi Leo,
> On Sat, 26 Mar 2022 21:47:54 +0800, Leo Yan wrote:
>> Hi Ali, German,
>>
>> On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote:
>>
>> [...]
>>
>>> +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
>>> +						union perf_mem_data_src *data_src)
>>>  {
>>> -	union perf_mem_data_src	data_src = { 0 };
>>> +	/*
>>> +	 * Even though four levels of cache hierarchy are possible, no known
>>> +	 * production Neoverse systems currently include more than three levels
>>> +	 * so for the time being we assume three exist. If a production system
>>> +	 * is built with four the this function would have to be changed to
>>> +	 * detect the number of levels for reporting.
>>> +	 */
>>>
>>> -	if (record->op == ARM_SPE_LD)
>>> -		data_src.mem_op = PERF_MEM_OP_LOAD;
>>> -	else
>>> -		data_src.mem_op = PERF_MEM_OP_STORE;
>>
>> Firstly, apologize that I didn't give clear idea when Ali sent patch sets
>> v2 and v3.
>>
>> IMHO, we need to consider two kinds of information which can guide us
>> for a reliable implementation.  The first thing is to summarize the data
>> source configuration for x86 PEBS, we can dive in more details for this
>> part; the second thing is we can refer to the AMBA architecture document
>> ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and
>> its sub section 'Suggested DataSource values', which would help us
>> much for mapping the cache topology to Arm SPE data source.
>>
>> As a result, I summarized the data source configurations for PEBS and
>> Arm SPE Neoverse in the spreadsheet:
>> https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing
>
> Thanks for putting this together and digging into the details, but you're making
> assumptions in neoverse data sources about the core configurations that aren't
> correct. The Neoverse cores have all have integrated L1 and L2 cache, so if the
> line is coming from a peer-core we don't know which level it's actually coming
> from.  Similarly, if it's coming from a local cluster, that could mean a cluster
> l3, but it's not the L2.

As far as I know, the Neoverse N2 microarchitecture's L3 cache is non-inclusive, and L1
and L2 are strictly inclusive, like Intel Skylake SP (SKX), i.e., the L2 may
or may not be in the L3 (no guarantee is made). That is to say, we cannot tell
whether it is from the cluster L2 or L3. Could you confirm this?

[...]


> I still think we should consider to extend the memory levels to
> demonstrate clear momory hierarchy on Arm archs, I personally like the
> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE",
> though these cache levels are not precise like L1/L2/L3 levels, they can
> help us to map very well for the cache topology on Arm archs and without
> any confusion.  We could take this as an enhancement if you don't want
> to bother the current patch set's upstreaming.

Agree. In my opinion, imprecise cache levels can lead to wrong conclusions.
"PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE" are more intuitive.

Best Regards,
Shuai
Ali Saidi March 29, 2022, 2:32 p.m. UTC | #10
Hi Leo,

On Mon, 28 Mar 2022 21:05:47 +0800, Leo Yan wrote:
> Hi Ali,
> 
> On Mon, Mar 28, 2022 at 03:08:05AM +0000, Ali Saidi wrote:
> 
> [...]
> 
> > > > > > +	case ARM_SPE_NV_PEER_CORE:
> > > > > > +		data_src->mem_lvl = PERF_MEM_LVL_HIT;
> > > > > > +		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> > > > > > +		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> > > > > 
> > > > > Peer core contains its local L1 cache, so I think we can set the
> > > > > memory level L1 to indicate this case.
> > > > It could be either the L1 or the L2. All the neoverse cores have private L2
> > > > caches and we don't know. 
> > > 
> > > How about set both L1 and L2 cache together for this case?
> > > 
> > > Although 'L1 | L2' cannot tell the exact cache level, I think it's
> > > better than use ANY_CACHE, at least this can help us to distinguish
> > > from other data source types if we avoid to use ANY_CACHE for all of
> > > them.
> > 
> > This seems much more confusing then the ambiguity of where the line came from
> > and is only possible with the deprecated mem_lvl enconding.  It will make
> > perf_mem__lvl_scnprintf() print the wrong thing and anyone who is trying to
> > attribute a line to a single cache will find that the sum of the number of hits
> > is greater than the number of accesses which also seems terribly confusing.
> 
> Understand.  I considered the potential issue for
> perf_mem__lvl_scnprintf(), actually it supports printing multipl cache
> levels for 'mem_lvl', by conjuncting with 'or' it composes the multiple
> cache levels.  We might need to extend a bit for another field
> 'mem_lvlnum'.
> 
> Agreed that there would have inaccurate issue for statistics, it's fine
> for me to use ANY_CACHE in this patch set.

Thanks!

> 
> I still think we should consider to extend the memory levels to
> demonstrate clear momory hierarchy on Arm archs, I personally like the
> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE",
> though these cache levels are not precise like L1/L2/L3 levels, they can
> help us to map very well for the cache topology on Arm archs and without
> any confusion.  We could take this as an enhancement if you don't want
> to bother the current patch set's upstreaming.

I'd like to do this in a separate patch, but I have one other proposal. The
Neoverse cores' L2 is strictly inclusive of the L1, so even if a line is in the L1,
it's also in the L2. Given that the Graviton systems and, afaik, the Ampere
systems don't have any cache between the L2 and the SLC, anything from
PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2 — perhaps we
should just set L2 for these cases? German, are you good with this for now? 

> > > > > For this data source type and below types, though they indicate
> > > > > the snooping happens, but it doesn't mean the data in the cache line
> > > > > is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
> > > > > think this will mislead users when report the result.
> > > > 
> > > > I'm of the opposite opinion. If the data wasn't modified, it will likely be
> > > > found in the lower-level shared cache and the transaction wouldn't require a
> > > > cache-to-cache transfer of the modified data, so the most common case when we
> > > > source a line out of another cores cache will be if it was "modifiable" in that
> > > > cache. 
> > > 
> > > Let's still use MOSI protocol as example.  I think there have two
> > > cases: on case is the peer cache line is in 'Shared' state and another
> > > case is the peer cache line is in 'Owned' state.
> > > 
> > > Quotes the wiki page for these two cases:
> > > 
> > > "When the cache block is in the Shared (S) state and there is a
> > > snooped bus read (BusRd) transaction, then the block stays in the same
> > > state and generates no more transactions as all the cache blocks have
> > > the same value including the main memory and it is only being read,
> > > not written into."
> > > 
> > > "While in the Owner (O) state and there is a snooped read request
> > > (BusRd), the block remains in the same state while flushing (Flush)
> > > the data for the other processor to read from it."
> > > 
> > > Seems to me, it's reasonable to set HTIM flag when the snooping happens
> > > for the cache line line is in the Modified (M) state.
> > > 
> > > Again, my comment is based on the literal understanding; so please
> > > correct if have any misunderstanding at here.
> > 
> > Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except
> > where sharing patterns are detected," so if a line is shared among caches it
> > will likely also be in the SLC (and thus we'd get an L3 hit). 
> > 
> > If there is one copy of the cache line and that cache line needs to transition
> > from one core to another core, I don't believe it matters if it was truly
> > modified or not because the core already had permission to modify it and the
> > transaction is just as costly (ping ponging between core caches). This is the
> > one thing I really want to be able to uniquely identify as any cacheline doing
> > this that isn't a lock is a place where optimization is likely possible. 
> 
> I understood that your point that if big amount of transitions within
> multiple cores hit the same cache line, it would be very likely caused
> by the cache line's Modified state so we set PERF_MEM_SNOOP_HITM flag
> for easier reviewing.

And that from a dataflow perspective if the line is owned (and could be
modifiable) vs. was actually modified is really the less interesting bit. 

> Alternatively, I think it's good to pick up the patch series "perf c2c:
> Sort cacheline with all loads" [1], rather than relying on HITM tag, the
> patch series extends a new option "-d all" for perf c2c, so it displays
> the suspecious false sharing cache lines based on load/store ops and
> thread infos.  The main reason for holding on th patch set is due to we
> cannot verify it with Arm SPE at that time point, as the time being Arm
> SPE trace data was absent both store ops and data source packets.

Looking at examples, at least from my system, data-source isn't set for
stores, only for loads. 

> I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can
> upstream the patch series "perf c2c: Sort cacheline with all loads"
> (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches
> have been merged in the mainline kernel).
> 
> If this is fine for you, I can respin the patch series for "perf c2c".
> Or any other thoughts?

I think this is a nice option to have in the tool-box, but from my point of
view, I'd like someone who is familiar with c2c output on x86 to come to an
arm64 system and be able to zero in on a ping-ponging line like they would
otherwise. Highlighting a line that is moving between cores frequently which is
likely in the exclusive state by tagging it an HITM accomplishes this and will
make it easier to find these cases.  Your approach also has inaccuracies and
wouldn't be able to differentiate between core X accessing a line a lot followed
by core Y accessing a line a lot vs the cores ping-ponging.  Yes, I agree that we
will "overcount" HITM, but I don't think this is particularly bad and it does
specifically highlight the core-2-core transfers that are likely a performance
issue easily and it will result in easier identification of areas of false or
true sharing and improve performance.

Thanks,
Ali
Leo Yan March 31, 2022, 12:19 p.m. UTC | #11
Hi Ali,

On Tue, Mar 29, 2022 at 02:32:14PM +0000, Ali Saidi wrote:

[...]

> > I still think we should consider to extend the memory levels to
> > demonstrate clear momory hierarchy on Arm archs, I personally like the
> > definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE",
> > though these cache levels are not precise like L1/L2/L3 levels, they can
> > help us to map very well for the cache topology on Arm archs and without
> > any confusion.  We could take this as an enhancement if you don't want
> > to bother the current patch set's upstreaming.
> 
> I'd like to do this in a separate patch, but I have one other proposal. The
> Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1,
> it's also in the L2. Given that the Graviton systems and afaik the Ampere
> systems don't have any cache between the L2 and the SLC, thus anything from
> PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we
> should just set L2 for these cases? German, are you good with this for now? 

If we use a single cache level (no matter whether it's L2 or ANY_CACHE) for
these data sources, it's hard for users to understand what's the cost
for the memory operations.  So here I suggested for these new cache
levels is not only about cache level, it's more about the information
telling the memory operation's cost.

[...]

> > Alternatively, I think it's good to pick up the patch series "perf c2c:
> > Sort cacheline with all loads" [1], rather than relying on HITM tag, the
> > patch series extends a new option "-d all" for perf c2c, so it displays
> > the suspecious false sharing cache lines based on load/store ops and
> > thread infos.  The main reason for holding on th patch set is due to we
> > cannot verify it with Arm SPE at that time point, as the time being Arm
> > SPE trace data was absent both store ops and data source packets.
> 
> Looking at examples I don't, at least from my system, data-source isn't set for
> stores, only for loads.

Ouch ...  If data source is not set for store operations, then all store
samples will lack cache level info.  Or should we set ANY_CACHE as
cache level for store operations?

> > I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can
> > upstream the patch series "perf c2c: Sort cacheline with all loads"
> > (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches
> > have been merged in the mainline kernel).
> > 
> > If this is fine for you, I can respin the patch series for "perf c2c".
> > Or any other thoughts?
> 
> I think this is a nice option to have in the tool-box, but from my point of
> view, I'd like someone who is familiar with c2c output on x86 to come to an
> arm64 system and be able to zero in on a ping-ponging line like they would
> otherwise. Highlighting a line that is moving between cores frequently which is
> likely in the exclusive state by tagging it an HITM accomplishes this and will
> make it easier to find these cases.  Your approach also has innaccurancies and
> wouldn't be able to differentiate between core X accessing a line a lot followed
> by core Y acessing a line alot vs the cores ping-ponging.  Yes, I agree that we
> will "overcount" HITM, but I don't think this is particularly bad and it does
> specifically highlight the core-2-core transfers that are likely a performance
> issue easily and it will result in easier identification of areas of false or
> true sharing and improve performance.

I don't want to block this patch set by this part, and either I don't
want to introduce any confusion for later users, especially I think
users who later use this tool but it's hard for them to be aware of any
assumptions in this discussion thread.  So two options would be fine
for me:

Option 1: if you and Arm mates can confirm that inaccuracy caused by
setting HITM is low (e.g. 2%-3% inaccuracy that introduced by directly
set HITM), I think this could be acceptable.  Otherwise, please
consider option 2.

Option 2: by default we set PERF_MEM_SNOOP_HIT flag since now actually
we have no info to support HITM.  Then use a new patch to add an extra
option (say '--coarse-hitm') for 'perf c2c' tool, a user can explicitly
specify this option for 'perf c2c' command; when a user specifies this
option it means that the user understands and accepts inaccuracy by
forcing to use PERF_MEM_SNOOP_HITM flag.  I think you could refer to
the option '--stitch-lbr' for adding an option for 'perf c2c' tool.

Thanks,
Leo
German Gomez March 31, 2022, 12:28 p.m. UTC | #12
Hi all,

It seems I gave the Review tags a bit too early this time. Apologies for
the inconvenience. Indeed there was more interesting discussions to be
had :)

(Probably best to remove by tags for the next re-spin)

On 29/03/2022 15:32, Ali Saidi wrote:
> [...]
>
>> I still think we should consider to extend the memory levels to
>> demonstrate clear momory hierarchy on Arm archs, I personally like the
>> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE",
>> though these cache levels are not precise like L1/L2/L3 levels, they can
>> help us to map very well for the cache topology on Arm archs and without
>> any confusion.  We could take this as an enhancement if you don't want
>> to bother the current patch set's upstreaming.
> I'd like to do this in a separate patch, but I have one other proposal. The
> Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1,
> it's also in the L2. Given that the Graviton systems and afaik the Ampere
> systems don't have any cache between the L2 and the SLC, thus anything from
> PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we
> should just set L2 for these cases? German, are you good with this for now? 

Sorry for the delay. I'd like to also check this with someone. I'll try
to get back asap. In the meantime, if this approach is also OK with Leo,
I think it would be fine by me.

Thanks,
German

>>>>>> For this data source type and below types, though they indicate
>>>>>> the snooping happens, but it doesn't mean the data in the cache line
>>>>>> is in 'modified' state.  If set flag PERF_MEM_SNOOP_HITM, I personally
>>>>>> think this will mislead users when report the result.
>>>>> I'm of the opposite opinion. If the data wasn't modified, it will likely be
>>>>> found in the lower-level shared cache and the transaction wouldn't require a
>>>>> cache-to-cache transfer of the modified data, so the most common case when we
>>>>> source a line out of another cores cache will be if it was "modifiable" in that
>>>>> cache. 
>>>> Let's still use MOSI protocol as example.  I think there have two
>>>> cases: on case is the peer cache line is in 'Shared' state and another
>>>> case is the peer cache line is in 'Owned' state.
>>>>
>>>> Quotes the wiki page for these two cases:
>>>>
>>>> "When the cache block is in the Shared (S) state and there is a
>>>> snooped bus read (BusRd) transaction, then the block stays in the same
>>>> state and generates no more transactions as all the cache blocks have
>>>> the same value including the main memory and it is only being read,
>>>> not written into."
>>>>
>>>> "While in the Owner (O) state and there is a snooped read request
>>>> (BusRd), the block remains in the same state while flushing (Flush)
>>>> the data for the other processor to read from it."
>>>>
>>>> Seems to me, it's reasonable to set HTIM flag when the snooping happens
>>>> for the cache line line is in the Modified (M) state.
>>>>
>>>> Again, my comment is based on the literal understanding; so please
>>>> correct if have any misunderstanding at here.
>>> Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except
>>> where sharing patterns are detected," so if a line is shared among caches it
>>> will likely also be in the SLC (and thus we'd get an L3 hit). 
>>>
>>> If there is one copy of the cache line and that cache line needs to transition
>>> from one core to another core, I don't believe it matters if it was truly
>>> modified or not because the core already had permission to modify it and the
>>> transaction is just as costly (ping ponging between core caches). This is the
>>> one thing I really want to be able to uniquely identify as any cacheline doing
>>> this that isn't a lock is a place where optimization is likely possible. 
>> I understood that your point that if big amount of transitions within
>> multiple cores hit the same cache line, it would be very likely caused
>> by the cache line's Modified state so we set PERF_MEM_SNOOP_HITM flag
>> for easier reviewing.
> And that from a dataflow perspective if the line is owned (and could be
> modifiable) vs. was actually modified is really the less interesting bit. 
>
>> Alternatively, I think it's good to pick up the patch series "perf c2c:
>> Sort cacheline with all loads" [1], rather than relying on HITM tag, the
>> patch series extends a new option "-d all" for perf c2c, so it displays
>> the suspecious false sharing cache lines based on load/store ops and
>> thread infos.  The main reason for holding on th patch set is due to we
>> cannot verify it with Arm SPE at that time point, as the time being Arm
>> SPE trace data was absent both store ops and data source packets.
> Looking at examples I don't, at least from my system, data-source isn't set for
> stores, only for loads. 
>
>> I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can
>> upstream the patch series "perf c2c: Sort cacheline with all loads"
>> (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches
>> have been merged in the mainline kernel).
>>
>> If this is fine for you, I can respin the patch series for "perf c2c".
>> Or any other thoughts?
> I think this is a nice option to have in the tool-box, but from my point of
> view, I'd like someone who is familiar with c2c output on x86 to come to an
> arm64 system and be able to zero in on a ping-ponging line like they would
> otherwise. Highlighting a line that is moving between cores frequently which is
> likely in the exclusive state by tagging it an HITM accomplishes this and will
> make it easier to find these cases.  Your approach also has innaccurancies and
> wouldn't be able to differentiate between core X accessing a line a lot followed
> by core Y acessing a line alot vs the cores ping-ponging.  Yes, I agree that we
> will "overcount" HITM, but I don't think this is particularly bad and it does
> specifically highlight the core-2-core transfers that are likely a performance
> issue easily and it will result in easier identification of areas of false or
> true sharing and improve performance.
>
> Thanks,
> Ali
Leo Yan March 31, 2022, 12:44 p.m. UTC | #13
On Thu, Mar 31, 2022 at 01:28:58PM +0100, German Gomez wrote:
> Hi all,
> 
> It seems I gave the Review tags a bit too early this time. Apologies for
> the inconvenience. Indeed there was more interesting discussions to be
> had :)
> 
> (Probably best to remove by tags for the next re-spin)

No worries, German.  Your review and testing are very helpful :)

> On 29/03/2022 15:32, Ali Saidi wrote:
> > [...]
> >
> >> I still think we should consider to extend the memory levels to
> >> demonstrate clear momory hierarchy on Arm archs, I personally like the
> >> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE",
> >> though these cache levels are not precise like L1/L2/L3 levels, they can
> >> help us to map very well for the cache topology on Arm archs and without
> >> any confusion.  We could take this as an enhancement if you don't want
> >> to bother the current patch set's upstreaming.
> > I'd like to do this in a separate patch, but I have one other proposal. The
> > Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1,
> > it's also in the L2. Given that the Graviton systems and afaik the Ampere
> > systems don't have any cache between the L2 and the SLC, thus anything from
> > PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we
> > should just set L2 for these cases? German, are you good with this for now? 
> 
> Sorry for the delay. I'd like to also check this with someone. I'll try
> to get back asap. In the meantime, if this approach is also OK with Leo,
> I think it would be fine by me.

Thanks for the checking internally.  Let me just bring up my another
thinking (sorry that my suggestion is float): another choice is we set
ANY_CACHE as cache level if we are not certain the cache level, and
extend snoop field to indicate the snooping logics, like:

  PERF_MEM_SNOOP_PEER_CORE
  PERF_MEM_SNOOP_LCL_CLSTR
  PERF_MEM_SNOOP_PEER_CLSTR

Seems to me, we doing this is not only for cache level, it's more
important for users to know the variant cost for involving different
snooping logics.

Thanks,
Leo
Ali Saidi April 3, 2022, 8:33 p.m. UTC | #14
On Thu, 31 Mar 2022 12:44:3 +0000, Leo Yan wrote:
> 
> On Thu, Mar 31, 2022 at 01:28:58PM +0100, German Gomez wrote:
> > Hi all,
> > 
> > It seems I gave the Review tags a bit too early this time. Apologies for
> > the inconvenience. Indeed there was more interesting discussions to be
> > had :)
> > 
> > (Probably best to remove by tags for the next re-spin)
> 
> Now worries, German.  Your review and testing are very helpful :)
> 
> > On 29/03/2022 15:32, Ali Saidi wrote:
> > > [...]
> > >
> > >> I still think we should consider to extend the memory levels to
> > >> demonstrate clear momory hierarchy on Arm archs, I personally like the
> > >> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE",
> > >> though these cache levels are not precise like L1/L2/L3 levels, they can
> > >> help us to map very well for the cache topology on Arm archs and without
> > >> any confusion.  We could take this as an enhancement if you don't want
> > >> to bother the current patch set's upstreaming.
> > > I'd like to do this in a separate patch, but I have one other proposal. The
> > > Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1,
> > > it's also in the L2. Given that the Graviton systems and afaik the Ampere
> > > systems don't have any cache between the L2 and the SLC, thus anything from
> > > PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we
> > > should just set L2 for these cases? German, are you good with this for now? 
> > 
> > Sorry for the delay. I'd like to also check this with someone. I'll try
> > to get back asap. In the meantime, if this approach is also OK with Leo,
> > I think it would be fine by me.
> 
> Thanks for the checking internally.  Let me just bring up my another
> thinking (sorry that my suggestion is float): another choice is we set
> ANY_CACHE as cache level if we are not certain the cache level, and
> extend snoop field to indicate the snooping logics, like:
> 
>   PERF_MEM_SNOOP_PEER_CORE
>   PERF_MEM_SNOOP_LCL_CLSTR
>   PERF_MEM_SNOOP_PEER_CLSTR
> 
> Seems to me, we doing this is not only for cache level, it's more
> important for users to know the variant cost for involving different
> snooping logics.

I think we've come full circle :). Going back to what we want to indicate to
a user about the source of the cache line, I believe there are three things with
an eye toward helping a user of the data improve the performance of their
application:

1. The level below them in the hierarchy that it hit (L1, L2, LLC, local DRAM).
Depending on the level this directly indicates the expense of the operation. 

2. If it came from a peer of theirs on the same socket. I'm really of the
opinion still that exactly which peer, doesn't matter much as it's a 2nd or 3rd
order concern compared to, if it couldn't be sourced from a cache level below
the originating core, had to come from a local peer and the request went to
that lower levels and was eventually sourced from a peer.  Why it was sourced
from the peer is still almost irrelevant to me. If it was truly modified or the
core it was sourced from only had permission to modify it the snoop filter
doesn't necessarily need to know the difference and the outcome is the same. 

3. For multi-socket systems that it came from a different socket and there it is
probably most interesting if it came from DRAM on the remote socket or a cache.

I'm putting 3 aside for now since we've really been focusing on 1 and 2 in this
discussion and I think the biggest hangup has been the definition of HIT vs
HITM. If someone has a precise definition, that would be great, but AFAIK it
goes back to the P6 bus where HIT was asserted by another core if it had a line
(in any state) and HITM was additionally asserted if a core needed to inhibit
another device (e.g. DDR controller) from providing that line to the requestor. 

The latter logic is why I think it's perfectly acceptable to use HITM to
indicate a peer cache-to-cache transfer, however since others don't feel that way
let me propose a single additional snooping type PERF_MEM_SNOOP_PEER that
indicates some peer of the hierarchy below the originating core sourced the
data.  This clears up the definition that line came from from a peer and may or
may not have been modified, but it doesn't add a lot of implementation dependant
functionality into the SNOOP API. 

We could use the mem-level to indicate the level of the cache hierarchy we had
to get to before the snoop traveled upward, which seems like what x86 is doing
here.

PEER_CORE -> MEM_SNOOP_PEER + L2
PEER_CLSTR -> MEM_SNOOP_PEER + L3
PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support
the clusters and the existing commercial implementations don't have them).

Thanks,
Ali
Leo Yan April 4, 2022, 3:12 p.m. UTC | #15
On Sun, Apr 03, 2022 at 08:33:37PM +0000, Ali Saidi wrote:

[...]

> > Let me just bring up my another
> > thinking (sorry that my suggestion is float): another choice is we set
> > ANY_CACHE as cache level if we are not certain the cache level, and
> > extend snoop field to indicate the snooping logics, like:
> > 
> >   PERF_MEM_SNOOP_PEER_CORE
> >   PERF_MEM_SNOOP_LCL_CLSTR
> >   PERF_MEM_SNOOP_PEER_CLSTR
> > 
> > Seems to me, we doing this is not only for cache level, it's more
> > important for users to know the variant cost for involving different
> > snooping logics.
> 
> I think we've come full circle :).

Not too bad, and I learned a lot :)

> Going back to what do we want to indicate to
> a user about the source of the cache line, I believe there are three things with
> an eye toward helping a user of the data improve the performance of their
> application:

Thanks a lot for summary!

> 1. The level below them in the hierarchy it it (L1, L2, LLC, local DRAM).
> Depending on the level this directly indicates the expense of the operation. 
> 
> 2. If it came from a peer of theirs on the same socket. I'm really of the
> opinion still that exactly which peer, doesn't matter much as it's a 2nd or 3rd
> order concern compared to, it it couldn't be sourced from a cache level below
> the originating core, had to come from a local peer and the request went to
> that lower levels and was eventually sourced from a peer.  Why it was sourced
> from the peer is still almost irrelevant to me. If it was truly modified or the
> core it was sourced from only had permission to modify it the snoop filter
> doesn't necessarily need to know the difference and the outcome is the same. 

I think here the key information delivered is:

For the peer snooping, you think there is a big cost difference between
L2 cache snooping and L3 cache snooping; for L3 cache snooping, we
don't care about it's an internal cluster snooping or external cluster
snooping, and we have no enough info to reason snooping type (HIT vs
HITM).

> 3. For multi-socket systems that it came from a different socket and there it is
> probably most interesting if it came from DRAM on the remote socket or a cache.
>
> I'm putting 3 aside for now since we've really been focusing on 1 and 2 in this
> discussion and I think the biggest hangup has been the definition of HIT vs
> HITM.

Agree on the item 3.

> If someone has a precise definition, that would be great, but AFAIK it
> goes back to the P6 bus where HIT was asserted by another core if it had a line
> (in any state) and HITM was additionally asserted if a core needed to inhibit
> another device (e.g. DDR controller) from providing that line to the requestor. 

Thanks for sharing the info for how the bus implements HIT/HITM.

> The latter logic is why I think it's perfectly acceptable to use HITM to
> indicate a peer cache-to-cache transfer, however since others don't feel that way
> let me propose a single additional snooping type PERF_MEM_SNOOP_PEER that
> indicates some peer of the hierarchy below the originating core sourced the
> data.  This clears up the definition that line came from from a peer and may or
> may not have been modified, but it doesn't add a lot of implementation dependant
> functionality into the SNOOP API. 
> 
> We could use the mem-level to indicate the level of the cache hierarchy we had
> to get to before the snoop traveled upward, which seems like what x86 is doing
> here.

It makes sense to me to use the highest cache level as mem-level.
Please add comments in the code for this, this would be useful for
understanding the code.

> PEER_CORE -> MEM_SNOOP_PEER + L2
> PEER_CLSTR -> MEM_SNOOP_PEER + L3
> PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support
> the clusters and the existing commercial implementations don't have them).

Generally, this idea is fine for me.

Following your suggestion, if we connect the concepts PoC and PoU in Arm
reference manual, we can extend the snooping mode with MEM_SNOOP_POU
(for PoU) and MEM_SNOOP_POC (for PoC), so:

PEER_CORE -> MEM_SNOOP_POU + L2
PEER_LCL_CLSTR -> MEM_SNOOP_POU + L3
PEER_CLSTR -> MEM_SNOOP_POC + L3

Seems to me, we could consider this.  If this is overly complex, or
if I have stated any concepts wrongly here, please use your method.

Thanks,
Leo
Ali Saidi April 6, 2022, 9 p.m. UTC | #16
On Mon, 4 Apr 2022 15:12:18  +0000, Leo Yan wrote:
> On Sun, Apr 03, 2022 at 08:33:37PM +0000, Ali Saidi wrote:

[...]

> > The latter logic is why I think it's perfectly acceptable to use HITM to
> > indicate a peer cache-to-cache transfer, however since others don't feel that way
> > let me propose a single additional snooping type PERF_MEM_SNOOP_PEER that
> > indicates some peer of the hierarchy below the originating core sourced the
> > data.  This clears up the definition that line came from from a peer and may or
> > may not have been modified, but it doesn't add a lot of implementation dependant
> > functionality into the SNOOP API. 
> > 
> > We could use the mem-level to indicate the level of the cache hierarchy we had
> > to get to before the snoop traveled upward, which seems like what x86 is doing
> > here.
> 
> It makes sense to me that to use the highest cache level as mem-level.
> Please add comments in the code for this, this would be useful for
> understanding the code.

Ok.

> > PEER_CORE -> MEM_SNOOP_PEER + L2
> > PEER_CLSTR -> MEM_SNOOP_PEER + L3
> > PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support
> > the clusters and the existing commercial implementations don't have them).
> 
> Generally, this idea is fine for me.

Great.  

Now the next tricky thing. Since we're not using HITM for recording the memory
events, the question becomes for the c2c output should we output the SNOOP_PEER
events as if they are HITM events with a clarification in the perf-c2c man page
or effectively duplicate all the lcl_hitm logic, which is a fair amount,  in
perf c2c to add a column and sort option? 

> Following your suggestion, if we connect the concepts PoC and PoU in Arm
> reference manual, we can extend the snooping mode with MEM_SNOOP_POU
> (for PoU) and MEM_SNOOP_POC (for PoC), so:
> 
> PEER_CORE -> MEM_SNOOP_POU + L2
> PEER_LCL_CLSTR -> MEM_SNOOP_POU + L3
> PEER_CLSTR -> MEM_SNOOP_POC + L3
> 
> Seems to me, we could consider for this.  If this is over complexity or
> even I said any wrong concepts for this, please use your method.

I think this adds a lot of complexity and reduces clarity. Some systems
implement coherent icaches and the PoU would be the L1 cache, others don't so
that would be the L2 (or wherever there is a unified cache). Similarly, with the
point of coherency, some systems would consider that dram, but other systems
have transparent LLCs and it would be the LLC. 

Thanks,
Ali
German Gomez April 7, 2022, 3:24 p.m. UTC | #17
Hi,

On 31/03/2022 13:44, Leo Yan wrote:
> [...]
>>> I'd like to do this in a separate patch, but I have one other proposal. The
>>> Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1,
>>> it's also in the L2. Given that the Graviton systems and afaik the Ampere
>>> systems don't have any cache between the L2 and the SLC, thus anything from
>>> PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we
>>> should just set L2 for these cases? German, are you good with this for now? 
>> Sorry for the delay. I'd like to also check this with someone. I'll try
>> to get back asap. In the meantime, if this approach is also OK with Leo,
>> I think it would be fine by me.

Sorry for the delay. Yeah setting it to L2 indeed looks reasonable for
now. Somebody brought up the case of running SPE in a heterogeneous 
system, but we also think it might be beyond the scope of this change.

One very minor nit though. Would you be ok with renaming LCL to LOCAL 
and CLSTR to CLUSTER? I sometimes misread the former as "LLC".

> Thanks for the checking internally.  Let me just bring up my another
> thinking (sorry that my suggestion is float): another choice is we set
> ANY_CACHE as cache level if we are not certain the cache level, and
> extend snoop field to indicate the snooping logics, like:
>
>   PERF_MEM_SNOOP_PEER_CORE
>   PERF_MEM_SNOOP_LCL_CLSTR
>   PERF_MEM_SNOOP_PEER_CLSTR
>
> Seems to me, we doing this is not only for cache level, it's more
> important for users to know the variant cost for involving different
> snooping logics.
>
> Thanks,
> Leo

I see there's been some more messages I need to catch up with. Is the 
intention to extend the PERF_MEM_* flags for this cset, or will it be
left for a later change?

In any case, I'd be keen to take another look at it and try to bring
some more eyes into this.

Thanks,
German
Leo Yan April 8, 2022, 1:06 a.m. UTC | #18
On Wed, Apr 06, 2022 at 09:00:17PM +0000, Ali Saidi wrote:
> On Mon, 4 Apr 2022 15:12:18  +0000, Leo Yan wrote:
> > On Sun, Apr 03, 2022 at 08:33:37PM +0000, Ali Saidi wrote:

[...]

> > > PEER_CORE -> MEM_SNOOP_PEER + L2
> > > PEER_CLSTR -> MEM_SNOOP_PEER + L3
> > > PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support
> > > the clusters and the existing commercial implementations don't have them).
> > 
> > Generally, this idea is fine for me.
> 
> Great.  
> 
> Now the next tricky thing. Since we're not using HITM for recording the memory
> events, the question becomes for the c2c output should we output the SNOOP_PEER
> events as if they are HITM events with a clarification in the perf-c2c man page
> or effectively duplicate all the lcl_hitm logic, which is a fair amount,  in
> perf c2c to add a column and sort option? 

I think we need to handle both load and store operations in 'perf c2c'
tool.

For the load operation, in the 'cache line details' view, we need to
support a 'snoop_peer' column; and since Arm SPE doesn't give any data
source info for store operations, my plan is to add an extra column
'Other' based on the two existing columns 'L1 Hit' and 'L1 Miss'.

Could you leave this part for me?  I will respin my patch set to
extend 'perf c2c' for this (and hopefully it can support the old Arm SPE trace
data).

Please note, when you spin the new patch set, you need to take care of
the store operations.  In the current patch set, it will wrongly
always set L1 hit for all store operations due to the data source
field is always zero.  My understanding is for all store operations,
we need to set the cache level as PERF_MEM_LVLNUM_ANY_CACHE and snoop
type as PERF_MEM_SNOOP_NA.

> > Following your suggestion, if we connect the concepts PoC and PoU in Arm
> > reference manual, we can extend the snooping mode with MEM_SNOOP_POU
> > (for PoU) and MEM_SNOOP_POC (for PoC), so:
> > 
> > PEER_CORE -> MEM_SNOOP_POU + L2
> > PEER_LCL_CLSTR -> MEM_SNOOP_POU + L3
> > PEER_CLSTR -> MEM_SNOOP_POC + L3
> > 
> > Seems to me, we could consider for this.  If this is over complexity or
> > even I said any wrong concepts for this, please use your method.
> 
> I think this adds a lot of complexity and reduces clarity. Some systems
> implement coherent icaches and the PoU would be the L1 cache, others don't so
> that would be the L2 (or wherever there is a unified cache). Similarly, with the
> point of coherency, some systems would consider that dram, but other systems
> have transparent LLCs and it would be the LLC. 

Okay, it's fine for me to move forward to use MEM_SNOOP_PEER as the
solution.

Since German is looking into this part, @German, if you have any comment
on this part, just let us know.

Thanks,
Leo
Leo Yan April 8, 2022, 1:18 a.m. UTC | #19
On Thu, Apr 07, 2022 at 04:24:35PM +0100, German Gomez wrote:
> Hi,
> 
> On 31/03/2022 13:44, Leo Yan wrote:
> > [...]
> >>> I'd like to do this in a separate patch, but I have one other proposal. The
> >>> Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1,
> >>> it's also in the L2. Given that the Graviton systems and afaik the Ampere
> >>> systems don't have any cache between the L2 and the SLC, thus anything from
> >>> PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we
> >>> should just set L2 for these cases? German, are you good with this for now? 
> >> Sorry for the delay. I'd like to also check this with someone. I'll try
> >> to get back asap. In the meantime, if this approach is also OK with Leo,
> >> I think it would be fine by me.
> 
> Sorry for the delay. Yeah setting it to L2 indeed looks reasonable for
> now. Somebody brought up the case of running SPE in a heterogeneous 
> system, but also we think might be beyond the scope of this change.
> 
> One very minor nit though. Would you be ok with renaming LCL to LOCAL 
> and CLSTR to CLUSTER? I sometimes mistead the former as "LLC".

Ali's suggestion is to use the format: highest_cache_level | MEM_SNOOP_PEER.

Simply to say, the highest cache level is where we snoop the cache
data with the highest cache level.  And we use an extra snoop op
MEM_SNOOP_PEER as the flag to indicate a peer snooping from the local
cluster or peer cluster.

Please review the more detailed discussion in another email.

> > Thanks for the checking internally.  Let me just bring up my another
> > thinking (sorry that my suggestion is float): another choice is we set
> > ANY_CACHE as cache level if we are not certain the cache level, and
> > extend snoop field to indicate the snooping logics, like:
> >
> >   PERF_MEM_SNOOP_PEER_CORE
> >   PERF_MEM_SNOOP_LCL_CLSTR
> >   PERF_MEM_SNOOP_PEER_CLSTR
> >
> > Seems to me, we doing this is not only for cache level, it's more
> > important for users to know the variant cost for involving different
> > snooping logics.
> >
> I see there's been some more messages I need to catch up with. Is the 
> intention to extend the PERF_MEM_* flags for this cset, or will it be
> left for a later change?

The plan is to extend the PERF_MEM_* flags in this patch set.

> In any case, I'd be keen to take another look at it and try to bring
> some more eyes into this.

Sure.  Please check at your side and thanks for confirmation.

Thanks,
Leo
diff mbox series

Patch

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 5e390a1a79ab..091987dd3966 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -220,6 +220,7 @@  static int arm_spe_read_record(struct arm_spe_decoder *decoder)
 
 			break;
 		case ARM_SPE_DATA_SOURCE:
+			decoder->record.source = payload;
 			break;
 		case ARM_SPE_BAD:
 			break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 69b31084d6be..c81bf90c0996 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -29,6 +29,17 @@  enum arm_spe_op_type {
 	ARM_SPE_ST		= 1 << 1,
 };
 
+enum arm_spe_neoverse_data_source {
+	ARM_SPE_NV_L1D        = 0x0,
+	ARM_SPE_NV_L2         = 0x8,
+	ARM_SPE_NV_PEER_CORE  = 0x9,
+	ARM_SPE_NV_LCL_CLSTR  = 0xa,
+	ARM_SPE_NV_SYS_CACHE  = 0xb,
+	ARM_SPE_NV_PEER_CLSTR = 0xc,
+	ARM_SPE_NV_REMOTE     = 0xd,
+	ARM_SPE_NV_DRAM       = 0xe,
+};
+
 struct arm_spe_record {
 	enum arm_spe_sample_type type;
 	int err;
@@ -40,6 +51,7 @@  struct arm_spe_record {
 	u64 virt_addr;
 	u64 phys_addr;
 	u64 context_id;
+	u16 source;
 };
 
 struct arm_spe_insn;
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d2b64e3f588b..f92ebce88c6a 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -34,6 +34,7 @@ 
 #include "arm-spe-decoder/arm-spe-decoder.h"
 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
 
+#include "../../arch/arm64/include/asm/cputype.h"
 #define MAX_TIMESTAMP (~0ULL)
 
 struct arm_spe {
@@ -45,6 +46,7 @@  struct arm_spe {
 	struct perf_session		*session;
 	struct machine			*machine;
 	u32				pmu_type;
+	u64				midr;
 
 	struct perf_tsc_conversion	tc;
 
@@ -399,33 +401,110 @@  static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
 	return false;
 }
 
-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
+	{},
+};
+
+
+static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
+						union perf_mem_data_src *data_src)
 {
-	union perf_mem_data_src	data_src = { 0 };
+	/*
+	 * Even though four levels of cache hierarchy are possible, no known
+	 * production Neoverse systems currently include more than three levels
+	 * so for the time being we assume three exist. If a production system
+	 * is built with four then this function would have to be changed to
+	 * detect the number of levels for reporting.
+	 */
 
-	if (record->op == ARM_SPE_LD)
-		data_src.mem_op = PERF_MEM_OP_LOAD;
-	else
-		data_src.mem_op = PERF_MEM_OP_STORE;
+	switch (record->source) {
+	case ARM_SPE_NV_L1D:
+		data_src->mem_lvl = PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+		break;
+	case ARM_SPE_NV_L2:
+		data_src->mem_lvl = PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+		break;
+	case ARM_SPE_NV_PEER_CORE:
+		data_src->mem_lvl = PERF_MEM_LVL_HIT;
+		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+		break;
+	/*
+	 * We don't know if this is L1, L2 but we do know it was a cache-2-cache
+	 * transfer, so set SNOOP_HITM
+	 */
+	case ARM_SPE_NV_LCL_CLSTR:
+	case ARM_SPE_NV_PEER_CLSTR:
+		data_src->mem_lvl = PERF_MEM_LVL_HIT;
+		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+		break;
+	/*
+	 * System cache is assumed to be L3
+	 */
+	case ARM_SPE_NV_SYS_CACHE:
+		data_src->mem_lvl = PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+		break;
+	/*
+	 * We don't know what level it hit in, except it came from the other
+	 * socket
+	 */
+	case ARM_SPE_NV_REMOTE:
+		data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+		break;
+	case ARM_SPE_NV_DRAM:
+		data_src->mem_lvl = PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+		break;
+	default:
+		break;
+	}
+}
 
+static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
+						union perf_mem_data_src *data_src)
+{
 	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
-		data_src.mem_lvl = PERF_MEM_LVL_L3;
+		data_src->mem_lvl = PERF_MEM_LVL_L3;
 
 		if (record->type & ARM_SPE_LLC_MISS)
-			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
 		else
-			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
 	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
-		data_src.mem_lvl = PERF_MEM_LVL_L1;
+		data_src->mem_lvl = PERF_MEM_LVL_L1;
 
 		if (record->type & ARM_SPE_L1D_MISS)
-			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
 		else
-			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
 	}
 
 	if (record->type & ARM_SPE_REMOTE_ACCESS)
-		data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+}
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
+{
+	union perf_mem_data_src	data_src = { 0 };
+	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
+
+	if (record->op & ARM_SPE_LD)
+		data_src.mem_op = PERF_MEM_OP_LOAD;
+	else
+		data_src.mem_op = PERF_MEM_OP_STORE;
+
+	if (is_neoverse)
+		arm_spe__synth_data_source_neoverse(record, &data_src);
+	else
+		arm_spe__synth_data_source_generic(record, &data_src);
 
 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -446,7 +525,7 @@  static int arm_spe_sample(struct arm_spe_queue *speq)
 	u64 data_src;
 	int err;
 
-	data_src = arm_spe__synth_data_source(record);
+	data_src = arm_spe__synth_data_source(record, spe->midr);
 
 	if (spe->sample_flc) {
 		if (record->type & ARM_SPE_L1D_MISS) {
@@ -1183,6 +1262,8 @@  int arm_spe_process_auxtrace_info(union perf_event *event,
 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
 	size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
 	struct perf_record_time_conv *tc = &session->time_conv;
+	const char *cpuid = perf_env__cpuid(session->evlist->env);
+	u64 midr = strtol(cpuid, NULL, 16);
 	struct arm_spe *spe;
 	int err;
 
@@ -1202,6 +1283,7 @@  int arm_spe_process_auxtrace_info(union perf_event *event,
 	spe->machine = &session->machines.host; /* No kvm support */
 	spe->auxtrace_type = auxtrace_info->type;
 	spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+	spe->midr = midr;
 
 	spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);