Message ID | 20220324183323.31414-3-alisaidi@amazon.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | perf: arm-spe: Decode SPE source and use for perf c2c | expand |
Hi Ali, German, On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: [...] > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, > + union perf_mem_data_src *data_src) > { > - union perf_mem_data_src data_src = { 0 }; > + /* > + * Even though four levels of cache hierarchy are possible, no known > + * production Neoverse systems currently include more than three levels > + * so for the time being we assume three exist. If a production system > + * is built with four the this function would have to be changed to > + * detect the number of levels for reporting. > + */ > > - if (record->op == ARM_SPE_LD) > - data_src.mem_op = PERF_MEM_OP_LOAD; > - else > - data_src.mem_op = PERF_MEM_OP_STORE; Firstly, apologize that I didn't give clear idea when Ali sent patch sets v2 and v3. IMHO, we need to consider two kinds of information which can guide us for a reliable implementation. The first thing is to summarize the data source configuration for x86 PEBS, we can dive in more details for this part; the second thing is we can refer to the AMBA architecture document ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and its sub section 'Suggested DataSource values', which would help us much for mapping the cache topology to Arm SPE data source. As a result, I summarized the data source configurations for PEBS and Arm SPE Neoverse in the spreadsheet: https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing Please see below comments. > + switch (record->source) { > + case ARM_SPE_NV_L1D: > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > + break; I think we need to set the field 'mem_snoop' for L1 cache hit: data_src->mem_snoop = PERF_MEM_SNOOP_NONE; For L1 cache hit, it doesn't involve snooping. 
> + case ARM_SPE_NV_L2: > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > + break; Ditto: data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > + case ARM_SPE_NV_PEER_CORE: > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; Peer core contains its local L1 cache, so I think we can set the memory level L1 to indicate this case. For this data source type and below types, though they indicate the snooping happens, but it doesn't mean the data in the cache line is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally think this will mislead users when report the result. I prefer we set below fields for ARM_SPE_NV_PEER_CORE: data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1; data_src->mem_snoop = PERF_MEM_SNOOP_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > + break; > + /* > + * We don't know if this is L1, L2 but we do know it was a cache-2-cache > + * transfer, so set SNOOP_HITM > + */ > + case ARM_SPE_NV_LCL_CLSTR: For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in the cluster level, it should happen in L2 cache: data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2; data_src->mem_snoop = PERF_MEM_SNOOP_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > + case ARM_SPE_NV_PEER_CLSTR: > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > + break; This type can snoop from L1 or L2 cache in the peer cluster, so it makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE. 
But here should use the snoop type PERF_MEM_SNOOP_HIT, so: data_src->mem_lvl = PERF_MEM_LVL_HIT data_src->mem_snoop = PERF_MEM_SNOOP_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > + /* > + * System cache is assumed to be L3 > + */ > + case ARM_SPE_NV_SYS_CACHE: > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > + break; data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3; data_src->mem_snoop = PERF_MEM_SNOOP_HIT; data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > + /* > + * We don't know what level it hit in, except it came from the other > + * socket > + */ > + case ARM_SPE_NV_REMOTE: > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > + break; The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen in any cache levels in remote chip: data_src->mem_lvl = PERF_MEM_LVL_HIT; data_src->mem_snoop = PERF_MEM_SNOOP_HIT; data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > + case ARM_SPE_NV_DRAM: > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > + break; We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source: data_src->mem_lvl = PERF_MEM_LVL_HIT; data_src->mem_snoop = PERF_MEM_SNOOP_MISS; data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; The rest of this patch looks good to me. 
Thanks, Leo > + default: > + break; > + } > +} > > +static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record, > + union perf_mem_data_src *data_src) > +{ > if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { > - data_src.mem_lvl = PERF_MEM_LVL_L3; > + data_src->mem_lvl = PERF_MEM_LVL_L3; > > if (record->type & ARM_SPE_LLC_MISS) > - data_src.mem_lvl |= PERF_MEM_LVL_MISS; > + data_src->mem_lvl |= PERF_MEM_LVL_MISS; > else > - data_src.mem_lvl |= PERF_MEM_LVL_HIT; > + data_src->mem_lvl |= PERF_MEM_LVL_HIT; > } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { > - data_src.mem_lvl = PERF_MEM_LVL_L1; > + data_src->mem_lvl = PERF_MEM_LVL_L1; > > if (record->type & ARM_SPE_L1D_MISS) > - data_src.mem_lvl |= PERF_MEM_LVL_MISS; > + data_src->mem_lvl |= PERF_MEM_LVL_MISS; > else > - data_src.mem_lvl |= PERF_MEM_LVL_HIT; > + data_src->mem_lvl |= PERF_MEM_LVL_HIT; > } > > if (record->type & ARM_SPE_REMOTE_ACCESS) > - data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1; > + data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; > +} > + > +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr) > +{ > + union perf_mem_data_src data_src = { 0 }; > + bool is_neoverse = is_midr_in_range(midr, neoverse_spe); > + > + if (record->op & ARM_SPE_LD) > + data_src.mem_op = PERF_MEM_OP_LOAD; > + else > + data_src.mem_op = PERF_MEM_OP_STORE; > + > + if (is_neoverse) > + arm_spe__synth_data_source_neoverse(record, &data_src); > + else > + arm_spe__synth_data_source_generic(record, &data_src); > > if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { > data_src.mem_dtlb = PERF_MEM_TLB_WK; > @@ -446,7 +525,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq) > u64 data_src; > int err; > > - data_src = arm_spe__synth_data_source(record); > + data_src = arm_spe__synth_data_source(record, spe->midr); > > if (spe->sample_flc) { > if (record->type & ARM_SPE_L1D_MISS) { > @@ -1183,6 +1262,8 @@ int 
arm_spe_process_auxtrace_info(union perf_event *event, > struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; > size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX; > struct perf_record_time_conv *tc = &session->time_conv; > + const char *cpuid = perf_env__cpuid(session->evlist->env); > + u64 midr = strtol(cpuid, NULL, 16); > struct arm_spe *spe; > int err; > > @@ -1202,6 +1283,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event, > spe->machine = &session->machines.host; /* No kvm support */ > spe->auxtrace_type = auxtrace_info->type; > spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; > + spe->midr = midr; > > spe->timeless_decoding = arm_spe__is_timeless_decoding(spe); > > -- > 2.32.0 >
Em Sat, Mar 26, 2022 at 09:47:54PM +0800, Leo Yan escreveu: > Hi Ali, German, > > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: > > [...] > > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, > > + union perf_mem_data_src *data_src) > > { > > - union perf_mem_data_src data_src = { 0 }; > > + /* > > + * Even though four levels of cache hierarchy are possible, no known > > + * production Neoverse systems currently include more than three levels > > + * so for the time being we assume three exist. If a production system > > + * is built with four the this function would have to be changed to > > + * detect the number of levels for reporting. > > + */ > > > > - if (record->op == ARM_SPE_LD) > > - data_src.mem_op = PERF_MEM_OP_LOAD; > > - else > > - data_src.mem_op = PERF_MEM_OP_STORE; > > Firstly, apologize that I didn't give clear idea when Ali sent patch sets > v2 and v3. Ok, removing this as well. Thanks for reviewing. - Arnaldo > IMHO, we need to consider two kinds of information which can guide us > for a reliable implementation. The first thing is to summarize the data > source configuration for x86 PEBS, we can dive in more details for this > part; the second thing is we can refer to the AMBA architecture document > ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and > its sub section 'Suggested DataSource values', which would help us > much for mapping the cache topology to Arm SPE data source. > > As a result, I summarized the data source configurations for PEBS and > Arm SPE Neoverse in the spreadsheet: > https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing > > Please see below comments. 
> > > + switch (record->source) { > > + case ARM_SPE_NV_L1D: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > + break; > > I think we need to set the field 'mem_snoop' for L1 cache hit: > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > > For L1 cache hit, it doesn't involve snooping. > > > + case ARM_SPE_NV_L2: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > + break; > > Ditto: > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > > > + case ARM_SPE_NV_PEER_CORE: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > Peer core contains its local L1 cache, so I think we can set the > memory level L1 to indicate this case. > > For this data source type and below types, though they indicate > the snooping happens, but it doesn't mean the data in the cache line > is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally > think this will mislead users when report the result. 
> > I prefer we set below fields for ARM_SPE_NV_PEER_CORE: > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > + break; > > + /* > > + * We don't know if this is L1, L2 but we do know it was a cache-2-cache > > + * transfer, so set SNOOP_HITM > > + */ > > + case ARM_SPE_NV_LCL_CLSTR: > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in > the cluster level, it should happen in L2 cache: > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > > + case ARM_SPE_NV_PEER_CLSTR: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > + break; > > This type can snoop from L1 or L2 cache in the peer cluster, so it > makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE. But here > should use the snoop type PERF_MEM_SNOOP_HIT, so: > > data_src->mem_lvl = PERF_MEM_LVL_HIT > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > + /* > > + * System cache is assumed to be L3 > > + */ > > + case ARM_SPE_NV_SYS_CACHE: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > + break; > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > > + /* > > + * We don't know what level it hit in, except it came from the other > > + * socket > > + */ > > + case ARM_SPE_NV_REMOTE: > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > > + break; > > The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen > in any cache levels in remote chip: > > data_src->mem_lvl = PERF_MEM_LVL_HIT; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 
> data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > + case ARM_SPE_NV_DRAM: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > > + break; > > We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source: > > data_src->mem_lvl = PERF_MEM_LVL_HIT; > data_src->mem_snoop = PERF_MEM_SNOOP_MISS; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > > The rest of this patch looks good to me. > > Thanks, > Leo > > > + default: > > + break; > > + } > > +} > > > > +static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record, > > + union perf_mem_data_src *data_src) > > +{ > > if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { > > - data_src.mem_lvl = PERF_MEM_LVL_L3; > > + data_src->mem_lvl = PERF_MEM_LVL_L3; > > > > if (record->type & ARM_SPE_LLC_MISS) > > - data_src.mem_lvl |= PERF_MEM_LVL_MISS; > > + data_src->mem_lvl |= PERF_MEM_LVL_MISS; > > else > > - data_src.mem_lvl |= PERF_MEM_LVL_HIT; > > + data_src->mem_lvl |= PERF_MEM_LVL_HIT; > > } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { > > - data_src.mem_lvl = PERF_MEM_LVL_L1; > > + data_src->mem_lvl = PERF_MEM_LVL_L1; > > > > if (record->type & ARM_SPE_L1D_MISS) > > - data_src.mem_lvl |= PERF_MEM_LVL_MISS; > > + data_src->mem_lvl |= PERF_MEM_LVL_MISS; > > else > > - data_src.mem_lvl |= PERF_MEM_LVL_HIT; > > + data_src->mem_lvl |= PERF_MEM_LVL_HIT; > > } > > > > if (record->type & ARM_SPE_REMOTE_ACCESS) > > - data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1; > > + data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; > > +} > > + > > +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr) > > +{ > > + union perf_mem_data_src data_src = { 0 }; > > + bool is_neoverse = is_midr_in_range(midr, neoverse_spe); > > + > > + if (record->op & ARM_SPE_LD) > > + data_src.mem_op = PERF_MEM_OP_LOAD; > > + else > > + data_src.mem_op = PERF_MEM_OP_STORE; > > + > > + if 
(is_neoverse) > > + arm_spe__synth_data_source_neoverse(record, &data_src); > > + else > > + arm_spe__synth_data_source_generic(record, &data_src); > > > > if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { > > data_src.mem_dtlb = PERF_MEM_TLB_WK; > > @@ -446,7 +525,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq) > > u64 data_src; > > int err; > > > > - data_src = arm_spe__synth_data_source(record); > > + data_src = arm_spe__synth_data_source(record, spe->midr); > > > > if (spe->sample_flc) { > > if (record->type & ARM_SPE_L1D_MISS) { > > @@ -1183,6 +1262,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event, > > struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; > > size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX; > > struct perf_record_time_conv *tc = &session->time_conv; > > + const char *cpuid = perf_env__cpuid(session->evlist->env); > > + u64 midr = strtol(cpuid, NULL, 16); > > struct arm_spe *spe; > > int err; > > > > @@ -1202,6 +1283,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event, > > spe->machine = &session->machines.host; /* No kvm support */ > > spe->auxtrace_type = auxtrace_info->type; > > spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; > > + spe->midr = midr; > > > > spe->timeless_decoding = arm_spe__is_timeless_decoding(spe); > > > > -- > > 2.32.0 > >
On Sat, Mar 26, 2022 at 10:52:01AM -0300, Arnaldo Carvalho de Melo wrote: > Em Sat, Mar 26, 2022 at 09:47:54PM +0800, Leo Yan escreveu: > > Hi Ali, German, > > > > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: > > > > [...] > > > > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, > > > + union perf_mem_data_src *data_src) > > > { > > > - union perf_mem_data_src data_src = { 0 }; > > > + /* > > > + * Even though four levels of cache hierarchy are possible, no known > > > + * production Neoverse systems currently include more than three levels > > > + * so for the time being we assume three exist. If a production system > > > + * is built with four the this function would have to be changed to > > > + * detect the number of levels for reporting. > > > + */ > > > > > > - if (record->op == ARM_SPE_LD) > > > - data_src.mem_op = PERF_MEM_OP_LOAD; > > > - else > > > - data_src.mem_op = PERF_MEM_OP_STORE; > > > > Firstly, apologize that I didn't give clear idea when Ali sent patch sets > > v2 and v3. > > Ok, removing this as well. > > Thanks for reviewing. Thanks a lot, Arnaldo. Yeah, it's good to give a bit more time to dismiss the concerns in this patch. Sorry again for the inconvenience. Leo
Em Sat, Mar 26, 2022 at 09:56:53PM +0800, Leo Yan escreveu: > On Sat, Mar 26, 2022 at 10:52:01AM -0300, Arnaldo Carvalho de Melo wrote: > > Em Sat, Mar 26, 2022 at 09:47:54PM +0800, Leo Yan escreveu: > > > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: > > > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, > > > > + union perf_mem_data_src *data_src) > > > > { > > > > - union perf_mem_data_src data_src = { 0 }; > > > > + /* > > > > + * Even though four levels of cache hierarchy are possible, no known > > > > + * production Neoverse systems currently include more than three levels > > > > + * so for the time being we assume three exist. If a production system > > > > + * is built with four the this function would have to be changed to > > > > + * detect the number of levels for reporting. > > > > + */ > > > > - if (record->op == ARM_SPE_LD) > > > > - data_src.mem_op = PERF_MEM_OP_LOAD; > > > > - else > > > > - data_src.mem_op = PERF_MEM_OP_STORE; > > > Firstly, apologize that I didn't give clear idea when Ali sent patch sets > > > v2 and v3. > > Ok, removing this as well. > > Thanks for reviewing. > Thanks a lot, Arnaldo. Yeah, it's good to give a bit more time to > dismiss the concerns in this patch. Sure, at least it was build tested on many distros/cross compilers and this part is ok 8-) > Sorry again for the inconvenience. np. - Arnaldo
Hi Leo, On Sat, 26 Mar 2022 21:47:54 +0800, Leo Yan wrote: > Hi Ali, German, > > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: > > [...] > > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, > > + union perf_mem_data_src *data_src) > > { > > - union perf_mem_data_src data_src = { 0 }; > > + /* > > + * Even though four levels of cache hierarchy are possible, no known > > + * production Neoverse systems currently include more than three levels > > + * so for the time being we assume three exist. If a production system > > + * is built with four the this function would have to be changed to > > + * detect the number of levels for reporting. > > + */ > > > > - if (record->op == ARM_SPE_LD) > > - data_src.mem_op = PERF_MEM_OP_LOAD; > > - else > > - data_src.mem_op = PERF_MEM_OP_STORE; > > Firstly, apologize that I didn't give clear idea when Ali sent patch sets > v2 and v3. > > IMHO, we need to consider two kinds of information which can guide us > for a reliable implementation. The first thing is to summarize the data > source configuration for x86 PEBS, we can dive in more details for this > part; the second thing is we can refer to the AMBA architecture document > ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and > its sub section 'Suggested DataSource values', which would help us > much for mapping the cache topology to Arm SPE data source. > > As a result, I summarized the data source configurations for PEBS and > Arm SPE Neoverse in the spreadsheet: > https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing Thanks for putting this together and digging into the details, but you're making assumptions in neoverse data sources about the core configurations that aren't correct. The Neoverse cores have all have integrated L1 and L2 cache, so if the line is coming from a peer-core we don't know which level it's actually coming from. 
Similarly, if it's coming from a local cluster, that could mean a cluster l3, but it's not the L2. > Please see below comments. > > > + switch (record->source) { > > + case ARM_SPE_NV_L1D: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > + break; > > I think we need to set the field 'mem_snoop' for L1 cache hit: > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > > For L1 cache hit, it doesn't involve snooping. I can't find a precise definition for SNOOP_NONE, but it seemed as though this would be used for cases where a snoop could have occurred but didn't not for accesses that by definition don't snoop? I'm happy with either way, perhaps i just read more into it. > > + case ARM_SPE_NV_L2: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > + break; > > Ditto: > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; Same comment as above. > > + case ARM_SPE_NV_PEER_CORE: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > Peer core contains its local L1 cache, so I think we can set the > memory level L1 to indicate this case. It could be either the L1 or the L2. All the neoverse cores have private L2 caches and we don't know. > For this data source type and below types, though they indicate > the snooping happens, but it doesn't mean the data in the cache line > is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally > think this will mislead users when report the result. I'm of the opposite opinion. If the data wasn't modified, it will likely be found in the lower-level shared cache and the transaction wouldn't require a cache-to-cache transfer of the modified data, so the most common case when we source a line out of another cores cache will be if it was "modifiable" in that cache. 
> > I prefer we set below fields for ARM_SPE_NV_PEER_CORE: > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > + break; > > + /* > > + * We don't know if this is L1, L2 but we do know it was a cache-2-cache > > + * transfer, so set SNOOP_HITM > > + */ > > + case ARM_SPE_NV_LCL_CLSTR: > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in > the cluster level, it should happen in L2 cache: > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; We don't know if this is coming from the cluster cache, or the private L1 or L2 core caches. The description above about why we'll be transferring the line from cache-to-cache applies here too. > > + case ARM_SPE_NV_PEER_CLSTR: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > + break; > > This type can snoop from L1 or L2 cache in the peer cluster, so it > makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE. But here > should use the snoop type PERF_MEM_SNOOP_HIT, so: > > data_src->mem_lvl = PERF_MEM_LVL_HIT > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; Given that we agreed to only focus on the three levels generally used by the existing implementations LCL and PEER should be the same for now. > > + /* > > + * System cache is assumed to be L3 > > + */ > > + case ARM_SPE_NV_SYS_CACHE: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > + break; > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; I don't think we should set both the deprecated mem_lvl and the mem_lvl_num. 
If we're hitting in the unified L3 cache, we aren't actually snooping anything which is why I didn't set mem_snoop here. > > + /* > > + * We don't know what level it hit in, except it came from the other > > + * socket > > + */ > > + case ARM_SPE_NV_REMOTE: > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > > + break; > > The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen > in any cache levels in remote chip: > > data_src->mem_lvl = PERF_MEM_LVL_HIT; > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; Ok. > > > + case ARM_SPE_NV_DRAM: > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > > + break; > > We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source: > > data_src->mem_lvl = PERF_MEM_LVL_HIT; > data_src->mem_snoop = PERF_MEM_SNOOP_MISS; > data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > Ok. Thanks, Ali
On Sat, Mar 26, 2022 at 07:43:27PM +0000, Ali Saidi wrote: > Hi Leo, > On Sat, 26 Mar 2022 21:47:54 +0800, Leo Yan wrote: > > Hi Ali, German, > > > > On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: > > > > [...] > > > > > +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, > > > + union perf_mem_data_src *data_src) > > > { > > > - union perf_mem_data_src data_src = { 0 }; > > > + /* > > > + * Even though four levels of cache hierarchy are possible, no known > > > + * production Neoverse systems currently include more than three levels > > > + * so for the time being we assume three exist. If a production system > > > + * is built with four the this function would have to be changed to > > > + * detect the number of levels for reporting. > > > + */ > > > > > > - if (record->op == ARM_SPE_LD) > > > - data_src.mem_op = PERF_MEM_OP_LOAD; > > > - else > > > - data_src.mem_op = PERF_MEM_OP_STORE; > > > > Firstly, apologize that I didn't give clear idea when Ali sent patch sets > > v2 and v3. > > > > IMHO, we need to consider two kinds of information which can guide us > > for a reliable implementation. The first thing is to summarize the data > > source configuration for x86 PEBS, we can dive in more details for this > > part; the second thing is we can refer to the AMBA architecture document > > ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and > > its sub section 'Suggested DataSource values', which would help us > > much for mapping the cache topology to Arm SPE data source. > > > > As a result, I summarized the data source configurations for PEBS and > > Arm SPE Neoverse in the spreadsheet: > > https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing > > Thanks for putting this together and digging into the details, but you're making > assumptions in neoverse data sources about the core configurations that aren't > correct. 
The Neoverse cores have all have integrated L1 and L2 cache, so if the > line is coming from a peer-core we don't know which level it's actually coming > from. Similarly, if it's coming from a local cluster, that could mean a cluster > l3, but it's not the L2. I remembered before you have mentioned this, and yes, these concerns are valid for me. Please see below comments. > > Please see below comments. > > > > > + switch (record->source) { > > > + case ARM_SPE_NV_L1D: > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > + break; > > > > I think we need to set the field 'mem_snoop' for L1 cache hit: > > > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > > > > For L1 cache hit, it doesn't involve snooping. > > I can't find a precise definition for SNOOP_NONE, but it seemed as though > this would be used for cases where a snoop could have occurred but didn't > not for accesses that by definition don't snoop? I'm happy with either way, > perhaps i just read more into it. I have the same understanding with you that "this would be used for cases where a snoop could have occurred but didn't not for accesses that by definition don't snoop". If we refer to PEBS's data source type 01H: "Minimal latency core cache hit. This request was satisfied by the L1 data cache" and x86 sets SNOOP_NONE as the snoop type for this case. If we connect with snooping protocol, let's use MOIS protocol as an example (here simply use the MOIS protocol for discussion, but I don't have any knowledge for implementation of CPUs actually), as described in the wiki page [1], when a cache line is in Owned (O) state or Shared (S) state, "a processor read (PrRd) does not generate any snooped signal". In these cases, I think we should use snoop type PERF_MEM_SNOOP_NONE. 
> > > + case ARM_SPE_NV_L2: > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > > + break; > > > > Ditto: > > > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > Same comment as above. > > > > + case ARM_SPE_NV_PEER_CORE: > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > > Peer core contains its local L1 cache, so I think we can set the > > memory level L1 to indicate this case. > It could be either the L1 or the L2. All the neoverse cores have private L2 > caches and we don't know. How about setting both L1 and L2 cache together for this case? Although 'L1 | L2' cannot tell the exact cache level, I think it's better than using ANY_CACHE; at least this can help us to distinguish from other data source types if we avoid using ANY_CACHE for all of them. > > For this data source type and below types, though they indicate > > the snooping happens, but it doesn't mean the data in the cache line > > is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally > > think this will mislead users when report the result. > > I'm of the opposite opinion. If the data wasn't modified, it will likely be > found in the lower-level shared cache and the transaction wouldn't require a > cache-to-cache transfer of the modified data, so the most common case when we > source a line out of another cores cache will be if it was "modifiable" in that > cache. Let's still use MOSI protocol as an example. I think there are two cases: one case is the peer cache line is in 'Shared' state and another case is the peer cache line is in 'Owned' state. 
Quoting the wiki page for these two cases: "When the cache block is in the Shared (S) state and there is a snooped bus read (BusRd) transaction, then the block stays in the same state and generates no more transactions as all the cache blocks have the same value including the main memory and it is only being read, not written into." "While in the Owner (O) state and there is a snooped read request (BusRd), the block remains in the same state while flushing (Flush) the data for the other processor to read from it." Seems to me, it's reasonable to set the HITM flag when the snooping happens and the cache line is in the Modified (M) state. Again, my comment is based on the literal understanding; so please correct me if I have any misunderstanding here. > > I prefer we set below fields for ARM_SPE_NV_PEER_CORE: > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1; > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > > > + break; > > > + /* > > > + * We don't know if this is L1, L2 but we do know it was a cache-2-cache > > > + * transfer, so set SNOOP_HITM > > > + */ > > > + case ARM_SPE_NV_LCL_CLSTR: > > > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in > > the cluster level, it should happen in L2 cache: > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2; > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > We don't know if this is coming from the cluster cache, or the private L1 or L2 > core caches. The description above about why we'll be transferring the line from > cache-to-cache applies here too. 
I think a minor difference between PEER_CORE and LCL_CLSTR is: if data source is PEER_CORE, the snooping happens on the peer core's local cache (Core's L1 or Core's L2 when core contains L2); for the data source LCL_CLSTR, the snooping occurs on the cluster level's cache (cluster's L2 cache or cluster's L3 cache when cluster contains L3). So can we set both 'L2 | L3' for LCL_CLSTR case? > > > + case ARM_SPE_NV_PEER_CLSTR: > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > + break; > > > > This type can snoop from L1 or L2 cache in the peer cluster, so it > > makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE. But here > > should use the snoop type PERF_MEM_SNOOP_HIT, so: > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > Given that we agreed to only focus on the three levels generally used by > the existing implementations LCL and PEER should be the same for now. For PEER_CLSTR, we don't know the snooping happening on CPU's private cache or cluster's shared cache, this is why we should use ANY_CACHE for cache level. But LCL_CLSTR is different from PEER_CLSTR, LCL_CLSTR indicates the snooping on the local cluster's share cache, we set 'L2 | L3' for it; therefore, we can distinguish between these two cases. > > > + /* > > > + * System cache is assumed to be L3 > > > + */ > > > + case ARM_SPE_NV_SYS_CACHE: > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > > + break; > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3; > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > I don't think we should set both the deprecated mem_lvl and the mem_lvl_num. 
Is this because the decoding flow has any specific requirement that only can set mem_lvl_num and not set mem_lvl? > If we're hitting in the unified L3 cache, we aren't actually snooping anything > which is why I didn't set mem_snoop here. I am a bit suspicious about this clarification. If the system cache is in the cache coherency domain, then snooping occurs on it; in other words, if system cache is connected with a bus (like CCI or CMN), and the bus can help for data consistency, I prefer to set SNOOP_HIT flag. Could you confirm this? > > > + /* > > > + * We don't know what level it hit in, except it came from the other > > > + * socket > > > + */ > > > + case ARM_SPE_NV_REMOTE: > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > > > + break; > > > > The type ARM_SPE_NV_REMOTE is a snooping operation and it can happen > > in any cache levels in remote chip: > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT; > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > Ok. > > > > > > + case ARM_SPE_NV_DRAM: > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > > > + break; > > > > We can set snoop as PERF_MEM_SNOOP_MISS for DRAM data source: > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT; > > data_src->mem_snoop = PERF_MEM_SNOOP_MISS; > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; > > > > Ok. Thanks a lot for your work! Leo [1] https://en.wikipedia.org/wiki/MOSI_protocol
Hi Leo, Thanks for the additional comments. On Sun, 27 Mar 2022 07:09:19 +0000, Leo Yan wrote: [snip] > > > > Please see below comments. > > > > > > > + switch (record->source) { > > > > + case ARM_SPE_NV_L1D: > > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > > + break; > > > > > > I think we need to set the field 'mem_snoop' for L1 cache hit: > > > > > > data_src->mem_snoop = PERF_MEM_SNOOP_NONE; > > > > > > For L1 cache hit, it doesn't involve snooping. > > > > I can't find a precise definition for SNOOP_NONE, but it seemed as though > > this would be used for cases where a snoop could have occurred but didn't > > not for accesses that by definition don't snoop? I'm happy with either way, > > perhaps i just read more into it. > > I have the same understanding with you that "this would be used for > cases where a snoop could have occurred but didn't not for accesses > that by definition don't snoop". > > If we refer to PEBS's data source type 01H: "Minimal latency core cache > hit. This request was satisfied by the L1 data cache" and x86 sets > SNOOP_NONE as the snoop type for this case. I'm happy to set it if everyone believes it's the right thing. [snip] > > Same comment as above. > > > > > > + case ARM_SPE_NV_PEER_CORE: > > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > > > > Peer core contains its local L1 cache, so I think we can set the > > > memory level L1 to indicate this case. > > It could be either the L1 or the L2. All the neoverse cores have private L2 > > caches and we don't know. > > How about set both L1 and L2 cache together for this case? > > Although 'L1 | L2' cannot tell the exact cache level, I think it's > better than use ANY_CACHE, at least this can help us to distinguish > from other data source types if we avoid to use ANY_CACHE for all of > them. 
This seems much more confusing then the ambiguity of where the line came from and is only possible with the deprecated mem_lvl enconding. It will make perf_mem__lvl_scnprintf() print the wrong thing and anyone who is trying to attribute a line to a single cache will find that the sum of the number of hits is greater than the number of accesses which also seems terribly confusing. > > > > For this data source type and below types, though they indicate > > > the snooping happens, but it doesn't mean the data in the cache line > > > is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally > > > think this will mislead users when report the result. > > > > I'm of the opposite opinion. If the data wasn't modified, it will likely be > > found in the lower-level shared cache and the transaction wouldn't require a > > cache-to-cache transfer of the modified data, so the most common case when we > > source a line out of another cores cache will be if it was "modifiable" in that > > cache. > > Let's still use MOSI protocol as example. I think there have two > cases: on case is the peer cache line is in 'Shared' state and another > case is the peer cache line is in 'Owned' state. > > Quotes the wiki page for these two cases: > > "When the cache block is in the Shared (S) state and there is a > snooped bus read (BusRd) transaction, then the block stays in the same > state and generates no more transactions as all the cache blocks have > the same value including the main memory and it is only being read, > not written into." > > "While in the Owner (O) state and there is a snooped read request > (BusRd), the block remains in the same state while flushing (Flush) > the data for the other processor to read from it." > > Seems to me, it's reasonable to set HTIM flag when the snooping happens > for the cache line line is in the Modified (M) state. > > Again, my comment is based on the literal understanding; so please > correct if have any misunderstanding at here. 
Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except where sharing patterns are detected," so if a line is shared among caches it will likely also be in the SLC (and thus we'd get an L3 hit). If there is one copy of the cache line and that cache line needs to transition from one core to another core, I don't believe it matters if it was truly modified or not because the core already had permission to modify it and the transaction is just as costly (ping ponging between core caches). This is the one thing I really want to be able to uniquely identify as any cacheline doing this that isn't a lock is a place where optimization is likely possible. > > > > I prefer we set below fields for ARM_SPE_NV_PEER_CORE: > > > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1; > > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > > > > > + break; > > > > + /* > > > > + * We don't know if this is L1, L2 but we do know it was a cache-2-cache > > > > + * transfer, so set SNOOP_HITM > > > > + */ > > > > + case ARM_SPE_NV_LCL_CLSTR: > > > > > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in > > > the cluster level, it should happen in L2 cache: > > > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2; > > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > > > We don't know if this is coming from the cluster cache, or the private L1 or L2 > > core caches. The description above about why we'll be transferring the line from > > cache-to-cache applies here too. > > I think a minor difference between PEER_CORE and LCL_CLSTR is: > if data source is PEER_CORE, the snooping happens on the peer core's > local cache (Core's L1 or Core's L2 when core contains L2); for the data > source LCL_CLSTR, the snooping occurs on the cluster level's cache > (cluster's L2 cache or cluster's L3 cache when cluster contains L3). 
> > So can we set both 'L2 | L3' for LCL_CLSTR case? Just as above, I believe setting two options will lead to more confusion. Additionally, we agreed in the previous discussion that the system cache shall be the L3, so i don't see how this helps alleviate any confusion. That said, the systems I'm most familiar with never set LCL_CLSTR as a source. > > > > + case ARM_SPE_NV_PEER_CLSTR: > > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > > + break; > > > > > > This type can snoop from L1 or L2 cache in the peer cluster, so it > > > makes sense to set cache level as PERF_MEM_LVLNUM_ANY_CACHE. But here > > > should use the snoop type PERF_MEM_SNOOP_HIT, so: > > > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT > > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > > Given that we agreed to only focus on the three levels generally used by > > the existing implementations LCL and PEER should be the same for now. > > For PEER_CLSTR, we don't know the snooping happening on CPU's private > cache or cluster's shared cache, this is why we should use ANY_CACHE > for cache level. > > But LCL_CLSTR is different from PEER_CLSTR, LCL_CLSTR indicates the > snooping on the local cluster's share cache, we set 'L2 | L3' for it; > therefore, we can distinguish between these two cases. For the same reasons above, I believe we should only set a single value. 
> > > > > + /* > > > > + * System cache is assumed to be L3 > > > > + */ > > > > + case ARM_SPE_NV_SYS_CACHE: > > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > > > + break; > > > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L3; > > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; > > > > I don't think we should set both the deprecated mem_lvl and the mem_lvl_num. > > Is this because the decoding flow has any specific requirement that > only can set mem_lvl_num and not set mem_lvl? As one example, perf_mem__lvl_scnprintf() breaks if both are set. > > > If we're hitting in the unified L3 cache, we aren't actually snooping anything > > which is why I didn't set mem_snoop here. > > I am a bit suspecious for the clarification. If the system cache is in > the cache conhernecy domain, then snooping occurs on it; in other words, > if system cache is connected with a bus (like CCI or CMN), and the bus > can help for data consistency, I prefer to set SNOOP_HIT flag. > > Could you confirm for this? Thinking about this more that seems reasonable. Thanks, Ali
Hi Ali, On Mon, Mar 28, 2022 at 03:08:05AM +0000, Ali Saidi wrote: [...] > > > > > + case ARM_SPE_NV_PEER_CORE: > > > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > > > > > > Peer core contains its local L1 cache, so I think we can set the > > > > memory level L1 to indicate this case. > > > It could be either the L1 or the L2. All the neoverse cores have private L2 > > > caches and we don't know. > > > > How about set both L1 and L2 cache together for this case? > > > > Although 'L1 | L2' cannot tell the exact cache level, I think it's > > better than use ANY_CACHE, at least this can help us to distinguish > > from other data source types if we avoid to use ANY_CACHE for all of > > them. > > This seems much more confusing then the ambiguity of where the line came from > and is only possible with the deprecated mem_lvl enconding. It will make > perf_mem__lvl_scnprintf() print the wrong thing and anyone who is trying to > attribute a line to a single cache will find that the sum of the number of hits > is greater than the number of accesses which also seems terribly confusing. Understand. I considered the potential issue for perf_mem__lvl_scnprintf(), actually it supports printing multipl cache levels for 'mem_lvl', by conjuncting with 'or' it composes the multiple cache levels. We might need to extend a bit for another field 'mem_lvlnum'. Agreed that there would have inaccurate issue for statistics, it's fine for me to use ANY_CACHE in this patch set. I still think we should consider to extend the memory levels to demonstrate clear momory hierarchy on Arm archs, I personally like the definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", though these cache levels are not precise like L1/L2/L3 levels, they can help us to map very well for the cache topology on Arm archs and without any confusion. 
We could take this as an enhancement if you don't want to bother the current patch set's upstreaming. > > > > For this data source type and below types, though they indicate > > > > the snooping happens, but it doesn't mean the data in the cache line > > > > is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally > > > > think this will mislead users when report the result. > > > > > > I'm of the opposite opinion. If the data wasn't modified, it will likely be > > > found in the lower-level shared cache and the transaction wouldn't require a > > > cache-to-cache transfer of the modified data, so the most common case when we > > > source a line out of another cores cache will be if it was "modifiable" in that > > > cache. > > > > Let's still use MOSI protocol as example. I think there have two > > cases: on case is the peer cache line is in 'Shared' state and another > > case is the peer cache line is in 'Owned' state. > > > > Quotes the wiki page for these two cases: > > > > "When the cache block is in the Shared (S) state and there is a > > snooped bus read (BusRd) transaction, then the block stays in the same > > state and generates no more transactions as all the cache blocks have > > the same value including the main memory and it is only being read, > > not written into." > > > > "While in the Owner (O) state and there is a snooped read request > > (BusRd), the block remains in the same state while flushing (Flush) > > the data for the other processor to read from it." > > > > Seems to me, it's reasonable to set HTIM flag when the snooping happens > > for the cache line line is in the Modified (M) state. > > > > Again, my comment is based on the literal understanding; so please > > correct if have any misunderstanding at here. 
> > Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except > where sharing patterns are detected," so if a line is shared among caches it > will likely also be in the SLC (and thus we'd get an L3 hit). > > If there is one copy of the cache line and that cache line needs to transition > from one core to another core, I don't believe it matters if it was truly > modified or not because the core already had permission to modify it and the > transaction is just as costly (ping ponging between core caches). This is the > one thing I really want to be able to uniquely identify as any cacheline doing > this that isn't a lock is a place where optimization is likely possible. I understood that your point that if big amount of transitions within multiple cores hit the same cache line, it would be very likely caused by the cache line's Modified state so we set PERF_MEM_SNOOP_HITM flag for easier reviewing. Alternatively, I think it's good to pick up the patch series "perf c2c: Sort cacheline with all loads" [1], rather than relying on HITM tag, the patch series extends a new option "-d all" for perf c2c, so it displays the suspecious false sharing cache lines based on load/store ops and thread infos. The main reason for holding on th patch set is due to we cannot verify it with Arm SPE at that time point, as the time being Arm SPE trace data was absent both store ops and data source packets. I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can upstream the patch series "perf c2c: Sort cacheline with all loads" (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches have been merged in the mainline kernel). If this is fine for you, I can respin the patch series for "perf c2c". Or any other thoughts? 
[1] https://lore.kernel.org/lkml/20201213133850.10070-1-leo.yan@linaro.org/ > > > > I prefer we set below fields for ARM_SPE_NV_PEER_CORE: > > > > > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L1; > > > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; > > > > > > > > > + break; > > > > > + /* > > > > > + * We don't know if this is L1, L2 but we do know it was a cache-2-cache > > > > > + * transfer, so set SNOOP_HITM > > > > > + */ > > > > > + case ARM_SPE_NV_LCL_CLSTR: > > > > > > > > For ARM_SPE_NV_LCL_CLSTR, it fetches the data from the shared cache in > > > > the cluster level, it should happen in L2 cache: > > > > > > > > data_src->mem_lvl = PERF_MEM_LVL_HIT | PERF_MEM_LVL_L2; > > > > data_src->mem_snoop = PERF_MEM_SNOOP_HIT; > > > > data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; > > > > > > We don't know if this is coming from the cluster cache, or the private L1 or L2 > > > core caches. The description above about why we'll be transferring the line from > > > cache-to-cache applies here too. > > > > I think a minor difference between PEER_CORE and LCL_CLSTR is: > > if data source is PEER_CORE, the snooping happens on the peer core's > > local cache (Core's L1 or Core's L2 when core contains L2); for the data > > source LCL_CLSTR, the snooping occurs on the cluster level's cache > > (cluster's L2 cache or cluster's L3 cache when cluster contains L3). > > > > So can we set both 'L2 | L3' for LCL_CLSTR case? > > Just as above, I believe setting two options will lead to more confusion. > Additionally, we agreed in the previous discussion that the system cache shall > be the L3, so i don't see how this helps alleviate any confusion. That said, the > systems I'm most familiar with never set LCL_CLSTR as a source. Okay, let's rollback to PERF_MEM_LVLNUM_ANY_CACHE as cache level for LCL_CLSTR and PEER_CLSTR. Thanks, Leo
Hi Leo, Ali, Thank you for your great work and valuable discussion. 在 2022/3/27 AM3:43, Ali Saidi 写道:> Hi Leo, > On Sat, 26 Mar 2022 21:47:54 +0800, Leo Yan wrote: >> Hi Ali, German, >> >> On Thu, Mar 24, 2022 at 06:33:21PM +0000, Ali Saidi wrote: >> >> [...] >> >>> +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, >>> + union perf_mem_data_src *data_src) >>> { >>> - union perf_mem_data_src data_src = { 0 }; >>> + /* >>> + * Even though four levels of cache hierarchy are possible, no known >>> + * production Neoverse systems currently include more than three levels >>> + * so for the time being we assume three exist. If a production system >>> + * is built with four the this function would have to be changed to >>> + * detect the number of levels for reporting. >>> + */ >>> >>> - if (record->op == ARM_SPE_LD) >>> - data_src.mem_op = PERF_MEM_OP_LOAD; >>> - else >>> - data_src.mem_op = PERF_MEM_OP_STORE; >> >> Firstly, apologize that I didn't give clear idea when Ali sent patch sets >> v2 and v3. >> >> IMHO, we need to consider two kinds of information which can guide us >> for a reliable implementation. The first thing is to summarize the data >> source configuration for x86 PEBS, we can dive in more details for this >> part; the second thing is we can refer to the AMBA architecture document >> ARM IHI 0050E.b, section 11.1.2 'Crossing a chip-to-chip interface' and >> its sub section 'Suggested DataSource values', which would help us >> much for mapping the cache topology to Arm SPE data source. >> >> As a result, I summarized the data source configurations for PEBS and >> Arm SPE Neoverse in the spreadsheet: >> https://docs.google.com/spreadsheets/d/11YmjG0TyRjH7IXgvsREFgTg3AVtxh2dvLloRK1EdNjU/edit?usp=sharing > > Thanks for putting this together and digging into the details, but you're making > assumptions in neoverse data sources about the core configurations that aren't > correct. 
The Neoverse cores have all have integrated L1 and L2 cache, so if the > line is coming from a peer-core we don't know which level it's actually coming > from. Similarly, if it's coming from a local cluster, that could mean a cluster > l3, but it's not the L2. As far as I know, Neoverse N2 microarchitecture L3 Cache is non-inclusive, and L1 and L2 are strictly inclusive, like Intel Skylake SP (SKX), i.e., the L2 may or may not be in the L3 (no guarantee is made). That is to say, we can not tell it is from cluster L2 or L3. Could you confirm this? [...] > I still think we should consider to extend the memory levels to > demonstrate clear momory hierarchy on Arm archs, I personally like the > definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", > though these cache levels are not precise like L1/L2/L3 levels, they can > help us to map very well for the cache topology on Arm archs and without > any confusion. We could take this as an enhancement if you don't want > to bother the current patch set's upstreaming. Agree. In my opinion, imprecise cache levels can lead to wrong conclusions. "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE" are more intuitive. Best Regards, Shuai
Hi Leo, On Mon, 28 Mar 2022 21:05:47 +0800, Leo Yan wrote: > Hi Ali, > > On Mon, Mar 28, 2022 at 03:08:05AM +0000, Ali Saidi wrote: > > [...] > > > > > > > + case ARM_SPE_NV_PEER_CORE: > > > > > > + data_src->mem_lvl = PERF_MEM_LVL_HIT; > > > > > > + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; > > > > > > + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; > > > > > > > > > > Peer core contains its local L1 cache, so I think we can set the > > > > > memory level L1 to indicate this case. > > > > It could be either the L1 or the L2. All the neoverse cores have private L2 > > > > caches and we don't know. > > > > > > How about set both L1 and L2 cache together for this case? > > > > > > Although 'L1 | L2' cannot tell the exact cache level, I think it's > > > better than use ANY_CACHE, at least this can help us to distinguish > > > from other data source types if we avoid to use ANY_CACHE for all of > > > them. > > > > This seems much more confusing then the ambiguity of where the line came from > > and is only possible with the deprecated mem_lvl enconding. It will make > > perf_mem__lvl_scnprintf() print the wrong thing and anyone who is trying to > > attribute a line to a single cache will find that the sum of the number of hits > > is greater than the number of accesses which also seems terribly confusing. > > Understand. I considered the potential issue for > perf_mem__lvl_scnprintf(), actually it supports printing multipl cache > levels for 'mem_lvl', by conjuncting with 'or' it composes the multiple > cache levels. We might need to extend a bit for another field > 'mem_lvlnum'. > > Agreed that there would have inaccurate issue for statistics, it's fine > for me to use ANY_CACHE in this patch set. Thanks! 
> > I still think we should consider to extend the memory levels to > demonstrate clear momory hierarchy on Arm archs, I personally like the > definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", > though these cache levels are not precise like L1/L2/L3 levels, they can > help us to map very well for the cache topology on Arm archs and without > any confusion. We could take this as an enhancement if you don't want > to bother the current patch set's upstreaming. I'd like to do this in a separate patch, but I have one other proposal. The Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, it's also in the L2. Given that the Graviton systems and afaik the Ampere systems don't have any cache between the L2 and the SLC, thus anything from PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we should just set L2 for these cases? German, are you good with this for now? > > > > > For this data source type and below types, though they indicate > > > > > the snooping happens, but it doesn't mean the data in the cache line > > > > > is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally > > > > > think this will mislead users when report the result. > > > > > > > > I'm of the opposite opinion. If the data wasn't modified, it will likely be > > > > found in the lower-level shared cache and the transaction wouldn't require a > > > > cache-to-cache transfer of the modified data, so the most common case when we > > > > source a line out of another cores cache will be if it was "modifiable" in that > > > > cache. > > > > > > Let's still use MOSI protocol as example. I think there have two > > > cases: on case is the peer cache line is in 'Shared' state and another > > > case is the peer cache line is in 'Owned' state. 
> > > > > > Quotes the wiki page for these two cases: > > > > > > "When the cache block is in the Shared (S) state and there is a > > > snooped bus read (BusRd) transaction, then the block stays in the same > > > state and generates no more transactions as all the cache blocks have > > > the same value including the main memory and it is only being read, > > > not written into." > > > > > > "While in the Owner (O) state and there is a snooped read request > > > (BusRd), the block remains in the same state while flushing (Flush) > > > the data for the other processor to read from it." > > > > > > Seems to me, it's reasonable to set HTIM flag when the snooping happens > > > for the cache line line is in the Modified (M) state. > > > > > > Again, my comment is based on the literal understanding; so please > > > correct if have any misunderstanding at here. > > > > Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except > > where sharing patterns are detected," so if a line is shared among caches it > > will likely also be in the SLC (and thus we'd get an L3 hit). > > > > If there is one copy of the cache line and that cache line needs to transition > > from one core to another core, I don't believe it matters if it was truly > > modified or not because the core already had permission to modify it and the > > transaction is just as costly (ping ponging between core caches). This is the > > one thing I really want to be able to uniquely identify as any cacheline doing > > this that isn't a lock is a place where optimization is likely possible. > > I understood that your point that if big amount of transitions within > multiple cores hit the same cache line, it would be very likely caused > by the cache line's Modified state so we set PERF_MEM_SNOOP_HITM flag > for easier reviewing. And that from a dataflow perspective if the line is owned (and could be modifiable) vs. was actually modified is really the less interesting bit. 
> Alternatively, I think it's good to pick up the patch series "perf c2c: > Sort cacheline with all loads" [1], rather than relying on HITM tag, the > patch series extends a new option "-d all" for perf c2c, so it displays > the suspecious false sharing cache lines based on load/store ops and > thread infos. The main reason for holding on th patch set is due to we > cannot verify it with Arm SPE at that time point, as the time being Arm > SPE trace data was absent both store ops and data source packets. Looking at examples I don't, at least from my system, data-source isn't set for stores, only for loads. > I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can > upstream the patch series "perf c2c: Sort cacheline with all loads" > (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches > have been merged in the mainline kernel). > > If this is fine for you, I can respin the patch series for "perf c2c". > Or any other thoughts? I think this is a nice option to have in the tool-box, but from my point of view, I'd like someone who is familiar with c2c output on x86 to come to an arm64 system and be able to zero in on a ping-ponging line like they would otherwise. Highlighting a line that is moving between cores frequently which is likely in the exclusive state by tagging it an HITM accomplishes this and will make it easier to find these cases. Your approach also has innaccurancies and wouldn't be able to differentiate between core X accessing a line a lot followed by core Y acessing a line alot vs the cores ping-ponging. Yes, I agree that we will "overcount" HITM, but I don't think this is particularly bad and it does specifically highlight the core-2-core transfers that are likely a performance issue easily and it will result in easier identification of areas of false or true sharing and improve performance. Thanks, Ali
Hi Ali, On Tue, Mar 29, 2022 at 02:32:14PM +0000, Ali Saidi wrote: [...] > > I still think we should consider to extend the memory levels to > > demonstrate clear momory hierarchy on Arm archs, I personally like the > > definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", > > though these cache levels are not precise like L1/L2/L3 levels, they can > > help us to map very well for the cache topology on Arm archs and without > > any confusion. We could take this as an enhancement if you don't want > > to bother the current patch set's upstreaming. > > I'd like to do this in a separate patch, but I have one other proposal. The > Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, > it's also in the L2. Given that the Graviton systems and afaik the Ampere > systems don't have any cache between the L2 and the SLC, thus anything from > PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we > should just set L2 for these cases? German, are you good with this for now? If we use a single cache level (no matterh it's L2 or ANY_CACHE) for these data sources, it's hard for users to understand what's the cost for the memory operations. So here I suggested for these new cache levels is not only about cache level, it's more about the information telling the memory operation's cost. [...] > > Alternatively, I think it's good to pick up the patch series "perf c2c: > > Sort cacheline with all loads" [1], rather than relying on HITM tag, the > > patch series extends a new option "-d all" for perf c2c, so it displays > > the suspecious false sharing cache lines based on load/store ops and > > thread infos. The main reason for holding on th patch set is due to we > > cannot verify it with Arm SPE at that time point, as the time being Arm > > SPE trace data was absent both store ops and data source packets. > > Looking at examples I don't, at least from my system, data-source isn't set for > stores, only for loads. Ouch ... 
If data source is not set for store operation, then all store samples will absent cache level info. Or should we set ANY_CACHE as cache level for store operations? > > I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can > > upstream the patch series "perf c2c: Sort cacheline with all loads" > > (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches > > have been merged in the mainline kernel). > > > > If this is fine for you, I can respin the patch series for "perf c2c". > > Or any other thoughts? > > I think this is a nice option to have in the tool-box, but from my point of > view, I'd like someone who is familiar with c2c output on x86 to come to an > arm64 system and be able to zero in on a ping-ponging line like they would > otherwise. Highlighting a line that is moving between cores frequently which is > likely in the exclusive state by tagging it an HITM accomplishes this and will > make it easier to find these cases. Your approach also has innaccurancies and > wouldn't be able to differentiate between core X accessing a line a lot followed > by core Y acessing a line alot vs the cores ping-ponging. Yes, I agree that we > will "overcount" HITM, but I don't think this is particularly bad and it does > specifically highlight the core-2-core transfers that are likely a performance > issue easily and it will result in easier identification of areas of false or > true sharing and improve performance. I don't want to block this patch set by this part, and either I don't want to introduce any confusion for later users, especially I think users who in later use this tool but it's hard for them to be aware any assumptions in this discussion thread. So two options would be fine for me: Option 1: if you and Arm mates can confirm that inaccuracy caused by setting HITM is low (e.g. 2%-3% inaccuracy that introduced by directly set HITM), I think this could be acceptable. Otherwise, please consider option 2. 
Option 2: by default we set PERF_MEM_SNOOP_HIT flag since now actually we have no info to support HITM. Then use a new patch to add an extra option (say '--coarse-hitm') for 'perf c2c' tool, a user can explicitly specify this option for 'perf c2c' command; when a user specifies this option it means that the user understands and accepts the inaccuracy of forcing the use of the PERF_MEM_SNOOP_HITM flag. I think you could refer to the option '--stitch-lbr' for adding an option for 'perf c2c' tool. Thanks, Leo
Hi all, It seems I gave the Review tags a bit too early this time. Apologies for the inconvenience. Indeed there was more interesting discussions to be had :) (Probably best to remove by tags for the next re-spin) On 29/03/2022 15:32, Ali Saidi wrote: > [...] > >> I still think we should consider to extend the memory levels to >> demonstrate clear momory hierarchy on Arm archs, I personally like the >> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", >> though these cache levels are not precise like L1/L2/L3 levels, they can >> help us to map very well for the cache topology on Arm archs and without >> any confusion. We could take this as an enhancement if you don't want >> to bother the current patch set's upstreaming. > I'd like to do this in a separate patch, but I have one other proposal. The > Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, > it's also in the L2. Given that the Graviton systems and afaik the Ampere > systems don't have any cache between the L2 and the SLC, thus anything from > PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we > should just set L2 for these cases? German, are you good with this for now? Sorry for the delay. I'd like to also check this with someone. I'll try to get back asap. In the meantime, if this approach is also OK with Leo, I think it would be fine by me. Thanks, German >>>>>> For this data source type and below types, though they indicate >>>>>> the snooping happens, but it doesn't mean the data in the cache line >>>>>> is in 'modified' state. If set flag PERF_MEM_SNOOP_HITM, I personally >>>>>> think this will mislead users when report the result. >>>>> I'm of the opposite opinion. 
If the data wasn't modified, it will likely be >>>>> found in the lower-level shared cache and the transaction wouldn't require a >>>>> cache-to-cache transfer of the modified data, so the most common case when we >>>>> source a line out of another cores cache will be if it was "modifiable" in that >>>>> cache. >>>> Let's still use MOSI protocol as example. I think there have two >>>> cases: on case is the peer cache line is in 'Shared' state and another >>>> case is the peer cache line is in 'Owned' state. >>>> >>>> Quotes the wiki page for these two cases: >>>> >>>> "When the cache block is in the Shared (S) state and there is a >>>> snooped bus read (BusRd) transaction, then the block stays in the same >>>> state and generates no more transactions as all the cache blocks have >>>> the same value including the main memory and it is only being read, >>>> not written into." >>>> >>>> "While in the Owner (O) state and there is a snooped read request >>>> (BusRd), the block remains in the same state while flushing (Flush) >>>> the data for the other processor to read from it." >>>> >>>> Seems to me, it's reasonable to set HTIM flag when the snooping happens >>>> for the cache line line is in the Modified (M) state. >>>> >>>> Again, my comment is based on the literal understanding; so please >>>> correct if have any misunderstanding at here. >>> Per the CMN TRM, "The SLC allocation policy is exclusive for data lines, except >>> where sharing patterns are detected," so if a line is shared among caches it >>> will likely also be in the SLC (and thus we'd get an L3 hit). >>> >>> If there is one copy of the cache line and that cache line needs to transition >>> from one core to another core, I don't believe it matters if it was truly >>> modified or not because the core already had permission to modify it and the >>> transaction is just as costly (ping ponging between core caches). 
This is the >>> one thing I really want to be able to uniquely identify as any cacheline doing >>> this that isn't a lock is a place where optimization is likely possible. >> I understood that your point that if big amount of transitions within >> multiple cores hit the same cache line, it would be very likely caused >> by the cache line's Modified state so we set PERF_MEM_SNOOP_HITM flag >> for easier reviewing. > And that from a dataflow perspective if the line is owned (and could be > modifiable) vs. was actually modified is really the less interesting bit. > >> Alternatively, I think it's good to pick up the patch series "perf c2c: >> Sort cacheline with all loads" [1], rather than relying on HITM tag, the >> patch series extends a new option "-d all" for perf c2c, so it displays >> the suspecious false sharing cache lines based on load/store ops and >> thread infos. The main reason for holding on th patch set is due to we >> cannot verify it with Arm SPE at that time point, as the time being Arm >> SPE trace data was absent both store ops and data source packets. > Looking at examples I don't, at least from my system, data-source isn't set for > stores, only for loads. > >> I perfer to set PERF_MEM_SNOOP_HIT flag in this patch set and we can >> upstream the patch series "perf c2c: Sort cacheline with all loads" >> (only needs upstreaming patches 01, 02, 03, 10, 11, the rest patches >> have been merged in the mainline kernel). >> >> If this is fine for you, I can respin the patch series for "perf c2c". >> Or any other thoughts? > I think this is a nice option to have in the tool-box, but from my point of > view, I'd like someone who is familiar with c2c output on x86 to come to an > arm64 system and be able to zero in on a ping-ponging line like they would > otherwise. Highlighting a line that is moving between cores frequently which is > likely in the exclusive state by tagging it an HITM accomplishes this and will > make it easier to find these cases. 
Your approach also has innaccurancies and > wouldn't be able to differentiate between core X accessing a line a lot followed > by core Y acessing a line alot vs the cores ping-ponging. Yes, I agree that we > will "overcount" HITM, but I don't think this is particularly bad and it does > specifically highlight the core-2-core transfers that are likely a performance > issue easily and it will result in easier identification of areas of false or > true sharing and improve performance. > > Thanks, > Ali
On Thu, Mar 31, 2022 at 01:28:58PM +0100, German Gomez wrote: > Hi all, > > It seems I gave the Review tags a bit too early this time. Apologies for > the inconvenience. Indeed there was more interesting discussions to be > had :) > > (Probably best to remove by tags for the next re-spin) Now worries, German. Your review and testing are very helpful :) > On 29/03/2022 15:32, Ali Saidi wrote: > > [...] > > > >> I still think we should consider to extend the memory levels to > >> demonstrate clear momory hierarchy on Arm archs, I personally like the > >> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", > >> though these cache levels are not precise like L1/L2/L3 levels, they can > >> help us to map very well for the cache topology on Arm archs and without > >> any confusion. We could take this as an enhancement if you don't want > >> to bother the current patch set's upstreaming. > > I'd like to do this in a separate patch, but I have one other proposal. The > > Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, > > it's also in the L2. Given that the Graviton systems and afaik the Ampere > > systems don't have any cache between the L2 and the SLC, thus anything from > > PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we > > should just set L2 for these cases? German, are you good with this for now? > > Sorry for the delay. I'd like to also check this with someone. I'll try > to get back asap. In the meantime, if this approach is also OK with Leo, > I think it would be fine by me. Thanks for the checking internally. 
Let me just bring up another thought of mine (sorry that my suggestion keeps floating): another choice is to set ANY_CACHE as the cache level if we are not certain of the cache level, and extend the snoop field to indicate the snooping logic, like: PERF_MEM_SNOOP_PEER_CORE PERF_MEM_SNOOP_LCL_CLSTR PERF_MEM_SNOOP_PEER_CLSTR Seems to me, doing this is not only about the cache level; it's more important for users to know the varying cost of involving different snooping logic. Thanks, Leo
On Thu, 31 Mar 2022 12:44:3 +0000, Leo Yan wrote: > > On Thu, Mar 31, 2022 at 01:28:58PM +0100, German Gomez wrote: > > Hi all, > > > > It seems I gave the Review tags a bit too early this time. Apologies for > > the inconvenience. Indeed there was more interesting discussions to be > > had :) > > > > (Probably best to remove by tags for the next re-spin) > > Now worries, German. Your review and testing are very helpful :) > > > On 29/03/2022 15:32, Ali Saidi wrote: > > > [...] > > > > > >> I still think we should consider to extend the memory levels to > > >> demonstrate clear momory hierarchy on Arm archs, I personally like the > > >> definitions for "PEER_CORE", "LCL_CLSTR", "PEER_CLSTR" and "SYS_CACHE", > > >> though these cache levels are not precise like L1/L2/L3 levels, they can > > >> help us to map very well for the cache topology on Arm archs and without > > >> any confusion. We could take this as an enhancement if you don't want > > >> to bother the current patch set's upstreaming. > > > I'd like to do this in a separate patch, but I have one other proposal. The > > > Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, > > > it's also in the L2. Given that the Graviton systems and afaik the Ampere > > > systems don't have any cache between the L2 and the SLC, thus anything from > > > PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we > > > should just set L2 for these cases? German, are you good with this for now? > > > > Sorry for the delay. I'd like to also check this with someone. I'll try > > to get back asap. In the meantime, if this approach is also OK with Leo, > > I think it would be fine by me. > > Thanks for the checking internally. 
Let me just bring up my another > thinking (sorry that my suggestion is float): another choice is we set > ANY_CACHE as cache level if we are not certain the cache level, and > extend snoop field to indicate the snooping logics, like: > > PERF_MEM_SNOOP_PEER_CORE > PERF_MEM_SNOOP_LCL_CLSTR > PERF_MEM_SNOOP_PEER_CLSTR > > Seems to me, we doing this is not only for cache level, it's more > important for users to know the variant cost for involving different > snooping logics. I think we've come full circle :). Going back to what do we want to indicate to a user about the source of the cache line, I believe there are three things with an eye toward helping a user of the data improve the performance of their application: 1. The level below them in the hierarchy it it (L1, L2, LLC, local DRAM). Depending on the level this directly indicates the expense of the operation. 2. If it came from a peer of theirs on the same socket. I'm really of the opinion still that exactly which peer, doesn't matter much as it's a 2nd or 3rd order concern compared to, it it couldn't be sourced from a cache level below the originating core, had to come from a local peer and the request went to that lower levels and was eventually sourced from a peer. Why it was sourced from the peer is still almost irrelevant to me. If it was truly modified or the core it was sourced from only had permission to modify it the snoop filter doesn't necessarily need to know the difference and the outcome is the same. 3. For multi-socket systems that it came from a different socket and there it is probably most interesting if it came from DRAM on the remote socket or a cache. I'm putting 3 aside for now since we've really been focusing on 1 and 2 in this discussion and I think the biggest hangup has been the definition of HIT vs HITM. 
If someone has a precise definition, that would be great, but AFAIK it goes back to the P6 bus where HIT was asserted by another core if it had a line (in any state) and HITM was additionally asserted if a core needed to inhibit another device (e.g. DDR controller) from providing that line to the requestor. The latter logic is why I think it's perfectly acceptable to use HITM to indicate a peer cache-to-cache transfer, however since others don't feel that way let me propose a single additional snooping type PERF_MEM_SNOOP_PEER that indicates some peer of the hierarchy below the originating core sourced the data. This clears up the definition that the line came from a peer and may or may not have been modified, but it doesn't add a lot of implementation-dependent functionality into the SNOOP API. We could use the mem-level to indicate the level of the cache hierarchy we had to get to before the snoop traveled upward, which seems like what x86 is doing here. PEER_CORE -> MEM_SNOOP_PEER + L2 PEER_CLSTR -> MEM_SNOOP_PEER + L3 PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support the clusters and the existing commercial implementations don't have them). Thanks, Ali
On Sun, Apr 03, 2022 at 08:33:37PM +0000, Ali Saidi wrote: [...] > > Let me just bring up my another > > thinking (sorry that my suggestion is float): another choice is we set > > ANY_CACHE as cache level if we are not certain the cache level, and > > extend snoop field to indicate the snooping logics, like: > > > > PERF_MEM_SNOOP_PEER_CORE > > PERF_MEM_SNOOP_LCL_CLSTR > > PERF_MEM_SNOOP_PEER_CLSTR > > > > Seems to me, we doing this is not only for cache level, it's more > > important for users to know the variant cost for involving different > > snooping logics. > > I think we've come full circle :). Not too bad, and I learned a lot :) > Going back to what do we want to indicate to > a user about the source of the cache line, I believe there are three things with > an eye toward helping a user of the data improve the performance of their > application: Thanks a lot for summary! > 1. The level below them in the hierarchy it it (L1, L2, LLC, local DRAM). > Depending on the level this directly indicates the expense of the operation. > > 2. If it came from a peer of theirs on the same socket. I'm really of the > opinion still that exactly which peer, doesn't matter much as it's a 2nd or 3rd > order concern compared to, it it couldn't be sourced from a cache level below > the originating core, had to come from a local peer and the request went to > that lower levels and was eventually sourced from a peer. Why it was sourced > from the peer is still almost irrelevant to me. If it was truly modified or the > core it was sourced from only had permission to modify it the snoop filter > doesn't necessarily need to know the difference and the outcome is the same. 
I think here the key information delivered is: For the peer snooping, you think there has big cost difference between L2 cache snooping and L3 cache snooping; for L3 cache snooping, we don't care about it's an internal cluster snooping or external cluster snooping, and we have no enough info to reason snooping type (HIT vs HITM). > 3. For multi-socket systems that it came from a different socket and there it is > probably most interesting if it came from DRAM on the remote socket or a cache. > > I'm putting 3 aside for now since we've really been focusing on 1 and 2 in this > discussion and I think the biggest hangup has been the definition of HIT vs > HITM. Agree on the item 3. > If someone has a precise definition, that would be great, but AFAIK it > goes back to the P6 bus where HIT was asserted by another core if it had a line > (in any state) and HITM was additionally asserted if a core needed to inhibit > another device (e.g. DDR controller) from providing that line to the requestor. Thanks for sharing the info for how the bus implements HIT/HITM. > The latter logic is why I think it's perfectly acceptable to use HITM to > indicate a peer cache-to-cache transfer, however since others don't feel that way > let me propose a single additional snooping type PERF_MEM_SNOOP_PEER that > indicates some peer of the hierarchy below the originating core sourced the > data. This clears up the definition that line came from from a peer and may or > may not have been modified, but it doesn't add a lot of implementation dependant > functionality into the SNOOP API. > > We could use the mem-level to indicate the level of the cache hierarchy we had > to get to before the snoop traveled upward, which seems like what x86 is doing > here. It makes sense to me that to use the highest cache level as mem-level. Please add comments in the code for this, this would be useful for understanding the code. 
> PEER_CORE -> MEM_SNOOP_PEER + L2 > PEER_CLSTR -> MEM_SNOOP_PEER + L3 > PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support > the clusters and the existing commercial implementations don't have them). Generally, this idea is fine for me. Following your suggestion, if we connect the concepts PoC and PoU in Arm reference manual, we can extend the snooping mode with MEM_SNOOP_POU (for PoU) and MEM_SNOOP_POC (for PoC), so: PEER_CORE -> MEM_SNOOP_POU + L2 PEER_LCL_CLSTR -> MEM_SNOOP_POU + L3 PEER_CLSTR -> MEM_SNOOP_POC + L3 Seems to me, we could consider for this. If this is over complexity or even I said any wrong concepts for this, please use your method. Thanks, Leo
On Mon, 4 Apr 2022 15:12:18 +0000, Leo Yan wrote: > On Sun, Apr 03, 2022 at 08:33:37PM +0000, Ali Saidi wrote: [...] > > The latter logic is why I think it's perfectly acceptable to use HITM to > > indicate a peer cache-to-cache transfer, however since others don't feel that way > > let me propose a single additional snooping type PERF_MEM_SNOOP_PEER that > > indicates some peer of the hierarchy below the originating core sourced the > > data. This clears up the definition that line came from from a peer and may or > > may not have been modified, but it doesn't add a lot of implementation dependant > > functionality into the SNOOP API. > > > > We could use the mem-level to indicate the level of the cache hierarchy we had > > to get to before the snoop traveled upward, which seems like what x86 is doing > > here. > > It makes sense to me that to use the highest cache level as mem-level. > Please add comments in the code for this, this would be useful for > understanding the code. Ok. > > PEER_CORE -> MEM_SNOOP_PEER + L2 > > PEER_CLSTR -> MEM_SNOOP_PEER + L3 > > PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support > > the clusters and the existing commercial implementations don't have them). > > Generally, this idea is fine for me. Great. Now the next tricky thing. Since we're not using HITM for recording the memory events, the question becomes for the c2c output should we output the SNOOP_PEER events as if they are HITM events with a clarification in the perf-c2c man page or effectively duplicate all the lcl_hitm logic, which is a fair amount, in perf c2c to add a column and sort option? > Following your suggestion, if we connect the concepts PoC and PoU in Arm > reference manual, we can extend the snooping mode with MEM_SNOOP_POU > (for PoU) and MEM_SNOOP_POC (for PoC), so: > > PEER_CORE -> MEM_SNOOP_POU + L2 > PEER_LCL_CLSTR -> MEM_SNOOP_POU + L3 > PEER_CLSTR -> MEM_SNOOP_POC + L3 > > Seems to me, we could consider for this. 
If this is over complexity or > even I said any wrong concepts for this, please use your method. I think this adds a lot of complexity and reduces clarity. Some systems implement coherent icaches and the PoU would be the L1 cache, others don't so that would be the L2 (or wherever there is a unified cache). Similarly, with the point of coherency, some systems would consider that dram, but other systems have transparent LLCs and it would be the LLC. Thanks, Ali
Hi, On 31/03/2022 13:44, Leo Yan wrote: > [...] >>> I'd like to do this in a separate patch, but I have one other proposal. The >>> Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, >>> it's also in the L2. Given that the Graviton systems and afaik the Ampere >>> systems don't have any cache between the L2 and the SLC, thus anything from >>> PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we >>> should just set L2 for these cases? German, are you good with this for now? >> Sorry for the delay. I'd like to also check this with someone. I'll try >> to get back asap. In the meantime, if this approach is also OK with Leo, >> I think it would be fine by me. Sorry for the delay. Yeah setting it to L2 indeed looks reasonable for now. Somebody brought up the case of running SPE in a heterogeneous system, but also we think might be beyond the scope of this change. One very minor nit though. Would you be ok with renaming LCL to LOCAL and CLSTR to CLUSTER? I sometimes mistead the former as "LLC". > Thanks for the checking internally. Let me just bring up my another > thinking (sorry that my suggestion is float): another choice is we set > ANY_CACHE as cache level if we are not certain the cache level, and > extend snoop field to indicate the snooping logics, like: > > PERF_MEM_SNOOP_PEER_CORE > PERF_MEM_SNOOP_LCL_CLSTR > PERF_MEM_SNOOP_PEER_CLSTR > > Seems to me, we doing this is not only for cache level, it's more > important for users to know the variant cost for involving different > snooping logics. > > Thanks, > Leo I see there's been some more messages I need to catch up with. Is the intention to extend the PERF_MEM_* flags for this cset, or will it be left for a later change? In any case, I'd be keen to take another look at it and try to bring some more eyes into this. Thanks, German
On Wed, Apr 06, 2022 at 09:00:17PM +0000, Ali Saidi wrote: > On Mon, 4 Apr 2022 15:12:18 +0000, Leo Yan wrote: > > On Sun, Apr 03, 2022 at 08:33:37PM +0000, Ali Saidi wrote: [...] > > > PEER_CORE -> MEM_SNOOP_PEER + L2 > > > PEER_CLSTR -> MEM_SNOOP_PEER + L3 > > > PEER_LCL_CLSTR -> MEM_SNOOP_PEER + L3 (since newer neoverse cores don't support > > > the clusters and the existing commercial implementations don't have them). > > > > Generally, this idea is fine for me. > > Great. > > Now the next tricky thing. Since we're not using HITM for recording the memory > events, the question becomes for the c2c output should we output the SNOOP_PEER > events as if they are HITM events with a clarification in the perf-c2c man page > or effectively duplicate all the lcl_hitm logic, which is a fair amount, in > perf c2c to add a column and sort option? I think we need to handle both load and store operations in 'perf c2c' tool. For the load operation, in the 'cache line details' view, we need to support a 'snoop_peer' column; and since Arm SPE doesn't give any data source info for the store operation, my plan is to add an extra column 'Other' based on the two existing columns 'L1 Hit' and 'L1 Miss'. Could you leave this part for me? I will respin my patch set to extend 'perf c2c' for this (and hope it can support the old Arm SPE trace data). Please note, when you spin the new patch set, you need to take care of the store operations. In the current patch set, it will wrongly always set L1 hit for all store operations because the data source field is always zero. My understanding is for all store operations, we need to set the cache level as PERF_MEM_LVLNUM_ANY_CACHE and snoop type as PERF_MEM_SNOOP_NA.
> > Following your suggestion, if we connect the concepts PoC and PoU in Arm > > reference manual, we can extend the snooping mode with MEM_SNOOP_POU > > (for PoU) and MEM_SNOOP_POC (for PoC), so: > > > > PEER_CORE -> MEM_SNOOP_POU + L2 > > PEER_LCL_CLSTR -> MEM_SNOOP_POU + L3 > > PEER_CLSTR -> MEM_SNOOP_POC + L3 > > > > Seems to me, we could consider for this. If this is over complexity or > > even I said any wrong concepts for this, please use your method. > > I think this adds a lot of complexity and reduces clarity. Some systems > implement coherent icaches and the PoU would be the L1 cache, others don't so > that would be the L2 (or wherever there is a unified cache). Similarly, with the > point of coherency, some systems would consider that dram, but other systems > have transparent LLCs and it would be the LLC. Okay, it's fine for me to move forward to use MEM_SNOOP_PEER as the solution. Since German is looking into this part, @German, if you have any comment on this part, just let us know. Thanks, Leo
On Thu, Apr 07, 2022 at 04:24:35PM +0100, German Gomez wrote: > Hi, > > On 31/03/2022 13:44, Leo Yan wrote: > > [...] > >>> I'd like to do this in a separate patch, but I have one other proposal. The > >>> Neoverse cores L2 is strictly inclusive of the L1, so even if it's in the L1, > >>> it's also in the L2. Given that the Graviton systems and afaik the Ampere > >>> systems don't have any cache between the L2 and the SLC, thus anything from > >>> PEER_CORE, LCL_CLSTR, or PEER_CLSTR would hit in the L2, perhaps we > >>> should just set L2 for these cases? German, are you good with this for now? > >> Sorry for the delay. I'd like to also check this with someone. I'll try > >> to get back asap. In the meantime, if this approach is also OK with Leo, > >> I think it would be fine by me. > > Sorry for the delay. Yeah setting it to L2 indeed looks reasonable for > now. Somebody brought up the case of running SPE in a heterogeneous > system, but also we think might be beyond the scope of this change. > > One very minor nit though. Would you be ok with renaming LCL to LOCAL > and CLSTR to CLUSTER? I sometimes mistead the former as "LLC". Ali's suggestion is to use the format: highest_cache_level | MEM_SNOOP_PEER. Simply to say, the highest cache level is where we snoop the cache data with the highest cache level. And we use an extra snoop op MEM_SNOOP_PEER as the flag to indicate a peer snooping from the local cluster or peer cluster. Please review the more detailed discussion in another email. > > Thanks for the checking internally. 
Let me just bring up my another > > thinking (sorry that my suggestion is float): another choice is we set > > ANY_CACHE as cache level if we are not certain the cache level, and > > extend snoop field to indicate the snooping logics, like: > > > > PERF_MEM_SNOOP_PEER_CORE > > PERF_MEM_SNOOP_LCL_CLSTR > > PERF_MEM_SNOOP_PEER_CLSTR > > > > Seems to me, we doing this is not only for cache level, it's more > > important for users to know the variant cost for involving different > > snooping logics. > > > I see there's been some more messages I need to catch up with. Is the > intention to extend the PERF_MEM_* flags for this cset, or will it be > left for a later change? The plan is to extend the PERF_MEM_* flags in this patch set. > In any case, I'd be keen to take another look at it and try to bring > some more eyes into this. Sure. Please check at your side and thanks for confirmation. Thanks, Leo
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c index 5e390a1a79ab..091987dd3966 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c @@ -220,6 +220,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder) break; case ARM_SPE_DATA_SOURCE: + decoder->record.source = payload; break; case ARM_SPE_BAD: break; diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h index 69b31084d6be..c81bf90c0996 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h @@ -29,6 +29,17 @@ enum arm_spe_op_type { ARM_SPE_ST = 1 << 1, }; +enum arm_spe_neoverse_data_source { + ARM_SPE_NV_L1D = 0x0, + ARM_SPE_NV_L2 = 0x8, + ARM_SPE_NV_PEER_CORE = 0x9, + ARM_SPE_NV_LCL_CLSTR = 0xa, + ARM_SPE_NV_SYS_CACHE = 0xb, + ARM_SPE_NV_PEER_CLSTR = 0xc, + ARM_SPE_NV_REMOTE = 0xd, + ARM_SPE_NV_DRAM = 0xe, +}; + struct arm_spe_record { enum arm_spe_sample_type type; int err; @@ -40,6 +51,7 @@ struct arm_spe_record { u64 virt_addr; u64 phys_addr; u64 context_id; + u16 source; }; struct arm_spe_insn; diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index d2b64e3f588b..f92ebce88c6a 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -34,6 +34,7 @@ #include "arm-spe-decoder/arm-spe-decoder.h" #include "arm-spe-decoder/arm-spe-pkt-decoder.h" +#include "../../arch/arm64/include/asm/cputype.h" #define MAX_TIMESTAMP (~0ULL) struct arm_spe { @@ -45,6 +46,7 @@ struct arm_spe { struct perf_session *session; struct machine *machine; u32 pmu_type; + u64 midr; struct perf_tsc_conversion tc; @@ -399,33 +401,110 @@ static bool arm_spe__is_memory_event(enum arm_spe_sample_type type) return false; } -static u64 arm_spe__synth_data_source(const struct arm_spe_record *record) +static const struct midr_range neoverse_spe[] = { 
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, +}; + + +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) { - union perf_mem_data_src data_src = { 0 }; + /* + * Even though four levels of cache hierarchy are possible, no known + * production Neoverse systems currently include more than three levels + * so for the time being we assume three exist. If a production system + * is built with four the this function would have to be changed to + * detect the number of levels for reporting. + */ - if (record->op == ARM_SPE_LD) - data_src.mem_op = PERF_MEM_OP_LOAD; - else - data_src.mem_op = PERF_MEM_OP_STORE; + switch (record->source) { + case ARM_SPE_NV_L1D: + data_src->mem_lvl = PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + break; + case ARM_SPE_NV_L2: + data_src->mem_lvl = PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + break; + case ARM_SPE_NV_PEER_CORE: + data_src->mem_lvl = PERF_MEM_LVL_HIT; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; + break; + /* + * We don't know if this is L1, L2 but we do know it was a cache-2-cache + * transfer, so set SNOOP_HITM + */ + case ARM_SPE_NV_LCL_CLSTR: + case ARM_SPE_NV_PEER_CLSTR: + data_src->mem_lvl = PERF_MEM_LVL_HIT; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; + break; + /* + * System cache is assumed to be L3 + */ + case ARM_SPE_NV_SYS_CACHE: + data_src->mem_lvl = PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + break; + /* + * We don't know what level it hit in, except it came from the other + * socket + */ + case ARM_SPE_NV_REMOTE: + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + break; + case ARM_SPE_NV_DRAM: + data_src->mem_lvl = PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = 
PERF_MEM_LVLNUM_RAM; + break; + default: + break; + } +} +static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { - data_src.mem_lvl = PERF_MEM_LVL_L3; + data_src->mem_lvl = PERF_MEM_LVL_L3; if (record->type & ARM_SPE_LLC_MISS) - data_src.mem_lvl |= PERF_MEM_LVL_MISS; + data_src->mem_lvl |= PERF_MEM_LVL_MISS; else - data_src.mem_lvl |= PERF_MEM_LVL_HIT; + data_src->mem_lvl |= PERF_MEM_LVL_HIT; } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { - data_src.mem_lvl = PERF_MEM_LVL_L1; + data_src->mem_lvl = PERF_MEM_LVL_L1; if (record->type & ARM_SPE_L1D_MISS) - data_src.mem_lvl |= PERF_MEM_LVL_MISS; + data_src->mem_lvl |= PERF_MEM_LVL_MISS; else - data_src.mem_lvl |= PERF_MEM_LVL_HIT; + data_src->mem_lvl |= PERF_MEM_LVL_HIT; } if (record->type & ARM_SPE_REMOTE_ACCESS) - data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1; + data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; +} + +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr) +{ + union perf_mem_data_src data_src = { 0 }; + bool is_neoverse = is_midr_in_range(midr, neoverse_spe); + + if (record->op & ARM_SPE_LD) + data_src.mem_op = PERF_MEM_OP_LOAD; + else + data_src.mem_op = PERF_MEM_OP_STORE; + + if (is_neoverse) + arm_spe__synth_data_source_neoverse(record, &data_src); + else + arm_spe__synth_data_source_generic(record, &data_src); if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { data_src.mem_dtlb = PERF_MEM_TLB_WK; @@ -446,7 +525,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq) u64 data_src; int err; - data_src = arm_spe__synth_data_source(record); + data_src = arm_spe__synth_data_source(record, spe->midr); if (spe->sample_flc) { if (record->type & ARM_SPE_L1D_MISS) { @@ -1183,6 +1262,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event, struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; size_t 
min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX; struct perf_record_time_conv *tc = &session->time_conv; + const char *cpuid = perf_env__cpuid(session->evlist->env); + u64 midr = strtol(cpuid, NULL, 16); struct arm_spe *spe; int err; @@ -1202,6 +1283,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event, spe->machine = &session->machines.host; /* No kvm support */ spe->auxtrace_type = auxtrace_info->type; spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; + spe->midr = midr; spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);