Message ID | 20161011184044.28373-3-cov@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Oct 11, 2016 at 02:40:44PM -0400, Christopher Covington wrote: > Calculate the numbers of cycles per instruction (CPI) implied by ARM > PMU cycle counter values. The code includes a strict checking facility > intended for the -icount option in TCG mode but it is not yet enabled > in the configuration file. Enabling it must wait on infrastructure > improvements which allow for different tests to be run on TCG versus > KVM. > > Signed-off-by: Christopher Covington <cov@codeaurora.org> This one should already have my r-b as well. At least I don't see any difference from the one I reviewed way-back-when. I'd like to merge these along with additional patches enabling running with KVM. I think Wei was looking into that. Or, even just refreshing these patches with KVM support being in from the beginning would be good. Can you guys decide among yourselves how to proceed, and then send a fresh series that works with both TCG and KVM? Thanks, drew > --- > arm/pmu.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 102 insertions(+), 1 deletion(-) > > diff --git a/arm/pmu.c b/arm/pmu.c > index 4334de4..788886a 100644 > --- a/arm/pmu.c > +++ b/arm/pmu.c > @@ -43,6 +43,23 @@ static inline unsigned long get_pmccntr(void) > asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); > return cycles; > } > + > +/* > + * Extra instructions inserted by the compiler would be difficult to compensate > + * for, so hand assemble everything between, and including, the PMCR accesses > + * to start and stop counting. 
> + */ > +static inline void loop(int i, uint32_t pmcr) > +{ > + asm volatile( > + " mcr p15, 0, %[pmcr], c9, c12, 0\n" > + "1: subs %[i], %[i], #1\n" > + " bgt 1b\n" > + " mcr p15, 0, %[z], c9, c12, 0\n" > + : [i] "+r" (i) > + : [pmcr] "r" (pmcr), [z] "r" (0) > + : "cc"); > +} > #elif defined(__aarch64__) > static inline uint32_t get_pmcr(void) > { > @@ -64,6 +81,23 @@ static inline unsigned long get_pmccntr(void) > asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); > return cycles; > } > + > +/* > + * Extra instructions inserted by the compiler would be difficult to compensate > + * for, so hand assemble everything between, and including, the PMCR accesses > + * to start and stop counting. > + */ > +static inline void loop(int i, uint32_t pmcr) > +{ > + asm volatile( > + " msr pmcr_el0, %[pmcr]\n" > + "1: subs %[i], %[i], #1\n" > + " b.gt 1b\n" > + " msr pmcr_el0, xzr\n" > + : [i] "+r" (i) > + : [pmcr] "r" (pmcr) > + : "cc"); > +} > #endif > > struct pmu_data { > @@ -131,12 +165,79 @@ static bool check_cycles_increase(void) > return true; > } > > -int main(void) > +/* > + * Execute a known number of guest instructions. Only odd instruction counts > + * greater than or equal to 3 are supported by the in-line assembly code. The > + * control register (PMCR_EL0) is initialized with the provided value (allowing > + * for example for the cycle counter or event counters to be reset). At the end > + * of the exact instruction loop, zero is written to PMCR_EL0 to disable > + * counting, allowing the cycle counter or event counters to be read at the > + * leisure of the calling code. > + */ > +static void measure_instrs(int num, uint32_t pmcr) > +{ > + int i = (num - 1) / 2; > + > + assert(num >= 3 && ((num - 1) % 2 == 0)); > + loop(i, pmcr); > +} > + > +/* > + * Measure cycle counts for various known instruction counts. Ensure that the > + * cycle counter progresses (similar to check_cycles_increase() but with more > + * instructions and using reset and stop controls). 
If supplied a positive, > + * nonzero CPI parameter, also strictly check that every measurement matches > + * it. Strict CPI checking is used to test -icount mode. > + */ > +static bool check_cpi(int cpi) > +{ > + struct pmu_data pmu = {0}; > + > + pmu.cycle_counter_reset = 1; > + pmu.enable = 1; > + > + if (cpi > 0) > + printf("Checking for CPI=%d.\n", cpi); > + printf("instrs : cycles0 cycles1 ...\n"); > + > + for (int i = 3; i < 300; i += 32) { > + int avg, sum = 0; > + > + printf("%d :", i); > + for (int j = 0; j < NR_SAMPLES; j++) { > + int cycles; > + > + measure_instrs(i, pmu.pmcr_el0); > + cycles = get_pmccntr(); > + printf(" %d", cycles); > + > + if (!cycles || (cpi > 0 && cycles != i * cpi)) { > + printf("\n"); > + return false; > + } > + > + sum += cycles; > + } > + avg = sum / NR_SAMPLES; > + printf(" sum=%d avg=%d avg_ipc=%d avg_cpi=%d\n", > + sum, avg, i / avg, avg / i); > + } > + > + return true; > +} > + > +int main(int argc, char *argv[]) > { > + int cpi = 0; > + > + if (argc >= 1) > + cpi = atol(argv[0]); > + > report_prefix_push("pmu"); > > report("Control register", check_pmcr()); > report("Monotonically increasing cycle count", check_cycles_increase()); > + report("Cycle/instruction ratio", check_cpi(cpi)); > > return report_summary(); > } > -- > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm > Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora > Forum, a Linux Foundation Collaborative Project. > >
On 10/12/2016 07:05 AM, Andrew Jones wrote: > On Tue, Oct 11, 2016 at 02:40:44PM -0400, Christopher Covington wrote: >> Calculate the numbers of cycles per instruction (CPI) implied by ARM >> PMU cycle counter values. The code includes a strict checking facility >> intended for the -icount option in TCG mode but it is not yet enabled >> in the configuration file. Enabling it must wait on infrastructure >> improvements which allow for different tests to be run on TCG versus >> KVM. >> >> Signed-off-by: Christopher Covington <cov@codeaurora.org> > > This one should already have my r-b as well. At least I don't see any > difference from the one I reviewed way-back-when. > > I'd like to merge these along with additional patches enabling running > with KVM. I think Wei was looking into that. Or, even just refreshing > these patches with KVM support being in from the beginning would be > good. Can you guys decide among yourselves how to proceed, and then > send a fresh series that works with both TCG and KVM? Here are the issues I have found so far. After they are fixed, KVM mode can be supported. Basically, as stated by Cov, it failed under KVM mode with 2 out of 3 tests (cycle-increase, cpi). The main problem is that reading PMCCNTR always returns the same value, so the delta between two cycle_count reads is always 0, causing failures in the unit tests. Debugging showed that both the unit-test code (pmu.c) and the kvm-vpmu code have issues. Details below: 1. kvm-unit-tests/arm/pmu.c * Bit 31 of PMCNTENSET_EL0 needs to be set to 1. According to the ARM documentation, the PMCNTENSET_EL0 register "enables the Cycle Count Register, PMCCNTR_EL0, and any implemented event counters PMEVCNTR<n>_EL0..." Without it, the KVM vPMU can't enable the cycle counter. * PMCCFILTR_EL0 needs to be set up for KVM to create a perf_event for the cycle counter. A fix like the one below should be enough. + /* init PMCCFILTER */ + val = 0; + asm volatile("msr pmccfiltr_el0, %0" :: "r" (val)); Note: this requirement shouldn't be needed. 
I think it is a KVM bug (see analysis below) 2. virt/kvm/arm/pmu.c (KVM) kvm_pmu_set_counter_event_type() has some bugs and a wrong assumption. * First of all, KVM vPMU kvm_pmu_set_counter_event_type() is called to create a perf_event when PMXEVTYPER_EL0 is set. This is OK for PMEVCNTRn_EL0. But unlike PMEVCNTRn_EL0, setting an event type shouldn't be needed for PMCCNTR; instead PMCCNTR should always be available to read. In other words, KVM should create a perf_event and make it available for reading PMCCNTR. * Secondly, ARMV8_PMU_EVTYPE_EVENT_SW_INCR is 0, the same as the reserved bits of PMCCFILTR_EL0. So KVM shouldn't return just because eventsel is ARMV8_PMU_EVTYPE_EVENT_SW_INCR. Instead it needs to further check whether select_idx is ARMV8_PMU_CYCLE_IDX. See below: - if (eventsel == ARMV8_PMU_EVTYPE_EVENT_SW_INCR) + if (eventsel == ARMV8_PMU_EVTYPE_EVENT_SW_INCR && + select_idx != ARMV8_PMU_CYCLE_IDX) return; * The perf_event.attr configuration shouldn't copy the eventsel bits of PMCCFILTR_EL0 directly. Instead, KVM should convert them to type 0x11 (i.e. the "cpu cycle" event type) when it detects that the guest is reading the CPU cycle counter. I will send out patches to address the problems above. Thanks, -Wei > > Thanks, > drew > >> --- >> arm/pmu.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 102 insertions(+), 1 deletion(-) >> >> diff --git a/arm/pmu.c b/arm/pmu.c >> index 4334de4..788886a 100644 >> --- a/arm/pmu.c >> +++ b/arm/pmu.c >> @@ -43,6 +43,23 @@ static inline unsigned long get_pmccntr(void) >> asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); >> return cycles; >> } >> + >> +/* >> + * Extra instructions inserted by the compiler would be difficult to compensate >> + * for, so hand assemble everything between, and including, the PMCR accesses >> + * to start and stop counting. 
>> + */ >> +static inline void loop(int i, uint32_t pmcr) >> +{ >> + asm volatile( >> + " mcr p15, 0, %[pmcr], c9, c12, 0\n" >> + "1: subs %[i], %[i], #1\n" >> + " bgt 1b\n" >> + " mcr p15, 0, %[z], c9, c12, 0\n" >> + : [i] "+r" (i) >> + : [pmcr] "r" (pmcr), [z] "r" (0) >> + : "cc"); >> +} >> #elif defined(__aarch64__) >> static inline uint32_t get_pmcr(void) >> { >> @@ -64,6 +81,23 @@ static inline unsigned long get_pmccntr(void) >> asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); >> return cycles; >> } >> + >> +/* >> + * Extra instructions inserted by the compiler would be difficult to compensate >> + * for, so hand assemble everything between, and including, the PMCR accesses >> + * to start and stop counting. >> + */ >> +static inline void loop(int i, uint32_t pmcr) >> +{ >> + asm volatile( >> + " msr pmcr_el0, %[pmcr]\n" >> + "1: subs %[i], %[i], #1\n" >> + " b.gt 1b\n" >> + " msr pmcr_el0, xzr\n" >> + : [i] "+r" (i) >> + : [pmcr] "r" (pmcr) >> + : "cc"); >> +} >> #endif >> >> struct pmu_data { >> @@ -131,12 +165,79 @@ static bool check_cycles_increase(void) >> return true; >> } >> >> -int main(void) >> +/* >> + * Execute a known number of guest instructions. Only odd instruction counts >> + * greater than or equal to 3 are supported by the in-line assembly code. The >> + * control register (PMCR_EL0) is initialized with the provided value (allowing >> + * for example for the cycle counter or event counters to be reset). At the end >> + * of the exact instruction loop, zero is written to PMCR_EL0 to disable >> + * counting, allowing the cycle counter or event counters to be read at the >> + * leisure of the calling code. >> + */ >> +static void measure_instrs(int num, uint32_t pmcr) >> +{ >> + int i = (num - 1) / 2; >> + >> + assert(num >= 3 && ((num - 1) % 2 == 0)); >> + loop(i, pmcr); >> +} >> + >> +/* >> + * Measure cycle counts for various known instruction counts. 
Ensure that the >> + * cycle counter progresses (similar to check_cycles_increase() but with more >> + * instructions and using reset and stop controls). If supplied a positive, >> + * nonzero CPI parameter, also strictly check that every measurement matches >> + * it. Strict CPI checking is used to test -icount mode. >> + */ >> +static bool check_cpi(int cpi) >> +{ >> + struct pmu_data pmu = {0}; >> + >> + pmu.cycle_counter_reset = 1; >> + pmu.enable = 1; >> + >> + if (cpi > 0) >> + printf("Checking for CPI=%d.\n", cpi); >> + printf("instrs : cycles0 cycles1 ...\n"); >> + >> + for (int i = 3; i < 300; i += 32) { >> + int avg, sum = 0; >> + >> + printf("%d :", i); >> + for (int j = 0; j < NR_SAMPLES; j++) { >> + int cycles; >> + >> + measure_instrs(i, pmu.pmcr_el0); >> + cycles = get_pmccntr(); >> + printf(" %d", cycles); >> + >> + if (!cycles || (cpi > 0 && cycles != i * cpi)) { >> + printf("\n"); >> + return false; >> + } >> + >> + sum += cycles; >> + } >> + avg = sum / NR_SAMPLES; >> + printf(" sum=%d avg=%d avg_ipc=%d avg_cpi=%d\n", >> + sum, avg, i / avg, avg / i); >> + } >> + >> + return true; >> +} >> + >> +int main(int argc, char *argv[]) >> { >> + int cpi = 0; >> + >> + if (argc >= 1) >> + cpi = atol(argv[0]); >> + >> report_prefix_push("pmu"); >> >> report("Control register", check_pmcr()); >> report("Monotonically increasing cycle count", check_cycles_increase()); >> + report("Cycle/instruction ratio", check_cpi(cpi)); >> >> return report_summary(); >> } >> -- >> Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm >> Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora >> Forum, a Linux Foundation Collaborative Project. >> >> >
diff --git a/arm/pmu.c b/arm/pmu.c index 4334de4..788886a 100644 --- a/arm/pmu.c +++ b/arm/pmu.c @@ -43,6 +43,23 @@ static inline unsigned long get_pmccntr(void) asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); return cycles; } + +/* + * Extra instructions inserted by the compiler would be difficult to compensate + * for, so hand assemble everything between, and including, the PMCR accesses + * to start and stop counting. + */ +static inline void loop(int i, uint32_t pmcr) +{ + asm volatile( + " mcr p15, 0, %[pmcr], c9, c12, 0\n" + "1: subs %[i], %[i], #1\n" + " bgt 1b\n" + " mcr p15, 0, %[z], c9, c12, 0\n" + : [i] "+r" (i) + : [pmcr] "r" (pmcr), [z] "r" (0) + : "cc"); +} #elif defined(__aarch64__) static inline uint32_t get_pmcr(void) { @@ -64,6 +81,23 @@ static inline unsigned long get_pmccntr(void) asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); return cycles; } + +/* + * Extra instructions inserted by the compiler would be difficult to compensate + * for, so hand assemble everything between, and including, the PMCR accesses + * to start and stop counting. + */ +static inline void loop(int i, uint32_t pmcr) +{ + asm volatile( + " msr pmcr_el0, %[pmcr]\n" + "1: subs %[i], %[i], #1\n" + " b.gt 1b\n" + " msr pmcr_el0, xzr\n" + : [i] "+r" (i) + : [pmcr] "r" (pmcr) + : "cc"); +} #endif struct pmu_data { @@ -131,12 +165,79 @@ static bool check_cycles_increase(void) return true; } -int main(void) +/* + * Execute a known number of guest instructions. Only odd instruction counts + * greater than or equal to 3 are supported by the in-line assembly code. The + * control register (PMCR_EL0) is initialized with the provided value (allowing + * for example for the cycle counter or event counters to be reset). At the end + * of the exact instruction loop, zero is written to PMCR_EL0 to disable + * counting, allowing the cycle counter or event counters to be read at the + * leisure of the calling code. 
+ */ +static void measure_instrs(int num, uint32_t pmcr) +{ + int i = (num - 1) / 2; + + assert(num >= 3 && ((num - 1) % 2 == 0)); + loop(i, pmcr); +} + +/* + * Measure cycle counts for various known instruction counts. Ensure that the + * cycle counter progresses (similar to check_cycles_increase() but with more + * instructions and using reset and stop controls). If supplied a positive, + * nonzero CPI parameter, also strictly check that every measurement matches + * it. Strict CPI checking is used to test -icount mode. + */ +static bool check_cpi(int cpi) +{ + struct pmu_data pmu = {0}; + + pmu.cycle_counter_reset = 1; + pmu.enable = 1; + + if (cpi > 0) + printf("Checking for CPI=%d.\n", cpi); + printf("instrs : cycles0 cycles1 ...\n"); + + for (int i = 3; i < 300; i += 32) { + int avg, sum = 0; + + printf("%d :", i); + for (int j = 0; j < NR_SAMPLES; j++) { + int cycles; + + measure_instrs(i, pmu.pmcr_el0); + cycles = get_pmccntr(); + printf(" %d", cycles); + + if (!cycles || (cpi > 0 && cycles != i * cpi)) { + printf("\n"); + return false; + } + + sum += cycles; + } + avg = sum / NR_SAMPLES; + printf(" sum=%d avg=%d avg_ipc=%d avg_cpi=%d\n", + sum, avg, i / avg, avg / i); + } + + return true; +} + +int main(int argc, char *argv[]) { + int cpi = 0; + + if (argc >= 1) + cpi = atol(argv[0]); + report_prefix_push("pmu"); report("Control register", check_pmcr()); report("Monotonically increasing cycle count", check_cycles_increase()); + report("Cycle/instruction ratio", check_cpi(cpi)); return report_summary(); }
Calculate the numbers of cycles per instruction (CPI) implied by ARM PMU cycle counter values. The code includes a strict checking facility intended for the -icount option in TCG mode but it is not yet enabled in the configuration file. Enabling it must wait on infrastructure improvements which allow for different tests to be run on TCG versus KVM. Signed-off-by: Christopher Covington <cov@codeaurora.org> --- arm/pmu.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-)