Message ID | 1478629035-12938-3-git-send-email-wei@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: > From: Christopher Covington <cov@codeaurora.org> > > Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, > even for the smallest delta of two subsequent reads. > > Signed-off-by: Christopher Covington <cov@codeaurora.org> > Signed-off-by: Wei Huang <wei@redhat.com> > --- > arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 98 insertions(+) > > diff --git a/arm/pmu.c b/arm/pmu.c > index 0b29088..d5e3ac3 100644 > --- a/arm/pmu.c > +++ b/arm/pmu.c > @@ -14,6 +14,7 @@ > */ > #include "libcflat.h" > > +#define PMU_PMCR_E (1 << 0) > #define PMU_PMCR_N_SHIFT 11 > #define PMU_PMCR_N_MASK 0x1f > #define PMU_PMCR_ID_SHIFT 16 > @@ -21,6 +22,10 @@ > #define PMU_PMCR_IMP_SHIFT 24 > #define PMU_PMCR_IMP_MASK 0xff > > +#define PMU_CYCLE_IDX 31 > + > +#define NR_SAMPLES 10 > + > #if defined(__arm__) > static inline uint32_t pmcr_read(void) > { > @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) > asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); > return ret; > } > + > +static inline void pmcr_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); > +} > + > +static inline void pmselr_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); > +} > + > +static inline void pmxevtyper_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); > +} > + > +/* > + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 > + * bits doesn't seem worth the trouble when differential usage of the result is > + * expected (with differences that can easily fit in 32 bits). So just return > + * the lower 32 bits of the cycle count in AArch32. Like I said in the last review, I'd rather we not do this. We should return the full value and then the test case should confirm the upper 32 bits are zero. 
> + */ > +static inline uint32_t pmccntr_read(void) > +{ > + uint32_t cycles; > + > + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); > + return cycles; > +} > + > +static inline void pmcntenset_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value)); > +} > + > +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */ > +static inline void pmccfiltr_write(uint32_t value) > +{ > + pmselr_write(PMU_CYCLE_IDX); > + pmxevtyper_write(value); > +} > #elif defined(__aarch64__) > static inline uint32_t pmcr_read(void) > { > @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void) > asm volatile("mrs %0, pmcr_el0" : "=r" (ret)); > return ret; > } > + > +static inline void pmcr_write(uint32_t value) > +{ > + asm volatile("msr pmcr_el0, %0" : : "r" (value)); > +} > + > +static inline uint32_t pmccntr_read(void) > +{ > + uint32_t cycles; > + > + asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); > + return cycles; > +} > + > +static inline void pmcntenset_write(uint32_t value) > +{ > + asm volatile("msr pmcntenset_el0, %0" : : "r" (value)); > +} > + > +static inline void pmccfiltr_write(uint32_t value) > +{ > + asm volatile("msr pmccfiltr_el0, %0" : : "r" (value)); > +} > #endif > > /* > @@ -63,11 +132,40 @@ static bool check_pmcr(void) > return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0; > } > > +/* > + * Ensure that the cycle counter progresses between back-to-back reads. 
> + */ > +static bool check_cycles_increase(void) > +{ > + pmcr_write(pmcr_read() | PMU_PMCR_E); > + > + for (int i = 0; i < NR_SAMPLES; i++) { > + unsigned long a, b; > + > + a = pmccntr_read(); > + b = pmccntr_read(); > + > + if (a >= b) { > + printf("Read %ld then %ld.\n", a, b); > + return false; > + } > + } > + > + pmcr_write(pmcr_read() & ~PMU_PMCR_E); > + > + return true; > +} > + > int main(void) > { > report_prefix_push("pmu"); > > + /* init for PMU event access, right now only care about cycle count */ > + pmcntenset_write(1 << PMU_CYCLE_IDX); > + pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */ > + > report("Control register", check_pmcr()); > + report("Monotonically increasing cycle count", check_cycles_increase()); > > return report_summary(); > } > -- > 1.8.3.1 Besides needing to use u64's for registers that return u64's, it looks good to me. drew
On 11/11/2016 01:43 AM, Andrew Jones wrote: > On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: >> From: Christopher Covington <cov@codeaurora.org> >> >> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, >> even for the smallest delta of two subsequent reads. >> >> Signed-off-by: Christopher Covington <cov@codeaurora.org> >> Signed-off-by: Wei Huang <wei@redhat.com> >> --- >> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 98 insertions(+) >> >> diff --git a/arm/pmu.c b/arm/pmu.c >> index 0b29088..d5e3ac3 100644 >> --- a/arm/pmu.c >> +++ b/arm/pmu.c >> @@ -14,6 +14,7 @@ >> */ >> #include "libcflat.h" >> >> +#define PMU_PMCR_E (1 << 0) >> #define PMU_PMCR_N_SHIFT 11 >> #define PMU_PMCR_N_MASK 0x1f >> #define PMU_PMCR_ID_SHIFT 16 >> @@ -21,6 +22,10 @@ >> #define PMU_PMCR_IMP_SHIFT 24 >> #define PMU_PMCR_IMP_MASK 0xff >> >> +#define PMU_CYCLE_IDX 31 >> + >> +#define NR_SAMPLES 10 >> + >> #if defined(__arm__) >> static inline uint32_t pmcr_read(void) >> { >> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) >> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); >> return ret; >> } >> + >> +static inline void pmcr_write(uint32_t value) >> +{ >> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); >> +} >> + >> +static inline void pmselr_write(uint32_t value) >> +{ >> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); >> +} >> + >> +static inline void pmxevtyper_write(uint32_t value) >> +{ >> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); >> +} >> + >> +/* >> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 >> + * bits doesn't seem worth the trouble when differential usage of the result is >> + * expected (with differences that can easily fit in 32 bits). So just return >> + * the lower 32 bits of the cycle count in AArch32. > > Like I said in the last review, I'd rather we not do this. 
We should > return the full value and then the test case should confirm the upper > 32 bits are zero. > Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit register. We can force it to a more coarse-grained cycle counter with PMCR.D bit=1 (see below). But it is still not a 64-bit register. ARMv8 PMCCNTR_EL0 is a 64-bit register. "The PMCR.D bit configures whether PMCCNTR increments once every clock cycle, or once every 64 clock cycles. " So I think the comment above in the code is an overstatement, which should be deleted or moved down to ARMv8 pmccntr_read() below. >> + */ >> +static inline uint32_t pmccntr_read(void) >> +{ >> + uint32_t cycles; >> + >> + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); >> + return cycles; >> +} >> + >> +static inline void pmcntenset_write(uint32_t value) >> +{ >> + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value)); >> +} >> + >> +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */ >> +static inline void pmccfiltr_write(uint32_t value) >> +{ >> + pmselr_write(PMU_CYCLE_IDX); >> + pmxevtyper_write(value); >> +} >> #elif defined(__aarch64__) >> static inline uint32_t pmcr_read(void) >> { >> @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void) >> asm volatile("mrs %0, pmcr_el0" : "=r" (ret)); >> return ret; >> } >> + >> +static inline void pmcr_write(uint32_t value) >> +{ >> + asm volatile("msr pmcr_el0, %0" : : "r" (value)); >> +} >> + >> +static inline uint32_t pmccntr_read(void) >> +{ >> + uint32_t cycles; >> + >> + asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); >> + return cycles; >> +} >> + >> +static inline void pmcntenset_write(uint32_t value) >> +{ >> + asm volatile("msr pmcntenset_el0, %0" : : "r" (value)); >> +} >> + >> +static inline void pmccfiltr_write(uint32_t value) >> +{ >> + asm volatile("msr pmccfiltr_el0, %0" : : "r" (value)); >> +} >> #endif >> >> /* >> @@ -63,11 +132,40 @@ static bool check_pmcr(void) >> return ((pmcr >> PMU_PMCR_IMP_SHIFT) & 
PMU_PMCR_IMP_MASK) != 0; >> } >> >> +/* >> + * Ensure that the cycle counter progresses between back-to-back reads. >> + */ >> +static bool check_cycles_increase(void) >> +{ >> + pmcr_write(pmcr_read() | PMU_PMCR_E); >> + >> + for (int i = 0; i < NR_SAMPLES; i++) { >> + unsigned long a, b; >> + >> + a = pmccntr_read(); >> + b = pmccntr_read(); >> + >> + if (a >= b) { >> + printf("Read %ld then %ld.\n", a, b); >> + return false; >> + } >> + } >> + >> + pmcr_write(pmcr_read() & ~PMU_PMCR_E); >> + >> + return true; >> +} >> + >> int main(void) >> { >> report_prefix_push("pmu"); >> >> + /* init for PMU event access, right now only care about cycle count */ >> + pmcntenset_write(1 << PMU_CYCLE_IDX); >> + pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */ >> + >> report("Control register", check_pmcr()); >> + report("Monotonically increasing cycle count", check_cycles_increase()); >> >> return report_summary(); >> } >> -- >> 1.8.3.1 > > Besides needing to use u64's for registers that return u64's, it > looks good to me. > > drew >
On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: > > > On 11/11/2016 01:43 AM, Andrew Jones wrote: > > On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: > >> From: Christopher Covington <cov@codeaurora.org> > >> > >> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, > >> even for the smallest delta of two subsequent reads. > >> > >> Signed-off-by: Christopher Covington <cov@codeaurora.org> > >> Signed-off-by: Wei Huang <wei@redhat.com> > >> --- > >> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >> 1 file changed, 98 insertions(+) > >> > >> diff --git a/arm/pmu.c b/arm/pmu.c > >> index 0b29088..d5e3ac3 100644 > >> --- a/arm/pmu.c > >> +++ b/arm/pmu.c > >> @@ -14,6 +14,7 @@ > >> */ > >> #include "libcflat.h" > >> > >> +#define PMU_PMCR_E (1 << 0) > >> #define PMU_PMCR_N_SHIFT 11 > >> #define PMU_PMCR_N_MASK 0x1f > >> #define PMU_PMCR_ID_SHIFT 16 > >> @@ -21,6 +22,10 @@ > >> #define PMU_PMCR_IMP_SHIFT 24 > >> #define PMU_PMCR_IMP_MASK 0xff > >> > >> +#define PMU_CYCLE_IDX 31 > >> + > >> +#define NR_SAMPLES 10 > >> + > >> #if defined(__arm__) > >> static inline uint32_t pmcr_read(void) > >> { > >> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) > >> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); > >> return ret; > >> } > >> + > >> +static inline void pmcr_write(uint32_t value) > >> +{ > >> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); > >> +} > >> + > >> +static inline void pmselr_write(uint32_t value) > >> +{ > >> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); > >> +} > >> + > >> +static inline void pmxevtyper_write(uint32_t value) > >> +{ > >> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); > >> +} > >> + > >> +/* > >> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 > >> + * bits doesn't seem worth the trouble when differential usage of the result is > >> + * expected (with differences that can easily 
fit in 32 bits). So just return > >> + * the lower 32 bits of the cycle count in AArch32. > > > > Like I said in the last review, I'd rather we not do this. We should > > return the full value and then the test case should confirm the upper > > 32 bits are zero. > > > > Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit > register. We can force it to a more coarse-grained cycle counter with > PMCR.D bit=1 (see below). But it is still not a 64-bit register. ARMv8 > PMCCNTR_EL0 is a 64-bit register. > > "The PMCR.D bit configures whether PMCCNTR increments once every clock > cycle, or once every 64 clock cycles. " > > So I think the comment above in the code is an overstatement, which > should be deleted or moved down to ARMv8 pmccntr_read() below. OK, please fix as appropriate, but for the v8 64-bit register, please don't drop the upper bits until after a unit test has a chance to check them. Thanks, drew > > >> + */ > >> +static inline uint32_t pmccntr_read(void) > >> +{ > >> + uint32_t cycles; > >> + > >> + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); > >> + return cycles; > >> +} > >> + > >> +static inline void pmcntenset_write(uint32_t value) > >> +{ > >> + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value)); > >> +} > >> + > >> +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */ > >> +static inline void pmccfiltr_write(uint32_t value) > >> +{ > >> + pmselr_write(PMU_CYCLE_IDX); > >> + pmxevtyper_write(value); > >> +} > >> #elif defined(__aarch64__) > >> static inline uint32_t pmcr_read(void) > >> { > >> @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void) > >> asm volatile("mrs %0, pmcr_el0" : "=r" (ret)); > >> return ret; > >> } > >> + > >> +static inline void pmcr_write(uint32_t value) > >> +{ > >> + asm volatile("msr pmcr_el0, %0" : : "r" (value)); > >> +} > >> + > >> +static inline uint32_t pmccntr_read(void) > >> +{ > >> + uint32_t cycles; > >> + > >> + asm volatile("mrs %0, pmccntr_el0" : "=r" 
(cycles)); > >> + return cycles; > >> +} > >> + > >> +static inline void pmcntenset_write(uint32_t value) > >> +{ > >> + asm volatile("msr pmcntenset_el0, %0" : : "r" (value)); > >> +} > >> + > >> +static inline void pmccfiltr_write(uint32_t value) > >> +{ > >> + asm volatile("msr pmccfiltr_el0, %0" : : "r" (value)); > >> +} > >> #endif > >> > >> /* > >> @@ -63,11 +132,40 @@ static bool check_pmcr(void) > >> return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0; > >> } > >> > >> +/* > >> + * Ensure that the cycle counter progresses between back-to-back reads. > >> + */ > >> +static bool check_cycles_increase(void) > >> +{ > >> + pmcr_write(pmcr_read() | PMU_PMCR_E); > >> + > >> + for (int i = 0; i < NR_SAMPLES; i++) { > >> + unsigned long a, b; > >> + > >> + a = pmccntr_read(); > >> + b = pmccntr_read(); > >> + > >> + if (a >= b) { > >> + printf("Read %ld then %ld.\n", a, b); > >> + return false; > >> + } > >> + } > >> + > >> + pmcr_write(pmcr_read() & ~PMU_PMCR_E); > >> + > >> + return true; > >> +} > >> + > >> int main(void) > >> { > >> report_prefix_push("pmu"); > >> > >> + /* init for PMU event access, right now only care about cycle count */ > >> + pmcntenset_write(1 << PMU_CYCLE_IDX); > >> + pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */ > >> + > >> report("Control register", check_pmcr()); > >> + report("Monotonically increasing cycle count", check_cycles_increase()); > >> > >> return report_summary(); > >> } > >> -- > >> 1.8.3.1 > > > > Besides needing to use u64's for registers that return u64's, it > > looks good to me. > > > > drew > > >
Hi Drew, Wei, On 11/14/2016 05:05 AM, Andrew Jones wrote: > On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: >> >> >> On 11/11/2016 01:43 AM, Andrew Jones wrote: >>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: >>>> From: Christopher Covington <cov@codeaurora.org> >>>> >>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, >>>> even for the smallest delta of two subsequent reads. >>>> >>>> Signed-off-by: Christopher Covington <cov@codeaurora.org> >>>> Signed-off-by: Wei Huang <wei@redhat.com> >>>> --- >>>> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>> 1 file changed, 98 insertions(+) >>>> >>>> diff --git a/arm/pmu.c b/arm/pmu.c >>>> index 0b29088..d5e3ac3 100644 >>>> --- a/arm/pmu.c >>>> +++ b/arm/pmu.c >>>> @@ -14,6 +14,7 @@ >>>> */ >>>> #include "libcflat.h" >>>> >>>> +#define PMU_PMCR_E (1 << 0) >>>> #define PMU_PMCR_N_SHIFT 11 >>>> #define PMU_PMCR_N_MASK 0x1f >>>> #define PMU_PMCR_ID_SHIFT 16 >>>> @@ -21,6 +22,10 @@ >>>> #define PMU_PMCR_IMP_SHIFT 24 >>>> #define PMU_PMCR_IMP_MASK 0xff >>>> >>>> +#define PMU_CYCLE_IDX 31 >>>> + >>>> +#define NR_SAMPLES 10 >>>> + >>>> #if defined(__arm__) >>>> static inline uint32_t pmcr_read(void) >>>> { >>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) >>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); >>>> return ret; >>>> } >>>> + >>>> +static inline void pmcr_write(uint32_t value) >>>> +{ >>>> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); >>>> +} >>>> + >>>> +static inline void pmselr_write(uint32_t value) >>>> +{ >>>> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); >>>> +} >>>> + >>>> +static inline void pmxevtyper_write(uint32_t value) >>>> +{ >>>> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); >>>> +} >>>> + >>>> +/* >>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 >>>> + * bits doesn't seem worth the trouble when differential usage of 
the result is >>>> + * expected (with differences that can easily fit in 32 bits). So just return >>>> + * the lower 32 bits of the cycle count in AArch32. >>> >>> Like I said in the last review, I'd rather we not do this. We should >>> return the full value and then the test case should confirm the upper >>> 32 bits are zero. >> >> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit >> register. We can force it to a more coarse-grained cycle counter with >> PMCR.D bit=1 (see below). But it is still not a 64-bit register. AArch32 System Register Descriptions Performance Monitors registers PMCCNTR, Performance Monitors Cycle Count Register To access the PMCCNTR when accessing as a 32-bit register: MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged To access the PMCCNTR when accessing as a 64-bit register: MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2 MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32] Regards, Cov
On 11/14/2016 09:12 AM, Christopher Covington wrote: > Hi Drew, Wei, > > On 11/14/2016 05:05 AM, Andrew Jones wrote: >> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: >>> >>> >>> On 11/11/2016 01:43 AM, Andrew Jones wrote: >>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: >>>>> From: Christopher Covington <cov@codeaurora.org> >>>>> >>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, >>>>> even for the smallest delta of two subsequent reads. >>>>> >>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org> >>>>> Signed-off-by: Wei Huang <wei@redhat.com> >>>>> --- >>>>> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>> 1 file changed, 98 insertions(+) >>>>> >>>>> diff --git a/arm/pmu.c b/arm/pmu.c >>>>> index 0b29088..d5e3ac3 100644 >>>>> --- a/arm/pmu.c >>>>> +++ b/arm/pmu.c >>>>> @@ -14,6 +14,7 @@ >>>>> */ >>>>> #include "libcflat.h" >>>>> >>>>> +#define PMU_PMCR_E (1 << 0) >>>>> #define PMU_PMCR_N_SHIFT 11 >>>>> #define PMU_PMCR_N_MASK 0x1f >>>>> #define PMU_PMCR_ID_SHIFT 16 >>>>> @@ -21,6 +22,10 @@ >>>>> #define PMU_PMCR_IMP_SHIFT 24 >>>>> #define PMU_PMCR_IMP_MASK 0xff >>>>> >>>>> +#define PMU_CYCLE_IDX 31 >>>>> + >>>>> +#define NR_SAMPLES 10 >>>>> + >>>>> #if defined(__arm__) >>>>> static inline uint32_t pmcr_read(void) >>>>> { >>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) >>>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); >>>>> return ret; >>>>> } >>>>> + >>>>> +static inline void pmcr_write(uint32_t value) >>>>> +{ >>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); >>>>> +} >>>>> + >>>>> +static inline void pmselr_write(uint32_t value) >>>>> +{ >>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); >>>>> +} >>>>> + >>>>> +static inline void pmxevtyper_write(uint32_t value) >>>>> +{ >>>>> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); >>>>> +} >>>>> + >>>>> +/* >>>>> + * While PMCCNTR can be accessed 
as a 64 bit coprocessor register, returning 64 >>>>> + * bits doesn't seem worth the trouble when differential usage of the result is >>>>> + * expected (with differences that can easily fit in 32 bits). So just return >>>>> + * the lower 32 bits of the cycle count in AArch32. >>>> >>>> Like I said in the last review, I'd rather we not do this. We should >>>> return the full value and then the test case should confirm the upper >>>> 32 bits are zero. >>> >>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit >>> register. We can force it to a more coarse-grained cycle counter with >>> PMCR.D bit=1 (see below). But it is still not a 64-bit register. > > AArch32 System Register Descriptions > Performance Monitors registers > PMCCNTR, Performance Monitors Cycle Count Register > > To access the PMCCNTR when accessing as a 32-bit register: > MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt > MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged > > To access the PMCCNTR when accessing as a 64-bit register: > MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2 > MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32] > Thanks. I did some research based on your info and came back with the following proposals (Cov, correct me if I am wrong): By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I think this 64-bit cycle register is only available when running under aarch32 compatibility mode on ARMv8 because it is not specified in A15 TRM. To further verify it, I tested 32-bit pmu code on QEMU with TCG mode. 
The result is: accessing 64-bit PMCCNTR using the following assembly failed on A15: volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi)); or volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val)); Given this difference, I think there are two solutions for 64-bit AArch32 pmccntr_read, as requested by Drew: 1) The PMU unit testing code tells if it is running under ARMv7 or under AArch32-compatibility mode. When it is running ARMv7, such as A15, let us use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise use "MRRC p15,0,<Rt>,<Rt2>,c9". 2) Returns 64-bit results for ARM pmccntr_read(). But we only use "MRC p15,0,<Rt>,c9,c13,0" and always clear the upper 32-bit as 0. This will be the same as the original code. Thoughts? -Wei [1] A57 TRM, http://infocenter.arm.com/help/topic/com.arm.doc.ddi0488c/DDI0488C_cortex_a57_mpcore_r1p0_trm.pdf [2] A15 TRM, http://infocenter.arm.com/help/topic/com.arm.doc.ddi0438c/DDI0438C_cortex_a15_r2p0_trm.pdf > Regards, > Cov >
On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote: > > > On 11/14/2016 09:12 AM, Christopher Covington wrote: > > Hi Drew, Wei, > > > > On 11/14/2016 05:05 AM, Andrew Jones wrote: > >> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: > >>> > >>> > >>> On 11/11/2016 01:43 AM, Andrew Jones wrote: > >>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: > >>>>> From: Christopher Covington <cov@codeaurora.org> > >>>>> > >>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, > >>>>> even for the smallest delta of two subsequent reads. > >>>>> > >>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org> > >>>>> Signed-off-by: Wei Huang <wei@redhat.com> > >>>>> --- > >>>>> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >>>>> 1 file changed, 98 insertions(+) > >>>>> > >>>>> diff --git a/arm/pmu.c b/arm/pmu.c > >>>>> index 0b29088..d5e3ac3 100644 > >>>>> --- a/arm/pmu.c > >>>>> +++ b/arm/pmu.c > >>>>> @@ -14,6 +14,7 @@ > >>>>> */ > >>>>> #include "libcflat.h" > >>>>> > >>>>> +#define PMU_PMCR_E (1 << 0) > >>>>> #define PMU_PMCR_N_SHIFT 11 > >>>>> #define PMU_PMCR_N_MASK 0x1f > >>>>> #define PMU_PMCR_ID_SHIFT 16 > >>>>> @@ -21,6 +22,10 @@ > >>>>> #define PMU_PMCR_IMP_SHIFT 24 > >>>>> #define PMU_PMCR_IMP_MASK 0xff > >>>>> > >>>>> +#define PMU_CYCLE_IDX 31 > >>>>> + > >>>>> +#define NR_SAMPLES 10 > >>>>> + > >>>>> #if defined(__arm__) > >>>>> static inline uint32_t pmcr_read(void) > >>>>> { > >>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) > >>>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); > >>>>> return ret; > >>>>> } > >>>>> + > >>>>> +static inline void pmcr_write(uint32_t value) > >>>>> +{ > >>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); > >>>>> +} > >>>>> + > >>>>> +static inline void pmselr_write(uint32_t value) > >>>>> +{ > >>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); > >>>>> +} > >>>>> + > >>>>> +static inline 
void pmxevtyper_write(uint32_t value) > >>>>> +{ > >>>>> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); > >>>>> +} > >>>>> + > >>>>> +/* > >>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 > >>>>> + * bits doesn't seem worth the trouble when differential usage of the result is > >>>>> + * expected (with differences that can easily fit in 32 bits). So just return > >>>>> + * the lower 32 bits of the cycle count in AArch32. > >>>> > >>>> Like I said in the last review, I'd rather we not do this. We should > >>>> return the full value and then the test case should confirm the upper > >>>> 32 bits are zero. > >>> > >>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit > >>> register. We can force it to a more coarse-grained cycle counter with > >>> PMCR.D bit=1 (see below). But it is still not a 64-bit register. > > > > AArch32 System Register Descriptions > > Performance Monitors registers > > PMCCNTR, Performance Monitors Cycle Count Register > > > > To access the PMCCNTR when accessing as a 32-bit register: > > MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt > > MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged > > > > To access the PMCCNTR when accessing as a 64-bit register: > > MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2 > > MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32] > > > > Thanks. I did some research based on your info and came back with the > following proposals (Cov, correct me if I am wrong): > > By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I > think this 64-bit cycle register is only available when running under > aarch32 compatibility mode on ARMv8 because it is not specified in A15 > TRM. OK, I hadn't realized that there would be differences between v7 and AArch32. 
It looks like we need to add a function to the kvm-unit-tests framework that enables unit tests to make that distinction, because we'll want to explicitly test those differences in order to flush out emulation bugs. I see now that Appendix K5 of the v8 ARM ARM lists some differences, but this PMCCNTR difference isn't there... As v8-A32 is an update/extension of v7-A, I'd expect there to be a RES0 bit in some v7 ID register that, on v8, is no longer reserved and a 1. Unfortunately I just did some ARM doc skimming but can't find anything like that. As we currently only use the cortex-a15 for our v7 processor, then I guess we can just check MIDR, but yuck. Anyway, I'll send a patch for that. > To further verify it, I tested 32-bit pmu code on QEMU with TCG > mode. The result is: accessing 64-bit PMCCNTR using the following > assembly failed on A15: > > volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi)); > or > volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val)); > > Given this difference, I think there are two solutions for 64-bit > AArch32 pmccntr_read, as requested by Drew: > > 1) The PMU unit testing code tells if it is running under ARMv7 or under > AArch32-compability mode. When it is running ARMv7, such as A15, let us > use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise > use "MRRC p15,0,<Rt>,<Rt2>,c9". > > 2) Returns 64-bit results for ARM pmccntr_read(). But we only uses "MRC > p15,0,<Rt>,c9,c13,0" and always clear the upper 32-bit as 0. This will > be the same as the original code. 3) For the basic test do (2), but add an additional test for AArch32 mode that also does the MRRC. That way on AArch32 we test both access types. Going with (3) means we can finish this series off now and then post another patch later with the additional access, after my is_aarch32() patch, that I'll write now, gets merged. Thanks, drew > > Thoughts? 
> > -Wei > > [1] A57 TRM, > http://infocenter.arm.com/help/topic/com.arm.doc.ddi0488c/DDI0488C_cortex_a57_mpcore_r1p0_trm.pdf > [2] A15 TRM, > http://infocenter.arm.com/help/topic/com.arm.doc.ddi0438c/DDI0438C_cortex_a15_r2p0_trm.pdf > > > Regards, > > Cov > > >
Just crossed my mind that we're missing isb's. On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: > From: Christopher Covington <cov@codeaurora.org> > > Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, > even for the smallest delta of two subsequent reads. > > Signed-off-by: Christopher Covington <cov@codeaurora.org> > Signed-off-by: Wei Huang <wei@redhat.com> > --- > arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 98 insertions(+) > > diff --git a/arm/pmu.c b/arm/pmu.c > index 0b29088..d5e3ac3 100644 > --- a/arm/pmu.c > +++ b/arm/pmu.c > @@ -14,6 +14,7 @@ > */ > #include "libcflat.h" > > +#define PMU_PMCR_E (1 << 0) > #define PMU_PMCR_N_SHIFT 11 > #define PMU_PMCR_N_MASK 0x1f > #define PMU_PMCR_ID_SHIFT 16 > @@ -21,6 +22,10 @@ > #define PMU_PMCR_IMP_SHIFT 24 > #define PMU_PMCR_IMP_MASK 0xff > > +#define PMU_CYCLE_IDX 31 > + > +#define NR_SAMPLES 10 > + > #if defined(__arm__) > static inline uint32_t pmcr_read(void) > { > @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) > asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); > return ret; > } > + > +static inline void pmcr_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); > +} > + > +static inline void pmselr_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); Probably want an isb here, users will call this and then immediately another PMU reg write, like is done below > +} > + > +static inline void pmxevtyper_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); > +} > + > +/* > + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 > + * bits doesn't seem worth the trouble when differential usage of the result is > + * expected (with differences that can easily fit in 32 bits). So just return > + * the lower 32 bits of the cycle count in AArch32. 
Also, while we're discussing confirming upper bits are as expected, I guess we should confirm no overflow too. We should clear the overflow bit PMOVSCLR_EL0.C before we use the counter, and then check it at some point to confirm it's as expected. I guess that could be separate test cases though. > + */ > +static inline uint32_t pmccntr_read(void) > +{ > + uint32_t cycles; > + > + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); > + return cycles; > +} > + > +static inline void pmcntenset_write(uint32_t value) > +{ > + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value)); > +} > + > +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */ > +static inline void pmccfiltr_write(uint32_t value) > +{ > + pmselr_write(PMU_CYCLE_IDX); > + pmxevtyper_write(value); > +} > #elif defined(__aarch64__) > static inline uint32_t pmcr_read(void) > { > @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void) > asm volatile("mrs %0, pmcr_el0" : "=r" (ret)); > return ret; > } > + > +static inline void pmcr_write(uint32_t value) > +{ > + asm volatile("msr pmcr_el0, %0" : : "r" (value)); > +} > + > +static inline uint32_t pmccntr_read(void) > +{ > + uint32_t cycles; > + > + asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); > + return cycles; > +} > + > +static inline void pmcntenset_write(uint32_t value) > +{ > + asm volatile("msr pmcntenset_el0, %0" : : "r" (value)); > +} > + > +static inline void pmccfiltr_write(uint32_t value) > +{ > + asm volatile("msr pmccfiltr_el0, %0" : : "r" (value)); > +} > #endif > > /* > @@ -63,11 +132,40 @@ static bool check_pmcr(void) > return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0; > } > > +/* > + * Ensure that the cycle counter progresses between back-to-back reads. 
> + */ > +static bool check_cycles_increase(void) > +{ > + pmcr_write(pmcr_read() | PMU_PMCR_E); Need isb() here > + > + for (int i = 0; i < NR_SAMPLES; i++) { > + unsigned long a, b; > + > + a = pmccntr_read(); > + b = pmccntr_read(); > + > + if (a >= b) { > + printf("Read %ld then %ld.\n", a, b); > + return false; > + } > + } > + > + pmcr_write(pmcr_read() & ~PMU_PMCR_E); > + Need isb() here > + return true; > +} > + > int main(void) > { > report_prefix_push("pmu"); > > + /* init for PMU event access, right now only care about cycle count */ > + pmcntenset_write(1 << PMU_CYCLE_IDX); > + pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */ Need isb() here > + > report("Control register", check_pmcr()); > + report("Monotonically increasing cycle count", check_cycles_increase()); > > return report_summary(); > } > -- > 1.8.3.1 > > Thanks, drew
On 11/16/2016 08:01 AM, Andrew Jones wrote: > On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote: >> >> >> On 11/14/2016 09:12 AM, Christopher Covington wrote: >>> Hi Drew, Wei, >>> >>> On 11/14/2016 05:05 AM, Andrew Jones wrote: >>>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: >>>>> >>>>> >>>>> On 11/11/2016 01:43 AM, Andrew Jones wrote: >>>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: >>>>>>> From: Christopher Covington <cov@codeaurora.org> >>>>>>> >>>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, >>>>>>> even for the smallest delta of two subsequent reads. >>>>>>> >>>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org> >>>>>>> Signed-off-by: Wei Huang <wei@redhat.com> >>>>>>> --- >>>>>>> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>>> 1 file changed, 98 insertions(+) >>>>>>> >>>>>>> diff --git a/arm/pmu.c b/arm/pmu.c >>>>>>> index 0b29088..d5e3ac3 100644 >>>>>>> --- a/arm/pmu.c >>>>>>> +++ b/arm/pmu.c >>>>>>> @@ -14,6 +14,7 @@ >>>>>>> */ >>>>>>> #include "libcflat.h" >>>>>>> >>>>>>> +#define PMU_PMCR_E (1 << 0) >>>>>>> #define PMU_PMCR_N_SHIFT 11 >>>>>>> #define PMU_PMCR_N_MASK 0x1f >>>>>>> #define PMU_PMCR_ID_SHIFT 16 >>>>>>> @@ -21,6 +22,10 @@ >>>>>>> #define PMU_PMCR_IMP_SHIFT 24 >>>>>>> #define PMU_PMCR_IMP_MASK 0xff >>>>>>> >>>>>>> +#define PMU_CYCLE_IDX 31 >>>>>>> + >>>>>>> +#define NR_SAMPLES 10 >>>>>>> + >>>>>>> #if defined(__arm__) >>>>>>> static inline uint32_t pmcr_read(void) >>>>>>> { >>>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) >>>>>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); >>>>>>> return ret; >>>>>>> } >>>>>>> + >>>>>>> +static inline void pmcr_write(uint32_t value) >>>>>>> +{ >>>>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); >>>>>>> +} >>>>>>> + >>>>>>> +static inline void pmselr_write(uint32_t value) >>>>>>> +{ >>>>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" 
(value)); >>>>>>> +} >>>>>>> + >>>>>>> +static inline void pmxevtyper_write(uint32_t value) >>>>>>> +{ >>>>>>> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); >>>>>>> +} >>>>>>> + >>>>>>> +/* >>>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 >>>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is >>>>>>> + * expected (with differences that can easily fit in 32 bits). So just return >>>>>>> + * the lower 32 bits of the cycle count in AArch32. >>>>>> >>>>>> Like I said in the last review, I'd rather we not do this. We should >>>>>> return the full value and then the test case should confirm the upper >>>>>> 32 bits are zero. >>>>> >>>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit >>>>> register. We can force it to a more coarse-grained cycle counter with >>>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register. >>> >>> AArch32 System Register Descriptions >>> Performance Monitors registers >>> PMCCNTR, Performance Monitors Cycle Count Register >>> >>> To access the PMCCNTR when accessing as a 32-bit register: >>> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt >>> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged >>> >>> To access the PMCCNTR when accessing as a 64-bit register: >>> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2 >>> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32] >>> >> >> Thanks. I did some research based on your info and came back with the >> following proposals (Cov, correct me if I am wrong): >> >> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I >> think this 64-bit cycle register is only available when running under >> aarch32 compatibility mode on ARMv8 because it is not specified in A15 >> TRM. That interpretation sounds really strange to me. 
My recollection is that the cycle counter was available as a 64 bit register in ARMv7 as well. I would expect the Cortex TRMs to omit such details. The ARMv7 Architecture Reference Manual is the complete and authoritative source. >> To further verify it, I tested 32-bit pmu code on QEMU with TCG >> mode. The result is: accessing 64-bit PMCCNTR using the following >> assembly failed on A15: >> >> volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi)); >> or >> volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val)); The PMU implementation on QEMU TCG mode is infantile. (I was trying to write these tests to help guide fixes and enhancements in a test-driven-development manner.) I would not trust QEMU TCG to behave properly here. If you want to execute those instructions, is there anything preventing you from doing it on hardware, or at least the Foundation Model? >> Given this difference, I think there are two solutions for 64-bit >> AArch32 pmccntr_read, as requested by Drew: >> >> 1) The PMU unit testing code tells if it is running under ARMv7 or under >> AArch32-compatibility mode. When it is running ARMv7, such as A15, let us >> use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise >> use "MRRC p15,0,<Rt>,<Rt2>,c9". >> >> 2) Returns 64-bit results for ARM pmccntr_read(). But we only use "MRC >> p15,0,<Rt>,c9,c13,0" and always clear the upper 32-bit as 0. This will >> be the same as the original code. > > 3) For the basic test do (2), but add an additional test for AArch32 > mode that also does the MRRC. That way on AArch32 we test both access > types. The upper bits being non-zero is an insane corner case. I'd really prefer to first build some momentum with checks for issues that are A) likely to occur and B) not too difficult to check, like whether PMCR is writeable (especially relevant to KVM mode where it's not by default). Thanks, Cov
On Wed, Nov 16, 2016 at 11:08:42AM -0500, Christopher Covington wrote: > On 11/16/2016 08:01 AM, Andrew Jones wrote: > > On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote: > >> > >> > >> On 11/14/2016 09:12 AM, Christopher Covington wrote: > >>> Hi Drew, Wei, > >>> > >>> On 11/14/2016 05:05 AM, Andrew Jones wrote: > >>>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: > >>>>> > >>>>> > >>>>> On 11/11/2016 01:43 AM, Andrew Jones wrote: > >>>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: > >>>>>>> From: Christopher Covington <cov@codeaurora.org> > >>>>>>> > >>>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, > >>>>>>> even for the smallest delta of two subsequent reads. > >>>>>>> > >>>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org> > >>>>>>> Signed-off-by: Wei Huang <wei@redhat.com> > >>>>>>> --- > >>>>>>> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >>>>>>> 1 file changed, 98 insertions(+) > >>>>>>> > >>>>>>> diff --git a/arm/pmu.c b/arm/pmu.c > >>>>>>> index 0b29088..d5e3ac3 100644 > >>>>>>> --- a/arm/pmu.c > >>>>>>> +++ b/arm/pmu.c > >>>>>>> @@ -14,6 +14,7 @@ > >>>>>>> */ > >>>>>>> #include "libcflat.h" > >>>>>>> > >>>>>>> +#define PMU_PMCR_E (1 << 0) > >>>>>>> #define PMU_PMCR_N_SHIFT 11 > >>>>>>> #define PMU_PMCR_N_MASK 0x1f > >>>>>>> #define PMU_PMCR_ID_SHIFT 16 > >>>>>>> @@ -21,6 +22,10 @@ > >>>>>>> #define PMU_PMCR_IMP_SHIFT 24 > >>>>>>> #define PMU_PMCR_IMP_MASK 0xff > >>>>>>> > >>>>>>> +#define PMU_CYCLE_IDX 31 > >>>>>>> + > >>>>>>> +#define NR_SAMPLES 10 > >>>>>>> + > >>>>>>> #if defined(__arm__) > >>>>>>> static inline uint32_t pmcr_read(void) > >>>>>>> { > >>>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) > >>>>>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); > >>>>>>> return ret; > >>>>>>> } > >>>>>>> + > >>>>>>> +static inline void pmcr_write(uint32_t value) > >>>>>>> +{ > >>>>>>> + asm volatile("mcr p15, 0, %0, c9, 
c12, 0" : : "r" (value)); > >>>>>>> +} > >>>>>>> + > >>>>>>> +static inline void pmselr_write(uint32_t value) > >>>>>>> +{ > >>>>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); > >>>>>>> +} > >>>>>>> + > >>>>>>> +static inline void pmxevtyper_write(uint32_t value) > >>>>>>> +{ > >>>>>>> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); > >>>>>>> +} > >>>>>>> + > >>>>>>> +/* > >>>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 > >>>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is > >>>>>>> + * expected (with differences that can easily fit in 32 bits). So just return > >>>>>>> + * the lower 32 bits of the cycle count in AArch32. > >>>>>> > >>>>>> Like I said in the last review, I'd rather we not do this. We should > >>>>>> return the full value and then the test case should confirm the upper > >>>>>> 32 bits are zero. > >>>>> > >>>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit > >>>>> register. We can force it to a more coarse-grained cycle counter with > >>>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register. > >>> > >>> AArch32 System Register Descriptions > >>> Performance Monitors registers > >>> PMCCNTR, Performance Monitors Cycle Count Register > >>> > >>> To access the PMCCNTR when accessing as a 32-bit register: > >>> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt > >>> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged > >>> > >>> To access the PMCCNTR when accessing as a 64-bit register: > >>> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2 > >>> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32] > >>> > >> > >> Thanks. 
I did some research based on your info and came back with the > >> following proposals (Cov, correct me if I am wrong): > >> > >> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I > >> think this 64-bit cycle register is only available when running under > >> aarch32 compatibility mode on ARMv8 because it is not specified in A15 > >> TRM. > > That interpretation sounds really strange to me. My recollection is that the > cycle counter was available as a 64 bit register in ARMv7 as well. I would > expect the Cortex TRMs to omit such details. The ARMv7 Architecture Reference > Manual is the complete and authoritative source. Yes, the v7 ARM ARM is the authoritative source, and it says 32-bit. Whereas the v8 ARM ARM with regard to AArch32 mode says it's both 32 and 64. > > >> To further verify it, I tested 32-bit pmu code on QEMU with TCG > >> mode. The result is: accessing 64-bit PMCCNTR using the following > >> assembly failed on A15: > >> > >> volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi)); > >> or > >> volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val)); > > The PMU implementation on QEMU TCG mode is infantile. (I was trying to > write these tests to help guide fixes and enhancements in a > test-driven-development manner.) I would not trust QEMU TCG to behave > properly here. If you want to execute those instructions, is there anything > preventing you from doing it on hardware, or at least the Foundation Model? > > >> Given this difference, I think there are two solutions for 64-bit > >> AArch32 pmccntr_read, as requested by Drew: > >> > >> 1) The PMU unit testing code tells if it is running under ARMv7 or under > >> AArch32-compatibility mode. When it is running ARMv7, such as A15, let us > >> use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise > >> use "MRRC p15,0,<Rt>,<Rt2>,c9". > >> > >> 2) Returns 64-bit results for ARM pmccntr_read(). 
But we only use "MRC > >> p15,0,<Rt>,c9,c13,0" and always clear the upper 32-bit as 0. This will > >> be the same as the original code. > > > > 3) For the basic test do (2), but add an additional test for AArch32 > > mode that also does the MRRC. That way on AArch32 we test both access > > types. > > The upper bits being non-zero is an insane corner case. The bits will most likely be zero, but how do you know without checking? Also, just invoking an mrrc vs. mrc is a big difference when testing emulation. We shouldn't skip it just because we expect it to give us a boring result. > > I'd really prefer to first build some momentum with checks for issues that > are A) likely to occur and B) not too difficult to check, like whether PMCR > is writeable (especially relevant to KVM mode where it's not by default). Looks like we're on the right track for this starter series then. Thanks, drew > > Thanks, > Cov > > -- > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm > Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code > Aurora Forum, a Linux Foundation Collaborative Project. >
On 11/16/2016 11:25 AM, Andrew Jones wrote: > On Wed, Nov 16, 2016 at 11:08:42AM -0500, Christopher Covington wrote: >> On 11/16/2016 08:01 AM, Andrew Jones wrote: >>> On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote: >>>> >>>> >>>> On 11/14/2016 09:12 AM, Christopher Covington wrote: >>>>> Hi Drew, Wei, >>>>> >>>>> On 11/14/2016 05:05 AM, Andrew Jones wrote: >>>>>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote: >>>>>>> >>>>>>> >>>>>>> On 11/11/2016 01:43 AM, Andrew Jones wrote: >>>>>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote: >>>>>>>>> From: Christopher Covington <cov@codeaurora.org> >>>>>>>>> >>>>>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing, >>>>>>>>> even for the smallest delta of two subsequent reads. >>>>>>>>> >>>>>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org> >>>>>>>>> Signed-off-by: Wei Huang <wei@redhat.com> >>>>>>>>> --- >>>>>>>>> arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>>>>> 1 file changed, 98 insertions(+) >>>>>>>>> >>>>>>>>> diff --git a/arm/pmu.c b/arm/pmu.c >>>>>>>>> index 0b29088..d5e3ac3 100644 >>>>>>>>> --- a/arm/pmu.c >>>>>>>>> +++ b/arm/pmu.c >>>>>>>>> @@ -14,6 +14,7 @@ >>>>>>>>> */ >>>>>>>>> #include "libcflat.h" >>>>>>>>> >>>>>>>>> +#define PMU_PMCR_E (1 << 0) >>>>>>>>> #define PMU_PMCR_N_SHIFT 11 >>>>>>>>> #define PMU_PMCR_N_MASK 0x1f >>>>>>>>> #define PMU_PMCR_ID_SHIFT 16 >>>>>>>>> @@ -21,6 +22,10 @@ >>>>>>>>> #define PMU_PMCR_IMP_SHIFT 24 >>>>>>>>> #define PMU_PMCR_IMP_MASK 0xff >>>>>>>>> >>>>>>>>> +#define PMU_CYCLE_IDX 31 >>>>>>>>> + >>>>>>>>> +#define NR_SAMPLES 10 >>>>>>>>> + >>>>>>>>> #if defined(__arm__) >>>>>>>>> static inline uint32_t pmcr_read(void) >>>>>>>>> { >>>>>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) >>>>>>>>> asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); >>>>>>>>> return ret; >>>>>>>>> } >>>>>>>>> + >>>>>>>>> +static inline void pmcr_write(uint32_t value) >>>>>>>>> +{ 
>>>>>>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void pmselr_write(uint32_t value) >>>>>>>>> +{ >>>>>>>>> + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +static inline void pmxevtyper_write(uint32_t value) >>>>>>>>> +{ >>>>>>>>> + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); >>>>>>>>> +} >>>>>>>>> + >>>>>>>>> +/* >>>>>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 >>>>>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is >>>>>>>>> + * expected (with differences that can easily fit in 32 bits). So just return >>>>>>>>> + * the lower 32 bits of the cycle count in AArch32. >>>>>>>> >>>>>>>> Like I said in the last review, I'd rather we not do this. We should >>>>>>>> return the full value and then the test case should confirm the upper >>>>>>>> 32 bits are zero. >>>>>>> >>>>>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit >>>>>>> register. We can force it to a more coarse-grained cycle counter with >>>>>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register. >>>>> >>>>> AArch32 System Register Descriptions >>>>> Performance Monitors registers >>>>> PMCCNTR, Performance Monitors Cycle Count Register >>>>> >>>>> To access the PMCCNTR when accessing as a 32-bit register: >>>>> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt >>>>> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged >>>>> >>>>> To access the PMCCNTR when accessing as a 64-bit register: >>>>> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2 >>>>> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32] >>>>> >>>> >>>> Thanks. 
I did some research based on your info and came back with the >>>> following proposals (Cov, correct me if I am wrong): >>>> >>>> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I >>>> think this 64-bit cycle register is only available when running under >>>> aarch32 compatibility mode on ARMv8 because it is not specified in A15 >>>> TRM. >> >> That interpretation sounds really strange to me. My recollection is that the >> cycle counter was available as a 64 bit register in ARMv7 as well. I would >> expect the Cortex TRMs to omit such details. The ARMv7 Architecture Reference >> Manual is the complete and authoritative source. > > Yes, the v7 ARM ARM is the authoritative source, and it says 32-bit. > Whereas the v8 ARM ARM wrt to AArch32 mode says it's both 32 and 64. Just looked it up as well in the good old ARM DDI 0406C.c and you're absolutely right. Sorry for the bad recollection. Cov
diff --git a/arm/pmu.c b/arm/pmu.c index 0b29088..d5e3ac3 100644 --- a/arm/pmu.c +++ b/arm/pmu.c @@ -14,6 +14,7 @@ */ #include "libcflat.h" +#define PMU_PMCR_E (1 << 0) #define PMU_PMCR_N_SHIFT 11 #define PMU_PMCR_N_MASK 0x1f #define PMU_PMCR_ID_SHIFT 16 @@ -21,6 +22,10 @@ #define PMU_PMCR_IMP_SHIFT 24 #define PMU_PMCR_IMP_MASK 0xff +#define PMU_CYCLE_IDX 31 + +#define NR_SAMPLES 10 + #if defined(__arm__) static inline uint32_t pmcr_read(void) { @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void) asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret)); return ret; } + +static inline void pmcr_write(uint32_t value) +{ + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value)); +} + +static inline void pmselr_write(uint32_t value) +{ + asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value)); +} + +static inline void pmxevtyper_write(uint32_t value) +{ + asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value)); +} + +/* + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64 + * bits doesn't seem worth the trouble when differential usage of the result is + * expected (with differences that can easily fit in 32 bits). So just return + * the lower 32 bits of the cycle count in AArch32. 
+ */ +static inline uint32_t pmccntr_read(void) +{ + uint32_t cycles; + + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); + return cycles; +} + +static inline void pmcntenset_write(uint32_t value) +{ + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value)); +} + +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */ +static inline void pmccfiltr_write(uint32_t value) +{ + pmselr_write(PMU_CYCLE_IDX); + pmxevtyper_write(value); +} #elif defined(__aarch64__) static inline uint32_t pmcr_read(void) { @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void) asm volatile("mrs %0, pmcr_el0" : "=r" (ret)); return ret; } + +static inline void pmcr_write(uint32_t value) +{ + asm volatile("msr pmcr_el0, %0" : : "r" (value)); +} + +static inline uint32_t pmccntr_read(void) +{ + uint32_t cycles; + + asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles)); + return cycles; +} + +static inline void pmcntenset_write(uint32_t value) +{ + asm volatile("msr pmcntenset_el0, %0" : : "r" (value)); +} + +static inline void pmccfiltr_write(uint32_t value) +{ + asm volatile("msr pmccfiltr_el0, %0" : : "r" (value)); +} #endif /* @@ -63,11 +132,40 @@ static bool check_pmcr(void) return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0; } +/* + * Ensure that the cycle counter progresses between back-to-back reads. 
+ */ +static bool check_cycles_increase(void) +{ + pmcr_write(pmcr_read() | PMU_PMCR_E); + + for (int i = 0; i < NR_SAMPLES; i++) { + unsigned long a, b; + + a = pmccntr_read(); + b = pmccntr_read(); + + if (a >= b) { + printf("Read %ld then %ld.\n", a, b); + return false; + } + } + + pmcr_write(pmcr_read() & ~PMU_PMCR_E); + + return true; +} + int main(void) { report_prefix_push("pmu"); + /* init for PMU event access, right now only care about cycle count */ + pmcntenset_write(1 << PMU_CYCLE_IDX); + pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */ + report("Control register", check_pmcr()); + report("Monotonically increasing cycle count", check_cycles_increase()); return report_summary(); }