diff mbox

[kvm-unit-tests,v8,2/3] arm: pmu: Check cycle count increases

Message ID 1478629035-12938-3-git-send-email-wei@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wei Huang Nov. 8, 2016, 6:17 p.m. UTC
From: Christopher Covington <cov@codeaurora.org>

Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
even for the smallest delta of two subsequent reads.

Signed-off-by: Christopher Covington <cov@codeaurora.org>
Signed-off-by: Wei Huang <wei@redhat.com>
---
 arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

Comments

Andrew Jones Nov. 11, 2016, 7:43 a.m. UTC | #1
On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
> From: Christopher Covington <cov@codeaurora.org>
> 
> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
> even for the smallest delta of two subsequent reads.
> 
> Signed-off-by: Christopher Covington <cov@codeaurora.org>
> Signed-off-by: Wei Huang <wei@redhat.com>
> ---
>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 98 insertions(+)
> 
> diff --git a/arm/pmu.c b/arm/pmu.c
> index 0b29088..d5e3ac3 100644
> --- a/arm/pmu.c
> +++ b/arm/pmu.c
> @@ -14,6 +14,7 @@
>   */
>  #include "libcflat.h"
>  
> +#define PMU_PMCR_E         (1 << 0)
>  #define PMU_PMCR_N_SHIFT   11
>  #define PMU_PMCR_N_MASK    0x1f
>  #define PMU_PMCR_ID_SHIFT  16
> @@ -21,6 +22,10 @@
>  #define PMU_PMCR_IMP_SHIFT 24
>  #define PMU_PMCR_IMP_MASK  0xff
>  
> +#define PMU_CYCLE_IDX      31
> +
> +#define NR_SAMPLES 10
> +
>  #if defined(__arm__)
>  static inline uint32_t pmcr_read(void)
>  {
> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>  	return ret;
>  }
> +
> +static inline void pmcr_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
> +}
> +
> +static inline void pmselr_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
> +}
> +
> +static inline void pmxevtyper_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
> +}
> +
> +/*
> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
> + * bits doesn't seem worth the trouble when differential usage of the result is
> + * expected (with differences that can easily fit in 32 bits). So just return
> + * the lower 32 bits of the cycle count in AArch32.

Like I said in the last review, I'd rather we not do this. We should
return the full value and then the test case should confirm the upper
32 bits are zero.

> + */
> +static inline uint32_t pmccntr_read(void)
> +{
> +	uint32_t cycles;
> +
> +	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
> +	return cycles;
> +}
> +
> +static inline void pmcntenset_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value));
> +}
> +
> +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */
> +static inline void pmccfiltr_write(uint32_t value)
> +{
> +	pmselr_write(PMU_CYCLE_IDX);
> +	pmxevtyper_write(value);
> +}
>  #elif defined(__aarch64__)
>  static inline uint32_t pmcr_read(void)
>  {
> @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void)
>  	asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
>  	return ret;
>  }
> +
> +static inline void pmcr_write(uint32_t value)
> +{
> +	asm volatile("msr pmcr_el0, %0" : : "r" (value));
> +}
> +
> +static inline uint32_t pmccntr_read(void)
> +{
> +	uint32_t cycles;
> +
> +	asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
> +	return cycles;
> +}
> +
> +static inline void pmcntenset_write(uint32_t value)
> +{
> +	asm volatile("msr pmcntenset_el0, %0" : : "r" (value));
> +}
> +
> +static inline void pmccfiltr_write(uint32_t value)
> +{
> +	asm volatile("msr pmccfiltr_el0, %0" : : "r" (value));
> +}
>  #endif
>  
>  /*
> @@ -63,11 +132,40 @@ static bool check_pmcr(void)
>  	return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0;
>  }
>  
> +/*
> + * Ensure that the cycle counter progresses between back-to-back reads.
> + */
> +static bool check_cycles_increase(void)
> +{
> +	pmcr_write(pmcr_read() | PMU_PMCR_E);
> +
> +	for (int i = 0; i < NR_SAMPLES; i++) {
> +		unsigned long a, b;
> +
> +		a = pmccntr_read();
> +		b = pmccntr_read();
> +
> +		if (a >= b) {
> +			printf("Read %ld then %ld.\n", a, b);
> +			return false;
> +		}
> +	}
> +
> +	pmcr_write(pmcr_read() & ~PMU_PMCR_E);
> +
> +	return true;
> +}
> +
>  int main(void)
>  {
>  	report_prefix_push("pmu");
>  
> +	/* init for PMU event access, right now only care about cycle count */
> +	pmcntenset_write(1 << PMU_CYCLE_IDX);
> +	pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */
> +
>  	report("Control register", check_pmcr());
> +	report("Monotonically increasing cycle count", check_cycles_increase());
>  
>  	return report_summary();
>  }
> -- 
> 1.8.3.1

Besides needing to use u64's for registers that return u64's, it
looks good to me.

drew
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wei Huang Nov. 11, 2016, 7:55 p.m. UTC | #2
On 11/11/2016 01:43 AM, Andrew Jones wrote:
> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
>> From: Christopher Covington <cov@codeaurora.org>
>>
>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>> even for the smallest delta of two subsequent reads.
>>
>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
>> Signed-off-by: Wei Huang <wei@redhat.com>
>> ---
>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 98 insertions(+)
>>
>> diff --git a/arm/pmu.c b/arm/pmu.c
>> index 0b29088..d5e3ac3 100644
>> --- a/arm/pmu.c
>> +++ b/arm/pmu.c
>> @@ -14,6 +14,7 @@
>>   */
>>  #include "libcflat.h"
>>  
>> +#define PMU_PMCR_E         (1 << 0)
>>  #define PMU_PMCR_N_SHIFT   11
>>  #define PMU_PMCR_N_MASK    0x1f
>>  #define PMU_PMCR_ID_SHIFT  16
>> @@ -21,6 +22,10 @@
>>  #define PMU_PMCR_IMP_SHIFT 24
>>  #define PMU_PMCR_IMP_MASK  0xff
>>  
>> +#define PMU_CYCLE_IDX      31
>> +
>> +#define NR_SAMPLES 10
>> +
>>  #if defined(__arm__)
>>  static inline uint32_t pmcr_read(void)
>>  {
>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>  	return ret;
>>  }
>> +
>> +static inline void pmcr_write(uint32_t value)
>> +{
>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
>> +}
>> +
>> +static inline void pmselr_write(uint32_t value)
>> +{
>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
>> +}
>> +
>> +static inline void pmxevtyper_write(uint32_t value)
>> +{
>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
>> +}
>> +
>> +/*
>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
>> + * bits doesn't seem worth the trouble when differential usage of the result is
>> + * expected (with differences that can easily fit in 32 bits). So just return
>> + * the lower 32 bits of the cycle count in AArch32.
> 
> Like I said in the last review, I'd rather we not do this. We should
> return the full value and then the test case should confirm the upper
> 32 bits are zero.
> 

Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
register. We can force it to a more coarse-grained cycle counter with
PMCR.D bit=1 (see below). But it is still not a 64-bit register. ARMv8
PMCCNTR_EL0 is a 64-bit register.

"The PMCR.D bit configures whether PMCCNTR increments once every clock
cycle, or once every 64 clock cycles. "

So I think the comment above in the code is an overstatement, which
should be deleted or moved down to ARMv8 pmccntr_read() below.

>> + */
>> +static inline uint32_t pmccntr_read(void)
>> +{
>> +	uint32_t cycles;
>> +
>> +	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
>> +	return cycles;
>> +}
>> +
>> +static inline void pmcntenset_write(uint32_t value)
>> +{
>> +	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value));
>> +}
>> +
>> +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */
>> +static inline void pmccfiltr_write(uint32_t value)
>> +{
>> +	pmselr_write(PMU_CYCLE_IDX);
>> +	pmxevtyper_write(value);
>> +}
>>  #elif defined(__aarch64__)
>>  static inline uint32_t pmcr_read(void)
>>  {
>> @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void)
>>  	asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
>>  	return ret;
>>  }
>> +
>> +static inline void pmcr_write(uint32_t value)
>> +{
>> +	asm volatile("msr pmcr_el0, %0" : : "r" (value));
>> +}
>> +
>> +static inline uint32_t pmccntr_read(void)
>> +{
>> +	uint32_t cycles;
>> +
>> +	asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
>> +	return cycles;
>> +}
>> +
>> +static inline void pmcntenset_write(uint32_t value)
>> +{
>> +	asm volatile("msr pmcntenset_el0, %0" : : "r" (value));
>> +}
>> +
>> +static inline void pmccfiltr_write(uint32_t value)
>> +{
>> +	asm volatile("msr pmccfiltr_el0, %0" : : "r" (value));
>> +}
>>  #endif
>>  
>>  /*
>> @@ -63,11 +132,40 @@ static bool check_pmcr(void)
>>  	return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0;
>>  }
>>  
>> +/*
>> + * Ensure that the cycle counter progresses between back-to-back reads.
>> + */
>> +static bool check_cycles_increase(void)
>> +{
>> +	pmcr_write(pmcr_read() | PMU_PMCR_E);
>> +
>> +	for (int i = 0; i < NR_SAMPLES; i++) {
>> +		unsigned long a, b;
>> +
>> +		a = pmccntr_read();
>> +		b = pmccntr_read();
>> +
>> +		if (a >= b) {
>> +			printf("Read %ld then %ld.\n", a, b);
>> +			return false;
>> +		}
>> +	}
>> +
>> +	pmcr_write(pmcr_read() & ~PMU_PMCR_E);
>> +
>> +	return true;
>> +}
>> +
>>  int main(void)
>>  {
>>  	report_prefix_push("pmu");
>>  
>> +	/* init for PMU event access, right now only care about cycle count */
>> +	pmcntenset_write(1 << PMU_CYCLE_IDX);
>> +	pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */
>> +
>>  	report("Control register", check_pmcr());
>> +	report("Monotonically increasing cycle count", check_cycles_increase());
>>  
>>  	return report_summary();
>>  }
>> -- 
>> 1.8.3.1
> 
> Besides needing to use u64's for registers that return u64's, it
> looks good to me.
> 
> drew
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jones Nov. 14, 2016, 10:05 a.m. UTC | #3
On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
> 
> 
> On 11/11/2016 01:43 AM, Andrew Jones wrote:
> > On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
> >> From: Christopher Covington <cov@codeaurora.org>
> >>
> >> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
> >> even for the smallest delta of two subsequent reads.
> >>
> >> Signed-off-by: Christopher Covington <cov@codeaurora.org>
> >> Signed-off-by: Wei Huang <wei@redhat.com>
> >> ---
> >>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 98 insertions(+)
> >>
> >> diff --git a/arm/pmu.c b/arm/pmu.c
> >> index 0b29088..d5e3ac3 100644
> >> --- a/arm/pmu.c
> >> +++ b/arm/pmu.c
> >> @@ -14,6 +14,7 @@
> >>   */
> >>  #include "libcflat.h"
> >>  
> >> +#define PMU_PMCR_E         (1 << 0)
> >>  #define PMU_PMCR_N_SHIFT   11
> >>  #define PMU_PMCR_N_MASK    0x1f
> >>  #define PMU_PMCR_ID_SHIFT  16
> >> @@ -21,6 +22,10 @@
> >>  #define PMU_PMCR_IMP_SHIFT 24
> >>  #define PMU_PMCR_IMP_MASK  0xff
> >>  
> >> +#define PMU_CYCLE_IDX      31
> >> +
> >> +#define NR_SAMPLES 10
> >> +
> >>  #if defined(__arm__)
> >>  static inline uint32_t pmcr_read(void)
> >>  {
> >> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
> >>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
> >>  	return ret;
> >>  }
> >> +
> >> +static inline void pmcr_write(uint32_t value)
> >> +{
> >> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
> >> +}
> >> +
> >> +static inline void pmselr_write(uint32_t value)
> >> +{
> >> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
> >> +}
> >> +
> >> +static inline void pmxevtyper_write(uint32_t value)
> >> +{
> >> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
> >> +}
> >> +
> >> +/*
> >> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
> >> + * bits doesn't seem worth the trouble when differential usage of the result is
> >> + * expected (with differences that can easily fit in 32 bits). So just return
> >> + * the lower 32 bits of the cycle count in AArch32.
> > 
> > Like I said in the last review, I'd rather we not do this. We should
> > return the full value and then the test case should confirm the upper
> > 32 bits are zero.
> > 
> 
> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
> register. We can force it to a more coarse-grained cycle counter with
> PMCR.D bit=1 (see below). But it is still not a 64-bit register. ARMv8
> PMCCNTR_EL0 is a 64-bit register.
> 
> "The PMCR.D bit configures whether PMCCNTR increments once every clock
> cycle, or once every 64 clock cycles. "
> 
> So I think the comment above in the code is an overstatement, which
> should be deleted or moved down to ARMv8 pmccntr_read() below.

OK, please fix as appropriate, but for the v8 64-bit register, please
don't drop the upper bits until after a unit test has a chance to check
them.

Thanks,
drew

> 
> >> + */
> >> +static inline uint32_t pmccntr_read(void)
> >> +{
> >> +	uint32_t cycles;
> >> +
> >> +	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
> >> +	return cycles;
> >> +}
> >> +
> >> +static inline void pmcntenset_write(uint32_t value)
> >> +{
> >> +	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value));
> >> +}
> >> +
> >> +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */
> >> +static inline void pmccfiltr_write(uint32_t value)
> >> +{
> >> +	pmselr_write(PMU_CYCLE_IDX);
> >> +	pmxevtyper_write(value);
> >> +}
> >>  #elif defined(__aarch64__)
> >>  static inline uint32_t pmcr_read(void)
> >>  {
> >> @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void)
> >>  	asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
> >>  	return ret;
> >>  }
> >> +
> >> +static inline void pmcr_write(uint32_t value)
> >> +{
> >> +	asm volatile("msr pmcr_el0, %0" : : "r" (value));
> >> +}
> >> +
> >> +static inline uint32_t pmccntr_read(void)
> >> +{
> >> +	uint32_t cycles;
> >> +
> >> +	asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
> >> +	return cycles;
> >> +}
> >> +
> >> +static inline void pmcntenset_write(uint32_t value)
> >> +{
> >> +	asm volatile("msr pmcntenset_el0, %0" : : "r" (value));
> >> +}
> >> +
> >> +static inline void pmccfiltr_write(uint32_t value)
> >> +{
> >> +	asm volatile("msr pmccfiltr_el0, %0" : : "r" (value));
> >> +}
> >>  #endif
> >>  
> >>  /*
> >> @@ -63,11 +132,40 @@ static bool check_pmcr(void)
> >>  	return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0;
> >>  }
> >>  
> >> +/*
> >> + * Ensure that the cycle counter progresses between back-to-back reads.
> >> + */
> >> +static bool check_cycles_increase(void)
> >> +{
> >> +	pmcr_write(pmcr_read() | PMU_PMCR_E);
> >> +
> >> +	for (int i = 0; i < NR_SAMPLES; i++) {
> >> +		unsigned long a, b;
> >> +
> >> +		a = pmccntr_read();
> >> +		b = pmccntr_read();
> >> +
> >> +		if (a >= b) {
> >> +			printf("Read %ld then %ld.\n", a, b);
> >> +			return false;
> >> +		}
> >> +	}
> >> +
> >> +	pmcr_write(pmcr_read() & ~PMU_PMCR_E);
> >> +
> >> +	return true;
> >> +}
> >> +
> >>  int main(void)
> >>  {
> >>  	report_prefix_push("pmu");
> >>  
> >> +	/* init for PMU event access, right now only care about cycle count */
> >> +	pmcntenset_write(1 << PMU_CYCLE_IDX);
> >> +	pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */
> >> +
> >>  	report("Control register", check_pmcr());
> >> +	report("Monotonically increasing cycle count", check_cycles_increase());
> >>  
> >>  	return report_summary();
> >>  }
> >> -- 
> >> 1.8.3.1
> > 
> > Besides needing to use u64's for registers that return u64's, it
> > looks good to me.
> > 
> > drew
> > 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christopher Covington Nov. 14, 2016, 3:12 p.m. UTC | #4
Hi Drew, Wei,

On 11/14/2016 05:05 AM, Andrew Jones wrote:
> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
>>
>>
>> On 11/11/2016 01:43 AM, Andrew Jones wrote:
>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
>>>> From: Christopher Covington <cov@codeaurora.org>
>>>>
>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>>>> even for the smallest delta of two subsequent reads.
>>>>
>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
>>>> Signed-off-by: Wei Huang <wei@redhat.com>
>>>> ---
>>>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 98 insertions(+)
>>>>
>>>> diff --git a/arm/pmu.c b/arm/pmu.c
>>>> index 0b29088..d5e3ac3 100644
>>>> --- a/arm/pmu.c
>>>> +++ b/arm/pmu.c
>>>> @@ -14,6 +14,7 @@
>>>>   */
>>>>  #include "libcflat.h"
>>>>  
>>>> +#define PMU_PMCR_E         (1 << 0)
>>>>  #define PMU_PMCR_N_SHIFT   11
>>>>  #define PMU_PMCR_N_MASK    0x1f
>>>>  #define PMU_PMCR_ID_SHIFT  16
>>>> @@ -21,6 +22,10 @@
>>>>  #define PMU_PMCR_IMP_SHIFT 24
>>>>  #define PMU_PMCR_IMP_MASK  0xff
>>>>  
>>>> +#define PMU_CYCLE_IDX      31
>>>> +
>>>> +#define NR_SAMPLES 10
>>>> +
>>>>  #if defined(__arm__)
>>>>  static inline uint32_t pmcr_read(void)
>>>>  {
>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>>>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>>>  	return ret;
>>>>  }
>>>> +
>>>> +static inline void pmcr_write(uint32_t value)
>>>> +{
>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
>>>> +}
>>>> +
>>>> +static inline void pmselr_write(uint32_t value)
>>>> +{
>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
>>>> +}
>>>> +
>>>> +static inline void pmxevtyper_write(uint32_t value)
>>>> +{
>>>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
>>>> +}
>>>> +
>>>> +/*
>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
>>>> + * bits doesn't seem worth the trouble when differential usage of the result is
>>>> + * expected (with differences that can easily fit in 32 bits). So just return
>>>> + * the lower 32 bits of the cycle count in AArch32.
>>>
>>> Like I said in the last review, I'd rather we not do this. We should
>>> return the full value and then the test case should confirm the upper
>>> 32 bits are zero.
>>
>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
>> register. We can force it to a more coarse-grained cycle counter with
>> PMCR.D bit=1 (see below). But it is still not a 64-bit register.

AArch32 System Register Descriptions
Performance Monitors registers
PMCCNTR, Performance Monitors Cycle Count Register

To access the PMCCNTR when accessing as a 32-bit register:
MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt
MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged

To access the PMCCNTR when accessing as a 64-bit register:
MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2
MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32]

Regards,
Cov
Wei Huang Nov. 15, 2016, 10:50 p.m. UTC | #5
On 11/14/2016 09:12 AM, Christopher Covington wrote:
> Hi Drew, Wei,
> 
> On 11/14/2016 05:05 AM, Andrew Jones wrote:
>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
>>>
>>>
>>> On 11/11/2016 01:43 AM, Andrew Jones wrote:
>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
>>>>> From: Christopher Covington <cov@codeaurora.org>
>>>>>
>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>>>>> even for the smallest delta of two subsequent reads.
>>>>>
>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
>>>>> Signed-off-by: Wei Huang <wei@redhat.com>
>>>>> ---
>>>>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  1 file changed, 98 insertions(+)
>>>>>
>>>>> diff --git a/arm/pmu.c b/arm/pmu.c
>>>>> index 0b29088..d5e3ac3 100644
>>>>> --- a/arm/pmu.c
>>>>> +++ b/arm/pmu.c
>>>>> @@ -14,6 +14,7 @@
>>>>>   */
>>>>>  #include "libcflat.h"
>>>>>  
>>>>> +#define PMU_PMCR_E         (1 << 0)
>>>>>  #define PMU_PMCR_N_SHIFT   11
>>>>>  #define PMU_PMCR_N_MASK    0x1f
>>>>>  #define PMU_PMCR_ID_SHIFT  16
>>>>> @@ -21,6 +22,10 @@
>>>>>  #define PMU_PMCR_IMP_SHIFT 24
>>>>>  #define PMU_PMCR_IMP_MASK  0xff
>>>>>  
>>>>> +#define PMU_CYCLE_IDX      31
>>>>> +
>>>>> +#define NR_SAMPLES 10
>>>>> +
>>>>>  #if defined(__arm__)
>>>>>  static inline uint32_t pmcr_read(void)
>>>>>  {
>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>>>>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>>>>  	return ret;
>>>>>  }
>>>>> +
>>>>> +static inline void pmcr_write(uint32_t value)
>>>>> +{
>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
>>>>> +}
>>>>> +
>>>>> +static inline void pmselr_write(uint32_t value)
>>>>> +{
>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
>>>>> +}
>>>>> +
>>>>> +static inline void pmxevtyper_write(uint32_t value)
>>>>> +{
>>>>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is
>>>>> + * expected (with differences that can easily fit in 32 bits). So just return
>>>>> + * the lower 32 bits of the cycle count in AArch32.
>>>>
>>>> Like I said in the last review, I'd rather we not do this. We should
>>>> return the full value and then the test case should confirm the upper
>>>> 32 bits are zero.
>>>
>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
>>> register. We can force it to a more coarse-grained cycle counter with
>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register.
> 
> AArch32 System Register Descriptions
> Performance Monitors registers
> PMCCNTR, Performance Monitors Cycle Count Register
> 
> To access the PMCCNTR when accessing as a 32-bit register:
> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt
> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged
> 
> To access the PMCCNTR when accessing as a 64-bit register:
> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2
> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32]
> 

Thanks. I did some research based on your info and came back with the
following proposals (Cov, correct me if I am wrong):

By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I
think this 64-bit cycle register is only available when running under
aarch32 compatibility mode on ARMv8 because it is not specified in A15
TRM. To further verify it, I tested 32-bit pmu code on QEMU with TCG
mode. The result is: accessing 64-bit PMCCNTR using the following
assembly failed on A15:

   asm volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi));
or
   asm volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val));

Given this difference, I think there are two solutions for 64-bit
AArch32 pmccntr_read, as requested by Drew:

1) The PMU unit testing code determines whether it is running under ARMv7 or
under AArch32-compatibility mode. When it is running on ARMv7, such as A15, let us
use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise
use "MRRC p15,0,<Rt>,<Rt2>,c9".

2) Return 64-bit results for ARM pmccntr_read(). But we only use "MRC
p15,0,<Rt>,c9,c13,0" and always clear the upper 32 bits to 0. This will
be the same as the original code.

Thoughts?

-Wei

[1] A57 TRM,
http://infocenter.arm.com/help/topic/com.arm.doc.ddi0488c/DDI0488C_cortex_a57_mpcore_r1p0_trm.pdf
[2] A15 TRM,
http://infocenter.arm.com/help/topic/com.arm.doc.ddi0438c/DDI0438C_cortex_a15_r2p0_trm.pdf

> Regards,
> Cov
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jones Nov. 16, 2016, 1:01 p.m. UTC | #6
On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote:
> 
> 
> On 11/14/2016 09:12 AM, Christopher Covington wrote:
> > Hi Drew, Wei,
> > 
> > On 11/14/2016 05:05 AM, Andrew Jones wrote:
> >> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
> >>>
> >>>
> >>> On 11/11/2016 01:43 AM, Andrew Jones wrote:
> >>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
> >>>>> From: Christopher Covington <cov@codeaurora.org>
> >>>>>
> >>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
> >>>>> even for the smallest delta of two subsequent reads.
> >>>>>
> >>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
> >>>>> Signed-off-by: Wei Huang <wei@redhat.com>
> >>>>> ---
> >>>>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>  1 file changed, 98 insertions(+)
> >>>>>
> >>>>> diff --git a/arm/pmu.c b/arm/pmu.c
> >>>>> index 0b29088..d5e3ac3 100644
> >>>>> --- a/arm/pmu.c
> >>>>> +++ b/arm/pmu.c
> >>>>> @@ -14,6 +14,7 @@
> >>>>>   */
> >>>>>  #include "libcflat.h"
> >>>>>  
> >>>>> +#define PMU_PMCR_E         (1 << 0)
> >>>>>  #define PMU_PMCR_N_SHIFT   11
> >>>>>  #define PMU_PMCR_N_MASK    0x1f
> >>>>>  #define PMU_PMCR_ID_SHIFT  16
> >>>>> @@ -21,6 +22,10 @@
> >>>>>  #define PMU_PMCR_IMP_SHIFT 24
> >>>>>  #define PMU_PMCR_IMP_MASK  0xff
> >>>>>  
> >>>>> +#define PMU_CYCLE_IDX      31
> >>>>> +
> >>>>> +#define NR_SAMPLES 10
> >>>>> +
> >>>>>  #if defined(__arm__)
> >>>>>  static inline uint32_t pmcr_read(void)
> >>>>>  {
> >>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
> >>>>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
> >>>>>  	return ret;
> >>>>>  }
> >>>>> +
> >>>>> +static inline void pmcr_write(uint32_t value)
> >>>>> +{
> >>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
> >>>>> +}
> >>>>> +
> >>>>> +static inline void pmselr_write(uint32_t value)
> >>>>> +{
> >>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
> >>>>> +}
> >>>>> +
> >>>>> +static inline void pmxevtyper_write(uint32_t value)
> >>>>> +{
> >>>>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
> >>>>> + * bits doesn't seem worth the trouble when differential usage of the result is
> >>>>> + * expected (with differences that can easily fit in 32 bits). So just return
> >>>>> + * the lower 32 bits of the cycle count in AArch32.
> >>>>
> >>>> Like I said in the last review, I'd rather we not do this. We should
> >>>> return the full value and then the test case should confirm the upper
> >>>> 32 bits are zero.
> >>>
> >>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
> >>> register. We can force it to a more coarse-grained cycle counter with
> >>> PMCR.D bit=1 (see below). But it is still not a 64-bit register.
> > 
> > AArch32 System Register Descriptions
> > Performance Monitors registers
> > PMCCNTR, Performance Monitors Cycle Count Register
> > 
> > To access the PMCCNTR when accessing as a 32-bit register:
> > MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt
> > MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged
> > 
> > To access the PMCCNTR when accessing as a 64-bit register:
> > MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2
> > MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32]
> > 
> 
> Thanks. I did some research based on your info and came back with the
> following proposals (Cov, correct me if I am wrong):
> 
> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I
> think this 64-bit cycle register is only available when running under
> aarch32 compatibility mode on ARMv8 because it is not specified in A15
> TRM.

OK, I hadn't realized that there would be differences between v7 and
AArch32. It looks like we need to add a function to the kvm-unit-tests
framework that enables unit tests to make that distinction, because we'll
want to explicitly test those differences in order to flush out emulation
bugs. I see now that Appendix K5 of the v8 ARM ARM lists some differences,
but this PMCCNTR difference isn't there...

As v8-A32 is an update/extension of v7-A, I'd expect there to be a RES0
bit in some v7 ID register that, on v8, is no longer reserved and a 1.
Unfortunately I just did some ARM doc skimming but can't find anything
like that. As we currently only use the cortex-a15 for our v7 processor,
then I guess we can just check MIDR, but yuck. Anyway, I'll send a
patch for that.

> To further verify it, I tested 32-bit pmu code on QEMU with TCG
> mode. The result is: accessing 64-bit PMCCNTR using the following
> assembly failed on A15:
> 
>    asm volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi));
> or
>    asm volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val));
> 
> Given this difference, I think there are two solutions for 64-bit
> AArch32 pmccntr_read, as requested by Drew:
> 
> 1) The PMU unit testing code determines whether it is running under ARMv7 or
> under AArch32-compatibility mode. When it is running on ARMv7, such as A15, let us
> use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise
> use "MRRC p15,0,<Rt>,<Rt2>,c9".
> 
> 2) Return 64-bit results for ARM pmccntr_read(). But we only use "MRC
> p15,0,<Rt>,c9,c13,0" and always clear the upper 32 bits to 0. This will
> be the same as the original code.

3) For the basic test do (2), but add an additional test for AArch32
   mode that also does the MRRC. That way on AArch32 we test both access
   types.

Going with (3) means we can finish this series off now and then post
another patch later with the additional access, after my is_aarch32()
patch, that I'll write now, gets merged.

Thanks,
drew

> 
> Thoughts?
> 
> -Wei
> 
> [1] A57 TRM,
> http://infocenter.arm.com/help/topic/com.arm.doc.ddi0488c/DDI0488C_cortex_a57_mpcore_r1p0_trm.pdf
> [2] A15 TRM,
> http://infocenter.arm.com/help/topic/com.arm.doc.ddi0438c/DDI0438C_cortex_a15_r2p0_trm.pdf
> 
> > Regards,
> > Cov
> > 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jones Nov. 16, 2016, 3:40 p.m. UTC | #7
Just crossed my mind that we're missing isb's.

On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
> From: Christopher Covington <cov@codeaurora.org>
> 
> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
> even for the smallest delta of two subsequent reads.
> 
> Signed-off-by: Christopher Covington <cov@codeaurora.org>
> Signed-off-by: Wei Huang <wei@redhat.com>
> ---
>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 98 insertions(+)
> 
> diff --git a/arm/pmu.c b/arm/pmu.c
> index 0b29088..d5e3ac3 100644
> --- a/arm/pmu.c
> +++ b/arm/pmu.c
> @@ -14,6 +14,7 @@
>   */
>  #include "libcflat.h"
>  
> +#define PMU_PMCR_E         (1 << 0)
>  #define PMU_PMCR_N_SHIFT   11
>  #define PMU_PMCR_N_MASK    0x1f
>  #define PMU_PMCR_ID_SHIFT  16
> @@ -21,6 +22,10 @@
>  #define PMU_PMCR_IMP_SHIFT 24
>  #define PMU_PMCR_IMP_MASK  0xff
>  
> +#define PMU_CYCLE_IDX      31
> +
> +#define NR_SAMPLES 10
> +
>  #if defined(__arm__)
>  static inline uint32_t pmcr_read(void)
>  {
> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>  	return ret;
>  }
> +
> +static inline void pmcr_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
> +}
> +
> +static inline void pmselr_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));

Probably want an isb here; users will call this and then immediately
do another PMU reg write, like is done below

> +}
> +
> +static inline void pmxevtyper_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
> +}
> +
> +/*
> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
> + * bits doesn't seem worth the trouble when differential usage of the result is
> + * expected (with differences that can easily fit in 32 bits). So just return
> + * the lower 32 bits of the cycle count in AArch32.

Also, while we're discussing confirming upper bits are as expected, I
guess we should confirm no overflow too. We should clear the overflow
bit PMOVSCLR_EL0.C before we use the counter, and then check it at some
point to confirm it's as expected. I guess that could be separate test
cases though.

> + */
> +static inline uint32_t pmccntr_read(void)
> +{
> +	uint32_t cycles;
> +
> +	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
> +	return cycles;
> +}
> +
> +static inline void pmcntenset_write(uint32_t value)
> +{
> +	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value));
> +}
> +
> +/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */
> +static inline void pmccfiltr_write(uint32_t value)
> +{
> +	pmselr_write(PMU_CYCLE_IDX);
> +	pmxevtyper_write(value);
> +}
>  #elif defined(__aarch64__)
>  static inline uint32_t pmcr_read(void)
>  {
> @@ -37,6 +83,29 @@ static inline uint32_t pmcr_read(void)
>  	asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
>  	return ret;
>  }
> +
> +static inline void pmcr_write(uint32_t value)
> +{
> +	asm volatile("msr pmcr_el0, %0" : : "r" (value));
> +}
> +
> +static inline uint32_t pmccntr_read(void)
> +{
> +	uint32_t cycles;
> +
> +	asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
> +	return cycles;
> +}
> +
> +static inline void pmcntenset_write(uint32_t value)
> +{
> +	asm volatile("msr pmcntenset_el0, %0" : : "r" (value));
> +}
> +
> +static inline void pmccfiltr_write(uint32_t value)
> +{
> +	asm volatile("msr pmccfiltr_el0, %0" : : "r" (value));
> +}
>  #endif
>  
>  /*
> @@ -63,11 +132,40 @@ static bool check_pmcr(void)
>  	return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0;
>  }
>  
> +/*
> + * Ensure that the cycle counter progresses between back-to-back reads.
> + */
> +static bool check_cycles_increase(void)
> +{
> +	pmcr_write(pmcr_read() | PMU_PMCR_E);

Need isb() here

> +
> +	for (int i = 0; i < NR_SAMPLES; i++) {
> +		unsigned long a, b;
> +
> +		a = pmccntr_read();
> +		b = pmccntr_read();
> +
> +		if (a >= b) {
> +			printf("Read %ld then %ld.\n", a, b);
> +			return false;
> +		}
> +	}
> +
> +	pmcr_write(pmcr_read() & ~PMU_PMCR_E);
> +

Need isb() here

> +	return true;
> +}
> +
>  int main(void)
>  {
>  	report_prefix_push("pmu");
>  
> +	/* init for PMU event access, right now only care about cycle count */
> +	pmcntenset_write(1 << PMU_CYCLE_IDX);
> +	pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */

Need isb() here

> +
>  	report("Control register", check_pmcr());
> +	report("Monotonically increasing cycle count", check_cycles_increase());
>  
>  	return report_summary();
>  }
> -- 
> 1.8.3.1
> 
>

Thanks,
drew
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christopher Covington Nov. 16, 2016, 4:08 p.m. UTC | #8
On 11/16/2016 08:01 AM, Andrew Jones wrote:
> On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote:
>>
>>
>> On 11/14/2016 09:12 AM, Christopher Covington wrote:
>>> Hi Drew, Wei,
>>>
>>> On 11/14/2016 05:05 AM, Andrew Jones wrote:
>>>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
>>>>>
>>>>>
>>>>> On 11/11/2016 01:43 AM, Andrew Jones wrote:
>>>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
>>>>>>> From: Christopher Covington <cov@codeaurora.org>
>>>>>>>
>>>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>>>>>>> even for the smallest delta of two subsequent reads.
>>>>>>>
>>>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
>>>>>>> Signed-off-by: Wei Huang <wei@redhat.com>
>>>>>>> ---
>>>>>>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>  1 file changed, 98 insertions(+)
>>>>>>>
>>>>>>> diff --git a/arm/pmu.c b/arm/pmu.c
>>>>>>> index 0b29088..d5e3ac3 100644
>>>>>>> --- a/arm/pmu.c
>>>>>>> +++ b/arm/pmu.c
>>>>>>> @@ -14,6 +14,7 @@
>>>>>>>   */
>>>>>>>  #include "libcflat.h"
>>>>>>>  
>>>>>>> +#define PMU_PMCR_E         (1 << 0)
>>>>>>>  #define PMU_PMCR_N_SHIFT   11
>>>>>>>  #define PMU_PMCR_N_MASK    0x1f
>>>>>>>  #define PMU_PMCR_ID_SHIFT  16
>>>>>>> @@ -21,6 +22,10 @@
>>>>>>>  #define PMU_PMCR_IMP_SHIFT 24
>>>>>>>  #define PMU_PMCR_IMP_MASK  0xff
>>>>>>>  
>>>>>>> +#define PMU_CYCLE_IDX      31
>>>>>>> +
>>>>>>> +#define NR_SAMPLES 10
>>>>>>> +
>>>>>>>  #if defined(__arm__)
>>>>>>>  static inline uint32_t pmcr_read(void)
>>>>>>>  {
>>>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>>>>>>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>>>>>>  	return ret;
>>>>>>>  }
>>>>>>> +
>>>>>>> +static inline void pmcr_write(uint32_t value)
>>>>>>> +{
>>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline void pmselr_write(uint32_t value)
>>>>>>> +{
>>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
>>>>>>> +}
>>>>>>> +
>>>>>>> +static inline void pmxevtyper_write(uint32_t value)
>>>>>>> +{
>>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
>>>>>>> +}
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
>>>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is
>>>>>>> + * expected (with differences that can easily fit in 32 bits). So just return
>>>>>>> + * the lower 32 bits of the cycle count in AArch32.
>>>>>>
>>>>>> Like I said in the last review, I'd rather we not do this. We should
>>>>>> return the full value and then the test case should confirm the upper
>>>>>> 32 bits are zero.
>>>>>
>>>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
>>>>> register. We can force it to a more coarse-grained cycle counter with
>>>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register.
>>>
>>> AArch32 System Register Descriptions
>>> Performance Monitors registers
>>> PMCCNTR, Performance Monitors Cycle Count Register
>>>
>>> To access the PMCCNTR when accessing as a 32-bit register:
>>> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt
>>> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged
>>>
>>> To access the PMCCNTR when accessing as a 64-bit register:
>>> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2
>>> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32]
>>>
>>
>> Thanks. I did some research based on your info and came back with the
>> following proposals (Cov, correct me if I am wrong):
>>
>> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I
>> think this 64-bit cycle register is only available when running under
>> aarch32 compatibility mode on ARMv8 because it is not specified in A15
>> TRM.

That interpretation sounds really strange to me. My recollection is that the
cycle counter was available as a 64 bit register in ARMv7 as well. I would
expect the Cortex TRMs to omit such details. The ARMv7 Architecture Reference
Manual is the complete and authoritative source.

>> To further verify it, I tested 32-bit pmu code on QEMU with TCG
>> mode. The result is: accessing 64-bit PMCCNTR using the following
>> assembly failed on A15:
>>
>>    volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi));
>> or
>>    volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val));

The PMU implementation on QEMU TCG mode is infantile. (I was trying to
write these tests to help guide fixes and enhancements in a
test-driven-development manner.) I would not trust QEMU TCG to behave
properly here. If you want to execute those instructions, is there anything
preventing you from doing it on hardware, or at least the Foundation Model?

>> Given this difference, I think there are two solutions for 64-bit
>> AArch32 pmccntr_read, as requested by Drew:
>>
>> 1) The PMU unit testing code tells if it is running under ARMv7 or under
>> AArch32-compability mode. When it is running ARMv7, such as A15, let us
>> use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise
>> use "MRRC p15,0,<Rt>,<Rt2>,c9".
>>
>> 2) Returns 64-bit results for ARM pmccntr_read(). But we only uses "MRC
>> p15,0,<Rt>,c9,c13,0" and always clear the upper 32-bit as 0. This will
>> be the same as the original code.
> 
> 3) For the basic test do (2), but add an additional test for AArch32
>    mode that also does the MRRC. That way on AArch32 we test both access
>    types.

The upper bits being non-zero is an insane corner case.

I'd really prefer to first build some momentum with checks for issues that
are 1) likely to occur and 2) not too difficult to check, like whether PMCR
is writeable (especially relevant to KVM mode where it's not by default).

Thanks,
Cov
Andrew Jones Nov. 16, 2016, 4:25 p.m. UTC | #9
On Wed, Nov 16, 2016 at 11:08:42AM -0500, Christopher Covington wrote:
> On 11/16/2016 08:01 AM, Andrew Jones wrote:
> > On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote:
> >>
> >>
> >> On 11/14/2016 09:12 AM, Christopher Covington wrote:
> >>> Hi Drew, Wei,
> >>>
> >>> On 11/14/2016 05:05 AM, Andrew Jones wrote:
> >>>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
> >>>>>
> >>>>>
> >>>>> On 11/11/2016 01:43 AM, Andrew Jones wrote:
> >>>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
> >>>>>>> From: Christopher Covington <cov@codeaurora.org>
> >>>>>>>
> >>>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
> >>>>>>> even for the smallest delta of two subsequent reads.
> >>>>>>>
> >>>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
> >>>>>>> Signed-off-by: Wei Huang <wei@redhat.com>
> >>>>>>> ---
> >>>>>>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>>  1 file changed, 98 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/arm/pmu.c b/arm/pmu.c
> >>>>>>> index 0b29088..d5e3ac3 100644
> >>>>>>> --- a/arm/pmu.c
> >>>>>>> +++ b/arm/pmu.c
> >>>>>>> @@ -14,6 +14,7 @@
> >>>>>>>   */
> >>>>>>>  #include "libcflat.h"
> >>>>>>>  
> >>>>>>> +#define PMU_PMCR_E         (1 << 0)
> >>>>>>>  #define PMU_PMCR_N_SHIFT   11
> >>>>>>>  #define PMU_PMCR_N_MASK    0x1f
> >>>>>>>  #define PMU_PMCR_ID_SHIFT  16
> >>>>>>> @@ -21,6 +22,10 @@
> >>>>>>>  #define PMU_PMCR_IMP_SHIFT 24
> >>>>>>>  #define PMU_PMCR_IMP_MASK  0xff
> >>>>>>>  
> >>>>>>> +#define PMU_CYCLE_IDX      31
> >>>>>>> +
> >>>>>>> +#define NR_SAMPLES 10
> >>>>>>> +
> >>>>>>>  #if defined(__arm__)
> >>>>>>>  static inline uint32_t pmcr_read(void)
> >>>>>>>  {
> >>>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
> >>>>>>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
> >>>>>>>  	return ret;
> >>>>>>>  }
> >>>>>>> +
> >>>>>>> +static inline void pmcr_write(uint32_t value)
> >>>>>>> +{
> >>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static inline void pmselr_write(uint32_t value)
> >>>>>>> +{
> >>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static inline void pmxevtyper_write(uint32_t value)
> >>>>>>> +{
> >>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +/*
> >>>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
> >>>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is
> >>>>>>> + * expected (with differences that can easily fit in 32 bits). So just return
> >>>>>>> + * the lower 32 bits of the cycle count in AArch32.
> >>>>>>
> >>>>>> Like I said in the last review, I'd rather we not do this. We should
> >>>>>> return the full value and then the test case should confirm the upper
> >>>>>> 32 bits are zero.
> >>>>>
> >>>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
> >>>>> register. We can force it to a more coarse-grained cycle counter with
> >>>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register.
> >>>
> >>> AArch32 System Register Descriptions
> >>> Performance Monitors registers
> >>> PMCCNTR, Performance Monitors Cycle Count Register
> >>>
> >>> To access the PMCCNTR when accessing as a 32-bit register:
> >>> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt
> >>> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged
> >>>
> >>> To access the PMCCNTR when accessing as a 64-bit register:
> >>> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2
> >>> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32]
> >>>
> >>
> >> Thanks. I did some research based on your info and came back with the
> >> following proposals (Cov, correct me if I am wrong):
> >>
> >> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I
> >> think this 64-bit cycle register is only available when running under
> >> aarch32 compatibility mode on ARMv8 because it is not specified in A15
> >> TRM.
> 
> That interpretation sounds really strange to me. My recollection is that the
> cycle counter was available as a 64 bit register in ARMv7 as well. I would
> expect the Cortex TRMs to omit such details. The ARMv7 Architecture Reference
> Manual is the complete and authoritative source.

Yes, the v7 ARM ARM is the authoritative source, and it says 32-bit.
Whereas the v8 ARM ARM, wrt AArch32 mode, says it's both 32 and 64.

> 
> >> To further verify it, I tested 32-bit pmu code on QEMU with TCG
> >> mode. The result is: accessing 64-bit PMCCNTR using the following
> >> assembly failed on A15:
> >>
> >>    volatile("mrrc p15, 0, %0, %1, c9" : "=r" (lo), "=r" (hi));
> >> or
> >>    volatile("mrrc p15, 0, %Q0, %R0, c9" : "=r" (val));
> 
> The PMU implementation on QEMU TCG mode is infantile. (I was trying to
> write these tests to help guide fixes and enhancements in a
> test-driven-development manner.) I would not trust QEMU TCG to behave
> properly here. If you want to execute those instructions, is there anything
> preventing you from doing it on hardware, or at least the Foundation Model?
> 
> >> Given this difference, I think there are two solutions for 64-bit
> >> AArch32 pmccntr_read, as requested by Drew:
> >>
> >> 1) The PMU unit testing code tells if it is running under ARMv7 or under
> >> AArch32-compability mode. When it is running ARMv7, such as A15, let us
> >> use "MRC p15,0,<Rt>,c9,c13,0" and clear the upper 32-bit as 0. Otherwise
> >> use "MRRC p15,0,<Rt>,<Rt2>,c9".
> >>
> >> 2) Returns 64-bit results for ARM pmccntr_read(). But we only uses "MRC
> >> p15,0,<Rt>,c9,c13,0" and always clear the upper 32-bit as 0. This will
> >> be the same as the original code.
> > 
> > 3) For the basic test do (2), but add an additional test for AArch32
> >    mode that also does the MRRC. That way on AArch32 we test both access
> >    types.
> 
> The upper bits being non-zero is an insane corner case.

The bits will most likely be zero, but how do you know without checking?
Also, just invoking an mrrc vs. mrc is a big difference when testing
emulation. We shouldn't skip it just because we expect it to give us a
boring result.

> 
> I'd really prefer to first build some momentum with checks for issues that
> are A) likely to occur and 2) not too difficult to check, like whether PMCR
> is writeable (especially relevant to KVM mode where it's not by default).

Looks like we're on the right track for this starter series then.

Thanks,
drew

> 
> Thanks,
> Cov
> 
> -- 
> Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
> Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code
> Aurora Forum, a Linux Foundation Collaborative Project.
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christopher Covington Nov. 16, 2016, 4:41 p.m. UTC | #10
On 11/16/2016 11:25 AM, Andrew Jones wrote:
> On Wed, Nov 16, 2016 at 11:08:42AM -0500, Christopher Covington wrote:
>> On 11/16/2016 08:01 AM, Andrew Jones wrote:
>>> On Tue, Nov 15, 2016 at 04:50:53PM -0600, Wei Huang wrote:
>>>>
>>>>
>>>> On 11/14/2016 09:12 AM, Christopher Covington wrote:
>>>>> Hi Drew, Wei,
>>>>>
>>>>> On 11/14/2016 05:05 AM, Andrew Jones wrote:
>>>>>> On Fri, Nov 11, 2016 at 01:55:49PM -0600, Wei Huang wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 11/11/2016 01:43 AM, Andrew Jones wrote:
>>>>>>>> On Tue, Nov 08, 2016 at 12:17:14PM -0600, Wei Huang wrote:
>>>>>>>>> From: Christopher Covington <cov@codeaurora.org>
>>>>>>>>>
>>>>>>>>> Ensure that reads of the PMCCNTR_EL0 are monotonically increasing,
>>>>>>>>> even for the smallest delta of two subsequent reads.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Christopher Covington <cov@codeaurora.org>
>>>>>>>>> Signed-off-by: Wei Huang <wei@redhat.com>
>>>>>>>>> ---
>>>>>>>>>  arm/pmu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>  1 file changed, 98 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/arm/pmu.c b/arm/pmu.c
>>>>>>>>> index 0b29088..d5e3ac3 100644
>>>>>>>>> --- a/arm/pmu.c
>>>>>>>>> +++ b/arm/pmu.c
>>>>>>>>> @@ -14,6 +14,7 @@
>>>>>>>>>   */
>>>>>>>>>  #include "libcflat.h"
>>>>>>>>>  
>>>>>>>>> +#define PMU_PMCR_E         (1 << 0)
>>>>>>>>>  #define PMU_PMCR_N_SHIFT   11
>>>>>>>>>  #define PMU_PMCR_N_MASK    0x1f
>>>>>>>>>  #define PMU_PMCR_ID_SHIFT  16
>>>>>>>>> @@ -21,6 +22,10 @@
>>>>>>>>>  #define PMU_PMCR_IMP_SHIFT 24
>>>>>>>>>  #define PMU_PMCR_IMP_MASK  0xff
>>>>>>>>>  
>>>>>>>>> +#define PMU_CYCLE_IDX      31
>>>>>>>>> +
>>>>>>>>> +#define NR_SAMPLES 10
>>>>>>>>> +
>>>>>>>>>  #if defined(__arm__)
>>>>>>>>>  static inline uint32_t pmcr_read(void)
>>>>>>>>>  {
>>>>>>>>> @@ -29,6 +34,47 @@ static inline uint32_t pmcr_read(void)
>>>>>>>>>  	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
>>>>>>>>>  	return ret;
>>>>>>>>>  }
>>>>>>>>> +
>>>>>>>>> +static inline void pmcr_write(uint32_t value)
>>>>>>>>> +{
>>>>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void pmselr_write(uint32_t value)
>>>>>>>>> +{
>>>>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static inline void pmxevtyper_write(uint32_t value)
>>>>>>>>> +{
>>>>>>>>> +	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +/*
>>>>>>>>> + * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
>>>>>>>>> + * bits doesn't seem worth the trouble when differential usage of the result is
>>>>>>>>> + * expected (with differences that can easily fit in 32 bits). So just return
>>>>>>>>> + * the lower 32 bits of the cycle count in AArch32.
>>>>>>>>
>>>>>>>> Like I said in the last review, I'd rather we not do this. We should
>>>>>>>> return the full value and then the test case should confirm the upper
>>>>>>>> 32 bits are zero.
>>>>>>>
>>>>>>> Unless I miss something in ARM documentation, ARMv7 PMCCNTR is a 32-bit
>>>>>>> register. We can force it to a more coarse-grained cycle counter with
>>>>>>> PMCR.D bit=1 (see below). But it is still not a 64-bit register.
>>>>>
>>>>> AArch32 System Register Descriptions
>>>>> Performance Monitors registers
>>>>> PMCCNTR, Performance Monitors Cycle Count Register
>>>>>
>>>>> To access the PMCCNTR when accessing as a 32-bit register:
>>>>> MRC p15,0,<Rt>,c9,c13,0 ; Read PMCCNTR[31:0] into Rt
>>>>> MCR p15,0,<Rt>,c9,c13,0 ; Write Rt to PMCCNTR[31:0]. PMCCNTR[63:32] are unchanged
>>>>>
>>>>> To access the PMCCNTR when accessing as a 64-bit register:
>>>>> MRRC p15,0,<Rt>,<Rt2>,c9 ; Read PMCCNTR[31:0] into Rt and PMCCNTR[63:32] into Rt2
>>>>> MCRR p15,0,<Rt>,<Rt2>,c9 ; Write Rt to PMCCNTR[31:0] and Rt2 to PMCCNTR[63:32]
>>>>>
>>>>
>>>> Thanks. I did some research based on your info and came back with the
>>>> following proposals (Cov, correct me if I am wrong):
>>>>
>>>> By comparing A57 TRM (page 394 in [1]) with A15 TRM (page 273 in [2]), I
>>>> think this 64-bit cycle register is only available when running under
>>>> aarch32 compatibility mode on ARMv8 because it is not specified in A15
>>>> TRM.
>>
>> That interpretation sounds really strange to me. My recollection is that the
>> cycle counter was available as a 64 bit register in ARMv7 as well. I would
>> expect the Cortex TRMs to omit such details. The ARMv7 Architecture Reference
>> Manual is the complete and authoritative source.
> 
> Yes, the v7 ARM ARM is the authoritative source, and it says 32-bit.
> Whereas the v8 ARM ARM wrt to AArch32 mode says it's both 32 and 64.

Just looked it up as well in the good old ARM DDI 0406C.c and you're absolutely
right. Sorry for the bad recollection.

Cov
diff mbox

Patch

diff --git a/arm/pmu.c b/arm/pmu.c
index 0b29088..d5e3ac3 100644
--- a/arm/pmu.c
+++ b/arm/pmu.c
@@ -14,6 +14,7 @@ 
  */
 #include "libcflat.h"
 
+#define PMU_PMCR_E         (1 << 0)
 #define PMU_PMCR_N_SHIFT   11
 #define PMU_PMCR_N_MASK    0x1f
 #define PMU_PMCR_ID_SHIFT  16
@@ -21,6 +22,10 @@ 
 #define PMU_PMCR_IMP_SHIFT 24
 #define PMU_PMCR_IMP_MASK  0xff
 
+#define PMU_CYCLE_IDX      31
+
+#define NR_SAMPLES 10
+
 #if defined(__arm__)
 static inline uint32_t pmcr_read(void)
 {
@@ -29,6 +34,47 @@  static inline uint32_t pmcr_read(void)
 	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (ret));
 	return ret;
 }
+
+static inline void pmcr_write(uint32_t value)
+{
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (value));
+}
+
+static inline void pmselr_write(uint32_t value)
+{
+	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (value));
+}
+
+static inline void pmxevtyper_write(uint32_t value)
+{
+	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (value));
+}
+
+/*
+ * While PMCCNTR can be accessed as a 64 bit coprocessor register, returning 64
+ * bits doesn't seem worth the trouble when differential usage of the result is
+ * expected (with differences that can easily fit in 32 bits). So just return
+ * the lower 32 bits of the cycle count in AArch32.
+ */
+static inline uint32_t pmccntr_read(void)
+{
+	uint32_t cycles;
+
+	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
+	return cycles;
+}
+
+static inline void pmcntenset_write(uint32_t value)
+{
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (value));
+}
+
+/* PMCCFILTR is an obsolete name for PMXEVTYPER31 in ARMv7 */
+static inline void pmccfiltr_write(uint32_t value)
+{
+	pmselr_write(PMU_CYCLE_IDX);
+	pmxevtyper_write(value);
+}
 #elif defined(__aarch64__)
 static inline uint32_t pmcr_read(void)
 {
@@ -37,6 +83,29 @@  static inline uint32_t pmcr_read(void)
 	asm volatile("mrs %0, pmcr_el0" : "=r" (ret));
 	return ret;
 }
+
+static inline void pmcr_write(uint32_t value)
+{
+	asm volatile("msr pmcr_el0, %0" : : "r" (value));
+}
+
+static inline uint32_t pmccntr_read(void)
+{
+	uint32_t cycles;
+
+	asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
+	return cycles;
+}
+
+static inline void pmcntenset_write(uint32_t value)
+{
+	asm volatile("msr pmcntenset_el0, %0" : : "r" (value));
+}
+
+static inline void pmccfiltr_write(uint32_t value)
+{
+	asm volatile("msr pmccfiltr_el0, %0" : : "r" (value));
+}
 #endif
 
 /*
@@ -63,11 +132,40 @@  static bool check_pmcr(void)
 	return ((pmcr >> PMU_PMCR_IMP_SHIFT) & PMU_PMCR_IMP_MASK) != 0;
 }
 
+/*
+ * Ensure that the cycle counter progresses between back-to-back reads.
+ */
+static bool check_cycles_increase(void)
+{
+	pmcr_write(pmcr_read() | PMU_PMCR_E);
+
+	for (int i = 0; i < NR_SAMPLES; i++) {
+		unsigned long a, b;
+
+		a = pmccntr_read();
+		b = pmccntr_read();
+
+		if (a >= b) {
+			printf("Read %ld then %ld.\n", a, b);
+			return false;
+		}
+	}
+
+	pmcr_write(pmcr_read() & ~PMU_PMCR_E);
+
+	return true;
+}
+
 int main(void)
 {
 	report_prefix_push("pmu");
 
+	/* init for PMU event access, right now only care about cycle count */
+	pmcntenset_write(1 << PMU_CYCLE_IDX);
+	pmccfiltr_write(0); /* count cycles in EL0, EL1, but not EL2 */
+
 	report("Control register", check_pmcr());
+	report("Monotonically increasing cycle count", check_cycles_increase());
 
 	return report_summary();
 }