
[kvm-unit-tests,v3,04/11] x86: pmu: Switch instructions and core cycles events sequence

Message ID 20240103031409.2504051-5-dapeng1.mi@linux.intel.com (mailing list archive)
State New, archived
Series pmu test bugs fix and improvements

Commit Message

Mi, Dapeng Jan. 3, 2024, 3:14 a.m. UTC
When running pmu test on SPR, sometimes the following failure is
reported.

PMU version:         2
GP counters:         8
GP counter width:    48
Mask length:         8
Fixed counters:      3
Fixed counter width: 48
1000000 <= 55109398 <= 50000000
FAIL: Intel: core cycles-0
1000000 <= 18279571 <= 50000000
PASS: Intel: core cycles-1
1000000 <= 12238092 <= 50000000
PASS: Intel: core cycles-2
1000000 <= 7981727 <= 50000000
PASS: Intel: core cycles-3
1000000 <= 6984711 <= 50000000
PASS: Intel: core cycles-4
1000000 <= 6773673 <= 50000000
PASS: Intel: core cycles-5
1000000 <= 6697842 <= 50000000
PASS: Intel: core cycles-6
1000000 <= 6747947 <= 50000000
PASS: Intel: core cycles-7

The count of the "core cycles" event on the first counter exceeds the
upper boundary and leads to a failure, and then the "core cycles" count
drops gradually and reaches a stable state.

This pattern is explainable. The "core cycles" event is defined as the
1st event in the xxx_gp_events[] array, so it is always verified first.
When the program loop() is executed for the first time, it needs to
warm up the pipeline and caches, e.g. it has to wait for the caches to
be filled. All this warm-up work leads to a quite large core cycles
count which may exceed the verification range.

The "instructions" event, unlike "core cycles", is a good choice as the
warm-up event since it always returns a fixed count. Thus, swap the
positions of the instructions and core cycles events in the
xxx_gp_events[] array.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
---
 x86/pmu.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

Comments

Mingwei Zhang March 27, 2024, 5:36 a.m. UTC | #1
On Wed, Jan 03, 2024, Dapeng Mi wrote:
> When running pmu test on SPR, sometimes the following failure is
> reported.
> 
> PMU version:         2
> GP counters:         8
> GP counter width:    48
> Mask length:         8
> Fixed counters:      3
> Fixed counter width: 48
> 1000000 <= 55109398 <= 50000000
> FAIL: Intel: core cycles-0
> 1000000 <= 18279571 <= 50000000
> PASS: Intel: core cycles-1
> 1000000 <= 12238092 <= 50000000
> PASS: Intel: core cycles-2
> 1000000 <= 7981727 <= 50000000
> PASS: Intel: core cycles-3
> 1000000 <= 6984711 <= 50000000
> PASS: Intel: core cycles-4
> 1000000 <= 6773673 <= 50000000
> PASS: Intel: core cycles-5
> 1000000 <= 6697842 <= 50000000
> PASS: Intel: core cycles-6
> 1000000 <= 6747947 <= 50000000
> PASS: Intel: core cycles-7
> 
> The count of the "core cycles" event on the first counter exceeds the
> upper boundary and leads to a failure, and then the "core cycles"
> count drops gradually and reaches a stable state.
> 
> This pattern is explainable. The "core cycles" event is defined as the
> 1st event in the xxx_gp_events[] array, so it is always verified
> first. When the program loop() is executed for the first time, it
> needs to warm up the pipeline and caches, e.g. it has to wait for the
> caches to be filled. All this warm-up work leads to a quite large core
> cycles count which may exceed the verification range.
> 
> The "instructions" event, unlike "core cycles", is a good choice as
> the warm-up event since it always returns a fixed count. Thus, swap
> the positions of the instructions and core cycles events in the
> xxx_gp_events[] array.

The observation is great. However, it is hard to agree with fixing the
problem by switching the order. Maybe directly raising the upper bound
from 50*N to a larger value makes more sense.
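
For illustration, that alternative amounts to a one-line tweak to the
event table; the 100 below is a hypothetical value, not one settled on
in this thread:

	-	{"core cycles", 0x003c, 1*N, 50*N},
	+	{"core cycles", 0x003c, 1*N, 100*N},	/* 100 is illustrative */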

Thanks.
-Mingwei
Mi, Dapeng March 27, 2024, 8:54 a.m. UTC | #2
On 3/27/2024 1:36 PM, Mingwei Zhang wrote:
> On Wed, Jan 03, 2024, Dapeng Mi wrote:
>> When running pmu test on SPR, sometimes the following failure is
>> reported.
>>
>> PMU version:         2
>> GP counters:         8
>> GP counter width:    48
>> Mask length:         8
>> Fixed counters:      3
>> Fixed counter width: 48
>> 1000000 <= 55109398 <= 50000000
>> FAIL: Intel: core cycles-0
>> 1000000 <= 18279571 <= 50000000
>> PASS: Intel: core cycles-1
>> 1000000 <= 12238092 <= 50000000
>> PASS: Intel: core cycles-2
>> 1000000 <= 7981727 <= 50000000
>> PASS: Intel: core cycles-3
>> 1000000 <= 6984711 <= 50000000
>> PASS: Intel: core cycles-4
>> 1000000 <= 6773673 <= 50000000
>> PASS: Intel: core cycles-5
>> 1000000 <= 6697842 <= 50000000
>> PASS: Intel: core cycles-6
>> 1000000 <= 6747947 <= 50000000
>> PASS: Intel: core cycles-7
>>
>> The count of the "core cycles" event on the first counter exceeds
>> the upper boundary and leads to a failure, and then the "core
>> cycles" count drops gradually and reaches a stable state.
>>
>> This pattern is explainable. The "core cycles" event is defined as
>> the 1st event in the xxx_gp_events[] array, so it is always verified
>> first. When the program loop() is executed for the first time, it
>> needs to warm up the pipeline and caches, e.g. it has to wait for the
>> caches to be filled. All this warm-up work leads to a quite large
>> core cycles count which may exceed the verification range.
>>
>> The "instructions" event, unlike "core cycles", is a good choice as
>> the warm-up event since it always returns a fixed count. Thus, swap
>> the positions of the instructions and core cycles events in the
>> xxx_gp_events[] array.
> The observation is great. However, it is hard to agree with fixing
> the problem by switching the order. Maybe directly raising the upper
> bound from 50*N to a larger value makes more sense.
>
> Thanks.
> -Mingwei

Yeah, a larger upper boundary can fix the failure as well, but the
question is how large would be large enough. For different CPU models,
the cycles needed for warming up could differ, so we may have to set a
quite large upper boundary, and a large boundary would decrease the
credibility of this validation. Not sure which one is better. Any input
from others?


Mingwei Zhang March 27, 2024, 5:06 p.m. UTC | #3
On Wed, Mar 27, 2024, Mi, Dapeng wrote:
> 
> On 3/27/2024 1:36 PM, Mingwei Zhang wrote:
> > On Wed, Jan 03, 2024, Dapeng Mi wrote:
> > > When running pmu test on SPR, sometimes the following failure is
> > > reported.
> > > 
> > > PMU version:         2
> > > GP counters:         8
> > > GP counter width:    48
> > > Mask length:         8
> > > Fixed counters:      3
> > > Fixed counter width: 48
> > > 1000000 <= 55109398 <= 50000000
> > > FAIL: Intel: core cycles-0
> > > 1000000 <= 18279571 <= 50000000
> > > PASS: Intel: core cycles-1
> > > 1000000 <= 12238092 <= 50000000
> > > PASS: Intel: core cycles-2
> > > 1000000 <= 7981727 <= 50000000
> > > PASS: Intel: core cycles-3
> > > 1000000 <= 6984711 <= 50000000
> > > PASS: Intel: core cycles-4
> > > 1000000 <= 6773673 <= 50000000
> > > PASS: Intel: core cycles-5
> > > 1000000 <= 6697842 <= 50000000
> > > PASS: Intel: core cycles-6
> > > 1000000 <= 6747947 <= 50000000
> > > PASS: Intel: core cycles-7
> > > 
> > > The count of the "core cycles" event on the first counter exceeds
> > > the upper boundary and leads to a failure, and then the "core
> > > cycles" count drops gradually and reaches a stable state.
> > > 
> > > This pattern is explainable. The "core cycles" event is defined as
> > > the 1st event in the xxx_gp_events[] array, so it is always
> > > verified first. When the program loop() is executed for the first
> > > time, it needs to warm up the pipeline and caches, e.g. it has to
> > > wait for the caches to be filled. All this warm-up work leads to a
> > > quite large core cycles count which may exceed the verification
> > > range.
> > > 
> > > The "instructions" event, unlike "core cycles", is a good choice
> > > as the warm-up event since it always returns a fixed count. Thus,
> > > swap the positions of the instructions and core cycles events in
> > > the xxx_gp_events[] array.
> > The observation is great. However, it is hard to agree with fixing
> > the problem by switching the order. Maybe directly raising the upper
> > bound from 50*N to a larger value makes more sense.
> > 
> > Thanks.
> > -Mingwei
> 
> Yeah, a larger upper boundary can fix the failure as well, but the
> question is how large would be large enough. For different CPU models,
> the cycles needed for warming up could differ, so we may have to set a
> quite large upper boundary, and a large boundary would decrease the
> credibility of this validation. Not sure which one is better. Any
> input from others?
> 

Just checked with an expert from our side: "core cycles" (0x003c) is
affected by the current CPU state/frequency, i.e., its count can vary
widely. In that sense, warming up seems reasonable. However, switching
the order would be a terrible idea for maintenance, since people will
forget about it and the problem will come back.

From another perspective, warming up seems to be just a best effort.
Nobody knows how warm is warm enough. Besides, some systems might turn
off some C-states and may cap the max turbo frequency. All of these
will directly affect the warm-up process and the count of 0x003c.

So, while adding a warm-up blob is reasonable, tweaking the heuristics
seems equally fine to me. Regarding the value, I think I will rely on
your experiments and observations.

Thanks.
-Mingwei
Mi, Dapeng March 28, 2024, 10:09 a.m. UTC | #4
On 3/28/2024 1:06 AM, Mingwei Zhang wrote:
> On Wed, Mar 27, 2024, Mi, Dapeng wrote:
>> On 3/27/2024 1:36 PM, Mingwei Zhang wrote:
>>> On Wed, Jan 03, 2024, Dapeng Mi wrote:
>>>> When running pmu test on SPR, sometimes the following failure is
>>>> reported.
>>>>
>>>> PMU version:         2
>>>> GP counters:         8
>>>> GP counter width:    48
>>>> Mask length:         8
>>>> Fixed counters:      3
>>>> Fixed counter width: 48
>>>> 1000000 <= 55109398 <= 50000000
>>>> FAIL: Intel: core cycles-0
>>>> 1000000 <= 18279571 <= 50000000
>>>> PASS: Intel: core cycles-1
>>>> 1000000 <= 12238092 <= 50000000
>>>> PASS: Intel: core cycles-2
>>>> 1000000 <= 7981727 <= 50000000
>>>> PASS: Intel: core cycles-3
>>>> 1000000 <= 6984711 <= 50000000
>>>> PASS: Intel: core cycles-4
>>>> 1000000 <= 6773673 <= 50000000
>>>> PASS: Intel: core cycles-5
>>>> 1000000 <= 6697842 <= 50000000
>>>> PASS: Intel: core cycles-6
>>>> 1000000 <= 6747947 <= 50000000
>>>> PASS: Intel: core cycles-7
>>>>
>>>> The count of the "core cycles" event on the first counter exceeds
>>>> the upper boundary and leads to a failure, and then the "core
>>>> cycles" count drops gradually and reaches a stable state.
>>>>
>>>> This pattern is explainable. The "core cycles" event is defined as
>>>> the 1st event in the xxx_gp_events[] array, so it is always
>>>> verified first. When the program loop() is executed for the first
>>>> time, it needs to warm up the pipeline and caches, e.g. it has to
>>>> wait for the caches to be filled. All this warm-up work leads to a
>>>> quite large core cycles count which may exceed the verification
>>>> range.
>>>>
>>>> The "instructions" event, unlike "core cycles", is a good choice
>>>> as the warm-up event since it always returns a fixed count. Thus,
>>>> swap the positions of the instructions and core cycles events in
>>>> the xxx_gp_events[] array.
>>> The observation is great. However, it is hard to agree with fixing
>>> the problem by switching the order. Maybe directly raising the upper
>>> bound from 50*N to a larger value makes more sense.
>>>
>>> Thanks.
>>> -Mingwei
>> Yeah, a larger upper boundary can fix the failure as well, but the
>> question is how large would be large enough. For different CPU
>> models, the cycles needed for warming up could differ, so we may have
>> to set a quite large upper boundary, and a large boundary would
>> decrease the credibility of this validation. Not sure which one is
>> better. Any input from others?
>>
> Just checked with an expert from our side: "core cycles" (0x003c) is
> affected by the current CPU state/frequency, i.e., its count can vary
> widely. In that sense, warming up seems reasonable. However, switching
> the order would be a terrible idea for maintenance, since people will
> forget about it and the problem will come back.
>
> From another perspective, warming up seems to be just a best effort.
> Nobody knows how warm is warm enough. Besides, some systems might turn
> off some C-states and may cap the max turbo frequency. All of these
> will directly affect the warm-up process and the count of 0x003c.
>
> So, while adding a warm-up blob is reasonable, tweaking the heuristics
> seems equally fine to me. Regarding the value, I think I will rely on
> your experiments and observations.

Per my understanding, most of the extra CPU cycles should come from
warming up the caches. If we don't want to change the validation order,
it should be doable to add an extra warm-up phase before starting the
validation, so we don't need to enlarge the upper boundary; enlarging
the boundary is not a preferred way since it would decrease the
credibility of the validation.

Let me try to add a warm-up phase first and check whether it works as
expected.
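
A minimal sketch of that idea, reusing the existing loop() helper from
x86/pmu.c; the warm_up() name and the call site are illustrative
assumptions, not part of this patch:

	/*
	 * Hypothetical warm-up phase: run the measured workload once
	 * before any event is validated, so the first "core cycles"
	 * measurement is not inflated by cold pipelines and caches.
	 */
	static void warm_up(void)
	{
		loop();
	}

	/* e.g. called once from main() before the counter checks run */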



Patch

diff --git a/x86/pmu.c b/x86/pmu.c
index a42fff8d8b36..67ebfbe55b49 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -31,16 +31,16 @@  struct pmu_event {
 	int min;
 	int max;
 } intel_gp_events[] = {
-	{"core cycles", 0x003c, 1*N, 50*N},
 	{"instructions", 0x00c0, 10*N, 10.2*N},
+	{"core cycles", 0x003c, 1*N, 50*N},
 	{"ref cycles", 0x013c, 1*N, 30*N},
 	{"llc references", 0x4f2e, 1, 2*N},
 	{"llc misses", 0x412e, 1, 1*N},
 	{"branches", 0x00c4, 1*N, 1.1*N},
 	{"branch misses", 0x00c5, 0, 0.1*N},
 }, amd_gp_events[] = {
-	{"core cycles", 0x0076, 1*N, 50*N},
 	{"instructions", 0x00c0, 10*N, 10.2*N},
+	{"core cycles", 0x0076, 1*N, 50*N},
 	{"branches", 0x00c2, 1*N, 1.1*N},
 	{"branch misses", 0x00c3, 0, 0.1*N},
 }, fixed_events[] = {
@@ -307,7 +307,7 @@  static void check_counter_overflow(void)
 	int i;
 	pmu_counter_t cnt = {
 		.ctr = MSR_GP_COUNTERx(0),
-		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel /* instructions */,
+		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[0].unit_sel /* instructions */,
 	};
 	overflow_preset = measure_for_overflow(&cnt);
 
@@ -365,11 +365,11 @@  static void check_gp_counter_cmask(void)
 {
 	pmu_counter_t cnt = {
 		.ctr = MSR_GP_COUNTERx(0),
-		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel /* instructions */,
+		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[0].unit_sel /* instructions */,
 	};
 	cnt.config |= (0x2 << EVNTSEL_CMASK_SHIFT);
 	measure_one(&cnt);
-	report(cnt.count < gp_events[1].min, "cmask");
+	report(cnt.count < gp_events[0].min, "cmask");
 }
 
 static void do_rdpmc_fast(void *ptr)
@@ -446,7 +446,7 @@  static void check_running_counter_wrmsr(void)
 	uint64_t count;
 	pmu_counter_t evt = {
 		.ctr = MSR_GP_COUNTERx(0),
-		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel,
+		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[0].unit_sel,
 	};
 
 	report_prefix_push("running counter wrmsr");
@@ -455,7 +455,7 @@  static void check_running_counter_wrmsr(void)
 	loop();
 	wrmsr(MSR_GP_COUNTERx(0), 0);
 	stop_event(&evt);
-	report(evt.count < gp_events[1].min, "cntr");
+	report(evt.count < gp_events[0].min, "cntr");
 
 	/* clear status before overflow test */
 	if (this_cpu_has_perf_global_status())
@@ -493,7 +493,7 @@  static void check_emulated_instr(void)
 	pmu_counter_t instr_cnt = {
 		.ctr = MSR_GP_COUNTERx(1),
 		/* instructions */
-		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel,
+		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[0].unit_sel,
 	};
 	report_prefix_push("emulated instruction");