
[1/1] KVM: selftests: pmu_counters_test: increase robustness of LLC cache misses

Message ID 20240621204305.1730677-2-mlevitsk@redhat.com (mailing list archive)
State New, archived
Series KVM: selftests: pmu_counters_test: increase robustness of LLC cache misses

Commit Message

Maxim Levitsky June 21, 2024, 8:43 p.m. UTC
Currently this test does a single CLFLUSH on its memory location,
but due to speculative execution this might not cause LLC misses.

Instead, do a cache flush on each loop iteration to defeat that speculation
and make sure that cache misses always occur.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 .../selftests/kvm/x86_64/pmu_counters_test.c  | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

Comments

Maxim Levitsky June 26, 2024, 4:08 p.m. UTC | #1
On Fri, 2024-06-21 at 16:43 -0400, Maxim Levitsky wrote:
> Currently this test does a single CLFLUSH on its memory location,
> but due to speculative execution this might not cause LLC misses.
> 
> Instead, do a cache flush on each loop iteration to defeat that speculation
> and make sure that cache misses always occur.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  .../selftests/kvm/x86_64/pmu_counters_test.c  | 20 +++++++++----------
>  1 file changed, 9 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> index 96446134c00b7..ddc0b7e4a888e 100644
> --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> @@ -14,8 +14,8 @@
>   * instructions that are needed to set up the loop and then disabled the
>   * counter.  1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
>   */
> -#define NUM_EXTRA_INSNS		7
> -#define NUM_INSNS_RETIRED	(NUM_BRANCHES + NUM_EXTRA_INSNS)
> +#define NUM_EXTRA_INSNS		5
> +#define NUM_INSNS_RETIRED	(NUM_BRANCHES * 2 + NUM_EXTRA_INSNS)
>  
>  static uint8_t kvm_pmu_version;
>  static bool kvm_has_perf_caps;
> @@ -133,9 +133,8 @@ static void guest_assert_event_count(uint8_t idx,
>   * doesn't need to be clobbered as the input value, @pmc_msr, is restored
>   * before the end of the sequence.
>   *
> - * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
> - * start of the loop to force LLC references and misses, i.e. to allow testing
> - * that those events actually count.
> + * If CLFUSH{,OPT} is supported, flush the cacheline containing the CLFUSH{,OPT}
> + * instruction on each loop iteration to ensure that LLC cache misses happen.
>   *
>   * If forced emulation is enabled (and specified), force emulation on a subset
>   * of the measured code to verify that KVM correctly emulates instructions and
> @@ -145,10 +144,9 @@ static void guest_assert_event_count(uint8_t idx,
>  #define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
>  do {										\
>  	__asm__ __volatile__("wrmsr\n\t"					\
> -			     clflush "\n\t"					\
> -			     "mfence\n\t"					\
> -			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> -			     FEP "loop .\n\t"					\
> +			     " mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> +			     "1: " clflush "\n\t"				\
> +			     FEP "loop 1b\n\t"					\
>  			     FEP "mov %%edi, %%ecx\n\t"				\
>  			     FEP "xor %%eax, %%eax\n\t"				\
>  			     FEP "xor %%edx, %%edx\n\t"				\
> @@ -163,9 +161,9 @@ do {										\
>  	wrmsr(pmc_msr, 0);							\
>  										\
>  	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
> -		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
> +		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP);	\
>  	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
> -		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
> +		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP);	\
>  	else									\
>  		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
>  										\

Any update? With this patch applied, the test survived about 3 days of running
in a loop.

Best regards,
	Maxim Levitsky

Sean Christopherson June 27, 2024, 5:42 p.m. UTC | #2
On Fri, Jun 21, 2024, Maxim Levitsky wrote:
> Currently this test does a single CLFLUSH on its memory location,
> but due to speculative execution this might not cause LLC misses.
> 
> Instead, do a cache flush on each loop iteration to defeat that speculation
> and make sure that cache misses always occur.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  .../selftests/kvm/x86_64/pmu_counters_test.c  | 20 +++++++++----------
>  1 file changed, 9 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> index 96446134c00b7..ddc0b7e4a888e 100644
> --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> @@ -14,8 +14,8 @@
>   * instructions that are needed to set up the loop and then disabled the
>   * counter.  1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
>   */
> -#define NUM_EXTRA_INSNS		7
> -#define NUM_INSNS_RETIRED	(NUM_BRANCHES + NUM_EXTRA_INSNS)
> +#define NUM_EXTRA_INSNS		5
> +#define NUM_INSNS_RETIRED	(NUM_BRANCHES * 2 + NUM_EXTRA_INSNS)

The comment above is stale.  I also think it's worth adding a macro to capture
that the '2' comes from having two instructions in the loop body (three, if we
keep the MFENCE).
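
For illustration, the suggested prep change might look something like the sketch
below. NUM_INSNS_PER_LOOP is the name proposed further down in this reply; the
count of 3 assumes the CLFLUSH/CLFLUSHOPT/NOP, the MFENCE, and the LOOP all
execute on every iteration, which may differ from what is actually applied:

/*
 * Sketch only, not the applied v2: instructions retired on every loop
 * iteration, i.e. 1 CLFLUSH/CLFLUSHOPT/NOP + 1 MFENCE + 1 LOOP.
 */
#define NUM_INSNS_PER_LOOP	3
#define NUM_INSNS_RETIRED	(NUM_BRANCHES * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS)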

>  static uint8_t kvm_pmu_version;
>  static bool kvm_has_perf_caps;
> @@ -133,9 +133,8 @@ static void guest_assert_event_count(uint8_t idx,
>   * doesn't need to be clobbered as the input value, @pmc_msr, is restored
>   * before the end of the sequence.
>   *
> - * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
> - * start of the loop to force LLC references and misses, i.e. to allow testing
> - * that those events actually count.
> + * If CLFUSH{,OPT} is supported, flush the cacheline containing the CLFUSH{,OPT}
> + * instruction on each loop iteration to ensure that LLC cache misses happen.
>   *
>   * If forced emulation is enabled (and specified), force emulation on a subset
>   * of the measured code to verify that KVM correctly emulates instructions and
> @@ -145,10 +144,9 @@ static void guest_assert_event_count(uint8_t idx,
>  #define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
>  do {										\
>  	__asm__ __volatile__("wrmsr\n\t"					\
> -			     clflush "\n\t"					\
> -			     "mfence\n\t"					\

Based on your testing, it's probably ok to drop the mfence, but I don't see any
reason to do so.  It's not like that mfence meaningfully affects the runtime, and
anything easy/free we can do to avoid flaky tests is worth doing.

I'll post and apply a v2, with a prep patch to add a NUM_INSNS_PER_LOOP macro and
keep the MFENCE (I'll be offline all of next week, and don't want to push anything
to -next tomorrow, even though the risk of breaking anything is minimal).

> -			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> -			     FEP "loop .\n\t"					\
> +			     " mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> +			     "1: " clflush "\n\t"				\
> +			     FEP "loop 1b\n\t"					\
>  			     FEP "mov %%edi, %%ecx\n\t"				\
>  			     FEP "xor %%eax, %%eax\n\t"				\
>  			     FEP "xor %%edx, %%edx\n\t"				\
> @@ -163,9 +161,9 @@ do {										\
>  	wrmsr(pmc_msr, 0);							\
>  										\
>  	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
> -		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
> +		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP);	\
>  	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
> -		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
> +		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP);	\
>  	else									\
>  		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
>  										\
> -- 
> 2.26.3
>

Maxim Levitsky July 5, 2024, 2:48 a.m. UTC | #3
On Thu, 2024-06-27 at 10:42 -0700, Sean Christopherson wrote:
> On Fri, Jun 21, 2024, Maxim Levitsky wrote:
> > Currently this test does a single CLFLUSH on its memory location,
> > but due to speculative execution this might not cause LLC misses.
> > 
> > Instead, do a cache flush on each loop iteration to defeat that speculation
> > and make sure that cache misses always occur.
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  .../selftests/kvm/x86_64/pmu_counters_test.c  | 20 +++++++++----------
> >  1 file changed, 9 insertions(+), 11 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> > index 96446134c00b7..ddc0b7e4a888e 100644
> > --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> > +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
> > @@ -14,8 +14,8 @@
> >   * instructions that are needed to set up the loop and then disabled the
> >   * counter.  1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
> >   */
> > -#define NUM_EXTRA_INSNS		7
> > -#define NUM_INSNS_RETIRED	(NUM_BRANCHES + NUM_EXTRA_INSNS)
> > +#define NUM_EXTRA_INSNS		5
> > +#define NUM_INSNS_RETIRED	(NUM_BRANCHES * 2 + NUM_EXTRA_INSNS)
> 
> The comment above is stale.  I also think it's worth adding a macro to capture
> that the '2' comes from having two instructions in the loop body (three, if we
> keep the MFENCE).

True, my mistake.

> 
> >  static uint8_t kvm_pmu_version;
> >  static bool kvm_has_perf_caps;
> > @@ -133,9 +133,8 @@ static void guest_assert_event_count(uint8_t idx,
> >   * doesn't need to be clobbered as the input value, @pmc_msr, is restored
> >   * before the end of the sequence.
> >   *
> > - * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
> > - * start of the loop to force LLC references and misses, i.e. to allow testing
> > - * that those events actually count.
> > + * If CLFUSH{,OPT} is supported, flush the cacheline containing the CLFUSH{,OPT}
> > + * instruction on each loop iteration to ensure that LLC cache misses happen.
> >   *
> >   * If forced emulation is enabled (and specified), force emulation on a subset
> >   * of the measured code to verify that KVM correctly emulates instructions and
> > @@ -145,10 +144,9 @@ static void guest_assert_event_count(uint8_t idx,
> >  #define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
> >  do {										\
> >  	__asm__ __volatile__("wrmsr\n\t"					\
> > -			     clflush "\n\t"					\
> > -			     "mfence\n\t"					\
> 
> Based on your testing, it's probably ok to drop the mfence, but I don't see any
> reason to do so.  It's not like that mfence meaningfully affects the runtime, and
> anything easy/free we can do to avoid flaky tests is worth doing.

Hi,

I just didn't want to add another instruction to the loop, since in theory
that would slow the test down.


From the Intel SDM:

"Executions of the CLFLUSH instruction are ordered with respect to each other and with respect to writes, locked
read-modify-write instructions, and fence instructions. 1 They are not ordered with respect to executions of
CLFLUSHOPT and CLWB. Software can use the SFENCE instruction to order an execution of CLFLUSH relative to one
of those operations."

Plus there is a note that:

"Earlier versions of this manual specified that executions of the CLFLUSH instruction were ordered only by the MFENCE instruction.
All processors implementing the CLFLUSH instruction also order it relative to the other operations enumerated above."

Here we have an instruction fetch and a cache flush, and it is not clear whether
MFENCE orders those two operations.  Thus it is not clear whether MFENCE helps or not.
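
As an aside, the quoted text says an explicit fence is what provides ordering
for CLFLUSHOPT; a minimal sketch of that idiom is below (flush_line_ordered is
a hypothetical helper, not part of the test):

/* Hypothetical helper, sketching the quoted guidance: CLFLUSHOPT is
 * weakly ordered, so use SFENCE to order the flush against later
 * stores and flushes. */
static inline void flush_line_ordered(volatile void *p)
{
	__asm__ __volatile__("clflushopt %0" : "+m" (*(volatile char *)p));
	__asm__ __volatile__("sfence" ::: "memory");
}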

I honestly would have preferred a cache flush on data memory, followed by a read
from it, except that this also sometimes doesn't work (maybe I made a mistake,
maybe it is possible to make it work; I don't know).

But overall I don't object to keeping it.
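
A rough sketch of that preferred alternative: flush a data cache line and
immediately read it back on every iteration, so each load should miss in the
LLC. Here buf, flush_and_read_loop, and the DEC/JNZ loop form are illustrative
assumptions; the real test uses LOOP plus optional FEP prefixes:

/* Sketch only: 'buf' is an assumed cacheline-aligned scratch buffer. */
static uint8_t buf[64] __attribute__((aligned(64)));

static void flush_and_read_loop(void)
{
	unsigned long count = NUM_BRANCHES;
	uint8_t val;

	__asm__ __volatile__("1:\n\t"
			     "clflush %[line]\n\t"          /* evict the data line */
			     "mfence\n\t"                   /* order flush vs. the reload */
			     "movb %[line], %b[val]\n\t"    /* reload: should miss in LLC */
			     "dec %[count]\n\t"
			     "jnz 1b\n\t"
			     : [count] "+r" (count), [val] "=q" (val),
			       [line] "+m" (buf[0]));
}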


> 
> I'll post and apply a v2, with a prep patch to add a NUM_INSNS_PER_LOOP macro and
> keep the MFENCE (I'll be offline all of next week, and don't want to push anything
> to -next tomorrow, even though the risk of breaking anything is minimal).

Sounds good.

Best regards,
	Maxim Levitsky


> 
> > -			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> > -			     FEP "loop .\n\t"					\
> > +			     " mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
> > +			     "1: " clflush "\n\t"				\
> > +			     FEP "loop 1b\n\t"					\
> >  			     FEP "mov %%edi, %%ecx\n\t"				\
> >  			     FEP "xor %%eax, %%eax\n\t"				\
> >  			     FEP "xor %%edx, %%edx\n\t"				\
> > @@ -163,9 +161,9 @@ do {										\
> >  	wrmsr(pmc_msr, 0);							\
> >  										\
> >  	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
> > -		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
> > +		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP);	\
> >  	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
> > -		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
> > +		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP);	\
> >  	else									\
> >  		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
> >  										\
> > -- 
> > 2.26.3
> >

Patch

diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
index 96446134c00b7..ddc0b7e4a888e 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
@@ -14,8 +14,8 @@
  * instructions that are needed to set up the loop and then disabled the
  * counter.  1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
  */
-#define NUM_EXTRA_INSNS		7
-#define NUM_INSNS_RETIRED	(NUM_BRANCHES + NUM_EXTRA_INSNS)
+#define NUM_EXTRA_INSNS		5
+#define NUM_INSNS_RETIRED	(NUM_BRANCHES * 2 + NUM_EXTRA_INSNS)
 
 static uint8_t kvm_pmu_version;
 static bool kvm_has_perf_caps;
@@ -133,9 +133,8 @@ static void guest_assert_event_count(uint8_t idx,
  * doesn't need to be clobbered as the input value, @pmc_msr, is restored
  * before the end of the sequence.
  *
- * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
- * start of the loop to force LLC references and misses, i.e. to allow testing
- * that those events actually count.
+ * If CLFUSH{,OPT} is supported, flush the cacheline containing the CLFUSH{,OPT}
+ * instruction on each loop iteration to ensure that LLC cache misses happen.
  *
  * If forced emulation is enabled (and specified), force emulation on a subset
  * of the measured code to verify that KVM correctly emulates instructions and
@@ -145,10 +144,9 @@ static void guest_assert_event_count(uint8_t idx,
 #define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
 do {										\
 	__asm__ __volatile__("wrmsr\n\t"					\
-			     clflush "\n\t"					\
-			     "mfence\n\t"					\
-			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
-			     FEP "loop .\n\t"					\
+			     " mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
+			     "1: " clflush "\n\t"				\
+			     FEP "loop 1b\n\t"					\
 			     FEP "mov %%edi, %%ecx\n\t"				\
 			     FEP "xor %%eax, %%eax\n\t"				\
 			     FEP "xor %%edx, %%edx\n\t"				\
@@ -163,9 +161,9 @@ do {										\
 	wrmsr(pmc_msr, 0);							\
 										\
 	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
-		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP);	\
 	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
-		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP);	\
 	else									\
 		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
 										\