
[kvm-unit-tests,v3] x86: Add RDTSC test

Message ID 20191202204356.250357-1-aaronlewis@google.com (mailing list archive)
State New, archived
Series [kvm-unit-tests,v3] x86: Add RDTSC test

Commit Message

Aaron Lewis Dec. 2, 2019, 8:43 p.m. UTC
Verify that the difference between a guest RDTSC instruction and the
IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
MSR-store list is less than 750 cycles, 99.9% of the time.

662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")

Signed-off-by: Aaron Lewis <aaronlewis@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
---
 x86/vmx.h       |  1 +
 x86/vmx_tests.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

Comments

Liran Alon Dec. 3, 2019, 1:51 a.m. UTC | #1
> On 2 Dec 2019, at 22:43, Aaron Lewis <aaronlewis@google.com> wrote:
> 
> Verify that the difference between a guest RDTSC instruction and the
> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
> MSR-store list is less than 750 cycles, 99.9% of the time.
> 
> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")

Reviewed-by: Liran Alon <liran.alon@oracle.com>

> 
> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> Reviewed-by: Jim Mattson <jmattson@google.com>
> ---
> x86/vmx.h       |  1 +
> x86/vmx_tests.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 94 insertions(+)
> 
> diff --git a/x86/vmx.h b/x86/vmx.h
> index 8496be7..21ba953 100644
> --- a/x86/vmx.h
> +++ b/x86/vmx.h
> @@ -420,6 +420,7 @@ enum Ctrl1 {
> 	CPU_SHADOW_VMCS		= 1ul << 14,
> 	CPU_RDSEED		= 1ul << 16,
> 	CPU_PML                 = 1ul << 17,
> +	CPU_USE_TSC_SCALING	= 1ul << 25,
> };
> 
> enum Intr_type {
> diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
> index 1d8932f..6ceaf9a 100644
> --- a/x86/vmx_tests.c
> +++ b/x86/vmx_tests.c
> @@ -8790,7 +8790,99 @@ static void vmx_vmcs_shadow_test(void)
> 	enter_guest();
> }
> 
> +/*
> + * This test monitors the difference between a guest RDTSC instruction
> + * and the IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12
> + * VM-exit MSR-store list when taking a VM-exit on the instruction
> + * following RDTSC.
> + */
> +#define RDTSC_DIFF_ITERS 100000
> +#define RDTSC_DIFF_FAILS 100
> +#define HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD 750
> +
> +/*
> + * Set 'use TSC offsetting' and set the guest offset to the
> + * inverse of the host's current TSC value, so that the guest starts running
> + * with an effective TSC value of 0.
> + */
> +static void reset_guest_tsc_to_zero(void)
> +{
> +	TEST_ASSERT_MSG(ctrl_cpu_rev[0].clr & CPU_USE_TSC_OFFSET,
> +			"Expected support for 'use TSC offsetting'");
> +
> +	vmcs_set_bits(CPU_EXEC_CTRL0, CPU_USE_TSC_OFFSET);
> +	vmcs_write(TSC_OFFSET, -rdtsc());
> +}
> +
> +static void rdtsc_vmexit_diff_test_guest(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < RDTSC_DIFF_ITERS; i++)
> +		/* Ensure rdtsc is the last instruction before the vmcall. */
> +		asm volatile("rdtsc; vmcall" : : : "eax", "edx");
> +}
> 
> +/*
> + * This function only considers the "use TSC offsetting" VM-execution
> + * control.  It does not handle "use TSC scaling" (because the latter
> + * isn't available to the host today.)
> + */
> +static unsigned long long host_time_to_guest_time(unsigned long long t)
> +{
> +	TEST_ASSERT(!(ctrl_cpu_rev[0].clr & CPU_SECONDARY) ||
> +		    !(vmcs_read(CPU_EXEC_CTRL1) & CPU_USE_TSC_SCALING));
> +
> +	if (vmcs_read(CPU_EXEC_CTRL0) & CPU_USE_TSC_OFFSET)
> +		t += vmcs_read(TSC_OFFSET);
> +
> +	return t;
> +}
> +
> +static unsigned long long rdtsc_vmexit_diff_test_iteration(void)
> +{
> +	unsigned long long guest_tsc, host_to_guest_tsc;
> +
> +	enter_guest();
> +	skip_exit_vmcall();
> +	guest_tsc = (u32) regs.rax + (regs.rdx << 32);
> +	host_to_guest_tsc = host_time_to_guest_time(exit_msr_store[0].value);
> +
> +	return host_to_guest_tsc - guest_tsc;
> +}
> +
> +static void rdtsc_vmexit_diff_test(void)
> +{
> +	int fail = 0;
> +	int i;
> +
> +	test_set_guest(rdtsc_vmexit_diff_test_guest);
> +
> +	reset_guest_tsc_to_zero();
> +
> +	/*
> +	 * Set up the VMCS12 VM-exit MSR-store list to store just one
> +	 * MSR: IA32_TIME_STAMP_COUNTER. Note that the value stored is
> +	 * in the host time domain (i.e., it is not adjusted according
> +	 * to the TSC multiplier and TSC offset fields in the VMCS12,
> +	 * as a guest RDTSC would be.)
> +	 */
> +	exit_msr_store = alloc_page();
> +	exit_msr_store[0].index = MSR_IA32_TSC;
> +	vmcs_write(EXI_MSR_ST_CNT, 1);
> +	vmcs_write(EXIT_MSR_ST_ADDR, virt_to_phys(exit_msr_store));
> +
> +	for (i = 0; i < RDTSC_DIFF_ITERS; i++) {
> +		if (rdtsc_vmexit_diff_test_iteration() >=
> +		    HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD)
> +			fail++;
> +	}
> +
> +	enter_guest();
> +
> +	report("RDTSC to VM-exit delta too high in %d of %d iterations",
> +	       fail < RDTSC_DIFF_FAILS, fail, RDTSC_DIFF_ITERS);
> +}
> 
> static int invalid_msr_init(struct vmcs *vmcs)
> {
> @@ -9056,5 +9148,6 @@ struct vmx_test vmx_tests[] = {
> 	/* Atomic MSR switch tests. */
> 	TEST(atomic_switch_max_msrs_test),
> 	TEST(atomic_switch_overflow_msrs_test),
> +	TEST(rdtsc_vmexit_diff_test),
> 	{ NULL, NULL, NULL, NULL, NULL, {0} },
> };
> -- 
> 2.24.0.393.g34dc348eaf-goog
>
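For reference, RDTSC returns the low 32 bits of the time-stamp counter in EAX and the high 32 bits in EDX, which is what the patch's guest_tsc = (u32) regs.rax + (regs.rdx << 32) reconstruction relies on. A minimal sketch of a helper composed the same way (kvm-unit-tests already provides its own rdtsc() in its support library; the name and body below are only illustrative):

static inline unsigned long long rdtsc64(void)
{
	unsigned int lo, hi;

	/* RDTSC places TSC[31:0] in EAX and TSC[63:32] in EDX. */
	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return lo | ((unsigned long long)hi << 32);
}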
Paolo Bonzini Dec. 4, 2019, 11:48 a.m. UTC | #2
On 02/12/19 21:43, Aaron Lewis wrote:
> Verify that the difference between a guest RDTSC instruction and the
> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
> MSR-store list is less than 750 cycles, 99.9% of the time.
> 
> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
> 
> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> Reviewed-by: Jim Mattson <jmattson@google.com>
> ---
>  x86/vmx.h       |  1 +
>  x86/vmx_tests.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 94 insertions(+)
> 
> diff --git a/x86/vmx.h b/x86/vmx.h
> index 8496be7..21ba953 100644
> --- a/x86/vmx.h
> +++ b/x86/vmx.h
> @@ -420,6 +420,7 @@ enum Ctrl1 {
>  	CPU_SHADOW_VMCS		= 1ul << 14,
>  	CPU_RDSEED		= 1ul << 16,
>  	CPU_PML                 = 1ul << 17,
> +	CPU_USE_TSC_SCALING	= 1ul << 25,
>  };
>  
>  enum Intr_type {
> diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
> index 1d8932f..6ceaf9a 100644
> --- a/x86/vmx_tests.c
> +++ b/x86/vmx_tests.c
> @@ -8790,7 +8790,99 @@ static void vmx_vmcs_shadow_test(void)
>  	enter_guest();
>  }
>  
> +/*
> + * This test monitors the difference between a guest RDTSC instruction
> + * and the IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12
> + * VM-exit MSR-store list when taking a VM-exit on the instruction
> + * following RDTSC.
> + */
> +#define RDTSC_DIFF_ITERS 100000
> +#define RDTSC_DIFF_FAILS 100
> +#define HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD 750
> +
> +/*
> + * Set 'use TSC offsetting' and set the guest offset to the
> + * inverse of the host's current TSC value, so that the guest starts running
> + * with an effective TSC value of 0.
> + */
> +static void reset_guest_tsc_to_zero(void)
> +{
> +	TEST_ASSERT_MSG(ctrl_cpu_rev[0].clr & CPU_USE_TSC_OFFSET,
> +			"Expected support for 'use TSC offsetting'");
> +
> +	vmcs_set_bits(CPU_EXEC_CTRL0, CPU_USE_TSC_OFFSET);
> +	vmcs_write(TSC_OFFSET, -rdtsc());
> +}
> +
> +static void rdtsc_vmexit_diff_test_guest(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < RDTSC_DIFF_ITERS; i++)
> +		/* Ensure rdtsc is the last instruction before the vmcall. */
> +		asm volatile("rdtsc; vmcall" : : : "eax", "edx");
> +}
>  
> +/*
> + * This function only considers the "use TSC offsetting" VM-execution
> + * control.  It does not handle "use TSC scaling" (because the latter
> + * isn't available to the host today.)
> + */
> +static unsigned long long host_time_to_guest_time(unsigned long long t)
> +{
> +	TEST_ASSERT(!(ctrl_cpu_rev[0].clr & CPU_SECONDARY) ||
> +		    !(vmcs_read(CPU_EXEC_CTRL1) & CPU_USE_TSC_SCALING));
> +
> +	if (vmcs_read(CPU_EXEC_CTRL0) & CPU_USE_TSC_OFFSET)
> +		t += vmcs_read(TSC_OFFSET);
> +
> +	return t;
> +}
> +
> +static unsigned long long rdtsc_vmexit_diff_test_iteration(void)
> +{
> +	unsigned long long guest_tsc, host_to_guest_tsc;
> +
> +	enter_guest();
> +	skip_exit_vmcall();
> +	guest_tsc = (u32) regs.rax + (regs.rdx << 32);
> +	host_to_guest_tsc = host_time_to_guest_time(exit_msr_store[0].value);
> +
> +	return host_to_guest_tsc - guest_tsc;
> +}
> +
> +static void rdtsc_vmexit_diff_test(void)
> +{
> +	int fail = 0;
> +	int i;
> +
> +	test_set_guest(rdtsc_vmexit_diff_test_guest);
> +
> +	reset_guest_tsc_to_zero();
> +
> +	/*
> +	 * Set up the VMCS12 VM-exit MSR-store list to store just one
> +	 * MSR: IA32_TIME_STAMP_COUNTER. Note that the value stored is
> +	 * in the host time domain (i.e., it is not adjusted according
> +	 * to the TSC multiplier and TSC offset fields in the VMCS12,
> +	 * as a guest RDTSC would be.)
> +	 */
> +	exit_msr_store = alloc_page();
> +	exit_msr_store[0].index = MSR_IA32_TSC;
> +	vmcs_write(EXI_MSR_ST_CNT, 1);
> +	vmcs_write(EXIT_MSR_ST_ADDR, virt_to_phys(exit_msr_store));
> +
> +	for (i = 0; i < RDTSC_DIFF_ITERS; i++) {
> +		if (rdtsc_vmexit_diff_test_iteration() >=
> +		    HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD)
> +			fail++;
> +	}
> +
> +	enter_guest();
> +
> +	report("RDTSC to VM-exit delta too high in %d of %d iterations",
> +	       fail < RDTSC_DIFF_FAILS, fail, RDTSC_DIFF_ITERS);
> +}
>  
>  static int invalid_msr_init(struct vmcs *vmcs)
>  {
> @@ -9056,5 +9148,6 @@ struct vmx_test vmx_tests[] = {
>  	/* Atomic MSR switch tests. */
>  	TEST(atomic_switch_max_msrs_test),
>  	TEST(atomic_switch_overflow_msrs_test),
> +	TEST(rdtsc_vmexit_diff_test),
>  	{ NULL, NULL, NULL, NULL, NULL, {0} },
>  };
> 

Applied, thanks.

Paolo
Nadav Amit Jan. 24, 2020, 11:13 p.m. UTC | #3
> On Dec 2, 2019, at 12:43 PM, Aaron Lewis <aaronlewis@google.com> wrote:
> 
> Verify that the difference between a guest RDTSC instruction and the
> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
> MSR-store list is less than 750 cycles, 99.9% of the time.
> 
> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
> 
> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> Reviewed-by: Jim Mattson <jmattson@google.com>

Running this test on bare-metal I get:

  Test suite: rdtsc_vmexit_diff_test
  FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations

Any idea why? Should I just play with the 750 cycles magic number?
Sean Christopherson Jan. 24, 2020, 11:38 p.m. UTC | #4
On Fri, Jan 24, 2020 at 03:13:44PM -0800, Nadav Amit wrote:
> > On Dec 2, 2019, at 12:43 PM, Aaron Lewis <aaronlewis@google.com> wrote:
> > 
> > Verify that the difference between a guest RDTSC instruction and the
> > IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
> > MSR-store list is less than 750 cycles, 99.9% of the time.
> > 
> > 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
> > 
> > Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> > Reviewed-by: Jim Mattson <jmattson@google.com>
> 
> Running this test on bare-metal I get:
> 
>   Test suite: rdtsc_vmexit_diff_test
>   FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations
> 
> Any idea why? Should I just play with the 750 cycles magic number?

Argh, this reminds me that I have a patch for this test to improve the
error message to make things easier to debug.  Give me a few minutes to
get it sent out, might help a bit.
Nadav Amit Jan. 25, 2020, 12:06 a.m. UTC | #5
> On Jan 24, 2020, at 3:38 PM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> 
> On Fri, Jan 24, 2020 at 03:13:44PM -0800, Nadav Amit wrote:
>>> On Dec 2, 2019, at 12:43 PM, Aaron Lewis <aaronlewis@google.com> wrote:
>>> 
>>> Verify that the difference between a guest RDTSC instruction and the
>>> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
>>> MSR-store list is less than 750 cycles, 99.9% of the time.
>>> 
>>> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
>>> 
>>> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
>>> Reviewed-by: Jim Mattson <jmattson@google.com>
>> 
>> Running this test on bare-metal I get:
>> 
>>  Test suite: rdtsc_vmexit_diff_test
>>  FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations
>> 
>> Any idea why? Should I just play with the 750 cycles magic number?
> 
> Argh, this reminds me that I have a patch for this test to improve the
> error message to make things easier to debug.  Give me a few minutes to
> get it sent out, might help a bit.

Thanks for the quick response. With this patch I get on my bare-metal Skylake:

FAIL: RDTSC to VM-exit delta too high in 100 of 49757 iterations, last = 1152
FAIL: Guest didn't run to completion.

I’ll try to raise the delta and see what happens.

Sorry for my laziness - it is just that like ~30% of the tests that are
added fail on bare-metal :(
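Judging from that output ("last = 1152" and the run stopping after 49757 iterations), the improved version presumably records the most recent delta and bails out of the measurement loop once RDTSC_DIFF_FAILS is reached. A hypothetical sketch of such a loop, reusing the identifiers from the patch (inferred from the output above, not necessarily Sean's actual change):

	unsigned long long delta = 0;

	for (i = 0; i < RDTSC_DIFF_ITERS && fail < RDTSC_DIFF_FAILS; i++) {
		delta = rdtsc_vmexit_diff_test_iteration();
		if (delta >= HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD)
			fail++;
	}

	report("RDTSC to VM-exit delta too high in %d of %d iterations, last = %llu",
	       fail < RDTSC_DIFF_FAILS, fail, i, delta);

Bailing out early would also leave the guest stuck in its RDTSC/VMCALL loop, so the final enter_guest() would return on another vmcall rather than on guest completion, which would explain the accompanying "Guest didn't run to completion" failure.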
Paolo Bonzini Jan. 25, 2020, 9:43 a.m. UTC | #6
On 25/01/20 00:13, Nadav Amit wrote:
> Running this test on bare-metal I get:
> 
>   Test suite: rdtsc_vmexit_diff_test
>   FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations
> 
> Any idea why? Should I just play with the 750 cycles magic number?

Either the 750 cycles number, or the 99.9%.  Do you see it on all hosts?
 Some are more SMI-happy than others.

Paolo
Jim Mattson Jan. 26, 2020, 10:06 p.m. UTC | #7
If I had to guess, you probably have SMM malware on your host. Remove
the malware, and the test should pass.

On Fri, Jan 24, 2020 at 4:06 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Jan 24, 2020, at 3:38 PM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> >
> > On Fri, Jan 24, 2020 at 03:13:44PM -0800, Nadav Amit wrote:
> >>> On Dec 2, 2019, at 12:43 PM, Aaron Lewis <aaronlewis@google.com> wrote:
> >>>
> >>> Verify that the difference between a guest RDTSC instruction and the
> >>> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
> >>> MSR-store list is less than 750 cycles, 99.9% of the time.
> >>>
> >>> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
> >>>
> >>> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> >>> Reviewed-by: Jim Mattson <jmattson@google.com>
> >>
> >> Running this test on bare-metal I get:
> >>
> >>  Test suite: rdtsc_vmexit_diff_test
> >>  FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations
> >>
> >> Any idea why? Should I just play with the 750 cycles magic number?
> >
> > Argh, this reminds me that I have a patch for this test to improve the
> > error message to make things easier to debug.  Give me a few minutes to
> > get it sent out, might help a bit.
>
> Thanks for the quick response. With this patch I get on my bare-metal Skylake:
>
> FAIL: RDTSC to VM-exit delta too high in 100 of 49757 iterations, last = 1152
> FAIL: Guest didn't run to completion.
>
> I’ll try to raise the delta and see what happens.
>
> Sorry for my laziness - it is just that like ~30% of the tests that are
> added fail on bare-metal :(
>
Nadav Amit Jan. 27, 2020, 4:36 a.m. UTC | #8
> On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> 
> If I had to guess, you probably have SMM malware on your host. Remove
> the malware, and the test should pass.

Well, malware will always be an option, but I doubt this is the case.

Interestingly, in the last few times the failure did not reproduce. Yet,
thinking about it made me concerned about MTRRs configuration, and that
perhaps performance is affected by memory marked as UC after boot, since
kvm-unit-test does not reset MTRRs.

Reading the variable range MTRRs, I do see some ranges marked as UC (most of
the range 2GB-4GB, if I read the MTRRs correctly):

  MSR 0x200 = 0x80000000
  MSR 0x201 = 0x3fff80000800
  MSR 0x202 = 0xff000005
  MSR 0x203 = 0x3fffff000800
  MSR 0x204 = 0x38000000000
  MSR 0x205 = 0x3f8000000800

Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
you suggest a reasonable configuration?
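
For what it's worth, a variable-range MTRR pair can be decoded roughly as follows. This is a minimal sketch assuming the standard IA32_MTRR_PHYSBASEn/PHYSMASKn layout and a 46-bit physical address width; it is not kvm-unit-tests code. With the values above, MSR 0x200/0x201 do decode to the 2GB-4GB range as UC:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: decode one variable-range MTRR base/mask pair. */
static void decode_mtrr_pair(uint64_t physbase, uint64_t physmask)
{
	static const char * const types[] = { "UC", "WC", "rsvd", "rsvd", "WT", "WP", "WB" };
	uint64_t addr_mask = ((1ull << 46) - 1) & ~0xfffull;	/* 46-bit MAXPHYADDR assumed */
	uint64_t base = physbase & addr_mask;
	uint64_t mask = physmask & addr_mask;
	uint64_t size = mask & (~mask + 1);	/* lowest set mask bit == size of an aligned range */
	unsigned int type = physbase & 0xff;

	if (!(physmask & (1ull << 11))) {	/* bit 11 of PHYSMASK is the valid bit */
		printf("disabled\n");
		return;
	}
	printf("%#llx-%#llx %s\n", (unsigned long long)base,
	       (unsigned long long)(base + size - 1),
	       type <= 6 ? types[type] : "reserved");
}

int main(void)
{
	decode_mtrr_pair(0x80000000ull, 0x3fff80000800ull);	/* prints 0x80000000-0xffffffff UC */
	return 0;
}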

Thanks,
Nadav


> 
> On Fri, Jan 24, 2020 at 4:06 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>>> On Jan 24, 2020, at 3:38 PM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
>>> 
>>> On Fri, Jan 24, 2020 at 03:13:44PM -0800, Nadav Amit wrote:
>>>>> On Dec 2, 2019, at 12:43 PM, Aaron Lewis <aaronlewis@google.com> wrote:
>>>>> 
>>>>> Verify that the difference between a guest RDTSC instruction and the
>>>>> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
>>>>> MSR-store list is less than 750 cycles, 99.9% of the time.
>>>>> 
>>>>> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
>>>>> 
>>>>> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
>>>>> Reviewed-by: Jim Mattson <jmattson@google.com>
>>>> 
>>>> Running this test on bare-metal I get:
>>>> 
>>>> Test suite: rdtsc_vmexit_diff_test
>>>> FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations
>>>> 
>>>> Any idea why? Should I just play with the 750 cycles magic number?
>>> 
>>> Argh, this reminds me that I have a patch for this test to improve the
>>> error message to make things easier to debug.  Give me a few minutes to
>>> get it sent out, might help a bit.
>> 
>> Thanks for the quick response. With this patch I get on my bare-metal Skylake:
>> 
>> FAIL: RDTSC to VM-exit delta too high in 100 of 49757 iterations, last = 1152
>> FAIL: Guest didn't run to completion.
>> 
>> I’ll try to raise the delta and see what happens.
>> 
>> Sorry for my laziness - it is just that like ~30% of the tests that are
>> added fail on bare-metal :(
Jim Mattson Jan. 27, 2020, 7:24 p.m. UTC | #9
On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> >
> > If I had to guess, you probably have SMM malware on your host. Remove
> > the malware, and the test should pass.
>
> Well, malware will always be an option, but I doubt this is the case.

Was my innuendo too subtle? I consider any code executing in SMM to be malware.

> Interestingly, in the last few times the failure did not reproduce. Yet,
> thinking about it made me concerned about MTRRs configuration, and that
> perhaps performance is affected by memory marked as UC after boot, since
> kvm-unit-test does not reset MTRRs.
>
> Reading the variable range MTRRs, I do see some ranges marked as UC (most of
> the range 2GB-4GB, if I read the MTRRs correctly):
>
>   MSR 0x200 = 0x80000000
>   MSR 0x201 = 0x3fff80000800
>   MSR 0x202 = 0xff000005
>   MSR 0x203 = 0x3fffff000800
>   MSR 0x204 = 0x38000000000
>   MSR 0x205 = 0x3f8000000800
>
> Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
> you suggest a reasonable configuration?

I would expect MTRR issues to result in repeatable failures. For
instance, if your VMCS ended up in UC memory, that might slow things
down quite a bit. But, I would expect the VMCS to end up at the same
address each time the test is run.

> >
> > On Fri, Jan 24, 2020 at 4:06 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> >>> On Jan 24, 2020, at 3:38 PM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> >>>
> >>> On Fri, Jan 24, 2020 at 03:13:44PM -0800, Nadav Amit wrote:
> >>>>> On Dec 2, 2019, at 12:43 PM, Aaron Lewis <aaronlewis@google.com> wrote:
> >>>>>
> >>>>> Verify that the difference between a guest RDTSC instruction and the
> >>>>> IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12's VM-exit
> >>>>> MSR-store list is less than 750 cycles, 99.9% of the time.
> >>>>>
> >>>>> 662f1d1d1931 ("KVM: nVMX: Add support for capturing highest observable L2 TSC")
> >>>>>
> >>>>> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> >>>>> Reviewed-by: Jim Mattson <jmattson@google.com>
> >>>>
> >>>> Running this test on bare-metal I get:
> >>>>
> >>>> Test suite: rdtsc_vmexit_diff_test
> >>>> FAIL: RDTSC to VM-exit delta too high in 117 of 100000 iterations
> >>>>
> >>>> Any idea why? Should I just play with the 750 cycles magic number?
> >>>
> >>> Argh, this reminds me that I have a patch for this test to improve the
> >>> error message to make things easier to debug.  Give me a few minutes to
> >>> get it sent out, might help a bit.
> >>
> >> Thanks for the quick response. With this patch I get on my bare-metal Skylake:
> >>
> >> FAIL: RDTSC to VM-exit delta too high in 100 of 49757 iterations, last = 1152
> >> FAIL: Guest didn't run to completion.
> >>
> >> I’ll try to raise the delta and see what happens.
> >>
> >> Sorry for my laziness - it is just that like ~30% of the tests that are
> >> added fail on bare-metal :(
>
>
Sean Christopherson Jan. 27, 2020, 8:56 p.m. UTC | #10
On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
> On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> >
> > > On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> > >
> > > If I had to guess, you probably have SMM malware on your host. Remove
> > > the malware, and the test should pass.
> >
> > Well, malware will always be an option, but I doubt this is the case.
> 
> Was my innuendo too subtle? I consider any code executing in SMM to be malware.

SMI complications seem unlikely.  The straw that broke the camel's back
was a 1152 cycle delta, presumably the other failing runs had similar deltas.
I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
wouldn't be surprised if just SMI+RSM is over 1500 cycles.

> > Interestingly, in the last few times the failure did not reproduce. Yet,
> > thinking about it made me concerned about MTRRs configuration, and that
> > perhaps performance is affected by memory marked as UC after boot, since
> > kvm-unit-test does not reset MTRRs.
> >
> > Reading the variable range MTRRs, I do see some ranges marked as UC (most of
> > the range 2GB-4GB, if I read the MTRRs correctly):
> >
> >   MSR 0x200 = 0x80000000
> >   MSR 0x201 = 0x3fff80000800
> >   MSR 0x202 = 0xff000005
> >   MSR 0x203 = 0x3fffff000800
> >   MSR 0x204 = 0x38000000000
> >   MSR 0x205 = 0x3f8000000800
> >
> > Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
> > you suggest a reasonable configuration?
> 
> I would expect MTRR issues to result in repeatable failures. For
> instance, if your VMCS ended up in UC memory, that might slow things
> down quite a bit. But, I would expect the VMCS to end up at the same
> address each time the test is run.

Agreed on the repeatable failures part, but putting the VMCS in UC memory
shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
UC.
Jim Mattson Jan. 28, 2020, 5:59 p.m. UTC | #11
On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
> > On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> > >
> > > > On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> > > >
> > > > If I had to guess, you probably have SMM malware on your host. Remove
> > > > the malware, and the test should pass.
> > >
> > > Well, malware will always be an option, but I doubt this is the case.
> >
> > Was my innuendo too subtle? I consider any code executing in SMM to be malware.
>
> SMI complications seem unlikely.  The straw that broke the camel's back
> was a 1152 cycle delta, presumably the other failing runs had similar deltas.
> I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
> to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
> wouldn't be surprised if just SMI+RSM is over 1500 cycles.

Good point. What generation of hardware are you running on, Nadav?

> > > Interestingly, in the last few times the failure did not reproduce. Yet,
> > > thinking about it made me concerned about MTRRs configuration, and that
> > > perhaps performance is affected by memory marked as UC after boot, since
> > > kvm-unit-test does not reset MTRRs.
> > >
> > > Reading the variable range MTRRs, I do see some ranges marked as UC (most of
> > > the range 2GB-4GB, if I read the MTRRs correctly):
> > >
> > >   MSR 0x200 = 0x80000000
> > >   MSR 0x201 = 0x3fff80000800
> > >   MSR 0x202 = 0xff000005
> > >   MSR 0x203 = 0x3fffff000800
> > >   MSR 0x204 = 0x38000000000
> > >   MSR 0x205 = 0x3f8000000800
> > >
> > > Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
> > > you suggest a reasonable configuration?
> >
> > I would expect MTRR issues to result in repeatable failures. For
> > instance, if your VMCS ended up in UC memory, that might slow things
> > down quite a bit. But, I would expect the VMCS to end up at the same
> > address each time the test is run.
>
> Agreed on the repeatable failures part, but putting the VMCS in UC memory
> shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
> coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
> UC.

But the internal VMCS cache only contains selected fields, doesn't it?
Uncached fields would have to be written to memory on VM-exit. Or are
all of the mutable fields in the internal VMCS cache?
Nadav Amit Jan. 28, 2020, 6:32 p.m. UTC | #12
> On Jan 28, 2020, at 9:59 AM, Jim Mattson <jmattson@google.com> wrote:
> 
> On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
>> On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
>>> On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
>>>>> 
>>>>> If I had to guess, you probably have SMM malware on your host. Remove
>>>>> the malware, and the test should pass.
>>>> 
>>>> Well, malware will always be an option, but I doubt this is the case.
>>> 
>>> Was my innuendo too subtle? I consider any code executing in SMM to be malware.
>> 
>> SMI complications seem unlikely.  The straw that broke the camel's back
>> was a 1152 cycle delta, presumably the other failing runs had similar deltas.
>> I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
>> to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
>> wouldn't be surprised if just SMI+RSM is over 1500 cycles.
> 
> Good point. What generation of hardware are you running on, Nadav?
> 
>>>> Interestingly, in the last few times the failure did not reproduce. Yet,
>>>> thinking about it made me concerned about MTRRs configuration, and that
>>>> perhaps performance is affected by memory marked as UC after boot, since
>>>> kvm-unit-test does not reset MTRRs.
>>>> 
>>>> Reading the variable range MTRRs, I do see some ranges marked as UC (most of
>>>> the range 2GB-4GB, if I read the MTRRs correctly):
>>>> 
>>>>  MSR 0x200 = 0x80000000
>>>>  MSR 0x201 = 0x3fff80000800
>>>>  MSR 0x202 = 0xff000005
>>>>  MSR 0x203 = 0x3fffff000800
>>>>  MSR 0x204 = 0x38000000000
>>>>  MSR 0x205 = 0x3f8000000800
>>>> 
>>>> Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
>>>> you suggest a reasonable configuration?
>>> 
>>> I would expect MTRR issues to result in repeatable failures. For
>>> instance, if your VMCS ended up in UC memory, that might slow things
>>> down quite a bit. But, I would expect the VMCS to end up at the same
>>> address each time the test is run.
>> 
>> Agreed on the repeatable failures part, but putting the VMCS in UC memory
>> shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
>> coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
>> UC.
> 
> But the internal VMCS cache only contains selected fields, doesn't it?
> Uncached fields would have to be written to memory on VM-exit. Or are
> all of the mutable fields in the internal VMCS cache?

In that regard, the SDM says that "To ensure proper behavior in VMX
operation, software should maintain the VMCS region and related structures
(enumerated in Section 24.11.4) in writeback cacheable memory."

Is it relevant? This means the MTRRs should be set up regardless of the
RDTSC test failure.

I did not have time to further investigate this issue yet. I did encounter
additional strange failures (EPT A/D and PML test failures, which are
persistent) due to Paolo’s patch that allows memory above 4GB to be used, so
I need to see if these issues are somehow related.
Sean Christopherson Jan. 28, 2020, 6:33 p.m. UTC | #13
On Tue, Jan 28, 2020 at 09:59:45AM -0800, Jim Mattson wrote:
> On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> >
> > On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
> > > On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> > > >
> > > > > On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> > > > >
> > > > > If I had to guess, you probably have SMM malware on your host. Remove
> > > > > the malware, and the test should pass.
> > > >
> > > > Well, malware will always be an option, but I doubt this is the case.
> > >
> > > Was my innuendo too subtle? I consider any code executing in SMM to be malware.
> >
> > SMI complications seem unlikely.  The straw that broke the camel's back
> > was a 1152 cycle delta, presumably the other failing runs had similar deltas.
> > I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
> > to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
> > wouldn't be surprised if just SMI+RSM is over 1500 cycles.
> 
> Good point. What generation of hardware are you running on, Nadav?

Skylake.

> > > > Interestingly, in the last few times the failure did not reproduce. Yet,
> > > > thinking about it made me concerned about MTRRs configuration, and that
> > > > perhaps performance is affected by memory marked as UC after boot, since
> > > > kvm-unit-test does not reset MTRRs.
> > > >
> > > > Reading the variable range MTRRs, I do see some ranges marked as UC (most of
> > > > the range 2GB-4GB, if I read the MTRRs correctly):
> > > >
> > > >   MSR 0x200 = 0x80000000
> > > >   MSR 0x201 = 0x3fff80000800
> > > >   MSR 0x202 = 0xff000005
> > > >   MSR 0x203 = 0x3fffff000800
> > > >   MSR 0x204 = 0x38000000000
> > > >   MSR 0x205 = 0x3f8000000800
> > > >
> > > > Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
> > > > you suggest a reasonable configuration?
> > >
> > > I would expect MTRR issues to result in repeatable failures. For
> > > instance, if your VMCS ended up in UC memory, that might slow things
> > > down quite a bit. But, I would expect the VMCS to end up at the same
> > > address each time the test is run.
> >
> > Agreed on the repeatable failures part, but putting the VMCS in UC memory
> > shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
> > coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
> > UC.
> 
> But the internal VMCS cache only contains selected fields, doesn't it?
> Uncached fields would have to be written to memory on VM-exit. Or are
> all of the mutable fields in the internal VMCS cache?

Hmm.  I can neither confirm nor deny?  The official Intel response to this
would be "it's microarchitectural".  I'll put it this way: it's in Intel's
best interest to minimize the latency of VMREAD, VMWRITE, VM-Enter and
VM-Exit.
Nadav Amit Jan. 28, 2020, 6:42 p.m. UTC | #14
> On Jan 28, 2020, at 10:33 AM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> 
> On Tue, Jan 28, 2020 at 09:59:45AM -0800, Jim Mattson wrote:
>> On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
>> <sean.j.christopherson@intel.com> wrote:
>>> On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
>>>> On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
>>>>>> 
>>>>>> If I had to guess, you probably have SMM malware on your host. Remove
>>>>>> the malware, and the test should pass.
>>>>> 
>>>>> Well, malware will always be an option, but I doubt this is the case.
>>>> 
>>>> Was my innuendo too subtle? I consider any code executing in SMM to be malware.
>>> 
>>> SMI complications seem unlikely.  The straw that broke the camel's back
>>> was a 1152 cycle delta, presumably the other failing runs had similar deltas.
>>> I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
>>> to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
>>> wouldn't be surprised if just SMI+RSM is over 1500 cycles.
>> 
>> Good point. What generation of hardware are you running on, Nadav?
> 
> Skylake.

Indeed. Thanks for answering on my behalf ;-)

> 
>>>>> Interestingly, in the last few times the failure did not reproduce. Yet,
>>>>> thinking about it made me concerned about MTRRs configuration, and that
>>>>> perhaps performance is affected by memory marked as UC after boot, since
>>>>> kvm-unit-test does not reset MTRRs.
>>>>> 
>>>>> Reading the variable range MTRRs, I do see some ranges marked as UC (most of
>>>>> the range 2GB-4GB, if I read the MTRRs correctly):
>>>>> 
>>>>>  MSR 0x200 = 0x80000000
>>>>>  MSR 0x201 = 0x3fff80000800
>>>>>  MSR 0x202 = 0xff000005
>>>>>  MSR 0x203 = 0x3fffff000800
>>>>>  MSR 0x204 = 0x38000000000
>>>>>  MSR 0x205 = 0x3f8000000800
>>>>> 
>>>>> Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
>>>>> you suggest a reasonable configuration?
>>>> 
>>>> I would expect MTRR issues to result in repeatable failures. For
>>>> instance, if your VMCS ended up in UC memory, that might slow things
>>>> down quite a bit. But, I would expect the VMCS to end up at the same
>>>> address each time the test is run.
>>> 
>>> Agreed on the repeatable failures part, but putting the VMCS in UC memory
>>> shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
>>> coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
>>> UC.
>> 
>> But the internal VMCS cache only contains selected fields, doesn't it?
>> Uncached fields would have to be written to memory on VM-exit. Or are
>> all of the mutable fields in the internal VMCS cache?
> 
> Hmm.  I can neither confirm nor deny?  The official Intel response to this
> would be "it's microarchitectural".  I'll put it this way: it's in Intel's
> best interest to minimize the latency of VMREAD, VMWRITE, VM-Enter and
> VM-Exit.

I will run some more experiments and get back to you. It is a shame that
every experiment requires a (real) boot…
Jim Mattson Jan. 28, 2020, 6:43 p.m. UTC | #15
On Tue, Jan 28, 2020 at 10:42 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Jan 28, 2020, at 10:33 AM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> >
> > On Tue, Jan 28, 2020 at 09:59:45AM -0800, Jim Mattson wrote:
> >> On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
> >> <sean.j.christopherson@intel.com> wrote:
> >>> On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
> >>>> On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>> On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> >>>>>>
> >>>>>> If I had to guess, you probably have SMM malware on your host. Remove
> >>>>>> the malware, and the test should pass.
> >>>>>
> >>>>> Well, malware will always be an option, but I doubt this is the case.
> >>>>
> >>>> Was my innuendo too subtle? I consider any code executing in SMM to be malware.
> >>>
> >>> SMI complications seem unlikely.  The straw that broke the camel's back
> >>> was a 1152 cycle delta, presumably the other failing runs had similar deltas.
> >>> I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
> >>> to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
> >>> wouldn't be surprised if just SMI+RSM is over 1500 cycles.
> >>
> >> Good point. What generation of hardware are you running on, Nadav?
> >
> > Skylake.
>
> Indeed. Thanks for answering on my behalf ;-)
>
> >
> >>>>> Interestingly, in the last few times the failure did not reproduce. Yet,
> >>>>> thinking about it made me concerned about MTRRs configuration, and that
> >>>>> perhaps performance is affected by memory marked as UC after boot, since
> >>>>> kvm-unit-test does not reset MTRRs.
> >>>>>
> >>>>> Reading the variable range MTRRs, I do see some ranges marked as UC (most of
> >>>>> the range 2GB-4GB, if I read the MTRRs correctly):
> >>>>>
> >>>>>  MSR 0x200 = 0x80000000
> >>>>>  MSR 0x201 = 0x3fff80000800
> >>>>>  MSR 0x202 = 0xff000005
> >>>>>  MSR 0x203 = 0x3fffff000800
> >>>>>  MSR 0x204 = 0x38000000000
> >>>>>  MSR 0x205 = 0x3f8000000800
> >>>>>
> >>>>> Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
> >>>>> you suggest a reasonable configuration?
> >>>>
> >>>> I would expect MTRR issues to result in repeatable failures. For
> >>>> instance, if your VMCS ended up in UC memory, that might slow things
> >>>> down quite a bit. But, I would expect the VMCS to end up at the same
> >>>> address each time the test is run.
> >>>
> >>> Agreed on the repeatable failures part, but putting the VMCS in UC memory
> >>> shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
> >>> coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
> >>> UC.
> >>
> >> But the internal VMCS cache only contains selected fields, doesn't it?
> >> Uncached fields would have to be written to memory on VM-exit. Or are
> >> all of the mutable fields in the internal VMCS cache?
> >
> > Hmm.  I can neither confirm nor deny?  The official Intel response to this
> > would be "it's microarchitectural".  I'll put it this way: it's in Intel's
> > best interest to minimize the latency of VMREAD, VMWRITE, VM-Enter and
> > VM-Exit.
>
> I will run some more experiments and get back to you. It is a shame that
> every experiment requires a (real) boot…

Yes! It's not just a shame; it's a serious usability issue.
Nadav Amit Jan. 28, 2020, 7:03 p.m. UTC | #16
> On Jan 28, 2020, at 10:43 AM, Jim Mattson <jmattson@google.com> wrote:
> 
> On Tue, Jan 28, 2020 at 10:42 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>>> On Jan 28, 2020, at 10:33 AM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
>>> 
>>> On Tue, Jan 28, 2020 at 09:59:45AM -0800, Jim Mattson wrote:
>>>> On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
>>>> <sean.j.christopherson@intel.com> wrote:
>>>>> On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
>>>>>> On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
>>>>>>>> 
>>>>>>>> If I had to guess, you probably have SMM malware on your host. Remove
>>>>>>>> the malware, and the test should pass.
>>>>>>> 
>>>>>>> Well, malware will always be an option, but I doubt this is the case.
>>>>>> 
>>>>>> Was my innuendo too subtle? I consider any code executing in SMM to be malware.
>>>>> 
>>>>> SMI complications seem unlikely.  The straw that broke the camel's back
>>>>> was a 1152 cycle delta, presumably the other failing runs had similar deltas.
>>>>> I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
>>>>> to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
>>>>> wouldn't be surprised if just SMI+RSM is over 1500 cycles.
>>>> 
>>>> Good point. What generation of hardware are you running on, Nadav?
>>> 
>>> Skylake.
>> 
>> Indeed. Thanks for answering on my behalf ;-)
>> 
>>>>>>> Interestingly, in the last few times the failure did not reproduce. Yet,
>>>>>>> thinking about it made me concerned about MTRRs configuration, and that
>>>>>>> perhaps performance is affected by memory marked as UC after boot, since
>>>>>>> kvm-unit-test does not reset MTRRs.
>>>>>>> 
>>>>>>> Reading the variable range MTRRs, I do see some ranges marked as UC (most of
>>>>>>> the range 2GB-4GB, if I read the MTRRs correctly):
>>>>>>> 
>>>>>>> MSR 0x200 = 0x80000000
>>>>>>> MSR 0x201 = 0x3fff80000800
>>>>>>> MSR 0x202 = 0xff000005
>>>>>>> MSR 0x203 = 0x3fffff000800
>>>>>>> MSR 0x204 = 0x38000000000
>>>>>>> MSR 0x205 = 0x3f8000000800
>>>>>>> 
>>>>>>> Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
>>>>>>> you suggest a reasonable configuration?
>>>>>> 
>>>>>> I would expect MTRR issues to result in repeatable failures. For
>>>>>> instance, if your VMCS ended up in UC memory, that might slow things
>>>>>> down quite a bit. But, I would expect the VMCS to end up at the same
>>>>>> address each time the test is run.
>>>>> 
>>>>> Agreed on the repeatable failures part, but putting the VMCS in UC memory
>>>>> shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
>>>>> coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
>>>>> UC.
>>>> 
>>>> But the internal VMCS cache only contains selected fields, doesn't it?
>>>> Uncached fields would have to be written to memory on VM-exit. Or are
>>>> all of the mutable fields in the internal VMCS cache?
>>> 
>>> Hmm.  I can neither confirm nor deny?  The official Intel response to this
>>> would be "it's microarchitectural".  I'll put it this way: it's in Intel's
>>> best interest to minimize the latency of VMREAD, VMWRITE, VM-Enter and
>>> VM-Exit.
>> 
>> I will run some more experiments and get back to you. It is a shame that
>> every experiment requires a (real) boot…
> 
> Yes! It's not just a shame; it's a serious usability issue.

The easy way to run these experiments would have been to use an Intel CRB
(Customer Reference Board), which boots relatively fast, with an ITP
(In-Target Probe). This would have simplified testing and debugging
considerably. Perhaps some sort of PXE-boot would also be beneficial.
Unfortunately, I do not have the hardware and it does not seem others care
that much so far.

Despite the usability issues, running the tests on bare-metal already
revealed several bugs in KVM (and one SDM issue), which were not apparent
since the tests were wrong.
Jim Mattson Jan. 28, 2020, 7:34 p.m. UTC | #17
On Tue, Jan 28, 2020 at 11:03 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> > On Jan 28, 2020, at 10:43 AM, Jim Mattson <jmattson@google.com> wrote:
> >
> > On Tue, Jan 28, 2020 at 10:42 AM Nadav Amit <nadav.amit@gmail.com> wrote:
> >>> On Jan 28, 2020, at 10:33 AM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> >>>
> >>> On Tue, Jan 28, 2020 at 09:59:45AM -0800, Jim Mattson wrote:
> >>>> On Mon, Jan 27, 2020 at 12:56 PM Sean Christopherson
> >>>> <sean.j.christopherson@intel.com> wrote:
> >>>>> On Mon, Jan 27, 2020 at 11:24:31AM -0800, Jim Mattson wrote:
> >>>>>> On Sun, Jan 26, 2020 at 8:36 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>>> On Jan 26, 2020, at 2:06 PM, Jim Mattson <jmattson@google.com> wrote:
> >>>>>>>>
> >>>>>>>> If I had to guess, you probably have SMM malware on your host. Remove
> >>>>>>>> the malware, and the test should pass.
> >>>>>>>
> >>>>>>> Well, malware will always be an option, but I doubt this is the case.
> >>>>>>
> >>>>>> Was my innuendo too subtle? I consider any code executing in SMM to be malware.
> >>>>>
> >>>>> SMI complications seem unlikely.  The straw that broke the camel's back
> >>>>> was a 1152 cycle delta, presumably the other failing runs had similar deltas.
> >>>>> I've never benchmarked SMI+RSM, but I highly doubt it comes anywhere close
> >>>>> to VM-Enter/VM-Exit's super optimized ~400 cycle round trip.  E.g. I
> >>>>> wouldn't be surprised if just SMI+RSM is over 1500 cycles.
> >>>>
> >>>> Good point. What generation of hardware are you running on, Nadav?
> >>>
> >>> Skylake.
> >>
> >> Indeed. Thanks for answering on my behalf ;-)
> >>
> >>>>>>> Interestingly, in the last few times the failure did not reproduce. Yet,
> >>>>>>> thinking about it made me concerned about MTRRs configuration, and that
> >>>>>>> perhaps performance is affected by memory marked as UC after boot, since
> >>>>>>> kvm-unit-test does not reset MTRRs.
> >>>>>>>
> >>>>>>> Reading the variable range MTRRs, I do see some ranges marked as UC (most of
> >>>>>>> the range 2GB-4GB, if I read the MTRRs correctly):
> >>>>>>>
> >>>>>>> MSR 0x200 = 0x80000000
> >>>>>>> MSR 0x201 = 0x3fff80000800
> >>>>>>> MSR 0x202 = 0xff000005
> >>>>>>> MSR 0x203 = 0x3fffff000800
> >>>>>>> MSR 0x204 = 0x38000000000
> >>>>>>> MSR 0x205 = 0x3f8000000800
> >>>>>>>
> >>>>>>> Do you think we should set the MTRRs somehow in KVM-unit-tests? If yes, can
> >>>>>>> you suggest a reasonable configuration?
> >>>>>>
> >>>>>> I would expect MTRR issues to result in repeatable failures. For
> >>>>>> instance, if your VMCS ended up in UC memory, that might slow things
> >>>>>> down quite a bit. But, I would expect the VMCS to end up at the same
> >>>>>> address each time the test is run.
> >>>>>
> >>>>> Agreed on the repeatable failures part, but putting the VMCS in UC memory
> >>>>> shouldn't affect this type of test.  The CPU's internal VMCS cache isn't
> >>>>> coherent, and IIRC isn't disabled if the MTRRs for the VMCS happen to be
> >>>>> UC.
> >>>>
> >>>> But the internal VMCS cache only contains selected fields, doesn't it?
> >>>> Uncached fields would have to be written to memory on VM-exit. Or are
> >>>> all of the mutable fields in the internal VMCS cache?
> >>>
> >>> Hmm.  I can neither confirm nor deny?  The official Intel response to this
> >>> would be "it's microarchitectural".  I'll put it this way: it's in Intel's
> >>> best interest to minimize the latency of VMREAD, VMWRITE, VM-Enter and
> >>> VM-Exit.
> >>
> >> I will run some more experiments and get back to you. It is a shame that
> >> every experiment requires a (real) boot…
> >
> > Yes! It's not just a shame; it's a serious usability issue.
>
> The easy way to run these experiments would have been to use an Intel CRB
> (Customer Reference Board), which boots relatively fast, with an ITP
> (In-Target Probe). This would have simplified testing and debugging
> considerably. Perhaps some sort of PXE-boot would also be beneficial.
> Unfortunately, I do not have the hardware and it does not seem others care
> that much so far.

Not true. I think others do care (I know I do). It's just that bare
metal testing is too hard right now. We need a way to "fire and
forget" the entire test suite and then check a log for failures once
it's done. I'm not suggesting that you do that; I'm just suggesting
that lowering the barrier to bare-metal testing would increase the
likelihood of people doing it.

> Despite the usability issues, running the tests on bare-metal already
> revealed several bugs in KVM (and one SDM issue), which were not apparent
> since the tests were wrong.

I'm not surprised. It's ludicrous that the test results are not
verified on hardware.

Patch

diff --git a/x86/vmx.h b/x86/vmx.h
index 8496be7..21ba953 100644
--- a/x86/vmx.h
+++ b/x86/vmx.h
@@ -420,6 +420,7 @@  enum Ctrl1 {
 	CPU_SHADOW_VMCS		= 1ul << 14,
 	CPU_RDSEED		= 1ul << 16,
 	CPU_PML                 = 1ul << 17,
+	CPU_USE_TSC_SCALING	= 1ul << 25,
 };
 
 enum Intr_type {
diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
index 1d8932f..6ceaf9a 100644
--- a/x86/vmx_tests.c
+++ b/x86/vmx_tests.c
@@ -8790,7 +8790,99 @@  static void vmx_vmcs_shadow_test(void)
 	enter_guest();
 }
 
+/*
+ * This test monitors the difference between a guest RDTSC instruction
+ * and the IA32_TIME_STAMP_COUNTER MSR value stored in the VMCS12
+ * VM-exit MSR-store list when taking a VM-exit on the instruction
+ * following RDTSC.
+ */
+#define RDTSC_DIFF_ITERS 100000
+#define RDTSC_DIFF_FAILS 100
+#define HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD 750
+
+/*
+ * Set 'use TSC offsetting' and set the guest offset to the
+ * inverse of the host's current TSC value, so that the guest starts running
+ * with an effective TSC value of 0.
+ */
+static void reset_guest_tsc_to_zero(void)
+{
+	TEST_ASSERT_MSG(ctrl_cpu_rev[0].clr & CPU_USE_TSC_OFFSET,
+			"Expected support for 'use TSC offsetting'");
+
+	vmcs_set_bits(CPU_EXEC_CTRL0, CPU_USE_TSC_OFFSET);
+	vmcs_write(TSC_OFFSET, -rdtsc());
+}
+
+static void rdtsc_vmexit_diff_test_guest(void)
+{
+	int i;
+
+	for (i = 0; i < RDTSC_DIFF_ITERS; i++)
+		/* Ensure rdtsc is the last instruction before the vmcall. */
+		asm volatile("rdtsc; vmcall" : : : "eax", "edx");
+}
 
+/*
+ * This function only considers the "use TSC offsetting" VM-execution
+ * control.  It does not handle "use TSC scaling" (because the latter
+ * isn't available to the host today.)
+ */
+static unsigned long long host_time_to_guest_time(unsigned long long t)
+{
+	TEST_ASSERT(!(ctrl_cpu_rev[0].clr & CPU_SECONDARY) ||
+		    !(vmcs_read(CPU_EXEC_CTRL1) & CPU_USE_TSC_SCALING));
+
+	if (vmcs_read(CPU_EXEC_CTRL0) & CPU_USE_TSC_OFFSET)
+		t += vmcs_read(TSC_OFFSET);
+
+	return t;
+}
+
+static unsigned long long rdtsc_vmexit_diff_test_iteration(void)
+{
+	unsigned long long guest_tsc, host_to_guest_tsc;
+
+	enter_guest();
+	skip_exit_vmcall();
+	guest_tsc = (u32) regs.rax + (regs.rdx << 32);
+	host_to_guest_tsc = host_time_to_guest_time(exit_msr_store[0].value);
+
+	return host_to_guest_tsc - guest_tsc;
+}
+
+static void rdtsc_vmexit_diff_test(void)
+{
+	int fail = 0;
+	int i;
+
+	test_set_guest(rdtsc_vmexit_diff_test_guest);
+
+	reset_guest_tsc_to_zero();
+
+	/*
+	 * Set up the VMCS12 VM-exit MSR-store list to store just one
+	 * MSR: IA32_TIME_STAMP_COUNTER. Note that the value stored is
+	 * in the host time domain (i.e., it is not adjusted according
+	 * to the TSC multiplier and TSC offset fields in the VMCS12,
+	 * as a guest RDTSC would be.)
+	 */
+	exit_msr_store = alloc_page();
+	exit_msr_store[0].index = MSR_IA32_TSC;
+	vmcs_write(EXI_MSR_ST_CNT, 1);
+	vmcs_write(EXIT_MSR_ST_ADDR, virt_to_phys(exit_msr_store));
+
+	for (i = 0; i < RDTSC_DIFF_ITERS; i++) {
+		if (rdtsc_vmexit_diff_test_iteration() >=
+		    HOST_CAPTURED_GUEST_TSC_DIFF_THRESHOLD)
+			fail++;
+	}
+
+	enter_guest();
+
+	report("RDTSC to VM-exit delta too high in %d of %d iterations",
+	       fail < RDTSC_DIFF_FAILS, fail, RDTSC_DIFF_ITERS);
+}
 
 static int invalid_msr_init(struct vmcs *vmcs)
 {
@@ -9056,5 +9148,6 @@  struct vmx_test vmx_tests[] = {
 	/* Atomic MSR switch tests. */
 	TEST(atomic_switch_max_msrs_test),
 	TEST(atomic_switch_overflow_msrs_test),
+	TEST(rdtsc_vmexit_diff_test),
 	{ NULL, NULL, NULL, NULL, NULL, {0} },
 };
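
As a closing note on host_time_to_guest_time() above: it deliberately handles only the TSC offset and asserts that "use TSC scaling" is off. If scaling were enabled, the conversion would (per my reading of the SDM) also take the 128-bit product of the host TSC and the 64-bit TSC multiplier, shifted right by 48 bits, before adding the offset. A hedged sketch of that fuller conversion, which this test does not need:

/*
 * Hypothetical host-to-guest TSC conversion, assuming the SDM formula
 * guest_tsc = ((host_tsc * tsc_multiplier) >> 48) + tsc_offset when both
 * "use TSC offsetting" and "use TSC scaling" are enabled.  Uses the
 * GCC/Clang unsigned __int128 extension for the 128-bit product.
 */
static unsigned long long host_tsc_to_guest_tsc(unsigned long long host_tsc,
						unsigned long long multiplier,
						unsigned long long offset)
{
	unsigned __int128 scaled = (unsigned __int128)host_tsc * multiplier;

	return (unsigned long long)(scaled >> 48) + offset;
}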