
[4/4] KVM: selftests: Add test case for guest and host LBR preemption

Message ID 20230616113353.45202-5-xiong.y.zhang@intel.com (mailing list archive)
State New, archived
Series Part of fix for host and guest LBR event coexist

Commit Message

Zhang, Xiong Y June 16, 2023, 11:33 a.m. UTC
When the guest accesses an LBR MSR for the first time, KVM creates a vLBR
event. The vLBR event joins the perf scheduler and occupies the physical LBR
on behalf of the guest. Once the vLBR event is active and owns the LBR, the
guest can access the LBR MSRs.

However, the vLBR event is a per-process pinned event. Perf has one event
type with a higher priority (per-cpu pinned LBR events) and lower-priority
event types as well (per-cpu LBR events and per-process LBR events). So as
long as the host has no higher-priority per-cpu pinned LBR event, the vLBR
event can hold the physical LBR indefinitely. But once a per-cpu pinned LBR
event becomes active, the vLBR event can no longer be scheduled, and the
guest loses access to the LBR MSRs.

This commit adds a test case covering guest and host LBR contention.

Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
---
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../selftests/kvm/include/ucall_common.h      |  17 ++
 .../kvm/x86_64/pmu_event_filter_test.c        |  16 --
 .../kvm/x86_64/vmx_pmu_lbr_contend.c          | 171 ++++++++++++++++++
 4 files changed, 189 insertions(+), 16 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c

Comments

Like Xu June 28, 2023, 6:27 a.m. UTC | #1
This direction could be a vPMU "coexistence" feature; please feel free to test it.
But the first step might be to test the behavior of a vPMC when its event is
preempted, then expand to Guest LBR, Guest PEBS, etc.

On 16/6/2023 7:33 pm, Xiong Zhang wrote:
> When guest access LBR msr at the first time, kvm will create a vLBR event,
> vLBR event joins perf scheduler and occupy physical LBR for guest usage.
> Once vLBR event is active and own LBR, guest could access LBR msr.
> 
> But vLBR event is per process pinned event, perf has higher priority event:
> per cpu pinned LBR event, perf has lower priority events also: per cpu LBR
> event and per process LBR event.
> So if host doesn't have higher priority per cpu pinned LBR event, vLBR
> event could occupy physical LBR always. But once per cpu pinned LBR event
> is active, vLBR event couldn't be active anymore, then guest couldn't
> access LBR msr.
> 
> This commit adds test case to cover guest and host lbr contend.
> 
> Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> ---
>   tools/testing/selftests/kvm/Makefile          |   1 +
>   .../selftests/kvm/include/ucall_common.h      |  17 ++
>   .../kvm/x86_64/pmu_event_filter_test.c        |  16 --
>   .../kvm/x86_64/vmx_pmu_lbr_contend.c          | 171 ++++++++++++++++++
>   4 files changed, 189 insertions(+), 16 deletions(-)
>   create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 4761b768b773..422bbc16ba2a 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -100,6 +100,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
> +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_lbr_contend

x86_64/vmx_pmu_coexistence ?

>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h
> index 1a6aaef5ccae..c1bb0cacf390 100644
> --- a/tools/testing/selftests/kvm/include/ucall_common.h
> +++ b/tools/testing/selftests/kvm/include/ucall_common.h
> @@ -35,6 +35,23 @@ void ucall(uint64_t cmd, int nargs, ...);
>   uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
>   void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
>   
> +/*
> + * Run the VM to the next GUEST_SYNC(value), and return the value passed
> + * to the sync. Any other exit from the guest is fatal.
> + */
> +static inline uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
> +{
> +	struct ucall uc;
> +
> +	vcpu_run(vcpu);
> +	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> +	get_ucall(vcpu, &uc);
> +	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> +		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> +
> +	return uc.args[1];
> +}
> +
>   /*
>    * Perform userspace call without any associated data.  This bare call avoids
>    * allocating a ucall struct, which can be useful if the atomic operations in
> diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> index 40507ed9fe8a..8c68029cfb4b 100644
> --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> @@ -177,22 +177,6 @@ static void amd_guest_code(void)
>   	}
>   }
>   
> -/*
> - * Run the VM to the next GUEST_SYNC(value), and return the value passed
> - * to the sync. Any other exit from the guest is fatal.
> - */
> -static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
> -{
> -	struct ucall uc;
> -
> -	vcpu_run(vcpu);
> -	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> -	get_ucall(vcpu, &uc);
> -	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> -		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> -	return uc.args[1];
> -}

Can this part be a separate patch ?

> -
>   static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
>   {
>   	uint64_t r;
> diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> new file mode 100644
> index 000000000000..a6a793f08515
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Test for host and guest LBR preemption
> + *
> + * Copyright (C) 2021 Intel Corporation
> + *
> + */
> +
> +#define _GNU_SOURCEGG
> +
> +#include <linux/perf_event.h>
> +#include <sys/syscall.h>
> +#include <sys/sysinfo.h>
> +#include <unistd.h>
> +
> +#include "test_util.h"
> +#include "kvm_util.h"
> +#include "processor.h"
> +
> +static void create_perf_events(int *fds, int cpu_num, bool pinned)
> +{
> +	struct perf_event_attr attr = {
> +		.type = PERF_TYPE_HARDWARE,
> +		.size = sizeof(attr),
> +		.config = PERF_COUNT_HW_CPU_CYCLES,
> +		.sample_type = PERF_SAMPLE_BRANCH_STACK,
> +		.sample_period = 1000,
> +		.pinned = pinned,
> +		.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
> +				      PERF_SAMPLE_BRANCH_USER |
> +				      PERF_SAMPLE_BRANCH_KERNEL,
> +	};
> +	int i;
> +
> +	for (i = 0; i < cpu_num; i++) {
> +		fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1, PERF_FLAG_FD_CLOEXEC);

Which field indicates that a "per cpu pinned" event is being created?
More comments are required.

> +		TEST_ASSERT(fds[i] != -1, "Failed to create lbr event on cpu%d", i);
> +	}
> +}
> +
> +static void release_perf_events(int *fds, int cpu_num)
> +{
> +	int i;
> +
> +	for (i = 0; i < cpu_num; i++)
> +		close(fds[i]);
> +}
> +
> +#define PERF_CAP_LBR_FMT_MASK  0x1F
> +
> +#define LBR_NOT_SUPPORTED  0xFFFE
> +#define LBR_MSR_WRITE_ERROR 0xFFFD
> +
> +#define LBR_MODE_CHECK_PASS 0x0
> +#define LBR_MSR_WRITE_SUCC  0x1
> +
> +static bool check_lbr_msr(void)
> +{
> +	uint64_t v, old_val;
> +
> +	old_val = rdmsr(MSR_LBR_TOS);

Why focus only on MSR_LBR_TOS ?

> +
> +	v  = old_val ^ 0x3UL;
> +
> +	wrmsr(MSR_LBR_TOS, v);
> +	if (rdmsr(MSR_LBR_TOS) != v)
> +		return false;
> +
> +	wrmsr(MSR_LBR_TOS, old_val);
> +	if (rdmsr(MSR_LBR_TOS) != old_val)
> +		return false;
> +
> +	return true;
> +}
> +
> +static void guest_code(void)
> +{
> +	uint64_t v;
> +
> +	v = rdmsr(MSR_IA32_PERF_CAPABILITIES);
> +	if ((v & PERF_CAP_LBR_FMT_MASK) == 0)
> +		GUEST_SYNC(LBR_NOT_SUPPORTED);
> +
> +	GUEST_SYNC(LBR_MODE_CHECK_PASS);
> +
> +	while (1) {
> +		if (!check_lbr_msr()) {
> +			GUEST_SYNC(LBR_MSR_WRITE_ERROR);
> +			continue;
> +		}
> +
> +		/* Enable LBR to avoid KVM recyling LBR. */
> +		 v = rdmsr(MSR_IA32_DEBUGCTLMSR);
> +		 v |= DEBUGCTLMSR_LBR;
> +		 wrmsr(MSR_IA32_DEBUGCTLMSR, v);
> +
> +		GUEST_SYNC(LBR_MSR_WRITE_SUCC);
> +	}
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	int *fds, ncpus;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_vm *vm;
> +	uint64_t r;
> +
> +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> +	TEST_REQUIRE(host_cpu_is_intel);
> +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION));
> +
> +	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MODE_CHECK_PASS,
> +		    "LBR format in guest PERF_CAP msr isn't correct");
> +
> +	ncpus = get_nprocs();

Could we limit the test to a specific cpu, since it will affect the load on 
other cpus?

> +	fds = malloc(sizeof(int) * ncpus);
> +	TEST_ASSERT(fds != NULL, "Failed to create fds for all cpus");
> +
> +	/* Create per cpu pinned LBR event, then it will own LBR. */
> +	create_perf_events(fds, ncpus, true);
> +
> +	/* Since LBR is owned by per cpu pinned LBR event, guest couldn't get it,
> +	 * so guest couldn't access LBR_TOS msr.
> +	 */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> +		    "1. Unexpected successfully read/write guest LBR_TO msr");
> +
> +	release_perf_events(fds, ncpus);

Obviously there are duplicate calls to release_perf_events() that could be omitted.

> +
> +	/* Since per cpu pinned event is closed and LBR is free, guest could get it,
> +	 * so guest could access LBR_TOS msr.
> +	 */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> +		    "2. Failed to read/write guest LBR_TO msr");
> +
> +	/* Create per cpu LBR event, its priority is lower than vLBR event, and it
> +	 *  couldn't get LBR back from vLBR
> +	 */
> +	create_perf_events(fds, ncpus, false);
> +
> +	/* LBR is still owned by guest, So guest could access LBR_TOS successfully. */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> +		    "3. Failed read/write guest LBR_TO msr");
> +
> +	release_perf_events(fds, ncpus);
> +
> +	/* Create per cpu pinned LBR event, its priority is higher than vLBR event,
> +	 * so it will get LBR back from vLBR.
> +	 */
> +	create_perf_events(fds, ncpus, true);
> +
> +	/* LBR is preepmted by per cpu pinned LBR event, guest couldn't access
> +	 * LBR_TOS msr.
> +	 */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> +		    "4. Unexpected successfully read/write guest LBR_TO msr");
> +
> +	release_perf_events(fds, ncpus);

  Why not add more tests to cover all possibilities ?

	per cpu pinned event
	per process pinned event
	per cpu event
	per process event

> +
> +	kvm_vm_free(vm);
> +
> +	free(fds);
> +
> +	return 0;
> +}
Yang, Weijiang June 28, 2023, 9:27 a.m. UTC | #2
What kind of issues do you expect this selftest to find?

IMO, it verifies the generic perf scheduling rules but cannot find specific
issues, e.g. whether the LBR data is corrupted in some cases. If the selftest
could verify that guest/host data is maintained correctly in a high-contention
environment, that would better serve the purpose of a selftest.


On 6/16/2023 7:33 PM, Xiong Zhang wrote:
> When guest access LBR msr at the first time, kvm will create a vLBR event,
> vLBR event joins perf scheduler and occupy physical LBR for guest usage.
> Once vLBR event is active and own LBR, guest could access LBR msr.
>
> But vLBR event is per process pinned event, perf has higher priority event:
> per cpu pinned LBR event, perf has lower priority events also: per cpu LBR
> event and per process LBR event.
> So if host doesn't have higher priority per cpu pinned LBR event, vLBR
> event could occupy physical LBR always. But once per cpu pinned LBR event
> is active, vLBR event couldn't be active anymore, then guest couldn't
> access LBR msr.
>
> This commit adds test case to cover guest and host lbr contend.
>
> Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> ---
>   tools/testing/selftests/kvm/Makefile          |   1 +
>   .../selftests/kvm/include/ucall_common.h      |  17 ++
>   .../kvm/x86_64/pmu_event_filter_test.c        |  16 --
>   .../kvm/x86_64/vmx_pmu_lbr_contend.c          | 171 ++++++++++++++++++
>   4 files changed, 189 insertions(+), 16 deletions(-)
>   create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
>
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 4761b768b773..422bbc16ba2a 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -100,6 +100,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
> +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_lbr_contend
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
>   TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h
> index 1a6aaef5ccae..c1bb0cacf390 100644
> --- a/tools/testing/selftests/kvm/include/ucall_common.h
> +++ b/tools/testing/selftests/kvm/include/ucall_common.h
> @@ -35,6 +35,23 @@ void ucall(uint64_t cmd, int nargs, ...);
>   uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
>   void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
>   
> +/*
> + * Run the VM to the next GUEST_SYNC(value), and return the value passed
> + * to the sync. Any other exit from the guest is fatal.
> + */
> +static inline uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
> +{
> +	struct ucall uc;
> +
> +	vcpu_run(vcpu);
> +	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> +	get_ucall(vcpu, &uc);
> +	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> +		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> +
> +	return uc.args[1];
> +}
> +
>   /*
>    * Perform userspace call without any associated data.  This bare call avoids
>    * allocating a ucall struct, which can be useful if the atomic operations in
> diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> index 40507ed9fe8a..8c68029cfb4b 100644
> --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> @@ -177,22 +177,6 @@ static void amd_guest_code(void)
>   	}
>   }
>   
> -/*
> - * Run the VM to the next GUEST_SYNC(value), and return the value passed
> - * to the sync. Any other exit from the guest is fatal.
> - */
> -static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
> -{
> -	struct ucall uc;
> -
> -	vcpu_run(vcpu);
> -	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> -	get_ucall(vcpu, &uc);
> -	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> -		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> -	return uc.args[1];
> -}
> -
>   static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
>   {
>   	uint64_t r;
> diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> new file mode 100644
> index 000000000000..a6a793f08515
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Test for host and guest LBR preemption
> + *
> + * Copyright (C) 2021 Intel Corporation
> + *
> + */
> +
> +#define _GNU_SOURCEGG
> +
> +#include <linux/perf_event.h>
> +#include <sys/syscall.h>
> +#include <sys/sysinfo.h>
> +#include <unistd.h>
> +
> +#include "test_util.h"
> +#include "kvm_util.h"
> +#include "processor.h"
> +
> +static void create_perf_events(int *fds, int cpu_num, bool pinned)
> +{
> +	struct perf_event_attr attr = {
> +		.type = PERF_TYPE_HARDWARE,
> +		.size = sizeof(attr),
> +		.config = PERF_COUNT_HW_CPU_CYCLES,
> +		.sample_type = PERF_SAMPLE_BRANCH_STACK,
> +		.sample_period = 1000,
> +		.pinned = pinned,
> +		.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
> +				      PERF_SAMPLE_BRANCH_USER |
> +				      PERF_SAMPLE_BRANCH_KERNEL,
> +	};
> +	int i;
> +
> +	for (i = 0; i < cpu_num; i++) {
> +		fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1, PERF_FLAG_FD_CLOEXEC);
> +		TEST_ASSERT(fds[i] != -1, "Failed to create lbr event on cpu%d", i);
> +	}
> +}
> +
> +static void release_perf_events(int *fds, int cpu_num)
> +{
> +	int i;
> +
> +	for (i = 0; i < cpu_num; i++)
> +		close(fds[i]);
> +}
> +
> +#define PERF_CAP_LBR_FMT_MASK  0x1F
> +
> +#define LBR_NOT_SUPPORTED  0xFFFE
> +#define LBR_MSR_WRITE_ERROR 0xFFFD
> +
> +#define LBR_MODE_CHECK_PASS 0x0
> +#define LBR_MSR_WRITE_SUCC  0x1
> +
> +static bool check_lbr_msr(void)
> +{
> +	uint64_t v, old_val;
> +
> +	old_val = rdmsr(MSR_LBR_TOS);
> +
> +	v  = old_val ^ 0x3UL;
> +
> +	wrmsr(MSR_LBR_TOS, v);
> +	if (rdmsr(MSR_LBR_TOS) != v)
> +		return false;
> +
> +	wrmsr(MSR_LBR_TOS, old_val);
> +	if (rdmsr(MSR_LBR_TOS) != old_val)
> +		return false;
> +
> +	return true;
> +}
> +
> +static void guest_code(void)
> +{
> +	uint64_t v;
> +
> +	v = rdmsr(MSR_IA32_PERF_CAPABILITIES);
> +	if ((v & PERF_CAP_LBR_FMT_MASK) == 0)
> +		GUEST_SYNC(LBR_NOT_SUPPORTED);
> +
> +	GUEST_SYNC(LBR_MODE_CHECK_PASS);
> +
> +	while (1) {
> +		if (!check_lbr_msr()) {
> +			GUEST_SYNC(LBR_MSR_WRITE_ERROR);
> +			continue;
> +		}
> +
> +		/* Enable LBR to avoid KVM recyling LBR. */
> +		 v = rdmsr(MSR_IA32_DEBUGCTLMSR);
> +		 v |= DEBUGCTLMSR_LBR;
> +		 wrmsr(MSR_IA32_DEBUGCTLMSR, v);
> +
> +		GUEST_SYNC(LBR_MSR_WRITE_SUCC);
> +	}
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	int *fds, ncpus;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm_vm *vm;
> +	uint64_t r;
> +
> +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> +	TEST_REQUIRE(host_cpu_is_intel);
> +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION));
> +
> +	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MODE_CHECK_PASS,
> +		    "LBR format in guest PERF_CAP msr isn't correct");
> +
> +	ncpus = get_nprocs();
> +	fds = malloc(sizeof(int) * ncpus);
> +	TEST_ASSERT(fds != NULL, "Failed to create fds for all cpus");
> +
> +	/* Create per cpu pinned LBR event, then it will own LBR. */
> +	create_perf_events(fds, ncpus, true);
> +
> +	/* Since LBR is owned by per cpu pinned LBR event, guest couldn't get it,
> +	 * so guest couldn't access LBR_TOS msr.
> +	 */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> +		    "1. Unexpected successfully read/write guest LBR_TO msr");
> +
> +	release_perf_events(fds, ncpus);
> +
> +	/* Since per cpu pinned event is closed and LBR is free, guest could get it,
> +	 * so guest could access LBR_TOS msr.
> +	 */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> +		    "2. Failed to read/write guest LBR_TO msr");
> +
> +	/* Create per cpu LBR event, its priority is lower than vLBR event, and it
> +	 *  couldn't get LBR back from vLBR
> +	 */
> +	create_perf_events(fds, ncpus, false);
> +
> +	/* LBR is still owned by guest, So guest could access LBR_TOS successfully. */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> +		    "3. Failed read/write guest LBR_TO msr");
> +
> +	release_perf_events(fds, ncpus);
> +
> +	/* Create per cpu pinned LBR event, its priority is higher than vLBR event,
> +	 * so it will get LBR back from vLBR.
> +	 */
> +	create_perf_events(fds, ncpus, true);
> +
> +	/* LBR is preepmted by per cpu pinned LBR event, guest couldn't access
> +	 * LBR_TOS msr.
> +	 */
> +	r = run_vcpu_to_sync(vcpu);
> +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> +		    "4. Unexpected successfully read/write guest LBR_TO msr");
> +
> +	release_perf_events(fds, ncpus);
> +
> +	kvm_vm_free(vm);
> +
> +	free(fds);
> +
> +	return 0;
> +}
Zhang, Xiong Y June 29, 2023, 2:39 a.m. UTC | #3
> This direction could be a vPMU "coexistence" feature, please feel free to test it.
> But the first step might be to test the behavior of vPMC when its event is
> preempted.
It is harder to construct vPMC preemption, since the processor has several counters while it has only one LBR and one PEBS facility.
> Then expand to Guest LBR and Guest PEBS etc.
> 
> On 16/6/2023 7:33 pm, Xiong Zhang wrote:
> > When guest access LBR msr at the first time, kvm will create a vLBR
> > event, vLBR event joins perf scheduler and occupy physical LBR for guest usage.
> > Once vLBR event is active and own LBR, guest could access LBR msr.
> >
> > But vLBR event is per process pinned event, perf has higher priority event:
> > per cpu pinned LBR event, perf has lower priority events also: per cpu
> > LBR event and per process LBR event.
> > So if host doesn't have higher priority per cpu pinned LBR event, vLBR
> > event could occupy physical LBR always. But once per cpu pinned LBR
> > event is active, vLBR event couldn't be active anymore, then guest
> > couldn't access LBR msr.
> >
> > This commit adds test case to cover guest and host lbr contend.
> >
> > Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> > ---
> >   tools/testing/selftests/kvm/Makefile          |   1 +
> >   .../selftests/kvm/include/ucall_common.h      |  17 ++
> >   .../kvm/x86_64/pmu_event_filter_test.c        |  16 --
> >   .../kvm/x86_64/vmx_pmu_lbr_contend.c          | 171 ++++++++++++++++++
> >   4 files changed, 189 insertions(+), 16 deletions(-)
> >   create mode 100644
> > tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> >
> > diff --git a/tools/testing/selftests/kvm/Makefile
> > b/tools/testing/selftests/kvm/Makefile
> > index 4761b768b773..422bbc16ba2a 100644
> > --- a/tools/testing/selftests/kvm/Makefile
> > +++ b/tools/testing/selftests/kvm/Makefile
> > @@ -100,6 +100,7 @@ TEST_GEN_PROGS_x86_64 +=
> x86_64/vmx_dirty_log_test
> >   TEST_GEN_PROGS_x86_64 +=
> x86_64/vmx_exception_with_invalid_guest_state
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
> > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_lbr_contend
> 
> x86_64/vmx_pmu_coexistence ?
Yes, once this expands to vPMC and PEBS.
> 
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> > diff --git a/tools/testing/selftests/kvm/include/ucall_common.h
> > b/tools/testing/selftests/kvm/include/ucall_common.h
> > index 1a6aaef5ccae..c1bb0cacf390 100644
> > --- a/tools/testing/selftests/kvm/include/ucall_common.h
> > +++ b/tools/testing/selftests/kvm/include/ucall_common.h
> > @@ -35,6 +35,23 @@ void ucall(uint64_t cmd, int nargs, ...);
> >   uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
> >   void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
> >
> > +/*
> > + * Run the VM to the next GUEST_SYNC(value), and return the value
> > +passed
> > + * to the sync. Any other exit from the guest is fatal.
> > + */
> > +static inline uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) {
> > +	struct ucall uc;
> > +
> > +	vcpu_run(vcpu);
> > +	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > +	get_ucall(vcpu, &uc);
> > +	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> > +		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> > +
> > +	return uc.args[1];
> > +}
> > +
> >   /*
> >    * Perform userspace call without any associated data.  This bare call avoids
> >    * allocating a ucall struct, which can be useful if the atomic
> > operations in diff --git
> > a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > index 40507ed9fe8a..8c68029cfb4b 100644
> > --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > @@ -177,22 +177,6 @@ static void amd_guest_code(void)
> >   	}
> >   }
> >
> > -/*
> > - * Run the VM to the next GUEST_SYNC(value), and return the value
> > passed
> > - * to the sync. Any other exit from the guest is fatal.
> > - */
> > -static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) -{
> > -	struct ucall uc;
> > -
> > -	vcpu_run(vcpu);
> > -	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > -	get_ucall(vcpu, &uc);
> > -	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> > -		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> > -	return uc.args[1];
> > -}
> 
> Can this part be a separate patch ?
OK.
> 
> > -
> >   static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
> >   {
> >   	uint64_t r;
> > diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > new file mode 100644
> > index 000000000000..a6a793f08515
> > --- /dev/null
> > +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > @@ -0,0 +1,171 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Test for host and guest LBR preemption
> > + *
> > + * Copyright (C) 2021 Intel Corporation
> > + *
> > + */
> > +
> > +#define _GNU_SOURCEGG
> > +
> > +#include <linux/perf_event.h>
> > +#include <sys/syscall.h>
> > +#include <sys/sysinfo.h>
> > +#include <unistd.h>
> > +
> > +#include "test_util.h"
> > +#include "kvm_util.h"
> > +#include "processor.h"
> > +
> > +static void create_perf_events(int *fds, int cpu_num, bool pinned) {
> > +	struct perf_event_attr attr = {
> > +		.type = PERF_TYPE_HARDWARE,
> > +		.size = sizeof(attr),
> > +		.config = PERF_COUNT_HW_CPU_CYCLES,
> > +		.sample_type = PERF_SAMPLE_BRANCH_STACK,
> > +		.sample_period = 1000,
> > +		.pinned = pinned,
> > +		.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
> > +				      PERF_SAMPLE_BRANCH_USER |
> > +				      PERF_SAMPLE_BRANCH_KERNEL,
> > +	};
> > +	int i;
> > +
> > +	for (i = 0; i < cpu_num; i++) {
> > +		fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1,
> > +PERF_FLAG_FD_CLOEXEC);
> 
> Which field can point to the generation of a "per cpu pinned" event ?
> More comments are required.
Yes, I should add more comments. This function creates either a pinned or a flexible event, controlled by the input parameter "bool pinned".
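For reference, the distinction comes from the perf_event_open() arguments rather than a single dedicated field; a comment along these lines could make it explicit (a sketch, not part of the patch):

	/*
	 * pid == -1 and cpu == i open an event that counts all tasks on
	 * cpu i, i.e. a per-cpu event; attr.pinned selects pinned vs.
	 * flexible scheduling.
	 */
	fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1, PERF_FLAG_FD_CLOEXEC);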
> 
> > +		TEST_ASSERT(fds[i] != -1, "Failed to create lbr event on cpu%d",
> i);
> > +	}
> > +}
> > +
> > +static void release_perf_events(int *fds, int cpu_num) {
> > +	int i;
> > +
> > +	for (i = 0; i < cpu_num; i++)
> > +		close(fds[i]);
> > +}
> > +
> > +#define PERF_CAP_LBR_FMT_MASK  0x1F
> > +
> > +#define LBR_NOT_SUPPORTED  0xFFFE
> > +#define LBR_MSR_WRITE_ERROR 0xFFFD
> > +
> > +#define LBR_MODE_CHECK_PASS 0x0
> > +#define LBR_MSR_WRITE_SUCC  0x1
> > +
> > +static bool check_lbr_msr(void)
> > +{
> > +	uint64_t v, old_val;
> > +
> > +	old_val = rdmsr(MSR_LBR_TOS);
> 
> Why focus only on MSR_LBR_TOS ?
MSR_LBR_FROMx/TOx could be used as well; I chose TOS for no particular reason.
> 
> > +
> > +	v  = old_val ^ 0x3UL;
> > +
> > +	wrmsr(MSR_LBR_TOS, v);
> > +	if (rdmsr(MSR_LBR_TOS) != v)
> > +		return false;
> > +
> > +	wrmsr(MSR_LBR_TOS, old_val);
> > +	if (rdmsr(MSR_LBR_TOS) != old_val)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static void guest_code(void)
> > +{
> > +	uint64_t v;
> > +
> > +	v = rdmsr(MSR_IA32_PERF_CAPABILITIES);
> > +	if ((v & PERF_CAP_LBR_FMT_MASK) == 0)
> > +		GUEST_SYNC(LBR_NOT_SUPPORTED);
> > +
> > +	GUEST_SYNC(LBR_MODE_CHECK_PASS);
> > +
> > +	while (1) {
> > +		if (!check_lbr_msr()) {
> > +			GUEST_SYNC(LBR_MSR_WRITE_ERROR);
> > +			continue;
> > +		}
> > +
> > +		/* Enable LBR to avoid KVM recyling LBR. */
> > +		 v = rdmsr(MSR_IA32_DEBUGCTLMSR);
> > +		 v |= DEBUGCTLMSR_LBR;
> > +		 wrmsr(MSR_IA32_DEBUGCTLMSR, v);
> > +
> > +		GUEST_SYNC(LBR_MSR_WRITE_SUCC);
> > +	}
> > +}
> > +
> > +int main(int argc, char *argv[])
> > +{
> > +	int *fds, ncpus;
> > +	struct kvm_vcpu *vcpu;
> > +	struct kvm_vm *vm;
> > +	uint64_t r;
> > +
> > +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> > +	TEST_REQUIRE(host_cpu_is_intel);
> > +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION));
> > +
> > +	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MODE_CHECK_PASS,
> > +		    "LBR format in guest PERF_CAP msr isn't correct");
> > +
> > +	ncpus = get_nprocs();
> 
> Could we limit the test to a specific cpu, since it will affect the load on other
> cpus?
Yes, if the selftest or the vcpu could be bound to a specific cpu, only one perf event would need to be created on the target cpu. But I don't know how to specify the cpu.
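One possible way (an untested sketch, not part of this patch) is to pin the selftest task with sched_setaffinity() before creating the events, so the vcpu thread and the perf event share a single cpu:

	#define _GNU_SOURCE
	#include <sched.h>

	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	TEST_ASSERT(sched_setaffinity(0, sizeof(set), &set) == 0,
		    "Failed to pin the test to cpu0");

	/* With the task pinned, a single event on cpu0 is enough. */
	fds[0] = syscall(__NR_perf_event_open, &attr, -1, 0, -1,
			 PERF_FLAG_FD_CLOEXEC);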
> 
> > +	fds = malloc(sizeof(int) * ncpus);
> > +	TEST_ASSERT(fds != NULL, "Failed to create fds for all cpus");
> > +
> > +	/* Create per cpu pinned LBR event, then it will own LBR. */
> > +	create_perf_events(fds, ncpus, true);
> > +
> > +	/* Since LBR is owned by per cpu pinned LBR event, guest couldn't get it,
> > +	 * so guest couldn't access LBR_TOS msr.
> > +	 */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> > +		    "1. Unexpected successfully read/write guest LBR_TO msr");
> > +
> > +	release_perf_events(fds, ncpus);
> 
> Obviously there are duplicate calls on release_perf_events() that can be omitted.
> 
> > +
> > +	/* Since per cpu pinned event is closed and LBR is free, guest could get it,
> > +	 * so guest could access LBR_TOS msr.
> > +	 */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> > +		    "2. Failed to read/write guest LBR_TO msr");
> > +
> > +	/* Create per cpu LBR event, its priority is lower than vLBR event, and it
> > +	 *  couldn't get LBR back from vLBR
> > +	 */
> > +	create_perf_events(fds, ncpus, false);
> > +
> > +	/* LBR is still owned by guest, So guest could access LBR_TOS
> successfully. */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> > +		    "3. Failed read/write guest LBR_TO msr");
> > +
> > +	release_perf_events(fds, ncpus);
> > +
> > +	/* Create per cpu pinned LBR event, its priority is higher than vLBR event,
> > +	 * so it will get LBR back from vLBR.
> > +	 */
> > +	create_perf_events(fds, ncpus, true);
> > +
> > +	/* LBR is preepmted by per cpu pinned LBR event, guest couldn't access
> > +	 * LBR_TOS msr.
> > +	 */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> > +		    "4. Unexpected successfully read/write guest LBR_TO msr");
> > +
> > +	release_perf_events(fds, ncpus);
> 
>   Why not add more tests to cover all possibilities ?
> 
> 	per cpu pinned event
> 	per process pinned event
> 	per cpu event
> 	per process event
Per-cpu pinned/flexible events have been covered here.
Per-process, I think, means attaching perf to qemu's process; I will add it, see the sketch below.
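For reference, a per-process (task) event passes a pid and cpu == -1 instead, e.g. attached to the selftest process itself, which hosts the vcpu thread (a sketch using the same attr as above):

	/* Per-process flexible event: follows the target task across cpus. */
	fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1,
		     PERF_FLAG_FD_CLOEXEC);
	TEST_ASSERT(fd != -1, "Failed to create per-process lbr event");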

Anyway, without the previous commits vLBR has the highest priority, so these test results would change a lot.
> 
> > +
> > +	kvm_vm_free(vm);
> > +
> > +	free(fds);
> > +
> > +	return 0;
> > +}
Zhang, Xiong Y June 29, 2023, 2:52 a.m. UTC | #4
> 
> What kind of issues you expect this selftest to find?
> 
> IMO, it verifies the generic perf schedule rules but cannot find specific issues.
The current vLBR event breaks the generic perf scheduling rules, so I wrote the fix commits and this selftest to avoid breaking them again in the future.
> 
> e.g., whether the LBR data is corrupted in some cases. If the selftest can verify
> whether
> 
> guest/host data is maintained correctly in a high contention env., that can be
> better to
> 
> sever the purpose of selftest.
Once the vLBR event is preempted, I see a design gap and the guest may get wrong data; fixing that gap and verifying the result is out of this commit's scope.
I should add this to the commit message.

thanks
> 
> 
> On 6/16/2023 7:33 PM, Xiong Zhang wrote:
> > When guest access LBR msr at the first time, kvm will create a vLBR
> > event, vLBR event joins perf scheduler and occupy physical LBR for guest usage.
> > Once vLBR event is active and own LBR, guest could access LBR msr.
> >
> > But vLBR event is per process pinned event, perf has higher priority event:
> > per cpu pinned LBR event, perf has lower priority events also: per cpu
> > LBR event and per process LBR event.
> > So if host doesn't have higher priority per cpu pinned LBR event, vLBR
> > event could occupy physical LBR always. But once per cpu pinned LBR
> > event is active, vLBR event couldn't be active anymore, then guest
> > couldn't access LBR msr.
> >
> > This commit adds test case to cover guest and host lbr contend.
> >
> > Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
> > ---
> >   tools/testing/selftests/kvm/Makefile          |   1 +
> >   .../selftests/kvm/include/ucall_common.h      |  17 ++
> >   .../kvm/x86_64/pmu_event_filter_test.c        |  16 --
> >   .../kvm/x86_64/vmx_pmu_lbr_contend.c          | 171 ++++++++++++++++++
> >   4 files changed, 189 insertions(+), 16 deletions(-)
> >   create mode 100644
> > tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> >
> > diff --git a/tools/testing/selftests/kvm/Makefile
> > b/tools/testing/selftests/kvm/Makefile
> > index 4761b768b773..422bbc16ba2a 100644
> > --- a/tools/testing/selftests/kvm/Makefile
> > +++ b/tools/testing/selftests/kvm/Makefile
> > @@ -100,6 +100,7 @@ TEST_GEN_PROGS_x86_64 +=
> x86_64/vmx_dirty_log_test
> >   TEST_GEN_PROGS_x86_64 +=
> x86_64/vmx_exception_with_invalid_guest_state
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
> > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_lbr_contend
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> >   TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> > diff --git a/tools/testing/selftests/kvm/include/ucall_common.h
> > b/tools/testing/selftests/kvm/include/ucall_common.h
> > index 1a6aaef5ccae..c1bb0cacf390 100644
> > --- a/tools/testing/selftests/kvm/include/ucall_common.h
> > +++ b/tools/testing/selftests/kvm/include/ucall_common.h
> > @@ -35,6 +35,23 @@ void ucall(uint64_t cmd, int nargs, ...);
> >   uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
> >   void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
> >
> > +/*
> > + * Run the VM to the next GUEST_SYNC(value), and return the value
> > +passed
> > + * to the sync. Any other exit from the guest is fatal.
> > + */
> > +static inline uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) {
> > +	struct ucall uc;
> > +
> > +	vcpu_run(vcpu);
> > +	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > +	get_ucall(vcpu, &uc);
> > +	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> > +		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> > +
> > +	return uc.args[1];
> > +}
> > +
> >   /*
> >    * Perform userspace call without any associated data.  This bare call avoids
> >    * allocating a ucall struct, which can be useful if the atomic
> > operations in diff --git
> > a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > index 40507ed9fe8a..8c68029cfb4b 100644
> > --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > @@ -177,22 +177,6 @@ static void amd_guest_code(void)
> >   	}
> >   }
> >
> > -/*
> > - * Run the VM to the next GUEST_SYNC(value), and return the value
> > passed
> > - * to the sync. Any other exit from the guest is fatal.
> > - */
> > -static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) -{
> > -	struct ucall uc;
> > -
> > -	vcpu_run(vcpu);
> > -	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > -	get_ucall(vcpu, &uc);
> > -	TEST_ASSERT(uc.cmd == UCALL_SYNC,
> > -		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> > -	return uc.args[1];
> > -}
> > -
> >   static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
> >   {
> >   	uint64_t r;
> > diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > new file mode 100644
> > index 000000000000..a6a793f08515
> > --- /dev/null
> > +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > @@ -0,0 +1,171 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Test for host and guest LBR preemption
> > + *
> > + * Copyright (C) 2021 Intel Corporation
> > + *
> > + */
> > +
> > +#define _GNU_SOURCEGG
> > +
> > +#include <linux/perf_event.h>
> > +#include <sys/syscall.h>
> > +#include <sys/sysinfo.h>
> > +#include <unistd.h>
> > +
> > +#include "test_util.h"
> > +#include "kvm_util.h"
> > +#include "processor.h"
> > +
> > +static void create_perf_events(int *fds, int cpu_num, bool pinned) {
> > +	struct perf_event_attr attr = {
> > +		.type = PERF_TYPE_HARDWARE,
> > +		.size = sizeof(attr),
> > +		.config = PERF_COUNT_HW_CPU_CYCLES,
> > +		.sample_type = PERF_SAMPLE_BRANCH_STACK,
> > +		.sample_period = 1000,
> > +		.pinned = pinned,
> > +		.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
> > +				      PERF_SAMPLE_BRANCH_USER |
> > +				      PERF_SAMPLE_BRANCH_KERNEL,
> > +	};
> > +	int i;
> > +
> > +	for (i = 0; i < cpu_num; i++) {
> > +		fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1,
> PERF_FLAG_FD_CLOEXEC);
> > +		TEST_ASSERT(fds[i] != -1, "Failed to create lbr event on cpu%d",
> i);
> > +	}
> > +}
> > +
> > +static void release_perf_events(int *fds, int cpu_num) {
> > +	int i;
> > +
> > +	for (i = 0; i < cpu_num; i++)
> > +		close(fds[i]);
> > +}
> > +
> > +#define PERF_CAP_LBR_FMT_MASK  0x1F
> > +
> > +#define LBR_NOT_SUPPORTED  0xFFFE
> > +#define LBR_MSR_WRITE_ERROR 0xFFFD
> > +
> > +#define LBR_MODE_CHECK_PASS 0x0
> > +#define LBR_MSR_WRITE_SUCC  0x1
> > +
> > +static bool check_lbr_msr(void)
> > +{
> > +	uint64_t v, old_val;
> > +
> > +	old_val = rdmsr(MSR_LBR_TOS);
> > +
> > +	v  = old_val ^ 0x3UL;
> > +
> > +	wrmsr(MSR_LBR_TOS, v);
> > +	if (rdmsr(MSR_LBR_TOS) != v)
> > +		return false;
> > +
> > +	wrmsr(MSR_LBR_TOS, old_val);
> > +	if (rdmsr(MSR_LBR_TOS) != old_val)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static void guest_code(void)
> > +{
> > +	uint64_t v;
> > +
> > +	v = rdmsr(MSR_IA32_PERF_CAPABILITIES);
> > +	if ((v & PERF_CAP_LBR_FMT_MASK) == 0)
> > +		GUEST_SYNC(LBR_NOT_SUPPORTED);
> > +
> > +	GUEST_SYNC(LBR_MODE_CHECK_PASS);
> > +
> > +	while (1) {
> > +		if (!check_lbr_msr()) {
> > +			GUEST_SYNC(LBR_MSR_WRITE_ERROR);
> > +			continue;
> > +		}
> > +
> > +		/* Enable LBR to avoid KVM recyling LBR. */
> > +		 v = rdmsr(MSR_IA32_DEBUGCTLMSR);
> > +		 v |= DEBUGCTLMSR_LBR;
> > +		 wrmsr(MSR_IA32_DEBUGCTLMSR, v);
> > +
> > +		GUEST_SYNC(LBR_MSR_WRITE_SUCC);
> > +	}
> > +}
> > +
> > +int main(int argc, char *argv[])
> > +{
> > +	int *fds, ncpus;
> > +	struct kvm_vcpu *vcpu;
> > +	struct kvm_vm *vm;
> > +	uint64_t r;
> > +
> > +	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> > +	TEST_REQUIRE(host_cpu_is_intel);
> > +	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION));
> > +
> > +	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MODE_CHECK_PASS,
> > +		    "LBR format in guest PERF_CAP msr isn't correct");
> > +
> > +	ncpus = get_nprocs();
> > +	fds = malloc(sizeof(int) * ncpus);
> > +	TEST_ASSERT(fds != NULL, "Failed to create fds for all cpus");
> > +
> > +	/* Create per cpu pinned LBR event, then it will own LBR. */
> > +	create_perf_events(fds, ncpus, true);
> > +
> > +	/* Since LBR is owned by per cpu pinned LBR event, guest couldn't get it,
> > +	 * so guest couldn't access LBR_TOS msr.
> > +	 */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> > +		    "1. Unexpected successfully read/write guest LBR_TO msr");
> > +
> > +	release_perf_events(fds, ncpus);
> > +
> > +	/* Since per cpu pinned event is closed and LBR is free, guest could get it,
> > +	 * so guest could access LBR_TOS msr.
> > +	 */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> > +		    "2. Failed to read/write guest LBR_TO msr");
> > +
> > +	/* Create per cpu LBR event, its priority is lower than vLBR event, and it
> > +	 *  couldn't get LBR back from vLBR
> > +	 */
> > +	create_perf_events(fds, ncpus, false);
> > +
> > +	/* LBR is still owned by guest, So guest could access LBR_TOS
> successfully. */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> > +		    "3. Failed read/write guest LBR_TO msr");
> > +
> > +	release_perf_events(fds, ncpus);
> > +
> > +	/* Create per cpu pinned LBR event, its priority is higher than vLBR event,
> > +	 * so it will get LBR back from vLBR.
> > +	 */
> > +	create_perf_events(fds, ncpus, true);
> > +
> > +	/* LBR is preepmted by per cpu pinned LBR event, guest couldn't access
> > +	 * LBR_TOS msr.
> > +	 */
> > +	r = run_vcpu_to_sync(vcpu);
> > +	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> > +		    "4. Unexpected successfully read/write guest LBR_TO msr");
> > +
> > +	release_perf_events(fds, ncpus);
> > +
> > +	kvm_vm_free(vm);
> > +
> > +	free(fds);
> > +
> > +	return 0;
> > +}
Yang, Weijiang June 30, 2023, 2:05 a.m. UTC | #5
On 6/29/2023 10:52 AM, Zhang, Xiong Y wrote:
>> What kind of issues you expect this selftest to find?
>>
>> IMO, it verifies the generic perf schedule rules but cannot find specific issues.
> Current vLBR event break the generic perf schedule rule. So I write the fix commits and selftest to avoid future broken again.

OK, I think you need to refine the assert failure messages to give users a
straightforward message showing that something is wrong, and stop the
following tests since something is broken; there is no need to trigger more
errors. E.g.,

+	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
+		    "1. Unexpected successfully read/write guest LBR_TO msr");


"1. Unexpected successfully read/write guest LBR_TO msr"
=> "Host LBR ON: Detected unexpected results when write guest vLBR MSRs. Stop testing."

Then at the end of the tests you can print a success message showing that perf/LBR
is working as expected. This way, testers get a clear indication of the result.
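E.g., something along these lines at the end of main() (a sketch):

	pr_info("Host and guest LBR contention tests passed\n");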

>> e.g., whether the LBR data is corrupted in some cases. If the selftest can verify
>> whether
>>
>> guest/host data is maintained correctly in a high contention env., that can be
>> better to
>>
>> sever the purpose of selftest.
> Once vLBR event is preempted, I see designing gap and guest should get wrong data, it is out of this commits scope to fix the gap and to verify the result.
> I should add this into commit message.

Yes, refine the change log of this patch to state the purpose of this app
clearly, so that users know why it is needed.

>
> thanks
>>
>> On 6/16/2023 7:33 PM, Xiong Zhang wrote:
>>> When guest access LBR msr at the first time, kvm will create a vLBR
>>> event, vLBR event joins perf scheduler and occupy physical LBR for guest usage.
>>> Once vLBR event is active and own LBR, guest could access LBR msr.
>>>
>>> But vLBR event is per process pinned event, perf has higher priority event:
>>> per cpu pinned LBR event, perf has lower priority events also: per cpu
>>> LBR event and per process LBR event.
>>> So if host doesn't have higher priority per cpu pinned LBR event, vLBR
>>> event could occupy physical LBR always. But once per cpu pinned LBR
>>> event is active, vLBR event couldn't be active anymore, then guest
>>> couldn't access LBR msr.
>>>
>>> This commit adds test case to cover guest and host lbr contend.
>>>
>>> Signed-off-by: Xiong Zhang <xiong.y.zhang@intel.com>
>>> ---
>>>    tools/testing/selftests/kvm/Makefile          |   1 +
>>>    .../selftests/kvm/include/ucall_common.h      |  17 ++
[...]

Patch

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 4761b768b773..422bbc16ba2a 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -100,6 +100,7 @@  TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_lbr_contend
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h
index 1a6aaef5ccae..c1bb0cacf390 100644
--- a/tools/testing/selftests/kvm/include/ucall_common.h
+++ b/tools/testing/selftests/kvm/include/ucall_common.h
@@ -35,6 +35,23 @@  void ucall(uint64_t cmd, int nargs, ...);
 uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
 void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
 
+/*
+ * Run the VM to the next GUEST_SYNC(value), and return the value passed
+ * to the sync. Any other exit from the guest is fatal.
+ */
+static inline uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+	get_ucall(vcpu, &uc);
+	TEST_ASSERT(uc.cmd == UCALL_SYNC,
+		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
+
+	return uc.args[1];
+}
+
 /*
  * Perform userspace call without any associated data.  This bare call avoids
  * allocating a ucall struct, which can be useful if the atomic operations in
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index 40507ed9fe8a..8c68029cfb4b 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -177,22 +177,6 @@  static void amd_guest_code(void)
 	}
 }
 
-/*
- * Run the VM to the next GUEST_SYNC(value), and return the value passed
- * to the sync. Any other exit from the guest is fatal.
- */
-static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
-{
-	struct ucall uc;
-
-	vcpu_run(vcpu);
-	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
-	get_ucall(vcpu, &uc);
-	TEST_ASSERT(uc.cmd == UCALL_SYNC,
-		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
-	return uc.args[1];
-}
-
 static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
 {
 	uint64_t r;
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
new file mode 100644
index 000000000000..a6a793f08515
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
@@ -0,0 +1,171 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for host and guest LBR preemption
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ */
+
+#define _GNU_SOURCE
+
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/sysinfo.h>
+#include <unistd.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+static void create_perf_events(int *fds, int cpu_num, bool pinned)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.size = sizeof(attr),
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type = PERF_SAMPLE_BRANCH_STACK,
+		.sample_period = 1000,
+		.pinned = pinned,
+		.branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+				      PERF_SAMPLE_BRANCH_USER |
+				      PERF_SAMPLE_BRANCH_KERNEL,
+	};
+	int i;
+
+	for (i = 0; i < cpu_num; i++) {
+		fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1, PERF_FLAG_FD_CLOEXEC);
+		TEST_ASSERT(fds[i] != -1, "Failed to create lbr event on cpu%d", i);
+	}
+}
+
+static void release_perf_events(int *fds, int cpu_num)
+{
+	int i;
+
+	for (i = 0; i < cpu_num; i++)
+		close(fds[i]);
+}
+
+#define PERF_CAP_LBR_FMT_MASK  0x1F
+
+#define LBR_NOT_SUPPORTED  0xFFFE
+#define LBR_MSR_WRITE_ERROR 0xFFFD
+
+#define LBR_MODE_CHECK_PASS 0x0
+#define LBR_MSR_WRITE_SUCC  0x1
+
+static bool check_lbr_msr(void)
+{
+	uint64_t v, old_val;
+
+	old_val = rdmsr(MSR_LBR_TOS);
+
+	v  = old_val ^ 0x3UL;
+
+	wrmsr(MSR_LBR_TOS, v);
+	if (rdmsr(MSR_LBR_TOS) != v)
+		return false;
+
+	wrmsr(MSR_LBR_TOS, old_val);
+	if (rdmsr(MSR_LBR_TOS) != old_val)
+		return false;
+
+	return true;
+}
+
+static void guest_code(void)
+{
+	uint64_t v;
+
+	v = rdmsr(MSR_IA32_PERF_CAPABILITIES);
+	if ((v & PERF_CAP_LBR_FMT_MASK) == 0)
+		GUEST_SYNC(LBR_NOT_SUPPORTED);
+
+	GUEST_SYNC(LBR_MODE_CHECK_PASS);
+
+	while (1) {
+		if (!check_lbr_msr()) {
+			GUEST_SYNC(LBR_MSR_WRITE_ERROR);
+			continue;
+		}
+
+		/* Enable LBR to avoid KVM recycling LBR. */
+		 v = rdmsr(MSR_IA32_DEBUGCTLMSR);
+		 v |= DEBUGCTLMSR_LBR;
+		 wrmsr(MSR_IA32_DEBUGCTLMSR, v);
+
+		GUEST_SYNC(LBR_MSR_WRITE_SUCC);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	int *fds, ncpus;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	uint64_t r;
+
+	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
+	TEST_REQUIRE(host_cpu_is_intel);
+	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION));
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	r = run_vcpu_to_sync(vcpu);
+	TEST_ASSERT(r == LBR_MODE_CHECK_PASS,
+		    "LBR format in guest PERF_CAP msr isn't correct");
+
+	ncpus = get_nprocs();
+	fds = malloc(sizeof(int) * ncpus);
+	TEST_ASSERT(fds != NULL, "Failed to create fds for all cpus");
+
+	/* Create per cpu pinned LBR event, then it will own LBR. */
+	create_perf_events(fds, ncpus, true);
+
+	/* Since LBR is owned by per cpu pinned LBR event, guest couldn't get it,
+	 * so guest couldn't access LBR_TOS msr.
+	 */
+	r = run_vcpu_to_sync(vcpu);
+	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
+		    "1. Unexpected successfully read/write guest LBR_TO msr");
+
+	release_perf_events(fds, ncpus);
+
+	/* Since per cpu pinned event is closed and LBR is free, guest could get it,
+	 * so guest could access LBR_TOS msr.
+	 */
+	r = run_vcpu_to_sync(vcpu);
+	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
+		    "2. Failed to read/write guest LBR_TO msr");
+
+	/* Create per cpu LBR event, its priority is lower than vLBR event, and it
+	 *  couldn't get LBR back from vLBR
+	 */
+	create_perf_events(fds, ncpus, false);
+
+	/* LBR is still owned by guest, So guest could access LBR_TOS successfully. */
+	r = run_vcpu_to_sync(vcpu);
+	TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
+		    "3. Failed read/write guest LBR_TO msr");
+
+	release_perf_events(fds, ncpus);
+
+	/* Create per cpu pinned LBR event, its priority is higher than vLBR event,
+	 * so it will get LBR back from vLBR.
+	 */
+	create_perf_events(fds, ncpus, true);
+
+	/* LBR is preempted by per cpu pinned LBR event, guest couldn't access
+	 * LBR_TOS msr.
+	 */
+	r = run_vcpu_to_sync(vcpu);
+	TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
+		    "4. Unexpected successfully read/write guest LBR_TO msr");
+
+	release_perf_events(fds, ncpus);
+
+	kvm_vm_free(vm);
+
+	free(fds);
+
+	return 0;
+}