diff mbox

[v1,4/4] KVM/vmx: enable lbr for the guest

Message ID 1506314696-4632-5-git-send-email-wei.w.wang@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wang, Wei W Sept. 25, 2017, 4:44 a.m. UTC
Passthrough the LBR stack to the guest, and auto switch the stack MSRs
upon VMEntry and VMExit.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
---
 arch/x86/kvm/vmx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

Comments

Paolo Bonzini Sept. 25, 2017, 9:16 a.m. UTC | #1
On 25/09/2017 06:44, Wei Wang wrote:
> Passthrough the LBR stack to the guest, and auto switch the stack MSRs
> upon VMEntry and VMExit.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>

This has to be enabled separately for each guest, because it may prevent
live migration to hosts with a different family/model.

Paolo

> ---
>  arch/x86/kvm/vmx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 50 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 5f5c2f1..35e02a7 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -107,6 +107,9 @@ static u64 __read_mostly host_xss;
>  static bool __read_mostly enable_pml = 1;
>  module_param_named(pml, enable_pml, bool, S_IRUGO);
>  
> +static bool __read_mostly enable_lbrv = 1;
> +module_param_named(lbrv, enable_lbrv, bool, 0444);
> +
>  #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
>  
>  /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
> @@ -5428,6 +5431,25 @@ static void ept_set_mmio_spte_mask(void)
>  				   VMX_EPT_MISCONFIG_WX_VALUE);
>  }
>  
> +static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
> +{
> +	int i;
> +	struct perf_lbr_stack lbr_stack;
> +
> +	perf_get_lbr_stack(&lbr_stack);
> +
> +	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
> +	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
> +
> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
> +		if (lbr_stack.lbr_info)
> +			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
> +					      0);
> +	}
> +}
> +
>  #define VMX_XSS_EXIT_BITMAP 0
>  /*
>   * Sets up the vmcs for emulated real mode.
> @@ -5508,6 +5530,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  
>  	add_atomic_switch_msr(vmx, MSR_IA32_DEBUGCTLMSR, 0, 0);
>  
> +	if (enable_lbrv)
> +		auto_switch_lbr_msrs(vmx);
> +
>  	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
>  		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
>  
> @@ -6721,6 +6746,28 @@ void vmx_enable_tdp(void)
>  	kvm_enable_tdp();
>  }
>  
> +static void vmx_passthrough_lbr_msrs(void)
> +{
> +	int i;
> +	struct perf_lbr_stack lbr_stack;
> +
> +	if (perf_get_lbr_stack(&lbr_stack) < 0) {
> +		enable_lbrv = false;
> +		return;
> +	}
> +
> +	vmx_disable_intercept_for_msr(MSR_LBR_SELECT, false);
> +	vmx_disable_intercept_for_msr(lbr_stack.lbr_tos, false);
> +
> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
> +		vmx_disable_intercept_for_msr(lbr_stack.lbr_from + i, false);
> +		vmx_disable_intercept_for_msr(lbr_stack.lbr_to + i, false);
> +		if (lbr_stack.lbr_info)
> +			vmx_disable_intercept_for_msr(lbr_stack.lbr_info + i,
> +						      false);
> +	}
> +}
> +
>  static __init int hardware_setup(void)
>  {
>  	int r = -ENOMEM, i, msr;
> @@ -6822,6 +6869,9 @@ static __init int hardware_setup(void)
>  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
>  	vmx_disable_intercept_for_msr(MSR_IA32_DEBUGCTLMSR, false);
>  
> +	if (enable_lbrv)
> +		vmx_passthrough_lbr_msrs();
> +
>  	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
>  			vmx_msr_bitmap_legacy, PAGE_SIZE);
>  	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
>
Wang, Wei W Sept. 25, 2017, 12:57 p.m. UTC | #2
On 09/25/2017 05:16 PM, Paolo Bonzini wrote:
> On 25/09/2017 06:44, Wei Wang wrote:
>> Passthrough the LBR stack to the guest, and auto switch the stack MSRs
>> upon VMEntry and VMExit.
>>
>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> This has to be enabled separately for each guest, because it may prevent
> live migration to hosts with a different family/model.

Did you mean trapping MSR_IA32_DEBUGCTLMSR instead of passing it through?
In that case, we would also need to modify the kernel driver (i.e. the PMI
handler) to check MSR_IA32_DEBUGCTLMSR before reading the LBR MSRs. Then the
guest driver can notice that the feature in use has been disabled after live
migration.

This kind of live migration disables features that are being used. Would it
be common in real usage to migrate between different CPU models?

I think this issue isn't specific to the LBR feature. May I know how other
features would be handled in this case? Thanks.

On the other hand, an alternative approach that comes to mind is that we
can do some kind of feature negotiation at the very beginning of migration,
and fail the live migration if the feature negotiation fails.


Best,
Wei
Andi Kleen Sept. 25, 2017, 2:57 p.m. UTC | #3
> +static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
> +{
> +	int i;
> +	struct perf_lbr_stack lbr_stack;
> +
> +	perf_get_lbr_stack(&lbr_stack);
> +
> +	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
> +	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
> +
> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
> +		if (lbr_stack.lbr_info)
> +			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
> +					      0);
> +	}

That will be really expensive and add a lot of overhead to every entry/exit.
perf can already context switch the LBRs on task context switch. With that
you can just switch LBR_SELECT, which is *much* cheaper because there
are far fewer context switches than exits/entries.

It implies that when KVM is running it needs to prevent perf from enabling
LBRs in the context of KVM, but that should be straightforward.

-Andi
Wang, Wei W Sept. 26, 2017, 8:56 a.m. UTC | #4
On 09/25/2017 10:57 PM, Andi Kleen wrote:
>> +static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
>> +{
>> +	int i;
>> +	struct perf_lbr_stack lbr_stack;
>> +
>> +	perf_get_lbr_stack(&lbr_stack);
>> +
>> +	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
>> +	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
>> +
>> +	for (i = 0; i < lbr_stack.lbr_nr; i++) {
>> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
>> +		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
>> +		if (lbr_stack.lbr_info)
>> +			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
>> +					      0);
>> +	}
> That will be really expensive and add a lot of overhead to every entry/exit.
> perf can already context switch the LBRs on task context switch. With that
> you can just switch LBR_SELECT, which is *much* cheaper because there
> are far less context switches than exit/entries.
>
> It implies that when KVM is running it needs to prevent perf from enabling
> LBRs in the context of KVM, but that should be straight forward.

I kind of have a different thought here:

1) vCPU context switching and guest-side task switching are not identical.
That is, when the vCPU is scheduled out, the guest task on the vCPU may not
have used up its time slice yet, so the task will continue to run when the
vCPU is scheduled in by the host (the LBR state wasn't saved by the guest
task when the vCPU was scheduled out in this case).

It is possible for the vCPU that runs the guest task (which is using LBR)
to be scheduled out, followed by a new host task being scheduled in to run
on the pCPU. It is not guaranteed that the new host task does not use the
LBR feature on the pCPU.

2) Sometimes, people may want this usage:
"perf record -b ./qemu-system-x86_64 ...",
which will need LBR to be used in KVM as well.


I think one possible optimization would be to add the LBR MSRs to the
auto-switch list when the guest requests to enable the feature, and remove
them when the feature is disabled. This would require trapping guest
accesses to MSR_IA32_DEBUGCTLMSR.


Best,
Wei
Andi Kleen Sept. 26, 2017, 4:41 p.m. UTC | #5
> 1) vCPU context switching and guest side task switching are not identical.
> That is, when the vCPU is scheduled out, the guest task on the vCPU may not

guest task lifetime has nothing to do with this. It's completely independent
of what you do here on the VCPU level.

> run out its time slice yet, so the task will continue to run when the vCPU
> is
> scheduled in by the host (lbr wasn't save by the guest task when the vCPU is
> scheduled out in this case).
> 
> It is possible to have the vCPU which runs the guest task (in use of lbr)
> scheduled
> out, followed by a new host task being scheduled in on the pCPU to run.
> It is not guaranteed that the new host task does not use the LBR feature on
> the
> pCPU.

Sure it may use the LBR, and the normal perf context switch
will switch it and everything works fine.

It's like any other per-task LBR user.

> 
> 2) Sometimes, people may want this usage: "perf record -b
> ./qemu-system-x86_64 ...",
> which will need lbr to be used in KVM as well.

In this obscure case you can disable LBR support for the guest.
The common case is far more important.

It sounds like you didn't do any performance measurements.
I expect the performance of your current solution to be terrible.

e.g. a normal perf PMI does at least 1 MSR read and 4+ MSR writes
for a single counter. With multiple counters it gets worse.

For each of those you'll need to exit. Adding something
to the entry/exit list is similar to the cost of doing 
explicit RD/WRMSRs.

On Skylake we have 32*3=96 MSRs for the LBRs.

So with the 5 exits and entries, you're essentially doing
5*2*96=960 extra MSR accesses for each PMI.

MSR access is 100+ cycles at least, for writes it is far more
expensive.

-Andi
Wang, Wei W Sept. 27, 2017, 1:27 a.m. UTC | #6
On 09/27/2017 12:41 AM, Andi Kleen wrote:
>> 1) vCPU context switching and guest side task switching are not identical.
>> That is, when the vCPU is scheduled out, the guest task on the vCPU may not
> guest task lifetime has nothing to do with this. It's completely independent
> of what you do here on the VCPU level.
>
>> run out its time slice yet, so the task will continue to run when the vCPU
>> is
>> scheduled in by the host (lbr wasn't save by the guest task when the vCPU is
>> scheduled out in this case).
>>
>> It is possible to have the vCPU which runs the guest task (in use of lbr)
>> scheduled
>> out, followed by a new host task being scheduled in on the pCPU to run.
>> It is not guaranteed that the new host task does not use the LBR feature on
>> the
>> pCPU.
> Sure it may use the LBR, and the normal perf context switch
> will switch it and everything works fine.
>
> It's like any other per-task LBR user.

OK, I see the point, thanks.

Why couldn't we save the LBR_SELECT via task switching too?


Best,
Wei
diff mbox

Patch

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5f5c2f1..35e02a7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -107,6 +107,9 @@  static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+static bool __read_mostly enable_lbrv = 1;
+module_param_named(lbrv, enable_lbrv, bool, 0444);
+
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
@@ -5428,6 +5431,25 @@  static void ept_set_mmio_spte_mask(void)
 				   VMX_EPT_MISCONFIG_WX_VALUE);
 }
 
+static void auto_switch_lbr_msrs(struct vcpu_vmx *vmx)
+{
+	int i;
+	struct perf_lbr_stack lbr_stack;
+
+	perf_get_lbr_stack(&lbr_stack);
+
+	add_atomic_switch_msr(vmx, MSR_LBR_SELECT, 0, 0);
+	add_atomic_switch_msr(vmx, lbr_stack.lbr_tos, 0, 0);
+
+	for (i = 0; i < lbr_stack.lbr_nr; i++) {
+		add_atomic_switch_msr(vmx, lbr_stack.lbr_from + i, 0, 0);
+		add_atomic_switch_msr(vmx, lbr_stack.lbr_to + i, 0, 0);
+		if (lbr_stack.lbr_info)
+			add_atomic_switch_msr(vmx, lbr_stack.lbr_info + i, 0,
+					      0);
+	}
+}
+
 #define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
@@ -5508,6 +5530,9 @@  static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	add_atomic_switch_msr(vmx, MSR_IA32_DEBUGCTLMSR, 0, 0);
 
+	if (enable_lbrv)
+		auto_switch_lbr_msrs(vmx);
+
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
@@ -6721,6 +6746,28 @@  void vmx_enable_tdp(void)
 	kvm_enable_tdp();
 }
 
+static void vmx_passthrough_lbr_msrs(void)
+{
+	int i;
+	struct perf_lbr_stack lbr_stack;
+
+	if (perf_get_lbr_stack(&lbr_stack) < 0) {
+		enable_lbrv = false;
+		return;
+	}
+
+	vmx_disable_intercept_for_msr(MSR_LBR_SELECT, false);
+	vmx_disable_intercept_for_msr(lbr_stack.lbr_tos, false);
+
+	for (i = 0; i < lbr_stack.lbr_nr; i++) {
+		vmx_disable_intercept_for_msr(lbr_stack.lbr_from + i, false);
+		vmx_disable_intercept_for_msr(lbr_stack.lbr_to + i, false);
+		if (lbr_stack.lbr_info)
+			vmx_disable_intercept_for_msr(lbr_stack.lbr_info + i,
+						      false);
+	}
+}
+
 static __init int hardware_setup(void)
 {
 	int r = -ENOMEM, i, msr;
@@ -6822,6 +6869,9 @@  static __init int hardware_setup(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_DEBUGCTLMSR, false);
 
+	if (enable_lbrv)
+		vmx_passthrough_lbr_msrs();
+
 	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,