diff mbox series

[v3,09/12] KVM: VMX: Remove vmx->current_tsc_ratio and decache_tsc_multiplier()

Message ID 20210521102449.21505-10-ilstam@amazon.com (mailing list archive)
State New, archived
Headers show
Series KVM: Implement nested TSC scaling | expand

Commit Message

Ilias Stamatis May 21, 2021, 10:24 a.m. UTC
The vmx->current_tsc_ratio field is redundant as
vcpu->arch.tsc_scaling_ratio already tracks the current TSC scaling
ratio. Removing this field makes decache_tsc_multiplier() a one-liner
so remove that too and do a vmcs_write64() directly in order to be more
consistent with surrounding code.

Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
---
 arch/x86/kvm/vmx/nested.c | 9 ++++-----
 arch/x86/kvm/vmx/vmx.c    | 5 ++---
 arch/x86/kvm/vmx/vmx.h    | 8 --------
 3 files changed, 6 insertions(+), 16 deletions(-)

Comments

Maxim Levitsky May 24, 2021, 5:53 p.m. UTC | #1
On Fri, 2021-05-21 at 11:24 +0100, Ilias Stamatis wrote:
> The vmx->current_tsc_ratio field is redundant as
> vcpu->arch.tsc_scaling_ratio already tracks the current TSC scaling
> ratio. Removing this field makes decache_tsc_multiplier() an one-liner
> so remove that too and do a vmcs_write64() directly in order to be more
> consistent with surrounding code.
Not to mention that 'decache_tsc_multiplier' isn't a good name IMHO
for this....


> 
> Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 9 ++++-----
>  arch/x86/kvm/vmx/vmx.c    | 5 ++---
>  arch/x86/kvm/vmx/vmx.h    | 8 --------
>  3 files changed, 6 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 6058a65a6ede..239154d3e4e7 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2533,9 +2533,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>  	}
>  
>  	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
> -
>  	if (kvm_has_tsc_control)
> -		decache_tsc_multiplier(vmx);
> +		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
>  
>  	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
>  
> @@ -4501,12 +4500,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>  	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
>  	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
>  	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
> +	if (kvm_has_tsc_control)
> +		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> +
>  	if (vmx->nested.l1_tpr_threshold != -1)
>  		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
>  
> -	if (kvm_has_tsc_control)
> -		decache_tsc_multiplier(vmx);
> -
>  	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
>  		vmx->nested.change_vmcs01_virtual_apic_mode = false;
>  		vmx_set_virtual_apic_mode(vcpu);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 4b70431c2edd..7c52c697cfe3 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1392,9 +1392,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
>  	}
>  
>  	/* Setup TSC multiplier */
> -	if (kvm_has_tsc_control &&
> -	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
> -		decache_tsc_multiplier(vmx);
> +	if (kvm_has_tsc_control)
> +		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

This might have an overhead of writing the TSC scaling ratio even if
it is unchanged. I haven't measured how expensive vmread/vmwrites are but
at least when nested, the vmreads/vmwrites can be very expensive (if they
cause a vmexit).

This is why I think the 'vmx->current_tsc_ratio' exists - to have
a cached value of TSC scale ratio to avoid either 'vmread'ing
or 'vmwrite'ing it without a need.


Best regards,
	Maxim Levitsky

>  }
>  
>  /*
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index aa97c82e3451..3eaa86a0ba3e 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -322,8 +322,6 @@ struct vcpu_vmx {
>  	/* apic deadline value in host tsc */
>  	u64 hv_deadline_tsc;
>  
> -	u64 current_tsc_ratio;
> -
>  	unsigned long host_debugctlmsr;
>  
>  	/*
> @@ -532,12 +530,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
>  			      GFP_KERNEL_ACCOUNT);
>  }
>  
> -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
> -{
> -	vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
> -	vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
> -}
> -
>  static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
>  {
>  	return vmx->secondary_exec_control &
Sean Christopherson May 24, 2021, 6:44 p.m. UTC | #2
On Mon, May 24, 2021, Maxim Levitsky wrote:
> On Fri, 2021-05-21 at 11:24 +0100, Ilias Stamatis wrote:
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 4b70431c2edd..7c52c697cfe3 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -1392,9 +1392,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
> >  	}
> >  
> >  	/* Setup TSC multiplier */
> > -	if (kvm_has_tsc_control &&
> > -	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
> > -		decache_tsc_multiplier(vmx);
> > +	if (kvm_has_tsc_control)
> > +		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> 
> This might have an overhead of writing the TSC scaling ratio even if
> it is unchanged. I haven't measured how expensive vmread/vmwrites are but
> at least when nested, the vmreads/vmwrites can be very expensive (if they
> cause a vmexit).
> 
> This is why I think the 'vmx->current_tsc_ratio' exists - to have
> a cached value of TSC scale ratio to avoid either 'vmread'ing
> or 'vmwrite'ing it without a need.

Yes, but its existence is a complete hack.  vmx->current_tsc_ratio has the same
scope as vcpu->arch.tsc_scaling_ratio, i.e. vmx == vcpu == vcpu->arch.  Unlike
per-VMCS tracking, it should not be useful, keyword "should".

What I meant by my earlier comment:

  Its use in vmx_vcpu_load_vmcs() is basically "write the VMCS if we forgot to
  earlier", which is all kinds of wrong.

is that vmx_vcpu_load_vmcs() should never write vmcs.TSC_MULTIPLIER.  The correct
behavior is to set the field at VMCS initialization, and then immediately set it
whenever the ratio is changed, e.g. on nested transition, from userspace, etc...
In other words, my unclear feedback was to make it obsolete (and drop it) by 
fixing the underlying mess, not to just drop the optimization hack.
Ilias Stamatis May 25, 2021, 10:41 a.m. UTC | #3
On Mon, 2021-05-24 at 18:44 +0000, Sean Christopherson wrote:
> On Mon, May 24, 2021, Maxim Levitsky wrote:
> > On Fri, 2021-05-21 at 11:24 +0100, Ilias Stamatis wrote:
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index 4b70431c2edd..7c52c697cfe3 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -1392,9 +1392,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
> > >     }
> > > 
> > >     /* Setup TSC multiplier */
> > > -   if (kvm_has_tsc_control &&
> > > -       vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
> > > -           decache_tsc_multiplier(vmx);
> > > +   if (kvm_has_tsc_control)
> > > +           vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> > 
> > This might have an overhead of writing the TSC scaling ratio even if
> > it is unchanged. I haven't measured how expensive vmread/vmwrites are but
> > at least when nested, the vmreads/vmwrites can be very expensive (if they
> > cause a vmexit).
> > 
> > This is why I think the 'vmx->current_tsc_ratio' exists - to have
> > a cached value of TSC scale ratio to avoid either 'vmread'ing
> > or 'vmwrite'ing it without a need.

Right. I thought the overhead might not be that significant since we're doing
lots of vmwrites on vmentry/vmexit anyway, but yeah, why introduce any kind of
extra overhead anyway.

I'm fine with this particular patch getting dropped. It's not directly related 
to the series anyway.

> 
> Yes, but its existence is a complete hack.  vmx->current_tsc_ratio has the same
> scope as vcpu->arch.tsc_scaling_ratio, i.e. vmx == vcpu == vcpu->arch.  Unlike
> per-VMCS tracking, it should not be useful, keyword "should".
> 
> What I meant by my earlier comment:
> 
>   Its use in vmx_vcpu_load_vmcs() is basically "write the VMCS if we forgot to
>   earlier", which is all kinds of wrong.
> 
> is that vmx_vcpu_load_vmcs() should never write vmcs.TSC_MULTIPLIER.  The correct
> behavior is to set the field at VMCS initialization, and then immediately set it
> whenever the ratio is changed, e.g. on nested transition, from userspace, etc...
> In other words, my unclear feedback was to make it obsolete (and drop it) by
> fixing the underlying mess, not to just drop the optimization hack.

I understood this and replied earlier. The right place for the hw multiplier
field to be updated is inside set_tsc_khz() in common code when the ratio
changes. However, this requires adding another vendor callback etc. As all
this is further refactoring I believe it's better to leave this series as is -
ie only touching code that is directly related to nested TSC scaling and not
try to do everything as part of the same series. This makes testing easier
too. We can still implement these changes later.

Thanks,
Ilias
Sean Christopherson May 25, 2021, 3:58 p.m. UTC | #4
On Tue, May 25, 2021, Stamatis, Ilias wrote:
> On Mon, 2021-05-24 at 18:44 +0000, Sean Christopherson wrote:
> > Yes, but its existence is a complete hack.  vmx->current_tsc_ratio has the same
> > scope as vcpu->arch.tsc_scaling_ratio, i.e. vmx == vcpu == vcpu->arch.  Unlike
> > per-VMCS tracking, it should not be useful, keyword "should".
> > 
> > What I meant by my earlier comment:
> > 
> >   Its use in vmx_vcpu_load_vmcs() is basically "write the VMCS if we forgot to
> >   earlier", which is all kinds of wrong.
> > 
> > is that vmx_vcpu_load_vmcs() should never write vmcs.TSC_MULTIPLIER.  The correct
> > behavior is to set the field at VMCS initialization, and then immediately set it
> > whenever the ratio is changed, e.g. on nested transition, from userspace, etc...
> > In other words, my unclear feedback was to make it obsolete (and drop it) by
> > fixing the underlying mess, not to just drop the optimization hack.
> 
> I understood this and replied earlier. The right place for the hw multiplier
> field to be updated is inside set_tsc_khz() in common code when the ratio
> changes. However, this requires adding another vendor callback etc. As all
> this is further refactoring I believe it's better to leave this series as is -
> ie only touching code that is directly related to nested TSC scaling and not
> try to do everything as part of the same series.

But it directly impacts your code, e.g. the nested enter/exit flows would need
to dance around the decache silliness.  And I believe it even more directly
impacts this series: kvm_set_tsc_khz() fails to handle the case where userspace
invokes KVM_SET_TSC_KHZ while L2 is active.

> This makes testing easier too.

Hmm, sort of.  Yes, the fewer patches/modifications in a series definitely makes
the series itself easier to test.  But stepping back and looking at the total
cost of testing, I would argue that punting related changes to a later time
increases the overall cost.  E.g. if someone else picks up the clean up work,
then they have to redo most, if not all, of the testing that you are already
doing, including getting access to the proper hardware, understanding what tests
to prioritize, etc...  Whereas adding one more patch to your series is an
incremental cost since you already have the hardware setup, know which tests to
run, etc...

> We can still implement these changes later.

We can, but we shouldn't.  Simply dropping vmx->current_tsc_ratio is not an
option; it knowingly introduces a (minor) performance regression, for no reason
other than wanting to avoid code churn.  Piling more stuff on top of the flawed
decache logic is impolite, as it adds more work for the person that ends up
doing the cleanup.  I would 100% agree if this were a significant cleanup and/or
completely unrelated, but IMO that's not the case.

Compile tested only...


diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 029c9615378f..34ad7a17458a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -90,6 +90,7 @@ KVM_X86_OP_NULL(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
 KVM_X86_OP(get_l2_tsc_multiplier)
 KVM_X86_OP(write_tsc_offset)
+KVM_X86_OP(write_tsc_multiplier)
 KVM_X86_OP(get_exit_info)
 KVM_X86_OP(check_intercept)
 KVM_X86_OP(handle_exit_irqoff)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f099277b993d..a334ce7741ab 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1308,6 +1308,7 @@ struct kvm_x86_ops {
        u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
        u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+       void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);

        /*
         * Retrieve somewhat arbitrary exit information.  Intended to be used
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b18f60463073..914afcceb46d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1103,6 +1103,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }

+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+       /*
+        * Handled when loading guest state since the ratio is programmed via
+        * MSR_AMD64_TSC_RATIO, not a field in the VMCB.
+        */
+}
+
 /* Evaluate instruction intercepts that depend on guest CPUID features. */
 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
                                              struct vcpu_svm *svm)
@@ -4528,6 +4536,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .get_l2_tsc_offset = svm_get_l2_tsc_offset,
        .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
        .write_tsc_offset = svm_write_tsc_offset,
+       .write_tsc_multiplier = svm_write_tsc_multiplier,

        .load_mmu_pgd = svm_load_mmu_pgd,

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6058a65a6ede..712190493926 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2535,7 +2535,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);

        if (kvm_has_tsc_control)
-               decache_tsc_multiplier(vmx);
+               vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

        nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

@@ -4505,7 +4505,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
                vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);

        if (kvm_has_tsc_control)
-               decache_tsc_multiplier(vmx);
+               vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

        if (vmx->nested.change_vmcs01_virtual_apic_mode) {
                vmx->nested.change_vmcs01_virtual_apic_mode = false;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4b70431c2edd..bf845a08995e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1390,11 +1390,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,

                vmx->loaded_vmcs->cpu = cpu;
        }
-
-       /* Setup TSC multiplier */
-       if (kvm_has_tsc_control &&
-           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
-               decache_tsc_multiplier(vmx);
 }

 /*
@@ -1813,6 +1808,11 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        vmcs_write64(TSC_OFFSET, offset);
...skipping...
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -322,8 +322,6 @@ struct vcpu_vmx {
        /* apic deadline value in host tsc */
        u64 hv_deadline_tsc;

-       u64 current_tsc_ratio;
-
        unsigned long host_debugctlmsr;

        /*
@@ -532,12 +530,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
                              GFP_KERNEL_ACCOUNT);
 }

-static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
-{
-       vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
-       vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-}
-
 static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 {
        return vmx->secondary_exec_control &
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b61b54cea495..690de1868873 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2179,14 +2179,16 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
        return v;
 }

+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu,
+                                         u64 l1_multiplier);
+
 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
        u64 ratio;

        /* Guest TSC same frequency as host TSC? */
        if (!scale) {
-               vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
-               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return 0;
        }

@@ -2212,7 +2214,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                return -1;
        }

-       vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio;
+       kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
        return 0;
 }

@@ -2224,8 +2226,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
        /* tsc_khz can be zero if TSC calibration fails */
        if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
-               vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
-               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
                return -1;
        }

@@ -2383,6 +2384,25 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
        static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 }

+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu,
+                                         u64 l1_multiplier)
+{
+       if (!kvm_has_tsc_control)
+               return;
+
+       vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+       /* Userspace is changing the multiplier while L2 is active... */
+       if (is_guest_mode(vcpu))
+               vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+                       l1_multiplier,
+                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+       else
+               vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+       static_call(kvm_x86_write_tsc_multiplier)(vcpu, vcpu->arch.tsc_scaling_ratio);
+}
+
 static inline bool kvm_check_tsc_unstable(void)
 {
 #ifdef CONFIG_X86_64
Paolo Bonzini May 25, 2021, 4:15 p.m. UTC | #5
On 25/05/21 17:58, Sean Christopherson wrote:
>> The right place for the hw multiplier
>> field to be updated is inside set_tsc_khz() in common code when the ratio
>> changes.

Sort of, the problem is that you have two VMCS's to update.  If properly 
fixed, the cache is useful to fix the issue with KVM_SET_TSC_KHZ needing 
to update both of them.  For that to work, you'd have to move the cache 
to struct loaded_vmcs.

So you can:

1) move the cached tsc_ratio to struct loaded_vmcs

2) add a function in common code (update_tsc_parameters or something 
like that) to update both the offset and the ratio depending on 
is_guest_mode()

3) call that function from nested vmentry/vmexit

And at that point the cache will do its job and figure out whether a 
vmwrite is needed, on both vmentry and vmexit.

I actually like the idea of storing the expected value in kvm_vcpu and 
the current value in loaded_vmcs.  We might use it for other things such 
as reload_vmcs01_apic_access_page perhaps.

Paolo

>> However, this requires adding another vendor callback etc. As all
>> this is further refactoring I believe it's better to leave this series as is -
>> ie only touching code that is directly related to nested TSC scaling and not
>> try to do everything as part of the same series.
> But it directly impacts your code, e.g. the nested enter/exit flows would need
> to dance around the decache silliness.  And I believe it even more directly
> impacts this series: kvm_set_tsc_khz() fails to handle the case where userspace
> invokes KVM_SET_TSC_KHZ while L2 is active.
>
Sean Christopherson May 25, 2021, 4:34 p.m. UTC | #6
On Tue, May 25, 2021, Paolo Bonzini wrote:
> On 25/05/21 17:58, Sean Christopherson wrote:
> > > The right place for the hw multiplier
> > > field to be updated is inside set_tsc_khz() in common code when the ratio
> > > changes.
> 
> Sort of, the problem is that you have two VMCS's to update.  If properly
> fixed, the cache is useful to fix the issue with KVM_SET_TSC_KHZ needing to
> update both of them.  For that to work, you'd have to move the cache to
> struct loaded_vmcs.

vmcs01 and vmcs02 will get updated at enter/exit, if there's no caching then
it all Just Works.

> So you can:
> 
> 1) move the cached tsc_ratio to struct loaded_vmcs
> 
> 2) add a function in common code (update_tsc_parameters or something like
> that) to update both the offset and the ratio depending on is_guest_mode()
> 
> 3) call that function from nested vmentry/vmexit
> 
> And at that point the cache will do its job and figure out whether a vmwrite
> is needed, on both vmentry and vmexit.
> 
> I actually like the idea of storing the expected value in kvm_vcpu and the
> current value in loaded_vmcs.  We might use it for other things such as
> reload_vmcs01_apic_access_page perhaps.

I'm not necessarily opposed to aggressively shadowing the VMCS, but if we go
that route then it should be a standalone series that implements a framework
that can be easily extended to arbitrary fields.  Adding fields to loaded_vmcs
one at a time will be tedious and error prone.  E.g. what makes TSC_MULTIPLIER
more special than TSC_OFFSET, GUEST_IA32_PAT, GUEST_IA32_DEBUGCTL, GUEST_BNDCFGS,
and other number of fields that are likely to persist for a given vmcs02?

The current caching logic is just plain ugly and should not exist.
Paolo Bonzini May 25, 2021, 5:34 p.m. UTC | #7
On 25/05/21 18:34, Sean Christopherson wrote:
>> I actually like the idea of storing the expected value in kvm_vcpu and the
>> current value in loaded_vmcs.  We might use it for other things such as
>> reload_vmcs01_apic_access_page perhaps.
> I'm not necessarily opposed to aggressively shadowing the VMCS, but if we go
> that route then it should be a standalone series that implements a framework
> that can be easily extended to arbitrary fields.  Adding fields to loaded_vmcs
> one at a time will be tedious and error prone.  E.g. what makes TSC_MULTIPLIER
> more special than TSC_OFFSET, GUEST_IA32_PAT, GUEST_IA32_DEBUGCTL, GUEST_BNDCFGS,
> and other number of fields that are likely to persist for a given vmcs02?

That it can be changed via ioctls in a way that affects both vmcs01 and 
vmcs02.  So TSC_MULTIPLIER is in the same boat as TSC_OFFSET, which I 
agree we should shadow more aggressively, but the others are different.

Paolo
Sean Christopherson May 25, 2021, 6:21 p.m. UTC | #8
On Tue, May 25, 2021, Paolo Bonzini wrote:
> On 25/05/21 18:34, Sean Christopherson wrote:
> > > I actually like the idea of storing the expected value in kvm_vcpu and the
> > > current value in loaded_vmcs.  We might use it for other things such as
> > > reload_vmcs01_apic_access_page perhaps.
> > I'm not necessarily opposed to aggressively shadowing the VMCS, but if we go
> > that route then it should be a standalone series that implements a framework
> > that can be easily extended to arbitrary fields.  Adding fields to loaded_vmcs
> > one at a time will be tedious and error prone.  E.g. what makes TSC_MULTIPLIER
> > more special than TSC_OFFSET, GUEST_IA32_PAT, GUEST_IA32_DEBUGCTL, GUEST_BNDCFGS,
> > and other number of fields that are likely to persist for a given vmcs02?
> 
> That it can be changed via ioctls in a way that affects both vmcs01 and vmcs02.

That holds true for any MSR that is conditionally loaded/cleared on enter/exit,
e.g. userspace can stuff MSR_IA32_CR_PAT while L2 is active, and that can affect
L1 if L1 is running without VM_EXIT_LOAD_IA32_PAT.

I'm not saying that the above is likely, but neither is changing the TSC scaling
ratio while L2 is active (I assume it occurs on migration, but in the grand
scheme that's not a common operation).
Ilias Stamatis May 25, 2021, 6:52 p.m. UTC | #9
On Tue, 2021-05-25 at 15:58 +0000, Sean Christopherson wrote:
> On Tue, May 25, 2021, Stamatis, Ilias wrote:
> > On Mon, 2021-05-24 at 18:44 +0000, Sean Christopherson wrote:
> > > Yes, but its existence is a complete hack.  vmx->current_tsc_ratio has the same
> > > scope as vcpu->arch.tsc_scaling_ratio, i.e. vmx == vcpu == vcpu->arch.  Unlike
> > > per-VMCS tracking, it should not be useful, keyword "should".
> > > 
> > > What I meant by my earlier comment:
> > > 
> > >   Its use in vmx_vcpu_load_vmcs() is basically "write the VMCS if we forgot to
> > >   earlier", which is all kinds of wrong.
> > > 
> > > is that vmx_vcpu_load_vmcs() should never write vmcs.TSC_MULTIPLIER.  The correct
> > > behavior is to set the field at VMCS initialization, and then immediately set it
> > > whenever the ratio is changed, e.g. on nested transition, from userspace, etc...
> > > In other words, my unclear feedback was to make it obsolete (and drop it) by
> > > fixing the underlying mess, not to just drop the optimization hack.
> > 
> > I understood this and replied earlier. The right place for the hw multiplier
> > field to be updated is inside set_tsc_khz() in common code when the ratio
> > changes. However, this requires adding another vendor callback etc. As all
> > this is further refactoring I believe it's better to leave this series as is -
> > ie only touching code that is directly related to nested TSC scaling and not
> > try to do everything as part of the same series.
> 
> But it directly impacts your code, e.g. the nested enter/exit flows would need
> to dance around the decache silliness.  And I believe it even more directly
> impacts this series: kvm_set_tsc_khz() fails to handle the case where userspace
> invokes KVM_SET_TSC_KHZ while L2 is active.

Good catch!

> 
> > This makes testing easier too.
> 
> Hmm, sort of.  Yes, the fewer patches/modifications in a series definitely makes
> the series itself easier to test.  But stepping back and looking at the total
> cost of testing, I would argue that punting related changes to a later time
> increases the overall cost.  E.g. if someone else picks up the clean up work,
> then they have to redo most, if not all, of the testing that you are already
> doing, including getting access to the proper hardware, understanding what tests
> to prioritize, etc...  Whereas adding one more patch to your series is an
> incremental cost since you already have the hardware setup, know which tests to
> run, etc...
> 
> > We can still implement these changes later.
> 
> We can, but we shouldn't.  Simply dropping vmx->current_tsc_ratio is not an
> option; it knowingly introduces a (minor) performance regression, for no reason
> other than wanting to avoid code churn.  Piling more stuff on top of the flawed
> decache logic is impolite, as it adds more work for the person that ends up
> doing the cleanup.  I would 100% agree if this were a significant cleanup and/or
> completely unrelated, but IMO that's not the case.
> 
> Compile tested only...

Thank you. 

> 
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 029c9615378f..34ad7a17458a 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -90,6 +90,7 @@ KVM_X86_OP_NULL(has_wbinvd_exit)
>  KVM_X86_OP(get_l2_tsc_offset)
>  KVM_X86_OP(get_l2_tsc_multiplier)
>  KVM_X86_OP(write_tsc_offset)
> +KVM_X86_OP(write_tsc_multiplier)
>  KVM_X86_OP(get_exit_info)
>  KVM_X86_OP(check_intercept)
>  KVM_X86_OP(handle_exit_irqoff)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f099277b993d..a334ce7741ab 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1308,6 +1308,7 @@ struct kvm_x86_ops {
>         u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
>         u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
>         void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> +       void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
> 
>         /*
>          * Retrieve somewhat arbitrary exit information.  Intended to be used
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b18f60463073..914afcceb46d 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1103,6 +1103,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
>  }
> 
> +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
> +{
> +       /*
> +        * Handled when loading guest state since the ratio is programmed via
> +        * MSR_AMD64_TSC_RATIO, not a field in the VMCB.
> +        */
> +}
> +

Ok, what I wanted to avoid really is having to dig into SVM code and see where
exactly it sets the TSC multiplier or having to implement
svm_write_tsc_multiplier as I knew AMD uses an MSR instead of a VMCB field.

But if we are fine with introducing this as is above (for now) I will include 
this in the series, apply the other small changes suggested and re-post the 
patches.
Ilias Stamatis May 25, 2021, 7:25 p.m. UTC | #10
On Tue, 2021-05-25 at 15:58 +0000, Sean Christopherson wrote:
> On Tue, May 25, 2021, Stamatis, Ilias wrote:
> > On Mon, 2021-05-24 at 18:44 +0000, Sean Christopherson wrote:
> > > Yes, but its existence is a complete hack.  vmx->current_tsc_ratio has the same
> > > scope as vcpu->arch.tsc_scaling_ratio, i.e. vmx == vcpu == vcpu->arch.  Unlike
> > > per-VMCS tracking, it should not be useful, keyword "should".
> > > 
> > > What I meant by my earlier comment:
> > > 
> > >   Its use in vmx_vcpu_load_vmcs() is basically "write the VMCS if we forgot to
> > >   earlier", which is all kinds of wrong.
> > > 
> > > is that vmx_vcpu_load_vmcs() should never write vmcs.TSC_MULTIPLIER.  The correct
> > > behavior is to set the field at VMCS initialization, and then immediately set it
> > > whenever the ratio is changed, e.g. on nested transition, from userspace, etc...
> > > In other words, my unclear feedback was to make it obsolete (and drop it) by
> > > fixing the underlying mess, not to just drop the optimization hack.
> > 
> > I understood this and replied earlier. The right place for the hw multiplier
> > field to be updated is inside set_tsc_khz() in common code when the ratio
> > changes. However, this requires adding another vendor callback etc. As all
> > this is further refactoring I believe it's better to leave this series as is -
> > ie only touching code that is directly related to nested TSC scaling and not
> > try to do everything as part of the same series.
> 
> But it directly impacts your code, e.g. the nested enter/exit flows would need
> to dance around the decache silliness.  And I believe it even more directly
> impacts this series: kvm_set_tsc_khz() fails to handle the case where userspace
> invokes KVM_SET_TSC_KHZ while L2 is active.
> 
> > This makes testing easier too.
> 
> Hmm, sort of.  Yes, the fewer patches/modifications in a series definitely makes
> the series itself easier to test.  But stepping back and looking at the total
> cost of testing, I would argue that punting related changes to a later time
> increases the overall cost.  E.g. if someone else picks up the clean up work,
> then they have to redo most, if not all, of the testing that you are already
> doing, including getting access to the proper hardware, understanding what tests
> to prioritize, etc...  Whereas adding one more patch to your series is an
> incremental cost since you already have the hardware setup, know which tests to
> run, etc...
> 
> > We can still implement these changes later.
> 
> We can, but we shouldn't.  Simply dropping vmx->current_tsc_ratio is not an
> option; it knowingly introduces a (minor) performance regression, for no reason
> other than wanting to avoid code churn.  Piling more stuff on top of the flawed
> decache logic is impolite, as it adds more work for the person that ends up
> doing the cleanup.  I would 100% agree if this were a significant cleanup and/or
> completely unrelated, but IMO that's not the case.
> 
> Compile tested only...
> 
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 029c9615378f..34ad7a17458a 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -90,6 +90,7 @@ KVM_X86_OP_NULL(has_wbinvd_exit)
>  KVM_X86_OP(get_l2_tsc_offset)
>  KVM_X86_OP(get_l2_tsc_multiplier)
>  KVM_X86_OP(write_tsc_offset)
> +KVM_X86_OP(write_tsc_multiplier)
>  KVM_X86_OP(get_exit_info)
>  KVM_X86_OP(check_intercept)
>  KVM_X86_OP(handle_exit_irqoff)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f099277b993d..a334ce7741ab 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1308,6 +1308,7 @@ struct kvm_x86_ops {
>         u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
>         u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
>         void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> +       void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
> 
>         /*
>          * Retrieve somewhat arbitrary exit information.  Intended to be used
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b18f60463073..914afcceb46d 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1103,6 +1103,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
>  }
> 
> +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
> +{
> +       /*
> +        * Handled when loading guest state since the ratio is programmed via
> +        * MSR_AMD64_TSC_RATIO, not a field in the VMCB.
> +        */
> +}
> +
>  /* Evaluate instruction intercepts that depend on guest CPUID features. */
>  static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
>                                               struct vcpu_svm *svm)
> @@ -4528,6 +4536,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
>         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
>         .write_tsc_offset = svm_write_tsc_offset,
> +       .write_tsc_multiplier = svm_write_tsc_multiplier,
> 
>         .load_mmu_pgd = svm_load_mmu_pgd,
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 6058a65a6ede..712190493926 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2535,7 +2535,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
> 
>         if (kvm_has_tsc_control)
> -               decache_tsc_multiplier(vmx);
> +               vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> 
>         nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
> 
> @@ -4505,7 +4505,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>                 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
> 
>         if (kvm_has_tsc_control)
> -               decache_tsc_multiplier(vmx);
> +               vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
> 
>         if (vmx->nested.change_vmcs01_virtual_apic_mode) {
>                 vmx->nested.change_vmcs01_virtual_apic_mode = false;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 4b70431c2edd..bf845a08995e 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1390,11 +1390,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
> 
>                 vmx->loaded_vmcs->cpu = cpu;
>         }
> -
> -       /* Setup TSC multiplier */
> -       if (kvm_has_tsc_control &&
> -           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
> -               decache_tsc_multiplier(vmx);
>  }
> 
>  /*
> @@ -1813,6 +1808,11 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>         vmcs_write64(TSC_OFFSET, offset);
> ...skipping...
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -322,8 +322,6 @@ struct vcpu_vmx {
>         /* apic deadline value in host tsc */
>         u64 hv_deadline_tsc;
> 
> -       u64 current_tsc_ratio;
> -
>         unsigned long host_debugctlmsr;
> 
>         /*
> @@ -532,12 +530,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
>                               GFP_KERNEL_ACCOUNT);
>  }
> 
> -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
> -{
> -       vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
> -       vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
> -}
> -
>  static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
>  {
>         return vmx->secondary_exec_control &
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b61b54cea495..690de1868873 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2179,14 +2179,16 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
>         return v;
>  }
> 
> +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu,
> +                                         u64 l1_multiplier);
> +
>  static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
>  {
>         u64 ratio;
> 
>         /* Guest TSC same frequency as host TSC? */
>         if (!scale) {
> -               vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> -               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> +               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
>                 return 0;
>         }
> 
> @@ -2212,7 +2214,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
>                 return -1;
>         }
> 
> -       vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio;
> +       kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
>         return 0;
>  }
> 
> @@ -2224,8 +2226,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
>         /* tsc_khz can be zero if TSC calibration fails */
>         if (user_tsc_khz == 0) {
>                 /* set tsc_scaling_ratio to a safe value */
> -               vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> -               vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
> +               kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
>                 return -1;
>         }
> 
> @@ -2383,6 +2384,25 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
>         static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
>  }
> 
> +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu,
> +                                         u64 l1_multiplier)
> +{
> +       if (!kvm_has_tsc_control)
> +               return;
> +
> +       vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
> +
> +       /* Userspace is changing the multiplier while L2 is active... */
> +       if (is_guest_mode(vcpu))
> +               vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
> +                       l1_multiplier,
> +                       static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
> +       else
> +               vcpu->arch.tsc_scaling_ratio = l1_multiplier;
> +
> +       static_call(kvm_x86_write_tsc_multiplier)(vcpu, vcpu->arch.tsc_scaling_ratio);
> +}
> +
>  static inline bool kvm_check_tsc_unstable(void)
>  {
>  #ifdef CONFIG_X86_64

Hmm, this patch actually still removes the caching and introduces a small
performance overhead. For example if neither L1 nor L2 are scaled it will
still do a vmwrite for every L2 entry/exit.

So do we want to get rid of decache_tsc_multiplier() but keep 
vmx->current_tsc_ratio and do the check inside write_tsc_multiplier()? Or 
alternatively delete vmx->current_tsc_ratio too and have 
write_tsc_multiplier() receive 2 parameters, one of the old multiplier and 
one of the new?
Sean Christopherson May 25, 2021, 11:35 p.m. UTC | #11
On Tue, May 25, 2021, Stamatis, Ilias wrote:
> Hmm, this patch actually still removes the caching and introduces a small
> performance overhead. For example if neither L1 nor L2 are scaled it will
> still do a vmwrite for every L2 entry/write.

True, but there is an ocean of difference between the relative performance of
vmx_vcpu_load_vmcs() and a nested transition.  vmx_vcpu_load_vmcs() is also
called much more frequently.

> So do we want to get rid of decache_tsc_multiplier() but keep 
> vmx->current_tsc_ratio and do the check inside write_tsc_multiplier()? Or 
> alternatively delete vmx->current_tsc_ratio too and have 
> write_tsc_multiplier() receive 2 parameters, one of the old multiplier and 
> one of the new?

My vote is to kill it, eat the barely-noticeable perf hit on nVMX, and tackle
the aggressive VMCS shadowing in a separate series.
diff mbox series

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6058a65a6ede..239154d3e4e7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2533,9 +2533,8 @@  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 
 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
-
 	if (kvm_has_tsc_control)
-		decache_tsc_multiplier(vmx);
+		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
 
 	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
 
@@ -4501,12 +4500,12 @@  void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+	if (kvm_has_tsc_control)
+		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
 	if (vmx->nested.l1_tpr_threshold != -1)
 		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
 
-	if (kvm_has_tsc_control)
-		decache_tsc_multiplier(vmx);
-
 	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
 		vmx->nested.change_vmcs01_virtual_apic_mode = false;
 		vmx_set_virtual_apic_mode(vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4b70431c2edd..7c52c697cfe3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1392,9 +1392,8 @@  void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
 	}
 
 	/* Setup TSC multiplier */
-	if (kvm_has_tsc_control &&
-	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
-		decache_tsc_multiplier(vmx);
+	if (kvm_has_tsc_control)
+		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index aa97c82e3451..3eaa86a0ba3e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -322,8 +322,6 @@  struct vcpu_vmx {
 	/* apic deadline value in host tsc */
 	u64 hv_deadline_tsc;
 
-	u64 current_tsc_ratio;
-
 	unsigned long host_debugctlmsr;
 
 	/*
@@ -532,12 +530,6 @@  static inline struct vmcs *alloc_vmcs(bool shadow)
 			      GFP_KERNEL_ACCOUNT);
 }
 
-static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
-{
-	vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
-	vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-}
-
 static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 {
 	return vmx->secondary_exec_control &