[v2] KVM: SVM: use vmsave/vmload for saving/restoring additional host state

Message ID 20201214174127.1398114-1-michael.roth@amd.com (mailing list archive)
State New, archived
Series [v2] KVM: SVM: use vmsave/vmload for saving/restoring additional host state

Commit Message

Michael Roth Dec. 14, 2020, 5:41 p.m. UTC
Using a guest workload which simply issues 'hlt' in a tight loop to
generate VMEXITs, it was observed (on a recent EPYC processor) that a
significant amount of the VMEXIT overhead measured on the host was the
result of MSR reads/writes in svm_vcpu_load/svm_vcpu_put according to
perf:

  67.49%--kvm_arch_vcpu_ioctl_run
          |
          |--23.13%--vcpu_put
          |          kvm_arch_vcpu_put
          |          |
          |          |--21.31%--native_write_msr
          |          |
          |           --1.27%--svm_set_cr4
          |
          |--16.11%--vcpu_load
          |          |
          |           --15.58%--kvm_arch_vcpu_load
          |                     |
          |                     |--13.97%--svm_set_cr4
          |                     |          |
          |                     |          |--12.64%--native_read_msr

Most of these MSRs relate to 'syscall'/'sysenter' and segment bases, and
can be saved/restored using 'vmsave'/'vmload' instructions rather than
explicit MSR reads/writes. In doing so there is a significant reduction
in the svm_vcpu_load/svm_vcpu_put overhead measured for the above
workload:

  50.92%--kvm_arch_vcpu_ioctl_run
          |
          |--19.28%--disable_nmi_singlestep
          |
          |--13.68%--vcpu_load
          |          kvm_arch_vcpu_load
          |          |
          |          |--9.19%--svm_set_cr4
          |          |          |
          |          |           --6.44%--native_read_msr
          |          |
          |           --3.55%--native_write_msr
          |
          |--6.05%--kvm_inject_nmi
          |--2.80%--kvm_sev_es_mmio_read
          |--2.19%--vcpu_put
          |          |
          |           --1.25%--kvm_arch_vcpu_put
          |                     native_write_msr

Quantifying this further, if we look at the raw cycle counts for a
normal iteration of the above workload (according to 'rdtscp'),
kvm_arch_vcpu_ioctl_run() takes ~4600 cycles from start to finish with
the current behavior. Using 'vmsave'/'vmload', this is reduced to
~2800 cycles, a savings of 39%.

While this approach doesn't seem to manifest in any noticeable
improvement for more realistic workloads like UnixBench, netperf, and
kernel builds, likely due to their exit paths generally involving IO
with comparatively high latencies, it does significantly reduce the
overall overhead of KVM_RUN, which may still be noticeable in certain
situations. It also simplifies some aspects of the code.

With this change, explicit save/restore is no longer needed for the
following host MSRs, since they are documented[1] as being part of the
VMCB State Save Area:

  MSR_STAR, MSR_LSTAR, MSR_CSTAR,
  MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
  MSR_IA32_SYSENTER_CS,
  MSR_IA32_SYSENTER_ESP,
  MSR_IA32_SYSENTER_EIP,
  MSR_FS_BASE, MSR_GS_BASE

and only the following MSR needs individual handling in
svm_vcpu_put/svm_vcpu_load:

  MSR_TSC_AUX

We could drop the host_save_user_msrs array/loop and instead handle
MSR read/write of MSR_TSC_AUX directly, but we leave that for now as
a potential follow-up.
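
As an illustration only (a sketch, not part of this patch), that follow-up
might look roughly like the below, where 'host_tsc_aux' is a hypothetical
field in struct vcpu_svm:

	/* svm_vcpu_load(): save the host value directly */
	rdmsrl(MSR_TSC_AUX, svm->host_tsc_aux);

	/* svm_vcpu_put(): restore it */
	wrmsrl(MSR_TSC_AUX, svm->host_tsc_aux);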

Since 'vmsave'/'vmload' also handles the LDTR and FS/GS segment
registers (and associated hidden state)[2], some of the code
previously used to handle this is no longer needed, so we drop it
as well.

The first public release of the SVM spec[3] also documents the same
handling for the host state in question, so we make these changes
unconditionally.

Also worth noting is that we 'vmsave' to the same page that is
subsequently used by 'vmrun' to record some additional host state. This
is okay, since, in accordance with the spec[2], the additional state
written to the page by 'vmrun' does not overwrite any fields written by
'vmsave'. This has also been confirmed through testing (for the above
CPU, at least).

[1] AMD64 Architecture Programmer's Manual, Rev 3.33, Volume 2, Appendix B, Table B-2
[2] AMD64 Architecture Programmer's Manual, Rev 3.31, Volume 3, Chapter 4, VMSAVE/VMLOAD
[3] Secure Virtual Machine Architecture Reference Manual, Rev 3.01

Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
v2:
* rebase on latest kvm/next
* move VMLOAD to just after vmexit so we can use it to handle all FS/GS
  host state restoration rather than relying on loadsegment() and an
  explicit write to MSR_GS_BASE (Andy)
* drop 'host' field from struct vcpu_svm since it is no longer needed
  for storing FS/GS/LDT state (Andy)
---
 arch/x86/kvm/svm/svm.c | 44 ++++++++++++++++--------------------------
 arch/x86/kvm/svm/svm.h | 14 +++-----------
 2 files changed, 20 insertions(+), 38 deletions(-)

Comments

Sean Christopherson Dec. 14, 2020, 7:38 p.m. UTC | #1
+Andy, who provided a lot of feedback on v1.

On Mon, Dec 14, 2020, Michael Roth wrote:

Cc: Andy Lutomirski <luto@kernel.org>

> Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
> v2:
> * rebase on latest kvm/next
> * move VMLOAD to just after vmexit so we can use it to handle all FS/GS
>   host state restoration and rather than relying on loadsegment() and
>   explicit write to MSR_GS_BASE (Andy)
> * drop 'host' field from struct vcpu_svm since it is no longer needed
>   for storing FS/GS/LDT state (Andy)
> ---
>  arch/x86/kvm/svm/svm.c | 44 ++++++++++++++++--------------------------
>  arch/x86/kvm/svm/svm.h | 14 +++-----------
>  2 files changed, 20 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 0e52fac4f5ae..fb15b7bd461f 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1367,15 +1367,19 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  		vmcb_mark_all_dirty(svm->vmcb);
>  	}
>  
> -#ifdef CONFIG_X86_64
> -	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
> -#endif
> -	savesegment(fs, svm->host.fs);
> -	savesegment(gs, svm->host.gs);
> -	svm->host.ldt = kvm_read_ldt();
> -
> -	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> +	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
>  		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> +	}

Unnecessary change that violates preferred coding style.  Checkpatch explicitly
complains about this.

WARNING: braces {} are not necessary for single statement blocks
#132: FILE: arch/x86/kvm/svm/svm.c:1370:
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
 		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+

> +
> +	asm volatile(__ex("vmsave")
> +		     : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)

I'm pretty sure this can be page_to_phys().

> +		     : "memory");

I think we can defer this until we're actually planning on running the guest,
i.e. put this in svm_prepare_guest_switch().

> +	/*
> +	 * Store a pointer to the save area so we can access it after
> +	 * vmexit for vmload. This is needed since per-cpu accesses
> +	 * won't be available until GS is restored as part of vmload
> +	 */
> +	svm->host_save_area = sd->save_area;

Unless I'm missing something with how SVM loads guest state, you can avoid
adding host_save_area by saving the PA of the save area on the stack prior to
the vmload of guest state.
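
Roughly, as a hedged sketch of that idea (the local variable and the
per-cpu lookup shown here are illustrative assumptions, not code from the
patch):

	/* grab the PA while host GS / per-cpu accesses still work */
	unsigned long hsave_pa = page_to_phys(per_cpu(svm_data, vcpu->cpu)->save_area);

	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);

	/* restore host state without needing per-cpu data */
	asm volatile(__ex("vmload") : : "a" (hsave_pa) : "memory");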

>  
>  	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
>  		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
> @@ -1403,18 +1407,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
>  	avic_vcpu_put(vcpu);
>  
>  	++vcpu->stat.host_state_reload;
> -	kvm_load_ldt(svm->host.ldt);
> -#ifdef CONFIG_X86_64
> -	loadsegment(fs, svm->host.fs);
> -	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
> -	load_gs_index(svm->host.gs);
> -#else
> -#ifdef CONFIG_X86_32_LAZY_GS
> -	loadsegment(gs, svm->host.gs);
> -#endif
> -#endif
> -	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> +
> +	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
>  		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> +	}

Same bogus change and checkpatch warning.

>  }
>  
>  static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
> @@ -3507,14 +3503,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
>  
>  	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);

Tying in with avoiding svm->host_save_area, what about passing in the PA of the
save area and doing the vmload in __svm_vcpu_run()?  One less instance of inline
assembly to stare at...
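
For illustration, the C side of that could look something like the below
(the extra argument and 'hsave_pa' are hypothetical; the matching VMLOAD
would live in the asm stub itself):

	/* sketch: let __svm_vcpu_run() vmload host state before returning */
	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs,
		       hsave_pa);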

>  
> -#ifdef CONFIG_X86_64
> -	native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
> -#else
> -	loadsegment(fs, svm->host.fs);
> -#ifndef CONFIG_X86_32_LAZY_GS
> -	loadsegment(gs, svm->host.gs);
> -#endif
> -#endif
> +	asm volatile(__ex("vmload")
> +		     : : "a" (page_to_pfn(svm->host_save_area) << PAGE_SHIFT));
>  
>  	/*
>  	 * VMEXIT disables interrupts (host state), but tracing and lockdep
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index fdff76eb6ceb..bf01a8c19ec0 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -21,11 +21,6 @@
>  #include <asm/svm.h>
>  
>  static const u32 host_save_user_msrs[] = {
> -#ifdef CONFIG_X86_64
> -	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
> -	MSR_FS_BASE,
> -#endif
> -	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
>  	MSR_TSC_AUX,

With this being whittled down to TSC_AUX, a good follow-on series would be to
add SVM usage of the "user return" MSRs to handle TSC_AUX.  If no one objects,
I'll plan on doing that in the not-too-distant future as a ramp task of sorts.

>  };
>  
> @@ -117,12 +112,9 @@ struct vcpu_svm {
>  	u64 next_rip;
>  
>  	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
> -	struct {
> -		u16 fs;
> -		u16 gs;
> -		u16 ldt;
> -		u64 gs_base;
> -	} host;
> +
> +	/* set by vcpu_load(), for use when per-cpu accesses aren't available */
> +	struct page *host_save_area;
>  
>  	u64 spec_ctrl;
>  	/*
> -- 
> 2.25.1
>
Andy Lutomirski Dec. 14, 2020, 8:08 p.m. UTC | #2
On Mon, Dec 14, 2020 at 11:38 AM Sean Christopherson <seanjc@google.com> wrote:
>
> +Andy, who provided a lot of feedback on v1.
>
> >
> >  static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
> > @@ -3507,14 +3503,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
> >
> >       __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
>
> Tying in with avoiding svm->host_save_area, what about passing in the PA of the
> save area and doing the vmload in __svm_vcpu_run()?  One less instance of inline
> assembly to stare at...

One potential side benefit is that we wouldn't execute any C code with
the wrong MSR_GS_BASE, which avoids any concerns about
instrumentation, stack protector, or some *SAN feature exploding due
to percpu memory not working.

--Andy
Michael Roth Dec. 14, 2020, 10:02 p.m. UTC | #3
On Mon, Dec 14, 2020 at 11:38:23AM -0800, Sean Christopherson wrote:
> +Andy, who provided a lot of feedback on v1.
> 
> On Mon, Dec 14, 2020, Michael Roth wrote:
> 
> Cc: Andy Lutomirski <luto@kernel.org>
> 
> > Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > ---
> > v2:
> > * rebase on latest kvm/next
> > * move VMLOAD to just after vmexit so we can use it to handle all FS/GS
> >   host state restoration and rather than relying on loadsegment() and
> >   explicit write to MSR_GS_BASE (Andy)
> > * drop 'host' field from struct vcpu_svm since it is no longer needed
> >   for storing FS/GS/LDT state (Andy)
> > ---
> >  arch/x86/kvm/svm/svm.c | 44 ++++++++++++++++--------------------------
> >  arch/x86/kvm/svm/svm.h | 14 +++-----------
> >  2 files changed, 20 insertions(+), 38 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 0e52fac4f5ae..fb15b7bd461f 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -1367,15 +1367,19 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >  		vmcb_mark_all_dirty(svm->vmcb);
> >  	}
> >  
> > -#ifdef CONFIG_X86_64
> > -	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
> > -#endif
> > -	savesegment(fs, svm->host.fs);
> > -	savesegment(gs, svm->host.gs);
> > -	svm->host.ldt = kvm_read_ldt();
> > -
> > -	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> > +	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> >  		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> > +	}

Hi Sean,

Hopefully I've got my email situation sorted out now...

> 
> Unnecessary change that violates preferred coding style.  Checkpatch explicitly
> complains about this.
> 
> WARNING: braces {} are not necessary for single statement blocks
> #132: FILE: arch/x86/kvm/svm/svm.c:1370:
> +	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
>  		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> +

Sorry, that was an artifact from an earlier version of the patch that I
failed to notice. I'll make sure to run everything through checkpatch
going forward.

> 
> > +
> > +	asm volatile(__ex("vmsave")
> > +		     : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
> 
> I'm pretty sure this can be page_to_phys().
> 
> > +		     : "memory");
> 
> I think we can defer this until we're actually planning on running the guest,
> i.e. put this in svm_prepare_guest_switch().

One downside to that is that we'd need to do the VMSAVE on every
iteration of vcpu_run(), as opposed to just once when we enter from
userspace via KVM_RUN. It ends up being a similar situation to Andy's
earlier suggestion of moving VMLOAD just after vmexit, but in that case
we were able to remove an MSR write to MSR_GS_BASE, which cancelled out
the overhead, but in this case I think it could only cost us extra.

It looks like the SEV-ES patches might land before this one, and those
introduce similar handling of VMSAVE in svm_vcpu_load(), so I think it
might also create some churn there if we take this approach and want to
keep the SEV-ES and non-SEV-ES handling similar.

> 
> > +	/*
> > +	 * Store a pointer to the save area so we can access it after
> > +	 * vmexit for vmload. This is needed since per-cpu accesses
> > +	 * won't be available until GS is restored as part of vmload
> > +	 */
> > +	svm->host_save_area = sd->save_area;
> 
> Unless I'm missing something with how SVM loads guest state, you can avoid
> adding host_save_area by saving the PA of the save area on the stack prior to
> the vmload of guest state.

Yes, that is much nicer. Not sure what I was thinking there.

> 
> >  
> >  	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
> >  		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
> > @@ -1403,18 +1407,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
> >  	avic_vcpu_put(vcpu);
> >  
> >  	++vcpu->stat.host_state_reload;
> > -	kvm_load_ldt(svm->host.ldt);
> > -#ifdef CONFIG_X86_64
> > -	loadsegment(fs, svm->host.fs);
> > -	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
> > -	load_gs_index(svm->host.gs);
> > -#else
> > -#ifdef CONFIG_X86_32_LAZY_GS
> > -	loadsegment(gs, svm->host.gs);
> > -#endif
> > -#endif
> > -	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> > +
> > +	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> >  		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> > +	}
> 
> Same bogus change and checkpatch warning.
> 
> >  }
> >  
> >  static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
> > @@ -3507,14 +3503,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
> >  
> >  	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
> 
> Tying in with avoiding svm->host_save_area, what about passing in the PA of the
> save area and doing the vmload in __svm_vcpu_run()?  One less instance of inline
> assembly to stare at...

That sounds like a nice clean-up, I'll give this a shot.

> 
> >  
> > -#ifdef CONFIG_X86_64
> > -	native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
> > -#else
> > -	loadsegment(fs, svm->host.fs);
> > -#ifndef CONFIG_X86_32_LAZY_GS
> > -	loadsegment(gs, svm->host.gs);
> > -#endif
> > -#endif
> > +	asm volatile(__ex("vmload")
> > +		     : : "a" (page_to_pfn(svm->host_save_area) << PAGE_SHIFT));
> >  
> >  	/*
> >  	 * VMEXIT disables interrupts (host state), but tracing and lockdep
> > diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> > index fdff76eb6ceb..bf01a8c19ec0 100644
> > --- a/arch/x86/kvm/svm/svm.h
> > +++ b/arch/x86/kvm/svm/svm.h
> > @@ -21,11 +21,6 @@
> >  #include <asm/svm.h>
> >  
> >  static const u32 host_save_user_msrs[] = {
> > -#ifdef CONFIG_X86_64
> > -	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
> > -	MSR_FS_BASE,
> > -#endif
> > -	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
> >  	MSR_TSC_AUX,
> 
> With this being whittled down to TSC_AUX, a good follow-on series would be to
> add SVM usage of the "user return" MSRs to handle TSC_AUX.  If no one objects,
> I'll plan on doing that in the not-too-distant future as a ramp task of sorts.

No objection here. I mainly held off on removing the list since it might be
nice to have some infrastructure to handle future cases, but if it's already
there then great :)

Thanks for the suggestions. I'll work on a v3 with those applied.

-Mike

> 
> >  };
> >  
> > @@ -117,12 +112,9 @@ struct vcpu_svm {
> >  	u64 next_rip;
> >  
> >  	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
> > -	struct {
> > -		u16 fs;
> > -		u16 gs;
> > -		u16 ldt;
> > -		u64 gs_base;
> > -	} host;
> > +
> > +	/* set by vcpu_load(), for use when per-cpu accesses aren't available */
> > +	struct page *host_save_area;
> >  
> >  	u64 spec_ctrl;
> >  	/*
> > -- 
> > 2.25.1
> >
Sean Christopherson Dec. 14, 2020, 10:23 p.m. UTC | #4
On Mon, Dec 14, 2020, Michael Roth wrote:
> On Mon, Dec 14, 2020 at 11:38:23AM -0800, Sean Christopherson wrote:
> > > +	asm volatile(__ex("vmsave")
> > > +		     : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
> > 
> > I'm pretty sure this can be page_to_phys().
> > 
> > > +		     : "memory");
> > 
> > I think we can defer this until we're actually planning on running the guest,
> > i.e. put this in svm_prepare_guest_switch().
> 
> One downside to that is that we'd need to do the VMSAVE on every
> iteration of vcpu_run(), as opposed to just once when we enter from
> userspace via KVM_RUN.

That can, and should, be optimized away.  Sorry I didn't make that clear.  The
below will yield high level symmetry with VMX, which I like.

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 523df10fb979..057661723a5c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1399,6 +1399,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)

        avic_vcpu_put(vcpu);

+       svm->host_state_saved = false;
        ++vcpu->stat.host_state_reload;
        if (sev_es_guest(svm->vcpu.kvm)) {
                sev_es_vcpu_put(svm);
@@ -3522,6 +3523,12 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)

 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!svm->host_state_saved) {
+               svm->host_state_saved = true;
+               vmsave();
+       }
 }


> It ends up being a similar situation to Andy's earlier suggestion of moving
> VMLOAD just after vmexit, but in that case we were able to remove an MSR
> write to MSR_GS_BASE, which cancelled out the overhead, but in this case I
> think it could only cost us extra.
>
> It looks like the SEV-ES patches might land before this one, and those
> introduce similar handling of VMSAVE in svm_vcpu_load(), so I think it
> might also create some churn there if we take this approach and want to
> keep the SEV-ES and non-SEV-ES handling similar.

Hmm, I'll make sure to pay attention to that when I review the SEV-ES patches,
which I was hoping to get to today, but that's looking unlikely at this point.
Andy Lutomirski Dec. 14, 2020, 10:29 p.m. UTC | #5
> On Dec 14, 2020, at 2:02 PM, Michael Roth <michael.roth@amd.com> wrote:
> 
> On Mon, Dec 14, 2020 at 11:38:23AM -0800, Sean Christopherson wrote:
>> +Andy, who provided a lot of feedback on v1.
>> On Mon, Dec 14, 2020, Michael Roth wrote:
>> Cc: Andy Lutomirski <luto@kernel.org>
>>> Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
>>> Signed-off-by: Michael Roth <michael.roth@amd.com>
>>> ---
>>> v2:
>>> * rebase on latest kvm/next
>>> * move VMLOAD to just after vmexit so we can use it to handle all FS/GS
>>> host state restoration and rather than relying on loadsegment() and
>>> explicit write to MSR_GS_BASE (Andy)
>>> * drop 'host' field from struct vcpu_svm since it is no longer needed
>>> for storing FS/GS/LDT state (Andy)
>>> ---
>>> arch/x86/kvm/svm/svm.c | 44 ++++++++++++++++--------------------------
>>> arch/x86/kvm/svm/svm.h | 14 +++-----------
>>> 2 files changed, 20 insertions(+), 38 deletions(-)
>>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>>> index 0e52fac4f5ae..fb15b7bd461f 100644
>>> --- a/arch/x86/kvm/svm/svm.c
>>> +++ b/arch/x86/kvm/svm/svm.c
>>> @@ -1367,15 +1367,19 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>       vmcb_mark_all_dirty(svm->vmcb);
>>>   }
>>> -#ifdef CONFIG_X86_64
>>> -    rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
>>> -#endif
>>> -    savesegment(fs, svm->host.fs);
>>> -    savesegment(gs, svm->host.gs);
>>> -    svm->host.ldt = kvm_read_ldt();
>>> -
>>> -    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
>>> +    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
>>>       rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
>>> +    }
> 
> Hi Sean,
> 
> Hopefully I've got my email situation sorted out now...
> 
>> Unnecessary change that violates preferred coding style.  Checkpatch explicitly
>> complains about this.
>> WARNING: braces {} are not necessary for single statement blocks
>> #132: FILE: arch/x86/kvm/svm/svm.c:1370:
>> +    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
>>       rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
>> +
> 
> Sorry, that was an artifact from an earlier version of the patch that I
> failed to notice. I'll make sure to run everything through checkpatch
> going forward.
> 
>>> +
>>> +    asm volatile(__ex("vmsave")
>>> +             : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
>> I'm pretty sure this can be page_to_phys().
>>> +             : "memory");
>> I think we can defer this until we're actually planning on running the guest,
>> i.e. put this in svm_prepare_guest_switch().
> 
> One downside to that is that we'd need to do the VMSAVE on every
> iteration of vcpu_run(), as opposed to just once when we enter from
> userspace via KVM_RUN. It ends up being a similar situation to Andy's
> earlier suggestion of moving VMLOAD just after vmexit, but in that case
> we were able to remove an MSR write to MSR_GS_BASE, which cancelled out
> the overhead, but in this case I think it could only cost us extra.

If you want to micro-optimize, there is a trick you could play: use WRGSBASE if available.  If X86_FEATURE_GSBASE is available, you could use WRGSBASE to restore GSBASE and defer VMLOAD to vcpu_put().  This would need benchmarking on Zen 3 to see if it’s worthwhile.
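
A hedged sketch of that micro-optimization (not from the patch; the
X86_FEATURE_FSGSBASE check, the wrgsbase() helper from <asm/fsgsbase.h> and
the 'host_gs_base' field are assumptions for illustration only):

	/* right after vmexit: make per-cpu data usable again via GS base */
	if (static_cpu_has(X86_FEATURE_FSGSBASE))
		/* defer the full vmload of host state to vcpu_put() */
		wrgsbase(svm->host_gs_base);
	else
		asm volatile(__ex("vmload") : : "a" (hsave_pa) : "memory");
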
Paolo Bonzini Dec. 15, 2020, 10:15 a.m. UTC | #6
On 14/12/20 23:29, Andy Lutomirski wrote:
>> One downside to that is that we'd need to do the VMSAVE on every 
>> iteration of vcpu_run(), as opposed to just once when we enter
>> from userspace via KVM_RUN. It ends up being a similar situation to
>> Andy's earlier suggestion of moving VMLOAD just after vmexit, but
>> in that case we were able to remove an MSR write to MSR_GS_BASE,
>> which cancelled out the overhead, but in this case I think it could
>> only cost us extra.
>
> If you want to micro-optimize, there is a trick you could play: use
> WRGSBASE if available.  If X86_FEATURE_GSBASE is available, you could
> use WRGSBASE to restore GSBASE and defer VMLOAD to vcpu_put().  This
> would need benchmarking on Zen 3 to see if it’s worthwhile.

If the improvement is big (100 cycles or so) it would be worthwhile. 
Otherwise, for the sake of code clarity doing VMLOAD in the assembly 
code is the simplest.

Paolo
Michael Roth Dec. 15, 2020, 6:17 p.m. UTC | #7
On Mon, Dec 14, 2020 at 02:29:46PM -0800, Andy Lutomirski wrote:
> 
> 
> > On Dec 14, 2020, at 2:02 PM, Michael Roth <michael.roth@amd.com> wrote:
> > 
> > On Mon, Dec 14, 2020 at 11:38:23AM -0800, Sean Christopherson wrote:
> >> +Andy, who provided a lot of feedback on v1.
> >> On Mon, Dec 14, 2020, Michael Roth wrote:
> >> Cc: Andy Lutomirski <luto@kernel.org>
> >>> Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
> >>> Signed-off-by: Michael Roth <michael.roth@amd.com>
> >>> ---
> >>> v2:
> >>> * rebase on latest kvm/next
> >>> * move VMLOAD to just after vmexit so we can use it to handle all FS/GS
> >>> host state restoration and rather than relying on loadsegment() and
> >>> explicit write to MSR_GS_BASE (Andy)
> >>> * drop 'host' field from struct vcpu_svm since it is no longer needed
> >>> for storing FS/GS/LDT state (Andy)
> >>> ---
> >>> arch/x86/kvm/svm/svm.c | 44 ++++++++++++++++--------------------------
> >>> arch/x86/kvm/svm/svm.h | 14 +++-----------
> >>> 2 files changed, 20 insertions(+), 38 deletions(-)
> >>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> >>> index 0e52fac4f5ae..fb15b7bd461f 100644
> >>> --- a/arch/x86/kvm/svm/svm.c
> >>> +++ b/arch/x86/kvm/svm/svm.c
> >>> @@ -1367,15 +1367,19 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >>>       vmcb_mark_all_dirty(svm->vmcb);
> >>>   }
> >>> -#ifdef CONFIG_X86_64
> >>> -    rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
> >>> -#endif
> >>> -    savesegment(fs, svm->host.fs);
> >>> -    savesegment(gs, svm->host.gs);
> >>> -    svm->host.ldt = kvm_read_ldt();
> >>> -
> >>> -    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> >>> +    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> >>>       rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> >>> +    }
> > 
> > Hi Sean,
> > 
> > Hopefully I've got my email situation sorted out now...
> > 
> >> Unnecessary change that violates preferred coding style.  Checkpatch explicitly
> >> complains about this.
> >> WARNING: braces {} are not necessary for single statement blocks
> >> #132: FILE: arch/x86/kvm/svm/svm.c:1370:
> >> +    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> >>       rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> >> +
> > 
> > Sorry, that was an artifact from an earlier version of the patch that I
> > failed to notice. I'll make sure to run everything through checkpatch
> > going forward.
> > 
> >>> +
> >>> +    asm volatile(__ex("vmsave")
> >>> +             : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
> >> I'm pretty sure this can be page_to_phys().
> >>> +             : "memory");
> >> I think we can defer this until we're actually planning on running the guest,
> >> i.e. put this in svm_prepare_guest_switch().
> > 
> > One downside to that is that we'd need to do the VMSAVE on every
> > iteration of vcpu_run(), as opposed to just once when we enter from
> > userspace via KVM_RUN. It ends up being a similar situation to Andy's
> > earlier suggestion of moving VMLOAD just after vmexit, but in that case
> > we were able to remove an MSR write to MSR_GS_BASE, which cancelled out
> > the overhead, but in this case I think it could only cost us extra.
> 
> If you want to micro-optimize, there is a trick you could play: use WRGSBASE if available.  If X86_FEATURE_GSBASE is available, you could use WRGSBASE to restore GSBASE and defer VMLOAD to vcpu_put().  This would need benchmarking on Zen 3 to see if it’s worthwhile.

I'll give this a try. The vmsave only seems to be 100-200 cycles in and of itself so
I'm not sure there's much to be gained, but would be good to know either
way.

> 
>
Michael Roth Dec. 15, 2020, 6:55 p.m. UTC | #8
Hi Sean,

Sorry to reply out-of-thread, our mail server is having issues with
certain email addresses at the moment so I only see your message via
the archives atm. But regarding:

>>> I think we can defer this until we're actually planning on running
>>> the guest,
>>> i.e. put this in svm_prepare_guest_switch().
>>
>> It looks like the SEV-ES patches might land before this one, and those
>> introduce similar handling of VMSAVE in svm_vcpu_load(), so I think it
>> might also create some churn there if we take this approach and want
>> to keep the SEV-ES and non-SEV-ES handling similar.
>
>Hmm, I'll make sure to pay attention to that when I review the SEV-ES
>patches,
>which I was hoping to get to today, but that's looking unlikely at this
>point.

It looks like SEV-ES patches are queued now. Those patches have
undergone a lot of internal testing so I'm really hesitant to introduce
any significant change to those at this stage as a prereq for my little
patch. So for v3 I'm a little unsure how best to approach this.

The main options are:

a) go ahead and move the vmsave handling for non-sev-es case into
   prepare_guest_switch() as you suggested, but leave the sev-es where
   they are. then we can refactor those as a follow-up patch that can be
   tested/reviewed as a separate series after we've had some time to
   re-test, though that would probably just complicate the code in the
   meantime...

b) stick with the current approach for now, and consider a follow-up series
   to refactor both sev-es and non-sev-es as a whole that we can test
   separately.

c) refactor SEV-ES handling as part of this series. it's only a small change
   to the SEV-ES code but it re-orders enough things around that I'm
   concerned it might invalidate some of the internal testing we've done.
   whereas a follow-up refactoring such as the above options can be rolled
   into our internal testing so we can let our test teams re-verify

Obviously I prefer b) but I'm biased on the matter and fine with whatever
you and others think is best. I just wanted to point out my concerns with
the various options.

-Mike
Michael Roth Dec. 16, 2020, 3:12 p.m. UTC | #9
On Tue, Dec 15, 2020 at 12:17:47PM -0600, Michael Roth wrote:
> On Mon, Dec 14, 2020 at 02:29:46PM -0800, Andy Lutomirski wrote:
> > 
> > 
> > > On Dec 14, 2020, at 2:02 PM, Michael Roth <michael.roth@amd.com> wrote:
> > > 
> > > On Mon, Dec 14, 2020 at 11:38:23AM -0800, Sean Christopherson wrote:
> > >> +Andy, who provided a lot of feedback on v1.
> > >> On Mon, Dec 14, 2020, Michael Roth wrote:
> > >> Cc: Andy Lutomirski <luto@kernel.org>
> > >>> Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
> > >>> Signed-off-by: Michael Roth <michael.roth@amd.com>
> > >>> ---
> > >>> v2:
> > >>> * rebase on latest kvm/next
> > >>> * move VMLOAD to just after vmexit so we can use it to handle all FS/GS
> > >>> host state restoration and rather than relying on loadsegment() and
> > >>> explicit write to MSR_GS_BASE (Andy)
> > >>> * drop 'host' field from struct vcpu_svm since it is no longer needed
> > >>> for storing FS/GS/LDT state (Andy)
> > >>> ---
> > >>> arch/x86/kvm/svm/svm.c | 44 ++++++++++++++++--------------------------
> > >>> arch/x86/kvm/svm/svm.h | 14 +++-----------
> > >>> 2 files changed, 20 insertions(+), 38 deletions(-)
> > >>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > >>> index 0e52fac4f5ae..fb15b7bd461f 100644
> > >>> --- a/arch/x86/kvm/svm/svm.c
> > >>> +++ b/arch/x86/kvm/svm/svm.c
> > >>> @@ -1367,15 +1367,19 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> > >>>       vmcb_mark_all_dirty(svm->vmcb);
> > >>>   }
> > >>> -#ifdef CONFIG_X86_64
> > >>> -    rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
> > >>> -#endif
> > >>> -    savesegment(fs, svm->host.fs);
> > >>> -    savesegment(gs, svm->host.gs);
> > >>> -    svm->host.ldt = kvm_read_ldt();
> > >>> -
> > >>> -    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> > >>> +    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> > >>>       rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> > >>> +    }
> > > 
> > > Hi Sean,
> > > 
> > > Hopefully I've got my email situation sorted out now...
> > > 
> > >> Unnecessary change that violates preferred coding style.  Checkpatch explicitly
> > >> complains about this.
> > >> WARNING: braces {} are not necessary for single statement blocks
> > >> #132: FILE: arch/x86/kvm/svm/svm.c:1370:
> > >> +    for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
> > >>       rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> > >> +
> > > 
> > > Sorry, that was an artifact from an earlier version of the patch that I
> > > failed to notice. I'll make sure to run everything through checkpatch
> > > going forward.
> > > 
> > >>> +
> > >>> +    asm volatile(__ex("vmsave")
> > >>> +             : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
> > >> I'm pretty sure this can be page_to_phys().
> > >>> +             : "memory");
> > >> I think we can defer this until we're actually planning on running the guest,
> > >> i.e. put this in svm_prepare_guest_switch().
> > > 
> > > One downside to that is that we'd need to do the VMSAVE on every
> > > iteration of vcpu_run(), as opposed to just once when we enter from
> > > userspace via KVM_RUN. It ends up being a similar situation to Andy's
> > > earlier suggestion of moving VMLOAD just after vmexit, but in that case
> > > we were able to remove an MSR write to MSR_GS_BASE, which cancelled out
> > > the overhead, but in this case I think it could only cost us extra.
> > 
> > If you want to micro-optimize, there is a trick you could play: use WRGSBASE if available.  If X86_FEATURE_GSBASE is available, you could use WRGSBASE to restore GSBASE and defer VMLOAD to vcpu_put().  This would need benchmarking on Zen 3 to see if it’s worthwhile.
> 
> I'll give this a try. The vmsave only seems to be 100-200 cycles in and of itself so
> I'm not sure there's much to be gained, but would be good to know either
> way.

It looks like it does save us ~20-30 cycles vs. vmload, but maybe not
enough to justify the added complexity. Additionally, since we still
need to call vmload when we exit to userspace, it ends up being a bit
slower for this particular workload at least. So for now I'll plan on
sticking to vmload'ing after vmexit and moving that to the asm code
if there are no objections.

current v2 patch, sample 1
  ioctl entry: 1204722748832
  pre-run:     1204722749408 ( +576)
  post-run:    1204722750784 (+1376)
  ioctl exit:  1204722751360 ( +576)
  total cycles:         2528

current v2 patch, sample 2
  ioctl entry: 1204722754784
  pre-vmrun:   1204722755360 ( +576)
  post-vmrun:  1204722756720 (+1360)
  ioctl exit:  1204722757312 ( +592)
  total cycles          2528

wrgsbase, sample 1
  ioctl entry: 1346624880336
  pre-vmrun:   1346624880912 ( +576)
  post-vmrun:  1346624882256 (+1344)
  ioctl exit:  1346624882912 ( +656)
  total cycles          2576

wrgsbase, sample 2
  ioctl entry: 1346624886272
  pre-vmrun:   1346624886832 ( +560)
  post-vmrun:  1346624888176 (+1344)
  ioctl exit:  1346624888816 ( +640)
  total cycles:         2544
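
For reference, a minimal sketch of how deltas like these can be captured
(illustrative only, not taken from the posted patch; the placement around
kvm_arch_vcpu_ioctl_run() is an assumption):

	static inline u64 cycles_now(void)
	{
		u32 lo, hi, aux;

		/* rdtscp: read TSC after earlier instructions retire; aux gets IA32_TSC_AUX */
		asm volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
		return ((u64)hi << 32) | lo;
	}

	/* e.g. around the vcpu run path: */
	u64 t0 = cycles_now();
	long r = kvm_arch_vcpu_ioctl_run(vcpu);

	trace_printk("KVM_RUN cycles: %llu (ret %ld)\n", cycles_now() - t0, r);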

> 
> > 
> >
Paolo Bonzini Dec. 16, 2020, 3:23 p.m. UTC | #10
On 16/12/20 16:12, Michael Roth wrote:
> It looks like it does save us ~20-30 cycles vs. vmload, but maybe not
> enough to justify the added complexity. Additionally, since we still
> need to call vmload when we exit to userspace, it ends up being a bit
> slower for this particular workload at least. So for now I'll plan on
> sticking to vmload'ing after vmexit and moving that to the asm code
> if there are no objections.

Yeah, agreed.  BTW you can use "./x86/run x86/vmexit.flat" from 
kvm-unit-tests to check the numbers for a wide range of vmexit paths.

Paolo

> current v2 patch, sample 1
>    ioctl entry: 1204722748832
>    pre-run:     1204722749408 ( +576)
>    post-run:    1204722750784 (+1376)
>    ioctl exit:  1204722751360 ( +576)
>    total cycles:         2528
> 
> current v2 patch, sample 2
>    ioctl entry: 1204722754784
>    pre-vmrun:   1204722755360 ( +576)
>    post-vmrun:  1204722756720 (+1360)
>    ioctl exit:  1204722757312 ( +592)
>    total cycles          2528
> 
> wrgsbase, sample 1
>    ioctl entry: 1346624880336
>    pre-vmrun:   1346624880912 ( +576)
>    post-vmrun:  1346624882256 (+1344)
>    ioctl exit:  1346624882912 ( +656)
>    total cycles          2576
> 
> wrgsbase, sample 2
>    ioctl entry: 1346624886272
>    pre-vmrun:   1346624886832 ( +560)
>    post-vmrun:  1346624888176 (+1344)
>    ioctl exit:  1346624888816 ( +640)
>    total cycles:         2544
>
Michael Roth Dec. 16, 2020, 5:07 p.m. UTC | #11
On Wed, Dec 16, 2020 at 04:23:22PM +0100, Paolo Bonzini wrote:
> On 16/12/20 16:12, Michael Roth wrote:
> > It looks like it does save us ~20-30 cycles vs. vmload, but maybe not
> > enough to justify the added complexity. Additionally, since we still
> > need to call vmload when we exit to userspace, it ends up being a bit
> > slower for this particular workload at least. So for now I'll plan on
> > sticking to vmload'ing after vmexit and moving that to the asm code
> > if there are no objections.
> 
> Yeah, agreed.  BTW you can use "./x86/run x86/vmexit.flat" from
> kvm-unit-tests to check the numbers for a wide range of vmexit paths.

Wasn't aware of that, this looks really useful. Thanks!

> 
> Paolo
> 
> > current v2 patch, sample 1
> >    ioctl entry: 1204722748832
> >    pre-run:     1204722749408 ( +576)
> >    post-run:    1204722750784 (+1376)
> >    ioctl exit:  1204722751360 ( +576)
> >    total cycles:         2528
> > 
> > current v2 patch, sample 2
> >    ioctl entry: 1204722754784
> >    pre-vmrun:   1204722755360 ( +576)
> >    post-vmrun:  1204722756720 (+1360)
> >    ioctl exit:  1204722757312 ( +592)
> >    total cycles          2528
> > 
> > wrgsbase, sample 1
> >    ioctl entry: 1346624880336
> >    pre-vmrun:   1346624880912 ( +576)
> >    post-vmrun:  1346624882256 (+1344)
> >    ioctl exit:  1346624882912 ( +656)
> >    total cycles          2576
> > 
> > wrgsbase, sample 2
> >    ioctl entry: 1346624886272
> >    pre-vmrun:   1346624886832 ( +560)
> >    post-vmrun:  1346624888176 (+1344)
> >    ioctl exit:  1346624888816 ( +640)
> >    total cycles:         2544
> > 
>
Sean Christopherson Dec. 16, 2020, 11:48 p.m. UTC | #12
On Tue, Dec 15, 2020, Michael Roth wrote:
> Hi Sean,
> 
> Sorry to reply out-of-thread, our mail server is having issues with
> certain email addresses at the moment so I only see your message via
> the archives atm. But regarding:
> 
> >>> I think we can defer this until we're actually planning on running
> >>> the guest,
> >>> i.e. put this in svm_prepare_guest_switch().
> >>
> >> It looks like the SEV-ES patches might land before this one, and those
> >> introduce similar handling of VMSAVE in svm_vcpu_load(), so I think it
> >> might also create some churn there if we take this approach and want
> >> to keep the SEV-ES and non-SEV-ES handling similar.
> >
> >Hmm, I'll make sure to pay attention to that when I review the SEV-ES
> >patches,
> >which I was hoping to get to today, but that's looking unlikely at this
> >point.
> 
> It looks like SEV-ES patches are queued now. Those patches have
> undergone a lot of internal testing so I'm really hesitant to introduce
> any significant change to those at this stage as a prereq for my little
> patch. So for v3 I'm a little unsure how best to approach this.
> 
> The main options are:
> 
> a) go ahead and move the vmsave handling for non-sev-es case into
>    prepare_guest_switch() as you suggested, but leave the sev-es where
>    they are. then we can refactor those as a follow-up patch that can be
>    tested/reviewed as a separate series after we've had some time to
>    re-test, though that would probably just complicate the code in the
>    meantime...
> 
> b) stick with the current approach for now, and consider a follow-up series
>    to refactor both sev-es and non-sev-es as a whole that we can test
>    separately.
> 
> c) refactor SEV-ES handling as part of this series. it's only a small change
>    to the SEV-ES code but it re-orders enough things around that I'm
>    concerned it might invalidate some of the internal testing we've done.
>    whereas a follow-up refactoring such as the above options can be rolled
>    into our internal testing so we can let our test teams re-verify
> 
> Obviously I prefer b) but I'm biased on the matter and fine with whatever
> you and others think is best. I just wanted to point out my concerns with
> the various options.

Definitely (c).  This has already missed 5.11 (unless Paolo plans on shooting
from the hip), which means SEV-ES will get to enjoy a full (LTS) kernel release
before these optimizations take effect.

And, the series can be structured so that the optimization (VMSAVE during
.prepare_guest_switch()) is done in a separate patch.  That way, if it does
break SEV-ES (or legacy VMs), the optimized variant can be easily bisected and
fixed or reverted as needed.  E.g. first convert legacy VMs to use VMSAVE+VMLOAD,
possibly consolidating code along the way, then convert all VM types to do
VMSAVE during .prepare_guest_switch().
Paolo Bonzini Dec. 17, 2020, 8:29 a.m. UTC | #13
On 17/12/20 00:48, Sean Christopherson wrote:
>> c) refactor SEV-ES handling as part of this series. it's only a small change
>>     to the SEV-ES code but it re-orders enough things around that I'm
>>     concerned it might invalidate some of the internal testing we've done.
>>     whereas a follow-up refactoring such as the above options can be rolled
>>     into our internal testing so we can let our test teams re-verify
>>
>> Obviously I prefer b) but I'm biased on the matter and fine with whatever
>> you and others think is best. I just wanted to point out my concerns with
>> the various options.
> Definitely (c).  This has already missed 5.11 (unless Paolo plans on shooting
> from the hip),

No, 5.11 is more or less done as far as x86 is concerned.  I'm sending 
the PR to Linus right now.

Paolo

> which means SEV-ES will get to enjoy a full (LTS) kernel release
> before these optimizations take effect.

Patch

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 0e52fac4f5ae..fb15b7bd461f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1367,15 +1367,19 @@  static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		vmcb_mark_all_dirty(svm->vmcb);
 	}
 
-#ifdef CONFIG_X86_64
-	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
-	savesegment(fs, svm->host.fs);
-	savesegment(gs, svm->host.gs);
-	svm->host.ldt = kvm_read_ldt();
-
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
 		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+	}
+
+	asm volatile(__ex("vmsave")
+		     : : "a" (page_to_pfn(sd->save_area) << PAGE_SHIFT)
+		     : "memory");
+	/*
+	 * Store a pointer to the save area so we can access it after
+	 * vmexit for vmload. This is needed since per-cpu accesses
+	 * won't be available until GS is restored as part of vmload
+	 */
+	svm->host_save_area = sd->save_area;
 
 	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
@@ -1403,18 +1407,10 @@  static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	avic_vcpu_put(vcpu);
 
 	++vcpu->stat.host_state_reload;
-	kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
-	loadsegment(fs, svm->host.fs);
-	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
-	load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
-	loadsegment(gs, svm->host.gs);
-#endif
-#endif
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+
+	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+	}
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -3507,14 +3503,8 @@  static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 	__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
 
-#ifdef CONFIG_X86_64
-	native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
-	loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
-	loadsegment(gs, svm->host.gs);
-#endif
-#endif
+	asm volatile(__ex("vmload")
+		     : : "a" (page_to_pfn(svm->host_save_area) << PAGE_SHIFT));
 
 	/*
 	 * VMEXIT disables interrupts (host state), but tracing and lockdep
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index fdff76eb6ceb..bf01a8c19ec0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -21,11 +21,6 @@ 
 #include <asm/svm.h>
 
 static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-	MSR_FS_BASE,
-#endif
-	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_TSC_AUX,
 };
 
@@ -117,12 +112,9 @@  struct vcpu_svm {
 	u64 next_rip;
 
 	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-	struct {
-		u16 fs;
-		u16 gs;
-		u16 ldt;
-		u64 gs_base;
-	} host;
+
+	/* set by vcpu_load(), for use when per-cpu accesses aren't available */
+	struct page *host_save_area;
 
 	u64 spec_ctrl;
 	/*