
[v4] kvm/fpu: Enable fully eager restore kvm FPU

Message ID 1348457323-22616-1-git-send-email-xudong.hao@intel.com (mailing list archive)
State New, archived

Commit Message

Hao, Xudong Sept. 24, 2012, 3:28 a.m. UTC
Enable fully eager restore of the KVM FPU if there is FPU state that isn't
tracked by the CR0.TS bit.

v4 changes from v3:
- Wrap the confusing code in a clear function lazy_fpu_allowed()
- Update the FPU state when CR4 is updated too.

v3 changes from v2:
- Make the FPU active explicitly when guest xsave is enabled and a non-lazy
xstate bit exists.

v2 changes from v1:
- Expand KVM_XSTATE_LAZY to 64 bits before negating it.

Signed-off-by: Xudong Hao <xudong.hao@intel.com>
---
 arch/x86/include/asm/kvm.h      |    4 ++++
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/svm.c              |    1 +
 arch/x86/kvm/vmx.c              |    1 +
 arch/x86/kvm/x86.c              |   32 +++++++++++++++++++++++++++++++-
 5 files changed, 38 insertions(+), 1 deletions(-)
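
As a self-contained illustration of the predicate this series introduces, the
userspace model below mirrors lazy_fpu_allowed(): lazy restore is only safe
when the guest either has xsave disabled or enables nothing in XCR0 beyond the
CR0.TS-tracked states. (The XSTATE_* values follow asm/xsave.h; bit 3 is just
a hypothetical non-lazy state used for the example.)

/* Userspace model of lazy_fpu_allowed(); not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define XSTATE_FP       (1ULL << 0)
#define XSTATE_SSE      (1ULL << 1)
#define XSTATE_YMM      (1ULL << 2)
#define KVM_XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)

static bool lazy_fpu_allowed(bool guest_osxsave, uint64_t guest_xcr0)
{
        /* 1) xsave isn't enabled in the guest, or
         * 2) every enabled xstate component is tracked by CR0.TS */
        return !guest_osxsave || !(guest_xcr0 & ~KVM_XSTATE_LAZY);
}

int main(void)
{
        printf("%d\n", lazy_fpu_allowed(false, 0));                     /* 1: no xsave */
        printf("%d\n", lazy_fpu_allowed(true, XSTATE_FP | XSTATE_SSE)); /* 1: TS-tracked only */
        printf("%d\n", lazy_fpu_allowed(true, KVM_XSTATE_LAZY | (1ULL << 3))); /* 0: non-lazy state */
        return 0;
}
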

Comments

Avi Kivity Sept. 24, 2012, 2:16 p.m. UTC | #1
On 09/24/2012 05:28 AM, Xudong Hao wrote:
> Enable fully eager restore of the KVM FPU if there is FPU state that isn't
> tracked by the CR0.TS bit.
> 
> v4 changes from v3:
> - Wrap the confusing code in a clear function lazy_fpu_allowed()
> - Update the FPU state when CR4 is updated too.
> 
> v3 changes from v2:
> - Make the FPU active explicitly when guest xsave is enabled and a non-lazy
> xstate bit exists.
> 
> v2 changes from v1:
> - Expand KVM_XSTATE_LAZY to 64 bits before negating it.
>  
>  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
>  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 818fceb..fbdb44a 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -2941,6 +2941,7 @@ static int cr_interception(struct vcpu_svm *svm)
>  			break;
>  		case 4:
>  			err = kvm_set_cr4(&svm->vcpu, val);
> +			update_lazy_fpu(vcpu);
>  			break;
>  		case 8:
>  			err = kvm_set_cr8(&svm->vcpu, val);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 30bcb95..b3880c0 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4488,6 +4488,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>  			return 1;
>  		case 4:
>  			err = handle_set_cr4(vcpu, val);
> +			update_lazy_fpu(vcpu);
>  			kvm_complete_insn_gp(vcpu, err);
>  			return 1;

Why not in kvm_set_cr4()?
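
For reference, what Avi is suggesting would look roughly like the sketch below
(hypothetical placement; the real kvm_set_cr4() body, elided here, validates
the new value first):

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        /* ... existing checks and the kvm_x86_ops->set_cr4(vcpu, cr4) call ... */
        update_lazy_fpu(vcpu);  /* one call site covers both VMX and SVM */
        return 0;
}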


>  		case 8: {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fc2a0a1..2e14cec 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -544,6 +544,31 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
>  }
>  EXPORT_SYMBOL_GPL(kvm_lmsw);
>  
> +/*
> + * KVM triggers FPU restore via #NM (through CR0.TS), but
> + * only XCR0.bit0, XCR0.bit1 and XCR0.bit2 are tracked by
> + * the TS bit; there may be other FPU state that is not
> + * tracked by it.
> + * lazy_fpu_allowed() returns true in either of the
> + * following cases: 1) xsave isn't enabled in the guest;
> + * 2) all guest FPU state can be tracked by the TS bit.
> + */
> +static bool lazy_fpu_allowed(struct kvm_vcpu *vcpu)
> +{
> +	return !!(!kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) ||
> +			!(vcpu->arch.xcr0 & ~((u64)KVM_XSTATE_LAZY)));
> +}

!! is !needed, bool conversion takes care of it.
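
A standalone illustration of the point (plain userspace C, not kernel code):
conversion to bool already normalizes any nonzero value to 1, so the explicit
!! adds nothing.

#include <stdbool.h>
#include <stdio.h>

static bool with_bangs(unsigned long v)    { return !!v; }
static bool without_bangs(unsigned long v) { return v; } /* same result */

int main(void)
{
        /* both print 1 for any nonzero input */
        printf("%d %d\n", with_bangs(0x40), without_bangs(0x40));
        return 0;
}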

> +
> +void update_lazy_fpu(struct kvm_vcpu *vcpu)
> +{
> +	if (lazy_fpu_allowed(vcpu)) {
> +		vcpu->fpu_active = 0;
> +		kvm_x86_ops->fpu_deactivate(vcpu);
> +	}

There is no need to deactivate the fpu here.  We try to deactivate the
fpu as late as possible, preempt notifiers or vcpu_put will do that for us.
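
In other words, the function could be reduced to forcing activation only,
leaving deactivation to the existing lazy path (a sketch of what this comment
implies, not a tested change):

void update_lazy_fpu(struct kvm_vcpu *vcpu)
{
        /* deactivation happens later via preempt notifiers or vcpu_put */
        if (!lazy_fpu_allowed(vcpu))
                kvm_x86_ops->fpu_activate(vcpu);
}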

> +	else
> +		kvm_x86_ops->fpu_activate(vcpu);
> +}
> +
>  int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
>  {
>  	u64 xcr0;
> @@ -571,6 +596,7 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
>  		kvm_inject_gp(vcpu, 0);
>  		return 1;
>  	}
> +	update_lazy_fpu(vcpu);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_xcr);
> @@ -5985,7 +6011,11 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
>  	vcpu->guest_fpu_loaded = 0;
>  	fpu_save_init(&vcpu->arch.guest_fpu);
>  	++vcpu->stat.fpu_reload;
> -	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
> +	/*
> +	 * Only request FPU deactivation when lazy restore is allowed.
> +	 */
> +	if (lazy_fpu_allowed(vcpu))
> +		kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
>  	trace_kvm_fpu(0);
>  }
>  
> 

btw, it is clear that long term the fpu will always be eagerly loaded,
as hosts and guests (and hardware) are updated.  At that time it will
make sense to remove the lazy fpu code entirely.  But maybe that time is
here already, since exits are rare and so the guest has a lot of chance
to use the fpu, so eager fpu saves the #NM vmexit.

Can you check a kernel compile on a westmere system?  If eager fpu is
faster there than lazy fpu, we can just make the fpu always eager and
remove quite a bit of code.

It will slow down guests not using in-kernel apic, or guests that just
process interrupts in the kernel and then HLT, or maybe i386 guests, but
I think it's worth it.
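
The always-eager endgame described here would amount to dropping the
deactivate request from kvm_put_guest_fpu() altogether; a hypothetical hunk
(not part of this series) might read:

@@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
-	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	trace_kvm_fpu(0);
 }
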
Hao, Xudong Sept. 25, 2012, 2:32 a.m. UTC | #2
> -----Original Message-----
> From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On
> Behalf Of Avi Kivity
> Sent: Monday, September 24, 2012 10:17 PM
> To: Hao, Xudong
> Cc: kvm@vger.kernel.org; Zhang, Xiantao
> Subject: Re: [PATCH v4] kvm/fpu: Enable fully eager restore kvm FPU
> 
> On 09/24/2012 05:28 AM, Xudong Hao wrote:
> > [...]
> 
> btw, it is clear that long term the fpu will always be eagerly loaded,
> as hosts and guests (and hardware) are updated.  At that time it will
> make sense to remove the lazy fpu code entirely.  But maybe that time is
> here already, since exits are rare and so the guest has a lot of chance
> to use the fpu, so eager fpu saves the #NM vmexit.
> 
> Can you check a kernel compile on a westmere system?  If eager fpu is
> faster there than lazy fpu, we can just make the fpu always eager and
> remove quite a bit of code.
> 
I remember Westmere does not support XSAVE; do you want the performance of fxsave/fxrstor?

> It will slow down guests not using in-kernel apic, or guests that just
> process interrupts in the kernel and then HLT, or maybe i386 guests, but
> I think it's worth it.
> 
> --
> error compiling committee.c: too many arguments to function
Avi Kivity Sept. 25, 2012, 8:16 a.m. UTC | #3
On 09/25/2012 04:32 AM, Hao, Xudong wrote:
> > 
> > btw, it is clear that long term the fpu will always be eagerly loaded,
> > as hosts and guests (and hardware) are updated.  At that time it will
> > make sense to remove the lazy fpu code entirely.  But maybe that time is
> > here already, since exits are rare and so the guest has a lot of chance
> > to use the fpu, so eager fpu saves the #NM vmexit.
> > 
> > Can you check a kernel compile on a westmere system?  If eager fpu is
> > faster there than lazy fpu, we can just make the fpu always eager and
> > remove quite a bit of code.
> > 
> I remember Westmere does not support XSAVE; do you want the performance of fxsave/fxrstor?

Yes. If a Westmere is fast enough then we can probably justify it. If
you can run tests on Sandy/Ivy Bridge, even better.
Hao, Xudong Sept. 26, 2012, 5:54 a.m. UTC | #4
> -----Original Message-----
> From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On
> Behalf Of Avi Kivity
> Sent: Tuesday, September 25, 2012 4:16 PM
> To: Hao, Xudong
> Cc: kvm@vger.kernel.org; Zhang, Xiantao
> Subject: Re: [PATCH v4] kvm/fpu: Enable fully eager restore kvm FPU
> 
> On 09/25/2012 04:32 AM, Hao, Xudong wrote:
> > > [...]
> > I remember Westmere does not support XSAVE; do you want the performance of
> fxsave/fxrstor?
> 
> Yes. If a Westmere is fast enough then we can probably justify it. If
> you can run tests on Sandy/Ivy Bridge, even better.
> 
I ran a kernel compile on Westmere; eager FPU is about 0.4% faster. Eager restore doesn't seem to help much, so should we keep lazy FPU for the lazy-allowed FPU states?

-Xudong
Avi Kivity Sept. 27, 2012, 10:11 a.m. UTC | #5
On 09/26/2012 07:54 AM, Hao, Xudong wrote:
>> -----Original Message-----
>> From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On
>> Behalf Of Avi Kivity
>> Sent: Tuesday, September 25, 2012 4:16 PM
>> To: Hao, Xudong
>> Cc: kvm@vger.kernel.org; Zhang, Xiantao
>> Subject: Re: [PATCH v4] kvm/fpu: Enable fully eager restore kvm FPU
>> 
>> On 09/25/2012 04:32 AM, Hao, Xudong wrote:
>> > > [...]
>> > I remember Westmere does not support XSAVE; do you want the performance of
>> fxsave/fxrstor?
>> 
>> Yes. If a Westmere is fast enough then we can probably justify it. If
>> you can run tests on Sandy/Ivy Bridge, even better.
>> 
> I ran a kernel compile on Westmere; eager FPU is about 0.4% faster. Eager restore doesn't seem to help much, so should we keep lazy FPU for the lazy-allowed FPU states?

Why not make it eager all the time then?  It will simplify the code
quite a bit, no?

All I was looking for was no regressions, a small speedup is just a bonus.
Hao, Xudong Sept. 28, 2012, 9:18 a.m. UTC | #6
> -----Original Message-----
> From: Avi Kivity [mailto:avi@redhat.com]
> Sent: Thursday, September 27, 2012 6:12 PM
> To: Hao, Xudong
> Cc: kvm@vger.kernel.org; Zhang, Xiantao
> Subject: Re: [PATCH v4] kvm/fpu: Enable fully eager restore kvm FPU
> 
> On 09/26/2012 07:54 AM, Hao, Xudong wrote:
> >> [...]
> > I ran a kernel compile on Westmere; eager FPU is about 0.4% faster. Eager
> > restore doesn't seem to help much, so should we keep lazy FPU for the
> > lazy-allowed FPU states?
> 
> Why not make it eager all the time then?  It will simplify the code
> quite a bit, no?
> 
The code will be simpler if we make it eager; I'll remove the lazy logic.


Patch

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 521bf25..4c27056 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -8,6 +8,8 @@ 
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <asm/user.h>
+#include <asm/xsave.h>
 
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_PIT
@@ -30,6 +32,8 @@ 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
 
+#define KVM_XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+
 struct kvm_memory_alias {
 	__u32 slot;  /* this has a different namespace than memory slots */
 	__u32 flags;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0b902c9..25fef9e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -826,6 +826,7 @@  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+void update_lazy_fpu(struct kvm_vcpu *vcpu);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 818fceb..fbdb44a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2941,6 +2941,7 @@  static int cr_interception(struct vcpu_svm *svm)
 			break;
 		case 4:
 			err = kvm_set_cr4(&svm->vcpu, val);
+			update_lazy_fpu(vcpu);
 			break;
 		case 8:
 			err = kvm_set_cr8(&svm->vcpu, val);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 30bcb95..b3880c0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4488,6 +4488,7 @@  static int handle_cr(struct kvm_vcpu *vcpu)
 			return 1;
 		case 4:
 			err = handle_set_cr4(vcpu, val);
+			update_lazy_fpu(vcpu);
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 8: {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc2a0a1..2e14cec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -544,6 +544,31 @@  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
+/*
+ * KVM triggers FPU restore via #NM (through CR0.TS), but
+ * only XCR0.bit0, XCR0.bit1 and XCR0.bit2 are tracked by
+ * the TS bit; there may be other FPU state that is not
+ * tracked by it.
+ * lazy_fpu_allowed() returns true in either of the
+ * following cases: 1) xsave isn't enabled in the guest;
+ * 2) all guest FPU state can be tracked by the TS bit.
+ */
+static bool lazy_fpu_allowed(struct kvm_vcpu *vcpu)
+{
+	return !!(!kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) ||
+			!(vcpu->arch.xcr0 & ~((u64)KVM_XSTATE_LAZY)));
+}
+
+void update_lazy_fpu(struct kvm_vcpu *vcpu)
+{
+	if (lazy_fpu_allowed(vcpu)) {
+		vcpu->fpu_active = 0;
+		kvm_x86_ops->fpu_deactivate(vcpu);
+	}
+	else
+		kvm_x86_ops->fpu_activate(vcpu);
+}
+
 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
 	u64 xcr0;
@@ -571,6 +596,7 @@  int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 		kvm_inject_gp(vcpu, 0);
 		return 1;
 	}
+	update_lazy_fpu(vcpu);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
@@ -5985,7 +6011,11 @@  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
-	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+	/*
+	 * Only request FPU deactivation when lazy restore is allowed.
+	 */
+	if (lazy_fpu_allowed(vcpu))
+		kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	trace_kvm_fpu(0);
 }