diff mbox series

[v2,03/49] KVM: x86: Account for KVM-reserved CR4 bits when passing through CR4 on VMX

Message ID 20240517173926.965351-4-seanjc@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: x86: CPUID overhaul, fixes, and caching | expand

Commit Message

Sean Christopherson May 17, 2024, 5:38 p.m. UTC
Drop x86.c's local pre-computed cr4_reserved bits and instead fold KVM's
reserved bits into the guest's reserved bits.  This fixes a bug where VMX's
set_cr4_guest_host_mask() fails to account for KVM-reserved bits when
deciding which bits can be passed through to the guest.  In most cases,
letting the guest directly write reserved CR4 bits is ok, i.e. attempting
to set the bit(s) will still #GP, but not if a feature is available in
hardware but explicitly disabled by the host, e.g. if FSGSBASE support is
disabled via "nofsgsbase".

Note, the extra overhead of computing host reserved bits every time
userspace sets guest CPUID is negligible.  The feature bits that are
queried are packed nicely into a handful of words, and so checking and
setting each reserved bit costs in the neighborhood of ~5 cycles, i.e. the
total cost will be in the noise even if the number of checked CR4 bits
doubles over the next few years.  In other words, x86 will run out of CR4
bits long before the overhead becomes problematic.

Note #2, __cr4_reserved_bits() starts from CR4_RESERVED_BITS, which is
why the existing __kvm_cpu_cap_has() processing doesn't explicitly OR in
CR4_RESERVED_BITS (and why the new code doesn't do so either).

Fixes: 2ed41aa631fc ("KVM: VMX: Intercept guest reserved CR4 bits to inject #GP fault")
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/cpuid.c | 7 +++++--
 arch/x86/kvm/x86.c   | 9 ---------
 2 files changed, 5 insertions(+), 11 deletions(-)

Comments

Maxim Levitsky July 5, 2024, 12:55 a.m. UTC | #1
On Fri, 2024-05-17 at 10:38 -0700, Sean Christopherson wrote:
> Drop x86.c's local pre-computed cr4_reserved bits and instead fold KVM's
> reserved bits into the guest's reserved bits.  This fixes a bug where VMX's
> set_cr4_guest_host_mask() fails to account for KVM-reserved bits when
> deciding which bits can be passed through to the guest.  In most cases,
> letting the guest directly write reserved CR4 bits is ok, i.e. attempting
> to set the bit(s) will still #GP, but not if a feature is available in
> hardware but explicitly disabled by the host, e.g. if FSGSBASE support is
> disabled via "nofsgsbase".
> 
> Note, the extra overhead of computing host reserved bits every time
> userspace sets guest CPUID is negligible.  The feature bits that are
> queried are packed nicely into a handful of words, and so checking and
> setting each reserved bit costs in the neighborhood of ~5 cycles, i.e. the
> total cost will be in the noise even if the number of checked CR4 bits
> doubles over the next few years.  In other words, x86 will run out of CR4
> bits long before the overhead becomes problematic.

It might be just me, but IMHO this justification is confusing, leading me to belive that maybe
the code is on the hot-path instead.

The right justification should be just that this code is in kvm_vcpu_after_set_cpuid
is usually (*) only called once per vCPU (twice after your patch #1)

(*) Qemu also calls it, each time vCPU is hotplugged but this doesn't change anything
performance wise.

> 
> Note #2, __cr4_reserved_bits() starts from CR4_RESERVED_BITS, which is
> why the existing __kvm_cpu_cap_has() processing doesn't explicitly OR in
> CR4_RESERVED_BITS (and why the new code doesn't do so either).
> 
> Fixes: 2ed41aa631fc ("KVM: VMX: Intercept guest reserved CR4 bits to inject #GP fault")
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/cpuid.c | 7 +++++--
>  arch/x86/kvm/x86.c   | 9 ---------
>  2 files changed, 5 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index e60ffb421e4b..f756a91a3f2f 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -383,8 +383,11 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>  	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
>  
>  	kvm_pmu_refresh(vcpu);
> -	vcpu->arch.cr4_guest_rsvd_bits =
> -	    __cr4_reserved_bits(guest_cpuid_has, vcpu);
> +
> +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
> +	vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) |
> +					 __cr4_reserved_bits(guest_cpuid_has, vcpu);
> +#undef __kvm_cpu_cap_has
>  
>  	kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
>  						    vcpu->arch.cpuid_nent));
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 7adcf56bd45d..3f20de4368a6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -116,8 +116,6 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
>  static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
>  #endif
>  
> -static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
> -
>  #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
>  
>  #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
> @@ -1134,9 +1132,6 @@ EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
>  
>  bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  {
> -	if (cr4 & cr4_reserved_bits)
> -		return false;
> -
>  	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
>  		return false;
>  
> @@ -9831,10 +9826,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
>  	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
>  		kvm_caps.supported_xss = 0;
>  
> -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
> -	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
> -#undef __kvm_cpu_cap_has
> -
>  	if (kvm_caps.has_tsc_control) {
>  		/*
>  		 * Make sure the user can only configure tsc_khz values that


I mostly agree with this patch - caching always carries risks and when it doesn't
value performance wise, it should always be removed.


However I don't think that this patch fixes a bug as it claims:

This is the code prior to this patch:

kvm_x86_vendor_init ->

	r = ops->hardware_setup();
		svm_hardware_setup
			svm_set_cpu_caps + kvm_set_cpu_caps

		-- or --

		vmx_hardware_setup ->
			vmx_set_cpu_caps + + kvm_set_cpu_caps


	# read from 'kvm_cpu_caps'
	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);


AFAIK kvm cpu caps are never touched outside of svm_set_cpu_caps/vmx_hardware_setup
(they don't depend on some later post-processing, cpuid, etc).


In fact a good refactoring would to make kvm_cpu_caps const after this point,
using cast, assert or something like that.


This leads me to believe that cr4_reserved_bits is computed correctly.


I could be wrong, but then IMHO it is a very good idea to provide an explanation
on how this bug can happen.


Best regards,
	Maxim Levitsky
Sean Christopherson July 9, 2024, 7:58 p.m. UTC | #2
On Thu, Jul 04, 2024, Maxim Levitsky wrote:
> On Fri, 2024-05-17 at 10:38 -0700, Sean Christopherson wrote:
> > Drop x86.c's local pre-computed cr4_reserved bits and instead fold KVM's
> > reserved bits into the guest's reserved bits.  This fixes a bug where VMX's
> > set_cr4_guest_host_mask() fails to account for KVM-reserved bits when
> > deciding which bits can be passed through to the guest.  In most cases,
> > letting the guest directly write reserved CR4 bits is ok, i.e. attempting
> > to set the bit(s) will still #GP, but not if a feature is available in
> > hardware but explicitly disabled by the host, e.g. if FSGSBASE support is
> > disabled via "nofsgsbase".
> > 
> > Note, the extra overhead of computing host reserved bits every time
> > userspace sets guest CPUID is negligible.  The feature bits that are
> > queried are packed nicely into a handful of words, and so checking and
> > setting each reserved bit costs in the neighborhood of ~5 cycles, i.e. the
> > total cost will be in the noise even if the number of checked CR4 bits
> > doubles over the next few years.  In other words, x86 will run out of CR4
> > bits long before the overhead becomes problematic.
> 
> It might be just me, but IMHO this justification is confusing, leading me to
> belive that maybe the code is on the hot-path instead.
> 
> The right justification should be just that this code is in
> kvm_vcpu_after_set_cpuid is usually (*) only called once per vCPU (twice
> after your patch #1)

Ya.  I was trying to capture that even if that weren't true, i.e. even if userspace
was doing something odd, that the extra cost is irrelevant.  I'll expand and reword
the paragraph to make it clear this isn't a hot path for any sane userspace.

> (*) Qemu also calls it, each time vCPU is hotplugged but this doesn't change
> anything performance wise.

...

> > @@ -9831,10 +9826,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
> >  	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
> >  		kvm_caps.supported_xss = 0;
> >  
> > -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
> > -	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
> > -#undef __kvm_cpu_cap_has
> > -
> >  	if (kvm_caps.has_tsc_control) {
> >  		/*
> >  		 * Make sure the user can only configure tsc_khz values that
> 
> 
> I mostly agree with this patch - caching always carries risks and when it doesn't
> value performance wise, it should always be removed.
> 
> 
> However I don't think that this patch fixes a bug as it claims:
> 
> This is the code prior to this patch:
> 
> kvm_x86_vendor_init ->
> 
> 	r = ops->hardware_setup();
> 		svm_hardware_setup
> 			svm_set_cpu_caps + kvm_set_cpu_caps
> 
> 		-- or --
> 
> 		vmx_hardware_setup ->
> 			vmx_set_cpu_caps + + kvm_set_cpu_caps
> 
> 
> 	# read from 'kvm_cpu_caps'
> 	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
> 
> 
> AFAIK kvm cpu caps are never touched outside of svm_set_cpu_caps/vmx_hardware_setup
> (they don't depend on some later post-processing, cpuid, etc).
> 
> In fact a good refactoring would to make kvm_cpu_caps const after this point,
> using cast, assert or something like that.
> 
> This leads me to believe that cr4_reserved_bits is computed correctly.

cr4_reserved_bits is computed correctly.  The bug is that cr4_reserved_bits isn't
consulted by set_cr4_guest_host_mask(), which is what I meant by "KVM-reserved
bits" in the changelog.

> I could be wrong, but then IMHO it is a very good idea to provide an explanation
> on how this bug can happen.

The first paragraph of the changelog tries to do that, and I'm struggling to come
up with different wording that makes it more clear what's wrong.  Any ideas/suggestions?
Maxim Levitsky July 24, 2024, 5:28 p.m. UTC | #3
On Tue, 2024-07-09 at 12:58 -0700, Sean Christopherson wrote:
> On Thu, Jul 04, 2024, Maxim Levitsky wrote:
> > On Fri, 2024-05-17 at 10:38 -0700, Sean Christopherson wrote:
> > > Drop x86.c's local pre-computed cr4_reserved bits and instead fold KVM's
> > > reserved bits into the guest's reserved bits.  This fixes a bug where VMX's
> > > set_cr4_guest_host_mask() fails to account for KVM-reserved bits when
> > > deciding which bits can be passed through to the guest.  In most cases,
> > > letting the guest directly write reserved CR4 bits is ok, i.e. attempting
> > > to set the bit(s) will still #GP, but not if a feature is available in
> > > hardware but explicitly disabled by the host, e.g. if FSGSBASE support is
> > > disabled via "nofsgsbase".
> > > 
> > > Note, the extra overhead of computing host reserved bits every time
> > > userspace sets guest CPUID is negligible.  The feature bits that are
> > > queried are packed nicely into a handful of words, and so checking and
> > > setting each reserved bit costs in the neighborhood of ~5 cycles, i.e. the
> > > total cost will be in the noise even if the number of checked CR4 bits
> > > doubles over the next few years.  In other words, x86 will run out of CR4
> > > bits long before the overhead becomes problematic.
> > 
> > It might be just me, but IMHO this justification is confusing, leading me to
> > belive that maybe the code is on the hot-path instead.
> > 
> > The right justification should be just that this code is in
> > kvm_vcpu_after_set_cpuid is usually (*) only called once per vCPU (twice
> > after your patch #1)
> 
> Ya.  I was trying to capture that even if that weren't true, i.e. even if userspace
> was doing something odd, that the extra cost is irrelevant.  I'll expand and reword
> the paragraph to make it clear this isn't a hot path for any sane userspace.
Thank you!

> 
> > (*) Qemu also calls it, each time vCPU is hotplugged but this doesn't change
> > anything performance wise.
> 
> ...
> 
> > > @@ -9831,10 +9826,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
> > >  	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
> > >  		kvm_caps.supported_xss = 0;
> > >  
> > > -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
> > > -	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
> > > -#undef __kvm_cpu_cap_has
> > > -
> > >  	if (kvm_caps.has_tsc_control) {
> > >  		/*
> > >  		 * Make sure the user can only configure tsc_khz values that
> > 
> > I mostly agree with this patch - caching always carries risks and when it doesn't
> > value performance wise, it should always be removed.
> > 
> > 
> > However I don't think that this patch fixes a bug as it claims:
> > 
> > This is the code prior to this patch:
> > 
> > kvm_x86_vendor_init ->
> > 
> > 	r = ops->hardware_setup();
> > 		svm_hardware_setup
> > 			svm_set_cpu_caps + kvm_set_cpu_caps
> > 
> > 		-- or --
> > 
> > 		vmx_hardware_setup ->
> > 			vmx_set_cpu_caps + + kvm_set_cpu_caps
> > 
> > 
> > 	# read from 'kvm_cpu_caps'
> > 	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
> > 
> > 
> > AFAIK kvm cpu caps are never touched outside of svm_set_cpu_caps/vmx_hardware_setup
> > (they don't depend on some later post-processing, cpuid, etc).
> > 
> > In fact a good refactoring would to make kvm_cpu_caps const after this point,
> > using cast, assert or something like that.
> > 
> > This leads me to believe that cr4_reserved_bits is computed correctly.
> 
> cr4_reserved_bits is computed correctly.  The bug is that cr4_reserved_bits isn't
> consulted by set_cr4_guest_host_mask(), which is what I meant by "KVM-reserved
> bits" in the changelog.

Ah, I see it now.

I also see that set_cr4_guest_host_mask, limits the guest owned bits to a small
whitelist, and none of these bits looks scary, so it all make sense that this
is mostly a theoretical bug, but for sure worth fixing.


> 
> > I could be wrong, but then IMHO it is a very good idea to provide an explanation
> > on how this bug can happen.
> 
> The first paragraph of the changelog tries to do that, and I'm struggling to come
> up with different wording that makes it more clear what's wrong.  Any ideas/suggestions?
> 

I also re-read it, and now it all makes sense. I guess I just somehow got fixed on
thinking that cr4_reserved_bits was not computed incorrectly rather than just
not used.
The comment indeed now makes sense to me, so let it be as it is.


Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>

Best regards,
	Maxim Levitsky
diff mbox series

Patch

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e60ffb421e4b..f756a91a3f2f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -383,8 +383,11 @@  void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 
 	kvm_pmu_refresh(vcpu);
-	vcpu->arch.cr4_guest_rsvd_bits =
-	    __cr4_reserved_bits(guest_cpuid_has, vcpu);
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+	vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) |
+					 __cr4_reserved_bits(guest_cpuid_has, vcpu);
+#undef __kvm_cpu_cap_has
 
 	kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
 						    vcpu->arch.cpuid_nent));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7adcf56bd45d..3f20de4368a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -116,8 +116,6 @@  u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #endif
 
-static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
-
 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
 
 #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
@@ -1134,9 +1132,6 @@  EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
 
 bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	if (cr4 & cr4_reserved_bits)
-		return false;
-
 	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
 		return false;
 
@@ -9831,10 +9826,6 @@  int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
 		kvm_caps.supported_xss = 0;
 
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
-	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
 	if (kvm_caps.has_tsc_control) {
 		/*
 		 * Make sure the user can only configure tsc_khz values that