
[v3,2/4] x86/speculation: Support "Enhanced IBRS" on future CPUs

Message ID 1519037457-7643-3-git-send-email-dwmw@amazon.co.uk (mailing list archive)
State New, archived

Commit Message

Woodhouse, David Feb. 19, 2018, 10:50 a.m. UTC
The original IBRS hack in microcode is horribly slow. For the next
generation of CPUs, as a stopgap until we get a proper fix, Intel
promise an "Enhanced IBRS" which will be fast.

The assumption is that predictions in the BTB/RSB will be tagged with
the VMX mode and ring that they were learned in, and thus the CPU will
avoid consuming unsafe predictions without a performance penalty.

Intel's documentation says that it is still required to set the IBRS bit
in the SPEC_CTRL MSR and ensure that it remains set.

Cope with this by trapping and emulating *all* access to SPEC_CTRL from
KVM guests when the IBRS_ALL feature is present, so it can never be
turned off. Guests who see IBRS_ALL should never do anything except
turn it on at boot anyway. And if they didn't know about IBRS_ALL and
they keep frobbing IBRS on every kernel entry/exit... well the vmexit
for a no-op is probably going to be faster than they were expecting
anyway, so they'll live.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
---
 arch/x86/include/asm/nospec-branch.h |  9 ++++++++-
 arch/x86/kernel/cpu/bugs.c           | 17 +++++++++++++++--
 arch/x86/kvm/vmx.c                   | 31 +++++++++++++++++++------------
 3 files changed, 42 insertions(+), 15 deletions(-)

Comments

Thomas Gleixner Feb. 20, 2018, 8:31 a.m. UTC | #1
On Mon, 19 Feb 2018, David Woodhouse wrote:
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index 0995c6a..34cbce3 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -141,9 +141,16 @@ enum spectre_v2_mitigation {
>  	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
>  	SPECTRE_V2_RETPOLINE_GENERIC,
>  	SPECTRE_V2_RETPOLINE_AMD,
> -	SPECTRE_V2_IBRS,
> +	SPECTRE_V2_IBRS_ALL,
>  };
>  
> +extern enum spectre_v2_mitigation spectre_v2_enabled;
> +
> +static inline bool spectre_v2_ibrs_all(void)
> +{
> +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> +}
> +
>  extern char __indirect_thunk_start[];
>  extern char __indirect_thunk_end[];
>  
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index bfca937..505c467 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -88,12 +88,14 @@ static const char *spectre_v2_strings[] = {
>  	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
>  	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
>  	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
> +	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
>  };
>  
>  #undef pr_fmt
>  #define pr_fmt(fmt)     "Spectre V2 : " fmt
>  
> -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
> +enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
> +EXPORT_SYMBOL_GPL(spectre_v2_enabled);
>  
>  #ifdef RETPOLINE
>  static bool spectre_v2_bad_module;
> @@ -237,6 +239,16 @@ static void __init spectre_v2_select_mitigation(void)
>  
>  	case SPECTRE_V2_CMD_FORCE:
>  	case SPECTRE_V2_CMD_AUTO:
> +		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
> +			u64 ia32_cap = 0;
> +
> +			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
> +			if (ia32_cap & ARCH_CAP_IBRS_ALL) {

Hmm. We already read the MSR in cpu/common.c to check for the RDCL_NO
bit. Shouldn't we just read it once and set a feature bit for IBRS_ALL?
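
A minimal sketch of that approach, purely for illustration (the
X86_FEATURE_IBRS_ALL name and the exact init hook are assumptions, not
code that was posted): read the MSR once in early CPU setup and turn
the bit into a synthetic feature flag:

	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 ia32_cap;

		/* Read IA32_ARCH_CAPABILITIES a single time here ... */
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);

		/* ... and record IBRS_ALL as a hypothetical synthetic cap */
		if (ia32_cap & ARCH_CAP_IBRS_ALL)
			set_cpu_cap(c, X86_FEATURE_IBRS_ALL);
	}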

> +				mode = SPECTRE_V2_IBRS_ALL;
> +				wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
> +				goto ibrs_all;
> +			}
> +		}
>  		if (IS_ENABLED(CONFIG_RETPOLINE))
>  			goto retpoline_auto;
>  		break;

> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 3dec126..5dfeb11 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3387,13 +3387,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  
>  		vmx->spec_ctrl = data;
>  
> -		if (!data)
> +		if (!data && !spectre_v2_ibrs_all())
>  			break;
>  
>  		/*
>  		 * For non-nested:
>  		 * When it's written (to non-zero) for the first time, pass
> -		 * it through.
> +		 * it through unless we have IBRS_ALL and it should just be
> +		 * set for ever.

A non-zero write is going to disable the intercept only when IBRS_ALL
is on. The comment says it should be set forever, i.e. not changeable by
the guest. So the condition should be:

		if (!data || spectre_v2_ibrs_all())
			break;
Hmm?

>  		 *
>  		 * For nested:
>  		 * The handling of the MSR bitmap for L2 guests is done in
> @@ -9451,7 +9452,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	 * is no need to worry about the conditional branch over the wrmsr
>  	 * being speculatively taken.
>  	 */
> -	if (vmx->spec_ctrl)
> +	if (!spectre_v2_ibrs_all() && vmx->spec_ctrl)
>  		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

Which matches the code here.

Thanks,

	tglx
David Woodhouse Feb. 20, 2018, 8:53 a.m. UTC | #2
On Tue, 2018-02-20 at 09:31 +0100, Thomas Gleixner wrote:
> > @@ -237,6 +239,16 @@ static void __init spectre_v2_select_mitigation(void)
> >  
> >  	case SPECTRE_V2_CMD_FORCE:
> >  	case SPECTRE_V2_CMD_AUTO:
> > +		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
> > +			u64 ia32_cap = 0;
> > +
> > +			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
> > +			if (ia32_cap & ARCH_CAP_IBRS_ALL) {
>
> Hmm. We already read the MSR in cpu/common.c to check for the RDCL_NO
> bit. Shouldn't we just read it once and set a feature bit for IBRS_ALL?

Yeah. See my comment to Dave where he was going to add a *third* place
that does the same for a different bit. I think we should probably just
turn these into cpufeature bits like the 'scattered' ones.
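
With such a bit in place, the hunk above would presumably collapse to
something like this (sketch only; X86_FEATURE_IBRS_ALL is a hypothetical
flag name, not something defined in the posted series):

	case SPECTRE_V2_CMD_FORCE:
	case SPECTRE_V2_CMD_AUTO:
		/* No MSR read here any more, just the synthetic cap */
		if (boot_cpu_has(X86_FEATURE_IBRS_ALL)) {
			mode = SPECTRE_V2_IBRS_ALL;
			wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
			goto ibrs_all;
		}
		if (IS_ENABLED(CONFIG_RETPOLINE))
			goto retpoline_auto;
		break;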

> > @@ -3387,13 +3387,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >  
> >  		vmx->spec_ctrl = data;
> >  
> > -		if (!data)
> > +		if (!data && !spectre_v2_ibrs_all())
> >  			break;
> >  
> >  		/*
> >  		 * For non-nested:
> >  		 * When it's written (to non-zero) for the first time, pass
> > -		 * it through.
> > +		 * it through unless we have IBRS_ALL and it should just be
> > +		 * set for ever.
>
> A non-zero write is going to disable the intercept only when IBRS_ALL
> is on. The comment says it should be set forever, i.e. not changeable by
> the guest. So the condition should be:
> 
> 		if (!data || spectre_v2_ibrs_all())
> 			break;
> Hmm?

Yes, good catch. Thanks.

However, Paolo is very insistent that taking the trap every time is
actually a lot *slower* than really frobbing IBRS on certain
microarchitectures, so my hand-waving "pfft, what did they expect?" is
not acceptable.

Which I think puts us back to the "throwing the toys out of the pram"
solution; demanding that Intel give us a new feature bit for "IBRS_ALL,
and the bit in the MSR is a no-op". Which was going to be true for
*most* new CPUs anyway. (Note: a blacklist for those few CPUs on which
it *isn't* true might also suffice instead of a new feature bit.)

Unless someone really wants to implement the atomic MSR save and
restore on vmexit, and make it work with nesting, and make the whole
thing sufficiently simple that we don't throw our toys out of the pram
anyway when we see it?
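
(For reference, the "restore" half of that already has machinery in
vmx.c: the VMCS MSR-load areas can make the CPU load a guest value on
VM-entry and a host value on VM-exit without any wrmsr in the exit path.
The sketch below is an assumption about how that helper looks in this
era's tree, not part of the posted patch; the genuinely missing piece is
the VM-exit MSR-*store* area, i.e. having the CPU save whatever the
guest wrote, plus making it all behave under nesting.)

	/*
	 * Hypothetical sketch only: let the CPU switch SPEC_CTRL
	 * atomically on VM-entry/VM-exit instead of trapping or doing
	 * rdmsr/wrmsr by hand.  Does not save the guest's value on exit.
	 */
	add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
			      vmx->spec_ctrl,	/* loaded on VM-entry */
			      0);		/* loaded on VM-exit  */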
Thomas Gleixner Feb. 20, 2018, 10:37 a.m. UTC | #3
On Tue, 20 Feb 2018, David Woodhouse wrote:
> On Tue, 2018-02-20 at 09:31 +0100, Thomas Gleixner wrote:
> > > @@ -3387,13 +3387,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > >  
> > >  		vmx->spec_ctrl = data;
> > >  
> > > -		if (!data)
> > > +		if (!data && !spectre_v2_ibrs_all())
> > >  			break;
> > >  
> > >  		/*
> > >  		 * For non-nested:
> > >  		 * When it's written (to non-zero) for the first time, pass
> > > -		 * it through.
> > > +		 * it through unless we have IBRS_ALL and it should just be
> > > +		 * set for ever.
> >
> > A non-zero write is going to disable the intercept only when IBRS_ALL
> > is on. The comment says it should be set forever, i.e. not changeable by
> > the guest. So the condition should be:
> > 
> > 		if (!data || spectre_v2_ibrs_all())
> > 			break;
> > Hmm?
> 
> Yes, good catch. Thanks.
> 
> However, Paolo is very insistent that taking the trap every time is
> actually a lot *slower* than really frobbing IBRS on certain
> microarchitectures, so my hand-waving "pfft, what did they expect?" is
> not acceptable.
> 
> Which I think puts us back to the "throwing the toys out of the pram"
> solution; demanding that Intel give us a new feature bit for "IBRS_ALL,
> and the bit in the MSR is a no-op". Which was going to be true for
> *most* new CPUs anyway. (Note: a blacklist for those few CPUs on which
> it *isn't* true might also suffice instead of a new feature bit.)
>
> Unless someone really wants to implement the atomic MSR save and
> restore on vmexit, and make it work with nesting, and make the whole
> thing sufficiently simple that we don't throw our toys out of the pram
> anyway when we see it?

That whole stuff was duct-taped into microcode in a rush and the result is
that we have only the choice between fire and frying pan. Whatever we
decide to implement is not going to be a half-baked hack.

I fully agree that Intel needs to get their act together and implement
IBRS_ALL sanely.

The right thing to do is to allow the host to lock down the MSR once it
enabled IBRS_ALL so that any write to it will just turn into NOOPs. That
removes all worries about guests and in future generations of CPUs this bit
might just be hardwired to one and the MSR just a dummy for compatibility
reasons.

Thanks,

	tglx
Thomas Gleixner Feb. 20, 2018, 10:42 a.m. UTC | #4
On Tue, 20 Feb 2018, Thomas Gleixner wrote:
> On Tue, 20 Feb 2018, David Woodhouse wrote:
> > On Tue, 2018-02-20 at 09:31 +0100, Thomas Gleixner wrote:
> > > > @@ -3387,13 +3387,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > > >  
> > > >  		vmx->spec_ctrl = data;
> > > >  
> > > > -		if (!data)
> > > > +		if (!data && !spectre_v2_ibrs_all())
> > > >  			break;
> > > >  
> > > >  		/*
> > > >  		 * For non-nested:
> > > >  		 * When it's written (to non-zero) for the first time, pass
> > > > -		 * it through.
> > > > +		 * it through unless we have IBRS_ALL and it should just be
> > > > +		 * set for ever.
> > >
> > > A non-zero write is going to disable the intercept only when IBRS_ALL
> > > is on. The comment says it should be set forever, i.e. not changeable by
> > > the guest. So the condition should be:
> > > 
> > > 		if (!data || spectre_v2_ibrs_all())
> > > 			break;
> > > Hmm?
> > 
> > Yes, good catch. Thanks.
> > 
> > However, Paolo is very insistent that taking the trap every time is
> > actually a lot *slower* than really frobbing IBRS on certain
> > microarchitectures, so my hand-waving "pfft, what did they expect?" is
> > not acceptable.
> > 
> > Which I think puts us back to the "throwing the toys out of the pram"

There are no more toys in the pram. I threw them all out weeks ago ...

> > solution; demanding that Intel give us a new feature bit for "IBRS_ALL,
> > and the bit in the MSR is a no-op". Which was going to be true for
> > *most* new CPUs anyway. (Note: a blacklist for those few CPUs on which
> > it *isn't* true might also suffice instead of a new feature bit.)
> >
> > Unless someone really wants to implement the atomic MSR save and
> > restore on vmexit, and make it work with nesting, and make the whole
> > thing sufficiently simple that we don't throw our toys out of the pram
> > anyway when we see it?
> 
> That whole stuff was duct-taped into microcode in a rush and the result is
> that we have only the choice between fire and frying pan. Whatever we
> decide to implement is not going to be a half-baked hack.

  s/not// of course

> 
> I fully agree that Intel needs to get their act together and implement
> IBRS_ALL sanely.
> 
> The right thing to do is to allow the host to lock down the MSR once it
> enabled IBRS_ALL so that any write to it will just turn into NOOPs. That
> removes all worries about guests and in future generations of CPUs this bit
> might just be hardwired to one and the MSR just a dummy for compatibility
> reasons.
> 
> Thanks,
> 
> 	tglx
>
David Woodhouse Feb. 20, 2018, 11:22 a.m. UTC | #5
On Tue, 2018-02-20 at 11:42 +0100, Thomas Gleixner wrote:
> 
> > > However, Paolo is very insistent that taking the trap every time is
> > > actually a lot *slower* than really frobbing IBRS on certain
> > > microarchitectures, so my hand-waving "pfft, what did they expect?" is
> > > not acceptable.
> > > 
> > > Which I think puts us back to the "throwing the toys out of the pram"
> 
> There are no more toys in the pram. I threw them all out weeks ago ...

One option is to take the patch as-is¹ with the trap on every access.
As soon as Intel define that 'IBRS_ALL_AND_THE_BIT_IS_A_NOOP' bit in
MSR_IA32_ARCH_CAPABILITIES, *then* we can expose it to guests directly
again just as we do at the moment.

That way, the slowdown that Paolo is concerned about is limited to a
small set of current CPUs on which we're mostly unlikely to care too
much about KVM anyway.



¹ as-is, except I really will go and turn the IA32_ARCH_CAPABILITIES 
  bits into scattered cpufeatures before I repost it.
Paolo Bonzini Feb. 20, 2018, 11:26 a.m. UTC | #6
On 19/02/2018 11:50, David Woodhouse wrote:
> Cope with this by trapping and emulating *all* access to SPEC_CTRL from
> KVM guests when the IBRS_ALL feature is present, so it can never be
> turned off. Guests who see IBRS_ALL should never do anything except
> turn it on at boot anyway. And if they didn't know about IBRS_ALL and
> they keep frobbing IBRS on every kernel entry/exit... well the vmexit
> for a no-op is probably going to be faster than they were expecting
> anyway, so they'll live.

The problem is, it isn't.  On a Haswell (which has fairly slow
SPEC_CTRL) toggling IBRS is 200 cycles.  This gives a context switch
time of around 2000 clock cycles with PTI enabled.

This is fairly awful, but with a vmexit cost of ~1100 cycles that goes
up to 2000+(1100-200)*2 = 3800.  That's more or less doubling the cost
of a system call.

With newer machines SPEC_CTRL cost goes down but vmexit cost doesn't, so
it's only worse.

For now, we really should do something like

	if (vmx->spec_ctrl != host_spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, host_spec_ctrl);
	else
		lfence();

which later can become

	if (vmx->spec_ctrl != host_spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, host_spec_ctrl);
	else {
		/* lfence not needed if host_spec_ctrl == 0 */
		if (static_cpu_has(BUG_REALLY_WANTS_IBRS))
			nospec_barrier();
	}

Paolo
Paolo Bonzini Feb. 20, 2018, 11:28 a.m. UTC | #7
On 20/02/2018 12:22, David Woodhouse wrote:
>>>> However, Paolo is very insistent that taking the trap every time is
>>>> actually a lot *slower* than really frobbing IBRS on certain
>>>> microarchitectures, so my hand-waving "pfft, what did they expect?" is
>>>> not acceptable.
>>>>  
>>>> Which I think puts us back to the "throwing the toys out of the pram"
>> There are no more toys in the pram. I threw them all out weeks ago ...
>
> One option is to take the patch as-is¹ with the trap on every access.

Please reword the commit message at least, mentioning that the slow case
is not one we care much about yet (no IBRS_ALL CPUs in the wild
afaik) and we won't care much about it in the long run either (IBRS_ALL
really only used on a handful of blacklisted processors).

Thanks,

Paolo

> As soon as Intel define that 'IBRS_ALL_AND_THE_BIT_IS_A_NOOP' bit in
> MSR_IA32_ARCH_CAPABILITIES, *then* we can expose it to guests directly
> again just as we do at the moment.
> 
> That way, the slowdown that Paolo is concerned about is limited to a
> small set of current CPUs on which we're mostly unlikely to care too
> much about KVM anyway.
Thomas Gleixner Feb. 26, 2018, 7:55 p.m. UTC | #8
On Tue, 20 Feb 2018, David Woodhouse wrote:
> On Tue, 2018-02-20 at 11:42 +0100, Thomas Gleixner wrote:
> > 
> > > > However, Paolo is very insistent that taking the trap every time is
> > > > actually a lot *slower* than really frobbing IBRS on certain
> > > > microarchitectures, so my hand-waving "pfft, what did they expect?" is
> > > > not acceptable.
> > > > 
> > > > Which I think puts us back to the "throwing the toys out of the pram"
> > 
> > There are no more toys in the pram. I threw them all out weeks ago ...
> 
> One option is to take the patch as-is¹ with the trap on every access.
> As soon as Intel define that 'IBRS_ALL_AND_THE_BIT_IS_A_NOOP' bit in
> MSR_IA32_ARCH_CAPABILITIES, *then* we can expose it to guests directly
> again just as we do at the moment.

Arjan, is there any update on this?

Thanks,

	tglx

Patch

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 0995c6a..34cbce3 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -141,9 +141,16 @@  enum spectre_v2_mitigation {
 	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
 	SPECTRE_V2_RETPOLINE_GENERIC,
 	SPECTRE_V2_RETPOLINE_AMD,
-	SPECTRE_V2_IBRS,
+	SPECTRE_V2_IBRS_ALL,
 };
 
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_ibrs_all(void)
+{
+	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
+}
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bfca937..505c467 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -88,12 +88,14 @@  static const char *spectre_v2_strings[] = {
 	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
 	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
 	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
+	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
 };
 
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
-static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+EXPORT_SYMBOL_GPL(spectre_v2_enabled);
 
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
@@ -237,6 +239,16 @@  static void __init spectre_v2_select_mitigation(void)
 
 	case SPECTRE_V2_CMD_FORCE:
 	case SPECTRE_V2_CMD_AUTO:
+		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+			u64 ia32_cap = 0;
+
+			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+			if (ia32_cap & ARCH_CAP_IBRS_ALL) {
+				mode = SPECTRE_V2_IBRS_ALL;
+				wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
+				goto ibrs_all;
+			}
+		}
 		if (IS_ENABLED(CONFIG_RETPOLINE))
 			goto retpoline_auto;
 		break;
@@ -274,6 +286,7 @@  static void __init spectre_v2_select_mitigation(void)
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
 	}
 
+ ibrs_all:
 	spectre_v2_enabled = mode;
 	pr_info("%s\n", spectre_v2_strings[mode]);
 
@@ -305,7 +318,7 @@  static void __init spectre_v2_select_mitigation(void)
 	 * Retpoline means the kernel is safe because it has no indirect
 	 * branches. But firmware isn't, so use IBRS to protect that.
 	 */
-	if (boot_cpu_has(X86_FEATURE_IBRS)) {
+	if (mode != SPECTRE_V2_IBRS_ALL && boot_cpu_has(X86_FEATURE_IBRS)) {
 		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
 		pr_info("Enabling Restricted Speculation for firmware calls\n");
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3dec126..5dfeb11 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3387,13 +3387,14 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		vmx->spec_ctrl = data;
 
-		if (!data)
+		if (!data && !spectre_v2_ibrs_all())
 			break;
 
 		/*
 		 * For non-nested:
 		 * When it's written (to non-zero) for the first time, pass
-		 * it through.
+		 * it through unless we have IBRS_ALL and it should just be
+		 * set for ever.
 		 *
 		 * For nested:
 		 * The handling of the MSR bitmap for L2 guests is done in
@@ -9451,7 +9452,7 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (vmx->spec_ctrl)
+	if (!spectre_v2_ibrs_all() && vmx->spec_ctrl)
 		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
@@ -9573,11 +9574,12 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	      );
 
 	/*
-	 * We do not use IBRS in the kernel. If this vCPU has used the
-	 * SPEC_CTRL MSR it may have left it on; save the value and
-	 * turn it off. This is much more efficient than blindly adding
-	 * it to the atomic save/restore list. Especially as the former
-	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+	 * Without IBRS_ALL, we do not use IBRS in the kernel. If this
+	 * vCPU has used the SPEC_CTRL MSR it may have left it on;
+	 * save the value and turn it off. This is much more efficient
+	 * than blindly adding it to the atomic save/restore list.
+	 * Especially as the former (saving guest MSRs on vmexit)
+	 * doesn't even exist in KVM.
 	 *
 	 * For non-nested case:
 	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
@@ -9586,12 +9588,17 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * For nested case:
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * save it.
+	 *
+	 * If IBRS_ALL is present then the whole thing is a no-op fiction
+	 * for guests and every access is trapped, so do nothing.
 	 */
-	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
-		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	if (!spectre_v2_ibrs_all()) {
+		if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+			rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
-	if (vmx->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		if (vmx->spec_ctrl)
+			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	}
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();