diff mbox

[2/2] x86/speculation: Support "Enhanced IBRS" on future CPUs

Message ID 1518449255-2182-2-git-send-email-dwmw@amazon.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Woodhouse, David Feb. 12, 2018, 3:27 p.m. UTC
The original IBRS hack in microcode is horribly slow. For the next
generation of CPUs, as a stopgap until we get a proper fix, Intel
promise an "Enhanced IBRS" which will be fast.

The assumption is that predictions in the BTB/RSB will be tagged with
the VMX mode and ring that they were learned in, and thus the CPU will
avoid consuming unsafe predictions without a performance penalty.

Intel's documentation says that it is still required to set the IBRS bit
in the SPEC_CTRL MSR and ensure that it remains set.

Cope with this by trapping and emulating *all* access to SPEC_CTRL from
KVM guests when the IBRS_ALL feature is present, so it can never be
turned off. Guests who see IBRS_ALL should never do anything except
turn it on at boot anyway. And if they didn't know about IBRS_ALL and
they keep frobbing IBRS on every kernel entry/exit... well the vmexit
for a no-op is probably going to be faster than they were expecting
anyway, so they'll live.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
---
 arch/x86/include/asm/nospec-branch.h |  9 ++++++++-
 arch/x86/kernel/cpu/bugs.c           | 16 ++++++++++++++--
 arch/x86/kvm/vmx.c                   | 17 ++++++++++-------
 3 files changed, 32 insertions(+), 10 deletions(-)

Comments

Ingo Molnar Feb. 13, 2018, 7:47 a.m. UTC | #1
* David Woodhouse <dwmw@amazon.co.uk> wrote:

> +extern enum spectre_v2_mitigation spectre_v2_enabled;

This needs to be exported if the KVM module wants to use it.

> +static inline bool spectre_v2_ibrs_all(void)
> +{
> +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> +}

> +     if (vmx->spec_ctrl && !spectre_v2_ibrs_all())

> +	if (!spectre_v2_ibrs_all) {

erm, that's a function, not a flag ...

Thanks,

	Ingo
Paolo Bonzini Feb. 13, 2018, 8:02 a.m. UTC | #2
On 12/02/2018 16:27, David Woodhouse wrote:
> The original IBRS hack in microcode is horribly slow. For the next
> generation of CPUs, as a stopgap until we get a proper fix, Intel
> promise an "Enhanced IBRS" which will be fast.
> 
> The assumption is that predictions in the BTB/RSB will be tagged with
> the VMX mode and ring that they were learned in, and thus the CPU will
> avoid consuming unsafe predictions without a performance penalty.
> 
> Intel's documentation says that it is still required to set the IBRS bit
> in the SPEC_CTRL MSR and ensure that it remains set.
> 
> Cope with this by trapping and emulating *all* access to SPEC_CTRL from
> KVM guests when the IBRS_ALL feature is present, so it can never be
> turned off. Guests who see IBRS_ALL should never do anything except
> turn it on at boot anyway. And if they didn't know about IBRS_ALL and
> they keep frobbing IBRS on every kernel entry/exit... well the vmexit
> for a no-op is probably going to be faster than they were expecting
> anyway, so they'll live.
> 
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> Acked-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
> ---
>  arch/x86/include/asm/nospec-branch.h |  9 ++++++++-
>  arch/x86/kernel/cpu/bugs.c           | 16 ++++++++++++++--
>  arch/x86/kvm/vmx.c                   | 17 ++++++++++-------
>  3 files changed, 32 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index 788c4da..524bb86 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -140,9 +140,16 @@ enum spectre_v2_mitigation {
>  	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
>  	SPECTRE_V2_RETPOLINE_GENERIC,
>  	SPECTRE_V2_RETPOLINE_AMD,
> -	SPECTRE_V2_IBRS,
> +	SPECTRE_V2_IBRS_ALL,
>  };
>  
> +extern enum spectre_v2_mitigation spectre_v2_enabled;
> +
> +static inline bool spectre_v2_ibrs_all(void)
> +{
> +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> +}
> +
>  extern char __indirect_thunk_start[];
>  extern char __indirect_thunk_end[];
>  
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index debcdda..047538a 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -88,12 +88,13 @@ static const char *spectre_v2_strings[] = {
>  	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
>  	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
>  	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
> +	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
>  };
>  
>  #undef pr_fmt
>  #define pr_fmt(fmt)     "Spectre V2 : " fmt
>  
> -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
> +enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
>  
>  #ifdef RETPOLINE
>  static bool spectre_v2_bad_module;
> @@ -237,6 +238,16 @@ static void __init spectre_v2_select_mitigation(void)
>  
>  	case SPECTRE_V2_CMD_FORCE:
>  	case SPECTRE_V2_CMD_AUTO:
> +		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
> +			u64 ia32_cap = 0;
> +
> +			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
> +			if (ia32_cap & ARCH_CAP_IBRS_ALL) {
> +				mode = SPECTRE_V2_IBRS_ALL;
> +				wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
> +				goto ibrs_all;
> +			}
> +		}
>  		if (IS_ENABLED(CONFIG_RETPOLINE))
>  			goto retpoline_auto;
>  		break;
> @@ -274,6 +285,7 @@ static void __init spectre_v2_select_mitigation(void)
>  		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
>  	}
>  
> + ibrs_all:
>  	spectre_v2_enabled = mode;
>  	pr_info("%s\n", spectre_v2_strings[mode]);
>  
> @@ -306,7 +318,7 @@ static void __init spectre_v2_select_mitigation(void)
>  	 * branches. But we don't know whether the firmware is safe, so
>  	 * use IBRS to protect against that:
>  	 */
> -	if (boot_cpu_has(X86_FEATURE_IBRS)) {
> +	if (mode != SPECTRE_V2_IBRS_ALL && boot_cpu_has(X86_FEATURE_IBRS)) {
>  		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
>  		pr_info("Spectre mitigation: Restricting branch speculation (enabling IBRS) for firmware calls\n");
>  	}
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 91e3539..d99ba9b 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3419,13 +3419,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  
>  		vmx->spec_ctrl = data;
>  
> -		if (!data)
> +		if (!data && !spectre_v2_ibrs_all())
>  			break;

This should check the value of IBRS_ALL in the VM, not in the host.

>  		/*
>  		 * For non-nested:
>  		 * When it's written (to non-zero) for the first time, pass
> -		 * it through.
> +		 * it through unless we have IBRS_ALL and it should just be
> +		 * set for ever.
>  		 *
>  		 * For nested:
>  		 * The handling of the MSR bitmap for L2 guests is done in
> @@ -9441,7 +9442,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	 * is no need to worry about the conditional branch over the wrmsr
>  	 * being speculatively taken.
>  	 */
> -	if (vmx->spec_ctrl)
> +	if (vmx->spec_ctrl && !spectre_v2_ibrs_all())
>  		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

Same here, and it should also be

	if (vmx->spec_ctrl != host_spec_ctrl())

where

static inline int host_spec_ctrl()
{
	return spectre_v2_enabled != SPECTRE_V2_NONE;
}

Likewise below.

Paolo


>  	vmx->__launched = vmx->loaded_vmcs->launched;
> @@ -9577,11 +9578,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
>  	 * save it.
>  	 */
> -	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
> -		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +	if (!spectre_v2_ibrs_all) {
> +		if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
> +			rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>  
> -	if (vmx->spec_ctrl)
> -		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> +		if (vmx->spec_ctrl)
> +			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> +	}
>  
>  	/* Eliminate branch target predictions from guest mode */
>  	vmexit_fill_RSB();
>
David Woodhouse Feb. 13, 2018, 8:12 a.m. UTC | #3
On Tue, 2018-02-13 at 08:47 +0100, Ingo Molnar wrote:
> * David Woodhouse <dwmw@amazon.co.uk> wrote:
> 
> > 
> > +extern enum spectre_v2_mitigation spectre_v2_enabled;
>
> This needs to be exported if the KVM module wants to use it.
> 
> > 
> > +static inline bool spectre_v2_ibrs_all(void)
> > +{
> > +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> > +}
> > 
> > +     if (vmx->spec_ctrl && !spectre_v2_ibrs_all())
> > 
> > +	if (!spectre_v2_ibrs_all) {
>
> erm, that's a function, not a flag ...

0-day pointed out the latter, which is already fixed in the git tree
ready for the next resend. Not the former though.

I can export it. It does make me ponder for a second whether I should
have gone with my first instinct just to make it another cpufeature
flag like the others. But no, the excuse for doing that for the others
was that it *needs* to be one for using alternatives. And this flag
*isn't* used for alternatives, so it seems a little (more) wrong.
David Woodhouse Feb. 13, 2018, 8:15 a.m. UTC | #4
On Tue, 2018-02-13 at 09:02 +0100, Paolo Bonzini wrote:
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -3419,13 +3419,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >  
> >  		vmx->spec_ctrl = data;
> >  
> > -		if (!data)
> > +		if (!data && !spectre_v2_ibrs_all())
> >  			break;
> This should check the value of IBRS_ALL in the VM, not in the host.

No, it's host we want. If IBRS_ALL is set in the host, we set the
actual hardware MSR once at boot time and never touch it again. The
SPEC_CTRL MSR we expose to guests is purely a no-op fiction.

If spectre_v2_ibrs_all() is true then KVM should *never* actually pass
through or touch the real MSR.
Paolo Bonzini Feb. 13, 2018, 9:58 a.m. UTC | #5
On 13/02/2018 09:15, David Woodhouse wrote:
>>>  
>>> -		if (!data)
>>> +		if (!data && !spectre_v2_ibrs_all())
>>>  			break;
>> This should check the value of IBRS_ALL in the VM, not in the host.
> No, it's host we want. If IBRS_ALL is set in the host, we set the
> actual hardware MSR once at boot time and never touch it again. The
> SPEC_CTRL MSR we expose to guests is purely a no-op fiction.
> 
> If spectre_v2_ibrs_all() is true then KVM should *never* actually pass
> through or touch the real MSR.

That would be nice but unfortunately it's not possible. :(

The VM might actually not have IBRS_ALL, as usual the reason is
migration compatibility.  In that case, that no-op fiction would be very
slow because the VM will actually do a lot of SPEC_CTRL writes.

So the right logic is:

- if the VM has IBRS_ALL, pass through the MSR when it is zero and
intercept writes when it is one (no writes should happen)

- if the VM doesn't have IBRS_ALL, do as we are doing now, independent
of what the host spectre_v2_ibrs_all() setting is.

Paolo
David Woodhouse Feb. 13, 2018, 10:21 a.m. UTC | #6
On Tue, 2018-02-13 at 10:58 +0100, Paolo Bonzini wrote:

> > If spectre_v2_ibrs_all() is true then KVM should *never* actually pass
> > through or touch the real MSR.
> 
> That would be nice but unfortunately it's not possible. :(
> 
> The VM might actually not have IBRS_ALL, as usual the reason is
> migration compatibility.  In that case, that no-op fiction would be very
> slow because the VM will actually do a lot of SPEC_CTRL writes.

If the VM *thinks* it's bashing on a real SPEC_CTRL register all the
time, and it's actually just trapping to a no-op, then it's actually
going to be a lot *faster* than the VM expects. We can live with that.

> So the right logic is:
> 
> - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> intercept writes when it is one (no writes should happen)
> 
> - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> of what the host spectre_v2_ibrs_all() setting is.

We end up having to turn IBRS on again on vmexit then, taking care that
no conditional branch can go round it. So that becomes an
*unconditional* wrmsr or lfence in the vmexit path. We really don't
want that.

If we choose to tell a guest that it doesn't have IBRS_ALL, or if the
guest doesn't use IBRS_ALL and does it the old way, it's OK that it's
trapped. It's still faster than they expected.
David Woodhouse Feb. 13, 2018, 10:36 a.m. UTC | #7
On Tue, 2018-02-13 at 10:21 +0000, David Woodhouse wrote:
> 
> > So the right logic is:
> > 
> > - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> > intercept writes when it is one (no writes should happen)
> > 
> > - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> > of what the host spectre_v2_ibrs_all() setting is.
> 
> We end up having to turn IBRS on again on vmexit then, taking care that
> no conditional branch can go round it. So that becomes an
> *unconditional* wrmsr or lfence in the vmexit path. We really don't
> want that.

Note that being able to keep it simple in KVM was basically what made
the difference between me tolerating IBRS_ALL as Intel currently define
it, and throwing my toys out of the pram (as I had done in the first
iterations of this patch).

If we *can't* keep it nice and simple in KVM, then I think we *really*
want Intel to make the hardware bit a no-op. If not for all CPUs with
the IBRS_ALL bit, then for all *future* CPUs and they can define a new
IBRS_ALL_UNCONDITIONAL bit. Which is the only one we'd ever care about.
Paolo Bonzini Feb. 13, 2018, 10:41 a.m. UTC | #8
On 13/02/2018 11:36, David Woodhouse wrote:
>>> - if the VM has IBRS_ALL, pass through the MSR when it is zero and
>>> intercept writes when it is one (no writes should happen)
>>>  
>>> - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
>>> of what the host spectre_v2_ibrs_all() setting is.
>> We end up having to turn IBRS on again on vmexit then, taking care that
>> no conditional branch can go round it. So that becomes an
>> *unconditional* wrmsr or lfence in the vmexit path. We really don't
>> want that.
>
> Note that being able to keep it simple in KVM was basically what made
> the difference between me tolerating IBRS_ALL as Intel currently define
> it, and throwing my toys out of the pram (as I had done in the first
> iterations of this patch).

You have my vote. :)  Really, IBRS_ALL makes no sense and it would be
nice to know _why_ Intel is pushing something that makes no sense.

Paolo
David Woodhouse Feb. 13, 2018, 10:53 a.m. UTC | #9
On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
> You have my vote. :)  Really, IBRS_ALL makes no sense and it would be
> nice to know _why_ Intel is pushing something that makes no sense.

No, IBRS_ALL *does* make sense. It's not a complete fix, but it's as
much of a fix as they should shoe-horn into the generation of CPUs
which are currently going to the fabs.

With IBRS_ALL they presumably add tags to the predictions with the VMX
mode and ring, to give complete protection against predictions being
used in a more privileged mode. That saves us from having to do
retpoline, or the horrid old IBRS-as-barrier crap. Or any of the other
as-yet-incomplete Skylake hacks.

But we *do* still need the IBPB on context/VM switch. Because they
*haven't* yet managed to tag with VMID/PCID and/or do appropriate
flushing (on VMPTRLD, CR3 load, etc.). That will have to wait until
hardware which is even further out. But it's a bone of contention that
they haven't even defined the bit which will *advertise* this future
behaviour.

As it stands though, as a stop-gap solution IBRS_ALL isn't completely
senseless.

It *is* a shame that they didn't make the IBRS bit in SPEC_CTRL a no-op 
on CPUs with IBRS_ALL, but there are apparently technical reasons for
that on a certain subset of CPUs.

Again, having relatively simple patches to KVM which to tolerate the
IBRS bit *not* being a no-op is what makes the difference between us
accepting that, and demanding a new 'IBRS_ALL_UNCONDITIONAL' bit for
the CPUs which *aren't* in that "certain subset".
Paolo Bonzini Feb. 13, 2018, 10:55 a.m. UTC | #10
On 13/02/2018 11:53, David Woodhouse wrote:
>> You have my vote. :)  Really, IBRS_ALL makes no sense and it would be
>> nice to know _why_ Intel is pushing something that makes no sense.
> No, IBRS_ALL *does* make sense. It's not a complete fix, but it's as
> much of a fix as they should shoe-horn into the generation of CPUs
> which are currently going to the fabs.
> 
> With IBRS_ALL they presumably add tags to the predictions with the VMX
> mode and ring, to give complete protection against predictions being
> used in a more privileged mode. 

Yeah, but I still don't get why they need an MSR to turn those tags on.
Is it basically a chicken bit in the wrong direction?

Paolo
Pavel Machek Feb. 15, 2018, 3:21 p.m. UTC | #11
On Tue 2018-02-13 09:02:25, Paolo Bonzini wrote:
> On 12/02/2018 16:27, David Woodhouse wrote:
> > The original IBRS hack in microcode is horribly slow. For the next
> > generation of CPUs, as a stopgap until we get a proper fix, Intel
> > promise an "Enhanced IBRS" which will be fast.
> > 
> > The assumption is that predictions in the BTB/RSB will be tagged with
> > the VMX mode and ring that they were learned in, and thus the CPU will
> > avoid consuming unsafe predictions without a performance penalty.
> > 
> > Intel's documentation says that it is still required to set the IBRS bit
> > in the SPEC_CTRL MSR and ensure that it remains set.
> > 
> > Cope with this by trapping and emulating *all* access to SPEC_CTRL from
> > KVM guests when the IBRS_ALL feature is present, so it can never be
> > turned off. Guests who see IBRS_ALL should never do anything except
> > turn it on at boot anyway. And if they didn't know about IBRS_ALL and
> > they keep frobbing IBRS on every kernel entry/exit... well the vmexit
> > for a no-op is probably going to be faster than they were expecting
> > anyway, so they'll live.
> > 
> > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> > Acked-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
> > ---
> >  arch/x86/include/asm/nospec-branch.h |  9 ++++++++-
> >  arch/x86/kernel/cpu/bugs.c           | 16 ++++++++++++++--
> >  arch/x86/kvm/vmx.c                   | 17 ++++++++++-------
> >  3 files changed, 32 insertions(+), 10 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > index 788c4da..524bb86 100644
> > --- a/arch/x86/include/asm/nospec-branch.h
> > +++ b/arch/x86/include/asm/nospec-branch.h
> > @@ -140,9 +140,16 @@ enum spectre_v2_mitigation {
> >  	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
> >  	SPECTRE_V2_RETPOLINE_GENERIC,
> >  	SPECTRE_V2_RETPOLINE_AMD,
> > -	SPECTRE_V2_IBRS,
> > +	SPECTRE_V2_IBRS_ALL,
> >  };
> >  
> > +extern enum spectre_v2_mitigation spectre_v2_enabled;
> > +
> > +static inline bool spectre_v2_ibrs_all(void)
> > +{
> > +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> > +}
> > +
> >  extern char __indirect_thunk_start[];
> >  extern char __indirect_thunk_end[];
> >  
> > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > index debcdda..047538a 100644
> > --- a/arch/x86/kernel/cpu/bugs.c
> > +++ b/arch/x86/kernel/cpu/bugs.c
> > @@ -88,12 +88,13 @@ static const char *spectre_v2_strings[] = {
> >  	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
> >  	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
> >  	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
> > +	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
> >  };

Hmm. Probably not just your problem but these should really get
documentation somewhere -- and adding another one should be treated
like changing the ABI.

How is poor userland expected to do anything inteligent with that
file?

									Pavel
David Woodhouse Feb. 16, 2018, 9:58 a.m. UTC | #12
On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:

> On 13/02/2018 11:36, David Woodhouse wrote:
> > > > - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> > > > intercept writes when it is one (no writes should happen)
> > > >  
> > > > - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> > > > of what the host spectre_v2_ibrs_all() setting is.
> > >
> > > We end up having to turn IBRS on again on vmexit then, taking care that
> > > no conditional branch can go round it. So that becomes an
> > > *unconditional* wrmsr or lfence in the vmexit path. We really don't
> > > want that.
> > >
> > Note that being able to keep it simple in KVM was basically what made
> > the difference between me tolerating IBRS_ALL as Intel currently define
> > it, and throwing my toys out of the pram (as I had done in the first
> > iterations of this patch).

> You have my vote. :) 

I was taking that as assent to the patch... could I trouble you for an
explicit ack, please?
Paolo Bonzini Feb. 16, 2018, 10:08 a.m. UTC | #13
On 16/02/2018 10:58, David Woodhouse wrote:
> On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
> 
>> On 13/02/2018 11:36, David Woodhouse wrote:
>>>>> - if the VM has IBRS_ALL, pass through the MSR when it is zero and
>>>>> intercept writes when it is one (no writes should happen)
>>>>>  
>>>>> - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
>>>>> of what the host spectre_v2_ibrs_all() setting is.
>>>>
>>>> We end up having to turn IBRS on again on vmexit then, taking care that
>>>> no conditional branch can go round it. So that becomes an
>>>> *unconditional* wrmsr or lfence in the vmexit path. We really don't
>>>> want that.
>>>>
>>> Note that being able to keep it simple in KVM was basically what made
>>> the difference between me tolerating IBRS_ALL as Intel currently define
>>> it, and throwing my toys out of the pram (as I had done in the first
>>> iterations of this patch).
>>  
>> You have my vote. :) 
> 
> I was taking that as assent to the patch... could I trouble you for an
> explicit ack, please?

No, it's a vote for throwing the toys out of the pram (or running away
with the ball, if you prefer).

Unfortunately, if you want to have a higher-performance mode for
IBRS_ALL that avoids rdmsr on vmexit, you have to do it as sketched above.

Alternatively, if IBRS_ALL is 1, and you don't care about migration
between machines that have IBRS_ALL and those that don't, the host can
simply not enable the SPEC_CTRL CPUID bit in the guest, and instead use
the AMD IBPB flag only.  Need to check if Windows obeys the AMD flag on
Intel machines (or in general) though.

Paolo
David Woodhouse Feb. 16, 2018, 10:21 a.m. UTC | #14
On Fri, 2018-02-16 at 11:08 +0100, Paolo Bonzini wrote:
> On 16/02/2018 10:58, David Woodhouse wrote:
> > 
> > On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
> > 
> > > 
> > > On 13/02/2018 11:36, David Woodhouse wrote:
> > > > 
> > > > > 
> > > > > > 
> > > > > > - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> > > > > > intercept writes when it is one (no writes should happen)
> > > > > >  
> > > > > > - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> > > > > > of what the host spectre_v2_ibrs_all() setting is.
> > > > > We end up having to turn IBRS on again on vmexit then, taking care that
> > > > > no conditional branch can go round it. So that becomes an
> > > > > *unconditional* wrmsr or lfence in the vmexit path. We really don't
> > > > > want that.
> > > > > 
> > > > Note that being able to keep it simple in KVM was basically what made
> > > > the difference between me tolerating IBRS_ALL as Intel currently define
> > > > it, and throwing my toys out of the pram (as I had done in the first
> > > > iterations of this patch).
> > >  
> > > You have my vote. :)
> > 
> > I was taking that as assent to the patch... could I trouble you for an
> > explicit ack, please?
>
> No, it's a vote for throwing the toys out of the pram (or running away
> with the ball, if you prefer).
> 
> Unfortunately, if you want to have a higher-performance mode for
> IBRS_ALL that avoids rdmsr on vmexit, you have to do it as sketched above.


Why? With IBRS_ALL the guest *never* gets to affect the actual hardware
MSR, which is always on. The MSR is purely an emulated no-op. Why does
that affect migration?

Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
(now emulated) MSR on every kernel entry/exit, that's *still* going to
be a metric shitload faster than what it *thought* it was doing.
Paolo Bonzini Feb. 16, 2018, 11:04 a.m. UTC | #15
On 16/02/2018 11:21, David Woodhouse wrote:
> Why? With IBRS_ALL the guest *never* gets to affect the actual hardware
> MSR, which is always on. The MSR is purely an emulated no-op. Why does
> that affect migration?

Because even if the host has IBRS_ALL, as long as you want to migrate to
a system without IBRS_ALL the guest will likely not have it.  You can
fake IBRS_ALL on the older system after migration, and forcing the guest
to always run with IBRS=1 even when in user mode; that is slow.  Or...

> Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
> (now emulated) MSR on every kernel entry/exit, that's *still* going to
> be a metric shitload faster than what it *thought* it was doing.

... you are making every kernel entry/exit 3 times slower by adding two
KVM exits (both hypervisor traps and syscalls are in the 1000-1500 clock
cycles ballpark).  That cannot be fast at all.

Paolo
David Woodhouse Feb. 16, 2018, 12:10 p.m. UTC | #16
On Fri, 2018-02-16 at 12:04 +0100, Paolo Bonzini wrote:
> On 16/02/2018 11:21, David Woodhouse wrote:
> > 
> > Why? With IBRS_ALL the guest *never* gets to affect the actual hardware
> > MSR, which is always on. The MSR is purely an emulated no-op. Why does
> > that affect migration?
> Because even if the host has IBRS_ALL, as long as you want to migrate to
> a system without IBRS_ALL the guest will likely not have it.  You can
> fake IBRS_ALL on the older system after migration, and forcing the guest
> to always run with IBRS=1 even when in user mode; that is slow.  Or...

No you can't; it's also a barrier. You can't just leave it on.

> > Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
> > (now emulated) MSR on every kernel entry/exit, that's *still* going to
> > be a metric shitload faster than what it *thought* it was doing.
>
> ... you are making every kernel entry/exit 3 times slower by adding two
> KVM exits (both hypervisor traps and syscalls are in the 1000-1500 clock
> cycles ballpark).  That cannot be fast at all.

I'm not convinced I care. It's still on a par with the performance of
*actually* frobbing IBRS every time. If the guest cares about
performance, they'll be using retpoline instead and not doing that.

We really don't want to penalise the *host*, and other guests, for one
guest which does stupid things.

And if we have a conditional 'set IBRS back on because we're using
IBRS_ALL and the guest had access to it' in the vmexit path, only the
*first* clause (IBRS_ALL) of that condition can be done with
alternatives. The other part is necessarily a runtime thing, and thus
needs the 'else lfence' to make sure it really happens, penalising
*all* guests. (Unless there's something else guaranteed to be
serialising in that path but after Arjan mentioned it last time I took
a quick look and didn't see anything unconditional).

An alternative would be to put the SPEC_CTRL MSR into the list of MSRs 
to be automatically saved/reset at vmexit, but we've already carefully
*changed* from doing that for the non-IBRS_ALL case because doing it
manually in the host is faster. I don't know that we want that
additional complexity — that might be the last straw that makes us tell
Intel "no, in that case we don't want IBRS_ALL as it is. Define a new
bit which is like IBRS_ALL but also promises that it's always like that
and the IBRS bit in the MSR is a no-op". That new bit would be set on
all future hardware except a small handful of current chips, I believe.

I think we can live with trapping and emulating SPEC_CTRL for the
guests which use it, for now.

If we *really* want to explore optimising it somehow, there's nothing
to prevent us doing that in a subsequent patch.
Jon Masters Feb. 19, 2018, 11:37 p.m. UTC | #17
On 02/16/2018 07:10 AM, David Woodhouse wrote:
> On Fri, 2018-02-16 at 12:04 +0100, Paolo Bonzini wrote:
>> On 16/02/2018 11:21, David Woodhouse wrote:

>>> Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
>>> (now emulated) MSR on every kernel entry/exit, that's *still* going to
>>> be a metric shitload faster than what it *thought* it was doing.

Is there any indication/log to the admin that VM doesn't know about
IBRS_ALL and is constantly uselessly writing to an emulated MSR?

While it's probably true that the overhead in time is similar to (or
better than) an actual IBRS MSR write, if the admin/user knows the VM
needs updating, then there's a fighting chance that they might do so.

Jon.
Van De Ven, Arjan Feb. 19, 2018, 11:42 p.m. UTC | #18
> 

> >>> Even if the guest doesn't have/support IBRS_ALL, and is frobbing the

> >>> (now emulated) MSR on every kernel entry/exit, that's *still* going to

> >>> be a metric shitload faster than what it *thought* it was doing.

> 

> Is there any indication/log to the admin that VM doesn't know about

> IBRS_ALL and is constantly uselessly writing to an emulated MSR?

> 

> While it's probably true that the overhead in time is similar to (or

> better than) an actual IBRS MSR write, if the admin/user knows the VM

> needs updating, then there's a fighting chance that they might do so.


the guest is not the problem; guests obviously will already honor if Enhanced IBRS is enumerated.
The problem is mixed migration pools where the hypervisor may need to decide to not pass this enumeration through to the guest.
Valdis Klētnieks Feb. 19, 2018, 11:53 p.m. UTC | #19
On Mon, 19 Feb 2018 23:42:24 +0000, "Van De Ven, Arjan" said:

> the guest is not the problem; guests obviously will already honor if Enhanced
> IBRS is enumerated. The problem is mixed migration pools where the hypervisor
> may need to decide to not pass this enumeration through to the guest.

For bonus points:  What should happen to a VM that is live migrated from one
hypervisor to another, and the hypervisors have different IBRS support?
Van De Ven, Arjan Feb. 20, 2018, midnight UTC | #20
> On Mon, 19 Feb 2018 23:42:24 +0000, "Van De Ven, Arjan" said:

> 

> > the guest is not the problem; guests obviously will already honor if Enhanced

> > IBRS is enumerated. The problem is mixed migration pools where the

> hypervisor

> > may need to decide to not pass this enumeration through to the guest.

> 

> For bonus points:  What should happen to a VM that is live migrated from one

> hypervisor to another, and the hypervisors have different IBRS support?


Doctor Doctor it hurts when I do this....

Migration tends to only work between HV's that are relatively homogeneous, that's nothing new... folks who run clouds or bigger pools know this obviously.
Alan Cox Feb. 20, 2018, 12:13 a.m. UTC | #21
On Tue, 20 Feb 2018 00:00:23 +0000
"Van De Ven, Arjan" <arjan.van.de.ven@intel.com> wrote:

> > On Mon, 19 Feb 2018 23:42:24 +0000, "Van De Ven, Arjan" said:
> >   
> > > the guest is not the problem; guests obviously will already honor if Enhanced
> > > IBRS is enumerated. The problem is mixed migration pools where the  
> > hypervisor  
> > > may need to decide to not pass this enumeration through to the guest.  
> > 
> > For bonus points:  What should happen to a VM that is live migrated from one
> > hypervisor to another, and the hypervisors have different IBRS support?  
> 
> Doctor Doctor it hurts when I do this....
> 
> Migration tends to only work between HV's that are relatively homogeneous, that's nothing new... folks who run clouds or bigger pools know this obviously.

In theory there's nothing stopping a guest getting a 'you are about to
gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
removed.

It's just that it's always been less painful to avoid the problems you
get with wild mismatches than attmept the cure. And any 'cure' goes
beyond the OS kernel into some of the managed runtimes so I'd agree with
Arjan 'don't do that'.

Alan
Linus Torvalds Feb. 20, 2018, 12:43 a.m. UTC | #22
On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
>
> In theory there's nothing stopping a guest getting a 'you are about to
> gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> removed.

I'm not convinced we handle the case of hotplug CPU's with different
CPU models correctly.

In fact, I'd be very surprised it it worked in the general case.

            Linus
Alan Cox Feb. 20, 2018, 1:03 a.m. UTC | #23
On Mon, 19 Feb 2018 16:43:50 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
> >
> > In theory there's nothing stopping a guest getting a 'you are about to
> > gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> > removed.  
> 
> I'm not convinced we handle the case of hotplug CPU's with different
> CPU models correctly.

We don't. And even if we did the user space doesn't. It would be a
painful task to resolve.

Alan
Van De Ven, Arjan Feb. 20, 2018, 1:08 a.m. UTC | #24
> > On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk>

> wrote:

> > >

> > > In theory there's nothing stopping a guest getting a 'you are about to

> > > gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one

> > > removed.

> >

> > I'm not convinced we handle the case of hotplug CPU's with different

> > CPU models correctly.

> 

> We don't. And even if we did the user space doesn't. It would be a

> painful task to resolve.



in a cloud world it ends up being the launch/restart of the VM in reality.
Just the JIT caches all over in userspace make this case not pretty.

And we as kernel don't even really do mixed cpus without hotplug afaict
(other than maybe taking lowest subset, while the case here would be the opposite direction)
Thomas Gleixner Feb. 20, 2018, 8:52 a.m. UTC | #25
On Mon, 19 Feb 2018, Linus Torvalds wrote:
> On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
> >
> > In theory there's nothing stopping a guest getting a 'you are about to
> > gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> > removed.
> 
> I'm not convinced we handle the case of hotplug CPU's with different
> CPU models correctly.
> 
> In fact, I'd be very surprised it it worked in the general case.

We pretend to work with different CPU models mostly by having per cpu
feature bits, but in practice this would fall apart in bits and pieces all
over the place. 

Thanks,

	tglx
Paolo Bonzini Feb. 20, 2018, 11:43 a.m. UTC | #26
On 20/02/2018 01:00, Van De Ven, Arjan wrote:
>>> the guest is not the problem; guests obviously will already honor
>>> if Enhanced IBRS is enumerated. The problem is mixed migration
>>> pools where the hypervisor
>>> may need to decide to not pass this enumeration through to the
>>> guest.
>> For bonus points:  What should happen to a VM that is live migrated
>> from one hypervisor to another, and the hypervisors have different
>> IBRS support?
>
> Doctor Doctor it hurts when I do this....
> 
> Migration tends to only work between HV's that are relatively
> homogeneous, that's nothing new...

No Arjan, this is just wrong.  Well, I suppose it's right in the present
tense with the IBRS mess on Skylake, but it's _not_ been true until last
year.

Paolo
Van De Ven, Arjan Feb. 20, 2018, 2:08 p.m. UTC | #27
> >> For bonus points:  What should happen to a VM that is live migrated

> >> from one hypervisor to another, and the hypervisors have different

> >> IBRS support?

> >

> > Doctor Doctor it hurts when I do this....

> >

> > Migration tends to only work between HV's that are relatively

> > homogeneous, that's nothing new...

> 

> No Arjan, this is just wrong.  Well, I suppose it's right in the present

> tense with the IBRS mess on Skylake, but it's _not_ been true until last

> year.



I meant software wise. You're not going to live migrate from xen to kvm or backwards. or between very radically different versions of the kvm stack.
Paolo Bonzini Feb. 20, 2018, 2:46 p.m. UTC | #28
On 20/02/2018 15:08, Van De Ven, Arjan wrote:
>>>> For bonus points:  What should happen to a VM that is live migrated
>>>> from one hypervisor to another, and the hypervisors have different
>>>> IBRS support?
>>>
>>> Doctor Doctor it hurts when I do this....
>>>
>>> Migration tends to only work between HV's that are relatively
>>> homogeneous, that's nothing new...
>>
>> No Arjan, this is just wrong.  Well, I suppose it's right in the present
>> tense with the IBRS mess on Skylake, but it's _not_ been true until last
>> year.
> 
> I meant software wise. You're not going to live migrate from xen to
> kvm or backwards. or between very radically different versions of the
> kvm stack.

Forwards migration to a radically newer version certainly happens.  So
when the source hypervisor was too old to tell the VM about IBRS_ALL,
for example, migration should work properly and the VM should perform
well on the destination hypervisor.

Backwards migration to older hypervisors also happens sometimes, but in
general it creates more userspace than kernel issues.

Paolo
Van De Ven, Arjan Feb. 20, 2018, 2:59 p.m. UTC | #29
> > I meant software wise. You're not going to live migrate from xen to

> > kvm or backwards. or between very radically different versions of the

> > kvm stack.

> 

> Forwards migration to a radically newer version certainly happens.  So

> when the source hypervisor was too old to tell the VM about IBRS_ALL,

> for example, migration should work properly and the VM should perform

> well on the destination hypervisor.


that makes sense, compatibility in this direction can be done
and is useful as you move a fleet of servers forward

> 

> Backwards migration to older hypervisors also happens sometimes, but in

> general it creates more userspace than kernel issues.

> 


that direction is obviously harder
Paolo Bonzini Feb. 20, 2018, 3:09 p.m. UTC | #30
On 20/02/2018 15:59, Van De Ven, Arjan wrote:
> 
>>> I meant software wise. You're not going to live migrate from xen to
>>> kvm or backwards. or between very radically different versions of the
>>> kvm stack.
>>
>> Forwards migration to a radically newer version certainly happens.  So
>> when the source hypervisor was too old to tell the VM about IBRS_ALL,
>> for example, migration should work properly and the VM should perform
>> well on the destination hypervisor.
> 
> that makes sense, compatibility in this direction can be done
> and is useful as you move a fleet of servers forward
> 
>> Backwards migration to older hypervisors also happens sometimes, but in
>> general it creates more userspace than kernel issues.
> 
> that direction is obviously harder

From KVM's point of view it's mostly a matter of having a complete ioctl
API right.  I'm not that much worried by it.

Paolo
Konrad Rzeszutek Wilk Feb. 23, 2018, 6:12 p.m. UTC | #31
On Tue, Feb 20, 2018 at 03:46:57PM +0100, Paolo Bonzini wrote:
> On 20/02/2018 15:08, Van De Ven, Arjan wrote:
> >>>> For bonus points:  What should happen to a VM that is live migrated
> >>>> from one hypervisor to another, and the hypervisors have different
> >>>> IBRS support?
> >>>
> >>> Doctor Doctor it hurts when I do this....
> >>>
> >>> Migration tends to only work between HV's that are relatively
> >>> homogeneous, that's nothing new...
> >>
> >> No Arjan, this is just wrong.  Well, I suppose it's right in the present
> >> tense with the IBRS mess on Skylake, but it's _not_ been true until last
> >> year.
> > 
> > I meant software wise. You're not going to live migrate from xen to
> > kvm or backwards. or between very radically different versions of the
> > kvm stack.
> 
> Forwards migration to a radically newer version certainly happens.  So
> when the source hypervisor was too old to tell the VM about IBRS_ALL,
> for example, migration should work properly and the VM should perform
> well on the destination hypervisor.

To add a bit more to this, Intel just updated their IA32_ARCH_CAPABILITIES_MSR
to have a new bit to sample to figure out whether you need IBRS or not
during runtime.

See https://software.intel.com/sites/default/files/managed/1d/46/Retpoline-A-Branch-Target-Injection-Mitigation.pdf
in 5.3
Virtual Machine CPU Identification:

"To remedy this situation, an operating system running as a VM can query bit 2 of the IA32_ARCH_CAPABILITIES MSR, known as “RSB Alternate” (RSBA). When RSBA is set, it indicates that the VM may run on a processor vulnerable to exploits of Empty RSB conditions regardless of the processor’s DisplayFamily/DisplayModel signature, and that the operating system should deploy appropriate mitigations. Virtual machine managers (VMM) may set RSBA via MSR interception to indicate that a virtual machine might run at some time in the future on a vulnerable processor."

New bit.. but not mentioned in the:

336996-Speculative-Execution-Side-Channel-Mitigations.pdf

Paolo, is there some form of callback inside of the guest when KVM guests are migrated?
(It exists under Xen, but I don't see it under KVM?)
> 
> Backwards migration to older hypervisors also happens sometimes, but in
> general it creates more userspace than kernel issues.
> 
> Paolo
Van De Ven, Arjan Feb. 23, 2018, 6:18 p.m. UTC | #32
> To add a bit more to this, Intel just updated their

> IA32_ARCH_CAPABILITIES_MSR

> to have a new bit to sample to figure out whether you need IBRS or not

> during runtime.


actually we updated the document when you need RSB stuffing.
based on the request of various folks here on LKML.
diff mbox

Patch

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 788c4da..524bb86 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -140,9 +140,16 @@  enum spectre_v2_mitigation {
 	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
 	SPECTRE_V2_RETPOLINE_GENERIC,
 	SPECTRE_V2_RETPOLINE_AMD,
-	SPECTRE_V2_IBRS,
+	SPECTRE_V2_IBRS_ALL,
 };
 
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_ibrs_all(void)
+{
+	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
+}
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index debcdda..047538a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -88,12 +88,13 @@  static const char *spectre_v2_strings[] = {
 	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
 	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
 	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
+	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
 };
 
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
-static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
@@ -237,6 +238,16 @@  static void __init spectre_v2_select_mitigation(void)
 
 	case SPECTRE_V2_CMD_FORCE:
 	case SPECTRE_V2_CMD_AUTO:
+		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+			u64 ia32_cap = 0;
+
+			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+			if (ia32_cap & ARCH_CAP_IBRS_ALL) {
+				mode = SPECTRE_V2_IBRS_ALL;
+				wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
+				goto ibrs_all;
+			}
+		}
 		if (IS_ENABLED(CONFIG_RETPOLINE))
 			goto retpoline_auto;
 		break;
@@ -274,6 +285,7 @@  static void __init spectre_v2_select_mitigation(void)
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
 	}
 
+ ibrs_all:
 	spectre_v2_enabled = mode;
 	pr_info("%s\n", spectre_v2_strings[mode]);
 
@@ -306,7 +318,7 @@  static void __init spectre_v2_select_mitigation(void)
 	 * branches. But we don't know whether the firmware is safe, so
 	 * use IBRS to protect against that:
 	 */
-	if (boot_cpu_has(X86_FEATURE_IBRS)) {
+	if (mode != SPECTRE_V2_IBRS_ALL && boot_cpu_has(X86_FEATURE_IBRS)) {
 		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
 		pr_info("Spectre mitigation: Restricting branch speculation (enabling IBRS) for firmware calls\n");
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 91e3539..d99ba9b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3419,13 +3419,14 @@  static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		vmx->spec_ctrl = data;
 
-		if (!data)
+		if (!data && !spectre_v2_ibrs_all())
 			break;
 
 		/*
 		 * For non-nested:
 		 * When it's written (to non-zero) for the first time, pass
-		 * it through.
+		 * it through unless we have IBRS_ALL and it should just be
+		 * set for ever.
 		 *
 		 * For nested:
 		 * The handling of the MSR bitmap for L2 guests is done in
@@ -9441,7 +9442,7 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (vmx->spec_ctrl)
+	if (vmx->spec_ctrl && !spectre_v2_ibrs_all())
 		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
@@ -9577,11 +9578,13 @@  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * save it.
 	 */
-	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
-		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	if (!spectre_v2_ibrs_all) {
+		if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+			rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
-	if (vmx->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		if (vmx->spec_ctrl)
+			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	}
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();