Message ID: 1518449255-2182-2-git-send-email-dwmw@amazon.co.uk (mailing list archive)
State: New, archived
* David Woodhouse <dwmw@amazon.co.uk> wrote:

> +extern enum spectre_v2_mitigation spectre_v2_enabled;

This needs to be exported if the KVM module wants to use it.

> +static inline bool spectre_v2_ibrs_all(void)
> +{
> +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> +}

> +	if (vmx->spec_ctrl && !spectre_v2_ibrs_all())

> +	if (!spectre_v2_ibrs_all) {

erm, that's a function, not a flag ...

Thanks,

	Ingo
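For reference, the two changes Ingo is pointing at would look roughly like this (a sketch only; where exactly the export lands is an assumption, not necessarily what the final series does):

    /* arch/x86/kernel/cpu/bugs.c: make the mitigation mode usable from kvm.ko */
    enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
    EXPORT_SYMBOL_GPL(spectre_v2_enabled);

    /* arch/x86/kvm/vmx.c: call the helper instead of testing its address */
    if (!spectre_v2_ibrs_all()) {
            if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
                    rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

            if (vmx->spec_ctrl)
                    wrmsrl(MSR_IA32_SPEC_CTRL, 0);
    }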
On 12/02/2018 16:27, David Woodhouse wrote:
> The original IBRS hack in microcode is horribly slow. For the next
> generation of CPUs, as a stopgap until we get a proper fix, Intel
> promise an "Enhanced IBRS" which will be fast.
>
> The assumption is that predictions in the BTB/RSB will be tagged with
> the VMX mode and ring that they were learned in, and thus the CPU will
> avoid consuming unsafe predictions without a performance penalty.
>
> Intel's documentation says that it is still required to set the IBRS bit
> in the SPEC_CTRL MSR and ensure that it remains set.
>
> Cope with this by trapping and emulating *all* access to SPEC_CTRL from
> KVM guests when the IBRS_ALL feature is present, so it can never be
> turned off. Guests who see IBRS_ALL should never do anything except
> turn it on at boot anyway. And if they didn't know about IBRS_ALL and
> they keep frobbing IBRS on every kernel entry/exit... well the vmexit
> for a no-op is probably going to be faster than they were expecting
> anyway, so they'll live.
>
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> Acked-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
> ---
>  arch/x86/include/asm/nospec-branch.h |  9 ++++++++-
>  arch/x86/kernel/cpu/bugs.c           | 16 ++++++++++++++--
>  arch/x86/kvm/vmx.c                   | 17 ++++++++++-------
>  3 files changed, 32 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index 788c4da..524bb86 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -140,9 +140,16 @@ enum spectre_v2_mitigation {
>  	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
>  	SPECTRE_V2_RETPOLINE_GENERIC,
>  	SPECTRE_V2_RETPOLINE_AMD,
> -	SPECTRE_V2_IBRS,
> +	SPECTRE_V2_IBRS_ALL,
>  };
>
> +extern enum spectre_v2_mitigation spectre_v2_enabled;
> +
> +static inline bool spectre_v2_ibrs_all(void)
> +{
> +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> +}
> +
>  extern char __indirect_thunk_start[];
>  extern char __indirect_thunk_end[];
>
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index debcdda..047538a 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -88,12 +88,13 @@ static const char *spectre_v2_strings[] = {
>  	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
>  	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
>  	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
> +	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
>  };
>
>  #undef pr_fmt
>  #define pr_fmt(fmt)     "Spectre V2 : " fmt
>
> -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
> +enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
>
>  #ifdef RETPOLINE
>  static bool spectre_v2_bad_module;
> @@ -237,6 +238,16 @@ static void __init spectre_v2_select_mitigation(void)
>
>  	case SPECTRE_V2_CMD_FORCE:
>  	case SPECTRE_V2_CMD_AUTO:
> +		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
> +			u64 ia32_cap = 0;
> +
> +			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
> +			if (ia32_cap & ARCH_CAP_IBRS_ALL) {
> +				mode = SPECTRE_V2_IBRS_ALL;
> +				wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
> +				goto ibrs_all;
> +			}
> +		}
>  		if (IS_ENABLED(CONFIG_RETPOLINE))
>  			goto retpoline_auto;
>  		break;
> @@ -274,6 +285,7 @@ static void __init spectre_v2_select_mitigation(void)
>  		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
>  	}
>
> + ibrs_all:
>  	spectre_v2_enabled = mode;
>  	pr_info("%s\n", spectre_v2_strings[mode]);
>
> @@ -306,7 +318,7 @@ static void __init spectre_v2_select_mitigation(void)
>  	 * branches. But we don't know whether the firmware is safe, so
>  	 * use IBRS to protect against that:
>  	 */
> -	if (boot_cpu_has(X86_FEATURE_IBRS)) {
> +	if (mode != SPECTRE_V2_IBRS_ALL && boot_cpu_has(X86_FEATURE_IBRS)) {
>  		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
>  		pr_info("Spectre mitigation: Restricting branch speculation (enabling IBRS) for firmware calls\n");
>  	}
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 91e3539..d99ba9b 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3419,13 +3419,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>
>  		vmx->spec_ctrl = data;
>
> -		if (!data)
> +		if (!data && !spectre_v2_ibrs_all())
>  			break;

This should check the value of IBRS_ALL in the VM, not in the host.

>  		/*
>  		 * For non-nested:
>  		 * When it's written (to non-zero) for the first time, pass
> -		 * it through.
> +		 * it through unless we have IBRS_ALL and it should just be
> +		 * set for ever.
>  		 *
>  		 * For nested:
>  		 * The handling of the MSR bitmap for L2 guests is done in
> @@ -9441,7 +9442,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	 * is no need to worry about the conditional branch over the wrmsr
>  	 * being speculatively taken.
>  	 */
> -	if (vmx->spec_ctrl)
> +	if (vmx->spec_ctrl && !spectre_v2_ibrs_all())
>  		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

Same here, and it should also be

	if (vmx->spec_ctrl != host_spec_ctrl())

where

	static inline int host_spec_ctrl()
	{
		return spectre_v2_enabled != SPECTRE_V2_NONE;
	}

Likewise below.

Paolo

>  	vmx->__launched = vmx->loaded_vmcs->launched;
> @@ -9577,11 +9578,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
>  	 * save it.
>  	 */
> -	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
> -		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
> +	if (!spectre_v2_ibrs_all) {
> +		if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
> +			rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>
> -	if (vmx->spec_ctrl)
> -		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> +		if (vmx->spec_ctrl)
> +			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
> +	}
>
>  	/* Eliminate branch target predictions from guest mode */
>  	vmexit_fill_RSB();
On Tue, 2018-02-13 at 08:47 +0100, Ingo Molnar wrote:
> * David Woodhouse <dwmw@amazon.co.uk> wrote:
> >
> > +extern enum spectre_v2_mitigation spectre_v2_enabled;
>
> This needs to be exported if the KVM module wants to use it.
>
> >
> > +static inline bool spectre_v2_ibrs_all(void)
> > +{
> > +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> > +}
> >
> > +	if (vmx->spec_ctrl && !spectre_v2_ibrs_all())
> >
> > +	if (!spectre_v2_ibrs_all) {
>
> erm, that's a function, not a flag ...

0-day pointed out the latter, which is already fixed in the git tree
ready for the next resend. Not the former though. I can export it.

It does make me ponder for a second whether I should have gone with my
first instinct just to make it another cpufeature flag like the others.

But no, the excuse for doing that for the others was that it *needs* to
be one for using alternatives. And this flag *isn't* used for
alternatives, so it seems a little (more) wrong.
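The cpufeature-flag alternative David weighs up above would have looked roughly like this; X86_FEATURE_IBRS_ALL is a hypothetical synthetic bit used only for the sketch:

    /* Hypothetical alternative: a synthetic CPU feature bit instead of the enum. */
    setup_force_cpu_cap(X86_FEATURE_IBRS_ALL);      /* at mitigation-select time */

    /* ... and at the use sites, with no export needed: */
    if (!boot_cpu_has(X86_FEATURE_IBRS_ALL))
            wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);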
On Tue, 2018-02-13 at 09:02 +0100, Paolo Bonzini wrote:
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -3419,13 +3419,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >
> >  		vmx->spec_ctrl = data;
> >
> > -		if (!data)
> > +		if (!data && !spectre_v2_ibrs_all())
> >  			break;
>
> This should check the value of IBRS_ALL in the VM, not in the host.

No, it's host we want. If IBRS_ALL is set in the host, we set the
actual hardware MSR once at boot time and never touch it again. The
SPEC_CTRL MSR we expose to guests is purely a no-op fiction.

If spectre_v2_ibrs_all() is true then KVM should *never* actually pass
through or touch the real MSR.
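What "purely a no-op fiction" means in code terms is roughly the following sketch of the write side; the helper name and placement are made up for illustration, not taken from the patch:

    /*
     * Sketch of the SPEC_CTRL write path when the host runs with IBRS_ALL.
     * Returns true if the write has been fully handled as an emulated no-op.
     */
    static bool spec_ctrl_write_emulated(struct vcpu_vmx *vmx, u64 data)
    {
            vmx->spec_ctrl = data;          /* remember what the guest wrote */

            /*
             * Never disable interception and never touch the hardware MSR:
             * the host set SPEC_CTRL_IBRS once at boot and it stays set.
             */
            return spectre_v2_ibrs_all();
    }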
On 13/02/2018 09:15, David Woodhouse wrote:
>>>
>>> -		if (!data)
>>> +		if (!data && !spectre_v2_ibrs_all())
>>>  			break;
>> This should check the value of IBRS_ALL in the VM, not in the host.
> No, it's host we want. If IBRS_ALL is set in the host, we set the
> actual hardware MSR once at boot time and never touch it again. The
> SPEC_CTRL MSR we expose to guests is purely a no-op fiction.
>
> If spectre_v2_ibrs_all() is true then KVM should *never* actually pass
> through or touch the real MSR.

That would be nice but unfortunately it's not possible. :(

The VM might actually not have IBRS_ALL, as usual the reason is
migration compatibility. In that case, that no-op fiction would be very
slow because the VM will actually do a lot of SPEC_CTRL writes.

So the right logic is:

- if the VM has IBRS_ALL, pass through the MSR when it is zero and
intercept writes when it is one (no writes should happen)

- if the VM doesn't have IBRS_ALL, do as we are doing now, independent
of what the host spectre_v2_ibrs_all() setting is.

Paolo
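In code, the per-guest policy described above would look roughly like this inside vmx_set_msr()'s MSR_IA32_SPEC_CTRL case; guest_cpuid_has_ibrs_all() is a made-up predicate for "the guest's virtual ARCH_CAPABILITIES advertises IBRS_ALL", and the pass-through step is whatever the existing patch already uses:

    /* Sketch of the policy above, not the merged code. */
    vmx->spec_ctrl = data;

    if (guest_cpuid_has_ibrs_all(vcpu)) {
            /* Guest sees IBRS_ALL: writes of 1 should be rare, keep trapping them. */
            if (data)
                    break;
            /* data == 0: fall through and pass the MSR through. */
    } else {
            /* No IBRS_ALL in the guest: exactly the current behaviour. */
            if (!data)
                    break;
            /* first non-zero write: fall through and pass the MSR through. */
    }
    /* ... disable interception here, as the existing code below this hunk does ... */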
On Tue, 2018-02-13 at 10:58 +0100, Paolo Bonzini wrote:
> > If spectre_v2_ibrs_all() is true then KVM should *never* actually pass
> > through or touch the real MSR.
>
> That would be nice but unfortunately it's not possible. :(
>
> The VM might actually not have IBRS_ALL, as usual the reason is
> migration compatibility. In that case, that no-op fiction would be very
> slow because the VM will actually do a lot of SPEC_CTRL writes.

If the VM *thinks* it's bashing on a real SPEC_CTRL register all the
time, and it's actually just trapping to a no-op, then it's actually
going to be a lot *faster* than the VM expects. We can live with that.

> So the right logic is:
>
> - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> intercept writes when it is one (no writes should happen)
>
> - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> of what the host spectre_v2_ibrs_all() setting is.

We end up having to turn IBRS on again on vmexit then, taking care that
no conditional branch can go round it. So that becomes an
*unconditional* wrmsr or lfence in the vmexit path. We really don't
want that.

If we choose to tell a guest that it doesn't have IBRS_ALL, or if the
guest doesn't use IBRS_ALL and does it the old way, it's OK that it's
trapped. It's still faster than they expected.
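Spelled out, the vmexit-path cost being objected to is something like this sketch (assumptions: the host wants IBRS=1, the guest was allowed to write 0, and rmb() is used here purely as shorthand for an lfence):

    /*
     * Sketch of what the scheme above needs on every vmexit: IBRS has to be
     * set again before any indirect branch, and because a conditional branch
     * around the write can itself be speculated past, it degenerates into
     * either an unconditional wrmsr ...
     */
    wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);

    /* ... or a conditional write that must be backed by a serialising lfence: */
    if (vmx->spec_ctrl != SPEC_CTRL_IBRS)
            wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
    else
            rmb();          /* lfence, so speculation cannot simply skip the branch */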
On Tue, 2018-02-13 at 10:21 +0000, David Woodhouse wrote:
>
> > So the right logic is:
> >
> > - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> > intercept writes when it is one (no writes should happen)
> >
> > - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> > of what the host spectre_v2_ibrs_all() setting is.
>
> We end up having to turn IBRS on again on vmexit then, taking care that
> no conditional branch can go round it. So that becomes an
> *unconditional* wrmsr or lfence in the vmexit path. We really don't
> want that.

Note that being able to keep it simple in KVM was basically what made
the difference between me tolerating IBRS_ALL as Intel currently define
it, and throwing my toys out of the pram (as I had done in the first
iterations of this patch).

If we *can't* keep it nice and simple in KVM, then I think we *really*
want Intel to make the hardware bit a no-op. If not for all CPUs with
the IBRS_ALL bit, then for all *future* CPUs, and they can define a new
IBRS_ALL_UNCONDITIONAL bit. Which is the only one we'd ever care about.
On 13/02/2018 11:36, David Woodhouse wrote:
>>> - if the VM has IBRS_ALL, pass through the MSR when it is zero and
>>> intercept writes when it is one (no writes should happen)
>>>
>>> - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
>>> of what the host spectre_v2_ibrs_all() setting is.
>> We end up having to turn IBRS on again on vmexit then, taking care that
>> no conditional branch can go round it. So that becomes an
>> *unconditional* wrmsr or lfence in the vmexit path. We really don't
>> want that.
>
> Note that being able to keep it simple in KVM was basically what made
> the difference between me tolerating IBRS_ALL as Intel currently define
> it, and throwing my toys out of the pram (as I had done in the first
> iterations of this patch).

You have my vote. :) Really, IBRS_ALL makes no sense and it would be
nice to know _why_ Intel is pushing something that makes no sense.

Paolo
On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
> You have my vote. :) Really, IBRS_ALL makes no sense and it would be
> nice to know _why_ Intel is pushing something that makes no sense.

No, IBRS_ALL *does* make sense. It's not a complete fix, but it's as
much of a fix as they could shoe-horn into the generation of CPUs which
are currently going to the fabs.

With IBRS_ALL they presumably add tags to the predictions with the VMX
mode and ring, to give complete protection against predictions being
used in a more privileged mode. That saves us from having to do
retpoline, or the horrid old IBRS-as-barrier crap. Or any of the other
as-yet-incomplete Skylake hacks.

But we *do* still need the IBPB on context/VM switch. Because they
*haven't* yet managed to tag with VMID/PCID and/or do appropriate
flushing (on VMPTRLD, CR3 load, etc.). That will have to wait until
hardware which is even further out. But it's a bone of contention that
they haven't even defined the bit which will *advertise* this future
behaviour.

As it stands though, as a stop-gap solution IBRS_ALL isn't completely
senseless. It *is* a shame that they didn't make the IBRS bit in
SPEC_CTRL a no-op on CPUs with IBRS_ALL, but there are apparently
technical reasons for that on a certain subset of CPUs.

Again, having relatively simple patches to KVM which tolerate the IBRS
bit *not* being a no-op is what makes the difference between us
accepting that, and demanding a new 'IBRS_ALL_UNCONDITIONAL' bit for
the CPUs which *aren't* in that "certain subset".
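For reference, the barrier that is still needed on context/VM switch is just a write of the IBPB bit to the prediction-command MSR; the kernel wraps it in a helper, but the operation itself is:

    /* Issue an indirect branch prediction barrier, e.g. on context/VM switch. */
    wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);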
On 13/02/2018 11:53, David Woodhouse wrote:
>> You have my vote. :) Really, IBRS_ALL makes no sense and it would be
>> nice to know _why_ Intel is pushing something that makes no sense.
> No, IBRS_ALL *does* make sense. It's not a complete fix, but it's as
> much of a fix as they could shoe-horn into the generation of CPUs which
> are currently going to the fabs.
>
> With IBRS_ALL they presumably add tags to the predictions with the VMX
> mode and ring, to give complete protection against predictions being
> used in a more privileged mode.

Yeah, but I still don't get why they need an MSR to turn those tags on.
Is it basically a chicken bit in the wrong direction?

Paolo
On Tue 2018-02-13 09:02:25, Paolo Bonzini wrote:
> On 12/02/2018 16:27, David Woodhouse wrote:
> > The original IBRS hack in microcode is horribly slow. For the next
> > generation of CPUs, as a stopgap until we get a proper fix, Intel
> > promise an "Enhanced IBRS" which will be fast.
> >
> > The assumption is that predictions in the BTB/RSB will be tagged with
> > the VMX mode and ring that they were learned in, and thus the CPU will
> > avoid consuming unsafe predictions without a performance penalty.
> >
> > Intel's documentation says that it is still required to set the IBRS bit
> > in the SPEC_CTRL MSR and ensure that it remains set.
> >
> > Cope with this by trapping and emulating *all* access to SPEC_CTRL from
> > KVM guests when the IBRS_ALL feature is present, so it can never be
> > turned off. Guests who see IBRS_ALL should never do anything except
> > turn it on at boot anyway. And if they didn't know about IBRS_ALL and
> > they keep frobbing IBRS on every kernel entry/exit... well the vmexit
> > for a no-op is probably going to be faster than they were expecting
> > anyway, so they'll live.
> >
> > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> > Acked-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
> > ---
> >  arch/x86/include/asm/nospec-branch.h |  9 ++++++++-
> >  arch/x86/kernel/cpu/bugs.c           | 16 ++++++++++++++--
> >  arch/x86/kvm/vmx.c                   | 17 ++++++++++-------
> >  3 files changed, 32 insertions(+), 10 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> > index 788c4da..524bb86 100644
> > --- a/arch/x86/include/asm/nospec-branch.h
> > +++ b/arch/x86/include/asm/nospec-branch.h
> > @@ -140,9 +140,16 @@ enum spectre_v2_mitigation {
> >  	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
> >  	SPECTRE_V2_RETPOLINE_GENERIC,
> >  	SPECTRE_V2_RETPOLINE_AMD,
> > -	SPECTRE_V2_IBRS,
> > +	SPECTRE_V2_IBRS_ALL,
> >  };
> >
> > +extern enum spectre_v2_mitigation spectre_v2_enabled;
> > +
> > +static inline bool spectre_v2_ibrs_all(void)
> > +{
> > +	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
> > +}
> > +
> >  extern char __indirect_thunk_start[];
> >  extern char __indirect_thunk_end[];
> >
> > diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> > index debcdda..047538a 100644
> > --- a/arch/x86/kernel/cpu/bugs.c
> > +++ b/arch/x86/kernel/cpu/bugs.c
> > @@ -88,12 +88,13 @@ static const char *spectre_v2_strings[] = {
> >  	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
> >  	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
> >  	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
> > +	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
> >  };

Hmm. Probably not just your problem but these should really get
documentation somewhere -- and adding another one should be treated
like changing the ABI.

How is poor userland expected to do anything intelligent with that
file?

Pavel
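Pavel's point, concretely: a tool that wants to act on this file has to pattern-match a free-form, undocumented string. A minimal userspace sketch:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char buf[128] = "";
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spectre_v2", "r");

            if (!f)
                    return 1;
            if (!fgets(buf, sizeof(buf), f)) {
                    fclose(f);
                    return 1;
            }
            fclose(f);

            /* Every new string, e.g. "Mitigation: Enhanced IBRS", has to be known here. */
            if (!strncmp(buf, "Mitigation:", 11))
                    printf("mitigated: %s", buf);
            else if (!strncmp(buf, "Vulnerable", 10))
                    printf("vulnerable: %s", buf);
            else
                    printf("no idea: %s", buf);
            return 0;
    }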
On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
> On 13/02/2018 11:36, David Woodhouse wrote:
> > > > - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> > > > intercept writes when it is one (no writes should happen)
> > > >
> > > > - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> > > > of what the host spectre_v2_ibrs_all() setting is.
> > >
> > > We end up having to turn IBRS on again on vmexit then, taking care that
> > > no conditional branch can go round it. So that becomes an
> > > *unconditional* wrmsr or lfence in the vmexit path. We really don't
> > > want that.
> >
> > Note that being able to keep it simple in KVM was basically what made
> > the difference between me tolerating IBRS_ALL as Intel currently define
> > it, and throwing my toys out of the pram (as I had done in the first
> > iterations of this patch).
>
> You have my vote. :)

I was taking that as assent to the patch... could I trouble you for an
explicit ack, please?
On 16/02/2018 10:58, David Woodhouse wrote:
> On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
>
>> On 13/02/2018 11:36, David Woodhouse wrote:
>>>>> - if the VM has IBRS_ALL, pass through the MSR when it is zero and
>>>>> intercept writes when it is one (no writes should happen)
>>>>>
>>>>> - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
>>>>> of what the host spectre_v2_ibrs_all() setting is.
>>>>
>>>> We end up having to turn IBRS on again on vmexit then, taking care that
>>>> no conditional branch can go round it. So that becomes an
>>>> *unconditional* wrmsr or lfence in the vmexit path. We really don't
>>>> want that.
>>>>
>>> Note that being able to keep it simple in KVM was basically what made
>>> the difference between me tolerating IBRS_ALL as Intel currently define
>>> it, and throwing my toys out of the pram (as I had done in the first
>>> iterations of this patch).
>>
>> You have my vote. :)
>
> I was taking that as assent to the patch... could I trouble you for an
> explicit ack, please?

No, it's a vote for throwing the toys out of the pram (or running away
with the ball, if you prefer).

Unfortunately, if you want to have a higher-performance mode for
IBRS_ALL that avoids rdmsr on vmexit, you have to do it as sketched
above.

Alternatively, if IBRS_ALL is 1, and you don't care about migration
between machines that have IBRS_ALL and those that don't, the host can
simply not enable the SPEC_CTRL CPUID bit in the guest, and instead use
the AMD IBPB flag only. Need to check if Windows obeys the AMD flag on
Intel machines (or in general) though.

Paolo
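A sketch of that alternative: hide the Intel SPEC_CTRL enumeration from the guest and expose only the AMD-style IBPB bit. The bit positions are from the public CPUID documentation; the helper and variable names are made up for the sketch, since the actual masking is done by the VMM/userspace when it builds the guest CPUID:

    /* Hypothetical masking of the CPUID bits handed to the guest. */
    static void expose_ibpb_only(u32 *cpuid_7_0_edx, u32 *cpuid_8000_0008_ebx)
    {
            *cpuid_7_0_edx &= ~(1u << 26);          /* hide IBRS/IBPB ("spec_ctrl") */
            *cpuid_7_0_edx &= ~(1u << 27);          /* hide STIBP */
            *cpuid_8000_0008_ebx |= 1u << 12;       /* advertise AMD IBPB only */
    }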
On Fri, 2018-02-16 at 11:08 +0100, Paolo Bonzini wrote:
> On 16/02/2018 10:58, David Woodhouse wrote:
> >
> > On Tue, 2018-02-13 at 11:41 +0100, Paolo Bonzini wrote:
> > >
> > > > On 13/02/2018 11:36, David Woodhouse wrote:
> > > > >
> > > > > > - if the VM has IBRS_ALL, pass through the MSR when it is zero and
> > > > > > intercept writes when it is one (no writes should happen)
> > > > > >
> > > > > > - if the VM doesn't have IBRS_ALL, do as we are doing now, independent
> > > > > > of what the host spectre_v2_ibrs_all() setting is.
> > > > > We end up having to turn IBRS on again on vmexit then, taking care that
> > > > > no conditional branch can go round it. So that becomes an
> > > > > *unconditional* wrmsr or lfence in the vmexit path. We really don't
> > > > > want that.
> > > > >
> > > > Note that being able to keep it simple in KVM was basically what made
> > > > the difference between me tolerating IBRS_ALL as Intel currently define
> > > > it, and throwing my toys out of the pram (as I had done in the first
> > > > iterations of this patch).
> > >
> > > You have my vote. :)
> >
> > I was taking that as assent to the patch... could I trouble you for an
> > explicit ack, please?
>
> No, it's a vote for throwing the toys out of the pram (or running away
> with the ball, if you prefer).
>
> Unfortunately, if you want to have a higher-performance mode for
> IBRS_ALL that avoids rdmsr on vmexit, you have to do it as sketched above.

Why? With IBRS_ALL the guest *never* gets to affect the actual hardware
MSR, which is always on. The MSR is purely an emulated no-op. Why does
that affect migration?

Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
(now emulated) MSR on every kernel entry/exit, that's *still* going to
be a metric shitload faster than what it *thought* it was doing.
On 16/02/2018 11:21, David Woodhouse wrote:
> Why? With IBRS_ALL the guest *never* gets to affect the actual hardware
> MSR, which is always on. The MSR is purely an emulated no-op. Why does
> that affect migration?

Because even if the host has IBRS_ALL, as long as you want to migrate to
a system without IBRS_ALL the guest will likely not have it. You can
fake IBRS_ALL on the older system after migration, and forcing the guest
to always run with IBRS=1 even when in user mode; that is slow. Or...

> Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
> (now emulated) MSR on every kernel entry/exit, that's *still* going to
> be a metric shitload faster than what it *thought* it was doing.

... you are making every kernel entry/exit 3 times slower by adding two
KVM exits (both hypervisor traps and syscalls are in the 1000-1500 clock
cycles ballpark). That cannot be fast at all.

Paolo
On Fri, 2018-02-16 at 12:04 +0100, Paolo Bonzini wrote:
> On 16/02/2018 11:21, David Woodhouse wrote:
> >
> > Why? With IBRS_ALL the guest *never* gets to affect the actual hardware
> > MSR, which is always on. The MSR is purely an emulated no-op. Why does
> > that affect migration?
> Because even if the host has IBRS_ALL, as long as you want to migrate to
> a system without IBRS_ALL the guest will likely not have it. You can
> fake IBRS_ALL on the older system after migration, and forcing the guest
> to always run with IBRS=1 even when in user mode; that is slow. Or...

No you can't; it's also a barrier. You can't just leave it on.

> > Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
> > (now emulated) MSR on every kernel entry/exit, that's *still* going to
> > be a metric shitload faster than what it *thought* it was doing.
>
> ... you are making every kernel entry/exit 3 times slower by adding two
> KVM exits (both hypervisor traps and syscalls are in the 1000-1500 clock
> cycles ballpark). That cannot be fast at all.

I'm not convinced I care. It's still on a par with the performance of
*actually* frobbing IBRS every time. If the guest cares about
performance, they'll be using retpoline instead and not doing that. We
really don't want to penalise the *host*, and other guests, for one
guest which does stupid things.

And if we have a conditional 'set IBRS back on because we're using
IBRS_ALL and the guest had access to it' in the vmexit path, only the
*first* clause (IBRS_ALL) of that condition can be done with
alternatives. The other part is necessarily a runtime thing, and thus
needs the 'else lfence' to make sure it really happens, penalising
*all* guests. (Unless there's something else guaranteed to be
serialising in that path but after Arjan mentioned it last time I took
a quick look and didn't see anything unconditional).

An alternative would be to put the SPEC_CTRL MSR into the list of MSRs
to be automatically saved/reset at vmexit, but we've already carefully
*changed* from doing that for the non-IBRS_ALL case because doing it
manually in the host is faster. I don't know that we want that
additional complexity — that might be the last straw that makes us tell
Intel "no, in that case we don't want IBRS_ALL as it is. Define a new
bit which is like IBRS_ALL but also promises that it's always like that
and the IBRS bit in the MSR is a no-op". That new bit would be set on
all future hardware except a small handful of current chips, I believe.

I think we can live with trapping and emulating SPEC_CTRL for the
guests which use it, for now. If we *really* want to explore optimising
it somehow, there's nothing to prevent us doing that in a subsequent
patch.
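The "automatically saved/reset at vmexit" alternative mentioned above would use the VMX MSR autoload/autostore lists, roughly as below; vmx.c has a helper for this in this timeframe, but treat the exact call and signature as an assumption. As the mail notes, this was already measured to be slower than doing the save/restore by hand, which is why the series avoids it:

    /* Sketch: let the CPU itself switch SPEC_CTRL on vmentry/vmexit. */
    add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
                          vmx->spec_ctrl,   /* guest value loaded on vmentry */
                          SPEC_CTRL_IBRS);  /* host value restored on vmexit */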
On 02/16/2018 07:10 AM, David Woodhouse wrote:
> On Fri, 2018-02-16 at 12:04 +0100, Paolo Bonzini wrote:
>> On 16/02/2018 11:21, David Woodhouse wrote:
>>> Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
>>> (now emulated) MSR on every kernel entry/exit, that's *still* going to
>>> be a metric shitload faster than what it *thought* it was doing.

Is there any indication/log to the admin that VM doesn't know about
IBRS_ALL and is constantly uselessly writing to an emulated MSR?

While it's probably true that the overhead in time is similar to (or
better than) an actual IBRS MSR write, if the admin/user knows the VM
needs updating, then there's a fighting chance that they might do so.

Jon.
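One way such an indication could be provided, purely as a sketch (nothing like this is in the patch; spec_ctrl_warned would be a new, hypothetical per-vCPU field):

    /* Hypothetical: log once per vCPU that the guest is writing an emulated MSR. */
    if (spectre_v2_ibrs_all() && !vmx->spec_ctrl_warned) {
            pr_info_ratelimited("kvm: vcpu%d writes SPEC_CTRL, emulated as a no-op (host has IBRS_ALL)\n",
                                vcpu->vcpu_id);
            vmx->spec_ctrl_warned = true;
    }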
> >>> Even if the guest doesn't have/support IBRS_ALL, and is frobbing the
> >>> (now emulated) MSR on every kernel entry/exit, that's *still* going to
> >>> be a metric shitload faster than what it *thought* it was doing.
>
> Is there any indication/log to the admin that VM doesn't know about
> IBRS_ALL and is constantly uselessly writing to an emulated MSR?
>
> While it's probably true that the overhead in time is similar to (or
> better than) an actual IBRS MSR write, if the admin/user knows the VM
> needs updating, then there's a fighting chance that they might do so.

the guest is not the problem; guests obviously will already honor if
Enhanced IBRS is enumerated. The problem is mixed migration pools where
the hypervisor may need to decide to not pass this enumeration through
to the guest.
On Mon, 19 Feb 2018 23:42:24 +0000, "Van De Ven, Arjan" said:

> the guest is not the problem; guests obviously will already honor if Enhanced
> IBRS is enumerated. The problem is mixed migration pools where the hypervisor
> may need to decide to not pass this enumeration through to the guest.

For bonus points: What should happen to a VM that is live migrated from
one hypervisor to another, and the hypervisors have different IBRS
support?
> On Mon, 19 Feb 2018 23:42:24 +0000, "Van De Ven, Arjan" said:
>
> > the guest is not the problem; guests obviously will already honor if Enhanced
> > IBRS is enumerated. The problem is mixed migration pools where the hypervisor
> > may need to decide to not pass this enumeration through to the guest.
>
> For bonus points: What should happen to a VM that is live migrated from one
> hypervisor to another, and the hypervisors have different IBRS support?

Doctor Doctor it hurts when I do this....

Migration tends to only work between HV's that are relatively
homogeneous, that's nothing new... folks who run clouds or bigger pools
know this obviously.
On Tue, 20 Feb 2018 00:00:23 +0000
"Van De Ven, Arjan" <arjan.van.de.ven@intel.com> wrote:

> > On Mon, 19 Feb 2018 23:42:24 +0000, "Van De Ven, Arjan" said:
> >
> > > the guest is not the problem; guests obviously will already honor if Enhanced
> > > IBRS is enumerated. The problem is mixed migration pools where the hypervisor
> > > may need to decide to not pass this enumeration through to the guest.
> >
> > For bonus points: What should happen to a VM that is live migrated from one
> > hypervisor to another, and the hypervisors have different IBRS support?
>
> Doctor Doctor it hurts when I do this....
>
> Migration tends to only work between HV's that are relatively homogeneous,
> that's nothing new... folks who run clouds or bigger pools know this obviously.

In theory there's nothing stopping a guest getting a 'you are about to
gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
removed. It's just that it's always been less painful to avoid the
problems you get with wild mismatches than attempt the cure. And any
'cure' goes beyond the OS kernel into some of the managed runtimes so
I'd agree with Arjan: 'don't do that'.

Alan
On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
>
> In theory there's nothing stopping a guest getting a 'you are about to
> gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> removed.

I'm not convinced we handle the case of hotplug CPU's with different
CPU models correctly.

In fact, I'd be very surprised if it worked in the general case.

Linus
On Mon, 19 Feb 2018 16:43:50 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
> >
> > In theory there's nothing stopping a guest getting a 'you are about to
> > gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> > removed.
>
> I'm not convinced we handle the case of hotplug CPU's with different
> CPU models correctly.

We don't. And even if we did, the user space doesn't. It would be a
painful task to resolve.

Alan
> > On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
> > >
> > > In theory there's nothing stopping a guest getting a 'you are about to
> > > gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> > > removed.
> >
> > I'm not convinced we handle the case of hotplug CPU's with different
> > CPU models correctly.
>
> We don't. And even if we did, the user space doesn't. It would be a
> painful task to resolve.

in a cloud world it ends up being the launch/restart of the VM in
reality. Just the JIT caches all over in userspace make this case not
pretty. And we as kernel don't even really do mixed cpus without
hotplug afaict (other than maybe taking the lowest subset, while the
case here would be the opposite direction)
On Mon, 19 Feb 2018, Linus Torvalds wrote:
> On Mon, Feb 19, 2018 at 4:13 PM, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
> >
> > In theory there's nothing stopping a guest getting a 'you are about to
> > gain/lose IBRS' message or having a new 'CPU' hotplugged and the old one
> > removed.
>
> I'm not convinced we handle the case of hotplug CPU's with different
> CPU models correctly.
>
> In fact, I'd be very surprised if it worked in the general case.

We pretend to work with different CPU models mostly by having per cpu
feature bits, but in practice this would fall apart in bits and pieces
all over the place.

Thanks,

	tglx
On 20/02/2018 01:00, Van De Ven, Arjan wrote:
>>> the guest is not the problem; guests obviously will already honor
>>> if Enhanced IBRS is enumerated. The problem is mixed migration
>>> pools where the hypervisor may need to decide to not pass this
>>> enumeration through to the guest.
>> For bonus points: What should happen to a VM that is live migrated
>> from one hypervisor to another, and the hypervisors have different
>> IBRS support?
>
> Doctor Doctor it hurts when I do this....
>
> Migration tends to only work between HV's that are relatively
> homogeneous, that's nothing new...

No Arjan, this is just wrong. Well, I suppose it's right in the present
tense with the IBRS mess on Skylake, but it's _not_ been true until last
year.

Paolo
> >> For bonus points: What should happen to a VM that is live migrated
> >> from one hypervisor to another, and the hypervisors have different
> >> IBRS support?
> >
> > Doctor Doctor it hurts when I do this....
> >
> > Migration tends to only work between HV's that are relatively
> > homogeneous, that's nothing new...
>
> No Arjan, this is just wrong. Well, I suppose it's right in the present
> tense with the IBRS mess on Skylake, but it's _not_ been true until last
> year.

I meant software wise. You're not going to live migrate from xen to kvm
or backwards. or between very radically different versions of the kvm
stack.
On 20/02/2018 15:08, Van De Ven, Arjan wrote:
>>>> For bonus points: What should happen to a VM that is live migrated
>>>> from one hypervisor to another, and the hypervisors have different
>>>> IBRS support?
>>>
>>> Doctor Doctor it hurts when I do this....
>>>
>>> Migration tends to only work between HV's that are relatively
>>> homogeneous, that's nothing new...
>>
>> No Arjan, this is just wrong. Well, I suppose it's right in the present
>> tense with the IBRS mess on Skylake, but it's _not_ been true until last
>> year.
>
> I meant software wise. You're not going to live migrate from xen to
> kvm or backwards. or between very radically different versions of the
> kvm stack.

Forwards migration to a radically newer version certainly happens. So
when the source hypervisor was too old to tell the VM about IBRS_ALL,
for example, migration should work properly and the VM should perform
well on the destination hypervisor.

Backwards migration to older hypervisors also happens sometimes, but in
general it creates more userspace than kernel issues.

Paolo
> > I meant software wise. You're not going to live migrate from xen to
> > kvm or backwards. or between very radically different versions of the
> > kvm stack.
>
> Forwards migration to a radically newer version certainly happens. So
> when the source hypervisor was too old to tell the VM about IBRS_ALL,
> for example, migration should work properly and the VM should perform
> well on the destination hypervisor.

that makes sense, compatibility in this direction can be done and is
useful as you move a fleet of servers forward

> Backwards migration to older hypervisors also happens sometimes, but in
> general it creates more userspace than kernel issues.

that direction is obviously harder
On 20/02/2018 15:59, Van De Ven, Arjan wrote:
>
>>> I meant software wise. You're not going to live migrate from xen to
>>> kvm or backwards. or between very radically different versions of the
>>> kvm stack.
>>
>> Forwards migration to a radically newer version certainly happens. So
>> when the source hypervisor was too old to tell the VM about IBRS_ALL,
>> for example, migration should work properly and the VM should perform
>> well on the destination hypervisor.
>
> that makes sense, compatibility in this direction can be done
> and is useful as you move a fleet of servers forward
>
>> Backwards migration to older hypervisors also happens sometimes, but in
>> general it creates more userspace than kernel issues.
>
> that direction is obviously harder

From KVM's point of view it's mostly a matter of having a complete ioctl
API right. I'm not that much worried by it.

Paolo
On Tue, Feb 20, 2018 at 03:46:57PM +0100, Paolo Bonzini wrote:
> On 20/02/2018 15:08, Van De Ven, Arjan wrote:
> >>>> For bonus points: What should happen to a VM that is live migrated
> >>>> from one hypervisor to another, and the hypervisors have different
> >>>> IBRS support?
> >>>
> >>> Doctor Doctor it hurts when I do this....
> >>>
> >>> Migration tends to only work between HV's that are relatively
> >>> homogeneous, that's nothing new...
> >>
> >> No Arjan, this is just wrong. Well, I suppose it's right in the present
> >> tense with the IBRS mess on Skylake, but it's _not_ been true until last
> >> year.
> >
> > I meant software wise. You're not going to live migrate from xen to
> > kvm or backwards. or between very radically different versions of the
> > kvm stack.
>
> Forwards migration to a radically newer version certainly happens. So
> when the source hypervisor was too old to tell the VM about IBRS_ALL,
> for example, migration should work properly and the VM should perform
> well on the destination hypervisor.

To add a bit more to this, Intel just updated their
IA32_ARCH_CAPABILITIES MSR to have a new bit to sample to figure out
whether you need IBRS or not during runtime.

See
https://software.intel.com/sites/default/files/managed/1d/46/Retpoline-A-Branch-Target-Injection-Mitigation.pdf
in 5.3 Virtual Machine CPU Identification:

"To remedy this situation, an operating system running as a VM can
query bit 2 of the IA32_ARCH_CAPABILITIES MSR, known as “RSB Alternate”
(RSBA). When RSBA is set, it indicates that the VM may run on a
processor vulnerable to exploits of Empty RSB conditions regardless of
the processor’s DisplayFamily/DisplayModel signature, and that the
operating system should deploy appropriate mitigations. Virtual machine
managers (VMM) may set RSBA via MSR interception to indicate that a
virtual machine might run at some time in the future on a vulnerable
processor."

New bit.. but not mentioned in the
336996-Speculative-Execution-Side-Channel-Mitigations.pdf

Paolo, is there some form of callback inside of the guest when KVM
guests are migrated? (It exists under Xen, but I don't see it under
KVM?)

> Backwards migration to older hypervisors also happens sometimes, but in
> general it creates more userspace than kernel issues.
>
> Paolo
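For reference, checking the RSBA bit quoted above would look roughly like this in a guest kernel; ARCH_CAP_RSBA may not be defined in the tree yet (bit 2 is taken from the linked PDF), and the existing RSB-on-context-switch feature bit is used here only as an example mitigation to force on:

    #define ARCH_CAP_RSBA   (1 << 2)        /* "RSB Alternate", per the Intel doc */

    u64 ia32_cap = 0;

    if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
            rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);

    if (ia32_cap & ARCH_CAP_RSBA) {
            /* may be migrated onto a part vulnerable to empty-RSB attacks */
            setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
    }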
> To add a bit more to this, Intel just updated their
> IA32_ARCH_CAPABILITIES MSR to have a new bit to sample to figure out
> whether you need IBRS or not during runtime.

actually we updated the document on when you need RSB stuffing, based
on the request of various folks here on LKML.
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 788c4da..524bb86 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -140,9 +140,16 @@ enum spectre_v2_mitigation {
 	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
 	SPECTRE_V2_RETPOLINE_GENERIC,
 	SPECTRE_V2_RETPOLINE_AMD,
-	SPECTRE_V2_IBRS,
+	SPECTRE_V2_IBRS_ALL,
 };
 
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_ibrs_all(void)
+{
+	return spectre_v2_enabled == SPECTRE_V2_IBRS_ALL;
+}
+
 extern char __indirect_thunk_start[];
 extern char __indirect_thunk_end[];
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index debcdda..047538a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -88,12 +88,13 @@ static const char *spectre_v2_strings[] = {
 	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
 	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
 	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
+	[SPECTRE_V2_IBRS_ALL]			= "Mitigation: Enhanced IBRS",
 };
 
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
-static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
 
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
@@ -237,6 +238,16 @@ static void __init spectre_v2_select_mitigation(void)
 
 	case SPECTRE_V2_CMD_FORCE:
 	case SPECTRE_V2_CMD_AUTO:
+		if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+			u64 ia32_cap = 0;
+
+			rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+			if (ia32_cap & ARCH_CAP_IBRS_ALL) {
+				mode = SPECTRE_V2_IBRS_ALL;
+				wrmsrl(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS);
+				goto ibrs_all;
+			}
+		}
 		if (IS_ENABLED(CONFIG_RETPOLINE))
 			goto retpoline_auto;
 		break;
@@ -274,6 +285,7 @@ static void __init spectre_v2_select_mitigation(void)
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
 	}
 
+ ibrs_all:
 	spectre_v2_enabled = mode;
 	pr_info("%s\n", spectre_v2_strings[mode]);
 
@@ -306,7 +318,7 @@ static void __init spectre_v2_select_mitigation(void)
 	 * branches. But we don't know whether the firmware is safe, so
 	 * use IBRS to protect against that:
 	 */
-	if (boot_cpu_has(X86_FEATURE_IBRS)) {
+	if (mode != SPECTRE_V2_IBRS_ALL && boot_cpu_has(X86_FEATURE_IBRS)) {
 		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
 		pr_info("Spectre mitigation: Restricting branch speculation (enabling IBRS) for firmware calls\n");
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 91e3539..d99ba9b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3419,13 +3419,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		vmx->spec_ctrl = data;
 
-		if (!data)
+		if (!data && !spectre_v2_ibrs_all())
 			break;
 
 		/*
 		 * For non-nested:
 		 * When it's written (to non-zero) for the first time, pass
-		 * it through.
+		 * it through unless we have IBRS_ALL and it should just be
+		 * set for ever.
 		 *
 		 * For nested:
 		 * The handling of the MSR bitmap for L2 guests is done in
@@ -9441,7 +9442,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * is no need to worry about the conditional branch over the wrmsr
 	 * being speculatively taken.
 	 */
-	if (vmx->spec_ctrl)
+	if (vmx->spec_ctrl && !spectre_v2_ibrs_all())
 		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
@@ -9577,11 +9578,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * save it.
 	 */
-	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
-		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	if (!spectre_v2_ibrs_all) {
+		if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+			rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
-	if (vmx->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		if (vmx->spec_ctrl)
+			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	}
 
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();