Message ID | 20210809145343.97685-3-eesposit@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: nSVM: avoid TOC/TOU race when checking vmcb12 | expand |
On 09/08/21 16:53, Emanuele Giuseppe Esposito wrote: > svm_switch_vmcb(svm, &svm->nested.vmcb02); > + > + /* Save vmcb12's EFER, CR0 and CR4 to avoid TOC/TOU races. */ > + vmcb12_efer = vmcb12->save.efer; > + vmcb12_cr0 = vmcb12->save.cr0; > + vmcb12_cr4 = vmcb12->save.cr4; > + > + if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save, vmcb12_efer, > + vmcb12_cr0, vmcb12_cr4) || > + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { > + vmcb12->control.exit_code = SVM_EXIT_ERR; > + vmcb12->control.exit_code_hi = 0; > + vmcb12->control.exit_info_1 = 0; > + vmcb12->control.exit_info_2 = 0; > + return 1; > + } At this point you have already done a svm_switch_vmcb, so you need to undo its effects. This is indeed what returning 1 achieves. However, if you return 1 then the caller does: if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) { svm->nested.nested_run_pending = 0; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; nested_svm_vmexit(svm); } where we have three steps: 1) clearing nested_run_pending is all good 2) setting the exit code is good, but then you don't need to do it in enter_svm_guest_mode 3) nested_svm_vmexit is problematic; nested_svm_vmexit copies values from VMCB02 to VMCB12 but those have not been set yet (nested_vmcb02_prepare_save takes care of it). The simplest way to fix this is to add a bool argument to nested_svm_vmexit, saying whether the vmcb12 save area should be updated or not. Paolo
On Mon, 2021-08-09 at 16:53 +0200, Emanuele Giuseppe Esposito wrote: > Move the checks done by nested_vmcb_valid_sregs and nested_vmcb_check_controls > directly in enter_svm_guest_mode, and save the values of vmcb12's > efer, cr0 and cr4 in local variable that are then passed to > nested_vmcb02_prepare_save. This prevents from creating TOC/TOU races. > > This also avoids the need of force-setting EFER_SVME in > nested_vmcb02_prepare_save. > > Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com> > --- > arch/x86/kvm/svm/nested.c | 72 +++++++++++++++++++-------------------- > 1 file changed, 36 insertions(+), 36 deletions(-) > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c > index 0ac2d14add15..04e9e947deb9 100644 > --- a/arch/x86/kvm/svm/nested.c > +++ b/arch/x86/kvm/svm/nested.c > @@ -259,20 +259,14 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, > > /* Common checks that apply to both L1 and L2 state. */ > static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, > - struct vmcb_save_area *save) > + struct vmcb_save_area *save, > + u64 efer, u64 cr0, u64 cr4) > { > - /* > - * FIXME: these should be done after copying the fields, > - * to avoid TOC/TOU races. For these save area checks > - * the possible damage is limited since kvm_set_cr0 and > - * kvm_set_cr4 handle failure; EFER_SVME is an exception > - * so it is force-set later in nested_prepare_vmcb_save. > - */ > - if (CC(!(save->efer & EFER_SVME))) > + if (CC(!(efer & EFER_SVME))) > return false; > > - if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || > - CC(save->cr0 & ~0xffffffffULL)) > + if (CC((cr0 & X86_CR0_CD) == 0 && (cr0 & X86_CR0_NW)) || > + CC(cr0 & ~0xffffffffULL)) > return false; > > if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) > @@ -283,17 +277,16 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, > * except that EFER.LMA is not checked by SVM against > * CR0.PG && EFER.LME. > */ > - if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { > - if (CC(!(save->cr4 & X86_CR4_PAE)) || > - CC(!(save->cr0 & X86_CR0_PE)) || > + if ((efer & EFER_LME) && (cr0 & X86_CR0_PG)) { > + if (CC(!(cr4 & X86_CR4_PAE)) || CC(!(cr0 & X86_CR0_PE)) || > CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) > return false; > } > > - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) > + if (CC(!kvm_is_valid_cr4(vcpu, cr4))) > return false; > > - if (CC(!kvm_valid_efer(vcpu, save->efer))) > + if (CC(!kvm_valid_efer(vcpu, efer))) > return false; > > return true; > @@ -434,7 +427,9 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) > svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; > } > > -static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) > +static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, > + struct vmcb *vmcb12, > + u64 efer, u64 cr0, u64 cr4) > { > bool new_vmcb12 = false; > > @@ -463,15 +458,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 > > kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); > > - /* > - * Force-set EFER_SVME even though it is checked earlier on the > - * VMCB12, because the guest can flip the bit between the check > - * and now. Clearing EFER_SVME would call svm_free_nested. > - */ > - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); > + svm_set_efer(&svm->vcpu, efer); > > - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); > - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); > + svm_set_cr0(&svm->vcpu, cr0); > + svm_set_cr4(&svm->vcpu, cr4); > > svm->vcpu.arch.cr2 = vmcb12->save.cr2; > > @@ -567,6 +557,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, > { > struct vcpu_svm *svm = to_svm(vcpu); > int ret; > + u64 vmcb12_efer, vmcb12_cr0, vmcb12_cr4; > > trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, > vmcb12->save.rip, > @@ -589,8 +580,25 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, > nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); > > svm_switch_vmcb(svm, &svm->nested.vmcb02); > + > + /* Save vmcb12's EFER, CR0 and CR4 to avoid TOC/TOU races. */ > + vmcb12_efer = vmcb12->save.efer; > + vmcb12_cr0 = vmcb12->save.cr0; > + vmcb12_cr4 = vmcb12->save.cr4; > + > + if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save, vmcb12_efer, > + vmcb12_cr0, vmcb12_cr4) || > + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { > + vmcb12->control.exit_code = SVM_EXIT_ERR; > + vmcb12->control.exit_code_hi = 0; > + vmcb12->control.exit_info_1 = 0; > + vmcb12->control.exit_info_2 = 0; > + return 1; > + } > + > nested_vmcb02_prepare_control(svm); > - nested_vmcb02_prepare_save(svm, vmcb12); > + nested_vmcb02_prepare_save(svm, vmcb12, vmcb12_efer, vmcb12_cr0, > + vmcb12_cr4); > > ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, > nested_npt_enabled(svm), true); > @@ -641,15 +649,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) > > nested_load_control_from_vmcb12(svm, &vmcb12->control); > > - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || > - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { > - vmcb12->control.exit_code = SVM_EXIT_ERR; > - vmcb12->control.exit_code_hi = 0; > - vmcb12->control.exit_info_1 = 0; > - vmcb12->control.exit_info_2 = 0; > - goto out; > - } > - > /* > * Since vmcb01 is not in use, we can use it to store some of the L1 > * state. > @@ -1336,7 +1335,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, > if (!(save->cr0 & X86_CR0_PG) || > !(save->cr0 & X86_CR0_PE) || > (save->rflags & X86_EFLAGS_VM) || > - !nested_vmcb_valid_sregs(vcpu, save)) > + !nested_vmcb_valid_sregs(vcpu, save, save->efer, save->cr0, > + save->cr4)) > goto out_free; > > /* This is a very interesting approach. Your approach is to copy only some fields and still pass pointer to the guest controlled save area to relevant functions, thus in the future this pointer can still be used again to trigger TOC/TOI races. My approach was to copy all the vmcb12 (save and control) to a page (we currently only copy the control area), and then just use the copy everywhere. The disadvantage of my approach is that fields are copied twice, once from vmcb12 to its local copy, and then from the local copy to vmcb02, however this approach is generic in such a way that TOC/TOI races become impossible. The disadvantage of your approach is that only some fields are copied and there is still a chance of TOC/TOI race in the future. I have a suggestion: Maybe we should just blindly copy all registers as is to vmcb02 (even before we switch to it), and then check the registers, and for registers for which the user visible value differs from the value that CPU sees (CR0/CR3/CR4/EFER/RFLAGS/), replace the guest CR value temporarily stored in VMCB02 with filtered value, similar to a slight abuse we do as I explained in Note 1 below? Best regards, Maxim Levitsky PS: This is a bit of documentation I had just written and I would like to share, about a few things that might be useful in the context of this work. I might not understand everything correctly, and so there might be mistakes, so feel free to correct me. There also are probably corner cases I didn’t paid much attention to. Also the below is mostly true for SVM, on VMX there is a bit more caching to avoid vmreads/vmwrites). *** Note 1: Explanantion of some state changes that nested SVM entry/exit does: *** In nested_svm_vmrun we have this code which has a slightly misleading comment: /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. */ svm->vmcb01.ptr->save.efer = vcpu->arch.efer; svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); if (!npt_enabled) svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); But the VMCB01 already contains the L1 state, and what is actually happening here is slightly different: First we have: svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); This writes back the RIP promotion that is in KVM register cache back to vmb01, that was done by kvm_skip_emulated_instruction Normally this happens on guest entry (see svm_vcpu_run), but has to be done now as we will not enter vmcb01 but vmcb02. Then we have: svm->vmcb01.ptr->save.efer = vcpu->arch.efer; svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); if (!npt_enabled) svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); The purpose of these writes is to replace the CPU visible values of these registers which are already stored in vmcb01, with guest visible values which we store in vcpu->arch.* and return to the guest in case it reads them, for all registers that differ in CPU and guest visible value. This is a slight abuse of vmb01 but maybe it is the best way to do it. I don't know. Those registers (efer, cr0, cr4, rflags, and cr3), roughly speaking are the only registers that can have different values in regard to what the guest thinks the register value is and what the real value that the CPU sees. In particular: -> EFER as you see in svm_set_efer is modified with a few things when shadow paging is enabled, and the SVME bit is force enabled, since AMD forces us to have EFER.SVME enabled when a guest is running, while the guest might not enable the SVM at all. -> CR0 also has some slight tweaks in svm_set_cr0 mostly for !npt_enabled case as well. -> CR4 - same story, see the svm_set_cr4 -> RFLAGS - we try (only partially) to hide the host set RFLAGS.TF when the KVM_GUESTDBG_SINGLESTEP guest debug feature is active. -> CR3 - when using shadowing paging, it contains the shadow paging root We also have some code for overriding the debug registers like that but it is handled differently and I haven’t dug deep into it. Later on nested VMexit, we do the opposite kvm_set_rflags(vcpu, svm->vmcb->save.rflags); svm_set_efer(vcpu, svm->vmcb->save.efer); svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); svm_set_cr4(vcpu, svm->vmcb->save.cr4); This code achieves two things at the same time -> It updates the KVM register cache back with the L1 values -> It updates the vmcb01 register values back to CPU visible values, since we call the functions that are called when the guest modifies those registers, and so all the 'modifications' are done again prior to loading the value into the vmcb. We also have this on nested VMexit: kvm_rax_write(vcpu, svm->vmcb->save.rax); kvm_rsp_write(vcpu, svm->vmcb->save.rsp); kvm_rip_write(vcpu, svm->vmcb->save.rip); This only updates the KVM register cache with the previous L1 values, while VMCB01 values are already up to date (unchanged) (but to be honest, on next VM entry we will still write these registers from KVM register cache back to vmcb01 pointlessly to be honest) Note that RSP/RAX is not needed to be written back to vmcb01 on nested entry as the L1 RSP/RAX is up to date (skipping the VMRUN instruction doesn’t change those, although otherwise we do write those back to current vmcb). *** Note 2: Note on management and synchronization of the KVM’s internal guest register state: *** I am talking about (vcpu->arch.regs and vcpu->arch.cr*, and such) This state is supposed to be the master copy of the guest register state, and usually it is, and keeping it up to date vs vmcb state is done differently for different registers. -> For general purpose registers which VMCS/VMCB don’t contain, there is nothing to synchronize, and these registers are saved/loaded on each guest entry/exit. -> For general purpose registers which SVM does load/store from/to vmcb we update them manually in the cache (in svm_vcpu_run), in both directions. That includes RIP as well. -> Most of the control registers are intercepted, and thus the register cache is updated when the guest tries to change them. Those that aren’t intercepted (CR3 with NPT for example), their cached value is also updated on each guest entry/exit. -> Some more rare used registers (e.g segment registers on SVM) are not cached in vcpu->arch.* at all, but rather have functions to read them (.get_segment, .get_msr, .get_rflags and such) which read them from the vmcb thus for those registers, the vmcb is the master copy holding them. There is also a notion of available and dirty registers which I didn't mention here, but those aren't used much for SVM. PDPTRs, ahem, ahem.... *** Note 3: A note about nested VMX *** Note that on nested VMX TOC/TOU races are not an issue. On VMX, all VMCS manipulation are done by: 1. Loading a VMCS (using vmptrld instruction), at that point the CPU is free to load the whole VMCS into internal memory 2. Changing the VMCS by using vmread/vmwrite instruction and/or by running a VM, all of them are free to change the internal copy and not write anything back to the memory image. 3. Finally, "clearing" the VMCS by running vmclear instruction in it, which finally have to write back the cpu cache to the vmcs in the memory. For nested VMX, this means that we read the whole vmcb12 into a kernel copy (vmx->nested.cached_vmcs12), and from there only the kernel copy is updated when the nested guest uses vmread/vmwrite. Only when the guest does vmclear we flush back our kernel copy back to the vmcs12 in the memory. On nested guest entry thus, even if a malicious L1 tries to update the vmcs12 memory we won’t access it. There is also the notion of shadow vmcs and evmcs but for our sanity I don't want to talk about them yet.
On Wed, Aug 11, 2021, Maxim Levitsky wrote: > On Mon, 2021-08-09 at 16:53 +0200, Emanuele Giuseppe Esposito wrote: > > @@ -1336,7 +1335,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, > > if (!(save->cr0 & X86_CR0_PG) || > > !(save->cr0 & X86_CR0_PE) || > > (save->rflags & X86_EFLAGS_VM) || > > - !nested_vmcb_valid_sregs(vcpu, save)) > > + !nested_vmcb_valid_sregs(vcpu, save, save->efer, save->cr0, > > + save->cr4)) > > goto out_free; > > > > /* > The disadvantage of my approach is that fields are copied twice, once from > vmcb12 to its local copy, and then from the local copy to vmcb02, however > this approach is generic in such a way that TOC/TOI races become impossible. > > The disadvantage of your approach is that only some fields are copied and > there is still a chance of TOC/TOI race in the future. The partial copy makes me nervous too. I also don't like pulling out select registers and passing them by value; IMO the resulting code is harder to follow and will be more difficult to maintain, e.g. it won't scale if the list of regs to check grows. But I don't think we need to copy _everything_. There's also an opportunity to clean up svm_set_nested_state(), though the ABI ramifications may be problematic. Instead of passing vmcb_control_area and vmcb_save_area to nested_vmcb_valid_sregs() and nested_vmcb_valid_sregs(), pass svm_nested_state and force the helpers to extract the save/control fields from the nested state. If a new check is added to KVM, it will be obvious (and hopefully fail) if the state being check is not copied from vmcb12. Regarding svm_set_nested_state(), if we can clobber svm->nested.ctl and svm->nested.save (doesn't exist currently) on a failed ioctl(), then the temporary allocations for those can be replaced with using svm->nested as the buffer. And to mitigate the cost of copying to a kernel-controlled cache, we should use the VMCB Clean bits as they're intended. Each set bit in the VMCB Clean field allows the processor to load one guest register or group of registers from the hardware cache; E.g. copy from vmcb12 iff the clean bit is clear. Then we could further optimize nested VMRUN to skip checks based on clean bits.
Hi Sean, (Spoiler alert: I am new on all this stuff, so I would like to have some clarifications about your suggestion. Thank you in advance) On 12/08/2021 01:25, Sean Christopherson wrote: > On Wed, Aug 11, 2021, Maxim Levitsky wrote: >> On Mon, 2021-08-09 at 16:53 +0200, Emanuele Giuseppe Esposito wrote: >>> @@ -1336,7 +1335,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, >>> if (!(save->cr0 & X86_CR0_PG) || >>> !(save->cr0 & X86_CR0_PE) || >>> (save->rflags & X86_EFLAGS_VM) || >>> - !nested_vmcb_valid_sregs(vcpu, save)) >>> + !nested_vmcb_valid_sregs(vcpu, save, save->efer, save->cr0, >>> + save->cr4)) >>> goto out_free; >>> >>> /* >> The disadvantage of my approach is that fields are copied twice, once from >> vmcb12 to its local copy, and then from the local copy to vmcb02, however >> this approach is generic in such a way that TOC/TOI races become impossible. >> >> The disadvantage of your approach is that only some fields are copied and >> there is still a chance of TOC/TOI race in the future. > > The partial copy makes me nervous too. I also don't like pulling out select > registers and passing them by value; IMO the resulting code is harder to follow > and will be more difficult to maintain, e.g. it won't scale if the list of regs > to check grows. > > But I don't think we need to copy _everything_. There's also an opportunity to > clean up svm_set_nested_state(), though the ABI ramifications may be problematic. > > Instead of passing vmcb_control_area and vmcb_save_area to nested_vmcb_valid_sregs() > and nested_vmcb_valid_sregs(), pass svm_nested_state and force the helpers to extract > the save/control fields from the nested state. If a new check is added to KVM, it > will be obvious (and hopefully fail) if the state being check is not copied from vmcb12. I think I understood what you mean here, so basically you propose of having svm->nested.save and its helpers similar to copy_vmcb_save_area, but for now just copy the fields that we want to protect, ie only efer, cr0, cr4 and maybe also cr3 (to be consistent with VMCB_CR of clean bits). Then pass svm->nested.save instead of vmcb12->save to nested_vmcb_valid_sregs() and use it also for nested_vmcb02_prepare_save(), to avoid TOC/TOU issues. At least that's how I understood it. > > Regarding svm_set_nested_state(), if we can clobber svm->nested.ctl and svm->nested.save > (doesn't exist currently) on a failed ioctl(), then the temporary allocations for those > can be replaced with using svm->nested as the buffer. I am not sure what you mean with failed ioctl(), but I guess the meaning here is to replace the kzalloc'ed ctl and save variables with these two states (nested.save and nested.ctl). > > And to mitigate the cost of copying to a kernel-controlled cache, we should use > the VMCB Clean bits as they're intended. > > Each set bit in the VMCB Clean field allows the processor to load one guest > register or group of registers from the hardware cache; > > E.g. copy from vmcb12 iff the clean bit is clear. Then we could further optimize > nested VMRUN to skip checks based on clean bits. > I looked up the clean fields, so my understanding is that if we do set EFER/CR0/CR4 in nested_vmcb02_prepare_save() with nested.save, we don't need to check the clean bits because "The guest's execution can cause cached state to be updated, but the hypervisor is not responsible for setting VMCB Clean bits corresponding to any state changes caused by guest execution." and setting the VMCB_CR after copying the vmcb12 save fields into the nested state. But I don't think this is what you mean here, especially when saying > copy from vmcb12 iff the clean bit is clear Thank you, Emanuele
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 0ac2d14add15..04e9e947deb9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -259,20 +259,14 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, /* Common checks that apply to both L1 and L2 state. */ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) + struct vmcb_save_area *save, + u64 efer, u64 cr0, u64 cr4) { - /* - * FIXME: these should be done after copying the fields, - * to avoid TOC/TOU races. For these save area checks - * the possible damage is limited since kvm_set_cr0 and - * kvm_set_cr4 handle failure; EFER_SVME is an exception - * so it is force-set later in nested_prepare_vmcb_save. - */ - if (CC(!(save->efer & EFER_SVME))) + if (CC(!(efer & EFER_SVME))) return false; - if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || - CC(save->cr0 & ~0xffffffffULL)) + if (CC((cr0 & X86_CR0_CD) == 0 && (cr0 & X86_CR0_NW)) || + CC(cr0 & ~0xffffffffULL)) return false; if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) @@ -283,17 +277,16 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, * except that EFER.LMA is not checked by SVM against * CR0.PG && EFER.LME. */ - if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { - if (CC(!(save->cr4 & X86_CR4_PAE)) || - CC(!(save->cr0 & X86_CR0_PE)) || + if ((efer & EFER_LME) && (cr0 & X86_CR0_PG)) { + if (CC(!(cr4 & X86_CR4_PAE)) || CC(!(cr0 & X86_CR0_PE)) || CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + if (CC(!kvm_is_valid_cr4(vcpu, cr4))) return false; - if (CC(!kvm_valid_efer(vcpu, save->efer))) + if (CC(!kvm_valid_efer(vcpu, efer))) return false; return true; @@ -434,7 +427,9 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; } -static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, + struct vmcb *vmcb12, + u64 efer, u64 cr0, u64 cr4) { bool new_vmcb12 = false; @@ -463,15 +458,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - /* - * Force-set EFER_SVME even though it is checked earlier on the - * VMCB12, because the guest can flip the bit between the check - * and now. Clearing EFER_SVME would call svm_free_nested. - */ - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_efer(&svm->vcpu, efer); - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm_set_cr0(&svm->vcpu, cr0); + svm_set_cr4(&svm->vcpu, cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -567,6 +557,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, { struct vcpu_svm *svm = to_svm(vcpu); int ret; + u64 vmcb12_efer, vmcb12_cr0, vmcb12_cr4; trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, vmcb12->save.rip, @@ -589,8 +580,25 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); + + /* Save vmcb12's EFER, CR0 and CR4 to avoid TOC/TOU races. */ + vmcb12_efer = vmcb12->save.efer; + vmcb12_cr0 = vmcb12->save.cr0; + vmcb12_cr4 = vmcb12->save.cr4; + + if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save, vmcb12_efer, + vmcb12_cr0, vmcb12_cr4) || + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; + return 1; + } + nested_vmcb02_prepare_control(svm); - nested_vmcb02_prepare_save(svm, vmcb12); + nested_vmcb02_prepare_save(svm, vmcb12, vmcb12_efer, vmcb12_cr0, + vmcb12_cr4); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm), true); @@ -641,15 +649,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) nested_load_control_from_vmcb12(svm, &vmcb12->control); - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { - vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = 0; - vmcb12->control.exit_info_1 = 0; - vmcb12->control.exit_info_2 = 0; - goto out; - } - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. @@ -1336,7 +1335,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_valid_sregs(vcpu, save)) + !nested_vmcb_valid_sregs(vcpu, save, save->efer, save->cr0, + save->cr4)) goto out_free; /*
Move the checks done by nested_vmcb_valid_sregs and nested_vmcb_check_controls directly in enter_svm_guest_mode, and save the values of vmcb12's efer, cr0 and cr4 in local variable that are then passed to nested_vmcb02_prepare_save. This prevents from creating TOC/TOU races. This also avoids the need of force-setting EFER_SVME in nested_vmcb02_prepare_save. Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com> --- arch/x86/kvm/svm/nested.c | 72 +++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 36 deletions(-)