@@ -633,8 +633,16 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- return rsm_load_state_64(ctxt, &smram.smram64);
+ ret = rsm_load_state_64(ctxt, &smram.smram64);
else
#endif
- return rsm_load_state_32(ctxt, &smram.smram32);
+ ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+ /*
+ * Set nested_run_pending to ensure completion of a nested VM-Entry
+ */
+ if (ret == X86EMUL_CONTINUE && ctxt->ops->is_guest_mode(ctxt))
+ vcpu->arch.nested_run_pending = 1;
+
+ return ret;
}
@@ -4699,8 +4699,6 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
if (ret)
goto unmap_save;
- vcpu->arch.nested_run_pending = 1;
-
unmap_save:
kvm_vcpu_unmap(vcpu, &map_save, true);
unmap_map:
@@ -8214,7 +8214,6 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
if (ret)
return ret;
- vcpu->arch.nested_run_pending = 1;
vmx->nested.smm.guest_mode = false;
}
return 0;
Syzkaller found a warning triggered in nested_vmx_vmexit(). The nested_run_pending flag is non-zero, even though we're in nested_vmx_vmexit(). Generally, trying to cancel a pending entry is considered a bug. However in this particular scenario, the kernel behavior seems correct. Syzkaller scenario: 1) Set up VCPU's 2) Run some code with KVM_RUN in L2 as a nested guest 3) Return from KVM_RUN 4) Inject KVM_SMI into the VCPU 5) Change the EFER register with KVM_SET_SREGS to value 0x2501 6) Run some code on the VCPU using KVM_RUN 7) Observe following behavior: kvm_smm_transition: vcpu 0: entering SMM, smbase 0x30000 kvm_entry: vcpu 0, rip 0x8000 kvm_entry: vcpu 0, rip 0x8000 kvm_entry: vcpu 0, rip 0x8002 kvm_smm_transition: vcpu 0: leaving SMM, smbase 0x30000 kvm_nested_vmenter: rip: 0x0000000000008002 vmcs: 0x0000000000007000 nested_rip: 0x0000000000000000 int_ctl: 0x00000000 event_inj: 0x00000000 nested_ept=n guest cr3: 0x0000000000002000 kvm_nested_vmexit_inject: reason: TRIPLE_FAULT ext_inf1: 0x0000000000000000 ext_inf2: 0x0000000000000000 ext_int: 0x00000000 ext_int_err: 0x00000000 What happened here is an SMI was injected immediately and the handler was called at address 0x8000; all is good. Later, an RSM instruction is executed in an emulator to return to the nested VM. em_rsm() is called, which leads to emulator_leave_smm(). A part of this function calls the VMX/SVM callback, in this case vmx_leave_smm(). It attempts to set up a pending reentry to the guest VM by calling nested_vmx_enter_non_root_mode() and sets the nested_run_pending flag to one. Unfortunately, later in emulator_leave_smm(), rsm_load_state_64() fails to write an invalid EFER to the MSR. This results in em_rsm() calling the triple_fault callback. At this point, it's clear that KVM should perform a vmexit, but nested_run_pending is left set to 1. Similar flow goes for SVM, as the bug also reproduces on AMD platforms. [Notes above by Michal Wilczynski of the bug analysis] To address the issue, this change sets the nested_run_pending flag only after a successful emulation of the RSM instruction. Previously, it was set (prematurely) inside the vendor-specific leave_smm() callback since commit 759cbd59674a ("KVM: x86: nSVM/nVMX: set nested_run_pending on VM entry which is a result of RSM"). Fixes: 759cbd59674a ("KVM: x86: nSVM/nVMX: set nested_run_pending on VM entry which is a result of RSM") Reported-by: Zheyu Ma <zheyuma97@gmail.com> Closes: https://lore.kernel.org/all/CAMhUBjmXMYsEoVYw_M8hSZjBMHh24i88QYm-RY6HDta5YZ7Wgw@mail.gmail.com Analyzed-by: Michal Wilczynski <michal.wilczynski@intel.com> Signed-off-by: Kishen Maloor <kishen.maloor@intel.com> --- arch/x86/kvm/smm.c | 12 ++++++++++-- arch/x86/kvm/svm/svm.c | 2 -- arch/x86/kvm/vmx/vmx.c | 1 - 3 files changed, 10 insertions(+), 5 deletions(-)