Message ID | 20170710204936.4001-4-bsd@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 10.07.2017 22:49, Bandan Das wrote: > When L2 uses vmfunc, L0 utilizes the associated vmexit to > emulate a switching of the ept pointer by reloading the > guest MMU. > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > Signed-off-by: Bandan Das <bsd@redhat.com> > --- > arch/x86/include/asm/vmx.h | 6 +++++ > arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++--- > 2 files changed, 61 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index da5375e..5f63a2e 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -115,6 +115,10 @@ > #define VMX_MISC_SAVE_EFER_LMA 0x00000020 > #define VMX_MISC_ACTIVITY_HLT 0x00000040 > > +/* VMFUNC functions */ > +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 > +#define VMFUNC_EPTP_ENTRIES 512 > + > static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) > { > return vmx_basic & GENMASK_ULL(30, 0); > @@ -200,6 +204,8 @@ enum vmcs_field { > EOI_EXIT_BITMAP2_HIGH = 0x00002021, > EOI_EXIT_BITMAP3 = 0x00002022, > EOI_EXIT_BITMAP3_HIGH = 0x00002023, > + EPTP_LIST_ADDRESS = 0x00002024, > + EPTP_LIST_ADDRESS_HIGH = 0x00002025, > VMREAD_BITMAP = 0x00002026, > VMWRITE_BITMAP = 0x00002028, > XSS_EXIT_BITMAP = 0x0000202C, > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index fe8f5fc..0a969fb 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -246,6 +246,7 @@ struct __packed vmcs12 { > u64 eoi_exit_bitmap1; > u64 eoi_exit_bitmap2; > u64 eoi_exit_bitmap3; > + u64 eptp_list_address; > u64 xss_exit_bitmap; > u64 guest_physical_address; > u64 vmcs_link_pointer; > @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { > FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), > FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), > FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), > + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), > FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), > FIELD64(GUEST_PHYSICAL_ADDRESS, 
guest_physical_address), > FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), > @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) > return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); > } > > +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) > +{ > + return nested_cpu_has_vmfunc(vmcs12) && > + (vmcs12->vm_function_control & I wonder if it makes sense to rename vm_function_control to - vmfunc_control - vmfunc_controls (so it matches nested_vmx_vmfunc_controls) - vmfunc_ctrl > + VMX_VMFUNC_EPTP_SWITCHING); > +} > + > static inline bool is_nmi(u32 intr_info) > { > return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) > @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) > if (cpu_has_vmx_vmfunc()) { > vmx->nested.nested_vmx_secondary_ctls_high |= > SECONDARY_EXEC_ENABLE_VMFUNC; > - vmx->nested.nested_vmx_vmfunc_controls = 0; > + /* > + * Advertise EPTP switching unconditionally > + * since we emulate it > + */ > + vmx->nested.nested_vmx_vmfunc_controls = > + VMX_VMFUNC_EPTP_SWITCHING;> } > > /* > @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) > struct vcpu_vmx *vmx = to_vmx(vcpu); > struct vmcs12 *vmcs12; > u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; > + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; > + struct page *page = NULL; > + u64 *l1_eptp_list, address; > > /* > * VMFUNC is only supported for nested guests, but we always enable the > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) > } > > vmcs12 = get_vmcs12(vcpu); > - if ((vmcs12->vm_function_control & (1 << function)) == 0) > + if (((vmcs12->vm_function_control & (1 << function)) == 0) || > + WARN_ON_ONCE(function)) "... instruction causes a VM exit if the bit at position EAX is 0 in the VM-function controls (the selected VM function is not enabled)." So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then completely. 
> + goto fail; > + > + if (!nested_cpu_has_ept(vmcs12) || > + !nested_cpu_has_eptp_switching(vmcs12)) > + goto fail; > + > + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES) > + goto fail; I can find the definition for a vmexit in case of index >= VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. Can you give me a hint? > + > + page = nested_get_page(vcpu, vmcs12->eptp_list_address); > + if (!page) > goto fail; > - WARN_ONCE(1, "VMCS12 VM function control should have been zero"); > + > + l1_eptp_list = kmap(page); > + address = l1_eptp_list[index]; > + if (!address) > + goto fail; Can you move that check to the other address checks below? (or rework if this makes sense, see below) > + /* > + * If the (L2) guest does a vmfunc to the currently > + * active ept pointer, we don't have to do anything else > + */ > + if (vmcs12->ept_pointer != address) { > + if (address >> cpuid_maxphyaddr(vcpu) || > + !IS_ALIGNED(address, 4096)) Couldn't the pfn still be invalid and make kvm_mmu_reload() fail? (triggering a KVM_REQ_TRIPLE_FAULT) > + goto fail; > + kvm_mmu_unload(vcpu); > + vmcs12->ept_pointer = address; > + kvm_mmu_reload(vcpu); I was thinking about something like this: kvm_mmu_unload(vcpu); old = vmcs12->ept_pointer; vmcs12->ept_pointer = address; if (kvm_mmu_reload(vcpu)) { /* pointer invalid, restore previous state */ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); vmcs12->ept_pointer = old; kvm_mmu_reload(vcpu); goto fail; } Then you can inherit the checks from mmu_check_root(). Wonder why I can't spot checks for cpuid_maxphyaddr() / IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The checks should be identical. > + kunmap(page); > + nested_release_page_clean(page); shouldn't the kunmap + nested_release_page_clean go outside the if clause? 
> + } > + return kvm_skip_emulated_instruction(vcpu); > > fail: > + if (page) { > + kunmap(page); > + nested_release_page_clean(page); > + } > nested_vmx_vmexit(vcpu, vmx->exit_reason, > vmcs_read32(VM_EXIT_INTR_INFO), > vmcs_readl(EXIT_QUALIFICATION)); > David and mmu code are not yet best friends. So sorry if I am missing something.
On 11/07/2017 09:51, David Hildenbrand wrote: >> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) >> +{ >> + return nested_cpu_has_vmfunc(vmcs12) && >> + (vmcs12->vm_function_control & > > I wonder if it makes sense to rename vm_function_control to > - vmfunc_control > - vmfunc_controls (so it matches nested_vmx_vmfunc_controls) > - vmfunc_ctrl Blame Intel for this. :) They use full English names for VMCS fields (so "VM-function control") and uppercase names for MSRs (so "IA32_VMX_VMFUNC"). "nested_vmx_vmfunc_controls" could be renamed to "nested_vmx_vmfunc" for consistency, but I like the longer name too. Paolo
[David did a great review, so I'll just point out things I noticed.] 2017-07-11 09:51+0200, David Hildenbrand: > On 10.07.2017 22:49, Bandan Das wrote: > > When L2 uses vmfunc, L0 utilizes the associated vmexit to > > emulate a switching of the ept pointer by reloading the > > guest MMU. > > > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > > Signed-off-by: Bandan Das <bsd@redhat.com> > > --- > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) > > } > > > > vmcs12 = get_vmcs12(vcpu); > > - if ((vmcs12->vm_function_control & (1 << function)) == 0) > > + if (((vmcs12->vm_function_control & (1 << function)) == 0) || > > + WARN_ON_ONCE(function)) > > "... instruction causes a VM exit if the bit at position EAX is 0 in the > VM-function controls (the selected VM function is > not enabled)." > > So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then > completely. It assumes that vm_function_control is not > 1, which is (should be) guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR is 1. > > + goto fail; The rest of the code assumes that the function is VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable. Writing it as WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING) would be cleared and I'd prefer to move the part that handles VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is going to add more than one VM FUNC. :]) > > + if (!nested_cpu_has_ept(vmcs12) || > > + !nested_cpu_has_eptp_switching(vmcs12)) > > + goto fail; This brings me to a missing vm-entry check: If “EPTP switching” VM-function control is 1, the “enable EPT” VM-execution control must also be 1. In addition, the EPTP-list address must satisfy the following checks: • Bits 11:0 of the address must be 0. • The address must not set any bits beyond the processor’s physical-address width. 
so this one could be if (!nested_cpu_has_eptp_switching(vmcs12) || WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) after adding the check.
David Hildenbrand <david@redhat.com> writes: > On 10.07.2017 22:49, Bandan Das wrote: >> When L2 uses vmfunc, L0 utilizes the associated vmexit to >> emulate a switching of the ept pointer by reloading the >> guest MMU. >> >> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> >> Signed-off-by: Bandan Das <bsd@redhat.com> >> --- >> arch/x86/include/asm/vmx.h | 6 +++++ >> arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++--- >> 2 files changed, 61 insertions(+), 3 deletions(-) >> >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h >> index da5375e..5f63a2e 100644 >> --- a/arch/x86/include/asm/vmx.h >> +++ b/arch/x86/include/asm/vmx.h >> @@ -115,6 +115,10 @@ >> #define VMX_MISC_SAVE_EFER_LMA 0x00000020 >> #define VMX_MISC_ACTIVITY_HLT 0x00000040 >> >> +/* VMFUNC functions */ >> +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 >> +#define VMFUNC_EPTP_ENTRIES 512 >> + >> static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) >> { >> return vmx_basic & GENMASK_ULL(30, 0); >> @@ -200,6 +204,8 @@ enum vmcs_field { >> EOI_EXIT_BITMAP2_HIGH = 0x00002021, >> EOI_EXIT_BITMAP3 = 0x00002022, >> EOI_EXIT_BITMAP3_HIGH = 0x00002023, >> + EPTP_LIST_ADDRESS = 0x00002024, >> + EPTP_LIST_ADDRESS_HIGH = 0x00002025, >> VMREAD_BITMAP = 0x00002026, >> VMWRITE_BITMAP = 0x00002028, >> XSS_EXIT_BITMAP = 0x0000202C, >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index fe8f5fc..0a969fb 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -246,6 +246,7 @@ struct __packed vmcs12 { >> u64 eoi_exit_bitmap1; >> u64 eoi_exit_bitmap2; >> u64 eoi_exit_bitmap3; >> + u64 eptp_list_address; >> u64 xss_exit_bitmap; >> u64 guest_physical_address; >> u64 vmcs_link_pointer; >> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { >> FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), >> FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), >> FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), >> + FIELD64(EPTP_LIST_ADDRESS, 
eptp_list_address), >> FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), >> FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), >> FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), >> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) >> return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); >> } >> >> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) >> +{ >> + return nested_cpu_has_vmfunc(vmcs12) && >> + (vmcs12->vm_function_control & > > I wonder if it makes sense to rename vm_function_control to > - vmfunc_control > - vmfunc_controls (so it matches nested_vmx_vmfunc_controls) > - vmfunc_ctrl I tend to follow the SDM names because it's easy to look for them. >> + VMX_VMFUNC_EPTP_SWITCHING); >> +} >> + >> static inline bool is_nmi(u32 intr_info) >> { >> return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) >> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) >> if (cpu_has_vmx_vmfunc()) { >> vmx->nested.nested_vmx_secondary_ctls_high |= >> SECONDARY_EXEC_ENABLE_VMFUNC; >> - vmx->nested.nested_vmx_vmfunc_controls = 0; >> + /* >> + * Advertise EPTP switching unconditionally >> + * since we emulate it >> + */ >> + vmx->nested.nested_vmx_vmfunc_controls = >> + VMX_VMFUNC_EPTP_SWITCHING;> } >> >> /* >> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) >> struct vcpu_vmx *vmx = to_vmx(vcpu); >> struct vmcs12 *vmcs12; >> u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; >> + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; >> + struct page *page = NULL; >> + u64 *l1_eptp_list, address; >> >> /* >> * VMFUNC is only supported for nested guests, but we always enable the >> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) >> } >> >> vmcs12 = get_vmcs12(vcpu); >> - if ((vmcs12->vm_function_control & (1 << function)) == 0) >> + if (((vmcs12->vm_function_control & (1 << function)) == 0) || >> + WARN_ON_ONCE(function)) > > "... 
instruction causes a VM exit if the bit at position EAX is 0 in the > VM-function controls (the selected VM function is > not enabled)." > > So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then > completely. It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's not misused. >> + goto fail; >> + >> + if (!nested_cpu_has_ept(vmcs12) || >> + !nested_cpu_has_eptp_switching(vmcs12)) >> + goto fail; >> + >> + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES) >> + goto fail; > > I can find the definition for an vmexit in case of index >= > VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. > > Can you give me a hint? I don't think there is. Since, we are basically emulating eptp switching for L2, this is a good check to have. >> + >> + page = nested_get_page(vcpu, vmcs12->eptp_list_address); >> + if (!page) >> goto fail; >> - WARN_ONCE(1, "VMCS12 VM function control should have been zero"); >> + >> + l1_eptp_list = kmap(page); >> + address = l1_eptp_list[index]; >> + if (!address) >> + goto fail; > > Can you move that check to the other address checks below? (or rework if > this make sense, see below) > >> + /* >> + * If the (L2) guest does a vmfunc to the currently >> + * active ept pointer, we don't have to do anything else >> + */ >> + if (vmcs12->ept_pointer != address) { >> + if (address >> cpuid_maxphyaddr(vcpu) || >> + !IS_ALIGNED(address, 4096)) > > Couldn't the pfn still be invalid and make kvm_mmu_reload() fail? > (triggering a KVM_REQ_TRIPLE_FAULT) If there's a triple fault, I think it's a good idea to inject it back. Basically, there's no need to take care of damage control that L1 is intentionally doing. 
>> + goto fail; >> + kvm_mmu_unload(vcpu); >> + vmcs12->ept_pointer = address; >> + kvm_mmu_reload(vcpu); > > I was thinking about something like this: > > kvm_mmu_unload(vcpu); > old = vmcs12->ept_pointer; > vmcs12->ept_pointer = address; > if (kvm_mmu_reload(vcpu)) { > /* pointer invalid, restore previous state */ > kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); > vmcs12->ept_pointer = old; > kvm_mmu_reload(vcpu); > goto fail; > } > > The you can inherit the checks from mmu_check_root(). > > > Wonder why I can't spot checks for cpuid_maxphyaddr() / > IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The > checks should be identical. I think the reason is vmcs12->ept_pointer is never used directly. It's used to create a shadow table but nevertheless, the check should be there. > >> + kunmap(page); >> + nested_release_page_clean(page); > > shouldn't the kunmap + nested_release_page_clean go outside the if clause? :) Indeed, thanks for the catch. Bandan >> + } >> + return kvm_skip_emulated_instruction(vcpu); >> >> fail: >> + if (page) { >> + kunmap(page); >> + nested_release_page_clean(page); >> + } >> nested_vmx_vmexit(vcpu, vmx->exit_reason, >> vmcs_read32(VM_EXIT_INTR_INFO), >> vmcs_readl(EXIT_QUALIFICATION)); >> > > David and mmu code are not yet best friends. So sorry if I am missing > something.
Radim Krčmář <rkrcmar@redhat.com> writes: > [David did a great review, so I'll just point out things I noticed.] > > 2017-07-11 09:51+0200, David Hildenbrand: >> On 10.07.2017 22:49, Bandan Das wrote: >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to >> > emulate a switching of the ept pointer by reloading the >> > guest MMU. >> > >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> >> > Signed-off-by: Bandan Das <bsd@redhat.com> >> > --- >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) >> > } >> > >> > vmcs12 = get_vmcs12(vcpu); >> > - if ((vmcs12->vm_function_control & (1 << function)) == 0) >> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) || >> > + WARN_ON_ONCE(function)) >> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the >> VM-function controls (the selected VM function is >> not enabled)." >> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then >> completely. > > It assumes that vm_function_control is not > 1, which is (should be) > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR > is 1. > >> > + goto fail; > > The rest of the code assumes that the function is > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable. > > Writing it as > > WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING) > > would be cleared and I'd prefer to move the part that handles > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is > going to add more than one VM FUNC. :]) IMO, for now, this should be fine because we are not even passing through the hardware's eptp switching. Even if there are other vm functions, they won't be available for the nested case and cause any conflict. 
>> > + if (!nested_cpu_has_ept(vmcs12) || >> > + !nested_cpu_has_eptp_switching(vmcs12)) >> > + goto fail; > > This brings me to a missing vm-entry check: > > If “EPTP switching” VM-function control is 1, the “enable EPT” > VM-execution control must also be 1. In addition, the EPTP-list address > must satisfy the following checks: > • Bits 11:0 of the address must be 0. > • The address must not set any bits beyond the processor’s > physical-address width. > > so this one could be > > if (!nested_cpu_has_eptp_switching(vmcs12) || > WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) I will reverse the order here but the vm entry check is unnecessary because the check on the list address is already being done in this function. > after adding the check.
On Tue, Jul 11, 2017 at 10:58 AM, Bandan Das <bsd@redhat.com> wrote: > David Hildenbrand <david@redhat.com> writes: > >> On 10.07.2017 22:49, Bandan Das wrote: >>> When L2 uses vmfunc, L0 utilizes the associated vmexit to >>> emulate a switching of the ept pointer by reloading the >>> guest MMU. >>> >>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> >>> Signed-off-by: Bandan Das <bsd@redhat.com> >>> --- >>> arch/x86/include/asm/vmx.h | 6 +++++ >>> arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++--- >>> 2 files changed, 61 insertions(+), 3 deletions(-) >>> >>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h >>> index da5375e..5f63a2e 100644 >>> --- a/arch/x86/include/asm/vmx.h >>> +++ b/arch/x86/include/asm/vmx.h >>> @@ -115,6 +115,10 @@ >>> #define VMX_MISC_SAVE_EFER_LMA 0x00000020 >>> #define VMX_MISC_ACTIVITY_HLT 0x00000040 >>> >>> +/* VMFUNC functions */ >>> +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 >>> +#define VMFUNC_EPTP_ENTRIES 512 >>> + >>> static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) >>> { >>> return vmx_basic & GENMASK_ULL(30, 0); >>> @@ -200,6 +204,8 @@ enum vmcs_field { >>> EOI_EXIT_BITMAP2_HIGH = 0x00002021, >>> EOI_EXIT_BITMAP3 = 0x00002022, >>> EOI_EXIT_BITMAP3_HIGH = 0x00002023, >>> + EPTP_LIST_ADDRESS = 0x00002024, >>> + EPTP_LIST_ADDRESS_HIGH = 0x00002025, >>> VMREAD_BITMAP = 0x00002026, >>> VMWRITE_BITMAP = 0x00002028, >>> XSS_EXIT_BITMAP = 0x0000202C, >>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>> index fe8f5fc..0a969fb 100644 >>> --- a/arch/x86/kvm/vmx.c >>> +++ b/arch/x86/kvm/vmx.c >>> @@ -246,6 +246,7 @@ struct __packed vmcs12 { >>> u64 eoi_exit_bitmap1; >>> u64 eoi_exit_bitmap2; >>> u64 eoi_exit_bitmap3; >>> + u64 eptp_list_address; >>> u64 xss_exit_bitmap; >>> u64 guest_physical_address; >>> u64 vmcs_link_pointer; >>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { >>> FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), >>> 
FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), >>> FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), >>> + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), >>> FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), >>> FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), >>> FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), >>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) >>> return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); >>> } >>> >>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) >>> +{ >>> + return nested_cpu_has_vmfunc(vmcs12) && >>> + (vmcs12->vm_function_control & >> >> I wonder if it makes sense to rename vm_function_control to >> - vmfunc_control >> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls) >> - vmfunc_ctrl > > I tend to follow the SDM names because it's easy to look for them. > >>> + VMX_VMFUNC_EPTP_SWITCHING); >>> +} >>> + >>> static inline bool is_nmi(u32 intr_info) >>> { >>> return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) >>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) >>> if (cpu_has_vmx_vmfunc()) { >>> vmx->nested.nested_vmx_secondary_ctls_high |= >>> SECONDARY_EXEC_ENABLE_VMFUNC; >>> - vmx->nested.nested_vmx_vmfunc_controls = 0; >>> + /* >>> + * Advertise EPTP switching unconditionally >>> + * since we emulate it >>> + */ >>> + vmx->nested.nested_vmx_vmfunc_controls = >>> + VMX_VMFUNC_EPTP_SWITCHING;> } >>> >>> /* >>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) >>> struct vcpu_vmx *vmx = to_vmx(vcpu); >>> struct vmcs12 *vmcs12; >>> u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; >>> + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; >>> + struct page *page = NULL; >>> + u64 *l1_eptp_list, address; >>> >>> /* >>> * VMFUNC is only supported for nested guests, but we always enable the >>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) >>> } >>> >>> vmcs12 = 
get_vmcs12(vcpu); >>> - if ((vmcs12->vm_function_control & (1 << function)) == 0) >>> + if (((vmcs12->vm_function_control & (1 << function)) == 0) || >>> + WARN_ON_ONCE(function)) >> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the >> VM-function controls (the selected VM function is >> not enabled)." >> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then >> completely. > > It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's > not misused. > >>> + goto fail; >>> + >>> + if (!nested_cpu_has_ept(vmcs12) || >>> + !nested_cpu_has_eptp_switching(vmcs12)) >>> + goto fail; >>> + >>> + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES) >>> + goto fail; >> >> I can find the definition for an vmexit in case of index >= >> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. >> >> Can you give me a hint? > > I don't think there is. Since, we are basically emulating eptp switching > for L2, this is a good check to have. There is nothing wrong with a hypervisor using physical page 0 for whatever purpose it likes, including an EPTP list. >>> + >>> + page = nested_get_page(vcpu, vmcs12->eptp_list_address); >>> + if (!page) >>> goto fail; >>> - WARN_ONCE(1, "VMCS12 VM function control should have been zero"); >>> + >>> + l1_eptp_list = kmap(page); >>> + address = l1_eptp_list[index]; >>> + if (!address) >>> + goto fail; >> >> Can you move that check to the other address checks below? (or rework if >> this make sense, see below) >> >>> + /* >>> + * If the (L2) guest does a vmfunc to the currently >>> + * active ept pointer, we don't have to do anything else >>> + */ >>> + if (vmcs12->ept_pointer != address) { >>> + if (address >> cpuid_maxphyaddr(vcpu) || >>> + !IS_ALIGNED(address, 4096)) >> >> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail? >> (triggering a KVM_REQ_TRIPLE_FAULT) > > If there's a triple fault, I think it's a good idea to inject it > back. 
Basically, there's no need to take care of damage control > that L1 is intentionally doing. > >>> + goto fail; >>> + kvm_mmu_unload(vcpu); >>> + vmcs12->ept_pointer = address; >>> + kvm_mmu_reload(vcpu); >> >> I was thinking about something like this: >> >> kvm_mmu_unload(vcpu); >> old = vmcs12->ept_pointer; >> vmcs12->ept_pointer = address; >> if (kvm_mmu_reload(vcpu)) { >> /* pointer invalid, restore previous state */ >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); >> vmcs12->ept_pointer = old; >> kvm_mmu_reload(vcpu); >> goto fail; >> } >> >> The you can inherit the checks from mmu_check_root(). >> >> >> Wonder why I can't spot checks for cpuid_maxphyaddr() / >> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The >> checks should be identical. > > I think the reason is vmcs12->ept_pointer is never used directly. It's > used to create a shadow table but nevertheless, the check should be there. > >> >>> + kunmap(page); >>> + nested_release_page_clean(page); >> >> shouldn't the kunmap + nested_release_page_clean go outside the if clause? > > :) Indeed, thanks for the catch. > > Bandan > >>> + } >>> + return kvm_skip_emulated_instruction(vcpu); >>> >>> fail: >>> + if (page) { >>> + kunmap(page); >>> + nested_release_page_clean(page); >>> + } >>> nested_vmx_vmexit(vcpu, vmx->exit_reason, >>> vmcs_read32(VM_EXIT_INTR_INFO), >>> vmcs_readl(EXIT_QUALIFICATION)); >>> >> >> David and mmu code are not yet best friends. So sorry if I am missing >> something.
Bandan Das <bsd@redhat.com> writes: .... >>> + /* >>> + * If the (L2) guest does a vmfunc to the currently >>> + * active ept pointer, we don't have to do anything else >>> + */ >>> + if (vmcs12->ept_pointer != address) { >>> + if (address >> cpuid_maxphyaddr(vcpu) || >>> + !IS_ALIGNED(address, 4096)) >> >> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail? >> (triggering a KVM_REQ_TRIPLE_FAULT) > > If there's a triple fault, I think it's a good idea to inject it > back. Basically, there's no need to take care of damage control > that L1 is intentionally doing. > >>> + goto fail; >>> + kvm_mmu_unload(vcpu); >>> + vmcs12->ept_pointer = address; >>> + kvm_mmu_reload(vcpu); >> >> I was thinking about something like this: >> >> kvm_mmu_unload(vcpu); >> old = vmcs12->ept_pointer; >> vmcs12->ept_pointer = address; >> if (kvm_mmu_reload(vcpu)) { >> /* pointer invalid, restore previous state */ >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); >> vmcs12->ept_pointer = old; >> kvm_mmu_reload(vcpu); >> goto fail; >> } >> >> The you can inherit the checks from mmu_check_root(). Actually, thinking about this a bit more, I agree with you. Any fault with a vmfunc operation should end with a vmfunc vmexit, so this is a good thing to have. Thank you for this idea! :) Bandan
Jim Mattson <jmattson@google.com> writes: ... >>> I can find the definition for an vmexit in case of index >= >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. >>> >>> Can you give me a hint? >> >> I don't think there is. Since, we are basically emulating eptp switching >> for L2, this is a good check to have. > > There is nothing wrong with a hypervisor using physical page 0 for > whatever purpose it likes, including an EPTP list. Right, but of all the things, an L1 hypervisor wanting page 0 for an EPTP list address most likely means it forgot to initialize it. Whatever damage it does will still end up with vmfunc vmexit anyway. Bandan
2017-07-11 14:05-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > > > [David did a great review, so I'll just point out things I noticed.] > > > > 2017-07-11 09:51+0200, David Hildenbrand: > >> On 10.07.2017 22:49, Bandan Das wrote: > >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to > >> > emulate a switching of the ept pointer by reloading the > >> > guest MMU. > >> > > >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > >> > Signed-off-by: Bandan Das <bsd@redhat.com> > >> > --- > >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) > >> > } > >> > > >> > vmcs12 = get_vmcs12(vcpu); > >> > - if ((vmcs12->vm_function_control & (1 << function)) == 0) > >> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) || > >> > + WARN_ON_ONCE(function)) > >> > >> "... instruction causes a VM exit if the bit at position EAX is 0 in the > >> VM-function controls (the selected VM function is > >> not enabled)." > >> > >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then > >> completely. > > > > It assumes that vm_function_control is not > 1, which is (should be) > > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR > > is 1. > > > >> > + goto fail; > > > > The rest of the code assumes that the function is > > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable. > > > > Writing it as > > > > WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING) > > > > would be cleared and I'd prefer to move the part that handles > > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is > > going to add more than one VM FUNC. :]) > > IMO, for now, this should be fine because we are not even passing through the > hardware's eptp switching. Even if there are other vm functions, they > won't be available for the nested case and cause any conflict. 
Yeah, it is fine function-wise, I was just pointing out that it looks ugly to me. Btw. have you looked what we'd need to do for the hardware pass-through? I'd expect big changes to MMU. :) > >> > + if (!nested_cpu_has_ept(vmcs12) || > >> > + !nested_cpu_has_eptp_switching(vmcs12)) > >> > + goto fail; > > > > This brings me to a missing vm-entry check: > > > > If “EPTP switching” VM-function control is 1, the “enable EPT” > > VM-execution control must also be 1. In addition, the EPTP-list address > > must satisfy the following checks: > > • Bits 11:0 of the address must be 0. > > • The address must not set any bits beyond the processor’s > > physical-address width. > > > > so this one could be > > > > if (!nested_cpu_has_eptp_switching(vmcs12) || > > WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) > > I will reverse the order here but the vm entry check is unnecessary because > the check on the list address is already being done in this function. Here it is too late, the nested VM-entry should have failed, never letting this situation happen. We want an equivalent of if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; in nested controls checks, right next to the reserved fields check. And then also the EPTP-list check. All of them only checked when nested_cpu_has_vmfunc(vmcs12).
2017-07-11 14:35-0400, Bandan Das: > Jim Mattson <jmattson@google.com> writes: > ... > >>> I can find the definition for an vmexit in case of index >= > >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. > >>> > >>> Can you give me a hint? > >> > >> I don't think there is. Since, we are basically emulating eptp switching > >> for L2, this is a good check to have. > > > > There is nothing wrong with a hypervisor using physical page 0 for > > whatever purpose it likes, including an EPTP list. > > Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list > address most likely means it forgot to initialize it. Whatever damage it does will > still end up with vmfunc vmexit anyway. Most likely, but not certainly. I also don't see a reason to diverge from the spec here.
2017-07-11 14:24-0400, Bandan Das: > Bandan Das <bsd@redhat.com> writes: > > If there's a triple fault, I think it's a good idea to inject it > > back. Basically, there's no need to take care of damage control > > that L1 is intentionally doing. > > > >>> + goto fail; > >>> + kvm_mmu_unload(vcpu); > >>> + vmcs12->ept_pointer = address; > >>> + kvm_mmu_reload(vcpu); > >> > >> I was thinking about something like this: > >> > >> kvm_mmu_unload(vcpu); > >> old = vmcs12->ept_pointer; > >> vmcs12->ept_pointer = address; > >> if (kvm_mmu_reload(vcpu)) { > >> /* pointer invalid, restore previous state */ > >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); > >> vmcs12->ept_pointer = old; > >> kvm_mmu_reload(vcpu); > >> goto fail; > >> } > >> > >> The you can inherit the checks from mmu_check_root(). > > Actually, thinking about this a bit more, I agree with you. Any fault > with a vmfunc operation should end with a vmfunc vmexit, so this > is a good thing to have. Thank you for this idea! :) SDM says IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail if in EPTP) THEN VMexit; and no other mentions of a VM exit, so I think that the VM exit happens only under these conditions: — The EPT memory type (bits 2:0) must be a value supported by the processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10). — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating an EPT page-walk length of 4; see Section 28.2.2. — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read as 0, indicating that the processor does not support accessed and dirty flags for EPT. — Reserved bits 11:7 and 63:N (where N is the processor’s physical-address width) must all be 0. And it looks like we need parts of nested_ept_init_mmu_context() to properly handle VMX_EPT_AD_ENABLE_BIT. The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if we just invalidate the MMU.
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-11 14:05-0400, Bandan Das: >> Radim Krčmář <rkrcmar@redhat.com> writes: >> >> > [David did a great review, so I'll just point out things I noticed.] >> > >> > 2017-07-11 09:51+0200, David Hildenbrand: >> >> On 10.07.2017 22:49, Bandan Das wrote: >> >> > When L2 uses vmfunc, L0 utilizes the associated vmexit to >> >> > emulate a switching of the ept pointer by reloading the >> >> > guest MMU. >> >> > >> >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> >> >> > Signed-off-by: Bandan Das <bsd@redhat.com> >> >> > --- >> >> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> >> > @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) >> >> > } >> >> > >> >> > vmcs12 = get_vmcs12(vcpu); >> >> > - if ((vmcs12->vm_function_control & (1 << function)) == 0) >> >> > + if (((vmcs12->vm_function_control & (1 << function)) == 0) || >> >> > + WARN_ON_ONCE(function)) >> >> >> >> "... instruction causes a VM exit if the bit at position EAX is 0 in the >> >> VM-function controls (the selected VM function is >> >> not enabled)." >> >> >> >> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then >> >> completely. >> > >> > It assumes that vm_function_control is not > 1, which is (should be) >> > guaranteed by VM entry check, because the nested_vmx_vmfunc_controls MSR >> > is 1. >> > >> >> > + goto fail; >> > >> > The rest of the code assumes that the function is >> > VMX_VMFUNC_EPTP_SWITCHING, so some WARN_ON_ONCE is reasonable. >> > >> > Writing it as >> > >> > WARN_ON_ONCE(function != VMX_VMFUNC_EPTP_SWITCHING) >> > >> > would be cleared and I'd prefer to move the part that handles >> > VMX_VMFUNC_EPTP_SWITCHING into a new function. (Imagine that Intel is >> > going to add more than one VM FUNC. :]) >> >> IMO, for now, this should be fine because we are not even passing through the >> hardware's eptp switching. 
Even if there are other vm functions, they >> won't be available for the nested case and cause any conflict. > > Yeah, it is fine function-wise, I was just pointing out that it looks > ugly to me. Ok, lemme switch this to a switch statement style handler function. That way, future additions would be easier. > Btw. have you looked what we'd need to do for the hardware pass-through? > I'd expect big changes to MMU. :) Yes, the first version was actually using vmfunc 0 directly, well not exactly, the first time would go through this path and then the next time the processor would handle it directly. Paolo pointed out an issue that I was ready to fix but he wasn't comfortable with the idea. I actually agree with him, it's too much untested code for something that would be rarely used. >> >> > + if (!nested_cpu_has_ept(vmcs12) || >> >> > + !nested_cpu_has_eptp_switching(vmcs12)) >> >> > + goto fail; >> > >> > This brings me to a missing vm-entry check: >> > >> > If “EPTP switching” VM-function control is 1, the “enable EPT” >> > VM-execution control must also be 1. In addition, the EPTP-list address >> > must satisfy the following checks: >> > • Bits 11:0 of the address must be 0. >> > • The address must not set any bits beyond the processor’s >> > physical-address width. >> > >> > so this one could be >> > >> > if (!nested_cpu_has_eptp_switching(vmcs12) || >> > WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) >> >> I will reverse the order here but the vm entry check is unnecessary because >> the check on the list address is already being done in this function. > > Here is too late, the nested VM-entry should have failed, never letting > this situation happen. We want an equivalent of > > if (nested_cpu_has_eptp_switching(vmcs12) && !nested_cpu_has_ept(vmcs12)) > return VMXERR_ENTRY_INVALID_CONTROL_FIELD; > > in nested controls checks, right next to the reserved fields check. > And then also the check EPTP-list check.
All of them only checked when > nested_cpu_has_vmfunc(vmcs12). Actually, I misread 25.5.5.3. There are two checks. Here, the list entry needs to be checked so that eptp won't cause a vmentry failure. The vmentry needs to check the eptp list address itself. I will add that check for the list address in the next version. Bandan
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-11 14:35-0400, Bandan Das: >> Jim Mattson <jmattson@google.com> writes: >> ... >> >>> I can find the definition for an vmexit in case of index >= >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. >> >>> >> >>> Can you give me a hint? >> >> >> >> I don't think there is. Since, we are basically emulating eptp switching >> >> for L2, this is a good check to have. >> > >> > There is nothing wrong with a hypervisor using physical page 0 for >> > whatever purpose it likes, including an EPTP list. >> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list >> address most likely means it forgot to initialize it. Whatever damage it does will >> still end up with vmfunc vmexit anyway. > > Most likely, but not certainly. I also don't see a to diverge from the > spec here. Actually, this is a specific case where I would like to diverge from the spec. But then again, it's L1 shooting itself in the foot and this would be a rarely used code path, so, I am fine removing it.
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-11 14:24-0400, Bandan Das: >> Bandan Das <bsd@redhat.com> writes: >> > If there's a triple fault, I think it's a good idea to inject it >> > back. Basically, there's no need to take care of damage control >> > that L1 is intentionally doing. >> > >> >>> + goto fail; >> >>> + kvm_mmu_unload(vcpu); >> >>> + vmcs12->ept_pointer = address; >> >>> + kvm_mmu_reload(vcpu); >> >> >> >> I was thinking about something like this: >> >> >> >> kvm_mmu_unload(vcpu); >> >> old = vmcs12->ept_pointer; >> >> vmcs12->ept_pointer = address; >> >> if (kvm_mmu_reload(vcpu)) { >> >> /* pointer invalid, restore previous state */ >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); >> >> vmcs12->ept_pointer = old; >> >> kvm_mmu_reload(vcpu); >> >> goto fail; >> >> } >> >> >> >> The you can inherit the checks from mmu_check_root(). >> >> Actually, thinking about this a bit more, I agree with you. Any fault >> with a vmfunc operation should end with a vmfunc vmexit, so this >> is a good thing to have. Thank you for this idea! :) > > SDM says > > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail > if in EPTP) THEN VMexit; This section here: As noted in Section 25.5.5.2, an execution of the EPTP-switching VM function that causes a VM exit (as specified above), uses the basic exit reason 59, indicating “VMFUNC”. The length of the VMFUNC instruction is saved into the VM-exit instruction-length field. No additional VM-exit information is provided. Although, it adds (as specified above), from testing, any vmexit that happens as a result of the execution of the vmfunc instruction always has exit reason 59. IMO, the case David pointed out comes under "as a result of the execution of the vmfunc instruction", so I would prefer exiting with reason 59. 
> and no other mentions of a VM exit, so I think that the VM exit happens > only under these conditions: > > — The EPT memory type (bits 2:0) must be a value supported by the > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see > Appendix A.10). > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating > an EPT page-walk length of 4; see Section 28.2.2. > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read > as 0, indicating that the processor does not support accessed and > dirty flags for EPT. > — Reserved bits 11:7 and 63:N (where N is the processor’s > physical-address width) must all be 0. > > And it looks like we need parts of nested_ept_init_mmu_context() to > properly handle VMX_EPT_AD_ENABLE_BIT. I completely ignored AD and the #VE sections. I will add a TODO item in the comment section. > The KVM_REQ_TRIPLE_FAULT can be handled by kvm_mmu_reload in vcpu_run if > we just invalidate the MMU.
2017-07-11 15:50-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > > 2017-07-11 14:24-0400, Bandan Das: > >> Bandan Das <bsd@redhat.com> writes: > >> > If there's a triple fault, I think it's a good idea to inject it > >> > back. Basically, there's no need to take care of damage control > >> > that L1 is intentionally doing. > >> > > >> >>> + goto fail; > >> >>> + kvm_mmu_unload(vcpu); > >> >>> + vmcs12->ept_pointer = address; > >> >>> + kvm_mmu_reload(vcpu); > >> >> > >> >> I was thinking about something like this: > >> >> > >> >> kvm_mmu_unload(vcpu); > >> >> old = vmcs12->ept_pointer; > >> >> vmcs12->ept_pointer = address; > >> >> if (kvm_mmu_reload(vcpu)) { > >> >> /* pointer invalid, restore previous state */ > >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); > >> >> vmcs12->ept_pointer = old; > >> >> kvm_mmu_reload(vcpu); > >> >> goto fail; > >> >> } > >> >> > >> >> The you can inherit the checks from mmu_check_root(). > >> > >> Actually, thinking about this a bit more, I agree with you. Any fault > >> with a vmfunc operation should end with a vmfunc vmexit, so this > >> is a good thing to have. Thank you for this idea! :) > > > > SDM says > > > > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail > > if in EPTP) THEN VMexit; > > This section here: > As noted in Section 25.5.5.2, an execution of the > EPTP-switching VM function that causes a VM exit (as specified > above), uses the basic exit reason 59, indicating “VMFUNC”. > The length of the VMFUNC instruction is saved into the > VM-exit instruction-length field. No additional VM-exit > information is provided. > > Although, it adds (as specified above), from testing, any vmexit that > happens as a result of the execution of the vmfunc instruction always > has exit reason 59. > > IMO, the case David pointed out comes under "as a result of the > execution of the vmfunc instruction", so I would prefer exiting > with reason 59. 
Right, the exit reason is 59 for reasons that trigger a VM exit (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks unrelated stuff. If the EPTP value is correct, then the switch should succeed. If the EPTP is correct, but bogus, then the guest should get EPT_MISCONFIG VM exit on its first access (when reading the instruction). Source: I added vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40)); shortly before a VMLAUNCH on L0. :) I think that we might be emulating this case incorrectly and throwing triple faults when it should be VM exits in vcpu_run(). > > and no other mentions of a VM exit, so I think that the VM exit happens > > only under these conditions: > > > > — The EPT memory type (bits 2:0) must be a value supported by the > > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see > > Appendix A.10). > > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating > > an EPT page-walk length of 4; see Section 28.2.2. > > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if > > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read > > as 0, indicating that the processor does not support accessed and > > dirty flags for EPT. > > — Reserved bits 11:7 and 63:N (where N is the processor’s > > physical-address width) must all be 0. > > > > And it looks like we need parts of nested_ept_init_mmu_context() to > > properly handle VMX_EPT_AD_ENABLE_BIT. > > I completely ignored AD and the #VE sections. I will add a TODO item > in the comment section. AFAIK, we don't support #VE, but AD would be nice to handle from the beginning. (I think that calling nested_ept_init_mmu_context() as-is isn't that bad.)
2017-07-11 15:38-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > > > 2017-07-11 14:35-0400, Bandan Das: > >> Jim Mattson <jmattson@google.com> writes: > >> ... > >> >>> I can find the definition for an vmexit in case of index >= > >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. > >> >>> > >> >>> Can you give me a hint? > >> >> > >> >> I don't think there is. Since, we are basically emulating eptp switching > >> >> for L2, this is a good check to have. > >> > > >> > There is nothing wrong with a hypervisor using physical page 0 for > >> > whatever purpose it likes, including an EPTP list. > >> > >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list > >> address most likely means it forgot to initialize it. Whatever damage it does will > >> still end up with vmfunc vmexit anyway. > > > > Most likely, but not certainly. I also don't see a to diverge from the > > spec here. > > Actually, this is a specific case where I would like to diverge from the spec. > But then again, it's L1 shooting itself in the foot and this would be a rarely > used code path, so, I am fine removing it. Thanks, we're not here to judge the guest, but to provide a bare-metal experience. :)
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-11 15:50-0400, Bandan Das: >> Radim Krčmář <rkrcmar@redhat.com> writes: >> > 2017-07-11 14:24-0400, Bandan Das: >> >> Bandan Das <bsd@redhat.com> writes: >> >> > If there's a triple fault, I think it's a good idea to inject it >> >> > back. Basically, there's no need to take care of damage control >> >> > that L1 is intentionally doing. >> >> > >> >> >>> + goto fail; >> >> >>> + kvm_mmu_unload(vcpu); >> >> >>> + vmcs12->ept_pointer = address; >> >> >>> + kvm_mmu_reload(vcpu); >> >> >> >> >> >> I was thinking about something like this: >> >> >> >> >> >> kvm_mmu_unload(vcpu); >> >> >> old = vmcs12->ept_pointer; >> >> >> vmcs12->ept_pointer = address; >> >> >> if (kvm_mmu_reload(vcpu)) { >> >> >> /* pointer invalid, restore previous state */ >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); >> >> >> vmcs12->ept_pointer = old; >> >> >> kvm_mmu_reload(vcpu); >> >> >> goto fail; >> >> >> } >> >> >> >> >> >> The you can inherit the checks from mmu_check_root(). >> >> >> >> Actually, thinking about this a bit more, I agree with you. Any fault >> >> with a vmfunc operation should end with a vmfunc vmexit, so this >> >> is a good thing to have. Thank you for this idea! :) >> > >> > SDM says >> > >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail >> > if in EPTP) THEN VMexit; >> >> This section here: >> As noted in Section 25.5.5.2, an execution of the >> EPTP-switching VM function that causes a VM exit (as specified >> above), uses the basic exit reason 59, indicating “VMFUNC”. >> The length of the VMFUNC instruction is saved into the >> VM-exit instruction-length field. No additional VM-exit >> information is provided. >> >> Although, it adds (as specified above), from testing, any vmexit that >> happens as a result of the execution of the vmfunc instruction always >> has exit reason 59. 
>> >> IMO, the case David pointed out comes under "as a result of the >> execution of the vmfunc instruction", so I would prefer exiting >> with reason 59. > > Right, the exit reason is 59 for reasons that trigger a VM exit > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks > unrelated stuff. > > If the EPTP value is correct, then the switch should succeed. > If the EPTP is correct, but bogus, then the guest should get > EPT_MISCONFIG VM exit on its first access (when reading the > instruction). Source: I added My point is that we are using kvm_mmu_reload() to emulate eptp switching. If that emulation of vmfunc fails, it should exit with reason 59. > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40)); > > shortly before a VMLAUNCH on L0. :) What happens if this ept pointer is actually in the eptp list and the guest switches to it using vmfunc ? I think it will exit with reason 59. > I think that we might be emulating this case incorrectly and throwing > triple faults when it should be VM exits in vcpu_run(). No, I agree with not throwing a triple fault. We should clear it out. But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails. >> > and no other mentions of a VM exit, so I think that the VM exit happens >> > only under these conditions: >> > >> > — The EPT memory type (bits 2:0) must be a value supported by the >> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see >> > Appendix A.10). >> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating >> > an EPT page-walk length of 4; see Section 28.2.2. >> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if >> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read >> > as 0, indicating that the processor does not support accessed and >> > dirty flags for EPT. >> > — Reserved bits 11:7 and 63:N (where N is the processor’s >> > physical-address width) must all be 0. 
>> > >> > And it looks like we need parts of nested_ept_init_mmu_context() to >> > properly handle VMX_EPT_AD_ENABLE_BIT. >> >> I completely ignored AD and the #VE sections. I will add a TODO item >> in the comment section. > > AFAIK, we don't support #VE, but AD would be nice to handle from the Nevertheless, it's good to have the nested hypervisor be able to use it just like vmfunc. > beginning. (I think that caling nested_ept_init_mmu_context() as-is > isn't that bad.) Ok, I will take a look.
2017-07-11 16:34-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > > > 2017-07-11 15:50-0400, Bandan Das: > >> Radim Krčmář <rkrcmar@redhat.com> writes: > >> > 2017-07-11 14:24-0400, Bandan Das: > >> >> Bandan Das <bsd@redhat.com> writes: > >> >> > If there's a triple fault, I think it's a good idea to inject it > >> >> > back. Basically, there's no need to take care of damage control > >> >> > that L1 is intentionally doing. > >> >> > > >> >> >>> + goto fail; > >> >> >>> + kvm_mmu_unload(vcpu); > >> >> >>> + vmcs12->ept_pointer = address; > >> >> >>> + kvm_mmu_reload(vcpu); > >> >> >> > >> >> >> I was thinking about something like this: > >> >> >> > >> >> >> kvm_mmu_unload(vcpu); > >> >> >> old = vmcs12->ept_pointer; > >> >> >> vmcs12->ept_pointer = address; > >> >> >> if (kvm_mmu_reload(vcpu)) { > >> >> >> /* pointer invalid, restore previous state */ > >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); > >> >> >> vmcs12->ept_pointer = old; > >> >> >> kvm_mmu_reload(vcpu); > >> >> >> goto fail; > >> >> >> } > >> >> >> > >> >> >> The you can inherit the checks from mmu_check_root(). > >> >> > >> >> Actually, thinking about this a bit more, I agree with you. Any fault > >> >> with a vmfunc operation should end with a vmfunc vmexit, so this > >> >> is a good thing to have. Thank you for this idea! :) > >> > > >> > SDM says > >> > > >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail > >> > if in EPTP) THEN VMexit; > >> > >> This section here: > >> As noted in Section 25.5.5.2, an execution of the > >> EPTP-switching VM function that causes a VM exit (as specified > >> above), uses the basic exit reason 59, indicating “VMFUNC”. > >> The length of the VMFUNC instruction is saved into the > >> VM-exit instruction-length field. No additional VM-exit > >> information is provided. 
> >> > >> Although, it adds (as specified above), from testing, any vmexit that > >> happens as a result of the execution of the vmfunc instruction always > >> has exit reason 59. > >> > >> IMO, the case David pointed out comes under "as a result of the > >> execution of the vmfunc instruction", so I would prefer exiting > >> with reason 59. > > > > Right, the exit reason is 59 for reasons that trigger a VM exit > > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks > > unrelated stuff. > > > > If the EPTP value is correct, then the switch should succeed. > > If the EPTP is correct, but bogus, then the guest should get > > EPT_MISCONFIG VM exit on its first access (when reading the > > instruction). Source: I added > > My point is that we are using kvm_mmu_reload() to emulate eptp > switching. If that emulation of vmfunc fails, it should exit with reason > 59. Yeah, we just disagree on what is a vmfunc failure. > > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40)); > > > > shortly before a VMLAUNCH on L0. :) > > What happens if this ept pointer is actually in the eptp list and the guest > switches to it using vmfunc ? I think it will exit with reason 59. I think otherwise, because it doesn't cause a VM entry failure on bare-metal (and SDM says that we get a VM exit only if there would be a VM entry failure). I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after. (Experiment pending :]) > > I think that we might be emulating this case incorrectly and throwing > > triple faults when it should be VM exits in vcpu_run(). > > No, I agree with not throwing a triple fault. We should clear it out. > But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails. Here we disagree. I think that it's a bug to do the VM exit, so we can just keep the original bug -- we want to eventually fix it and it's no worse till then.
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-11 15:38-0400, Bandan Das: >> Radim Krčmář <rkrcmar@redhat.com> writes: >> >> > 2017-07-11 14:35-0400, Bandan Das: >> >> Jim Mattson <jmattson@google.com> writes: >> >> ... >> >> >>> I can find the definition for an vmexit in case of index >= >> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. >> >> >>> >> >> >>> Can you give me a hint? >> >> >> >> >> >> I don't think there is. Since, we are basically emulating eptp switching >> >> >> for L2, this is a good check to have. >> >> > >> >> > There is nothing wrong with a hypervisor using physical page 0 for >> >> > whatever purpose it likes, including an EPTP list. >> >> >> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list >> >> address most likely means it forgot to initialize it. Whatever damage it does will >> >> still end up with vmfunc vmexit anyway. >> > >> > Most likely, but not certainly. I also don't see a to diverge from the >> > spec here. >> >> Actually, this is a specific case where I would like to diverge from the spec. >> But then again, it's L1 shooting itself in the foot and this would be a rarely >> used code path, so, I am fine removing it. > > Thanks, we're not here to judge the guest, but to provide a bare-metal > experience. :) There are certain cases where we do. For example, when L2 instruction emulation fails we decide to kill L2 instead of injecting the error to L1 and let it handle that. Anyway, that's a different topic, I was just trying to point out there are cases kvm does a somewhat policy decision...
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-11 16:34-0400, Bandan Das: >> Radim Krčmář <rkrcmar@redhat.com> writes: >> >> > 2017-07-11 15:50-0400, Bandan Das: >> >> Radim Krčmář <rkrcmar@redhat.com> writes: >> >> > 2017-07-11 14:24-0400, Bandan Das: >> >> >> Bandan Das <bsd@redhat.com> writes: >> >> >> > If there's a triple fault, I think it's a good idea to inject it >> >> >> > back. Basically, there's no need to take care of damage control >> >> >> > that L1 is intentionally doing. >> >> >> > >> >> >> >>> + goto fail; >> >> >> >>> + kvm_mmu_unload(vcpu); >> >> >> >>> + vmcs12->ept_pointer = address; >> >> >> >>> + kvm_mmu_reload(vcpu); >> >> >> >> >> >> >> >> I was thinking about something like this: >> >> >> >> >> >> >> >> kvm_mmu_unload(vcpu); >> >> >> >> old = vmcs12->ept_pointer; >> >> >> >> vmcs12->ept_pointer = address; >> >> >> >> if (kvm_mmu_reload(vcpu)) { >> >> >> >> /* pointer invalid, restore previous state */ >> >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); >> >> >> >> vmcs12->ept_pointer = old; >> >> >> >> kvm_mmu_reload(vcpu); >> >> >> >> goto fail; >> >> >> >> } >> >> >> >> >> >> >> >> The you can inherit the checks from mmu_check_root(). >> >> >> >> >> >> Actually, thinking about this a bit more, I agree with you. Any fault >> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this >> >> >> is a good thing to have. Thank you for this idea! :) >> >> > >> >> > SDM says >> >> > >> >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail >> >> > if in EPTP) THEN VMexit; >> >> >> >> This section here: >> >> As noted in Section 25.5.5.2, an execution of the >> >> EPTP-switching VM function that causes a VM exit (as specified >> >> above), uses the basic exit reason 59, indicating “VMFUNC”. >> >> The length of the VMFUNC instruction is saved into the >> >> VM-exit instruction-length field. No additional VM-exit >> >> information is provided. 
>> >> >> >> Although, it adds (as specified above), from testing, any vmexit that >> >> happens as a result of the execution of the vmfunc instruction always >> >> has exit reason 59. >> >> >> >> IMO, the case David pointed out comes under "as a result of the >> >> execution of the vmfunc instruction", so I would prefer exiting >> >> with reason 59. >> > >> > Right, the exit reason is 59 for reasons that trigger a VM exit >> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks >> > unrelated stuff. >> > >> > If the EPTP value is correct, then the switch should succeed. >> > If the EPTP is correct, but bogus, then the guest should get >> > EPT_MISCONFIG VM exit on its first access (when reading the >> > instruction). Source: I added >> >> My point is that we are using kvm_mmu_reload() to emulate eptp >> switching. If that emulation of vmfunc fails, it should exit with reason >> 59. > > Yeah, we just disagree on what is a vmfunc failure. > >> > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40)); >> > >> > shortly before a VMLAUNCH on L0. :) >> >> What happens if this ept pointer is actually in the eptp list and the guest >> switches to it using vmfunc ? I think it will exit with reason 59. > > I think otherwise, because it doesn't cause a VM entry failure on > bare-metal (and SDM says that we get a VM exit only if there would be a > VM entry failure). > I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after. > (Experiment pending :]) > >> > I think that we might be emulating this case incorrectly and throwing >> > triple faults when it should be VM exits in vcpu_run(). >> >> No, I agree with not throwing a triple fault. We should clear it out. >> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails. > > Here we disagree. I think that it's a bug do the VM exit, so we can Why do you think it's a bug ? 
The eptp switching function really didn't succeed as far as our emulation goes when kvm_mmu_reload() fails. And as such, the generic vmexit failure event should be a vmfunc vmexit. We cannot strictly follow the spec here, the spec doesn't even mention a way to emulate eptp switching. If setting up the switching succeeded and the new root pointer is invalid or whatever, I really don't care what happens next but this is not the case. We fail to get a new root pointer and without that, we can't even make a switch! > just keep the original bug -- we want to eventually fix it and it's no > worse till then. Anyway, can you please confirm again what is the behavior that you are expecting if kvm_mmu_reload fails ? This would be a rarely used branch and I am actually fine diverging from what I think is right if I can get the reviewers to agree on a common thing. (Thanks for giving this a closer look, Radim. I really appreciate it.) Bandan
2017-07-11 17:08-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > > > 2017-07-11 16:34-0400, Bandan Das: > >> Radim Krčmář <rkrcmar@redhat.com> writes: > >> > >> > 2017-07-11 15:50-0400, Bandan Das: > >> >> Radim Krčmář <rkrcmar@redhat.com> writes: > >> >> > 2017-07-11 14:24-0400, Bandan Das: > >> >> >> Bandan Das <bsd@redhat.com> writes: > >> >> >> > If there's a triple fault, I think it's a good idea to inject it > >> >> >> > back. Basically, there's no need to take care of damage control > >> >> >> > that L1 is intentionally doing. > >> >> >> > > >> >> >> >>> + goto fail; > >> >> >> >>> + kvm_mmu_unload(vcpu); > >> >> >> >>> + vmcs12->ept_pointer = address; > >> >> >> >>> + kvm_mmu_reload(vcpu); > >> >> >> >> > >> >> >> >> I was thinking about something like this: > >> >> >> >> > >> >> >> >> kvm_mmu_unload(vcpu); > >> >> >> >> old = vmcs12->ept_pointer; > >> >> >> >> vmcs12->ept_pointer = address; > >> >> >> >> if (kvm_mmu_reload(vcpu)) { > >> >> >> >> /* pointer invalid, restore previous state */ > >> >> >> >> kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); > >> >> >> >> vmcs12->ept_pointer = old; > >> >> >> >> kvm_mmu_reload(vcpu); > >> >> >> >> goto fail; > >> >> >> >> } > >> >> >> >> > >> >> >> >> The you can inherit the checks from mmu_check_root(). > >> >> >> > >> >> >> Actually, thinking about this a bit more, I agree with you. Any fault > >> >> >> with a vmfunc operation should end with a vmfunc vmexit, so this > >> >> >> is a good thing to have. Thank you for this idea! :) > >> >> > > >> >> > SDM says > >> >> > > >> >> > IF tent_EPTP is not a valid EPTP value (would cause VM entry to fail > >> >> > if in EPTP) THEN VMexit; > >> >> > >> >> This section here: > >> >> As noted in Section 25.5.5.2, an execution of the > >> >> EPTP-switching VM function that causes a VM exit (as specified > >> >> above), uses the basic exit reason 59, indicating “VMFUNC”. 
> >> >> The length of the VMFUNC instruction is saved into the > >> >> VM-exit instruction-length field. No additional VM-exit > >> >> information is provided. > >> >> > >> >> Although, it adds (as specified above), from testing, any vmexit that > >> >> happens as a result of the execution of the vmfunc instruction always > >> >> has exit reason 59. > >> >> > >> >> IMO, the case David pointed out comes under "as a result of the > >> >> execution of the vmfunc instruction", so I would prefer exiting > >> >> with reason 59. > >> > > >> > Right, the exit reason is 59 for reasons that trigger a VM exit > >> > (i.e. invalid EPTP value, the four below), but kvm_mmu_reload() checks > >> > unrelated stuff. > >> > > >> > If the EPTP value is correct, then the switch should succeed. > >> > If the EPTP is correct, but bogus, then the guest should get > >> > EPT_MISCONFIG VM exit on its first access (when reading the > >> > instruction). Source: I added > >> > >> My point is that we are using kvm_mmu_reload() to emulate eptp > >> switching. If that emulation of vmfunc fails, it should exit with reason > >> 59. >> >> Yeah, we just disagree on what is a vmfunc failure. >> >>> > vmcs_write64(EPT_POINTER, vmcs_read64(EPT_POINTER) | (1ULL << 40)); >>> > >>> > shortly before a VMLAUNCH on L0. :) >>> >>> What happens if this ept pointer is actually in the eptp list and the guest >>> switches to it using vmfunc ? I think it will exit with reason 59. >> >> I think otherwise, because it doesn't cause a VM entry failure on >> bare-metal (and SDM says that we get a VM exit only if there would be a >> VM entry failure). >> I expect the vmfunc to succeed and to get a EPT_MISCONFIG right after. >> (Experiment pending :]) >> >>> > I think that we might be emulating this case incorrectly and throwing >>> > triple faults when it should be VM exits in vcpu_run(). >>> >>> No, I agree with not throwing a triple fault. We should clear it out. 
>>> But we should emulate a vmfunc vmexit back to L1 when kvm_mmu_load fails. >> >> Here we disagree. I think that it's a bug to do the VM exit, so we can > > Why do you think it's a bug ? SDM defines a different behavior and hardware doesn't do that either. There are only two reasons for a VMFUNC VM exit from EPTP switching: 1) ECX > 0 2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER KVM can fail for other reasons because of its bugs, but that should be notified to the guest in another way. Rebooting the guest is kind of acceptable in that case. > The eptp switching function really didn't > succeed as far as our emulation goes when kvm_mmu_reload() fails. > And as such, the generic vmexit failure event should be a vmfunc vmexit. I interpret it as two separate events -- at first, the vmfunc succeeds and when it later tries to access memory through the new EPTP (valid, but not pointing into backed memory), it results in an EPT_MISCONFIG VM exit. > We cannot strictly follow the spec here, the spec doesn't even mention a way > to emulate eptp switching. If setting up the switching succeeded and the > new root pointer is invalid or whatever, I really don't care what happens > next but this is not the case. We fail to get a new root pointer and without > that, we can't even make a switch! We just make it behave exactly how the spec says that it behaves. We do have a value (we read 'address') to put into VMCS.EPT_POINTER, which is all we need for the emulation. The function doesn't dereference that pointer, it just looks at its value to decide whether it is valid or not. (btw. we should check that properly, because we cannot depend on VM entry failure pass-through like the normal case.) The dereference done in kvm_mmu_reload() should happen after EPTP switching finishes, because the spec doesn't mention a VM exit for other reason than invalid EPT_POINTER value. >> just keep the original bug -- we want to eventually fix it and it's no >> worse till then. 
> > Anyway, can you please confirm again what is the behavior that you > are expecting if kvm_mmu_reload fails ? This would be a rarely used > branch and I am actually fine diverging from what I think is right if > I can get the reviewers to agree on a common thing. kvm_mmu_reload() fails when mmu_check_root() is false, which means that the pointed physical address is not backed. We've hit this corner-case in the past -- Jim said that the chipset returns all 1s if a read is not claimed. So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the pointer pointed to a memory of all 1s, which would likely result in EPT_MISCONFIG when the guest does a memory access. It is a mishandled corner case, but turning it into VM exit would only confuse an OS that receives the impossible VM exit and potentially confuse reader of the KVM logic. I think that not using kvm_mmu_reload() directly in EPTP switching is best. The bug is not really something we care about.
2017-07-11 16:45-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > > > 2017-07-11 15:38-0400, Bandan Das: > >> Radim Krčmář <rkrcmar@redhat.com> writes: > >> > >> > 2017-07-11 14:35-0400, Bandan Das: > >> >> Jim Mattson <jmattson@google.com> writes: > >> >> ... > >> >> >>> I can find the definition for an vmexit in case of index >= > >> >> >>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM. > >> >> >>> > >> >> >>> Can you give me a hint? > >> >> >> > >> >> >> I don't think there is. Since, we are basically emulating eptp switching > >> >> >> for L2, this is a good check to have. > >> >> > > >> >> > There is nothing wrong with a hypervisor using physical page 0 for > >> >> > whatever purpose it likes, including an EPTP list. > >> >> > >> >> Right, but of all the things, a l1 hypervisor wanting page 0 for a eptp list > >> >> address most likely means it forgot to initialize it. Whatever damage it does will > >> >> still end up with vmfunc vmexit anyway. > >> > > >> > Most likely, but not certainly. I also don't see a reason to diverge from the > >> > spec here. > >> > >> Actually, this is a specific case where I would like to diverge from the spec. > >> But then again, it's L1 shooting itself in the foot and this would be a rarely > >> used code path, so, I am fine removing it. > > > > Thanks, we're not here to judge the guest, but to provide a bare-metal > > experience. :) > > There are certain cases where we do. For example, when L2 instruction emulation > fails we decide to kill L2 instead of injecting the error to L1 and let it handle > that. Anyway, that's a different topic, I was just trying to point out there > are cases kvm does a somewhat policy decision... Emulation failure is a KVM bug and we are too lazy to implement the bare-metal behavior correctly, but avoiding the EPTP list bug is actually easier than introducing it. You can make KVM simpler and improve bare-metal emulation at the same time.
Radim Krčmář <rkrcmar@redhat.com> writes: ... >> > Thanks, we're not here to judge the guest, but to provide a bare-metal >> > experience. :) >> >> There are certain cases where we do. For example, when L2 instruction emulation >> fails we decide to kill L2 instead of injecting the error to L1 and let it handle >> that. Anyway, that's a different topic, I was just trying to point out there >> are cases kvm does a somewhat policy decision... > > Emulation failure is a KVM bug and we are too lazy to implement the > bare-metal behavior correctly, but avoiding the EPTP list bug is > actually easier than introducing it. You can make KVM simpler and > improve bare-metal emulation at the same time. We are just talking past each other here trying to impose points of view. Checking for 0 makes KVM simpler. As I said before, a 0 list_address means that the hypervisor forgot to initialize it. Feel free to show me examples where the hypervisor does indeed use a 0 address for eptp list address or anything vm specific. You disagreed and I am fine with it.
Radim Krčmář <rkrcmar@redhat.com> writes: ... >> Why do you think it's a bug ? > > SDM defines a different behavior and hardware doesn't do that either. > There are only two reasons for a VMFUNC VM exit from EPTP switching: > > 1) ECX > 0 > 2) EPTP would cause VM entry to fail if in VMCS.EPT_POINTER > > KVM can fail for other reasons because of its bugs, but that should be > notified to the guest in another way. Rebooting the guest is kind of > acceptable in that case. > >> The eptp switching function really didn't >> succeed as far as our emulation goes when kvm_mmu_reload() fails. >> And as such, the generic vmexit failure event should be a vmfunc vmexit. > > I interpret it as two separate events -- at first, the vmfunc succeeds > and when it later tries to access memory through the new EPTP (valid, > but not pointing into backed memory), it results in a EPT_MISCONFIG VM > exit. > >> We cannot strictly follow the spec here, the spec doesn't even mention a way >> to emulate eptp switching. If setting up the switching succeeded and the >> new root pointer is invalid or whatever, I really don't care what happens >> next but this is not the case. We fail to get a new root pointer and without >> that, we can't even make a switch! > > We just make it behave exactly how the spec says that it behaves. We do > have a value (we read 'address') to put into VMCS.EPT_POINTER, which is > all we need for the emulation. > The function doesn't dereference that pointer, it just looks at its > value to decide whether it is valid or not. (btw. we should check that > properly, because we cannot depend on VM entry failure pass-through like > the normal case.) > > The dereference done in kvm_mmu_reload() should happen after EPTP > switching finishes, because the spec doesn't mention a VM exit for other > reason than invalid EPT_POINTER value. > >>> just keep the original bug -- we want to eventually fix it and it's no >>> worse till then. 
>> >> Anyway, can you please confirm again what is the behavior that you >> are expecting if kvm_mmu_reload fails ? This would be a rarely used >> branch and I am actually fine diverging from what I think is right if >> I can get the reviewers to agree on a common thing. > > kvm_mmu_reload() fails when mmu_check_root() is false, which means that > the pointed physical address is not backed. We've hit this corner-case > in the past -- Jim said that the chipset returns all 1s if a read is not > claimed. > > So in theory, KVM should not fail kvm_mmu_reload(), but behave as if the > pointer pointed to a memory of all 1s, which would likely result in > EPT_MISCONFIG when the guest does a memory access. As much as I would like to disagree with you, I have already spent way more time on this than I want. Please let's just leave it here, then ? The mmu unload will make sure there's an invalid root hpa and whatever happens next, happens. > It is a mishandled corner case, but turning it into VM exit would only > confuse an OS that receives the impossible VM exit and potentially > confuse readers of the KVM logic. > > I think that not using kvm_mmu_reload() directly in EPTP switching is > best. The bug is not really something we care about.
2017-07-12 14:11-0400, Bandan Das: > As much as I would like to disagree with you, I have already spent way more > time on this than I want. Please let's just leave it here, then ? The mmu unload > will make sure there's an invalid root hpa and whatever happens next, happens. Sure; let's discuss the subtleties of hardware emulation over a beer.
>>> + /* >>> + * If the (L2) guest does a vmfunc to the currently >>> + * active ept pointer, we don't have to do anything else >>> + */ >>> + if (vmcs12->ept_pointer != address) { >>> + if (address >> cpuid_maxphyaddr(vcpu) || >>> + !IS_ALIGNED(address, 4096)) >> >> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail? >> (triggering a KVM_REQ_TRIPLE_FAULT) > > If there's a triple fault, I think it's a good idea to inject it > back. Basically, there's no need to take care of damage control > that L1 is intentionally doing. I quickly rushed over the massive amount of comments. Sounds like you'll be preparing a v5. Would be great if you could add some comments that were the result of this discussion (for parts that are not that obvious - triple faults) - thanks!
David Hildenbrand <david@redhat.com> writes: >>>> + /* >>>> + * If the (L2) guest does a vmfunc to the currently >>>> + * active ept pointer, we don't have to do anything else >>>> + */ >>>> + if (vmcs12->ept_pointer != address) { >>>> + if (address >> cpuid_maxphyaddr(vcpu) || >>>> + !IS_ALIGNED(address, 4096)) >>> >>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail? >>> (triggering a KVM_REQ_TRIPLE_FAULT) >> >> If there's a triple fault, I think it's a good idea to inject it >> back. Basically, there's no need to take care of damage control >> that L1 is intentionally doing. > > I quickly rushed over the massive amount of comments. Sounds like you'll > be preparing a v5. Would be great if you could add some comments that > were the result of this discussion (for parts that are not that obvious > - triple faults) - thanks! Will do. Basically, we agreed that we don't need to do anything with mmu_reload() failures because the invalid eptp that mmu_unload will write to root_hpa will result in an ept violation. Bandan
Radim Krčmář <rkrcmar@redhat.com> writes: ... >> > and no other mentions of a VM exit, so I think that the VM exit happens >> > only under these conditions: >> > >> > — The EPT memory type (bits 2:0) must be a value supported by the >> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see >> > Appendix A.10). >> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating >> > an EPT page-walk length of 4; see Section 28.2.2. >> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if >> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read >> > as 0, indicating that the processor does not support accessed and >> > dirty flags for EPT. >> > — Reserved bits 11:7 and 63:N (where N is the processor’s >> > physical-address width) must all be 0. >> > >> > And it looks like we need parts of nested_ept_init_mmu_context() to >> > properly handle VMX_EPT_AD_ENABLE_BIT. >> >> I completely ignored AD and the #VE sections. I will add a TODO item >> in the comment section. > > AFAIK, we don't support #VE, but AD would be nice to handle from the > beginning. (I think that caling nested_ept_init_mmu_context() as-is > isn't that bad.) I went back to the spec to take a look at the AD handling. It doesn't look like anything needs to be done since nested_ept_init_mmu_context() is already being called with the correct eptp in prepare_vmcs02 ? Anything else that needs to be done for AD handling in vmfunc context ? Thanks, Bandan
2017-07-17 13:58-0400, Bandan Das: > Radim Krčmář <rkrcmar@redhat.com> writes: > ... >>> > and no other mentions of a VM exit, so I think that the VM exit happens >>> > only under these conditions: >>> > >>> > — The EPT memory type (bits 2:0) must be a value supported by the >>> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see >>> > Appendix A.10). >>> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating >>> > an EPT page-walk length of 4; see Section 28.2.2. >>> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if >>> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read >>> > as 0, indicating that the processor does not support accessed and >>> > dirty flags for EPT. >>> > — Reserved bits 11:7 and 63:N (where N is the processor’s >>> > physical-address width) must all be 0. >>> > >>> > And it looks like we need parts of nested_ept_init_mmu_context() to >>> > properly handle VMX_EPT_AD_ENABLE_BIT. >>> >>> I completely ignored AD and the #VE sections. I will add a TODO item >>> in the comment section. >> >> AFAIK, we don't support #VE, but AD would be nice to handle from the >> beginning. (I think that caling nested_ept_init_mmu_context() as-is >> isn't that bad.) > > I went back to the spec to take a look at the AD handling. It doesn't look > like anything needs to be done since nested_ept_init_mmu_context() is already > being called with the correct eptp in prepare_vmcs02 ? Anything else that > needs to be done for AD handling in vmfunc context ? AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and we don't call prepare_vmcs02() after emulating VMFUNC vm exit. We want to forward the new AD configuration to KVM's MMU.
Radim Krčmář <rkrcmar@redhat.com> writes: > 2017-07-17 13:58-0400, Bandan Das: >> Radim Krčmář <rkrcmar@redhat.com> writes: >> ... >>>> > and no other mentions of a VM exit, so I think that the VM exit happens >>>> > only under these conditions: >>>> > >>>> > — The EPT memory type (bits 2:0) must be a value supported by the >>>> > processor as indicated in the IA32_VMX_EPT_VPID_CAP MSR (see >>>> > Appendix A.10). >>>> > — Bits 5:3 (1 less than the EPT page-walk length) must be 3, indicating >>>> > an EPT page-walk length of 4; see Section 28.2.2. >>>> > — Bit 6 (enable bit for accessed and dirty flags for EPT) must be 0 if >>>> > bit 21 of the IA32_VMX_EPT_VPID_CAP MSR (see Appendix A.10) is read >>>> > as 0, indicating that the processor does not support accessed and >>>> > dirty flags for EPT. >>>> > — Reserved bits 11:7 and 63:N (where N is the processor’s >>>> > physical-address width) must all be 0. >>>> > >>>> > And it looks like we need parts of nested_ept_init_mmu_context() to >>>> > properly handle VMX_EPT_AD_ENABLE_BIT. >>>> >>>> I completely ignored AD and the #VE sections. I will add a TODO item >>>> in the comment section. >>> >>> AFAIK, we don't support #VE, but AD would be nice to handle from the >>> beginning. (I think that caling nested_ept_init_mmu_context() as-is >>> isn't that bad.) >> >> I went back to the spec to take a look at the AD handling. It doesn't look >> like anything needs to be done since nested_ept_init_mmu_context() is already >> being called with the correct eptp in prepare_vmcs02 ? Anything else that >> needs to be done for AD handling in vmfunc context ? > > AD is decided by EPTP bit 6, so it can be toggled by EPTP switching and > we don't call prepare_vmcs02() after emulating VMFUNC vm exit. > We want to forward the new AD configuration to KVM's MMU. Thanks, I had incorrectly assumed that prepare_vmcs02 will be called after an exit unconditionally. I will work something up soon. Bandan
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index da5375e..5f63a2e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -115,6 +115,10 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +/* VMFUNC functions */ +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 +#define VMFUNC_EPTP_ENTRIES 512 + static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { return vmx_basic & GENMASK_ULL(30, 0); @@ -200,6 +204,8 @@ enum vmcs_field { EOI_EXIT_BITMAP2_HIGH = 0x00002021, EOI_EXIT_BITMAP3 = 0x00002022, EOI_EXIT_BITMAP3_HIGH = 0x00002023, + EPTP_LIST_ADDRESS = 0x00002024, + EPTP_LIST_ADDRESS_HIGH = 0x00002025, VMREAD_BITMAP = 0x00002026, VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fe8f5fc..0a969fb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -246,6 +246,7 @@ struct __packed vmcs12 { u64 eoi_exit_bitmap1; u64 eoi_exit_bitmap2; u64 eoi_exit_bitmap3; + u64 eptp_list_address; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); } +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) +{ + return nested_cpu_has_vmfunc(vmcs12) && + (vmcs12->vm_function_control & + VMX_VMFUNC_EPTP_SWITCHING); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2791,7 +2800,12 
@@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (cpu_has_vmx_vmfunc()) { vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VMFUNC; - vmx->nested.nested_vmx_vmfunc_controls = 0; + /* + * Advertise EPTP switching unconditionally + * since we emulate it + */ + vmx->nested.nested_vmx_vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; } /* @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + struct page *page = NULL; + u64 *l1_eptp_list, address; /* * VMFUNC is only supported for nested guests, but we always enable the @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) } vmcs12 = get_vmcs12(vcpu); - if ((vmcs12->vm_function_control & (1 << function)) == 0) + if (((vmcs12->vm_function_control & (1 << function)) == 0) || + WARN_ON_ONCE(function)) + goto fail; + + if (!nested_cpu_has_ept(vmcs12) || + !nested_cpu_has_eptp_switching(vmcs12)) + goto fail; + + if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES) + goto fail; + + page = nested_get_page(vcpu, vmcs12->eptp_list_address); + if (!page) goto fail; - WARN_ONCE(1, "VMCS12 VM function control should have been zero"); + + l1_eptp_list = kmap(page); + address = l1_eptp_list[index]; + if (!address) + goto fail; + /* + * If the (L2) guest does a vmfunc to the currently + * active ept pointer, we don't have to do anything else + */ + if (vmcs12->ept_pointer != address) { + if (address >> cpuid_maxphyaddr(vcpu) || + !IS_ALIGNED(address, 4096)) + goto fail; + kvm_mmu_unload(vcpu); + vmcs12->ept_pointer = address; + kvm_mmu_reload(vcpu); + kunmap(page); + nested_release_page_clean(page); + } + return kvm_skip_emulated_instruction(vcpu); fail: + if (page) { + kunmap(page); + nested_release_page_clean(page); + } nested_vmx_vmexit(vcpu, vmx->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), 
vmcs_readl(EXIT_QUALIFICATION));