| Message ID | 20170505192515.27833-3-bsd@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
On 5/6/2017 7:25 AM, Bandan Das wrote:
> With EPT A/D enabled, processor access to L2 guest
> paging structures will result in a write violation.
> When this happens, write the GUEST_PHYSICAL_ADDRESS
> to the pml buffer provided by L1 if the access is
> write and the dirty bit is being set.
>
> This patch also adds necessary checks during VMEntry if L1
> has enabled PML. If the PML index overflows, we change the
> exit reason and run L1 to simulate a PML full event.
>
> Signed-off-by: Bandan Das <bsd@redhat.com>
> ---
>  arch/x86/kvm/vmx.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 79 insertions(+), 2 deletions(-)

[...]

> @@ -9368,13 +9378,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
>  		struct x86_exception *fault)
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exit_reason;
> +	unsigned long exit_qualification = vcpu->arch.exit_qualification;
>
> -	if (fault->error_code & PFERR_RSVD_MASK)
> +	if (vmx->nested.pml_full) {
> +		exit_reason = EXIT_REASON_PML_FULL;
> +		vmx->nested.pml_full = false;
> +		exit_qualification &= INTR_INFO_UNBLOCK_NMI;

Sorry, I cannot recall the details. It is probably better to add a comment
to indicate why the exit qualification is masked with INTR_INFO_UNBLOCK_NMI?

> +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
> +					 struct vmcs12 *vmcs12)
> +{
> +	u64 address = vmcs12->pml_address;
> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
> +		if (!nested_cpu_has_ept(vmcs12) ||
> +		    !IS_ALIGNED(address, 4096) ||
> +		    address >> maxphyaddr)
> +			return -EINVAL;
> +	}

Do we also need to check whether EPT A/D has been enabled for vmcs12 to
make VM entry work? I cannot recall the details, but it is probably not
necessary.

> +		/*
> +		 * Check if PML is enabled for the nested guest.
> +		 * Whether eptp bit 6 is set is already checked
> +		 * as part of A/D emulation.
> +		 */
> +		vmcs12 = get_vmcs12(vcpu);
> +		if (!nested_cpu_has_pml(vmcs12))
> +			return 0;

Since nested_vmx_check_pml_controls above doesn't check the EPT A/D bit
for L1, it seems we need to add that check here.

> +		if (vmcs12->guest_pml_index > PML_ENTITY_NUM) {
> +			vmx->nested.pml_full = true;
> +			return 1;
> +		}

Is the purpose of returning 1 to make the upper-layer code inject a
PML-full VMEXIT to L1 in nested_ept_inject_page_fault?

> +		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
> +
> +		page = nested_get_page(vcpu, vmcs12->pml_address);
> +		if (!page)
> +			return 0;

If PML is enabled in L1, I think nested_get_page should never return a
NULL PML page (unless L1 does something wrong). Probably better to return
1 rather than 0, and handle the error in nested_ept_inject_page_fault
according to vmcs12->pml_address?

> +		pml_address = kmap(page);
> +		pml_address[vmcs12->guest_pml_index--] = gpa;

This gpa is the L2 guest's GPA. Do we also need to mark L1's GPA (which is
related to the L2 guest's GPA above) into the dirty log? Or has this
already been done?

Thanks,
-Kai
On 10/05/2017 12:48, Huang, Kai wrote:
>>
>> -	if (fault->error_code & PFERR_RSVD_MASK)
>> +	if (vmx->nested.pml_full) {
>> +		exit_reason = EXIT_REASON_PML_FULL;
>> +		vmx->nested.pml_full = false;
>> +		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
>
> Sorry, I cannot recall the details. It is probably better to add a comment
> to indicate why the exit qualification is masked with INTR_INFO_UNBLOCK_NMI?

It only keeps that bit, because it's the only exit qualification bit
defined for PML full vmexits (27.2.2).

>> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
>> +		if (!nested_cpu_has_ept(vmcs12) ||
>> +		    !IS_ALIGNED(address, 4096) ||
>> +		    address >> maxphyaddr)
>> +			return -EINVAL;
>> +	}
>
> Do we also need to check whether EPT A/D has been enabled for vmcs12 to
> make VM entry work? I cannot recall the details, but it is probably not
> necessary.

The SDM doesn't say so. The "if" above matches the checks in 26.2.1.1 of
the manual (Checks on VMX Controls, VM-Execution Control Fields).

>> +		/*
>> +		 * Check if PML is enabled for the nested guest.
>> +		 * Whether eptp bit 6 is set is already checked
>> +		 * as part of A/D emulation.
>> +		 */
>> +		vmcs12 = get_vmcs12(vcpu);
>> +		if (!nested_cpu_has_pml(vmcs12))
>> +			return 0;
>
> Since nested_vmx_check_pml_controls above doesn't check the EPT A/D bit
> for L1, it seems we need to add that check here.

If EPT A/D is disabled we cannot get here (update_accessed_dirty_bits is
never called).

>> +		if (vmcs12->guest_pml_index > PML_ENTITY_NUM) {
>> +			vmx->nested.pml_full = true;
>> +			return 1;
>> +		}
>
> Is the purpose of returning 1 to make the upper-layer code inject a
> PML-full VMEXIT to L1 in nested_ept_inject_page_fault?

Yes, it triggers a fault.

>> +		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
>> +
>> +		page = nested_get_page(vcpu, vmcs12->pml_address);
>> +		if (!page)
>> +			return 0;
>
> If PML is enabled in L1, I think nested_get_page should never return a
> NULL PML page (unless L1 does something wrong). Probably better to
> return 1 rather than 0, and handle the error in nested_ept_inject_page_fault
> according to vmcs12->pml_address?

This happens if the PML address is invalid (where on real hardware, the
write would just be "eaten") or MMIO (where we expect to diverge from
real hardware behavior).

>> +		pml_address = kmap(page);
>> +		pml_address[vmcs12->guest_pml_index--] = gpa;
>
> This gpa is the L2 guest's GPA. Do we also need to mark L1's GPA (which is
> related to the L2 guest's GPA above) into the dirty log? Or has this
> already been done?

L1's PML contains L1 host physical addresses, i.e. L0 guest physical
addresses. This GPA comes from vmcs02 and hence it is L0's GPA.

L0's HPA is marked by hardware through PML, as usual. If L0 has EPT A/D
but not PML, it can still provide emulated PML to L1, but L0's HPA will
be marked as dirty via write protection.

Paolo
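Concretely, the comment Kai asked for might look like the sketch below,
dropped into the nested_ept_inject_page_fault() hunk quoted above. The
surrounding lines come from the patch itself; only the comment text is
new, and it paraphrases Paolo's SDM 27.2.2 explanation rather than any
wording that was eventually committed:

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		/*
		 * PML-full VM exits define only the NMI-unblocking bit
		 * of the exit qualification (SDM 27.2.2), so drop the
		 * bits inherited from the underlying EPT violation.
		 */
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;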
Paolo Bonzini <pbonzini@redhat.com> writes:
...
>> Is the purpose of returning 1 to make the upper-layer code inject a
>> PML-full VMEXIT to L1 in nested_ept_inject_page_fault?
>
> Yes, it triggers a fault.
>>> +
>>> +		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
>>> +
>>> +		page = nested_get_page(vcpu, vmcs12->pml_address);
>>> +		if (!page)
>>> +			return 0;
>>
>> If PML is enabled in L1, I think nested_get_page should never return a
>> NULL PML page (unless L1 does something wrong). Probably better to
>> return 1 rather than 0, and handle the error in nested_ept_inject_page_fault
>> according to vmcs12->pml_address?
>
> This happens if the PML address is invalid (where on real hardware, the
> write would just be "eaten") or MMIO (where we expect to diverge from

Yes, that was my motivation. On real hardware, the hypervisor would still
run, except that the PML buffer is corrupt.

Bandan

> real hardware behavior).
>
>>> +
>>> +		pml_address = kmap(page);
>>> +		pml_address[vmcs12->guest_pml_index--] = gpa;
>>
>> This gpa is the L2 guest's GPA. Do we also need to mark L1's GPA (which is
>> related to the L2 guest's GPA above) into the dirty log? Or has this
>> already been done?
>
> L1's PML contains L1 host physical addresses, i.e. L0 guest physical
> addresses. This GPA comes from vmcs02 and hence it is L0's GPA.
>
> L0's HPA is marked by hardware through PML, as usual. If L0 has EPT A/D
> but not PML, it can still provide emulated PML to L1, but L0's HPA will
> be marked as dirty via write protection.
>
> Paolo
On 5/11/2017 4:00 AM, Bandan Das wrote:
> Paolo Bonzini <pbonzini@redhat.com> writes:
> ...
>>> Is the purpose of returning 1 to make the upper-layer code inject a
>>> PML-full VMEXIT to L1 in nested_ept_inject_page_fault?
>>
>> Yes, it triggers a fault.
>>>> +
>>>> +		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
>>>> +
>>>> +		page = nested_get_page(vcpu, vmcs12->pml_address);
>>>> +		if (!page)
>>>> +			return 0;
>>>
>>> If PML is enabled in L1, I think nested_get_page should never return a
>>> NULL PML page (unless L1 does something wrong). Probably better to
>>> return 1 rather than 0, and handle the error in nested_ept_inject_page_fault
>>> according to vmcs12->pml_address?
>>
>> This happens if the PML address is invalid (where on real hardware, the
>> write would just be "eaten") or MMIO (where we expect to diverge from
>
> Yes, that was my motivation. On real hardware, the hypervisor would still
> run, except that the PML buffer is corrupt.

Right. Fine to me. :)

>
> Bandan
>
>> real hardware behavior).
>>
>>>> +
>>>> +		pml_address = kmap(page);
>>>> +		pml_address[vmcs12->guest_pml_index--] = gpa;
>>>
>>> This gpa is the L2 guest's GPA. Do we also need to mark L1's GPA (which is
>>> related to the L2 guest's GPA above) into the dirty log? Or has this
>>> already been done?
>>
>> L1's PML contains L1 host physical addresses, i.e. L0 guest physical
>> addresses. This GPA comes from vmcs02 and hence it is L0's GPA.

Do you mean pml_address? I was talking about the gpa obtained from
vmcs_read64(GUEST_PHYSICAL_ADDRESS). From hardware's point of view, PML
always logs the "GPA" into the PML buffer, so I was saying that the gpa
from vmcs_read64(GUEST_PHYSICAL_ADDRESS) should be the L2 guest's PA.
Anyway, this is not important now. :)

>>
>> L0's HPA is marked by hardware through PML, as usual. If L0 has EPT A/D
>> but not PML, it can still provide emulated PML to L1, but L0's HPA will
>> be marked as dirty via write protection.

Yes, this is what I was thinking. For L0, PML takes care of the L1
hypervisor's dirty pages, while write protection takes care of dirty pages
written by L2. No problem.

Thanks,
-Kai

>>
>> Paolo
>
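For readers wondering where vmx_write_pml_buffer() is reached from: the
.write_log_dirty hook wired up at the end of the patch is called from the
guest page-table walker when A/D emulation is about to set a dirty bit in
an L2 EPT paging structure, which is also why Paolo can say that with EPT
A/D disabled the function is never reached. The following is only a rough
sketch of that caller; kvm_arch_write_log_dirty and the walker excerpt are
paraphrased from the companion patch in this series from memory and should
be treated as assumptions, not as quotes of it:

/* mmu.c: thin wrapper around the new kvm_x86_ops hook */
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops->write_log_dirty)
		return kvm_x86_ops->write_log_dirty(vcpu);

	return 0;
}

/*
 * paging_tmpl.h, FNAME(update_accessed_dirty_bits): when an EPT walk is
 * about to set the dirty bit because of a write, log the GPA first. A
 * non-zero return means "nested PML buffer full" and is later turned into
 * an EXIT_REASON_PML_FULL exit for L1 by nested_ept_inject_page_fault().
 */
		if (level == walker->level && write_fault &&
		    !(pte & PT_GUEST_DIRTY_MASK)) {
#if PTTYPE == PTTYPE_EPT
			if (kvm_arch_write_log_dirty(vcpu))
				return -EINVAL;
#endif
			pte |= PT_GUEST_DIRTY_MASK;
		}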
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2211697..8b9e942 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -248,6 +248,7 @@ struct __packed vmcs12 {
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
+	u64 pml_address;
 	u64 guest_ia32_debugctl;
 	u64 guest_ia32_pat;
 	u64 guest_ia32_efer;
@@ -369,6 +370,7 @@ struct __packed vmcs12 {
 	u16 guest_ldtr_selector;
 	u16 guest_tr_selector;
 	u16 guest_intr_status;
+	u16 guest_pml_index;
 	u16 host_es_selector;
 	u16 host_cs_selector;
 	u16 host_ss_selector;
@@ -407,6 +409,7 @@ struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
 	gpa_t vmxon_ptr;
+	bool pml_full;
 
 	/* The guest-physical address of the current VMCS L1 keeps for L2 */
 	gpa_t current_vmptr;
@@ -742,6 +745,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
 	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
 	FIELD(GUEST_INTR_STATUS, guest_intr_status),
+	FIELD(GUEST_PML_INDEX, guest_pml_index),
 	FIELD(HOST_ES_SELECTOR, host_es_selector),
 	FIELD(HOST_CS_SELECTOR, host_cs_selector),
 	FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -767,6 +771,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+	FIELD64(PML_ADDRESS, pml_address),
 	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
 	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
 	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@ -1349,6 +1354,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
 		vmx_xsaves_supported();
 }
 
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
 {
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -9368,13 +9378,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 		struct x86_exception *fault)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason;
+	unsigned long exit_qualification = vcpu->arch.exit_qualification;
 
-	if (fault->error_code & PFERR_RSVD_MASK)
+	if (vmx->nested.pml_full) {
+		exit_reason = EXIT_REASON_PML_FULL;
+		vmx->nested.pml_full = false;
+		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+	} else if (fault->error_code & PFERR_RSVD_MASK)
 		exit_reason = EXIT_REASON_EPT_MISCONFIG;
 	else
 		exit_reason = EXIT_REASON_EPT_VIOLATION;
-	nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
+
+	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
 	vmcs12->guest_physical_address = fault->address;
 }
 
@@ -9717,6 +9734,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+					 struct vmcs12 *vmcs12)
+{
+	u64 address = vmcs12->pml_address;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
+		if (!nested_cpu_has_ept(vmcs12) ||
+		    !IS_ALIGNED(address, 4096) ||
+		    address >> maxphyaddr)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 				       struct vmx_msr_entry *e)
 {
@@ -10252,6 +10285,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (nested_vmx_check_pml_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
 				vmx->nested.nested_vmx_procbased_ctls_low,
 				vmx->nested.nested_vmx_procbased_ctls_high) ||
@@ -11146,6 +11182,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
 	kvm_flush_pml_buffers(kvm);
 }
 
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gpa_t gpa;
+	struct page *page = NULL;
+	u64 *pml_address;
+
+	if (is_guest_mode(vcpu)) {
+		WARN_ON_ONCE(vmx->nested.pml_full);
+
+		/*
+		 * Check if PML is enabled for the nested guest.
+		 * Whether eptp bit 6 is set is already checked
+		 * as part of A/D emulation.
+		 */
+		vmcs12 = get_vmcs12(vcpu);
+		if (!nested_cpu_has_pml(vmcs12))
+			return 0;
+
+		if (vmcs12->guest_pml_index > PML_ENTITY_NUM) {
+			vmx->nested.pml_full = true;
+			return 1;
+		}
+
+		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+
+		page = nested_get_page(vcpu, vmcs12->pml_address);
+		if (!page)
+			return 0;
+
+		pml_address = kmap(page);
+		pml_address[vmcs12->guest_pml_index--] = gpa;
+		kunmap(page);
+		nested_release_page_clean(page);
+	}
+
+	return 0;
+}
+
 static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 					   struct kvm_memory_slot *memslot,
 					   gfn_t offset, unsigned long mask)
@@ -11505,6 +11581,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+	.write_log_dirty = vmx_write_pml_buffer,
 
 	.pre_block = vmx_pre_block,
 	.post_block = vmx_post_block,
With EPT A/D enabled, processor access to L2 guest paging structures will
result in a write violation. When this happens, write the
GUEST_PHYSICAL_ADDRESS to the pml buffer provided by L1 if the access is
write and the dirty bit is being set.

This patch also adds necessary checks during VMEntry if L1 has enabled
PML. If the PML index overflows, we change the exit reason and run L1 to
simulate a PML full event.

Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/kvm/vmx.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)
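To illustrate what the patch enables from L1's point of view: with nested
PML exposed, an L1 hypervisor can program the same VMCS fields for its own
L2 guests that it would use on bare metal (SECONDARY_EXEC_ENABLE_PML, the
4 KiB-aligned PML_ADDRESS, and GUEST_PML_INDEX starting at 511), and it
receives an EXIT_REASON_PML_FULL exit when the 512-entry buffer fills. The
sketch below is hypothetical L1-hypervisor code, not part of this patch:
enable_pml_for_guest(), handle_pml_full() and mark_page_dirty_gpa() are
made-up names, while the VMCS field names and accessor widths follow the
definitions used in the diff above.

#define PML_LOG_ENTRIES		512

/* one 4 KiB, page-aligned log buffer per vCPU */
static u64 pml_buf[PML_LOG_ENTRIES] __attribute__((aligned(4096)));

static void enable_pml_for_guest(void)
{
	u32 secondary = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	/* PML requires EPT with the A/D bits enabled in the EPTP */
	vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
		     secondary | SECONDARY_EXEC_ENABLE_PML);
	vmcs_write64(PML_ADDRESS, virt_to_phys(pml_buf));
	/* the index counts down; 511 means "buffer empty" */
	vmcs_write16(GUEST_PML_INDEX, PML_LOG_ENTRIES - 1);
}

/* on an EXIT_REASON_PML_FULL exit: consume the log and rearm it */
static void handle_pml_full(void)
{
	int i;

	for (i = 0; i < PML_LOG_ENTRIES; i++)
		mark_page_dirty_gpa(pml_buf[i]);	/* hypothetical helper */

	vmcs_write16(GUEST_PML_INDEX, PML_LOG_ENTRIES - 1);
}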