| Message ID | 1368939152-11406-5-git-send-email-jun.nakajima@intel.com |
| --- | --- |
| State | New, archived |
On 05/19/2013 12:52 PM, Jun Nakajima wrote:
> From: Nadav Har'El <nyh@il.ibm.com>
>
> KVM's existing shadow MMU code already supports nested TDP. To use it, we
> need to set up a new "MMU context" for nested EPT, and create a few callbacks
> for it (nested_ept_*()). This context should also use the EPT versions of
> the page table access functions (defined in the previous patch).
> Then, we need to switch back and forth between this nested context and the
> regular MMU context when switching between L1 and L2 (when L1 runs this L2
> with EPT).
>
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> ---
>  arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu.h | 1 +
>  arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 92 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 6c1670f..37f8d7f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
>
> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
> +{
> +        ASSERT(vcpu);
> +        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
> +
> +        context->shadow_root_level = kvm_x86_ops->get_tdp_level();

That means the L1 guest always uses page-walk length == 4? But in your
previous patch, it can be 2.

> +
> +        context->nx = is_nx(vcpu); /* TODO: ? */

Hmm? EPT always supports NX.

> +        context->new_cr3 = paging_new_cr3;
> +        context->page_fault = EPT_page_fault;
> +        context->gva_to_gpa = EPT_gva_to_gpa;
> +        context->sync_page = EPT_sync_page;
> +        context->invlpg = EPT_invlpg;
> +        context->update_pte = EPT_update_pte;
> +        context->free = paging_free;
> +        context->root_level = context->shadow_root_level;
> +        context->root_hpa = INVALID_PAGE;
> +        context->direct_map = false;
> +
> +        /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
> +           something different.
> +         */

Exactly. :)

> +        reset_rsvds_bits_mask(vcpu, context);
> +
> +
> +        /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
> +           they are done, or why they write to vcpu->arch.mmu and not context
> +         */
> +        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
> +        vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
> +        vcpu->arch.mmu.base_role.smep_andnot_wp =
> +                kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
> +                !is_write_protection(vcpu);

I guess we need not care about these since the EPT page permissions do not
depend on them.

> +
> +        return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
> +
>  static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
>  {
>          int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 2adcbc2..8fc94dd 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
>  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>
>  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
>  {
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index fb9cae5..a88432f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
>          return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
>  }
>
> +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
> +{
> +        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
> +}
> +
>  static inline bool is_exception(u32 intr_info)
>  {
>          return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> @@ -7311,6 +7316,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
>          entry->ecx |= bit(X86_FEATURE_VMX);
>  }
>
> +/* Callbacks for nested_ept_init_mmu_context: */
> +
> +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
> +{
> +        /* return the page table to be shadowed - in our case, EPT12 */
> +        return get_vmcs12(vcpu)->ept_pointer;
> +}
> +
> +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
> +                                         struct x86_exception *fault)
> +{
> +        struct vmcs12 *vmcs12;
> +        nested_vmx_vmexit(vcpu);
> +        vmcs12 = get_vmcs12(vcpu);
> +        /*
> +         * Note no need to set vmcs12->vm_exit_reason as it is already copied
> +         * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
> +         */
> +        vmcs12->exit_qualification = fault->error_code;

Hmm, you directly copy the error code from FNAME(walk_addr_generic),
but its format is different and I did not see you cook the error code
in the previous patches.
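The format mismatch raised above is between the PFERR_*-style page-fault error code that FNAME(walk_addr_generic) produces and the EPT-violation exit qualification layout defined by the SDM. The snippet below is only an illustration of that mismatch, not the fix applied later in the series; the constants mirror KVM's PFERR_* values and the SDM's exit-qualification bits, but the helper name and the local defines are invented for the sketch.

/* Illustrative sketch only: map the access-type bits of a PFERR-style
 * error code onto the access-type bits of an EPT-violation exit
 * qualification. Bits 3-5 of the qualification ("was the access allowed
 * by the guest's EPT entry") need data from the EPT12 walk itself and
 * are not covered here.
 */
#define PFERR_WRITE_MASK        (1U << 1)   /* access was a write */
#define PFERR_FETCH_MASK        (1U << 4)   /* access was an instruction fetch */

#define EPT_VIOLATION_ACC_READ  (1ULL << 0) /* data read */
#define EPT_VIOLATION_ACC_WRITE (1ULL << 1) /* data write */
#define EPT_VIOLATION_ACC_INSTR (1ULL << 2) /* instruction fetch */

static unsigned long pferr_to_ept_violation_qual(unsigned int pferr)
{
        unsigned long qual = 0;

        if (pferr & PFERR_WRITE_MASK)
                qual |= EPT_VIOLATION_ACC_WRITE;
        else if (pferr & PFERR_FETCH_MASK)
                qual |= EPT_VIOLATION_ACC_INSTR;
        else
                qual |= EPT_VIOLATION_ACC_READ;

        return qual;
}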
On Tue, May 21, 2013 at 1:50 AM, Xiao Guangrong
<xiaoguangrong@linux.vnet.ibm.com> wrote:
> On 05/19/2013 12:52 PM, Jun Nakajima wrote:
>> From: Nadav Har'El <nyh@il.ibm.com>
>>
>> KVM's existing shadow MMU code already supports nested TDP. To use it, we
>> need to set up a new "MMU context" for nested EPT, and create a few callbacks
>> for it (nested_ept_*()). This context should also use the EPT versions of
>> the page table access functions (defined in the previous patch).
>> Then, we need to switch back and forth between this nested context and the
>> regular MMU context when switching between L1 and L2 (when L1 runs this L2
>> with EPT).
>>
>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
>> ---
>>  arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++
>>  arch/x86/kvm/mmu.h | 1 +
>>  arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 92 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 6c1670f..37f8d7f 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
>>
>> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
>> +{
>> +        ASSERT(vcpu);
>> +        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
>> +
>> +        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
>
> That means the L1 guest always uses page-walk length == 4? But in your
> previous patch, it can be 2.

We want to support "page-walk length == 4" only.

>
>> +
>> +        context->nx = is_nx(vcpu); /* TODO: ? */
>
> Hmm? EPT always supports NX.
>
>> +        context->new_cr3 = paging_new_cr3;
>> +        context->page_fault = EPT_page_fault;
>> +        context->gva_to_gpa = EPT_gva_to_gpa;
>> +        context->sync_page = EPT_sync_page;
>> +        context->invlpg = EPT_invlpg;
>> +        context->update_pte = EPT_update_pte;
>> +        context->free = paging_free;
>> +        context->root_level = context->shadow_root_level;
>> +        context->root_hpa = INVALID_PAGE;
>> +        context->direct_map = false;
>> +
>> +        /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
>> +           something different.
>> +         */
>
> Exactly. :)
>
>> +        reset_rsvds_bits_mask(vcpu, context);
>> +
>> +
>> +        /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
>> +           they are done, or why they write to vcpu->arch.mmu and not context
>> +         */
>> +        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
>> +        vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
>> +        vcpu->arch.mmu.base_role.smep_andnot_wp =
>> +                kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
>> +                !is_write_protection(vcpu);
>
> I guess we need not care about these since the EPT page permissions do not
> depend on them.

Right. I'll clean this up.

>
>> +
>> +        return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
>> +
>>  static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
>>  {
>>          int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 2adcbc2..8fc94dd 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
>>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>>  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
>>  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>> +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>>
>>  static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
>>  {
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index fb9cae5..a88432f 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
>>          return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
>>  }
>>
>> +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
>> +{
>> +        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
>> +}
>> +
>>  static inline bool is_exception(u32 intr_info)
>>  {
>>          return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -7311,6 +7316,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
>>          entry->ecx |= bit(X86_FEATURE_VMX);
>>  }
>>
>> +/* Callbacks for nested_ept_init_mmu_context: */
>> +
>> +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
>> +{
>> +        /* return the page table to be shadowed - in our case, EPT12 */
>> +        return get_vmcs12(vcpu)->ept_pointer;
>> +}
>> +
>> +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
>> +                                         struct x86_exception *fault)
>> +{
>> +        struct vmcs12 *vmcs12;
>> +        nested_vmx_vmexit(vcpu);
>> +        vmcs12 = get_vmcs12(vcpu);
>> +        /*
>> +         * Note no need to set vmcs12->vm_exit_reason as it is already copied
>> +         * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
>> +         */
>> +        vmcs12->exit_qualification = fault->error_code;
>
> Hmm, you directly copy the error code from FNAME(walk_addr_generic),
> but its format is different and I did not see you cook the error code
> in the previous patches.
>

Right. Basically this is the original code from Nadav; patches 12 and 13
fix/cook the error code.

--
Jun
Intel Open Source Technology Center
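As background for the reset_rsvds_bits_mask() TODO discussed in the quoted review: EPT entries reserve different bits than IA-32e paging entries, which is why the existing helper cannot be reused as-is. Below is a minimal sketch of what an EPT-aware variant could look like; it assumes the existing rsvd_bits() and cpuid_maxphyaddr() helpers and the rsvd_bits_mask layout for 4-KByte mappings on a 4-level walk, and it ignores large pages and execute-only EPT, so it is an illustration of the idea rather than the follow-up patch.

/* Sketch only: an EPT-aware counterpart to reset_rsvds_bits_mask().
 * rsvd_bits() and cpuid_maxphyaddr() are existing KVM helpers; the
 * function name is made up for this sketch.
 */
static void reset_rsvds_bits_mask_ept_sketch(struct kvm_vcpu *vcpu,
                                             struct kvm_mmu *context)
{
        int maxphyaddr = cpuid_maxphyaddr(vcpu);

        /* EPT PML4E: address bits above MAXPHYADDR and bits 7:3 are reserved. */
        context->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) |
                                        rsvd_bits(3, 7);
        /* Non-leaf EPT PDPTE/PDE: bits 6:3 reserved (bit 7 is the large-page bit). */
        context->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 51) |
                                        rsvd_bits(3, 6);
        context->rsvd_bits_mask[0][1] = rsvd_bits(maxphyaddr, 51) |
                                        rsvd_bits(3, 6);
        /* 4-KByte EPT PTE: only the address bits above MAXPHYADDR are reserved. */
        context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
}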
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6c1670f..37f8d7f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+        ASSERT(vcpu);
+        ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+        context->nx = is_nx(vcpu); /* TODO: ? */
+        context->new_cr3 = paging_new_cr3;
+        context->page_fault = EPT_page_fault;
+        context->gva_to_gpa = EPT_gva_to_gpa;
+        context->sync_page = EPT_sync_page;
+        context->invlpg = EPT_invlpg;
+        context->update_pte = EPT_update_pte;
+        context->free = paging_free;
+        context->root_level = context->shadow_root_level;
+        context->root_hpa = INVALID_PAGE;
+        context->direct_map = false;
+
+        /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+           something different.
+         */
+        reset_rsvds_bits_mask(vcpu, context);
+
+
+        /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+           they are done, or why they write to vcpu->arch.mmu and not context
+         */
+        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+        vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+        vcpu->arch.mmu.base_role.smep_andnot_wp =
+                kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+                !is_write_protection(vcpu);
+
+        return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
         int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..8fc94dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb9cae5..a88432f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -7311,6 +7316,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
         entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+        /* return the page table to be shadowed - in our case, EPT12 */
+        return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+                                         struct x86_exception *fault)
+{
+        struct vmcs12 *vmcs12;
+        nested_vmx_vmexit(vcpu);
+        vmcs12 = get_vmcs12(vcpu);
+        /*
+         * Note no need to set vmcs12->vm_exit_reason as it is already copied
+         * from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+         */
+        vmcs12->exit_qualification = fault->error_code;
+        vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+        int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+        vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
+        vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
+        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+        vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+
+        return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7531,6 +7576,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                 vmx_flush_tlb(vcpu);
         }
 
+        if (nested_cpu_has_ept(vmcs12)) {
+                kvm_mmu_unload(vcpu);
+                nested_ept_init_mmu_context(vcpu);
+        }
+
         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
                 vcpu->arch.efer = vmcs12->guest_ia32_efer;
         else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7975,7 +8025,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
         kvm_set_cr4(vcpu, vmcs12->host_cr4);
 
-        /* shadow page tables on either EPT or shadow page tables */
+        if (nested_cpu_has_ept(vmcs12))
+                nested_ept_uninit_mmu_context(vcpu);
+
         kvm_set_cr3(vcpu, vmcs12->host_cr3);
         kvm_mmu_reset_context(vcpu);
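One more note on the "page-walk length == 4 only" answer earlier in the thread: the value that nested_ept_get_cr3() returns is L1's EPTP, which encodes the page-walk length in bits 5:3 (length minus 1) and the paging-structure memory type in bits 2:0. A hypothetical sanity check on L1's EPT12 pointer under that 4-level-only assumption could look like the sketch below; the field layout follows the SDM's EPTP format, but the macro and function names are invented for illustration and are not the definitions vmx.c uses.

/* Illustrative EPTP check, assuming only 4-level EPT walks are accepted
 * for L1's EPT12. Names are made up for this sketch.
 */
#define EPTP_MEMTYPE_MASK       0x7ULL          /* bits 2:0: paging-structure memory type */
#define EPTP_MEMTYPE_UC         0x0ULL          /* uncacheable */
#define EPTP_MEMTYPE_WB         0x6ULL          /* write-back */
#define EPTP_WALK_LEN_MASK      (0x7ULL << 3)   /* bits 5:3: page-walk length - 1 */
#define EPTP_WALK_LEN_4         (3ULL << 3)     /* 4-level walk */

static int nested_eptp_looks_valid(unsigned long eptp)
{
        unsigned long memtype = eptp & EPTP_MEMTYPE_MASK;

        /* Reject anything other than a 4-level walk. */
        if ((eptp & EPTP_WALK_LEN_MASK) != EPTP_WALK_LEN_4)
                return 0;

        /* Only UC and WB are architecturally valid paging-structure memory types. */
        if (memtype != EPTP_MEMTYPE_UC && memtype != EPTP_MEMTYPE_WB)
                return 0;

        return 1;
}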