From patchwork Thu Aug 24 12:27:55 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yu Zhang X-Patchwork-Id: 9919965 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork.web.codeaurora.org (Postfix) with ESMTP id 1429460349 for ; Thu, 24 Aug 2017 12:51:28 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id EA9C328B66 for ; Thu, 24 Aug 2017 12:51:27 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id DEFAA28B49; Thu, 24 Aug 2017 12:51:27 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-6.9 required=2.0 tests=BAYES_00,RCVD_IN_DNSWL_HI autolearn=unavailable version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 05DA828B49 for ; Thu, 24 Aug 2017 12:51:27 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753036AbdHXMvO (ORCPT ); Thu, 24 Aug 2017 08:51:14 -0400 Received: from mga05.intel.com ([192.55.52.43]:51617 "EHLO mga05.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752855AbdHXMuk (ORCPT ); Thu, 24 Aug 2017 08:50:40 -0400 Received: from fmsmga004.fm.intel.com ([10.253.24.48]) by fmsmga105.fm.intel.com with ESMTP; 24 Aug 2017 05:50:40 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.41,421,1498546800"; d="scan'208";a="303945878" Received: from zhangyu-optiplex-9020.bj.intel.com ([10.238.135.159]) by fmsmga004.fm.intel.com with ESMTP; 24 Aug 2017 05:50:38 -0700 From: Yu Zhang To: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org, pbonzini@redhat.com, rkrcmar@redhat.com, tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com, xiaoguangrong@tencent.com, joro@8bytes.org Subject: [PATCH v3 4/5] KVM: MMU: Add 5 level EPT & Shadow page table support. Date: Thu, 24 Aug 2017 20:27:55 +0800 Message-Id: <1503577676-12345-5-git-send-email-yu.c.zhang@linux.intel.com> X-Mailer: git-send-email 1.9.1 In-Reply-To: <1503577676-12345-1-git-send-email-yu.c.zhang@linux.intel.com> References: <1503577676-12345-1-git-send-email-yu.c.zhang@linux.intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP Extends the shadow paging code, so that 5 level shadow page table can be constructed if VM is running in 5 level paging mode. Also extends the ept code, so that 5 level ept table can be constructed if maxphysaddr of VM exceeds 48 bits. Unlike the shadow logic, KVM should still use 4 level ept table for a VM whose physical address width is less than 48 bits, even when the VM is running in 5 level paging mode. Signed-off-by: Yu Zhang --- arch/x86/include/asm/kvm_host.h | 10 +++++----- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/cpuid.c | 5 +++++ arch/x86/kvm/mmu.c | 43 +++++++++++++++++++++++++++-------------- arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/mmu_audit.c | 4 ++-- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 21 ++++++++++++++------ arch/x86/kvm/x86.h | 10 ++++++++++ 9 files changed, 71 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5907d46..bdef532 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -315,7 +315,7 @@ struct kvm_pio_request { int size; }; -#define PT64_ROOT_MAX_LEVEL 4 +#define PT64_ROOT_MAX_LEVEL 5 struct rsvd_bits_validate { u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL]; @@ -323,9 +323,9 @@ struct rsvd_bits_validate { }; /* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. + * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, + * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the + * current mmu mode. */ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); @@ -982,7 +982,7 @@ struct kvm_x86_ops { void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); - int (*get_tdp_level)(void); + int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 340007a..caec841 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -453,6 +453,7 @@ enum vmcs_field { #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7) #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) @@ -471,6 +472,7 @@ enum vmcs_field { #define VMX_EPT_MT_EPTE_SHIFT 3 #define VMX_EPTP_PWL_MASK 0x38ull #define VMX_EPTP_PWL_4 0x18ull +#define VMX_EPTP_PWL_5 0x20ull #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) #define VMX_EPTP_MT_MASK 0x7ull #define VMX_EPTP_MT_WB 0x6ull diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1450547..83865a3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -137,6 +137,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); +#ifdef CONFIG_X86_64 + if (vcpu->arch.maxphyaddr > 48) + kvm_mmu_reset_context(vcpu); +#endif + kvm_pmu_refresh(vcpu); return 0; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3faa20c..f47ccca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3322,8 +3322,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL && - (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || + if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL && + (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -3375,13 +3375,14 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; unsigned i; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); return 1; } - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL); + sp = kvm_mmu_get_page(vcpu, 0, 0, + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = __pa(sp->spt); @@ -3425,7 +3426,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; MMU_WARN_ON(VALID_PAGE(root)); @@ -3435,8 +3436,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); return 1; } - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL, - 0, ACC_ALL); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3531,7 +3532,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -4057,6 +4058,12 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; + case PT64_ROOT_5LEVEL: + rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[1][4] = + rsvd_check->rsvd_bits_mask[0][4]; case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7) | @@ -4098,6 +4105,8 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, { u64 bad_mt_xwr; + rsvd_check->rsvd_bits_mask[0][4] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][2] = @@ -4107,6 +4116,7 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ + rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); @@ -4367,7 +4377,10 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, static void paging64_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL); + int root_level = is_la57_mode(vcpu) ? + PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; + + paging64_init_context_common(vcpu, context, root_level); } static void paging32_init_context(struct kvm_vcpu *vcpu, @@ -4408,7 +4421,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); context->root_hpa = INVALID_PAGE; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; @@ -4422,7 +4435,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_level = 0; } else if (is_long_mode(vcpu)) { context->nx = is_nx(vcpu); - context->root_level = PT64_ROOT_4LEVEL; + context->root_level = is_la57_mode(vcpu) ? + PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { @@ -4479,7 +4493,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; context->ept_ad = accessed_dirty; @@ -4488,7 +4502,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; - context->root_level = context->shadow_root_level; + context->root_level = PT64_ROOT_4LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; context->base_role.ad_disabled = !accessed_dirty; @@ -4533,7 +4547,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { g_context->nx = is_nx(vcpu); - g_context->root_level = PT64_ROOT_4LEVEL; + g_context->root_level = is_la57_mode(vcpu) ? + PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 45b9a11..e2999f5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -37,6 +37,7 @@ #define PT32_DIR_PSE36_MASK \ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) +#define PT64_ROOT_5LEVEL 5 #define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 2e6996d..d22ddbd 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -62,11 +62,11 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL); + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level); return; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 19e43ca..7161d79 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -563,7 +563,7 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); } -static int get_npt_level(void) +static int get_npt_level(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 return PT64_ROOT_4LEVEL; @@ -2370,7 +2370,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu); reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 19aa69a..d4038a1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1205,6 +1205,11 @@ static inline bool cpu_has_vmx_ept_mt_wb(void) return vmx_capability.ept & VMX_EPTP_WB_BIT; } +static inline bool cpu_has_vmx_ept_5levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; +} + static inline bool cpu_has_vmx_ept_ad_bits(void) { return vmx_capability.ept & VMX_EPT_AD_BIT; @@ -4301,9 +4306,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } +static int get_ept_level(struct kvm_vcpu *vcpu) +{ + if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + return 5; + return 4; +} + static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { - u64 eptp = VMX_EPTP_MT_WB | VMX_EPTP_PWL_4; + u64 eptp = VMX_EPTP_MT_WB; + + eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) @@ -9501,11 +9515,6 @@ static void __init vmx_check_processor_compat(void *rtn) } } -static int get_ept_level(void) -{ - return 4; -} - static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1134603..69de8bf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -62,6 +62,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) return cs_l; } +static inline bool is_la57_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return (vcpu->arch.efer & EFER_LMA) && + kvm_read_cr4_bits(vcpu, X86_CR4_LA57); +#else + return 0; +#endif +} + static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) { return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;