Message ID | 1483003563-25847-4-git-send-email-liang.z.li@intel.com (mailing list archive) |
---|---|
State | New, archived |
On 29/12/2016 10:26, Liang Li wrote: > The future Intel CPU will extend the max physical address to 52 bits. > To support the new physical address width, EPT is extended to support > 5 level page table. > This patch add the 5 level EPT and extend shadow page to support > 5 level paging guest. As the RFC version, this patch enables 5 level > EPT once the hardware supports, and this is not a good choice because > 5 level EPT requires more memory access comparing to use 4 level EPT. > The right thing is to use 5 level EPT only when it's needed, will > change in the future version. > > Signed-off-by: Liang Li <liang.z.li@intel.com> > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > Cc: Dave Hansen <dave.hansen@linux.intel.com> > Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com> > Cc: Paolo Bonzini <pbonzini@redhat.com> > Cc: "Radim Krčmář" <rkrcmar@redhat.com> > --- > arch/x86/include/asm/kvm_host.h | 3 +- > arch/x86/include/asm/vmx.h | 1 + > arch/x86/kvm/cpuid.h | 8 ++ > arch/x86/kvm/mmu.c | 167 +++++++++++++++++++++++++++++++--------- > arch/x86/kvm/mmu_audit.c | 5 +- > arch/x86/kvm/paging_tmpl.h | 19 ++++- > arch/x86/kvm/vmx.c | 19 +++-- > arch/x86/kvm/x86.h | 10 +++ > 8 files changed, 184 insertions(+), 48 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index a7066dc..e505dac 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -124,6 +124,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) > #define KVM_NR_VAR_MTRR 8 > > #define ASYNC_PF_PER_VCPU 64 > +#define PT64_ROOT_5LEVEL 5 > > enum kvm_reg { > VCPU_REGS_RAX = 0, > @@ -310,7 +311,7 @@ struct kvm_pio_request { > }; > > struct rsvd_bits_validate { > - u64 rsvd_bits_mask[2][4]; > + u64 rsvd_bits_mask[2][PT64_ROOT_5LEVEL]; > u64 bad_mt_xwr; > }; > > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index 2b5b2d4..bf2f178 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -442,6 +442,7 @@ enum vmcs_field { > > #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) > #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) > +#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7) > #define VMX_EPTP_UC_BIT (1ull << 8) > #define VMX_EPTP_WB_BIT (1ull << 14) > #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h > index 35058c2..4bdf3dc 100644 > --- a/arch/x86/kvm/cpuid.h > +++ b/arch/x86/kvm/cpuid.h > @@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) > return best && (best->ecx & bit(X86_FEATURE_PKU)); > } > > +static inline bool guest_cpuid_has_la57(struct kvm_vcpu *vcpu) > +{ > + struct kvm_cpuid_entry2 *best; > + > + best = kvm_find_cpuid_entry(vcpu, 7, 0); > + return best && (best->ecx & bit(X86_FEATURE_LA57)); > +} > + > static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) > { > struct kvm_cpuid_entry2 *best; > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 4c40273..0a56f27 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -1986,8 +1986,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, > } > > struct mmu_page_path { > - struct kvm_mmu_page *parent[PT64_ROOT_4LEVEL]; > - unsigned int idx[PT64_ROOT_4LEVEL]; > + struct kvm_mmu_page *parent[PT64_ROOT_5LEVEL]; > + unsigned int idx[PT64_ROOT_5LEVEL]; > }; > > #define for_each_sp(pvec, sp, parents, i) \ > @@ -2198,6 +2198,11 @@ static void 
shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, > !vcpu->arch.mmu.direct_map) > --iterator->level; > > + if (iterator->level == PT64_ROOT_5LEVEL && > + vcpu->arch.mmu.root_level < PT64_ROOT_5LEVEL && > + !vcpu->arch.mmu.direct_map) > + iterator->level -= 2; This (and the "if" before it as well) might actually be dead code. Please remove it in a separate patch. > if (iterator->level == PT32E_ROOT_LEVEL) { > iterator->shadow_addr > = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; > @@ -3061,9 +3066,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) > if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) > return; > > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL && > - (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > - vcpu->arch.mmu.direct_map)) { > + if ((vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL && > + (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.direct_map)) || > + (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL && > + (vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL || > + vcpu->arch.mmu.direct_map))) { Same here: if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) should be enough. In general, checking >= PT64_ROOT_4LEVEL is better IMHO than checking for == PT64_ROOT_4LEVEL || == PT64_ROOT_5LEVEL. These "if"s basically need to single out PAE. A hypothetical 6-level page table extension would in all likelihood behave just like 64-bit LA48 and LA57 paging. > hpa_t root = vcpu->arch.mmu.root_hpa; > > spin_lock(&vcpu->kvm->mmu_lock); > @@ -3114,10 +3122,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) > struct kvm_mmu_page *sp; > unsigned i; > > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) { Same here and everywhere else. > spin_lock(&vcpu->kvm->mmu_lock); > make_mmu_pages_available(vcpu); > - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL); > + sp = kvm_mmu_get_page(vcpu, 0, 0, > + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); > ++sp->root_count; > spin_unlock(&vcpu->kvm->mmu_lock); > vcpu->arch.mmu.root_hpa = __pa(sp->spt); > @@ -3158,15 +3168,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > * Do we shadow a long mode page table? If so we need to > * write-protect the guests page table root. > */ > - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) { > hpa_t root = vcpu->arch.mmu.root_hpa; > > MMU_WARN_ON(VALID_PAGE(root)); > > spin_lock(&vcpu->kvm->mmu_lock); > make_mmu_pages_available(vcpu); > - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL, > - 0, ACC_ALL); > + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, > + vcpu->arch.mmu.root_level, 0, ACC_ALL); > root = __pa(sp->spt); > ++sp->root_count; > spin_unlock(&vcpu->kvm->mmu_lock); > @@ -3180,7 +3191,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > * the shadow page table may be a PAE or a long mode page table. 
> */ > pm_mask = PT_PRESENT_MASK; > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) > + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) > pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; > > for (i = 0; i < 4; ++i) { > @@ -3213,7 +3225,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > * If we shadow a 32 bit page table with a long mode page > * table we enter this path. > */ > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) { > if (vcpu->arch.mmu.lm_root == NULL) { > /* > * The additional page necessary for this is only > @@ -3257,8 +3270,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) > return; > > vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); > - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); > - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) { > hpa_t root = vcpu->arch.mmu.root_hpa; > sp = page_header(root); > mmu_sync_children(vcpu, sp); > @@ -3334,7 +3347,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) > walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) > { > struct kvm_shadow_walk_iterator iterator; > - u64 sptes[PT64_ROOT_4LEVEL], spte = 0ull; > + u64 sptes[PT64_ROOT_5LEVEL], spte = 0ull; > int root, leaf; > bool reserved = false; > > @@ -3655,10 +3668,16 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, > } > > #define PTTYPE_EPT 18 /* arbitrary */ > +#define PTTYPE_LA57 57 > + > #define PTTYPE PTTYPE_EPT > #include "paging_tmpl.h" > #undef PTTYPE > > +#define PTTYPE PTTYPE_LA57 > +#include "paging_tmpl.h" > +#undef PTTYPE This is not needed. The format for LA57 page tables is the same as for LA48. > #define PTTYPE 64 > #include "paging_tmpl.h" > #undef PTTYPE > @@ -3747,6 +3766,26 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, > rsvd_check->rsvd_bits_mask[1][0] = > rsvd_check->rsvd_bits_mask[0][0]; > break; > + case PT64_ROOT_5LEVEL: > + rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | > + nonleaf_bit8_rsvd | rsvd_bits(7, 7); > + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | > + nonleaf_bit8_rsvd | rsvd_bits(7, 7); I think the code for this and PT64_ROOT_4LEVEL should be the same (setting rsvd_bits_mask[x][4] for PT64_ROOT_4LEVEL is okay). You are assuming that MAXPHYADDR=52, but the Intel whitepaper doesn't say this is going to be always the case. rsvd_bits in arch/x86/kvm/mmu.h is not a hot path, feel free to add an if (e < s) return 0; there. 
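A minimal sketch of that rsvd_bits() tweak, assuming the helper in arch/x86/kvm/mmu.h currently has roughly this shape (the early return is the only addition):

```c
static inline u64 rsvd_bits(int s, int e)
{
        /* Empty ranges, e.g. rsvd_bits(52, 51) when MAXPHYADDR is 52, mean "no reserved bits". */
        if (e < s)
                return 0;

        return ((1ULL << (e - s + 1)) - 1) << s;
}
```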
> + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | > + nonleaf_bit8_rsvd | gbpages_bit_rsvd; > + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd; > + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd; > + rsvd_check->rsvd_bits_mask[1][4] = > + rsvd_check->rsvd_bits_mask[0][4]; > + rsvd_check->rsvd_bits_mask[1][3] = > + rsvd_check->rsvd_bits_mask[0][3]; > + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | > + gbpages_bit_rsvd | rsvd_bits(13, 29); > + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | > + rsvd_bits(13, 20); /* large page */ > + rsvd_check->rsvd_bits_mask[1][0] = > + rsvd_check->rsvd_bits_mask[0][0]; > + break; > } > } > > @@ -3761,25 +3800,43 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, > > static void > __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, > - int maxphyaddr, bool execonly) > + int maxphyaddr, bool execonly, int ept_level) > { > u64 bad_mt_xwr; > > - rsvd_check->rsvd_bits_mask[0][3] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); > - rsvd_check->rsvd_bits_mask[0][2] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > - rsvd_check->rsvd_bits_mask[0][1] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); > - > - /* large page */ > - rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; > - rsvd_check->rsvd_bits_mask[1][2] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); > - rsvd_check->rsvd_bits_mask[1][1] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); > - rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; > + if (ept_level == 5) { > + rsvd_check->rsvd_bits_mask[0][4] = rsvd_bits(3, 7); Same here, this "if" is not needed at all and the new ept_level argument shouldn't be required either. 
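Read that way, the EPT helper keeps its original signature and only mirrors the level-4 masks into index 4. A minimal sketch, assuming the pre-patch function body and with the bad_mt_xwr handling omitted:

```c
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
                            int maxphyaddr, bool execonly)
{
        /* A PML5E is reserved-checked exactly like a PML4E. */
        rsvd_check->rsvd_bits_mask[0][4] =
                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
        rsvd_check->rsvd_bits_mask[0][3] =
                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
        rsvd_check->rsvd_bits_mask[0][2] =
                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
        rsvd_check->rsvd_bits_mask[0][1] =
                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
        rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);

        /* large page */
        rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
        rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
        rsvd_check->rsvd_bits_mask[1][2] =
                rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
        rsvd_check->rsvd_bits_mask[1][1] =
                rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
        rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];

        /* bad_mt_xwr setup continues unchanged from the existing code. */
}
```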
> + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(3, 7); > + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][0] = 0; > + > + /* large page */ > + rsvd_check->rsvd_bits_mask[1][4] = > + rsvd_check->rsvd_bits_mask[0][4]; > + rsvd_check->rsvd_bits_mask[1][3] = > + rsvd_check->rsvd_bits_mask[0][3]; > + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(12, 29); > + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(12, 20); > + rsvd_check->rsvd_bits_mask[1][0] = 0; > + } else { > + rsvd_check->rsvd_bits_mask[0][3] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); > + rsvd_check->rsvd_bits_mask[0][2] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][1] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); > + /* large page */ > + rsvd_check->rsvd_bits_mask[1][3] = > + rsvd_check->rsvd_bits_mask[0][3]; > + rsvd_check->rsvd_bits_mask[1][2] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); > + rsvd_check->rsvd_bits_mask[1][1] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); > + rsvd_check->rsvd_bits_mask[1][0] = > + rsvd_check->rsvd_bits_mask[0][0]; > + } > > bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ > bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ > @@ -3794,10 +3851,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, > } > > static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, > - struct kvm_mmu *context, bool execonly) > + struct kvm_mmu *context, bool execonly, int ept_level) > { > __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, > - cpuid_maxphyaddr(vcpu), execonly); > + cpuid_maxphyaddr(vcpu), execonly, ept_level); > } > > /* > @@ -3844,8 +3901,8 @@ static inline bool boot_cpu_is_amd(void) > true, true); > else > __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, > - boot_cpu_data.x86_phys_bits, > - false); > + boot_cpu_data.x86_phys_bits, false, > + context->shadow_root_level); > > } > > @@ -3858,7 +3915,8 @@ static inline bool boot_cpu_is_amd(void) > struct kvm_mmu *context, bool execonly) > { > __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, > - boot_cpu_data.x86_phys_bits, execonly); > + boot_cpu_data.x86_phys_bits, execonly, > + context->shadow_root_level); > } > > static void update_permission_bitmask(struct kvm_vcpu *vcpu, > @@ -4037,6 +4095,28 @@ static void paging64_init_context(struct kvm_vcpu *vcpu, > paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL); > } > > +static void paging_la57_init_context(struct kvm_vcpu *vcpu, > + struct kvm_mmu *context) > +{ > + context->nx = is_nx(vcpu); > + context->root_level = PT64_ROOT_5LEVEL; > + > + reset_rsvds_bits_mask(vcpu, context); > + update_permission_bitmask(vcpu, context, false); > + update_pkru_bitmask(vcpu, context, false); > + update_last_nonleaf_level(vcpu, context); > + > + MMU_WARN_ON(!is_pae(vcpu)); > + context->page_fault = paging_la57_page_fault; > + context->gva_to_gpa = paging_la57_gva_to_gpa; > + context->sync_page = paging_la57_sync_page; > + context->invlpg = paging_la57_invlpg; > + context->update_pte = paging_la57_update_pte; > + context->shadow_root_level = PT64_ROOT_5LEVEL; > + context->root_hpa = INVALID_PAGE; > + context->direct_map = false; This should be using paging64_init_context_common. Even better, paging64_init_context could do int root_level = is_la57_mode(vcpu) ? 
PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; paging64_init_context_common(vcpu, context, root_level); and then you can skip the change in kvm_init_shadow_mmu. > +} > + > static void paging32_init_context(struct kvm_vcpu *vcpu, > struct kvm_mmu *context) > { > @@ -4086,6 +4166,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) > context->nx = false; > context->gva_to_gpa = nonpaging_gva_to_gpa; > context->root_level = 0; > + } else if (is_la57_mode(vcpu)) { > + context->nx = is_nx(vcpu); > + context->root_level = PT64_ROOT_5LEVEL; > + reset_rsvds_bits_mask(vcpu, context); > + context->gva_to_gpa = paging_la57_gva_to_gpa; Please put the if (is_la57_mode(vcpu)) inside the is_long_mode branch below, since the only difference is context->root_level. > } else if (is_long_mode(vcpu)) { > context->nx = is_nx(vcpu); > context->root_level = PT64_ROOT_4LEVEL; > @@ -4119,6 +4204,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) > > if (!is_paging(vcpu)) > nonpaging_init_context(vcpu, context); > + else if (is_la57_mode(vcpu)) > + paging_la57_init_context(vcpu, context); > else if (is_long_mode(vcpu)) > paging64_init_context(vcpu, context); > else if (is_pae(vcpu)) > @@ -4158,7 +4245,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) > > update_permission_bitmask(vcpu, context, true); > update_pkru_bitmask(vcpu, context, true); > - reset_rsvds_bits_mask_ept(vcpu, context, execonly); > + reset_rsvds_bits_mask_ept(vcpu, context, execonly, > + context->shadow_root_level); > reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); > } > EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); > @@ -4194,6 +4282,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) > g_context->nx = false; > g_context->root_level = 0; > g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; > + } else if (is_la57_mode(vcpu)) { > + g_context->nx = is_nx(vcpu); > + g_context->root_level = PT64_ROOT_5LEVEL; > + reset_rsvds_bits_mask(vcpu, g_context); > + g_context->gva_to_gpa = paging_la57_gva_to_gpa_nested; Same here. > } else if (is_long_mode(vcpu)) { > g_context->nx = is_nx(vcpu); > g_context->root_level = PT64_ROOT_4LEVEL; > diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c > index 2e6996d..bb40094 100644 > --- a/arch/x86/kvm/mmu_audit.c > +++ b/arch/x86/kvm/mmu_audit.c > @@ -62,11 +62,12 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) > if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) > return; > > - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) { As above, please use >= PT64_ROOT_4LEVEL here. > hpa_t root = vcpu->arch.mmu.root_hpa; > > sp = page_header(root); > - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL); > + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level); > return; > } > > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > index a011054..c126cd3 100644 > --- a/arch/x86/kvm/paging_tmpl.h > +++ b/arch/x86/kvm/paging_tmpl.h This is not needed. 
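Spelling out the paging64_init_context_common() suggestion above, together with the request to fold LA57 into the existing is_long_mode() branches, the result could look roughly like this sketch (illustrative only; it reuses is_la57_mode() from the posted x86.h hunk):

```c
static void paging64_init_context(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu *context)
{
        /* LA48 and LA57 differ only in the number of levels to walk. */
        int root_level = is_la57_mode(vcpu) ?
                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

        paging64_init_context_common(vcpu, context, root_level);
}
```

With that, kvm_init_shadow_mmu() needs no new branch, and in init_kvm_tdp_mmu()/init_kvm_nested_mmu() the is_la57_mode() test would only select root_level inside the existing is_long_mode() branch, keeping paging64_gva_to_gpa() (or its nested variant) as the walker.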
> @@ -50,6 +50,21 @@ extern u64 __pure __using_nonexistent_pte_bit(void) > #define CMPXCHG cmpxchg64 > #define PT_MAX_FULL_LEVELS 2 > #endif > +#elif PTTYPE == PTTYPE_LA57 > + #define pt_element_t u64 > + #define guest_walker guest_walker_la57 > + #define FNAME(name) paging_la57_##name > + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK > + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) > + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) > + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) > + #define PT_LEVEL_BITS PT64_LEVEL_BITS > + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK > + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK > + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT > + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT > + #define PT_MAX_FULL_LEVELS 5 > + #define CMPXCHG cmpxchg > #elif PTTYPE == 32 > #define pt_element_t u32 > #define guest_walker guest_walker32 > @@ -266,7 +281,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, > static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) > { > unsigned pkeys = 0; > -#if PTTYPE == 64 > +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57 > pte_t pte = {.pte = gpte}; > > pkeys = pte_flags_pkey(pte_flags(pte)); > @@ -300,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, > walker->level = mmu->root_level; > pte = mmu->get_cr3(vcpu); > > -#if PTTYPE == 64 > +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57 > if (walker->level == PT32E_ROOT_LEVEL) { > pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); > trace_kvm_mmu_paging_element(pte, walker->level); > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 24db5fb..bfc9f0a 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -1220,6 +1220,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) > return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; > } > > +static inline bool cpu_has_vmx_ept_5levels(void) > +{ > + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; > +} > + > static inline bool cpu_has_vmx_ept_ad_bits(void) > { > return vmx_capability.ept & VMX_EPT_AD_BIT; > @@ -4249,13 +4254,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) > vmx->emulation_required = emulation_required(vcpu); > } > > +static int get_ept_level(void) > +{ > + if (cpu_has_vmx_ept_5levels()) > + return VMX_EPT_MAX_GAW + 1; > + return VMX_EPT_DEFAULT_GAW + 1; > +} > + > static u64 construct_eptp(unsigned long root_hpa) > { > u64 eptp; > > /* TODO write the value reading from MSR */ > eptp = VMX_EPT_DEFAULT_MT | > - VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; > + (get_ept_level() - 1) << VMX_EPT_GAW_EPTP_SHIFT; > if (enable_ept_ad_bits) > eptp |= VMX_EPT_AD_ENABLE_BIT; > eptp |= (root_hpa & PAGE_MASK); For nested virt you need to set the shift to what L1 uses, so I think you need to add a root_level argument here and in kvm_init_shadow_ept_mmu. 
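A possible shape for that change, with the walk length passed in by the caller (the parameter and its plumbing are assumptions, not part of the posted patch):

```c
static u64 construct_eptp(unsigned long root_hpa, int root_level)
{
        u64 eptp;

        /* Bits 5:3 encode the EPT walk length minus one (3 or 4). */
        eptp = VMX_EPT_DEFAULT_MT |
               ((u64)(root_level - 1) << VMX_EPT_GAW_EPTP_SHIFT);
        if (enable_ept_ad_bits)
                eptp |= VMX_EPT_AD_ENABLE_BIT;
        eptp |= (root_hpa & PAGE_MASK);

        return eptp;
}
```

The caller (e.g. vmx_set_cr3()) would then pass the level the current MMU actually uses: the host's shadow_root_level for ordinary EPT, and whatever level L1 configured for nested EPT.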
Paolo > @@ -9356,11 +9368,6 @@ static void __init vmx_check_processor_compat(void *rtn) > } > } > > -static int get_ept_level(void) > -{ > - return VMX_EPT_DEFAULT_GAW + 1; > -} > - > static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) > { > u8 cache; > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index e8ff3e4..26627df 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -60,6 +60,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) > return cs_l; > } > > +static inline bool is_la57_mode(struct kvm_vcpu *vcpu) > +{ > +#ifdef CONFIG_X86_64 > + return (vcpu->arch.efer & EFER_LMA) && > + kvm_read_cr4_bits(vcpu, X86_CR4_LA57); > +#else > + return 0; > +#endif > +} > + > static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) > { > return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; >
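For reference, the ">= PT64_ROOT_4LEVEL" form requested in several of the comments above would reduce the mmu_free_roots() check to something like this (a sketch, not the committed code):

```c
        /* Only PAE roots need the per-entry loop; 4- and 5-level share this path. */
        if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
            (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
             vcpu->arch.mmu.direct_map)) {
                hpa_t root = vcpu->arch.mmu.root_hpa;

                /* ... free the single root page as in the existing code ... */
        }
```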