Message ID | 8b43a9203c34b5330c4ea5901da5dac3458ac98d.1699368363.git.isaku.yamahata@intel.com |
---|---|
State | New, archived |
Series | KVM TDX: TDP MMU: large page support |
On 11/7/2023 11:00 PM, isaku.yamahata@intel.com wrote:
> From: Xiaoyao Li <xiaoyao.li@intel.com>
>
> When TDX enabled, a large page cannot be zapped if it contains mixed
> pages. In this case, it has to split the large page.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
>  arch/x86/kvm/Kconfig            |  1 +
>  arch/x86/kvm/mmu/mmu.c          |  6 +--
>  arch/x86/kvm/mmu/mmu_internal.h |  9 +++++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 68 +++++++++++++++++++++++++++++++--
>  4 files changed, 78 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index b0f103641547..557479737962 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -93,6 +93,7 @@ config KVM_INTEL
>      tristate "KVM for Intel (and compatible) processors support"
>      depends on KVM && IA32_FEAT_CTL
>      select KVM_SW_PROTECTED_VM if INTEL_TDX_HOST
> +    select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
>      select KVM_PRIVATE_MEM if INTEL_TDX_HOST
>      help
>        Provides support for KVM on processors equipped with Intel's VT
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 265177cedf37..0bf043812644 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7463,8 +7463,8 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
>      return kvm_unmap_gfn_range(kvm, range);
>  }
>
> -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
> -                                int level)
> +bool kvm_hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
> +                             int level)
>  {
>      return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
>  }
> @@ -7491,7 +7491,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
>          return kvm_range_has_memory_attributes(kvm, start, end, attrs);
>
>      for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
> -        if (hugepage_test_mixed(slot, gfn, level - 1) ||
> +        if (kvm_hugepage_test_mixed(slot, gfn, level - 1) ||
>              attrs != kvm_get_memory_attributes(kvm, gfn))
>              return false;
>      }
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1da98be74ad2..653e96769956 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -460,4 +460,13 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>
> +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +bool kvm_hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level);
> +#else
> +static inline bool kvm_hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level)
> +{
> +    return false;
> +}
> +#endif
> +
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7873e9ee82ad..a209a67decae 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -964,6 +964,14 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>      return true;
>  }
>
> +
> +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> +                                                       struct tdp_iter *iter,
> +                                                       bool shared);
> +
> +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> +                                   struct kvm_mmu_page *sp, bool shared);
> +
>  /*
>   * If can_yield is true, will release the MMU lock and reschedule if the
>   * scheduler needs the CPU or there is contention on the MMU lock.  If this
> @@ -975,13 +983,15 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>                    gfn_t start, gfn_t end, bool can_yield, bool flush,
>                    bool zap_private)
>  {
> +    bool is_private = is_private_sp(root);
> +    struct kvm_mmu_page *split_sp = NULL;
>      struct tdp_iter iter;
>
>      end = min(end, tdp_mmu_max_gfn_exclusive());
>
>      lockdep_assert_held_write(&kvm->mmu_lock);
>
> -    WARN_ON_ONCE(zap_private && !is_private_sp(root));
> +    WARN_ON_ONCE(zap_private && !is_private);
>      if (!zap_private && is_private_sp(root))

Can use is_private instead of is_private_sp(root) here as well.

>          return false;
>
> @@ -1006,12 +1016,66 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>              !is_last_spte(iter.old_spte, iter.level))
>              continue;
>
> +        if (is_private && kvm_gfn_shared_mask(kvm) &&
> +            is_large_pte(iter.old_spte)) {
> +            gfn_t gfn = iter.gfn & ~kvm_gfn_shared_mask(kvm);
> +            gfn_t mask = KVM_PAGES_PER_HPAGE(iter.level) - 1;
> +            struct kvm_memory_slot *slot;
> +            struct kvm_mmu_page *sp;
> +
> +            slot = gfn_to_memslot(kvm, gfn);
> +            if (kvm_hugepage_test_mixed(slot, gfn, iter.level) ||
> +                (gfn & mask) < start ||
> +                end < (gfn & mask) + KVM_PAGES_PER_HPAGE(iter.level)) {
> +                WARN_ON_ONCE(!can_yield);
> +                if (split_sp) {
> +                    sp = split_sp;
> +                    split_sp = NULL;
> +                    sp->role = tdp_iter_child_role(&iter);
> +                } else {
> +                    WARN_ON(iter.yielded);
> +                    if (flush && can_yield) {
> +                        kvm_flush_remote_tlbs(kvm);
> +                        flush = false;
> +                    }

Is it necessary to do the flush here?

> +                    sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, false);
> +                    if (iter.yielded) {
> +                        split_sp = sp;
> +                        continue;
> +                    }
> +                }
> +                KVM_BUG_ON(!sp, kvm);
> +
> +                tdp_mmu_init_sp(sp, iter.sptep, iter.gfn);
> +                if (tdp_mmu_split_huge_page(kvm, &iter, sp, false)) {
> +                    kvm_flush_remote_tlbs(kvm);
> +                    flush = false;

Why it needs to flush TLB immediately if tdp_mmu_split_huge_page() fails?

Also, when KVM MMU write lock is held, it seems tdp_mmu_split_huge_page()
will not fail. But let's assume this condition can be triggered, since sp is
local variable, it will lost its value after continue, and split_sp is also
NULL, it will try to allocate a new sp, memory leakage here?

> +                    /* force retry on this gfn. */
> +                    iter.yielded = true;
> +                } else
> +                    flush = true;
> +                continue;
> +            }
> +        }
> +
>          tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
>          flush = true;
>      }
>
>      rcu_read_unlock();
>
> +    if (split_sp) {
> +        WARN_ON(!can_yield);
> +        if (flush) {
> +            kvm_flush_remote_tlbs(kvm);
> +            flush = false;
> +        }

Same here, why we need to do the flush here?
Can we delay it till the caller do the flush?

> +
> +        write_unlock(&kvm->mmu_lock);
> +        tdp_mmu_free_sp(split_sp);
> +        write_lock(&kvm->mmu_lock);
> +    }
> +
>      /*
>       * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
>       * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
> @@ -1606,8 +1670,6 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
>
>      KVM_BUG_ON(kvm_mmu_page_role_is_private(role) !=
>                 is_private_sptep(iter->sptep), kvm);
> -    /* TODO: Large page isn't supported for private SPTE yet. */
> -    KVM_BUG_ON(kvm_mmu_page_role_is_private(role), kvm);
>
>      /*
>       * Since we are allocating while under the MMU lock we have to be
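For readers skimming the hunk above: the new branch in tdp_mmu_zap_leafs() only splits a private huge mapping when it cannot be zapped as a whole, i.e. when its small pages have mixed private/shared attributes or when the zap range only partially covers it. The standalone sketch below is not kernel code; PAGES_PER_2M and must_split_before_zap() are made-up names, and it works from the aligned base of the mapping rather than reproducing the patch's exact expression.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

/* Hypothetical stand-in for KVM_PAGES_PER_HPAGE() at the 2MB level: 512 4KB pages. */
#define PAGES_PER_2M 512ULL

/*
 * A huge mapping covering [base, base + PAGES_PER_2M) can be zapped whole
 * only if the zap range [start, end) covers it entirely and none of its
 * small pages has a different (mixed) private/shared attribute.
 */
static bool must_split_before_zap(gfn_t gfn, gfn_t start, gfn_t end, bool mixed)
{
	gfn_t base = gfn & ~(PAGES_PER_2M - 1);	/* align down to the huge page */

	return mixed || base < start || end < base + PAGES_PER_2M;
}

int main(void)
{
	/* Zap range [0x400, 0x500) touches only half of the 2MB mapping at 0x400. */
	printf("partial overlap  -> split: %d\n",
	       must_split_before_zap(0x400, 0x400, 0x500, false));	/* 1 */
	/* Zap range [0x400, 0x600) covers the whole mapping, attributes uniform. */
	printf("fully covered    -> split: %d\n",
	       must_split_before_zap(0x400, 0x400, 0x600, false));	/* 0 */
	/* Mixed attributes always force a split before zapping. */
	printf("mixed attributes -> split: %d\n",
	       must_split_before_zap(0x400, 0x0, 0x1000, true));	/* 1 */
	return 0;
}
```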
On Tue, Nov 21, 2023 at 05:57:28PM +0800, Binbin Wu <binbin.wu@linux.intel.com> wrote:
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 7873e9ee82ad..a209a67decae 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -964,6 +964,14 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> >      return true;
> >  }
> > +
> > +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > +                                                       struct tdp_iter *iter,
> > +                                                       bool shared);
> > +
> > +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> > +                                   struct kvm_mmu_page *sp, bool shared);
> > +
> >  /*
> >   * If can_yield is true, will release the MMU lock and reschedule if the
> >   * scheduler needs the CPU or there is contention on the MMU lock.  If this
> > @@ -975,13 +983,15 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> >                    gfn_t start, gfn_t end, bool can_yield, bool flush,
> >                    bool zap_private)
> >  {
> > +    bool is_private = is_private_sp(root);
> > +    struct kvm_mmu_page *split_sp = NULL;
> >      struct tdp_iter iter;
> >      end = min(end, tdp_mmu_max_gfn_exclusive());
> >      lockdep_assert_held_write(&kvm->mmu_lock);
> > -    WARN_ON_ONCE(zap_private && !is_private_sp(root));
> > +    WARN_ON_ONCE(zap_private && !is_private);
> >      if (!zap_private && is_private_sp(root))
> Can use is_private instead of is_private_sp(root) here as well.

I'll update it.

> >          return false;
> > @@ -1006,12 +1016,66 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> >              !is_last_spte(iter.old_spte, iter.level))
> >              continue;
> > +        if (is_private && kvm_gfn_shared_mask(kvm) &&
> > +            is_large_pte(iter.old_spte)) {
> > +            gfn_t gfn = iter.gfn & ~kvm_gfn_shared_mask(kvm);
> > +            gfn_t mask = KVM_PAGES_PER_HPAGE(iter.level) - 1;
> > +            struct kvm_memory_slot *slot;
> > +            struct kvm_mmu_page *sp;
> > +
> > +            slot = gfn_to_memslot(kvm, gfn);
> > +            if (kvm_hugepage_test_mixed(slot, gfn, iter.level) ||
> > +                (gfn & mask) < start ||
> > +                end < (gfn & mask) + KVM_PAGES_PER_HPAGE(iter.level)) {
> > +                WARN_ON_ONCE(!can_yield);
> > +                if (split_sp) {
> > +                    sp = split_sp;
> > +                    split_sp = NULL;
> > +                    sp->role = tdp_iter_child_role(&iter);
> > +                } else {
> > +                    WARN_ON(iter.yielded);
> > +                    if (flush && can_yield) {
> > +                        kvm_flush_remote_tlbs(kvm);
> > +                        flush = false;
> > +                    }
> Is it necessary to do the flush here?

Because tdp_mmu_alloc_sp_for_split() may unlock mmu_lock and block.
While blocking, other thread operates on KVM MMU and gets confused due to
remaining TLB cache.

> > +                    sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, false);
> > +                    if (iter.yielded) {
> > +                        split_sp = sp;
> > +                        continue;
> > +                    }
> > +                }
> > +                KVM_BUG_ON(!sp, kvm);
> > +
> > +                tdp_mmu_init_sp(sp, iter.sptep, iter.gfn);
> > +                if (tdp_mmu_split_huge_page(kvm, &iter, sp, false)) {
> > +                    kvm_flush_remote_tlbs(kvm);
> > +                    flush = false;
> Why it needs to flush TLB immediately if tdp_mmu_split_huge_page() fails?

Hmm, we don't need it. When breaking up page table, we need to tlb flush
before issuing TDH.MEM.PAGE.DEMOTE(), not after it. Will remove those two lines.

> Also, when KVM MMU write lock is held, it seems tdp_mmu_split_huge_page()
> will not fail.

This can happen with TDX_OPERAND_BUSY with secure-ept tree lock with other
vcpus TDH.VP.ENTER(). TDH.VP.ENTER() can take exclusive lock of secure-EPT.

> But let's assume this condition can be triggered, since sp is local
> variable, it will lost its value after continue, and split_sp is also NULL,
> it will try to allocate a new sp, memory leakage here?

Nice catch. I'll add split_sp = sp;

> > +                    /* force retry on this gfn. */
> > +                    iter.yielded = true;
> > +                } else
> > +                    flush = true;
> > +                continue;
> > +            }
> > +        }
> > +
> >          tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> >          flush = true;
> >      }
> >      rcu_read_unlock();
> > +    if (split_sp) {
> > +        WARN_ON(!can_yield);
> > +        if (flush) {
> > +            kvm_flush_remote_tlbs(kvm);
> > +            flush = false;
> > +        }
> Same here, why we need to do the flush here?
> Can we delay it till the caller do the flush?

No. Because we unlock mmu_lock and may block when freeing memory.

> > +
> > +        write_unlock(&kvm->mmu_lock);
> > +        tdp_mmu_free_sp(split_sp);
> > +        write_lock(&kvm->mmu_lock);
> > +    }
> > +
> >      /*
> >       * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
> >       * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
> > @@ -1606,8 +1670,6 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> >      KVM_BUG_ON(kvm_mmu_page_role_is_private(role) !=
> >                 is_private_sptep(iter->sptep), kvm);
> > -    /* TODO: Large page isn't supported for private SPTE yet. */
> > -    KVM_BUG_ON(kvm_mmu_page_role_is_private(role), kvm);
> >      /*
> >       * Since we are allocating while under the MMU lock we have to be
On 11/21/2023 7:00 PM, Isaku Yamahata wrote:
> On Tue, Nov 21, 2023 at 05:57:28PM +0800,
> Binbin Wu <binbin.wu@linux.intel.com> wrote:
>
>>> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
>>> index 7873e9ee82ad..a209a67decae 100644
>>> --- a/arch/x86/kvm/mmu/tdp_mmu.c
>>> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
>>> @@ -964,6 +964,14 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>>>      return true;
>>>  }
>>> +
>>> +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
>>> +                                                       struct tdp_iter *iter,
>>> +                                                       bool shared);
>>> +
>>> +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
>>> +                                   struct kvm_mmu_page *sp, bool shared);
>>> +
>>>  /*
>>>   * If can_yield is true, will release the MMU lock and reschedule if the
>>>   * scheduler needs the CPU or there is contention on the MMU lock.  If this
>>> @@ -975,13 +983,15 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>>>                    gfn_t start, gfn_t end, bool can_yield, bool flush,
>>>                    bool zap_private)
>>>  {
>>> +    bool is_private = is_private_sp(root);
>>> +    struct kvm_mmu_page *split_sp = NULL;
>>>      struct tdp_iter iter;
>>>      end = min(end, tdp_mmu_max_gfn_exclusive());
>>>      lockdep_assert_held_write(&kvm->mmu_lock);
>>> -    WARN_ON_ONCE(zap_private && !is_private_sp(root));
>>> +    WARN_ON_ONCE(zap_private && !is_private);
>>>      if (!zap_private && is_private_sp(root))
>> Can use is_private instead of is_private_sp(root) here as well.
> I'll update it.
>
>>>          return false;
>>> @@ -1006,12 +1016,66 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>>>              !is_last_spte(iter.old_spte, iter.level))
>>>              continue;
>>> +        if (is_private && kvm_gfn_shared_mask(kvm) &&
>>> +            is_large_pte(iter.old_spte)) {
>>> +            gfn_t gfn = iter.gfn & ~kvm_gfn_shared_mask(kvm);
>>> +            gfn_t mask = KVM_PAGES_PER_HPAGE(iter.level) - 1;
>>> +            struct kvm_memory_slot *slot;
>>> +            struct kvm_mmu_page *sp;
>>> +
>>> +            slot = gfn_to_memslot(kvm, gfn);
>>> +            if (kvm_hugepage_test_mixed(slot, gfn, iter.level) ||
>>> +                (gfn & mask) < start ||
>>> +                end < (gfn & mask) + KVM_PAGES_PER_HPAGE(iter.level)) {
>>> +                WARN_ON_ONCE(!can_yield);
>>> +                if (split_sp) {
>>> +                    sp = split_sp;
>>> +                    split_sp = NULL;
>>> +                    sp->role = tdp_iter_child_role(&iter);
>>> +                } else {
>>> +                    WARN_ON(iter.yielded);
>>> +                    if (flush && can_yield) {
>>> +                        kvm_flush_remote_tlbs(kvm);
>>> +                        flush = false;
>>> +                    }
>> Is it necessary to do the flush here?
> Because tdp_mmu_alloc_sp_for_split() may unlock mmu_lock and block.
> While blocking, other thread operates on KVM MMU and gets confused due to
> remaining TLB cache.
>
>>> +                    sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, false);
>>> +                    if (iter.yielded) {
>>> +                        split_sp = sp;
>>> +                        continue;
>>> +                    }
>>> +                }
>>> +                KVM_BUG_ON(!sp, kvm);
>>> +
>>> +                tdp_mmu_init_sp(sp, iter.sptep, iter.gfn);
>>> +                if (tdp_mmu_split_huge_page(kvm, &iter, sp, false)) {
>>> +                    kvm_flush_remote_tlbs(kvm);
>>> +                    flush = false;
>> Why it needs to flush TLB immediately if tdp_mmu_split_huge_page() fails?
> Hmm, we don't need it. When breaking up page table, we need to tlb flush
> before issuing TDH.MEM.PAGE.DEMOTE(), not after it. Will remove those two lines.
>
>> Also, when KVM MMU write lock is held, it seems tdp_mmu_split_huge_page()
>> will not fail.
> This can happen with TDX_OPERAND_BUSY with secure-ept tree lock with other
> vcpus TDH.VP.ENTER(). TDH.VP.ENTER() can take exclusive lock of secure-EPT.
>
>> But let's assume this condition can be triggered, since sp is local
>> variable, it will lost its value after continue, and split_sp is also NULL,
>> it will try to allocate a new sp, memory leakage here?
> Nice catch. I'll add split_sp = sp;
>
>>> +                    /* force retry on this gfn. */
>>> +                    iter.yielded = true;
>>> +                } else
>>> +                    flush = true;
>>> +                continue;
>>> +            }
>>> +        }
>>> +
>>>          tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
>>>          flush = true;
>>>      }
>>>      rcu_read_unlock();
>>> +    if (split_sp) {
>>> +        WARN_ON(!can_yield);
>>> +        if (flush) {
>>> +            kvm_flush_remote_tlbs(kvm);
>>> +            flush = false;
>>> +        }
>> Same here, why we need to do the flush here?
>> Can we delay it till the caller do the flush?
> No. Because we unlock mmu_lock and may block when freeing memory.

But I don't find it may block during freeing memory. Did I miss anything?

>>> +
>>> +        write_unlock(&kvm->mmu_lock);
>>> +        tdp_mmu_free_sp(split_sp);
>>> +        write_lock(&kvm->mmu_lock);
>>> +    }
>>> +
>>>      /*
>>>       * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
>>>       * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
>>> @@ -1606,8 +1670,6 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
>>>      KVM_BUG_ON(kvm_mmu_page_role_is_private(role) !=
>>>                 is_private_sptep(iter->sptep), kvm);
>>> -    /* TODO: Large page isn't supported for private SPTE yet. */
>>> -    KVM_BUG_ON(kvm_mmu_page_role_is_private(role), kvm);
>>>      /*
>>>       * Since we are allocating while under the MMU lock we have to be
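Setting aside whether tdp_mmu_free_sp() can actually block here, the ordering Isaku argues for in the thread is: flush any deferred TLB invalidations before any step that drops mmu_lock and may sleep, so that no other MMU user runs against stale translations in the meantime. A small standalone sketch of that pattern follows; flush_remote_tlbs() and do_blocking_work() are hypothetical stand-ins, not the KVM API.

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins: flush_remote_tlbs() models the remote TLB flush,
 * do_blocking_work() models any step that releases the MMU lock and may
 * sleep (allocating or freeing a shadow page, for example). */
static void flush_remote_tlbs(void) { puts("flush remote TLBs"); }
static void do_blocking_work(void)  { puts("unlock; work that may sleep; relock"); }

/*
 * If SPTEs were already zapped but the flush was deferred (flush == true),
 * flush before dropping the lock so the deferred invalidations cannot be
 * observed as stale state while this thread is blocked.
 */
static bool drop_lock_and_work(bool flush)
{
	if (flush) {
		flush_remote_tlbs();
		flush = false;
	}
	do_blocking_work();
	return flush;	/* nothing left pending for the caller to flush */
}

int main(void)
{
	bool flush = true;	/* pretend some leaf SPTEs were zapped above */

	flush = drop_lock_and_work(flush);
	printf("flush still pending: %d\n", flush);	/* 0 */
	return 0;
}
```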