
[v2,9/9] KVM: x86/mmu: Promote pages in-place when disabling dirty logging

Message ID 20220321224358.1305530-10-bgardon@google.com (mailing list archive)
State New, archived
Series KVM: x86/MMU: Optimize disabling dirty logging

Commit Message

Ben Gardon March 21, 2022, 10:43 p.m. UTC
When disabling dirty logging, the TDP MMU currently zaps each leaf entry
mapping memory in the relevant memslot. This is very slow. Doing the zaps
under the mmu read lock requires a TLB flush for every zap and the
zapping causes a storm of EPT/NPT violations.

Instead of zapping, replace the split large pages with large page
mappings directly. While this sort of operation has historically only
been done in the vCPU page fault handler context, refactorings earlier
in this series and the relative simplicity of the TDP MMU make it
possible here as well.

Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
of memory per vCPU, this reduces the time required to disable dirty
logging from over 45 seconds to just over 1 second. It also avoids
provoking page faults, improving vCPU performance while disabling
dirty logging.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/mmu.c          |  4 +-
 arch/x86/kvm/mmu/mmu_internal.h |  6 +++
 arch/x86/kvm/mmu/tdp_mmu.c      | 73 ++++++++++++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 4 deletions(-)

Comments

David Matlack March 28, 2022, 5:45 p.m. UTC | #1
On Mon, Mar 21, 2022 at 03:43:58PM -0700, Ben Gardon wrote:
> When disabling dirty logging, the TDP MMU currently zaps each leaf entry
> mapping memory in the relevant memslot. This is very slow. Doing the zaps
> under the mmu read lock requires a TLB flush for every zap and the
> zapping causes a storm of ETP/NPT violations.
> 
> Instead of zapping, replace the split large pages with large page
> mappings directly. While this sort of operation has historically only
> been done in the vCPU page fault handler context, refactorings earlier
> in this series and the relative simplicity of the TDP MMU make it
> possible here as well.
> 
> Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
> of memory per vCPU, this reduces the time required to disable dirty
> logging from over 45 seconds to just over 1 second. It also avoids
> provoking page faults, improving vCPU performance while disabling
> dirty logging.
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          |  4 +-
>  arch/x86/kvm/mmu/mmu_internal.h |  6 +++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 73 ++++++++++++++++++++++++++++++++-
>  3 files changed, 79 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6f98111f8f8b..a99c23ef90b6 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -100,7 +100,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>   */
>  bool tdp_enabled = false;
>  
> -static int max_huge_page_level __read_mostly;
> +int max_huge_page_level;
>  static int tdp_root_level __read_mostly;
>  static int max_tdp_level __read_mostly;
>  
> @@ -4486,7 +4486,7 @@ static inline bool boot_cpu_is_amd(void)
>   * the direct page table on host, use as much mmu features as
>   * possible, however, kvm currently does not do execution-protection.
>   */
> -static void
> +void
>  build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
>  				int shadow_root_level)
>  {
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1bff453f7cbe..6c08a5731fcb 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  
> +void
> +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> +				int shadow_root_level);
> +
> +extern int max_huge_page_level __read_mostly;
> +
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index af60922906ef..eb8929e394ec 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
>  		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
>  }
>  
> +static bool try_promote_lpage(struct kvm *kvm,
> +			      const struct kvm_memory_slot *slot,
> +			      struct tdp_iter *iter)

Use "huge_page" instead of "lpage" to be consistent with eager page
splitting and the rest of the Linux kernel. Some of the old KVM methods
still use "lpage" and "large page", but we're slowly moving away from
that.

> +{
> +	struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> +	struct rsvd_bits_validate shadow_zero_check;
> +	bool map_writable;
> +	kvm_pfn_t pfn;
> +	u64 new_spte;
> +	u64 mt_mask;
> +
> +	/*
> +	 * If addresses are being invalidated, don't do in-place promotion to
> +	 * avoid accidentally mapping an invalidated address.
> +	 */
> +	if (unlikely(kvm->mmu_notifier_count))
> +		return false;

Why is this necessary? Seeing this makes me wonder if we need a similar
check for eager page splitting.

> +
> +	if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> +	    iter->gfn >= slot->base_gfn + slot->npages)
> +		return false;
> +
> +	pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> +				   &map_writable, NULL);
> +	if (is_error_noslot_pfn(pfn))
> +		return false;
> +
> +	/*
> +	 * Can't reconstitute an lpage if the consituent pages can't be
> +	 * mapped higher.
> +	 */
> +	if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> +						    pfn, PG_LEVEL_NUM))
> +		return false;
> +
> +	build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> +
> +	/*
> +	 * In some cases, a vCPU pointer is required to get the MT mask,
> +	 * however in most cases it can be generated without one. If a
> +	 * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> +	 * In that case, bail on in-place promotion.
> +	 */
> +	if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
> +							   kvm_is_mmio_pfn(pfn),
> +							   &mt_mask)))
> +		return false;
> +
> +	__make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
> +		  map_writable, mt_mask, &shadow_zero_check, &new_spte);
> +
> +	if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> +		return true;
> +
> +	/* Re-read the SPTE as it must have been changed by another thread. */
> +	iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));

Huge page promotion could be retried in this case.

> +
> +	return false;
> +}
> +
>  /*
>   * Clear leaf entries which could be replaced by large mappings, for
>   * GFNs within the slot.
> @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>  			continue;
>  
> -		if (!is_shadow_present_pte(iter.old_spte) ||
> -		    !is_last_spte(iter.old_spte, iter.level))
> +		if (iter.level > max_huge_page_level ||
> +		    iter.gfn < slot->base_gfn ||
> +		    iter.gfn >= slot->base_gfn + slot->npages)

I feel like I've been seeing this "does slot contain gfn" calculation a
lot in recent commits. It's probably time to create a helper function.
No need to do this clean up as part of your series though, unless you
want to :).
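
For illustration, such a helper might look something like this (the name and
placement are hypothetical):

	static inline bool gfn_in_memslot(const struct kvm_memory_slot *slot,
					  gfn_t gfn)
	{
		/* True iff gfn falls inside [base_gfn, base_gfn + npages). */
		return gfn >= slot->base_gfn &&
		       gfn < slot->base_gfn + slot->npages;
	}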

> +			continue;
> +
> +		if (!is_shadow_present_pte(iter.old_spte))
> +			continue;
> +
> +		/* Try to promote the constitutent pages to an lpage. */
> +		if (!is_last_spte(iter.old_spte, iter.level) &&
> +		    try_promote_lpage(kvm, slot, &iter))
>  			continue;

If iter.old_spte is not a leaf, the old loop would always continue to
the next SPTE. Now we try to promote it, and if that fails we run through
the rest of the loop. This seems broken. For example, in the next line
we end up grabbing the pfn of the non-leaf SPTE (which would be the PFN
of the TDP MMU page table?) and treating that as the PFN backing this GFN,
which is wrong.

In the worst case we end up zapping an SPTE that we didn't need to, but
we should still fix up this code.
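
Purely as an illustration (and along the lines of the restructuring suggested
further down in the thread), keeping non-leaf SPTEs on their own path would
avoid the fall-through, e.g.:

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * A non-leaf SPTE is only a candidate for promotion; whether
		 * or not that succeeds, don't fall through to the leaf-only
		 * pfn/zap logic below.
		 */
		if (!is_last_spte(iter.old_spte, iter.level)) {
			try_promote_huge_page(kvm, slot, &iter);
			continue;
		}

		pfn = spte_to_pfn(iter.old_spte);

(try_promote_huge_page() here is just the helper from this patch, using the
huge_page naming discussed above.)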

>  
>  		pfn = spte_to_pfn(iter.old_spte);
> -- 
> 2.35.1.894.gb6a874cedc-goog
>
Ben Gardon March 28, 2022, 6:07 p.m. UTC | #2
On Mon, Mar 28, 2022 at 10:45 AM David Matlack <dmatlack@google.com> wrote:
>
> On Mon, Mar 21, 2022 at 03:43:58PM -0700, Ben Gardon wrote:
> > When disabling dirty logging, the TDP MMU currently zaps each leaf entry
> > mapping memory in the relevant memslot. This is very slow. Doing the zaps
> > under the mmu read lock requires a TLB flush for every zap and the
> > zapping causes a storm of ETP/NPT violations.
> >
> > Instead of zapping, replace the split large pages with large page
> > mappings directly. While this sort of operation has historically only
> > been done in the vCPU page fault handler context, refactorings earlier
> > in this series and the relative simplicity of the TDP MMU make it
> > possible here as well.
> >
> > Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
> > of memory per vCPU, this reduces the time required to disable dirty
> > logging from over 45 seconds to just over 1 second. It also avoids
> > provoking page faults, improving vCPU performance while disabling
> > dirty logging.
> >
> > Signed-off-by: Ben Gardon <bgardon@google.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c          |  4 +-
> >  arch/x86/kvm/mmu/mmu_internal.h |  6 +++
> >  arch/x86/kvm/mmu/tdp_mmu.c      | 73 ++++++++++++++++++++++++++++++++-
> >  3 files changed, 79 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 6f98111f8f8b..a99c23ef90b6 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -100,7 +100,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
> >   */
> >  bool tdp_enabled = false;
> >
> > -static int max_huge_page_level __read_mostly;
> > +int max_huge_page_level;
> >  static int tdp_root_level __read_mostly;
> >  static int max_tdp_level __read_mostly;
> >
> > @@ -4486,7 +4486,7 @@ static inline bool boot_cpu_is_amd(void)
> >   * the direct page table on host, use as much mmu features as
> >   * possible, however, kvm currently does not do execution-protection.
> >   */
> > -static void
> > +void
> >  build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> >                               int shadow_root_level)
> >  {
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 1bff453f7cbe..6c08a5731fcb 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >
> > +void
> > +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> > +                             int shadow_root_level);
> > +
> > +extern int max_huge_page_level __read_mostly;
> > +
> >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index af60922906ef..eb8929e394ec 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
> >               clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
> >  }
> >
> > +static bool try_promote_lpage(struct kvm *kvm,
> > +                           const struct kvm_memory_slot *slot,
> > +                           struct tdp_iter *iter)
>
> Use "huge_page" instead of "lpage" to be consistent with eager page
> splitting and the rest of the Linux kernel. Some of the old KVM methods
> still use "lpage" and "large page", but we're slowly moving away from
> that.

Ah good catch. Paolo, if you want me to send a v2 to address all these
comments, I can. Otherwise I'll just reply to the questions below.

>
> > +{
> > +     struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> > +     struct rsvd_bits_validate shadow_zero_check;
> > +     bool map_writable;
> > +     kvm_pfn_t pfn;
> > +     u64 new_spte;
> > +     u64 mt_mask;
> > +
> > +     /*
> > +      * If addresses are being invalidated, don't do in-place promotion to
> > +      * avoid accidentally mapping an invalidated address.
> > +      */
> > +     if (unlikely(kvm->mmu_notifier_count))
> > +             return false;
>
> Why is this necessary? Seeing this makes me wonder if we need a similar
> check for eager page splitting.

This is needed here, but not in the page splitting case, because we
are potentially mapping new memory.
If a page is split for dirty logging, but then the backing transparent
huge page is split for some reason, we could race with the THP split.
Since we're mapping the entire huge page, this could wind up mapping
more memory than it should. Checking the MMU notifier count prevents
that. It's not needed in the splitting case because the memory in
question is already mapped. We're essentially trying to do what the
page fault handler does, since we know that's safe and it's what
replaces the zapped page with a huge page. The page fault handler
checks for MMU notifiers, so we need to as well.
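
As a sketch of how that rationale could be captured in the code itself (the
check is the one from the patch, the comment wording is illustrative):

	/*
	 * If any range is being invalidated (e.g. because a backing THP is
	 * being split by the primary MMU), installing a huge SPTE here could
	 * map more memory than the invalidation intends to leave mapped.
	 * Bail and let the SPTE be zapped; the page fault path will rebuild
	 * the huge mapping with its own MMU notifier checks.
	 */
	if (unlikely(kvm->mmu_notifier_count))
		return false;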
>
> > +
> > +     if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> > +         iter->gfn >= slot->base_gfn + slot->npages)
> > +             return false;
> > +
> > +     pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> > +                                &map_writable, NULL);
> > +     if (is_error_noslot_pfn(pfn))
> > +             return false;
> > +
> > +     /*
> > +      * Can't reconstitute an lpage if the consituent pages can't be
> > +      * mapped higher.
> > +      */
> > +     if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> > +                                                 pfn, PG_LEVEL_NUM))
> > +             return false;
> > +
> > +     build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> > +
> > +     /*
> > +      * In some cases, a vCPU pointer is required to get the MT mask,
> > +      * however in most cases it can be generated without one. If a
> > +      * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> > +      * In that case, bail on in-place promotion.
> > +      */
> > +     if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
> > +                                                        kvm_is_mmio_pfn(pfn),
> > +                                                        &mt_mask)))
> > +             return false;
> > +
> > +     __make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
> > +               map_writable, mt_mask, &shadow_zero_check, &new_spte);
> > +
> > +     if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> > +             return true;
> > +
> > +     /* Re-read the SPTE as it must have been changed by another thread. */
> > +     iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
>
> Huge page promotion could be retried in this case.

That's true, but retries always get complicated since we need to
guarantee forward progress and then you get into counting retries and
it adds complexity. Given how rare this race should be, I'm inclined
to just let it fall back to zapping the spte.

>
> > +
> > +     return false;
> > +}
> > +
> >  /*
> >   * Clear leaf entries which could be replaced by large mappings, for
> >   * GFNs within the slot.
> > @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> >               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
> >                       continue;
> >
> > -             if (!is_shadow_present_pte(iter.old_spte) ||
> > -                 !is_last_spte(iter.old_spte, iter.level))
> > +             if (iter.level > max_huge_page_level ||
> > +                 iter.gfn < slot->base_gfn ||
> > +                 iter.gfn >= slot->base_gfn + slot->npages)
>
> I feel like I've been seeing this "does slot contain gfn" calculation a
> lot in recent commits. It's probably time to create a helper function.
> No need to do this clean up as part of your series though, unless you
> want to :).
>
> > +                     continue;
> > +
> > +             if (!is_shadow_present_pte(iter.old_spte))
> > +                     continue;
> > +
> > +             /* Try to promote the constitutent pages to an lpage. */
> > +             if (!is_last_spte(iter.old_spte, iter.level) &&
> > +                 try_promote_lpage(kvm, slot, &iter))
> >                       continue;
>
> If iter.old_spte is not a leaf, the only loop would always continue to
> the next SPTE. Now we try to promote it and if that fails we run through
> the rest of the loop. This seems broken. For example, in the next line
> we end up grabbing the pfn of the non-leaf SPTE (which would be the PFN
> of the TDP MMU page table?) and treat that as the PFN backing this GFN,
> which is wrong.
>
> In the worst case we end up zapping an SPTE that we didn't need to, but
> we should still fix up this code.
>
> >
> >               pfn = spte_to_pfn(iter.old_spte);
> > --
> > 2.35.1.894.gb6a874cedc-goog
> >
David Matlack March 28, 2022, 6:20 p.m. UTC | #3
On Mon, Mar 28, 2022 at 11:07 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Mon, Mar 28, 2022 at 10:45 AM David Matlack <dmatlack@google.com> wrote:
> >
> > On Mon, Mar 21, 2022 at 03:43:58PM -0700, Ben Gardon wrote:
> > > When disabling dirty logging, the TDP MMU currently zaps each leaf entry
> > > mapping memory in the relevant memslot. This is very slow. Doing the zaps
> > > under the mmu read lock requires a TLB flush for every zap and the
> > > zapping causes a storm of ETP/NPT violations.
> > >
> > > Instead of zapping, replace the split large pages with large page
> > > mappings directly. While this sort of operation has historically only
> > > been done in the vCPU page fault handler context, refactorings earlier
> > > in this series and the relative simplicity of the TDP MMU make it
> > > possible here as well.
> > >
> > > Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
> > > of memory per vCPU, this reduces the time required to disable dirty
> > > logging from over 45 seconds to just over 1 second. It also avoids
> > > provoking page faults, improving vCPU performance while disabling
> > > dirty logging.
> > >
> > > Signed-off-by: Ben Gardon <bgardon@google.com>
> > > ---
> > >  arch/x86/kvm/mmu/mmu.c          |  4 +-
> > >  arch/x86/kvm/mmu/mmu_internal.h |  6 +++
> > >  arch/x86/kvm/mmu/tdp_mmu.c      | 73 ++++++++++++++++++++++++++++++++-
> > >  3 files changed, 79 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 6f98111f8f8b..a99c23ef90b6 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -100,7 +100,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
> > >   */
> > >  bool tdp_enabled = false;
> > >
> > > -static int max_huge_page_level __read_mostly;
> > > +int max_huge_page_level;
> > >  static int tdp_root_level __read_mostly;
> > >  static int max_tdp_level __read_mostly;
> > >
> > > @@ -4486,7 +4486,7 @@ static inline bool boot_cpu_is_amd(void)
> > >   * the direct page table on host, use as much mmu features as
> > >   * possible, however, kvm currently does not do execution-protection.
> > >   */
> > > -static void
> > > +void
> > >  build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> > >                               int shadow_root_level)
> > >  {
> > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > > index 1bff453f7cbe..6c08a5731fcb 100644
> > > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > > @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> > >  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> > >  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> > >
> > > +void
> > > +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> > > +                             int shadow_root_level);
> > > +
> > > +extern int max_huge_page_level __read_mostly;
> > > +
> > >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > > index af60922906ef..eb8929e394ec 100644
> > > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > > @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
> > >               clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
> > >  }
> > >
> > > +static bool try_promote_lpage(struct kvm *kvm,
> > > +                           const struct kvm_memory_slot *slot,
> > > +                           struct tdp_iter *iter)
> >
> > Use "huge_page" instead of "lpage" to be consistent with eager page
> > splitting and the rest of the Linux kernel. Some of the old KVM methods
> > still use "lpage" and "large page", but we're slowly moving away from
> > that.
>
> Ah good catch. Paolo, if you want me to send a v2 to address all these
> comments, I can. Otherwise I'll just reply to the questions below.
>
> >
> > > +{
> > > +     struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> > > +     struct rsvd_bits_validate shadow_zero_check;
> > > +     bool map_writable;
> > > +     kvm_pfn_t pfn;
> > > +     u64 new_spte;
> > > +     u64 mt_mask;
> > > +
> > > +     /*
> > > +      * If addresses are being invalidated, don't do in-place promotion to
> > > +      * avoid accidentally mapping an invalidated address.
> > > +      */
> > > +     if (unlikely(kvm->mmu_notifier_count))
> > > +             return false;
> >
> > Why is this necessary? Seeing this makes me wonder if we need a similar
> > check for eager page splitting.
>
> This is needed here, but not in the page splitting case, because we
> are potentially mapping new memory.
> If a page is split for dirt logging, but then the backing transparent
> huge page is split for some reason, we could race with the THP split.
> Since we're mapping the entire huge page, this could wind up mapping
> more memory than it should. Checking the MMU notifier count prevents
> that. It's not needed in the splitting case because the memory in
> question is already mapped. We're essentially trying to do what the
> page fault handler does, since we know that's safe and it's what
> replaces the zapped page with a huge page. The page fault handler
> checks for MMU notifiers, so we need to as well.
> >
> > > +
> > > +     if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> > > +         iter->gfn >= slot->base_gfn + slot->npages)
> > > +             return false;
> > > +
> > > +     pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> > > +                                &map_writable, NULL);
> > > +     if (is_error_noslot_pfn(pfn))
> > > +             return false;
> > > +
> > > +     /*
> > > +      * Can't reconstitute an lpage if the consituent pages can't be
> > > +      * mapped higher.
> > > +      */
> > > +     if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> > > +                                                 pfn, PG_LEVEL_NUM))
> > > +             return false;
> > > +
> > > +     build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> > > +
> > > +     /*
> > > +      * In some cases, a vCPU pointer is required to get the MT mask,
> > > +      * however in most cases it can be generated without one. If a
> > > +      * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> > > +      * In that case, bail on in-place promotion.
> > > +      */
> > > +     if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
> > > +                                                        kvm_is_mmio_pfn(pfn),
> > > +                                                        &mt_mask)))
> > > +             return false;
> > > +
> > > +     __make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
> > > +               map_writable, mt_mask, &shadow_zero_check, &new_spte);
> > > +
> > > +     if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> > > +             return true;
> > > +
> > > +     /* Re-read the SPTE as it must have been changed by another thread. */
> > > +     iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
> >
> > Huge page promotion could be retried in this case.
>
> That's true, but retries always get complicated since we need to
> guarantee forward progress and then you get into counting retries and
> it adds complexity.

There's plenty of unbounded retrying in tdp_mmu.c (search for "goto
retry"). I think that's fine though. I can't imagine a scenario where a
thread is blocked retrying more than a few times.

> Given how rare this race should be, I'm inclined
> to just let it fall back to zapping the spte.

I think that's fine too. Although it'd be pretty easy to plumb by
checking for -EBUSY from tdp_mmu_set_spte_atomic().

Maybe just leave a comment explaining why we don't care about going
through the effort of retrying.
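
For example, the retry plumbing might look something like this (sketch only,
assuming try_promote_huge_page() is reworked to return 0/-errno as discussed
elsewhere in the thread):

		/*
		 * Retry promotion only if the atomic SPTE update lost a race
		 * (-EBUSY); tdp_mmu_set_spte_atomic() re-reads iter.old_spte
		 * on failure, so each retry sees fresh state.  Any other
		 * error falls through to zapping.
		 */
		do {
			ret = try_promote_huge_page(kvm, slot, &iter);
		} while (ret == -EBUSY);

		if (!ret)
			continue;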

>
> >
> > > +
> > > +     return false;
> > > +}
> > > +
> > >  /*
> > >   * Clear leaf entries which could be replaced by large mappings, for
> > >   * GFNs within the slot.
> > > @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> > >               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
> > >                       continue;
> > >
> > > -             if (!is_shadow_present_pte(iter.old_spte) ||
> > > -                 !is_last_spte(iter.old_spte, iter.level))
> > > +             if (iter.level > max_huge_page_level ||
> > > +                 iter.gfn < slot->base_gfn ||
> > > +                 iter.gfn >= slot->base_gfn + slot->npages)
> >
> > I feel like I've been seeing this "does slot contain gfn" calculation a
> > lot in recent commits. It's probably time to create a helper function.
> > No need to do this clean up as part of your series though, unless you
> > want to :).
> >
> > > +                     continue;
> > > +
> > > +             if (!is_shadow_present_pte(iter.old_spte))
> > > +                     continue;
> > > +
> > > +             /* Try to promote the constitutent pages to an lpage. */
> > > +             if (!is_last_spte(iter.old_spte, iter.level) &&
> > > +                 try_promote_lpage(kvm, slot, &iter))
> > >                       continue;
> >
> > If iter.old_spte is not a leaf, the only loop would always continue to
> > the next SPTE. Now we try to promote it and if that fails we run through
> > the rest of the loop. This seems broken. For example, in the next line
> > we end up grabbing the pfn of the non-leaf SPTE (which would be the PFN
> > of the TDP MMU page table?) and treat that as the PFN backing this GFN,
> > which is wrong.
> >
> > In the worst case we end up zapping an SPTE that we didn't need to, but
> > we should still fix up this code.
> >
> > >
> > >               pfn = spte_to_pfn(iter.old_spte);
> > > --
> > > 2.35.1.894.gb6a874cedc-goog
> > >
David Matlack March 28, 2022, 6:21 p.m. UTC | #4
On Mon, Mar 21, 2022 at 3:44 PM Ben Gardon <bgardon@google.com> wrote:
>
> When disabling dirty logging, the TDP MMU currently zaps each leaf entry
> mapping memory in the relevant memslot. This is very slow. Doing the zaps
> under the mmu read lock requires a TLB flush for every zap and the
> zapping causes a storm of ETP/NPT violations.
>
> Instead of zapping, replace the split large pages with large page
> mappings directly. While this sort of operation has historically only
> been done in the vCPU page fault handler context, refactorings earlier
> in this series and the relative simplicity of the TDP MMU make it
> possible here as well.
>
> Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
> of memory per vCPU, this reduces the time required to disable dirty
> logging from over 45 seconds to just over 1 second. It also avoids
> provoking page faults, improving vCPU performance while disabling
> dirty logging.
>
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c          |  4 +-
>  arch/x86/kvm/mmu/mmu_internal.h |  6 +++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 73 ++++++++++++++++++++++++++++++++-
>  3 files changed, 79 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6f98111f8f8b..a99c23ef90b6 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -100,7 +100,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
>   */
>  bool tdp_enabled = false;
>
> -static int max_huge_page_level __read_mostly;
> +int max_huge_page_level;
>  static int tdp_root_level __read_mostly;
>  static int max_tdp_level __read_mostly;
>
> @@ -4486,7 +4486,7 @@ static inline bool boot_cpu_is_amd(void)
>   * the direct page table on host, use as much mmu features as
>   * possible, however, kvm currently does not do execution-protection.
>   */
> -static void
> +void
>  build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
>                                 int shadow_root_level)
>  {
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1bff453f7cbe..6c08a5731fcb 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>
> +void
> +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> +                               int shadow_root_level);
> +
> +extern int max_huge_page_level __read_mostly;
> +
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index af60922906ef..eb8929e394ec 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
>                 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
>  }
>
> +static bool try_promote_lpage(struct kvm *kvm,
> +                             const struct kvm_memory_slot *slot,
> +                             struct tdp_iter *iter)
> +{
> +       struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> +       struct rsvd_bits_validate shadow_zero_check;
> +       bool map_writable;
> +       kvm_pfn_t pfn;
> +       u64 new_spte;
> +       u64 mt_mask;
> +
> +       /*
> +        * If addresses are being invalidated, don't do in-place promotion to
> +        * avoid accidentally mapping an invalidated address.
> +        */
> +       if (unlikely(kvm->mmu_notifier_count))
> +               return false;
> +
> +       if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> +           iter->gfn >= slot->base_gfn + slot->npages)
> +               return false;
> +
> +       pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> +                                  &map_writable, NULL);
> +       if (is_error_noslot_pfn(pfn))
> +               return false;
> +
> +       /*
> +        * Can't reconstitute an lpage if the consituent pages can't be
> +        * mapped higher.
> +        */
> +       if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> +                                                   pfn, PG_LEVEL_NUM))
> +               return false;
> +
> +       build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> +
> +       /*
> +        * In some cases, a vCPU pointer is required to get the MT mask,
> +        * however in most cases it can be generated without one. If a
> +        * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> +        * In that case, bail on in-place promotion.
> +        */
> +       if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
> +                                                          kvm_is_mmio_pfn(pfn),
> +                                                          &mt_mask)))
> +               return false;
> +
> +       __make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
> +                 map_writable, mt_mask, &shadow_zero_check, &new_spte);
> +
> +       if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> +               return true;

Ah shoot, tdp_mmu_set_spte_atomic() now returns 0/-EBUSY, so this
conditional needs to be flipped.

> +
> +       /* Re-read the SPTE as it must have been changed by another thread. */
> +       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));

tdp_mmu_set_spte_atomic() does this for you now.
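
With the bool return, the fix is just inverting the check; with the 0/-errno
return suggested later in the thread, the tail of the function collapses to
(sketch):

	/*
	 * tdp_mmu_set_spte_atomic() returns 0 on success and -EBUSY if it
	 * lost a race, and it already re-reads iter->old_spte on failure, so
	 * the result can be propagated directly.
	 */
	return tdp_mmu_set_spte_atomic(kvm, iter, new_spte);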

> +
> +       return false;
> +}
> +
>  /*
>   * Clear leaf entries which could be replaced by large mappings, for
>   * GFNs within the slot.
> @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>                         continue;
>
> -               if (!is_shadow_present_pte(iter.old_spte) ||
> -                   !is_last_spte(iter.old_spte, iter.level))
> +               if (iter.level > max_huge_page_level ||
> +                   iter.gfn < slot->base_gfn ||
> +                   iter.gfn >= slot->base_gfn + slot->npages)
> +                       continue;
> +
> +               if (!is_shadow_present_pte(iter.old_spte))
> +                       continue;
> +
> +               /* Try to promote the constitutent pages to an lpage. */
> +               if (!is_last_spte(iter.old_spte, iter.level) &&
> +                   try_promote_lpage(kvm, slot, &iter))
>                         continue;
>
>                 pfn = spte_to_pfn(iter.old_spte);
> --
> 2.35.1.894.gb6a874cedc-goog
>
Sean Christopherson April 12, 2022, 4:43 p.m. UTC | #5
On Mon, Mar 21, 2022, Ben Gardon wrote:
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1bff453f7cbe..6c08a5731fcb 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  
> +void
> +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> +				int shadow_root_level);

Same comments from the earlier patch.

> +extern int max_huge_page_level __read_mostly;

Can you put this at the top of the header?  x86.h somehow ended up with extern
variables being declared in the middle of the file and I find it very jarring,
e.g. global definitions are pretty much never buried in the middle of a .c file.

>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index af60922906ef..eb8929e394ec 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
>  		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
>  }
>  
> +static bool try_promote_lpage(struct kvm *kvm,

I believe we've settled on huge_page instead of lpage.

And again, I strongly prefer a 0/-errno return instead of a boolean as seeing
-EBUSY or whatever makes it super obvious that the early returns are failure
paths.
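
Putting the naming and the return type together, the prototype would become
something like (illustrative):

	static int try_promote_huge_page(struct kvm *kvm,
					 const struct kvm_memory_slot *slot,
					 struct tdp_iter *iter);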

> +			      const struct kvm_memory_slot *slot,
> +			      struct tdp_iter *iter)
> +{
> +	struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> +	struct rsvd_bits_validate shadow_zero_check;
> +	bool map_writable;
> +	kvm_pfn_t pfn;
> +	u64 new_spte;
> +	u64 mt_mask;
> +
> +	/*
> +	 * If addresses are being invalidated, don't do in-place promotion to
> +	 * avoid accidentally mapping an invalidated address.
> +	 */
> +	if (unlikely(kvm->mmu_notifier_count))
> +		return false;
> +
> +	if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> +	    iter->gfn >= slot->base_gfn + slot->npages)
> +		return false;
> +
> +	pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> +				   &map_writable, NULL);
> +	if (is_error_noslot_pfn(pfn))
> +		return false;
> +
> +	/*
> +	 * Can't reconstitute an lpage if the consituent pages can't be

"huge page", though honestly I'd just drop the comment, IMO this is more intuitive
then say the checks against the slot stuff above.

> +	 * mapped higher.
> +	 */
> +	if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> +						    pfn, PG_LEVEL_NUM))
> +		return false;
> +
> +	build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> +
> +	/*
> +	 * In some cases, a vCPU pointer is required to get the MT mask,
> +	 * however in most cases it can be generated without one. If a
> +	 * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> +	 * In that case, bail on in-place promotion.
> +	 */
> +	if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,

I wouldn't bother with the "unlikely".  It's wrong for a VM with non-coherent DMA,
and it's very unlikely (heh) to actually be a meaningful optimization in any case.

> +							   kvm_is_mmio_pfn(pfn),
> +							   &mt_mask)))
> +		return false;
> +
> +	__make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,

A comment stating the return type is intentionally ignored would be helpful.  Not
strictly necessary because it's mostly obvious after looking at the details, but
it'd save someone from having to dig into said details.

> +		  map_writable, mt_mask, &shadow_zero_check, &new_spte);

Bad indentation.

> +
> +	if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> +		return true;

And by returning an int, and because the failure path rereads the SPTE for you,
this becomes:

	return tdp_mmu_set_spte_atomic(kvm, iter, new_spte);

> +
> +	/* Re-read the SPTE as it must have been changed by another thread. */
> +	iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
> +
> +	return false;
> +}
> +
>  /*
>   * Clear leaf entries which could be replaced by large mappings, for
>   * GFNs within the slot.

This comment needs to be updated to include the huge page promotion behavior. And
maybe rename the function too?  E.g.

static void zap_or_promote_collapsible_sptes(struct kvm *kvm,
					     struct kvm_mmu_page *root,
					     const struct kvm_memory_slot *slot)

> @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>  		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>  			continue;
>  
> -		if (!is_shadow_present_pte(iter.old_spte) ||
> -		    !is_last_spte(iter.old_spte, iter.level))
> +		if (iter.level > max_huge_page_level ||
> +		    iter.gfn < slot->base_gfn ||
> +		    iter.gfn >= slot->base_gfn + slot->npages)

Isn't this exact check in try_promote_lpage()?  Ditto for the kvm_mmu_max_mapping_level()
check that's just out of sight.  That one in particular can be somewhat expensive,
especially when KVM is fixed to use a helper that disables IRQs so the host page tables
aren't freed while they're being walked.  Oh, and the huge page promotion path
doesn't incorporate the reserved pfn check.

In other words, shouldn't this be:


		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > max_huge_page_level ||
		    iter.gfn < slot->base_gfn ||
		    iter.gfn >= slot->base_gfn + slot->npages)
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

Followed by the promotion stuff.  And then unless I'm overlooking something, "pfn"
can be passed into try_promote_huge_page(), it just needs to be masked appropriately.
I.e. the promotion path can avoid the __gfn_to_pfn_memslot() lookup and also drop
its is_error_noslot_pfn() check since the pfn is pulled from the SPTE and KVM should
never install garbage into the SPTE (emulated/noslot MMIO pfns fail the shadow
present check).

> +			continue;
> +
> +		if (!is_shadow_present_pte(iter.old_spte))
> +			continue;

I strongly prefer to keep the !is_shadow_present_pte() check first, it really
should be the first thing any of these flows check.

> +
> +		/* Try to promote the constitutent pages to an lpage. */
> +		if (!is_last_spte(iter.old_spte, iter.level) &&
> +		    try_promote_lpage(kvm, slot, &iter))

There is an undocumented functional change here, and I can't tell if it's intentional.
If the promotion fails, KVM continues on and zaps the non-leaf shadow page.  If that
is intentional behavior, it should be done in a follow-up patch, e.g. so that it can
be easily reverted if it turns out that zapping e.g. a PUD is bad for performance.

I.e. shouldn't this be:

		if (!is_last_spte(iter.old_spte, iter.level)) {
			try_promote_huge_page(...);
			continue;
		}

and then converted to the current variant in a follow-up?

>  			continue;
>  
>  		pfn = spte_to_pfn(iter.old_spte);
> -- 
> 2.35.1.894.gb6a874cedc-goog
>
Ben Gardon April 25, 2022, 6:09 p.m. UTC | #6
On Tue, Apr 12, 2022 at 9:43 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, Mar 21, 2022, Ben Gardon wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 1bff453f7cbe..6c08a5731fcb 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >
> > +void
> > +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> > +                             int shadow_root_level);
>
> Same comments from the earlier patch.
>
> > +extern int max_huge_page_level __read_mostly;
>
> Can you put this at the top of the heaader?  x86.h somehow ended up with extern
> variables being declared in the middle of the file and I find it very jarring,
> e.g. global definitions are pretty much never buried in the middle of a .c file.

Will do. I'm working on a v3 of this series now.

>
>
> >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index af60922906ef..eb8929e394ec 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
> >               clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
> >  }
> >
> > +static bool try_promote_lpage(struct kvm *kvm,
>
> I believe we've settled on huge_page instead of lpage.
>
> And again, I strongly prefer a 0/-errno return instead of a boolean as seeing
> -EBUSY or whatever makes it super obviously that the early returns are failure
> paths.

Will do. To your and David's comments about retries, this makes the
retry scheme really nice and clean.

>
> > +                           const struct kvm_memory_slot *slot,
> > +                           struct tdp_iter *iter)
> > +{
> > +     struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> > +     struct rsvd_bits_validate shadow_zero_check;
> > +     bool map_writable;
> > +     kvm_pfn_t pfn;
> > +     u64 new_spte;
> > +     u64 mt_mask;
> > +
> > +     /*
> > +      * If addresses are being invalidated, don't do in-place promotion to
> > +      * avoid accidentally mapping an invalidated address.
> > +      */
> > +     if (unlikely(kvm->mmu_notifier_count))
> > +             return false;
> > +
> > +     if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> > +         iter->gfn >= slot->base_gfn + slot->npages)
> > +             return false;
> > +
> > +     pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> > +                                &map_writable, NULL);
> > +     if (is_error_noslot_pfn(pfn))
> > +             return false;
> > +
> > +     /*
> > +      * Can't reconstitute an lpage if the consituent pages can't be
>
> "huge page", though honestly I'd just drop the comment, IMO this is more intuitive
> then say the checks against the slot stuff above.
>
> > +      * mapped higher.
> > +      */
> > +     if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> > +                                                 pfn, PG_LEVEL_NUM))
> > +             return false;
> > +
> > +     build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> > +
> > +     /*
> > +      * In some cases, a vCPU pointer is required to get the MT mask,
> > +      * however in most cases it can be generated without one. If a
> > +      * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> > +      * In that case, bail on in-place promotion.
> > +      */
> > +     if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
>
> I wouldn't bother with the "unlikely".  It's wrong for a VM with non-coherent DMA,
> and it's very unlikely (heh) to actually be a meaningful optimization in any case.
>
> > +                                                        kvm_is_mmio_pfn(pfn),
> > +                                                        &mt_mask)))
> > +             return false;
> > +
> > +     __make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
>
> A comment stating the return type is intentionally ignore would be helpful.  Not
> strictly necessary because it's mostly obvious after looking at the details, but
> it'd save someone from having to dig into said details.
>
> > +               map_writable, mt_mask, &shadow_zero_check, &new_spte);
>
> Bad indentation.
>
> > +
> > +     if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> > +             return true;
>
> And by returning an int, and because the failure path rereads the SPTE for you,
> this becomes:
>
>         return tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
>
> > +
> > +     /* Re-read the SPTE as it must have been changed by another thread. */
> > +     iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
> > +
> > +     return false;
> > +}
> > +
> >  /*
> >   * Clear leaf entries which could be replaced by large mappings, for
> >   * GFNs within the slot.
>
> This comment needs to be updated to include the huge page promotion behavior. And
> maybe renamed the function too?  E.g.
>
> static void zap_or_promote_collapsible_sptes(struct kvm *kvm,
>                                              struct kvm_mmu_page *root,
>                                              const struct kvm_memory_slot *slot)
>
> > @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> >               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
> >                       continue;
> >
> > -             if (!is_shadow_present_pte(iter.old_spte) ||
> > -                 !is_last_spte(iter.old_spte, iter.level))
> > +             if (iter.level > max_huge_page_level ||
> > +                 iter.gfn < slot->base_gfn ||
> > +                 iter.gfn >= slot->base_gfn + slot->npages)
>
> Isn't this exact check in try_promote_lpage()?  Ditto for the kvm_mmu_max_mapping_level()
> check that's just out of sight.  That one in particular can be somewhat expsensive,
> especially when KVM is fixed to use a helper that disable IRQs so the host page tables
> aren't freed while they're being walked.  Oh, and the huge page promotion path
> doesn't incorporate the reserved pfn check.
>
> In other words, shouldn't this be:
>
>
>                 if (!is_shadow_present_pte(iter.old_spte))
>                         continue;
>
>                 if (iter.level > max_huge_page_level ||
>                     iter.gfn < slot->base_gfn ||
>                     iter.gfn >= slot->base_gfn + slot->npages)
>                         continue;
>
>                 pfn = spte_to_pfn(iter.old_spte);
>                 if (kvm_is_reserved_pfn(pfn) ||
>                     iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
>                                                             pfn, PG_LEVEL_NUM))
>                         continue;
>
> Followed by the promotion stuff.  And then unless I'm overlooking something, "pfn"
> can be passed into try_promote_huge_page(), it just needs to be masked appropriately.
> I.e. the promotion path can avoid the __gfn_to_pfn_memslot() lookup and also drop
> its is_error_noslot_pfn() check since the pfn is pulled from the SPTE and KVM should
> never install garbage into the SPTE (emulated/noslot MMIO pfns fail the shadow
> present check).

I'll work on deduplicating the checks. The big distinction is that in
the promotion function, the iterator is still at the non-leaf SPTE, so
if we get the PFN from the SPTE, it'll be the PFN of a page table, not
a PFN backing guest memory. I could use the same GFN to PFN memslot
conversion in both cases, but it seems more expensive than extracting
the PFN from the SPTE.
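
If the walk is reworked to land on a present leaf SPTE first (as floated later
in the thread), the huge pfn could instead be derived from the leaf's pfn, e.g.
(illustrative only, with "level" being the target huge page level):

	/*
	 * Align the 4K pfn taken from a constituent leaf SPTE down to the
	 * start of the huge page being created.
	 */
	pfn = spte_to_pfn(leaf_spte) & ~(kvm_pfn_t)(KVM_PAGES_PER_HPAGE(level) - 1);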

__gfn_to_pfn_memslot() should never return a reserved PFN, right?

>
> > +                     continue;
> > +
> > +             if (!is_shadow_present_pte(iter.old_spte))
> > +                     continue;
>
> I strongly prefer to keep the !is_shadow_present_pte() check first, it really
> should be the first thing any of these flows check.
>
> > +
> > +             /* Try to promote the constitutent pages to an lpage. */
> > +             if (!is_last_spte(iter.old_spte, iter.level) &&
> > +                 try_promote_lpage(kvm, slot, &iter))
>
> There is an undocumented function change here, and I can't tell if it's intentional.
> If the promotion fails, KVM continues on an zaps the non-leaf shadow page.  If that
> is intentional behavior, it should be done in a follow-up patch, e.g. so that it can
> be easily reverted if it turns out that zappping e.g. a PUD is bad for performance.
>
> I.e. shouldn't this be:
>
>                 if (!is_last_spte(iter.old_spte, iter.level)) {
>                         try_promote_huge_page(...);
>                         continue;
>                 }
>
> and then converted to the current variant in a follow-up?

Ah, good point.

>
> >                       continue;
> >
> >               pfn = spte_to_pfn(iter.old_spte);
> > --
> > 2.35.1.894.gb6a874cedc-goog
> >
Sean Christopherson July 12, 2022, 11:21 p.m. UTC | #7
On Mon, Mar 28, 2022, Ben Gardon wrote:
> On Mon, Mar 28, 2022 at 10:45 AM David Matlack <dmatlack@google.com> wrote:
> >
> > On Mon, Mar 21, 2022 at 03:43:58PM -0700, Ben Gardon wrote:
> > > +{
> > > +     struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> > > +     struct rsvd_bits_validate shadow_zero_check;
> > > +     bool map_writable;
> > > +     kvm_pfn_t pfn;
> > > +     u64 new_spte;
> > > +     u64 mt_mask;
> > > +
> > > +     /*
> > > +      * If addresses are being invalidated, don't do in-place promotion to
> > > +      * avoid accidentally mapping an invalidated address.
> > > +      */
> > > +     if (unlikely(kvm->mmu_notifier_count))
> > > +             return false;
> >
> > Why is this necessary? Seeing this makes me wonder if we need a similar
> > check for eager page splitting.
> 
> This is needed here, but not in the page splitting case, because we
> are potentially mapping new memory.

As written, it's required because KVM doesn't check that there's at least one
leaf SPTE for the range.  If KVM were to step down and find a leaf SPTE before
stepping back up to promote, then this check can be dropped because KVM zaps leaf
SPTEs during invalidate_range_start(), and the primary MMU must invalidate the
entire range covered by a huge page if it's splitting a huge page.

I'm inclined to go that route because it allows for a more unified path (with some
other prep work).  Having to find a leaf SPTE could increase the time to disable
dirty logging, but unless it's an order of magnitude or worse, I'm not sure we care
because walking SPTEs doesn't impact vCPUs (unlike actually zapping).

> > > +             /* Try to promote the constitutent pages to an lpage. */
> > > +             if (!is_last_spte(iter.old_spte, iter.level) &&
> > > +                 try_promote_lpage(kvm, slot, &iter))
> > >                       continue;
> >
> > If iter.old_spte is not a leaf, the only loop would always continue to
> > the next SPTE. Now we try to promote it and if that fails we run through
> > the rest of the loop. This seems broken. For example, in the next line
> > we end up grabbing the pfn of the non-leaf SPTE (which would be the PFN
> > of the TDP MMU page table?) and treat that as the PFN backing this GFN,
> > which is wrong.
> >
> > In the worst case we end up zapping an SPTE that we didn't need to, but
> > we should still fix up this code.

My thought to remedy this is to drop the @pfn argument to kvm_mmu_max_mapping_level().
It's used only for historical reasons, where KVM didn't walk the host page tables
to get the max mapping level and instead pulled THP information out of struct page.
I.e. KVM needed the pfn to get the page.

That would also allow KVM to use huge pages for things that aren't backed by
struct page (I know of at least one potential use case).

I _think_ we can do the below.  It's compile tested only at this point, and I
want to split some of the changes into separate patches, e.g. the WARN on the step-up
not going out-of-bounds.  I'll put this on the backburner for now, it's too late
for 5.20, and too many people are OOO :-)

	tdp_root_for_each_pte(iter, root, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		/*
		 * Step down until a PRESENT, leaf SPTE is found, even when
		 * promoting a parent shadow page.  Requiring a leaf SPTE
		 * ensures that KVM is not creating a new mapping while an MMU
		 * notifier invalidation is in-progress (KVM zaps only leaf
		 * SPTEs in response to MMU notifier invalidation events), and
		 * avoids doing work for shadow pages with no children.
		 */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							      PG_LEVEL_NUM);
		if (iter.level == max_mapping_level)
			continue;

		/*
		 * KVM zaps leaf SPTEs when handling MMU notifier invalidations,
		 * and the primary MMU is supposed to invalidate secondary MMUs
		 * _before_ zapping PTEs in the host page tables.  It should be
		 * impossible for a leaf SPTE to violate the host mapping level.
		 */
		if (WARN_ON_ONCE(max_mapping_level < iter.level))
			continue;

		/*
		 * The page can be remapped at a higher level, so step
		 * up to zap the parent SPTE.
		 */
		while (max_mapping_level > iter.level)
			tdp_iter_step_up(&iter);

		/*
		 * Stepping up should not cause the iter to go out of range of
		 * the memslot, the max mapping level is bounded by the memslot
		 * (among other things).
		 */
		if (WARN_ON_ONCE(iter.gfn < start || iter.gfn >= end))
			continue;

		/*
		 * Attempt to promote the non-leaf SPTE to a huge page.  If the
		 * promotion fails, zap the SPTE and let it be rebuilt on the
		 * next associated TDP page fault.
		 */
		if (!try_promote_to_huge_page(kvm, &rsvd_bits, slot, &iter))
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		tdp_mmu_zap_spte_atomic(kvm, &iter);

		/*
		 * If the atomic zap fails, the iter will recurse back into
		 * the same subtree to retry.
		 */
	}

and then promotion shrinks a decent amount too as it's just getting the pfn,
memtype, and making the SPTE.

  static int try_promote_to_huge_page(struct kvm *kvm,
				      struct rsvd_bits_validate *rsvd_bits,
				      const struct kvm_memory_slot *slot,
				      struct tdp_iter *iter)
  {
	struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
	kvm_pfn_t pfn;
	u64 new_spte;
	u8 mt_mask;
	int r;

	/*
	 * Treat the lookup as a write "fault", in-place promotion is used only
	 * when disabling dirty logging, which requires a writable memslot.
	 */
	pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true, NULL, NULL);
	if (is_error_noslot_pfn(pfn))
		return -EINVAL;

	/*
	 * In some cases, a vCPU pointer is required to get the MT mask,
	 * however in most cases it can be generated without one. If a
	 * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
	 * In that case, bail on in-place promotion.
	 */
	r = static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
						 kvm_is_mmio_pfn(pfn), &mt_mask);
	if (r)
		return r;

	__make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
		    true, mt_mask, rsvd_bits, &new_spte);

	return tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
  }
Sean Christopherson July 13, 2022, 4:20 p.m. UTC | #8
On Tue, Jul 12, 2022, Sean Christopherson wrote:
> On Mon, Mar 28, 2022, Ben Gardon wrote:
> > On Mon, Mar 28, 2022 at 10:45 AM David Matlack <dmatlack@google.com> wrote:
> > > If iter.old_spte is not a leaf, the only loop would always continue to
> > > the next SPTE. Now we try to promote it and if that fails we run through
> > > the rest of the loop. This seems broken. For example, in the next line
> > > we end up grabbing the pfn of the non-leaf SPTE (which would be the PFN
> > > of the TDP MMU page table?) and treat that as the PFN backing this GFN,
> > > which is wrong.
> > >
> > > In the worst case we end up zapping an SPTE that we didn't need to, but
> > > we should still fix up this code.
> 
> My thought to remedy this is to drop the @pfn argument to kvm_mmu_max_mapping_level().
> It's used only for historical reasons, where KVM didn't walk the host page tables
> to get the max mapping level and instead pulled THP information out of struct page.
> I.e. KVM needed the pfn to get the page.
> 
> That would also allow KVM to use huge pages for things that aren't backed by
> struct page (I know of at least one potential use case).
> 
> I _think_ we can do the below.  It's compile tested only at this point, and I
> want to split some of the changes into separate patches, e.g. the WARN on the step-up
> not going out-of-bounds.  I'll put this on the backburner for now, it's too late
> for 5.20, and too many people are OOO :-)

Heh, that was a bit of a lie.  _Now_ it's going on the backburner.

Thinking about the pfn coming from the old leaf SPTE made me realize all of the
information we need to use __make_spte() during promotion is available in the
existing leaf SPTE.

If KVM first retrieves a PRESENT leaf SPTE, then pfn _can't_ be different, because
that would mean KVM done messed up and didn't zap existing entries in response to
an MMU notifier invalidation, and holding mmu_lock prevents new invalidations.
And because the primary MMU must invalidate before splitting a huge page, having a
valid leaf SPTE means that the host mapping level can't become stale until mmu_lock is
dropped.  In other words, KVM can compute the huge pfn by using the smaller pfn
and adjusting based on the target mapping level.
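
In code, the "adjusting" is nothing more than masking off the low bits of the
pfn, e.g. for a 2M promotion (just an illustration of the math used in the
helper below; huge_pfn/leaf_spte are illustrative names):

	/*
	 * KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) == 512, so the huge pfn is the
	 * leaf's pfn rounded down to a 512-page (2MiB) boundary.
	 */
	huge_pfn = spte_to_pfn(leaf_spte) & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);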

As for the EPT memtype, that can also come from the existing leaf SPTE.  KVM only
forces the memtype for host MMIO pfns, and if the primary MMU maps a huge page that
straddles host MMIO (UC) and regular memory (WB), then the kernel is already hosed.
If the VM doesn't have non-coherent DMA, then the EPT memtype will be WB regardless
of the page size.  That means KVM just needs to reject promotion if the VM has
non-coherent DMA and the target pfn is not host MMIO, else KVM can use the leaf's
memtype as-is.

Using the pfn avoids gup() (fast-only, but still), and using the memtype avoids
having to split vmx_get_mt_mask() and add another kvm_x86_ops hook.

And digging into all of that yielded another optimization.  kvm_tdp_page_fault()
needs to restrict the host mapping level if and only if it may consume the guest
MTRRs.  If KVM ignores the guest MTRRs, then whether or not they're consistent
across a TDP page is irrelevant because the _guest_ MTRRs are completely virtual
and are not consumed by either EPT or NPT.  I doubt this meaningfully affects
whether or not KVM can create huge pages for real world VMs, but it does avoid
having to walk the guest variable MTRRs when faulting in a huge page.

Compile tested only at this point, but I'm mostly certain my logic is sound.

int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	/*
	 * If the guest's MTRRs may be used, restrict the mapping level to
	 * ensure KVM uses a consistent memtype across the entire mapping.
	 */
	if (kvm_may_need_guest_mtrrs(vcpu->kvm)) {
		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
			gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);

			if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
				break;
		}
	}

	return direct_page_fault(vcpu, fault);
}
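
/*
 * Assumed shape for the kvm_may_need_guest_mtrrs() helper used above and
 * below (not yet written): guest MTRRs can only be consumed when KVM honors
 * guest memtype, i.e. when EPT is in use (shadow_memtype_mask != 0) and the
 * VM has non-coherent DMA.
 */
static bool kvm_may_need_guest_mtrrs(struct kvm *kvm)
{
	return shadow_memtype_mask && kvm_arch_has_noncoherent_dma(kvm);
}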

static int try_promote_to_huge_page(struct kvm *kvm,
				    struct rsvd_bits_validate *rsvd_bits,
				    const struct kvm_memory_slot *slot,
				    u64 leaf_spte, struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
	kvm_pfn_t pfn;
	u64 new_spte;
	u8 mt_mask;

	if (WARN_ON_ONCE(slot->flags & KVM_MEM_READONLY))
		return -EINVAL;

	pfn = spte_to_pfn(leaf_spte) & ~(KVM_PAGES_PER_HPAGE(iter->level) - 1);
	mt_mask = leaf_spte & shadow_memtype_mask;

	/*
	 * Bail if KVM needs guest MTRRs to compute the memtype and will not
	 * force the memtype (host MMIO).  There is no guarantee the guest uses
	 * a consistent MTRR memtype for the entire huge page, and MTRRs are
	 * tracked per vCPU, not per VM.
	 */
	if (kvm_may_need_guest_mtrrs(kvm) && !kvm_is_mmio_pfn(pfn))
		return -EIO;

	__make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
		    true, mt_mask, rsvd_bits, &new_spte);

	return tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
}
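
The caller in zap_collapsible_spte_range() would presumably need to snapshot
the leaf SPTE before stepping the iterator up, since stepping up replaces
iter.old_spte with the parent SPTE.  Something like (again, only a sketch):

	u64 leaf_spte = iter.old_spte;

	/* The page can be remapped at a higher level, so step up to the parent. */
	while (max_mapping_level > iter.level)
		tdp_iter_step_up(&iter);

	if (!try_promote_to_huge_page(kvm, &rsvd_bits, slot, leaf_spte, &iter))
		continue;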

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f98111f8f8b..a99c23ef90b6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -100,7 +100,7 @@  module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
  */
 bool tdp_enabled = false;
 
-static int max_huge_page_level __read_mostly;
+int max_huge_page_level;
 static int tdp_root_level __read_mostly;
 static int max_tdp_level __read_mostly;
 
@@ -4486,7 +4486,7 @@  static inline bool boot_cpu_is_amd(void)
  * the direct page table on host, use as much mmu features as
  * possible, however, kvm currently does not do execution-protection.
  */
-static void
+void
 build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
 				int shadow_root_level)
 {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1bff453f7cbe..6c08a5731fcb 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -171,4 +171,10 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+void
+build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
+				int shadow_root_level);
+
+extern int max_huge_page_level __read_mostly;
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index af60922906ef..eb8929e394ec 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1709,6 +1709,66 @@  void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
 }
 
+static bool try_promote_lpage(struct kvm *kvm,
+			      const struct kvm_memory_slot *slot,
+			      struct tdp_iter *iter)
+{
+	struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
+	struct rsvd_bits_validate shadow_zero_check;
+	bool map_writable;
+	kvm_pfn_t pfn;
+	u64 new_spte;
+	u64 mt_mask;
+
+	/*
+	 * If addresses are being invalidated, don't do in-place promotion to
+	 * avoid accidentally mapping an invalidated address.
+	 */
+	if (unlikely(kvm->mmu_notifier_count))
+		return false;
+
+	if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
+	    iter->gfn >= slot->base_gfn + slot->npages)
+		return false;
+
+	pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
+				   &map_writable, NULL);
+	if (is_error_noslot_pfn(pfn))
+		return false;
+
+	/*
+	 * Can't reconstitute an lpage if the constituent pages can't be
+	 * mapped higher.
+	 */
+	if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
+						    pfn, PG_LEVEL_NUM))
+		return false;
+
+	build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
+
+	/*
+	 * In some cases a vCPU pointer is required to get the MT mask;
+	 * however, in most cases it can be generated without one. If a
+	 * vCPU pointer is needed, kvm_x86_try_get_mt_mask() will fail.
+	 * In that case, bail on in-place promotion.
+	 */
+	if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
+							   kvm_is_mmio_pfn(pfn),
+							   &mt_mask)))
+		return false;
+
+	__make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
+		  map_writable, mt_mask, &shadow_zero_check, &new_spte);
+
+	if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+		return true;
+
+	/* Re-read the SPTE as it must have been changed by another thread. */
+	iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
+
+	return false;
+}
+
 /*
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
@@ -1729,8 +1789,17 @@  static void zap_collapsible_spte_range(struct kvm *kvm,
 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 			continue;
 
-		if (!is_shadow_present_pte(iter.old_spte) ||
-		    !is_last_spte(iter.old_spte, iter.level))
+		if (iter.level > max_huge_page_level ||
+		    iter.gfn < slot->base_gfn ||
+		    iter.gfn >= slot->base_gfn + slot->npages)
+			continue;
+
+		if (!is_shadow_present_pte(iter.old_spte))
+			continue;
+
+		/* Try to promote the constituent pages to an lpage. */
+		if (!is_last_spte(iter.old_spte, iter.level) &&
+		    try_promote_lpage(kvm, slot, &iter))
 			continue;
 
 		pfn = spte_to_pfn(iter.old_spte);