
[v3,1/9] KVM: x86/mmu: Repurpose KVM MMU shrinker to purge shadow page caches

Message ID 20221222023457.1764-2-vipinsh@google.com (mailing list archive)
State New, archived
Series: NUMA aware page table's pages allocation

Commit Message

Vipin Sharma Dec. 22, 2022, 2:34 a.m. UTC
mmu_shrink_scan() is very disruptive to VMs. It picks the first
VM in the vm_list and zaps the oldest page, which is most likely an
upper level SPTE and most likely to be reused. Prior to the TDP MMU,
this was even more disruptive in the nested VM case, considering L1
SPTEs will be the oldest even though most of the entries are for L2
SPTEs.

As discussed in
https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
shrinker logic has not been very useful in actually keeping VMs performant
and reducing memory usage.

Change mmu_shrink_scan() to free pages from the vCPU's shadow page
cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
VM's performance should not be affected.

This also allows changing cache capacities without worrying too much
about high memory usage in cache.

Tested this change by running dirty_log_perf_test while dropping cache
via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
logs from kvm_mmu_memory_cache_alloc(), which is expected.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Vipin Sharma <vipinsh@google.com>
---
 arch/x86/include/asm/kvm_host.h |   5 +
 arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
 arch/x86/kvm/mmu/mmu_internal.h |   2 +
 arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
 include/linux/kvm_host.h        |   1 +
 virt/kvm/kvm_main.c             |  11 ++-
 6 files changed, 114 insertions(+), 71 deletions(-)

Comments

Ben Gardon Dec. 27, 2022, 6:37 p.m. UTC | #1
On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
>
> mmu_shrink_scan() is very disruptive to VMs. It picks the first
> VM in the vm_list and zaps the oldest page, which is most likely an
> upper level SPTE and most likely to be reused. Prior to the TDP MMU,
> this was even more disruptive in the nested VM case, considering L1
> SPTEs will be the oldest even though most of the entries are for L2
> SPTEs.
>
> As discussed in
> https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
> shrinker logic has not been very useful in actually keeping VMs performant
> and reducing memory usage.
>
> Change mmu_shrink_scan() to free pages from the vCPU's shadow page
> cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
> VM's performance should not be affected.
>
> This also allows changing cache capacities without worrying too much
> about high memory usage in cache.
>
> Tested this change by running dirty_log_perf_test while dropping cache
> via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> logs from kvm_mmu_memory_cache_alloc(), which is expected.

Oh, that's not a good thing. I don't think we want to be hitting those
warnings. For one, kernel warnings should not be expected behavior,
probably for many reasons, but at least because Syzbot will find it.
In this particular case, we don't want to hit that because in that
case we'll try to do a GFP_ATOMIC, which can fail, and if it fails,
we'll BUG:

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
        void *p;

        if (WARN_ON(!mc->nobjs))
                p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
        else
                p = mc->objects[--mc->nobjs];
        BUG_ON(!p);
        return p;
}

Perhaps the risk of actually panicking is small, but it probably
indicates that we need better error handling around failed allocations
from the cache.
Or, the slightly less elegant approach might be to just hold the cache
lock around the cache topup and use of pages from the cache, but
adding better error handling would probably be cleaner.
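
For example, a rough (untested) sketch of the error handling direction,
where an empty cache falls back to an atomic allocation but the failure
is propagated to the caller instead of hitting BUG_ON():

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
        /*
         * The cache can be empty if the shrinker raced with the topup;
         * fall back to an atomic allocation, but let the caller handle
         * a NULL return (e.g. by retrying the fault) instead of BUG()ing.
         */
        if (unlikely(!mc->nobjs))
                return mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);

        return mc->objects[--mc->nobjs];
}

Callers would of course need to be audited to handle a NULL return.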

>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   5 +
>  arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
>  arch/x86/kvm/mmu/mmu_internal.h |   2 +
>  arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
>  include/linux/kvm_host.h        |   1 +
>  virt/kvm/kvm_main.c             |  11 ++-
>  6 files changed, 114 insertions(+), 71 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index aa4eb8cfcd7e..89cc809e4a00 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -786,6 +786,11 @@ struct kvm_vcpu_arch {
>         struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>         struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> +       /*
> +        * Protects change in size of mmu_shadow_page_cache cache.
> +        */
> +       spinlock_t mmu_shadow_page_cache_lock;
> +
>         /*
>          * QEMU userspace and the guest each have their own FPU state.
>          * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 254bc46234e0..157417e1cb6e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -164,7 +164,10 @@ struct kvm_shadow_walk_iterator {
>
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
> -static struct percpu_counter kvm_total_used_mmu_pages;
> +/*
> + * Total number of unused pages in MMU shadow page cache.
> + */
> +static struct percpu_counter kvm_total_unused_mmu_pages;
>
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>
> @@ -655,6 +658,22 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>         }
>  }
>
> +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +                                    spinlock_t *cache_lock)
> +{
> +       int orig_nobjs;
> +       int r;
> +
> +       spin_lock(cache_lock);
> +       orig_nobjs = cache->nobjs;
> +       r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
> +       if (orig_nobjs != cache->nobjs)
> +               percpu_counter_add(&kvm_total_unused_mmu_pages,
> +                                  (cache->nobjs - orig_nobjs));
> +       spin_unlock(cache_lock);
> +       return r;
> +}
> +
>  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  {
>         int r;
> @@ -664,8 +683,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>         if (r)
>                 return r;
> -       r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -                                      PT64_ROOT_MAX_LEVEL);
> +       r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> +                                     &vcpu->arch.mmu_shadow_page_cache_lock);
>         if (r)
>                 return r;
>         if (maybe_indirect) {
> @@ -678,10 +697,25 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>                                           PT64_ROOT_MAX_LEVEL);
>  }
>
> +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +                                    spinlock_t *cache_lock)
> +{
> +       int orig_nobjs;
> +
> +       spin_lock(cache_lock);
> +       orig_nobjs = cache->nobjs;
> +       kvm_mmu_free_memory_cache(cache);
> +       if (orig_nobjs)
> +               percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> +
> +       spin_unlock(cache_lock);
> +}
> +
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +       mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> +                                &vcpu->arch.mmu_shadow_page_cache_lock);
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -1693,27 +1727,15 @@ static int is_empty_shadow_page(u64 *spt)
>  }
>  #endif
>
> -/*
> - * This value is the sum of all of the kvm instances's
> - * kvm->arch.n_used_mmu_pages values.  We need a global,
> - * aggregate version in order to make the slab shrinker
> - * faster
> - */
> -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> -{
> -       kvm->arch.n_used_mmu_pages += nr;
> -       percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> -}
> -
>  static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       kvm_mod_used_mmu_pages(kvm, +1);
> +       kvm->arch.n_used_mmu_pages++;
>         kvm_account_pgtable_pages((void *)sp->spt, +1);
>  }
>
>  static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       kvm_mod_used_mmu_pages(kvm, -1);
> +       kvm->arch.n_used_mmu_pages--;
>         kvm_account_pgtable_pages((void *)sp->spt, -1);
>  }
>
> @@ -2150,8 +2172,31 @@ struct shadow_page_caches {
>         struct kvm_mmu_memory_cache *page_header_cache;
>         struct kvm_mmu_memory_cache *shadow_page_cache;
>         struct kvm_mmu_memory_cache *shadowed_info_cache;
> +       /*
> +        * Protects change in size of shadow_page_cache cache.
> +        */
> +       spinlock_t *shadow_page_cache_lock;
>  };
>
> +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +                                   spinlock_t *cache_lock)
> +{
> +       int orig_nobjs;
> +       void *page;
> +
> +       if (!cache_lock) {
> +               spin_lock(cache_lock);
> +               orig_nobjs = shadow_page_cache->nobjs;
> +       }

I believe this is guaranteed to cause a null pointer dereference.

> +       page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
> +       if (!cache_lock) {
> +               if (orig_nobjs)
> +                       percpu_counter_dec(&kvm_total_unused_mmu_pages);
> +               spin_unlock(cache_lock);

Again, this will cause a null-pointer dereference. The check above
just needs to be inverted.
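
i.e. something like this (sketch, same code with the checks inverted):

	if (cache_lock) {
		spin_lock(cache_lock);
		orig_nobjs = shadow_page_cache->nobjs;
	}
	page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
	if (cache_lock) {
		if (orig_nobjs)
			percpu_counter_dec(&kvm_total_unused_mmu_pages);
		spin_unlock(cache_lock);
	}
	return page;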

> +       }
> +       return page;
> +}
> +
>  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>                                                       struct shadow_page_caches *caches,
>                                                       gfn_t gfn,
> @@ -2161,7 +2206,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>         struct kvm_mmu_page *sp;
>
>         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> -       sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> +       sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> +                                               caches->shadow_page_cache_lock);
>         if (!role.direct)
>                 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>
> @@ -2218,6 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
>                 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
>                 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
>                 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> +               .shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
>         };
>
>         return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> @@ -5916,6 +5963,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>
>         vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +       spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
>
>         vcpu->arch.mmu = &vcpu->arch.root_mmu;
>         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6051,11 +6099,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
>  }
>
> -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> -{
> -       return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> -}
> -
>  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
>                         struct kvm_memory_slot *slot,
>                         struct kvm_page_track_notifier_node *node)
> @@ -6277,6 +6320,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
>         /* Direct SPs do not require a shadowed_info_cache. */
>         caches.page_header_cache = &kvm->arch.split_page_header_cache;
>         caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> +       caches.shadow_page_cache_lock = NULL;
>
>         /* Safe to pass NULL for vCPU since requesting a direct SP. */
>         return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
>  static unsigned long
>  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -       struct kvm *kvm;
> -       int nr_to_scan = sc->nr_to_scan;
> +       struct kvm_mmu_memory_cache *cache;
> +       struct kvm *kvm, *first_kvm = NULL;
>         unsigned long freed = 0;
> +       /* spinlock for memory cache */
> +       spinlock_t *cache_lock;
> +       struct kvm_vcpu *vcpu;
> +       unsigned long i;
>
>         mutex_lock(&kvm_lock);
>
>         list_for_each_entry(kvm, &vm_list, vm_list) {
> -               int idx;
> -               LIST_HEAD(invalid_list);
> -
> -               /*
> -                * Never scan more than sc->nr_to_scan VM instances.
> -                * Will not hit this condition practically since we do not try
> -                * to shrink more than one VM and it is very unlikely to see
> -                * !n_used_mmu_pages so many times.
> -                */
> -               if (!nr_to_scan--)
> +               if (first_kvm == kvm)
>                         break;
> -               /*
> -                * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> -                * here. We may skip a VM instance errorneosly, but we do not
> -                * want to shrink a VM that only started to populate its MMU
> -                * anyway.
> -                */
> -               if (!kvm->arch.n_used_mmu_pages &&
> -                   !kvm_has_zapped_obsolete_pages(kvm))
> -                       continue;
> +               if (!first_kvm)
> +                       first_kvm = kvm;
> +               list_move_tail(&kvm->vm_list, &vm_list);
>
> -               idx = srcu_read_lock(&kvm->srcu);

I think we still want to do the SRCU read lock here to prevent
use-after-free on the vCPUs.
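
e.g. keeping something like (sketch):

		idx = srcu_read_lock(&kvm->srcu);
		kvm_for_each_vcpu(i, vcpu, kvm) {
			/* ... empty the vCPU's shadow page cache ... */
		}
		srcu_read_unlock(&kvm->srcu, idx);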

> -               write_lock(&kvm->mmu_lock);
> +               kvm_for_each_vcpu(i, vcpu, kvm) {
> +                       cache = &vcpu->arch.mmu_shadow_page_cache;
> +                       cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
> +                       if (READ_ONCE(cache->nobjs)) {
> +                               spin_lock(cache_lock);
> +                               freed += kvm_mmu_empty_memory_cache(cache);

Would it make sense to just have kvm_mmu_empty_memory_cache()
decrement the per-cpu counter itself? I don't think there's much perf
to be gained by reducing percpu counter updates here and it would
consolidate the bookkeeping.

> +                               spin_unlock(cache_lock);
> +                       }
>
> -               if (kvm_has_zapped_obsolete_pages(kvm)) {
> -                       kvm_mmu_commit_zap_page(kvm,
> -                             &kvm->arch.zapped_obsolete_pages);
> -                       goto unlock;
>                 }
>
> -               freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
> -
> -unlock:
> -               write_unlock(&kvm->mmu_lock);
> -               srcu_read_unlock(&kvm->srcu, idx);
> -
> -               /*
> -                * unfair on small ones
> -                * per-vm shrinkers cry out
> -                * sadness comes quickly
> -                */

Nooooo, don't delete the beautiful poem!

> -               list_move_tail(&kvm->vm_list, &vm_list);
> -               break;
> +               if (freed >= sc->nr_to_scan)
> +                       break;
>         }
>
> +       if (freed)
> +               percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
>         mutex_unlock(&kvm_lock);
> +       percpu_counter_sync(&kvm_total_unused_mmu_pages);
>         return freed;
>  }
>
>  static unsigned long
>  mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -       return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
> +       return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);

This will return 0 if the sum of all the per-cpu counters is negative.
It should never be negative though. Might be nice to add a warning if
we would get a negative sum.
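
Something like this (sketch):

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        s64 count = percpu_counter_sum(&kvm_total_unused_mmu_pages);

        /* The number of unused cached pages should never go negative. */
        WARN_ON_ONCE(count < 0);

        return count > 0 ? count : 0;
}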

>  }
>
>  static struct shrinker mmu_shrinker = {
> @@ -6820,7 +6847,7 @@ int kvm_mmu_vendor_module_init(void)
>         if (!mmu_page_header_cache)
>                 goto out;
>
> -       if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
> +       if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
>                 goto out;
>
>         ret = register_shrinker(&mmu_shrinker, "x86-mmu");
> @@ -6830,7 +6857,7 @@ int kvm_mmu_vendor_module_init(void)
>         return 0;
>
>  out_shrinker:
> -       percpu_counter_destroy(&kvm_total_used_mmu_pages);
> +       percpu_counter_destroy(&kvm_total_unused_mmu_pages);
>  out:
>         mmu_destroy_caches();
>         return ret;
> @@ -6847,7 +6874,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
>  void kvm_mmu_vendor_module_exit(void)
>  {
>         mmu_destroy_caches();
> -       percpu_counter_destroy(&kvm_total_used_mmu_pages);
> +       percpu_counter_destroy(&kvm_total_unused_mmu_pages);
>         unregister_shrinker(&mmu_shrinker);
>  }
>
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ac00bfbf32f6..c2a342028b6a 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -325,4 +325,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>
> +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +                                   spinlock_t *cache_lock);
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 764f7c87286f..4974fa96deff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -264,7 +264,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>         struct kvm_mmu_page *sp;
>
>         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -       sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +       sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> +                                               &vcpu->arch.mmu_shadow_page_cache_lock);
>
>         return sp;
>  }
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 01aad8b74162..efd9b38ea9a2 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1362,6 +1362,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
>  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
>  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 13e88297f999..f2d762878b97 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -438,8 +438,10 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>         return mc->nobjs;
>  }
>
> -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
>  {
> +       int freed = mc->nobjs;
> +
>         while (mc->nobjs) {
>                 if (mc->kmem_cache)
>                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
> @@ -447,8 +449,13 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
>                         free_page((unsigned long)mc->objects[--mc->nobjs]);
>         }
>
> -       kvfree(mc->objects);
> +       return freed;
> +}
>
> +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +{
> +       kvm_mmu_empty_memory_cache(mc);
> +       kvfree(mc->objects);
>         mc->objects = NULL;
>         mc->capacity = 0;
>  }
> --
> 2.39.0.314.g84b9a713c41-goog
>
Vipin Sharma Dec. 28, 2022, 10:07 p.m. UTC | #2
On Tue, Dec 27, 2022 at 10:37 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
> >
> > mmu_shrink_scan() is very disruptive to VMs. It picks the first
> > VM in the vm_list and zaps the oldest page, which is most likely an
> > upper level SPTE and most likely to be reused. Prior to the TDP MMU,
> > this was even more disruptive in the nested VM case, considering L1
> > SPTEs will be the oldest even though most of the entries are for L2
> > SPTEs.
> >
> > As discussed in
> > https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
> > shrinker logic has not been very useful in actually keeping VMs performant
> > and reducing memory usage.
> >
> > Change mmu_shrink_scan() to free pages from the vCPU's shadow page
> > cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
> > VM's performance should not be affected.
> >
> > This also allows changing cache capacities without worrying too much
> > about high memory usage in cache.
> >
> > Tested this change by running dirty_log_perf_test while dropping cache
> > via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> > continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> > logs from kvm_mmu_memory_cache_alloc(), which is expected.
>
> Oh, that's not a good thing. I don't think we want to be hitting those
> warnings. For one, kernel warnings should not be expected behavior,
> probably for many reasons, but at least because Syzbot will find it.
> In this particular case, we don't want to hit that because in that
> case we'll try to do a GFP_ATOMIC, which can fail, and if it fails,
> we'll BUG:
>
> void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> {
>         void *p;
>
>         if (WARN_ON(!mc->nobjs))
>                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
>         else
>                 p = mc->objects[--mc->nobjs];
>         BUG_ON(!p);
>         return p;
> }
>
> Perhaps the risk of actually panicking is small, but it probably
> indicates that we need better error handling around failed allocations
> from the cache.
> Or, the slightly less elegant approach might be to just hold the cache
> lock around the cache topup and use of pages from the cache, but
> adding better error handling would probably be cleaner.

I was counting on the fact that the shrinker will ideally run only in
extreme cases, i.e. when the host is running low on memory. So, this
WARN_ON will only rarely be hit. I was not aware of Syzbot; it seems
like it will be a concern if it does this kind of testing.

I thought about keeping a mutex, taking it during topup and releasing
it after the whole operation is done, but I dropped that idea as the
mutex would be held for a long time and might block the memory shrinker
longer. I am not sure, though, if this is a valid concern.

I can't think of better error handling for this situation. I can change
the logic to hold a mutex if the above mutex hold duration concern
won't be an issue compared to the current WARN_ON() approach.

>
> >
> > Suggested-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Vipin Sharma <vipinsh@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |   5 +
> >  arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
> >  arch/x86/kvm/mmu/mmu_internal.h |   2 +
> >  arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
> >  include/linux/kvm_host.h        |   1 +
> >  virt/kvm/kvm_main.c             |  11 ++-
> >  6 files changed, 114 insertions(+), 71 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index aa4eb8cfcd7e..89cc809e4a00 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -786,6 +786,11 @@ struct kvm_vcpu_arch {
> >         struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
> >         struct kvm_mmu_memory_cache mmu_page_header_cache;
> >
> > +       /*
> > +        * Protects change in size of mmu_shadow_page_cache cache.
> > +        */
> > +       spinlock_t mmu_shadow_page_cache_lock;
> > +
> >         /*
> >          * QEMU userspace and the guest each have their own FPU state.
> >          * In vcpu_run, we switch between the user and guest FPU contexts.
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 254bc46234e0..157417e1cb6e 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -164,7 +164,10 @@ struct kvm_shadow_walk_iterator {
> >
> >  static struct kmem_cache *pte_list_desc_cache;
> >  struct kmem_cache *mmu_page_header_cache;
> > -static struct percpu_counter kvm_total_used_mmu_pages;
> > +/*
> > + * Total number of unused pages in MMU shadow page cache.
> > + */
> > +static struct percpu_counter kvm_total_unused_mmu_pages;
> >
> >  static void mmu_spte_set(u64 *sptep, u64 spte);
> >
> > @@ -655,6 +658,22 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> >         }
> >  }
> >
> > +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > +                                    spinlock_t *cache_lock)
> > +{
> > +       int orig_nobjs;
> > +       int r;
> > +
> > +       spin_lock(cache_lock);
> > +       orig_nobjs = cache->nobjs;
> > +       r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
> > +       if (orig_nobjs != cache->nobjs)
> > +               percpu_counter_add(&kvm_total_unused_mmu_pages,
> > +                                  (cache->nobjs - orig_nobjs));
> > +       spin_unlock(cache_lock);
> > +       return r;
> > +}
> > +
> >  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >  {
> >         int r;
> > @@ -664,8 +683,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
> >         if (r)
> >                 return r;
> > -       r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > -                                      PT64_ROOT_MAX_LEVEL);
> > +       r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > +                                     &vcpu->arch.mmu_shadow_page_cache_lock);
> >         if (r)
> >                 return r;
> >         if (maybe_indirect) {
> > @@ -678,10 +697,25 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >                                           PT64_ROOT_MAX_LEVEL);
> >  }
> >
> > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > +                                    spinlock_t *cache_lock)
> > +{
> > +       int orig_nobjs;
> > +
> > +       spin_lock(cache_lock);
> > +       orig_nobjs = cache->nobjs;
> > +       kvm_mmu_free_memory_cache(cache);
> > +       if (orig_nobjs)
> > +               percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> > +
> > +       spin_unlock(cache_lock);
> > +}
> > +
> >  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >  {
> >         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> > -       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > +       mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > +                                &vcpu->arch.mmu_shadow_page_cache_lock);
> >         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
> >         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
> >  }
> > @@ -1693,27 +1727,15 @@ static int is_empty_shadow_page(u64 *spt)
> >  }
> >  #endif
> >
> > -/*
> > - * This value is the sum of all of the kvm instances's
> > - * kvm->arch.n_used_mmu_pages values.  We need a global,
> > - * aggregate version in order to make the slab shrinker
> > - * faster
> > - */
> > -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> > -{
> > -       kvm->arch.n_used_mmu_pages += nr;
> > -       percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> > -}
> > -
> >  static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  {
> > -       kvm_mod_used_mmu_pages(kvm, +1);
> > +       kvm->arch.n_used_mmu_pages++;
> >         kvm_account_pgtable_pages((void *)sp->spt, +1);
> >  }
> >
> >  static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  {
> > -       kvm_mod_used_mmu_pages(kvm, -1);
> > +       kvm->arch.n_used_mmu_pages--;
> >         kvm_account_pgtable_pages((void *)sp->spt, -1);
> >  }
> >
> > @@ -2150,8 +2172,31 @@ struct shadow_page_caches {
> >         struct kvm_mmu_memory_cache *page_header_cache;
> >         struct kvm_mmu_memory_cache *shadow_page_cache;
> >         struct kvm_mmu_memory_cache *shadowed_info_cache;
> > +       /*
> > +        * Protects change in size of shadow_page_cache cache.
> > +        */
> > +       spinlock_t *shadow_page_cache_lock;
> >  };
> >
> > +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > +                                   spinlock_t *cache_lock)
> > +{
> > +       int orig_nobjs;
> > +       void *page;
> > +
> > +       if (!cache_lock) {
> > +               spin_lock(cache_lock);
> > +               orig_nobjs = shadow_page_cache->nobjs;
> > +       }
>
> I believe this is guaranteed to cause a null pointer dereference.
>
> > +       page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
> > +       if (!cache_lock) {
> > +               if (orig_nobjs)
> > +                       percpu_counter_dec(&kvm_total_unused_mmu_pages);
> > +               spin_unlock(cache_lock);
>
> Again, this will cause a null-pointer dereference. The check above
> just needs to be inverted.

Yes, I forgot to change it in this commit, and a later patch in the
series removes this whole "if(!cache_lock)" condition, so it escaped my
attention. Thanks for catching it.

>
> > +       }
> > +       return page;
> > +}
> > +
> >  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> >                                                       struct shadow_page_caches *caches,
> >                                                       gfn_t gfn,
> > @@ -2161,7 +2206,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> >         struct kvm_mmu_page *sp;
> >
> >         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> > -       sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> > +       sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> > +                                               caches->shadow_page_cache_lock);
> >         if (!role.direct)
> >                 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
> >
> > @@ -2218,6 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
> >                 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
> >                 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
> >                 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> > +               .shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
> >         };
> >
> >         return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> > @@ -5916,6 +5963,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
> >         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
> >
> >         vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> > +       spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
> >
> >         vcpu->arch.mmu = &vcpu->arch.root_mmu;
> >         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> > @@ -6051,11 +6099,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
> >                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
> >  }
> >
> > -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> > -{
> > -       return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> > -}
> > -
> >  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
> >                         struct kvm_memory_slot *slot,
> >                         struct kvm_page_track_notifier_node *node)
> > @@ -6277,6 +6320,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
> >         /* Direct SPs do not require a shadowed_info_cache. */
> >         caches.page_header_cache = &kvm->arch.split_page_header_cache;
> >         caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> > +       caches.shadow_page_cache_lock = NULL;
> >
> >         /* Safe to pass NULL for vCPU since requesting a direct SP. */
> >         return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> > @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
> >  static unsigned long
> >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  {
> > -       struct kvm *kvm;
> > -       int nr_to_scan = sc->nr_to_scan;
> > +       struct kvm_mmu_memory_cache *cache;
> > +       struct kvm *kvm, *first_kvm = NULL;
> >         unsigned long freed = 0;
> > +       /* spinlock for memory cache */
> > +       spinlock_t *cache_lock;
> > +       struct kvm_vcpu *vcpu;
> > +       unsigned long i;
> >
> >         mutex_lock(&kvm_lock);
> >
> >         list_for_each_entry(kvm, &vm_list, vm_list) {
> > -               int idx;
> > -               LIST_HEAD(invalid_list);
> > -
> > -               /*
> > -                * Never scan more than sc->nr_to_scan VM instances.
> > -                * Will not hit this condition practically since we do not try
> > -                * to shrink more than one VM and it is very unlikely to see
> > -                * !n_used_mmu_pages so many times.
> > -                */
> > -               if (!nr_to_scan--)
> > +               if (first_kvm == kvm)
> >                         break;
> > -               /*
> > -                * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> > -                * here. We may skip a VM instance errorneosly, but we do not
> > -                * want to shrink a VM that only started to populate its MMU
> > -                * anyway.
> > -                */
> > -               if (!kvm->arch.n_used_mmu_pages &&
> > -                   !kvm_has_zapped_obsolete_pages(kvm))
> > -                       continue;
> > +               if (!first_kvm)
> > +                       first_kvm = kvm;
> > +               list_move_tail(&kvm->vm_list, &vm_list);
> >
> > -               idx = srcu_read_lock(&kvm->srcu);
>
> I think we still want to do the SRCU read lock here to prevent
> use-after-free on the vCPUs.

Since I am holding mutex_lock(&kvm_lock), a kvm will not be removed
from vm_list, and this blocks kvm_destroy_vm() from proceeding to
destroy vCPUs via kvm_arch_destroy_vm() > kvm_destroy_vcpus(). Do we
still need the srcu_read_lock()? Also, kvm_for_each_vcpu() uses
xa_for_each_range(), which uses RCU for traversing the loop; won't
these two be sufficient to avoid needing srcu_read_lock() here?

>
> > -               write_lock(&kvm->mmu_lock);
> > +               kvm_for_each_vcpu(i, vcpu, kvm) {
> > +                       cache = &vcpu->arch.mmu_shadow_page_cache;
> > +                       cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
> > +                       if (READ_ONCE(cache->nobjs)) {
> > +                               spin_lock(cache_lock);
> > +                               freed += kvm_mmu_empty_memory_cache(cache);
>
> Would it make sense to just have kvm_mmu_empty_memory_cache()
> decrement the per-cpu counter itself? I don't think there's much perf
> to be gained by reducing percpu counter updates here and it would
> consolidate the bookkeeping.

kvm_mmu_empty_memory_cache() is also used by other caches for which we
are not keeping the count.

>
> > +                               spin_unlock(cache_lock);
> > +                       }
> >
> > -               if (kvm_has_zapped_obsolete_pages(kvm)) {
> > -                       kvm_mmu_commit_zap_page(kvm,
> > -                             &kvm->arch.zapped_obsolete_pages);
> > -                       goto unlock;
> >                 }
> >
> > -               freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
> > -
> > -unlock:
> > -               write_unlock(&kvm->mmu_lock);
> > -               srcu_read_unlock(&kvm->srcu, idx);
> > -
> > -               /*
> > -                * unfair on small ones
> > -                * per-vm shrinkers cry out
> > -                * sadness comes quickly
> > -                */
>
> Nooooo, don't delete the beautiful poem!

I will fix this mistake in the next version, pardon my ignorance :)

>
> > -               list_move_tail(&kvm->vm_list, &vm_list);
> > -               break;
> > +               if (freed >= sc->nr_to_scan)
> > +                       break;
> >         }
> >
> > +       if (freed)
> > +               percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
> >         mutex_unlock(&kvm_lock);
> > +       percpu_counter_sync(&kvm_total_unused_mmu_pages);
> >         return freed;
> >  }
> >
> >  static unsigned long
> >  mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> >  {
> > -       return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
> > +       return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);
>
> This will return 0 if the sum of all the per-cpu counters is negative.
> It should never be negative though. Might be nice to add a warning if
> we would get a negative sum.
>

Sounds good.


> >  }
> >
> >  static struct shrinker mmu_shrinker = {
> > @@ -6820,7 +6847,7 @@ int kvm_mmu_vendor_module_init(void)
> >         if (!mmu_page_header_cache)
> >                 goto out;
> >
> > -       if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
> > +       if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
> >                 goto out;
> >
> >         ret = register_shrinker(&mmu_shrinker, "x86-mmu");
> > @@ -6830,7 +6857,7 @@ int kvm_mmu_vendor_module_init(void)
> >         return 0;
> >
> >  out_shrinker:
> > -       percpu_counter_destroy(&kvm_total_used_mmu_pages);
> > +       percpu_counter_destroy(&kvm_total_unused_mmu_pages);
> >  out:
> >         mmu_destroy_caches();
> >         return ret;
> > @@ -6847,7 +6874,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
> >  void kvm_mmu_vendor_module_exit(void)
> >  {
> >         mmu_destroy_caches();
> > -       percpu_counter_destroy(&kvm_total_used_mmu_pages);
> > +       percpu_counter_destroy(&kvm_total_unused_mmu_pages);
> >         unregister_shrinker(&mmu_shrinker);
> >  }
> >
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index ac00bfbf32f6..c2a342028b6a 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -325,4 +325,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >
> > +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > +                                   spinlock_t *cache_lock);
> >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 764f7c87286f..4974fa96deff 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -264,7 +264,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
> >         struct kvm_mmu_page *sp;
> >
> >         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> > -       sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> > +       sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> > +                                               &vcpu->arch.mmu_shadow_page_cache_lock);
> >
> >         return sp;
> >  }
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 01aad8b74162..efd9b38ea9a2 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1362,6 +1362,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
> >  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
> >  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
> >  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> > +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
> >  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
> >  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  #endif
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 13e88297f999..f2d762878b97 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -438,8 +438,10 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
> >         return mc->nobjs;
> >  }
> >
> > -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
> >  {
> > +       int freed = mc->nobjs;
> > +
> >         while (mc->nobjs) {
> >                 if (mc->kmem_cache)
> >                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
> > @@ -447,8 +449,13 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> >                         free_page((unsigned long)mc->objects[--mc->nobjs]);
> >         }
> >
> > -       kvfree(mc->objects);
> > +       return freed;
> > +}
> >
> > +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > +{
> > +       kvm_mmu_empty_memory_cache(mc);
> > +       kvfree(mc->objects);
> >         mc->objects = NULL;
> >         mc->capacity = 0;
> >  }
> > --
> > 2.39.0.314.g84b9a713c41-goog
> >
David Matlack Dec. 29, 2022, 9:15 p.m. UTC | #3
On Wed, Dec 28, 2022 at 02:07:49PM -0800, Vipin Sharma wrote:
> On Tue, Dec 27, 2022 at 10:37 AM Ben Gardon <bgardon@google.com> wrote:
> > On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
> > >
> > > Tested this change by running dirty_log_perf_test while dropping cache
> > > via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> > > continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> > > logs from kvm_mmu_memory_cache_alloc(), which is expected.
> >
> > Oh, that's not a good thing. I don't think we want to be hitting those
> > warnings. For one, kernel warnings should not be expected behavior,
> > probably for many reasons, but at least because Syzbot will find it.
> > In this particular case, we don't want to hit that because in that
> > case we'll try to do a GFP_ATOMIC, which can fail, and if it fails,
> > we'll BUG:
> >
> > void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> > {
> >         void *p;
> >
> >         if (WARN_ON(!mc->nobjs))
> >                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
> >         else
> >                 p = mc->objects[--mc->nobjs];
> >         BUG_ON(!p);
> >         return p;
> > }
> >
> > Perhaps the risk of actually panicking is small, but it probably
> > indicates that we need better error handling around failed allocations
> > from the cache.
> > Or, the slightly less elegant approach might be to just hold the cache
> > lock around the cache topup and use of pages from the cache, but
> > adding better error handling would probably be cleaner.
> 
> I was counting on the fact that shrinker will ideally run only in
> extreme cases, i.e. host is running on low memory. So, this WARN_ON
> will only be rarely used. I was not aware of Syzbot, it seems like it
> will be a concern if it does this kind of testing.

In an extreme low-memory situation, forcing vCPUs to do GFP_ATOMIC
allocations to handle page faults is risky. Plus it's a waste of time to
free that memory since it's just going to get immediately reallocated.

> 
> I thought about keeping a mutex, taking it during topup and releasing
> it after the whole operation is done but I stopped it as the duration
> of holding mutex will be long and might block the memory shrinker
> longer. I am not sure though, if this is a valid concern.

Use mutex_trylock() to skip any vCPUs that are currently handling page
faults.
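
e.g. a sketch, assuming mmu_shadow_page_cache_lock is converted to a
mutex:

	kvm_for_each_vcpu(i, vcpu, kvm) {
		/* Skip vCPUs that are currently topping up / using the cache. */
		if (!mutex_trylock(&vcpu->arch.mmu_shadow_page_cache_lock))
			continue;

		freed += kvm_mmu_empty_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
		mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
	}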
David Matlack Dec. 29, 2022, 9:54 p.m. UTC | #4
On Wed, Dec 21, 2022 at 06:34:49PM -0800, Vipin Sharma wrote:
> mmu_shrink_scan() is very disruptive to VMs. It picks the first
> VM in the vm_list and zaps the oldest page, which is most likely an
> upper level SPTE and most likely to be reused. Prior to the TDP MMU,
> this was even more disruptive in the nested VM case, considering L1
> SPTEs will be the oldest even though most of the entries are for L2
> SPTEs.
> 
> As discussed in
> https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
> shrinker logic has not been very useful in actually keeping VMs performant
> and reducing memory usage.
> 
> Change mmu_shrink_scan() to free pages from the vCPU's shadow page
> cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
> VM's performance should not be affected.

Can you split this commit up? e.g. First drop the old shrinking logic in
one commit (but leave the shrinking infrastructure in place). Then a
commit to make the shrinker free the per-vCPU shadow page caches. And
then perhaps another to make the shrinker free the per-VM shadow page
cache used for eager splitting.

> 
> This also allows changing cache capacities without worrying too much
> about high memory usage in cache.
> 
> Tested this change by running dirty_log_perf_test while dropping cache
> via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> logs from kvm_mmu_memory_cache_alloc(), which is expected.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   5 +
>  arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
>  arch/x86/kvm/mmu/mmu_internal.h |   2 +
>  arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
>  include/linux/kvm_host.h        |   1 +
>  virt/kvm/kvm_main.c             |  11 ++-
>  6 files changed, 114 insertions(+), 71 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index aa4eb8cfcd7e..89cc809e4a00 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -786,6 +786,11 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
>  
> +	/*
> +	 * Protects change in size of mmu_shadow_page_cache cache.
> +	 */
> +	spinlock_t mmu_shadow_page_cache_lock;
> +
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
>  	 * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 254bc46234e0..157417e1cb6e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -164,7 +164,10 @@ struct kvm_shadow_walk_iterator {
>  
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
> -static struct percpu_counter kvm_total_used_mmu_pages;
> +/*
> + * Total number of unused pages in MMU shadow page cache.
> + */
> +static struct percpu_counter kvm_total_unused_mmu_pages;
>  
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>  
> @@ -655,6 +658,22 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  	}
>  }
>  
> +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +				     spinlock_t *cache_lock)
> +{
> +	int orig_nobjs;
> +	int r;
> +
> +	spin_lock(cache_lock);
> +	orig_nobjs = cache->nobjs;
> +	r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
> +	if (orig_nobjs != cache->nobjs)
> +		percpu_counter_add(&kvm_total_unused_mmu_pages,
> +				   (cache->nobjs - orig_nobjs));
> +	spin_unlock(cache_lock);
> +	return r;
> +}
> +
>  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  {
>  	int r;
> @@ -664,8 +683,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>  	if (r)
>  		return r;
> -	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -				       PT64_ROOT_MAX_LEVEL);
> +	r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> +				      &vcpu->arch.mmu_shadow_page_cache_lock);
>  	if (r)
>  		return r;
>  	if (maybe_indirect) {
> @@ -678,10 +697,25 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  					  PT64_ROOT_MAX_LEVEL);
>  }
>  
> +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +				     spinlock_t *cache_lock)
> +{
> +	int orig_nobjs;
> +
> +	spin_lock(cache_lock);
> +	orig_nobjs = cache->nobjs;
> +	kvm_mmu_free_memory_cache(cache);
> +	if (orig_nobjs)
> +		percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> +
> +	spin_unlock(cache_lock);
> +}

It would be nice to avoid adding these wrapper functions.

Once you add a mutex to protect the caches from being freed while vCPUs
are in the middle of a page fault you can drop the spin lock. After that
the only reason to have these wrappers is to update
kvm_total_unused_mmu_pages.

Do we really need kvm_total_unused_mmu_pages? Why not just dynamically
calculate the number of unused pages in mmu_shrink_count()? Or just
estimate the count, e.g. num_vcpus * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE?
Or have per-VM or per-vCPU shrinkers to avoid needing to do any
aggregation?
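
For example, a rough sketch of the estimation approach (this just
reports an upper bound on what the scan could free):

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long count = 0;
	struct kvm *kvm;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		count += atomic_read(&kvm->online_vcpus) *
			 KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
	mutex_unlock(&kvm_lock);

	return count;
}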

> +
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> +				 &vcpu->arch.mmu_shadow_page_cache_lock);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);

mmu_shadowed_info_cache can be freed by the shrinker as well.

>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -1693,27 +1727,15 @@ static int is_empty_shadow_page(u64 *spt)
>  }
>  #endif
>  
> -/*
> - * This value is the sum of all of the kvm instances's
> - * kvm->arch.n_used_mmu_pages values.  We need a global,
> - * aggregate version in order to make the slab shrinker
> - * faster
> - */
> -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> -{
> -	kvm->arch.n_used_mmu_pages += nr;
> -	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> -}
> -
>  static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -	kvm_mod_used_mmu_pages(kvm, +1);
> +	kvm->arch.n_used_mmu_pages++;
>  	kvm_account_pgtable_pages((void *)sp->spt, +1);
>  }
>  
>  static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -	kvm_mod_used_mmu_pages(kvm, -1);
> +	kvm->arch.n_used_mmu_pages--;
>  	kvm_account_pgtable_pages((void *)sp->spt, -1);
>  }
>  
> @@ -2150,8 +2172,31 @@ struct shadow_page_caches {
>  	struct kvm_mmu_memory_cache *page_header_cache;
>  	struct kvm_mmu_memory_cache *shadow_page_cache;
>  	struct kvm_mmu_memory_cache *shadowed_info_cache;
> +	/*
> +	 * Protects change in size of shadow_page_cache cache.
> +	 */
> +	spinlock_t *shadow_page_cache_lock;
>  };
>  
> +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +				    spinlock_t *cache_lock)
> +{
> +	int orig_nobjs;
> +	void *page;
> +
> +	if (!cache_lock) {
> +		spin_lock(cache_lock);
> +		orig_nobjs = shadow_page_cache->nobjs;
> +	}
> +	page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
> +	if (!cache_lock) {
> +		if (orig_nobjs)
> +			percpu_counter_dec(&kvm_total_unused_mmu_pages);
> +		spin_unlock(cache_lock);
> +	}
> +	return page;
> +}
> +
>  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>  						      struct shadow_page_caches *caches,
>  						      gfn_t gfn,
> @@ -2161,7 +2206,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>  	struct kvm_mmu_page *sp;
>  
>  	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> -	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> +	sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> +						caches->shadow_page_cache_lock);
>  	if (!role.direct)
>  		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>  
> @@ -2218,6 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
>  		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
>  		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
>  		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> +		.shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
>  	};
>  
>  	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> @@ -5916,6 +5963,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>  
>  	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +	spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
>  
>  	vcpu->arch.mmu = &vcpu->arch.root_mmu;
>  	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6051,11 +6099,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>  		kvm_tdp_mmu_zap_invalidated_roots(kvm);
>  }
>  
> -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> -{
> -	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> -}
> -
>  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
>  			struct kvm_memory_slot *slot,
>  			struct kvm_page_track_notifier_node *node)
> @@ -6277,6 +6320,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
>  	/* Direct SPs do not require a shadowed_info_cache. */
>  	caches.page_header_cache = &kvm->arch.split_page_header_cache;
>  	caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> +	caches.shadow_page_cache_lock = NULL;
>  
>  	/* Safe to pass NULL for vCPU since requesting a direct SP. */
>  	return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
>  static unsigned long
>  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -	struct kvm *kvm;
> -	int nr_to_scan = sc->nr_to_scan;
> +	struct kvm_mmu_memory_cache *cache;
> +	struct kvm *kvm, *first_kvm = NULL;
>  	unsigned long freed = 0;
> +	/* spinlock for memory cache */
> +	spinlock_t *cache_lock;
> +	struct kvm_vcpu *vcpu;
> +	unsigned long i;
>  
>  	mutex_lock(&kvm_lock);
>  
>  	list_for_each_entry(kvm, &vm_list, vm_list) {
> -		int idx;
> -		LIST_HEAD(invalid_list);
> -
> -		/*
> -		 * Never scan more than sc->nr_to_scan VM instances.
> -		 * Will not hit this condition practically since we do not try
> -		 * to shrink more than one VM and it is very unlikely to see
> -		 * !n_used_mmu_pages so many times.
> -		 */
> -		if (!nr_to_scan--)
> +		if (first_kvm == kvm)
>  			break;
> -		/*
> -		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> -		 * here. We may skip a VM instance errorneosly, but we do not
> -		 * want to shrink a VM that only started to populate its MMU
> -		 * anyway.
> -		 */
> -		if (!kvm->arch.n_used_mmu_pages &&
> -		    !kvm_has_zapped_obsolete_pages(kvm))
> -			continue;
> +		if (!first_kvm)
> +			first_kvm = kvm;
> +		list_move_tail(&kvm->vm_list, &vm_list);
>  
> -		idx = srcu_read_lock(&kvm->srcu);
> -		write_lock(&kvm->mmu_lock);
> +		kvm_for_each_vcpu(i, vcpu, kvm) {

What protects this from racing with vCPU creation/deletion?

> +			cache = &vcpu->arch.mmu_shadow_page_cache;
> +			cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
> +			if (READ_ONCE(cache->nobjs)) {
> +				spin_lock(cache_lock);
> +				freed += kvm_mmu_empty_memory_cache(cache);
> +				spin_unlock(cache_lock);
> +			}

What about freeing kvm->arch.split_shadow_page_cache as well?

>  
> -		if (kvm_has_zapped_obsolete_pages(kvm)) {
> -			kvm_mmu_commit_zap_page(kvm,
> -			      &kvm->arch.zapped_obsolete_pages);
> -			goto unlock;
>  		}
>  
> -		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
> -
> -unlock:
> -		write_unlock(&kvm->mmu_lock);
> -		srcu_read_unlock(&kvm->srcu, idx);
> -
> -		/*
> -		 * unfair on small ones
> -		 * per-vm shrinkers cry out
> -		 * sadness comes quickly
> -		 */
> -		list_move_tail(&kvm->vm_list, &vm_list);
> -		break;
> +		if (freed >= sc->nr_to_scan)
> +			break;
>  	}
>  
> +	if (freed)
> +		percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
>  	mutex_unlock(&kvm_lock);
> +	percpu_counter_sync(&kvm_total_unused_mmu_pages);
>  	return freed;
>  }
>  
>  static unsigned long
>  mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
> +	return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);
>  }
>  
>  static struct shrinker mmu_shrinker = {
> @@ -6820,7 +6847,7 @@ int kvm_mmu_vendor_module_init(void)
>  	if (!mmu_page_header_cache)
>  		goto out;
>  
> -	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
> +	if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
>  		goto out;
>  
>  	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
> @@ -6830,7 +6857,7 @@ int kvm_mmu_vendor_module_init(void)
>  	return 0;
>  
>  out_shrinker:
> -	percpu_counter_destroy(&kvm_total_used_mmu_pages);
> +	percpu_counter_destroy(&kvm_total_unused_mmu_pages);
>  out:
>  	mmu_destroy_caches();
>  	return ret;
> @@ -6847,7 +6874,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
>  void kvm_mmu_vendor_module_exit(void)
>  {
>  	mmu_destroy_caches();
> -	percpu_counter_destroy(&kvm_total_used_mmu_pages);
> +	percpu_counter_destroy(&kvm_total_unused_mmu_pages);
>  	unregister_shrinker(&mmu_shrinker);
>  }
>  
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ac00bfbf32f6..c2a342028b6a 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -325,4 +325,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  
> +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +				    spinlock_t *cache_lock);
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 764f7c87286f..4974fa96deff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -264,7 +264,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>  	struct kvm_mmu_page *sp;
>  
>  	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +	sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> +						&vcpu->arch.mmu_shadow_page_cache_lock);
>  
>  	return sp;
>  }
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 01aad8b74162..efd9b38ea9a2 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1362,6 +1362,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
>  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
>  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 13e88297f999..f2d762878b97 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -438,8 +438,10 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>  	return mc->nobjs;
>  }
>  
> -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
>  {
> +	int freed = mc->nobjs;
> +
>  	while (mc->nobjs) {
>  		if (mc->kmem_cache)
>  			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
> @@ -447,8 +449,13 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
>  			free_page((unsigned long)mc->objects[--mc->nobjs]);
>  	}
>  
> -	kvfree(mc->objects);
> +	return freed;
> +}
>  
> +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +{
> +	kvm_mmu_empty_memory_cache(mc);
> +	kvfree(mc->objects);
>  	mc->objects = NULL;
>  	mc->capacity = 0;
>  }
> -- 
> 2.39.0.314.g84b9a713c41-goog
>
Vipin Sharma Jan. 3, 2023, 5:38 p.m. UTC | #5
On Thu, Dec 29, 2022 at 1:15 PM David Matlack <dmatlack@google.com> wrote:
>
> On Wed, Dec 28, 2022 at 02:07:49PM -0800, Vipin Sharma wrote:
> > On Tue, Dec 27, 2022 at 10:37 AM Ben Gardon <bgardon@google.com> wrote:
> > > On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
> > > >
> > > > Tested this change by running dirty_log_perf_test while dropping cache
> > > > via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> > > > continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> > > > logs from kvm_mmu_memory_cache_alloc(), which is expected.
> > >
> > > Oh, that's not a good thing. I don't think we want to be hitting those
> > > warnings. For one, kernel warnings should not be expected behavior,
> > > probably for many reasons, but at least because Syzbot will find it.
> > > In this particular case, we don't want to hit that because in that
> > > case we'll try to do a GFP_ATOMIC, which can fail, and if it fails,
> > > we'll BUG:
> > >
> > > void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> > > {
> > >         void *p;
> > >
> > >         if (WARN_ON(!mc->nobjs))
> > >                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
> > >         else
> > >                 p = mc->objects[--mc->nobjs];
> > >         BUG_ON(!p);
> > >         return p;
> > > }
> > >
> > > Perhaps the risk of actually panicking is small, but it probably
> > > indicates that we need better error handling around failed allocations
> > > from the cache.
> > > Or, the slightly less elegant approach might be to just hold the cache
> > > lock around the cache topup and use of pages from the cache, but
> > > adding better error handling would probably be cleaner.
> >
> > I was counting on the fact that shrinker will ideally run only in
> > extreme cases, i.e. host is running on low memory. So, this WARN_ON
> > will only be rarely used. I was not aware of Syzbot, it seems like it
> > will be a concern if it does this kind of testing.
>
> In an extreme low-memory situation, forcing vCPUS to do GFP_ATOMIC
> allocations to handle page faults is risky. Plus it's a waste of time to
> free that memory since it's just going to get immediately reallocated.
>
> >
> > I thought about keeping a mutex, taking it during topup and releasing
> > it after the whole operation is done but I stopped it as the duration
> > of holding mutex will be long and might block the memory shrinker
> > longer. I am not sure though, if this is a valid concern.
>
> Use mutex_trylock() to skip any vCPUs that are currently handling page
> faults.

oh yeah! Thanks.
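
Something like this in mmu_shrink_scan(), assuming
mmu_shadow_page_cache_lock is converted to a mutex that vCPUs hold
across cache topup and use (rough sketch only, names illustrative):

kvm_for_each_vcpu(i, vcpu, kvm) {
        /* Skip vCPUs that are in the middle of a page fault. */
        if (!mutex_trylock(&vcpu->arch.mmu_shadow_page_cache_lock))
                continue;

        freed += kvm_mmu_empty_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
}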
Vipin Sharma Jan. 3, 2023, 6:01 p.m. UTC | #6
On Thu, Dec 29, 2022 at 1:55 PM David Matlack <dmatlack@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 06:34:49PM -0800, Vipin Sharma wrote:
> > mmu_shrink_scan() is very disruptive to VMs. It picks the first
> > VM in the vm_list, zaps the oldest page which is most likely an upper
> > level SPTEs and most like to be reused. Prior to TDP MMU, this is even
> > more disruptive in nested VMs case, considering L1 SPTEs will be the
> > oldest even though most of the entries are for L2 SPTEs.
> >
> > As discussed in
> > https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
> > shrinker logic has not be very useful in actually keeping VMs performant
> > and reducing memory usage.
> >
> > Change mmu_shrink_scan() to free pages from the vCPU's shadow page
> > cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
> > VM's performance should not be affected.
>
> Can you split this commit up? e.g. First drop the old shrinking logic in
> one commit (but leave the shrinking infrastructure in place). Then a
> commit to make the shrinker free the per-vCPU shadow page caches. And
> then perhaps another to make the shrinker free the per-VM shadow page
> cache used for eager splitting.
>

Sounds good, I will separate it into two parts: one for dropping the
old logic and one for making the shrinker free the per-vCPU shadow
page caches. Patch 3 enables the shrinker to free the per-VM shadow
page cache.

> >
> > This also allows to change cache capacities without worrying too much
> > about high memory usage in cache.
> >
> > Tested this change by running dirty_log_perf_test while dropping cache
> > via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> > continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> > logs from kvm_mmu_memory_cache_alloc(), which is expected.
> >
> > Suggested-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Vipin Sharma <vipinsh@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |   5 +
> >  arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
> >  arch/x86/kvm/mmu/mmu_internal.h |   2 +
> >  arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
> >  include/linux/kvm_host.h        |   1 +
> >  virt/kvm/kvm_main.c             |  11 ++-
> >  6 files changed, 114 insertions(+), 71 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index aa4eb8cfcd7e..89cc809e4a00 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -786,6 +786,11 @@ struct kvm_vcpu_arch {
> >       struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
> >       struct kvm_mmu_memory_cache mmu_page_header_cache;
> >
> > +     /*
> > +      * Protects change in size of mmu_shadow_page_cache cache.
> > +      */
> > +     spinlock_t mmu_shadow_page_cache_lock;
> > +
> >       /*
> >        * QEMU userspace and the guest each have their own FPU state.
> >        * In vcpu_run, we switch between the user and guest FPU contexts.
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 254bc46234e0..157417e1cb6e 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -164,7 +164,10 @@ struct kvm_shadow_walk_iterator {
> >
> >  static struct kmem_cache *pte_list_desc_cache;
> >  struct kmem_cache *mmu_page_header_cache;
> > -static struct percpu_counter kvm_total_used_mmu_pages;
> > +/*
> > + * Total number of unused pages in MMU shadow page cache.
> > + */
> > +static struct percpu_counter kvm_total_unused_mmu_pages;
> >
> >  static void mmu_spte_set(u64 *sptep, u64 spte);
> >
> > @@ -655,6 +658,22 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> >       }
> >  }
> >
> > +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > +                                  spinlock_t *cache_lock)
> > +{
> > +     int orig_nobjs;
> > +     int r;
> > +
> > +     spin_lock(cache_lock);
> > +     orig_nobjs = cache->nobjs;
> > +     r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
> > +     if (orig_nobjs != cache->nobjs)
> > +             percpu_counter_add(&kvm_total_unused_mmu_pages,
> > +                                (cache->nobjs - orig_nobjs));
> > +     spin_unlock(cache_lock);
> > +     return r;
> > +}
> > +
> >  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >  {
> >       int r;
> > @@ -664,8 +683,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >                                      1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
> >       if (r)
> >               return r;
> > -     r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > -                                    PT64_ROOT_MAX_LEVEL);
> > +     r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > +                                   &vcpu->arch.mmu_shadow_page_cache_lock);
> >       if (r)
> >               return r;
> >       if (maybe_indirect) {
> > @@ -678,10 +697,25 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >                                         PT64_ROOT_MAX_LEVEL);
> >  }
> >
> > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > +                                  spinlock_t *cache_lock)
> > +{
> > +     int orig_nobjs;
> > +
> > +     spin_lock(cache_lock);
> > +     orig_nobjs = cache->nobjs;
> > +     kvm_mmu_free_memory_cache(cache);
> > +     if (orig_nobjs)
> > +             percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> > +
> > +     spin_unlock(cache_lock);
> > +}
>
> It would be nice to avoid adding these wrapper functions.
>
> Once you add a mutex to protect the caches from being freed while vCPUs
> are in the middle of a page fault you can drop the spin lock. After that
> the only reason to have these wrappers is to update
> kvm_total_unused_mmu_pages.
>
> Do we really need kvm_total_unused_mmu_pages? Why not just dynamically
> calculate the number of of unused pages in mmu_shrink_count()? Or just
> estimate the count, e.g. num_vcpus * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE?
> Or have per-VM or per-vCPU shrinkers to avoid needing to do any
> aggregation?
>

I think we can drop this; by default the shrinker's count callback can
return num_kvms * num_vcpus * nodes * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE.

Whenever mmu_shrink_scan() is called, if there are no pages to free
then it can return SHRINK_STOP, which stops any subsequent scan calls
for that reclaim pass.
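
Roughly (sketch only; num_kvms, num_vcpus and nodes are placeholders
for however those counts end up being tracked, not existing variables):

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        /* Report an upper bound: assume every cache is full. */
        return num_kvms * num_vcpus * nodes *
               KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
}

static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long freed = 0;

        /* ... walk VMs and empty the per-vCPU caches, as in this patch ... */

        /* Nothing freed: tell the shrinker core to stop calling scan. */
        return freed ? freed : SHRINK_STOP;
}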


> > +
> >  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >  {
> >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> > -     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > +     mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > +                              &vcpu->arch.mmu_shadow_page_cache_lock);
> >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
>
> mmu_shadowed_info_cache can be freed by the shrinker as well.
>
> >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
> >  }
> > @@ -1693,27 +1727,15 @@ static int is_empty_shadow_page(u64 *spt)
> >  }
> >  #endif
> >
> > -/*
> > - * This value is the sum of all of the kvm instances's
> > - * kvm->arch.n_used_mmu_pages values.  We need a global,
> > - * aggregate version in order to make the slab shrinker
> > - * faster
> > - */
> > -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> > -{
> > -     kvm->arch.n_used_mmu_pages += nr;
> > -     percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> > -}
> > -
> >  static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  {
> > -     kvm_mod_used_mmu_pages(kvm, +1);
> > +     kvm->arch.n_used_mmu_pages++;
> >       kvm_account_pgtable_pages((void *)sp->spt, +1);
> >  }
> >
> >  static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> >  {
> > -     kvm_mod_used_mmu_pages(kvm, -1);
> > +     kvm->arch.n_used_mmu_pages--;
> >       kvm_account_pgtable_pages((void *)sp->spt, -1);
> >  }
> >
> > @@ -2150,8 +2172,31 @@ struct shadow_page_caches {
> >       struct kvm_mmu_memory_cache *page_header_cache;
> >       struct kvm_mmu_memory_cache *shadow_page_cache;
> >       struct kvm_mmu_memory_cache *shadowed_info_cache;
> > +     /*
> > +      * Protects change in size of shadow_page_cache cache.
> > +      */
> > +     spinlock_t *shadow_page_cache_lock;
> >  };
> >
> > +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > +                                 spinlock_t *cache_lock)
> > +{
> > +     int orig_nobjs;
> > +     void *page;
> > +
> > +     if (!cache_lock) {
> > +             spin_lock(cache_lock);
> > +             orig_nobjs = shadow_page_cache->nobjs;
> > +     }
> > +     page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
> > +     if (!cache_lock) {
> > +             if (orig_nobjs)
> > +                     percpu_counter_dec(&kvm_total_unused_mmu_pages);
> > +             spin_unlock(cache_lock);
> > +     }
> > +     return page;
> > +}
> > +
> >  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> >                                                     struct shadow_page_caches *caches,
> >                                                     gfn_t gfn,
> > @@ -2161,7 +2206,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> >       struct kvm_mmu_page *sp;
> >
> >       sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> > -     sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> > +     sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> > +                                             caches->shadow_page_cache_lock);
> >       if (!role.direct)
> >               sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
> >
> > @@ -2218,6 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
> >               .page_header_cache = &vcpu->arch.mmu_page_header_cache,
> >               .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
> >               .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> > +             .shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
> >       };
> >
> >       return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> > @@ -5916,6 +5963,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
> >       vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
> >
> >       vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> > +     spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
> >
> >       vcpu->arch.mmu = &vcpu->arch.root_mmu;
> >       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> > @@ -6051,11 +6099,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
> >               kvm_tdp_mmu_zap_invalidated_roots(kvm);
> >  }
> >
> > -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> > -{
> > -     return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> > -}
> > -
> >  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
> >                       struct kvm_memory_slot *slot,
> >                       struct kvm_page_track_notifier_node *node)
> > @@ -6277,6 +6320,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
> >       /* Direct SPs do not require a shadowed_info_cache. */
> >       caches.page_header_cache = &kvm->arch.split_page_header_cache;
> >       caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> > +     caches.shadow_page_cache_lock = NULL;
> >
> >       /* Safe to pass NULL for vCPU since requesting a direct SP. */
> >       return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> > @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
> >  static unsigned long
> >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  {
> > -     struct kvm *kvm;
> > -     int nr_to_scan = sc->nr_to_scan;
> > +     struct kvm_mmu_memory_cache *cache;
> > +     struct kvm *kvm, *first_kvm = NULL;
> >       unsigned long freed = 0;
> > +     /* spinlock for memory cache */
> > +     spinlock_t *cache_lock;
> > +     struct kvm_vcpu *vcpu;
> > +     unsigned long i;
> >
> >       mutex_lock(&kvm_lock);
> >
> >       list_for_each_entry(kvm, &vm_list, vm_list) {
> > -             int idx;
> > -             LIST_HEAD(invalid_list);
> > -
> > -             /*
> > -              * Never scan more than sc->nr_to_scan VM instances.
> > -              * Will not hit this condition practically since we do not try
> > -              * to shrink more than one VM and it is very unlikely to see
> > -              * !n_used_mmu_pages so many times.
> > -              */
> > -             if (!nr_to_scan--)
> > +             if (first_kvm == kvm)
> >                       break;
> > -             /*
> > -              * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> > -              * here. We may skip a VM instance errorneosly, but we do not
> > -              * want to shrink a VM that only started to populate its MMU
> > -              * anyway.
> > -              */
> > -             if (!kvm->arch.n_used_mmu_pages &&
> > -                 !kvm_has_zapped_obsolete_pages(kvm))
> > -                     continue;
> > +             if (!first_kvm)
> > +                     first_kvm = kvm;
> > +             list_move_tail(&kvm->vm_list, &vm_list);
> >
> > -             idx = srcu_read_lock(&kvm->srcu);
> > -             write_lock(&kvm->mmu_lock);
> > +             kvm_for_each_vcpu(i, vcpu, kvm) {
>
> What protects this from racing with vCPU creation/deletion?
>
> > +                     cache = &vcpu->arch.mmu_shadow_page_cache;
> > +                     cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
> > +                     if (READ_ONCE(cache->nobjs)) {
> > +                             spin_lock(cache_lock);
> > +                             freed += kvm_mmu_empty_memory_cache(cache);
> > +                             spin_unlock(cache_lock);
> > +                     }
>
> What about freeing kvm->arch.split_shadow_page_cache as well?
>
> >
> > -             if (kvm_has_zapped_obsolete_pages(kvm)) {
> > -                     kvm_mmu_commit_zap_page(kvm,
> > -                           &kvm->arch.zapped_obsolete_pages);
> > -                     goto unlock;
> >               }
> >
> > -             freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
> > -
> > -unlock:
> > -             write_unlock(&kvm->mmu_lock);
> > -             srcu_read_unlock(&kvm->srcu, idx);
> > -
> > -             /*
> > -              * unfair on small ones
> > -              * per-vm shrinkers cry out
> > -              * sadness comes quickly
> > -              */
> > -             list_move_tail(&kvm->vm_list, &vm_list);
> > -             break;
> > +             if (freed >= sc->nr_to_scan)
> > +                     break;
> >       }
> >
> > +     if (freed)
> > +             percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
> >       mutex_unlock(&kvm_lock);
> > +     percpu_counter_sync(&kvm_total_unused_mmu_pages);
> >       return freed;
> >  }
> >
> >  static unsigned long
> >  mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> >  {
> > -     return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
> > +     return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);
> >  }
> >
> >  static struct shrinker mmu_shrinker = {
> > @@ -6820,7 +6847,7 @@ int kvm_mmu_vendor_module_init(void)
> >       if (!mmu_page_header_cache)
> >               goto out;
> >
> > -     if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
> > +     if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
> >               goto out;
> >
> >       ret = register_shrinker(&mmu_shrinker, "x86-mmu");
> > @@ -6830,7 +6857,7 @@ int kvm_mmu_vendor_module_init(void)
> >       return 0;
> >
> >  out_shrinker:
> > -     percpu_counter_destroy(&kvm_total_used_mmu_pages);
> > +     percpu_counter_destroy(&kvm_total_unused_mmu_pages);
> >  out:
> >       mmu_destroy_caches();
> >       return ret;
> > @@ -6847,7 +6874,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
> >  void kvm_mmu_vendor_module_exit(void)
> >  {
> >       mmu_destroy_caches();
> > -     percpu_counter_destroy(&kvm_total_used_mmu_pages);
> > +     percpu_counter_destroy(&kvm_total_unused_mmu_pages);
> >       unregister_shrinker(&mmu_shrinker);
> >  }
> >
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index ac00bfbf32f6..c2a342028b6a 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -325,4 +325,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >
> > +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > +                                 spinlock_t *cache_lock);
> >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 764f7c87286f..4974fa96deff 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -264,7 +264,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
> >       struct kvm_mmu_page *sp;
> >
> >       sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> > -     sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> > +     sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> > +                                             &vcpu->arch.mmu_shadow_page_cache_lock);
> >
> >       return sp;
> >  }
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 01aad8b74162..efd9b38ea9a2 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1362,6 +1362,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
> >  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
> >  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
> >  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> > +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
> >  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
> >  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  #endif
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 13e88297f999..f2d762878b97 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -438,8 +438,10 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
> >       return mc->nobjs;
> >  }
> >
> > -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
> >  {
> > +     int freed = mc->nobjs;
> > +
> >       while (mc->nobjs) {
> >               if (mc->kmem_cache)
> >                       kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
> > @@ -447,8 +449,13 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> >                       free_page((unsigned long)mc->objects[--mc->nobjs]);
> >       }
> >
> > -     kvfree(mc->objects);
> > +     return freed;
> > +}
> >
> > +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > +{
> > +     kvm_mmu_empty_memory_cache(mc);
> > +     kvfree(mc->objects);
> >       mc->objects = NULL;
> >       mc->capacity = 0;
> >  }
> > --
> > 2.39.0.314.g84b9a713c41-goog
> >
Mingwei Zhang Jan. 3, 2023, 7:32 p.m. UTC | #7
On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
>
> mmu_shrink_scan() is very disruptive to VMs. It picks the first
> VM in the vm_list, zaps the oldest page which is most likely an upper
> level SPTEs and most like to be reused. Prior to TDP MMU, this is even
> more disruptive in nested VMs case, considering L1 SPTEs will be the
> oldest even though most of the entries are for L2 SPTEs.
>
> As discussed in
> https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
> shrinker logic has not be very useful in actually keeping VMs performant
> and reducing memory usage.
>
> Change mmu_shrink_scan() to free pages from the vCPU's shadow page
> cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
> VM's performance should not be affected.
>
> This also allows to change cache capacities without worrying too much
> about high memory usage in cache.
>
> Tested this change by running dirty_log_perf_test while dropping cache
> via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> logs from kvm_mmu_memory_cache_alloc(), which is expected.
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   5 +
>  arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
>  arch/x86/kvm/mmu/mmu_internal.h |   2 +
>  arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
>  include/linux/kvm_host.h        |   1 +
>  virt/kvm/kvm_main.c             |  11 ++-
>  6 files changed, 114 insertions(+), 71 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index aa4eb8cfcd7e..89cc809e4a00 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -786,6 +786,11 @@ struct kvm_vcpu_arch {
>         struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>         struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> +       /*
> +        * Protects change in size of mmu_shadow_page_cache cache.
> +        */
> +       spinlock_t mmu_shadow_page_cache_lock;
> +
>         /*
>          * QEMU userspace and the guest each have their own FPU state.
>          * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 254bc46234e0..157417e1cb6e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -164,7 +164,10 @@ struct kvm_shadow_walk_iterator {
>
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
> -static struct percpu_counter kvm_total_used_mmu_pages;
> +/*
> + * Total number of unused pages in MMU shadow page cache.
> + */
> +static struct percpu_counter kvm_total_unused_mmu_pages;
>
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>
> @@ -655,6 +658,22 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>         }
>  }
>
> +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +                                    spinlock_t *cache_lock)
> +{
> +       int orig_nobjs;
> +       int r;
> +
> +       spin_lock(cache_lock);
> +       orig_nobjs = cache->nobjs;
> +       r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
> +       if (orig_nobjs != cache->nobjs)
> +               percpu_counter_add(&kvm_total_unused_mmu_pages,
> +                                  (cache->nobjs - orig_nobjs));
> +       spin_unlock(cache_lock);
> +       return r;
> +}
> +
>  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  {
>         int r;
> @@ -664,8 +683,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>         if (r)
>                 return r;
> -       r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -                                      PT64_ROOT_MAX_LEVEL);
> +       r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> +                                     &vcpu->arch.mmu_shadow_page_cache_lock);
>         if (r)
>                 return r;
>         if (maybe_indirect) {
> @@ -678,10 +697,25 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>                                           PT64_ROOT_MAX_LEVEL);
>  }
>
> +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +                                    spinlock_t *cache_lock)
> +{
> +       int orig_nobjs;
> +
> +       spin_lock(cache_lock);
> +       orig_nobjs = cache->nobjs;
> +       kvm_mmu_free_memory_cache(cache);
> +       if (orig_nobjs)
> +               percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> +
> +       spin_unlock(cache_lock);
> +}

I think the mmu_cache allocation and deallocation may lead to the use
of GFP_ATOMIC (as observed by other reviewers as well). Adding a new
lock would definitely be an option, but I think it might affect
performance. Alternatively, I am wondering if we could use an
mmu_cache_sequence, similar to mmu_notifier_seq, to avoid the
concurrency issue?

Similar to mmu_notifier_seq, mmu_cache_sequence should be protected by
the mmu write lock. In the page fault path, each vCPU has to take a
snapshot of mmu_cache_sequence before calling into
mmu_topup_memory_caches() and check the value again while holding the
mmu lock. If the value is different, that means the mmu shrinker has
removed the cache objects and, because of that, the vCPU should retry.
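
Roughly, with hypothetical helpers (mmu_cache_seq is a made-up field,
mirroring how mmu_notifier_seq is snapshotted and rechecked today; not
part of this patch):

/* Shrinker side: bump the sequence before emptying the caches. */
static void mmu_cache_seq_inc(struct kvm *kvm)
{
        lockdep_assert_held_write(&kvm->mmu_lock);
        kvm->arch.mmu_cache_seq++;      /* hypothetical field */
}

/* Fault side: after taking mmu_lock, detect a concurrent shrink. */
static bool mmu_cache_seq_changed(struct kvm *kvm, unsigned long snapshot)
{
        lockdep_assert_held(&kvm->mmu_lock);
        return kvm->arch.mmu_cache_seq != snapshot;
}

The vCPU would snapshot kvm->arch.mmu_cache_seq before calling
mmu_topup_memory_caches() and return RET_PF_RETRY from the fault
handler if mmu_cache_seq_changed() once it is under the mmu lock.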


> +
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +       mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> +                                &vcpu->arch.mmu_shadow_page_cache_lock);
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
>         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -1693,27 +1727,15 @@ static int is_empty_shadow_page(u64 *spt)
>  }
>  #endif
>
> -/*
> - * This value is the sum of all of the kvm instances's
> - * kvm->arch.n_used_mmu_pages values.  We need a global,
> - * aggregate version in order to make the slab shrinker
> - * faster
> - */
> -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> -{
> -       kvm->arch.n_used_mmu_pages += nr;
> -       percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> -}
> -
>  static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       kvm_mod_used_mmu_pages(kvm, +1);
> +       kvm->arch.n_used_mmu_pages++;
>         kvm_account_pgtable_pages((void *)sp->spt, +1);
>  }
>
>  static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       kvm_mod_used_mmu_pages(kvm, -1);
> +       kvm->arch.n_used_mmu_pages--;
>         kvm_account_pgtable_pages((void *)sp->spt, -1);
>  }
>
> @@ -2150,8 +2172,31 @@ struct shadow_page_caches {
>         struct kvm_mmu_memory_cache *page_header_cache;
>         struct kvm_mmu_memory_cache *shadow_page_cache;
>         struct kvm_mmu_memory_cache *shadowed_info_cache;
> +       /*
> +        * Protects change in size of shadow_page_cache cache.
> +        */
> +       spinlock_t *shadow_page_cache_lock;
>  };
>
> +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +                                   spinlock_t *cache_lock)
> +{
> +       int orig_nobjs;
> +       void *page;
> +
> +       if (!cache_lock) {
> +               spin_lock(cache_lock);
> +               orig_nobjs = shadow_page_cache->nobjs;
> +       }
> +       page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
> +       if (!cache_lock) {
> +               if (orig_nobjs)
> +                       percpu_counter_dec(&kvm_total_unused_mmu_pages);
> +               spin_unlock(cache_lock);
> +       }
> +       return page;
> +}
> +
>  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>                                                       struct shadow_page_caches *caches,
>                                                       gfn_t gfn,
> @@ -2161,7 +2206,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
>         struct kvm_mmu_page *sp;
>
>         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> -       sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> +       sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> +                                               caches->shadow_page_cache_lock);
>         if (!role.direct)
>                 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
>
> @@ -2218,6 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
>                 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
>                 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
>                 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> +               .shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
>         };
>
>         return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> @@ -5916,6 +5963,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>
>         vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +       spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
>
>         vcpu->arch.mmu = &vcpu->arch.root_mmu;
>         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6051,11 +6099,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
>                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
>  }
>
> -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> -{
> -       return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> -}
> -
>  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
>                         struct kvm_memory_slot *slot,
>                         struct kvm_page_track_notifier_node *node)
> @@ -6277,6 +6320,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
>         /* Direct SPs do not require a shadowed_info_cache. */
>         caches.page_header_cache = &kvm->arch.split_page_header_cache;
>         caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> +       caches.shadow_page_cache_lock = NULL;
>
>         /* Safe to pass NULL for vCPU since requesting a direct SP. */
>         return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
>  static unsigned long
>  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -       struct kvm *kvm;
> -       int nr_to_scan = sc->nr_to_scan;
> +       struct kvm_mmu_memory_cache *cache;
> +       struct kvm *kvm, *first_kvm = NULL;
>         unsigned long freed = 0;
> +       /* spinlock for memory cache */
> +       spinlock_t *cache_lock;
> +       struct kvm_vcpu *vcpu;
> +       unsigned long i;
>
>         mutex_lock(&kvm_lock);
>
>         list_for_each_entry(kvm, &vm_list, vm_list) {
> -               int idx;
> -               LIST_HEAD(invalid_list);
> -
> -               /*
> -                * Never scan more than sc->nr_to_scan VM instances.
> -                * Will not hit this condition practically since we do not try
> -                * to shrink more than one VM and it is very unlikely to see
> -                * !n_used_mmu_pages so many times.
> -                */
> -               if (!nr_to_scan--)
> +               if (first_kvm == kvm)
>                         break;
> -               /*
> -                * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> -                * here. We may skip a VM instance errorneosly, but we do not
> -                * want to shrink a VM that only started to populate its MMU
> -                * anyway.
> -                */
> -               if (!kvm->arch.n_used_mmu_pages &&
> -                   !kvm_has_zapped_obsolete_pages(kvm))
> -                       continue;
> +               if (!first_kvm)
> +                       first_kvm = kvm;
> +               list_move_tail(&kvm->vm_list, &vm_list);
>
> -               idx = srcu_read_lock(&kvm->srcu);
> -               write_lock(&kvm->mmu_lock);
> +               kvm_for_each_vcpu(i, vcpu, kvm) {
> +                       cache = &vcpu->arch.mmu_shadow_page_cache;
> +                       cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
> +                       if (READ_ONCE(cache->nobjs)) {
> +                               spin_lock(cache_lock);
> +                               freed += kvm_mmu_empty_memory_cache(cache);
> +                               spin_unlock(cache_lock);
> +                       }
>
> -               if (kvm_has_zapped_obsolete_pages(kvm)) {
> -                       kvm_mmu_commit_zap_page(kvm,
> -                             &kvm->arch.zapped_obsolete_pages);
> -                       goto unlock;
>                 }
>
> -               freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
> -
> -unlock:
> -               write_unlock(&kvm->mmu_lock);
> -               srcu_read_unlock(&kvm->srcu, idx);
> -
> -               /*
> -                * unfair on small ones
> -                * per-vm shrinkers cry out
> -                * sadness comes quickly
> -                */
> -               list_move_tail(&kvm->vm_list, &vm_list);
> -               break;
> +               if (freed >= sc->nr_to_scan)
> +                       break;
>         }
>
> +       if (freed)
> +               percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
>         mutex_unlock(&kvm_lock);
> +       percpu_counter_sync(&kvm_total_unused_mmu_pages);
>         return freed;
>  }
>
>  static unsigned long
>  mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -       return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
> +       return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);
>  }
>
>  static struct shrinker mmu_shrinker = {
> @@ -6820,7 +6847,7 @@ int kvm_mmu_vendor_module_init(void)
>         if (!mmu_page_header_cache)
>                 goto out;
>
> -       if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
> +       if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
>                 goto out;
>
>         ret = register_shrinker(&mmu_shrinker, "x86-mmu");
> @@ -6830,7 +6857,7 @@ int kvm_mmu_vendor_module_init(void)
>         return 0;
>
>  out_shrinker:
> -       percpu_counter_destroy(&kvm_total_used_mmu_pages);
> +       percpu_counter_destroy(&kvm_total_unused_mmu_pages);
>  out:
>         mmu_destroy_caches();
>         return ret;
> @@ -6847,7 +6874,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
>  void kvm_mmu_vendor_module_exit(void)
>  {
>         mmu_destroy_caches();
> -       percpu_counter_destroy(&kvm_total_used_mmu_pages);
> +       percpu_counter_destroy(&kvm_total_unused_mmu_pages);
>         unregister_shrinker(&mmu_shrinker);
>  }
>
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ac00bfbf32f6..c2a342028b6a 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -325,4 +325,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>
> +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +                                   spinlock_t *cache_lock);
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 764f7c87286f..4974fa96deff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -264,7 +264,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>         struct kvm_mmu_page *sp;
>
>         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -       sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +       sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> +                                               &vcpu->arch.mmu_shadow_page_cache_lock);
>
>         return sp;
>  }
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 01aad8b74162..efd9b38ea9a2 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1362,6 +1362,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
>  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
>  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 13e88297f999..f2d762878b97 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -438,8 +438,10 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>         return mc->nobjs;
>  }
>
> -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
>  {
> +       int freed = mc->nobjs;
> +
>         while (mc->nobjs) {
>                 if (mc->kmem_cache)
>                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
> @@ -447,8 +449,13 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
>                         free_page((unsigned long)mc->objects[--mc->nobjs]);
>         }
>
> -       kvfree(mc->objects);
> +       return freed;
> +}
>
> +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +{
> +       kvm_mmu_empty_memory_cache(mc);
> +       kvfree(mc->objects);
>         mc->objects = NULL;
>         mc->capacity = 0;
>  }
> --
> 2.39.0.314.g84b9a713c41-goog
>
Vipin Sharma Jan. 4, 2023, 12:25 a.m. UTC | #8
On Tue, Jan 3, 2023 at 10:01 AM Vipin Sharma <vipinsh@google.com> wrote:
>
> On Thu, Dec 29, 2022 at 1:55 PM David Matlack <dmatlack@google.com> wrote:
> >
> > On Wed, Dec 21, 2022 at 06:34:49PM -0800, Vipin Sharma wrote:
> > > mmu_shrink_scan() is very disruptive to VMs. It picks the first
> > > VM in the vm_list, zaps the oldest page which is most likely an upper
> > > level SPTEs and most like to be reused. Prior to TDP MMU, this is even
> > > more disruptive in nested VMs case, considering L1 SPTEs will be the
> > > oldest even though most of the entries are for L2 SPTEs.
> > >
> > > As discussed in
> > > https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com/
> > > shrinker logic has not be very useful in actually keeping VMs performant
> > > and reducing memory usage.
> > >
> > > Change mmu_shrink_scan() to free pages from the vCPU's shadow page
> > > cache.  Freeing pages from cache doesn't cause vCPU exits, therefore, a
> > > VM's performance should not be affected.
> >
> > Can you split this commit up? e.g. First drop the old shrinking logic in
> > one commit (but leave the shrinking infrastructure in place). Then a
> > commit to make the shrinker free the per-vCPU shadow page caches. And
> > then perhaps another to make the shrinker free the per-VM shadow page
> > cache used for eager splitting.
> >
>
> Sounds good, I will separate it in two parts, one for dropping old
> logic, one for adding per vcpu shadow page caches. Patch 3 is enabling
> shrinkerto free per-VM shadow page.
>
> > >
> > > This also allows to change cache capacities without worrying too much
> > > about high memory usage in cache.
> > >
> > > Tested this change by running dirty_log_perf_test while dropping cache
> > > via "echo 2 > /proc/sys/vm/drop_caches" at 1 second interval
> > > continuously. There were WARN_ON(!mc->nobjs) messages printed in kernel
> > > logs from kvm_mmu_memory_cache_alloc(), which is expected.
> > >
> > > Suggested-by: Sean Christopherson <seanjc@google.com>
> > > Signed-off-by: Vipin Sharma <vipinsh@google.com>
> > > ---
> > >  arch/x86/include/asm/kvm_host.h |   5 +
> > >  arch/x86/kvm/mmu/mmu.c          | 163 +++++++++++++++++++-------------
> > >  arch/x86/kvm/mmu/mmu_internal.h |   2 +
> > >  arch/x86/kvm/mmu/tdp_mmu.c      |   3 +-
> > >  include/linux/kvm_host.h        |   1 +
> > >  virt/kvm/kvm_main.c             |  11 ++-
> > >  6 files changed, 114 insertions(+), 71 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index aa4eb8cfcd7e..89cc809e4a00 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -786,6 +786,11 @@ struct kvm_vcpu_arch {
> > >       struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
> > >       struct kvm_mmu_memory_cache mmu_page_header_cache;
> > >
> > > +     /*
> > > +      * Protects change in size of mmu_shadow_page_cache cache.
> > > +      */
> > > +     spinlock_t mmu_shadow_page_cache_lock;
> > > +
> > >       /*
> > >        * QEMU userspace and the guest each have their own FPU state.
> > >        * In vcpu_run, we switch between the user and guest FPU contexts.
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 254bc46234e0..157417e1cb6e 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -164,7 +164,10 @@ struct kvm_shadow_walk_iterator {
> > >
> > >  static struct kmem_cache *pte_list_desc_cache;
> > >  struct kmem_cache *mmu_page_header_cache;
> > > -static struct percpu_counter kvm_total_used_mmu_pages;
> > > +/*
> > > + * Total number of unused pages in MMU shadow page cache.
> > > + */
> > > +static struct percpu_counter kvm_total_unused_mmu_pages;
> > >
> > >  static void mmu_spte_set(u64 *sptep, u64 spte);
> > >
> > > @@ -655,6 +658,22 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> > >       }
> > >  }
> > >
> > > +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > > +                                  spinlock_t *cache_lock)
> > > +{
> > > +     int orig_nobjs;
> > > +     int r;
> > > +
> > > +     spin_lock(cache_lock);
> > > +     orig_nobjs = cache->nobjs;
> > > +     r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
> > > +     if (orig_nobjs != cache->nobjs)
> > > +             percpu_counter_add(&kvm_total_unused_mmu_pages,
> > > +                                (cache->nobjs - orig_nobjs));
> > > +     spin_unlock(cache_lock);
> > > +     return r;
> > > +}
> > > +
> > >  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> > >  {
> > >       int r;
> > > @@ -664,8 +683,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> > >                                      1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
> > >       if (r)
> > >               return r;
> > > -     r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > > -                                    PT64_ROOT_MAX_LEVEL);
> > > +     r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > > +                                   &vcpu->arch.mmu_shadow_page_cache_lock);
> > >       if (r)
> > >               return r;
> > >       if (maybe_indirect) {
> > > @@ -678,10 +697,25 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> > >                                         PT64_ROOT_MAX_LEVEL);
> > >  }
> > >
> > > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > > +                                  spinlock_t *cache_lock)
> > > +{
> > > +     int orig_nobjs;
> > > +
> > > +     spin_lock(cache_lock);
> > > +     orig_nobjs = cache->nobjs;
> > > +     kvm_mmu_free_memory_cache(cache);
> > > +     if (orig_nobjs)
> > > +             percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> > > +
> > > +     spin_unlock(cache_lock);
> > > +}
> >
> > It would be nice to avoid adding these wrapper functions.
> >
> > Once you add a mutex to protect the caches from being freed while vCPUs
> > are in the middle of a page fault you can drop the spin lock. After that
> > the only reason to have these wrappers is to update
> > kvm_total_unused_mmu_pages.
> >
> > Do we really need kvm_total_unused_mmu_pages? Why not just dynamically
> > calculate the number of of unused pages in mmu_shrink_count()? Or just
> > estimate the count, e.g. num_vcpus * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE?
> > Or have per-VM or per-vCPU shrinkers to avoid needing to do any
> > aggregation?
> >
>
> I think we can drop this, by default we can return num_kvms *
> num_vcpus * nodes * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
>
> Whenever mmu_shrink_scan() is called if there are no pages to free
> then return SHRINK_STOP which will stop any subsequent calls during
> that time.
>
>
> > > +
> > >  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> > >  {
> > >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> > > -     kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > > +     mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> > > +                              &vcpu->arch.mmu_shadow_page_cache_lock);
> > >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
> >
> > mmu_shadowed_info_cache can be freed by the shrinker as well.
> >

Yes, I can do that as well.
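
That is, the shrinker's per-vCPU loop would drain both caches, e.g.
(sketch; whatever locking scheme we settle on would cover both):

kvm_for_each_vcpu(i, vcpu, kvm) {
        freed += kvm_mmu_empty_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
        freed += kvm_mmu_empty_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
}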

> > >       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
> > >  }
> > > @@ -1693,27 +1727,15 @@ static int is_empty_shadow_page(u64 *spt)
> > >  }
> > >  #endif
> > >
> > > -/*
> > > - * This value is the sum of all of the kvm instances's
> > > - * kvm->arch.n_used_mmu_pages values.  We need a global,
> > > - * aggregate version in order to make the slab shrinker
> > > - * faster
> > > - */
> > > -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
> > > -{
> > > -     kvm->arch.n_used_mmu_pages += nr;
> > > -     percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> > > -}
> > > -
> > >  static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> > >  {
> > > -     kvm_mod_used_mmu_pages(kvm, +1);
> > > +     kvm->arch.n_used_mmu_pages++;
> > >       kvm_account_pgtable_pages((void *)sp->spt, +1);
> > >  }
> > >
> > >  static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> > >  {
> > > -     kvm_mod_used_mmu_pages(kvm, -1);
> > > +     kvm->arch.n_used_mmu_pages--;
> > >       kvm_account_pgtable_pages((void *)sp->spt, -1);
> > >  }
> > >
> > > @@ -2150,8 +2172,31 @@ struct shadow_page_caches {
> > >       struct kvm_mmu_memory_cache *page_header_cache;
> > >       struct kvm_mmu_memory_cache *shadow_page_cache;
> > >       struct kvm_mmu_memory_cache *shadowed_info_cache;
> > > +     /*
> > > +      * Protects change in size of shadow_page_cache cache.
> > > +      */
> > > +     spinlock_t *shadow_page_cache_lock;
> > >  };
> > >
> > > +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > > +                                 spinlock_t *cache_lock)
> > > +{
> > > +     int orig_nobjs;
> > > +     void *page;
> > > +
> > > +     if (!cache_lock) {
> > > +             spin_lock(cache_lock);
> > > +             orig_nobjs = shadow_page_cache->nobjs;
> > > +     }
> > > +     page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
> > > +     if (!cache_lock) {
> > > +             if (orig_nobjs)
> > > +                     percpu_counter_dec(&kvm_total_unused_mmu_pages);
> > > +             spin_unlock(cache_lock);
> > > +     }
> > > +     return page;
> > > +}
> > > +
> > >  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> > >                                                     struct shadow_page_caches *caches,
> > >                                                     gfn_t gfn,
> > > @@ -2161,7 +2206,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
> > >       struct kvm_mmu_page *sp;
> > >
> > >       sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
> > > -     sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
> > > +     sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
> > > +                                             caches->shadow_page_cache_lock);
> > >       if (!role.direct)
> > >               sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
> > >
> > > @@ -2218,6 +2264,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
> > >               .page_header_cache = &vcpu->arch.mmu_page_header_cache,
> > >               .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
> > >               .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> > > +             .shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
> > >       };
> > >
> > >       return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
> > > @@ -5916,6 +5963,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
> > >       vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
> > >
> > >       vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> > > +     spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
> > >
> > >       vcpu->arch.mmu = &vcpu->arch.root_mmu;
> > >       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> > > @@ -6051,11 +6099,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
> > >               kvm_tdp_mmu_zap_invalidated_roots(kvm);
> > >  }
> > >
> > > -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> > > -{
> > > -     return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> > > -}
> > > -
> > >  static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
> > >                       struct kvm_memory_slot *slot,
> > >                       struct kvm_page_track_notifier_node *node)
> > > @@ -6277,6 +6320,7 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
> > >       /* Direct SPs do not require a shadowed_info_cache. */
> > >       caches.page_header_cache = &kvm->arch.split_page_header_cache;
> > >       caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> > > +     caches.shadow_page_cache_lock = NULL;
> > >
> > >       /* Safe to pass NULL for vCPU since requesting a direct SP. */
> > >       return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> > > @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
> > >  static unsigned long
> > >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > >  {
> > > -     struct kvm *kvm;
> > > -     int nr_to_scan = sc->nr_to_scan;
> > > +     struct kvm_mmu_memory_cache *cache;
> > > +     struct kvm *kvm, *first_kvm = NULL;
> > >       unsigned long freed = 0;
> > > +     /* spinlock for memory cache */
> > > +     spinlock_t *cache_lock;
> > > +     struct kvm_vcpu *vcpu;
> > > +     unsigned long i;
> > >
> > >       mutex_lock(&kvm_lock);
> > >
> > >       list_for_each_entry(kvm, &vm_list, vm_list) {
> > > -             int idx;
> > > -             LIST_HEAD(invalid_list);
> > > -
> > > -             /*
> > > -              * Never scan more than sc->nr_to_scan VM instances.
> > > -              * Will not hit this condition practically since we do not try
> > > -              * to shrink more than one VM and it is very unlikely to see
> > > -              * !n_used_mmu_pages so many times.
> > > -              */
> > > -             if (!nr_to_scan--)
> > > +             if (first_kvm == kvm)
> > >                       break;
> > > -             /*
> > > -              * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> > > -              * here. We may skip a VM instance errorneosly, but we do not
> > > -              * want to shrink a VM that only started to populate its MMU
> > > -              * anyway.
> > > -              */
> > > -             if (!kvm->arch.n_used_mmu_pages &&
> > > -                 !kvm_has_zapped_obsolete_pages(kvm))
> > > -                     continue;
> > > +             if (!first_kvm)
> > > +                     first_kvm = kvm;
> > > +             list_move_tail(&kvm->vm_list, &vm_list);
> > >
> > > -             idx = srcu_read_lock(&kvm->srcu);
> > > -             write_lock(&kvm->mmu_lock);
> > > +             kvm_for_each_vcpu(i, vcpu, kvm) {
> >
> > What protects this from racing with vCPU creation/deletion?
> >

vCPU deletion:
We take kvm_lock in mmu_shrink_scan(), and the same lock is taken in
kvm_destroy_vm() to remove a VM from vm_list. So, while we are
iterating vm_list we will not see any VM removal, which means no vCPU
removal either.

I didn't find any other code path for vCPU deletion except failures
during VM and vCPU setup. A VM is only added to vm_list after
successful creation.

vCPU creation:
I think it will work.

kvm_vm_ioctl_create_vcpu() initializes the vCPU and adds it to
kvm->vcpu_array, which is an xarray managed via RCU. Only after that is
online_vcpus incremented. So, since kvm_for_each_vcpu() uses RCU to
read entries, if it sees the incremented online_vcpus value then it
will also see the fully initialized vCPU.

@Sean, Paolo

Is the above explanation correct, i.e. is kvm_for_each_vcpu() safe without any lock?

> > > +                     cache = &vcpu->arch.mmu_shadow_page_cache;
> > > +                     cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
> > > +                     if (READ_ONCE(cache->nobjs)) {
> > > +                             spin_lock(cache_lock);
> > > +                             freed += kvm_mmu_empty_memory_cache(cache);
> > > +                             spin_unlock(cache_lock);
> > > +                     }
> >
> > What about freeing kvm->arch.split_shadow_page_cache as well?
> >

I am doing this in patch 3.

> > >
> > > -             if (kvm_has_zapped_obsolete_pages(kvm)) {
> > > -                     kvm_mmu_commit_zap_page(kvm,
> > > -                           &kvm->arch.zapped_obsolete_pages);
> > > -                     goto unlock;
> > >               }
> > >
> > > -             freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
> > > -
> > > -unlock:
> > > -             write_unlock(&kvm->mmu_lock);
> > > -             srcu_read_unlock(&kvm->srcu, idx);
> > > -
> > > -             /*
> > > -              * unfair on small ones
> > > -              * per-vm shrinkers cry out
> > > -              * sadness comes quickly
> > > -              */
> > > -             list_move_tail(&kvm->vm_list, &vm_list);
> > > -             break;
> > > +             if (freed >= sc->nr_to_scan)
> > > +                     break;
> > >       }
> > >
> > > +     if (freed)
> > > +             percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
> > >       mutex_unlock(&kvm_lock);
> > > +     percpu_counter_sync(&kvm_total_unused_mmu_pages);
> > >       return freed;
> > >  }
> > >
> > >  static unsigned long
> > >  mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> > >  {
> > > -     return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
> > > +     return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);
> > >  }
> > >
> > >  static struct shrinker mmu_shrinker = {
> > > @@ -6820,7 +6847,7 @@ int kvm_mmu_vendor_module_init(void)
> > >       if (!mmu_page_header_cache)
> > >               goto out;
> > >
> > > -     if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
> > > +     if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
> > >               goto out;
> > >
> > >       ret = register_shrinker(&mmu_shrinker, "x86-mmu");
> > > @@ -6830,7 +6857,7 @@ int kvm_mmu_vendor_module_init(void)
> > >       return 0;
> > >
> > >  out_shrinker:
> > > -     percpu_counter_destroy(&kvm_total_used_mmu_pages);
> > > +     percpu_counter_destroy(&kvm_total_unused_mmu_pages);
> > >  out:
> > >       mmu_destroy_caches();
> > >       return ret;
> > > @@ -6847,7 +6874,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
> > >  void kvm_mmu_vendor_module_exit(void)
> > >  {
> > >       mmu_destroy_caches();
> > > -     percpu_counter_destroy(&kvm_total_used_mmu_pages);
> > > +     percpu_counter_destroy(&kvm_total_unused_mmu_pages);
> > >       unregister_shrinker(&mmu_shrinker);
> > >  }
> > >
> > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > > index ac00bfbf32f6..c2a342028b6a 100644
> > > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > > @@ -325,4 +325,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> > >  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> > >  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> > >
> > > +void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> > > +                                 spinlock_t *cache_lock);
> > >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > > index 764f7c87286f..4974fa96deff 100644
> > > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > > @@ -264,7 +264,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
> > >       struct kvm_mmu_page *sp;
> > >
> > >       sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> > > -     sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> > > +     sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> > > +                                             &vcpu->arch.mmu_shadow_page_cache_lock);
> > >
> > >       return sp;
> > >  }
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 01aad8b74162..efd9b38ea9a2 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1362,6 +1362,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
> > >  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
> > >  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
> > >  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> > > +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
> > >  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
> > >  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> > >  #endif
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 13e88297f999..f2d762878b97 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -438,8 +438,10 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
> > >       return mc->nobjs;
> > >  }
> > >
> > > -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > > +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
> > >  {
> > > +     int freed = mc->nobjs;
> > > +
> > >       while (mc->nobjs) {
> > >               if (mc->kmem_cache)
> > >                       kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
> > > @@ -447,8 +449,13 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > >                       free_page((unsigned long)mc->objects[--mc->nobjs]);
> > >       }
> > >
> > > -     kvfree(mc->objects);
> > > +     return freed;
> > > +}
> > >
> > > +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> > > +{
> > > +     kvm_mmu_empty_memory_cache(mc);
> > > +     kvfree(mc->objects);
> > >       mc->objects = NULL;
> > >       mc->capacity = 0;
> > >  }
> > > --
> > > 2.39.0.314.g84b9a713c41-goog
> > >
Vipin Sharma Jan. 4, 2023, 1 a.m. UTC | #9
On Tue, Jan 3, 2023 at 11:32 AM Mingwei Zhang <mizhang@google.com> wrote:
>
> On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
> >
> > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > +                                    spinlock_t *cache_lock)
> > +{
> > +       int orig_nobjs;
> > +
> > +       spin_lock(cache_lock);
> > +       orig_nobjs = cache->nobjs;
> > +       kvm_mmu_free_memory_cache(cache);
> > +       if (orig_nobjs)
> > +               percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> > +
> > +       spin_unlock(cache_lock);
> > +}
>
> I think the mmu_cache allocation and deallocation may cause the usage
> of GFP_ATOMIC (as observed by other reviewers as well). Adding a new
> lock would definitely sound like a plan, but I think it might affect
> the performance. Alternatively, I am wondering if we could use a
> mmu_cache_sequence similar to mmu_notifier_seq to help avoid the
> concurrency?
>

Can you explain more about the performance impact? Each vCPU will have
its own mutex, so the only contention will be with the mmu_shrinker.
The shrinker will use mutex_trylock(), which will not block waiting for
the lock; it will just move on to the next vCPU. While the shrinker is
holding the lock, the vCPU will be blocked in the page fault path, but
I think that should not have a huge impact considering the shrinker
executes rarely and only for a short time.

> Similar to mmu_notifier_seq, mmu_cache_sequence should be protected by
> mmu write lock. In the page fault path, each vcpu has to collect a
> snapshot of  mmu_cache_sequence before calling into
> mmu_topup_memory_caches() and check the value again when holding the
> mmu lock. If the value is different, that means the mmu_shrinker has
> removed the cache objects and because of that, the vcpu should retry.
>

Yeah, this can be one approach. I think it will come down to the
performance impact of using a mutex, which I don't think should be a
concern.
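
For reference, a rough sketch of how the sequence counter approach
could look in the fault path (mmu_cache_seq is a hypothetical per-VM
field and the function name is made up; neither is part of this
series):

static int page_fault_with_cache_seq(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long seq;
	int r;

retry:
	/* Snapshot the counter before topping up the caches. */
	seq = smp_load_acquire(&kvm->arch.mmu_cache_seq);

	r = mmu_topup_memory_caches(vcpu, maybe_indirect);
	if (r)
		return r;

	write_lock(&kvm->mmu_lock);
	if (seq != kvm->arch.mmu_cache_seq) {
		/*
		 * The shrinker emptied the caches (bumping the counter
		 * under mmu_lock) after the topup; refill and retry.
		 */
		write_unlock(&kvm->mmu_lock);
		goto retry;
	}

	/* ... install SPTEs using the topped-up caches ... */

	write_unlock(&kvm->mmu_lock);
	return 0;
}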
Mingwei Zhang Jan. 4, 2023, 6:29 a.m. UTC | #10
On Tue, Jan 3, 2023 at 5:00 PM Vipin Sharma <vipinsh@google.com> wrote:
>
> On Tue, Jan 3, 2023 at 11:32 AM Mingwei Zhang <mizhang@google.com> wrote:
> >
> > On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
> > >
> > > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > > +                                    spinlock_t *cache_lock)
> > > +{
> > > +       int orig_nobjs;
> > > +
> > > +       spin_lock(cache_lock);
> > > +       orig_nobjs = cache->nobjs;
> > > +       kvm_mmu_free_memory_cache(cache);
> > > +       if (orig_nobjs)
> > > +               percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> > > +
> > > +       spin_unlock(cache_lock);
> > > +}
> >
> > I think the mmu_cache allocation and deallocation may cause the usage
> > of GFP_ATOMIC (as observed by other reviewers as well). Adding a new
> > lock would definitely sound like a plan, but I think it might affect
> > the performance. Alternatively, I am wondering if we could use a
> > mmu_cache_sequence similar to mmu_notifier_seq to help avoid the
> > concurrency?
> >
>
> Can you explain more about the performance impact? Each vcpu will have
> its own mutex. So, only contention will be with the mmu_shrinker. This
> shrinker will use mutex_try_lock() which will not block to wait for
> the lock, it will just pass on to the next vcpu. While shrinker is
> holding the lock, vcpu will be blocked in the page fault path but I
> think it should not have a huge impact considering it will execute
> rarely and for a small time.
>
> > Similar to mmu_notifier_seq, mmu_cache_sequence should be protected by
> > mmu write lock. In the page fault path, each vcpu has to collect a
> > snapshot of  mmu_cache_sequence before calling into
> > mmu_topup_memory_caches() and check the value again when holding the
> > mmu lock. If the value is different, that means the mmu_shrinker has
> > removed the cache objects and because of that, the vcpu should retry.
> >
>
> Yeah, this can be one approach. I think it will come down to the
> performance impact of using mutex which I don't think should be a
> concern.

Hmm, I think you are right that there is no performance overhead to
adding a mutex and letting the shrinker use mutex_trylock(). The point
of using a sequence counter is to avoid the new lock, since introducing
a new lock will increase the management burden. So unless it is
necessary, we should probably choose the simple solution first.

In this case, I think we do have such a choice, since a similar
mechanism has already been used by mmu_notifiers.

best
-Mingwei
Mingwei Zhang Jan. 4, 2023, 6:57 a.m. UTC | #11
On Tue, Jan 3, 2023 at 10:29 PM Mingwei Zhang <mizhang@google.com> wrote:
>
> On Tue, Jan 3, 2023 at 5:00 PM Vipin Sharma <vipinsh@google.com> wrote:
> >
> > On Tue, Jan 3, 2023 at 11:32 AM Mingwei Zhang <mizhang@google.com> wrote:
> > >
> > > On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <vipinsh@google.com> wrote:
> > > >
> > > > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> > > > +                                    spinlock_t *cache_lock)
> > > > +{
> > > > +       int orig_nobjs;
> > > > +
> > > > +       spin_lock(cache_lock);
> > > > +       orig_nobjs = cache->nobjs;
> > > > +       kvm_mmu_free_memory_cache(cache);
> > > > +       if (orig_nobjs)
> > > > +               percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
> > > > +
> > > > +       spin_unlock(cache_lock);
> > > > +}
> > >
> > > I think the mmu_cache allocation and deallocation may cause the usage
> > > of GFP_ATOMIC (as observed by other reviewers as well). Adding a new
> > > lock would definitely sound like a plan, but I think it might affect
> > > the performance. Alternatively, I am wondering if we could use a
> > > mmu_cache_sequence similar to mmu_notifier_seq to help avoid the
> > > concurrency?
> > >
> >
> > Can you explain more about the performance impact? Each vcpu will have
> > its own mutex. So, only contention will be with the mmu_shrinker. This
> > shrinker will use mutex_try_lock() which will not block to wait for
> > the lock, it will just pass on to the next vcpu. While shrinker is
> > holding the lock, vcpu will be blocked in the page fault path but I
> > think it should not have a huge impact considering it will execute
> > rarely and for a small time.
> >
> > > Similar to mmu_notifier_seq, mmu_cache_sequence should be protected by
> > > mmu write lock. In the page fault path, each vcpu has to collect a
> > > snapshot of  mmu_cache_sequence before calling into
> > > mmu_topup_memory_caches() and check the value again when holding the
> > > mmu lock. If the value is different, that means the mmu_shrinker has
> > > removed the cache objects and because of that, the vcpu should retry.
> > >
> >
> > Yeah, this can be one approach. I think it will come down to the
> > performance impact of using mutex which I don't think should be a
> > concern.
>
> hmm, I think you are right that there is no performance overhead by
> adding a mutex and letting the shrinker using mutex_trylock(). The
> point of using a sequence counter is to avoid the new lock, since
> introducing a new lock will increase management burden. So unless it
> is necessary, we probably should choose a simple solution first.
>
> In this case, I think we do have such a choice and since a similar
> mechanism has already been used by mmu_notifiers.
>

Let me take that back. The per-vCPU sequence number in this case has to
be protected by the VM-level mmu write lock, so I think this might be
less performant than using a per-vCPU mutex.
Sean Christopherson Jan. 18, 2023, 5:36 p.m. UTC | #12
On Tue, Jan 03, 2023, Mingwei Zhang wrote:
> On Tue, Jan 3, 2023 at 5:00 PM Vipin Sharma <vipinsh@google.com> wrote:
> > > I think the mmu_cache allocation and deallocation may cause the usage
> > > of GFP_ATOMIC (as observed by other reviewers as well). Adding a new
> > > lock would definitely sound like a plan, but I think it might affect
> > > the performance. Alternatively, I am wondering if we could use a
> > > mmu_cache_sequence similar to mmu_notifier_seq to help avoid the
> > > concurrency?
> > >
> >
> > Can you explain more about the performance impact? Each vcpu will have
> > its own mutex. So, only contention will be with the mmu_shrinker. This
> > shrinker will use mutex_try_lock() which will not block to wait for
> > the lock, it will just pass on to the next vcpu. While shrinker is
> > holding the lock, vcpu will be blocked in the page fault path but I
> > think it should not have a huge impact considering it will execute
> > rarely and for a small time.
> >
> > > Similar to mmu_notifier_seq, mmu_cache_sequence should be protected by
> > > mmu write lock. In the page fault path, each vcpu has to collect a
> > > snapshot of  mmu_cache_sequence before calling into
> > > mmu_topup_memory_caches() and check the value again when holding the
> > > mmu lock. If the value is different, that means the mmu_shrinker has
> > > removed the cache objects and because of that, the vcpu should retry.
> > >
> >
> > Yeah, this can be one approach. I think it will come down to the
> > performance impact of using mutex which I don't think should be a
> > concern.
> 
> hmm, I think you are right that there is no performance overhead by
> adding a mutex and letting the shrinker using mutex_trylock(). The
> point of using a sequence counter is to avoid the new lock, since
> introducing a new lock will increase management burden.

No, more locks don't necessarily mean a higher maintenance cost.  More complexity
definitely means more maintenance, but additional locks don't necessarily equate
to increased complexity.

Lockless algorithms are almost always more difficult to reason about, i.e. trying
to use a sequence counter for this case would be more complex than using a per-vCPU
mutex.  The only complexity in adding another mutex is understanding why an additional
lock is necessary, and IMO that's fairly easy to explain/understand (the shrinker will
almost never succeed if it has to wait for vcpu->mutex to be dropped).

> So unless it is necessary, we probably should choose a simple solution first.
> 
> In this case, I think we do have such a choice and since a similar
> mechanism has already been used by mmu_notifiers.

The mmu_notifier case is very different.  The invalidations affect the entire VM,
notifiers _must_ succeed, may or may not allow sleeping, the readers (vCPUs)
effectively need protection while running in the guest, and practically speaking
holding a per-VM (or global) lock of any kind while a vCPU is running in the guest
is not viable, e.g. even holding kvm->srcu is disallowed.

In other words, using a traditional locking scheme to serialize guest accesses
with host-initiated page table (or memslot) updates is simply not an option.
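
E.g. the shrinker side then boils down to something like this (sketch
only, with mmu_shadow_page_cache_lock as a per-vCPU mutex instead of
the spinlock used in this version of the patch):

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_shadow_page_cache;

		/* Don't wait on a vCPU that is actively faulting, just move on. */
		if (!mutex_trylock(&vcpu->arch.mmu_shadow_page_cache_lock))
			continue;

		freed += kvm_mmu_empty_memory_cache(cache);
		mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);

		if (freed >= sc->nr_to_scan)
			break;
	}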
Sean Christopherson Jan. 18, 2023, 5:43 p.m. UTC | #13
@all, trim your replies!

On Tue, Jan 03, 2023, Vipin Sharma wrote:
> On Tue, Jan 3, 2023 at 10:01 AM Vipin Sharma <vipinsh@google.com> wrote:
> >
> > On Thu, Dec 29, 2022 at 1:55 PM David Matlack <dmatlack@google.com> wrote:
> > > > @@ -6646,66 +6690,49 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
> > > >  static unsigned long
> > > >  mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > > >  {
> > > > -     struct kvm *kvm;
> > > > -     int nr_to_scan = sc->nr_to_scan;
> > > > +     struct kvm_mmu_memory_cache *cache;
> > > > +     struct kvm *kvm, *first_kvm = NULL;
> > > >       unsigned long freed = 0;
> > > > +     /* spinlock for memory cache */
> > > > +     spinlock_t *cache_lock;
> > > > +     struct kvm_vcpu *vcpu;
> > > > +     unsigned long i;
> > > >
> > > >       mutex_lock(&kvm_lock);
> > > >
> > > >       list_for_each_entry(kvm, &vm_list, vm_list) {
> > > > -             int idx;
> > > > -             LIST_HEAD(invalid_list);
> > > > -
> > > > -             /*
> > > > -              * Never scan more than sc->nr_to_scan VM instances.
> > > > -              * Will not hit this condition practically since we do not try
> > > > -              * to shrink more than one VM and it is very unlikely to see
> > > > -              * !n_used_mmu_pages so many times.
> > > > -              */
> > > > -             if (!nr_to_scan--)
> > > > +             if (first_kvm == kvm)
> > > >                       break;
> > > > -             /*
> > > > -              * n_used_mmu_pages is accessed without holding kvm->mmu_lock
> > > > -              * here. We may skip a VM instance errorneosly, but we do not
> > > > -              * want to shrink a VM that only started to populate its MMU
> > > > -              * anyway.
> > > > -              */
> > > > -             if (!kvm->arch.n_used_mmu_pages &&
> > > > -                 !kvm_has_zapped_obsolete_pages(kvm))
> > > > -                     continue;
> > > > +             if (!first_kvm)
> > > > +                     first_kvm = kvm;
> > > > +             list_move_tail(&kvm->vm_list, &vm_list);
> > > >
> > > > -             idx = srcu_read_lock(&kvm->srcu);
> > > > -             write_lock(&kvm->mmu_lock);
> > > > +             kvm_for_each_vcpu(i, vcpu, kvm) {
> > >
> > > What protects this from racing with vCPU creation/deletion?
> > >
> 
> vCPU deletion:
> We take kvm_lock in mmu_shrink_scan(), the same lock is taken in
> kvm_destroy_vm() to remove a vm from vm_list. So, once we are
> iterating vm_list we will not see any VM removal which will means no
> vcpu removal.
> 
> I didn't find any other code for vCPU deletion except failures during
> VM and VCPU set up. A VM is only added to vm_list after successful
> creation.

Yep, KVM doesn't support destroying/freeing a vCPU after it's been added.

> vCPU creation:
> I think it will work.
> 
> kvm_vm_ioctl_create_vcpus() initializes the vcpu, adds it to
> kvm->vcpu_array which is of the type xarray and is managed by RCU.
> After this online_vcpus is incremented. So, kvm_for_each_vcpu() which
> uses RCU to read entries, if it sees incremented online_vcpus value
> then it will also sees all of the vcpu initialization.

Yep.  The shrinker may race with a vCPU creation, e.g. not process a just-created
vCPU, but that's totally ok in this case since the shrinker path is best effort
(and purging the caches of a newly created vCPU is likely pointless).

> @Sean, Paolo
> 
> Is the above explanation correct, kvm_for_each_vcpu() is safe without any lock?

Well, in this case, you do need to hold kvm_lock ;-)

But yes, iterating over vCPUs without holding the per-VM kvm->lock is safe; the
caller just needs to ensure the VM can't be destroyed, i.e. it either needs to hold
a reference to the VM or needs to hold kvm_lock.
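
E.g. either of these is fine (sketches only):

	/* Hold kvm_lock while walking vm_list, as the shrinker does. */
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			/* ... */
		}
	}
	mutex_unlock(&kvm_lock);

	/* Or, with a valid kvm pointer in hand, pin the VM with a reference. */
	kvm_get_kvm(kvm);
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/* ... */
	}
	kvm_put_kvm(kvm);
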
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa4eb8cfcd7e..89cc809e4a00 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -786,6 +786,11 @@  struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	/*
+	 * Protects change in size of mmu_shadow_page_cache cache.
+	 */
+	spinlock_t mmu_shadow_page_cache_lock;
+
 	/*
 	 * QEMU userspace and the guest each have their own FPU state.
 	 * In vcpu_run, we switch between the user and guest FPU contexts.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 254bc46234e0..157417e1cb6e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -164,7 +164,10 @@  struct kvm_shadow_walk_iterator {
 
 static struct kmem_cache *pte_list_desc_cache;
 struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
+/*
+ * Total number of unused pages in MMU shadow page cache.
+ */
+static struct percpu_counter kvm_total_unused_mmu_pages;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 
@@ -655,6 +658,22 @@  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
+				     spinlock_t *cache_lock)
+{
+	int orig_nobjs;
+	int r;
+
+	spin_lock(cache_lock);
+	orig_nobjs = cache->nobjs;
+	r = kvm_mmu_topup_memory_cache(cache, PT64_ROOT_MAX_LEVEL);
+	if (orig_nobjs != cache->nobjs)
+		percpu_counter_add(&kvm_total_unused_mmu_pages,
+				   (cache->nobjs - orig_nobjs));
+	spin_unlock(cache_lock);
+	return r;
+}
+
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 {
 	int r;
@@ -664,8 +683,8 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
 	if (r)
 		return r;
-	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-				       PT64_ROOT_MAX_LEVEL);
+	r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+				      &vcpu->arch.mmu_shadow_page_cache_lock);
 	if (r)
 		return r;
 	if (maybe_indirect) {
@@ -678,10 +697,25 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 					  PT64_ROOT_MAX_LEVEL);
 }
 
+static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
+				     spinlock_t *cache_lock)
+{
+	int orig_nobjs;
+
+	spin_lock(cache_lock);
+	orig_nobjs = cache->nobjs;
+	kvm_mmu_free_memory_cache(cache);
+	if (orig_nobjs)
+		percpu_counter_sub(&kvm_total_unused_mmu_pages, orig_nobjs);
+
+	spin_unlock(cache_lock);
+}
+
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
-	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+	mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+				 &vcpu->arch.mmu_shadow_page_cache_lock);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 }
@@ -1693,27 +1727,15 @@  static int is_empty_shadow_page(u64 *spt)
 }
 #endif
 
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values.  We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
-{
-	kvm->arch.n_used_mmu_pages += nr;
-	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
 static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	kvm_mod_used_mmu_pages(kvm, +1);
+	kvm->arch.n_used_mmu_pages++;
 	kvm_account_pgtable_pages((void *)sp->spt, +1);
 }
 
 static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	kvm_mod_used_mmu_pages(kvm, -1);
+	kvm->arch.n_used_mmu_pages--;
 	kvm_account_pgtable_pages((void *)sp->spt, -1);
 }
 
@@ -2150,8 +2172,31 @@  struct shadow_page_caches {
 	struct kvm_mmu_memory_cache *page_header_cache;
 	struct kvm_mmu_memory_cache *shadow_page_cache;
 	struct kvm_mmu_memory_cache *shadowed_info_cache;
+	/*
+	 * Protects change in size of shadow_page_cache cache.
+	 */
+	spinlock_t *shadow_page_cache_lock;
 };
 
+void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
+				    spinlock_t *cache_lock)
+{
+	int orig_nobjs;
+	void *page;
+
+	if (!cache_lock) {
+		spin_lock(cache_lock);
+		orig_nobjs = shadow_page_cache->nobjs;
+	}
+	page = kvm_mmu_memory_cache_alloc(shadow_page_cache);
+	if (!cache_lock) {
+		if (orig_nobjs)
+			percpu_counter_dec(&kvm_total_unused_mmu_pages);
+		spin_unlock(cache_lock);
+	}
+	return page;
+}
+
 static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
 						      struct shadow_page_caches *caches,
 						      gfn_t gfn,
@@ -2161,7 +2206,8 @@  static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
 	struct kvm_mmu_page *sp;
 
 	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
-	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+	sp->spt = kvm_mmu_sp_memory_cache_alloc(caches->shadow_page_cache,
+						caches->shadow_page_cache_lock);
 	if (!role.direct)
 		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
 
@@ -2218,6 +2264,7 @@  static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
 		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
 		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
 		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+		.shadow_page_cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock
 	};
 
 	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
@@ -5916,6 +5963,7 @@  int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
 
 	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+	spin_lock_init(&vcpu->arch.mmu_shadow_page_cache_lock);
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6051,11 +6099,6 @@  static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 		kvm_tdp_mmu_zap_invalidated_roots(kvm);
 }
 
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
-	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
-}
-
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
@@ -6277,6 +6320,7 @@  static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
 	/* Direct SPs do not require a shadowed_info_cache. */
 	caches.page_header_cache = &kvm->arch.split_page_header_cache;
 	caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
+	caches.shadow_page_cache_lock = NULL;
 
 	/* Safe to pass NULL for vCPU since requesting a direct SP. */
 	return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
@@ -6646,66 +6690,49 @@  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 static unsigned long
 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	struct kvm *kvm;
-	int nr_to_scan = sc->nr_to_scan;
+	struct kvm_mmu_memory_cache *cache;
+	struct kvm *kvm, *first_kvm = NULL;
 	unsigned long freed = 0;
+	/* spinlock for memory cache */
+	spinlock_t *cache_lock;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
 
 	mutex_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int idx;
-		LIST_HEAD(invalid_list);
-
-		/*
-		 * Never scan more than sc->nr_to_scan VM instances.
-		 * Will not hit this condition practically since we do not try
-		 * to shrink more than one VM and it is very unlikely to see
-		 * !n_used_mmu_pages so many times.
-		 */
-		if (!nr_to_scan--)
+		if (first_kvm == kvm)
 			break;
-		/*
-		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
-		 * here. We may skip a VM instance errorneosly, but we do not
-		 * want to shrink a VM that only started to populate its MMU
-		 * anyway.
-		 */
-		if (!kvm->arch.n_used_mmu_pages &&
-		    !kvm_has_zapped_obsolete_pages(kvm))
-			continue;
+		if (!first_kvm)
+			first_kvm = kvm;
+		list_move_tail(&kvm->vm_list, &vm_list);
 
-		idx = srcu_read_lock(&kvm->srcu);
-		write_lock(&kvm->mmu_lock);
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			cache = &vcpu->arch.mmu_shadow_page_cache;
+			cache_lock = &vcpu->arch.mmu_shadow_page_cache_lock;
+			if (READ_ONCE(cache->nobjs)) {
+				spin_lock(cache_lock);
+				freed += kvm_mmu_empty_memory_cache(cache);
+				spin_unlock(cache_lock);
+			}
 
-		if (kvm_has_zapped_obsolete_pages(kvm)) {
-			kvm_mmu_commit_zap_page(kvm,
-			      &kvm->arch.zapped_obsolete_pages);
-			goto unlock;
 		}
 
-		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
-
-unlock:
-		write_unlock(&kvm->mmu_lock);
-		srcu_read_unlock(&kvm->srcu, idx);
-
-		/*
-		 * unfair on small ones
-		 * per-vm shrinkers cry out
-		 * sadness comes quickly
-		 */
-		list_move_tail(&kvm->vm_list, &vm_list);
-		break;
+		if (freed >= sc->nr_to_scan)
+			break;
 	}
 
+	if (freed)
+		percpu_counter_sub(&kvm_total_unused_mmu_pages, freed);
 	mutex_unlock(&kvm_lock);
+	percpu_counter_sync(&kvm_total_unused_mmu_pages);
 	return freed;
 }
 
 static unsigned long
 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+	return percpu_counter_sum_positive(&kvm_total_unused_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
@@ -6820,7 +6847,7 @@  int kvm_mmu_vendor_module_init(void)
 	if (!mmu_page_header_cache)
 		goto out;
 
-	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
+	if (percpu_counter_init(&kvm_total_unused_mmu_pages, 0, GFP_KERNEL))
 		goto out;
 
 	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
@@ -6830,7 +6857,7 @@  int kvm_mmu_vendor_module_init(void)
 	return 0;
 
 out_shrinker:
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
+	percpu_counter_destroy(&kvm_total_unused_mmu_pages);
 out:
 	mmu_destroy_caches();
 	return ret;
@@ -6847,7 +6874,7 @@  void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvm_mmu_vendor_module_exit(void)
 {
 	mmu_destroy_caches();
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
+	percpu_counter_destroy(&kvm_total_unused_mmu_pages);
 	unregister_shrinker(&mmu_shrinker);
 }
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ac00bfbf32f6..c2a342028b6a 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -325,4 +325,6 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+void *kvm_mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
+				    spinlock_t *cache_lock);
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 764f7c87286f..4974fa96deff 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -264,7 +264,8 @@  static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 	struct kvm_mmu_page *sp;
 
 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+	sp->spt = kvm_mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
+						&vcpu->arch.mmu_shadow_page_cache_lock);
 
 	return sp;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 01aad8b74162..efd9b38ea9a2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1362,6 +1362,7 @@  void kvm_flush_remote_tlbs(struct kvm *kvm);
 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
+int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..f2d762878b97 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -438,8 +438,10 @@  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 	return mc->nobjs;
 }
 
-void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
 {
+	int freed = mc->nobjs;
+
 	while (mc->nobjs) {
 		if (mc->kmem_cache)
 			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
@@ -447,8 +449,13 @@  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 			free_page((unsigned long)mc->objects[--mc->nobjs]);
 	}
 
-	kvfree(mc->objects);
+	return freed;
+}
 
+void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	kvm_mmu_empty_memory_cache(mc);
+	kvfree(mc->objects);
 	mc->objects = NULL;
 	mc->capacity = 0;
 }