diff mbox series

[v8,04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn

Message ID 20241105184333.2305744-5-jthoughton@google.com (mailing list archive)
State New
Headers show
Series KVM: x86/mmu: Age sptes locklessly | expand

Commit Message

James Houghton Nov. 5, 2024, 6:43 p.m. UTC
Walk the TDP MMU in an RCU read-side critical section without holding
mmu_lock when harvesting and potentially updating age information on
sptes. This requires a way to do RCU-safe walking of the tdp_mmu_roots;
do this with a new macro. The PTE modifications are now done atomically,
and kvm_tdp_mmu_spte_need_atomic_write() has been updated to account for
the fact that kvm_age_gfn can now locklessly update the accessed bit and
the W/R/X bits).

If the cmpxchg for marking the spte for access tracking fails, leave it
as is and treat it as if it were young, as if the spte is being actively
modified, it is most likely young.

Harvesting age information from the shadow MMU is still done while
holding the MMU write lock.

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
 arch/x86/kvm/mmu/tdp_iter.h     | 12 ++++++------
 arch/x86/kvm/mmu/tdp_mmu.c      | 23 ++++++++++++++++-------
 5 files changed, 32 insertions(+), 15 deletions(-)

Comments

kernel test robot Nov. 6, 2024, 8:22 a.m. UTC | #1
Hi James,

kernel test robot noticed the following build warnings:

[auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]

url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
base:   a27e0515592ec9ca28e0d027f42568c47b314784
patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/

All warnings (new ones prefixed by >>):

   arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
>> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
    1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
         |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +1189 arch/x86/kvm/mmu/tdp_mmu.c

  1166	
  1167	/*
  1168	 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
  1169	 * if any of the GFNs in the range have been accessed.
  1170	 *
  1171	 * No need to mark the corresponding PFN as accessed as this call is coming
  1172	 * from the clear_young() or clear_flush_young() notifier, which uses the
  1173	 * return value to determine if the page has been accessed.
  1174	 */
  1175	static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
  1176	{
  1177		u64 new_spte;
  1178	
  1179		if (spte_ad_enabled(iter->old_spte)) {
  1180			iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
  1181							shadow_accessed_mask);
  1182			new_spte = iter->old_spte & ~shadow_accessed_mask;
  1183		} else {
  1184			new_spte = mark_spte_for_access_track(iter->old_spte);
  1185			/*
  1186			 * It is safe for the following cmpxchg to fail. Leave the
  1187			 * Accessed bit set, as the spte is most likely young anyway.
  1188			 */
> 1189			(void)__tdp_mmu_set_spte_atomic(iter, new_spte);
  1190		}
  1191	
  1192		trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
  1193					       iter->old_spte, new_spte);
  1194	}
  1195
James Houghton Nov. 8, 2024, 3 a.m. UTC | #2
On Wed, Nov 6, 2024 at 3:22 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi James,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
> base:   a27e0515592ec9ca28e0d027f42568c47b314784
> patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
> patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
> config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
> compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/
>
> All warnings (new ones prefixed by >>):
>
>    arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
> >> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
>     1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
>          |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>

Well, I saw this compiler warning in my latest rebase and thought the
`(void)` would fix it. I guess the next best way to fix it would be to
assign to an `int __maybe_unused`. I'll do for a v9, or Sean if you're
going to take the series (maybe? :)), go ahead and apply whatever fix
you like.
Sean Christopherson Nov. 8, 2024, 10:45 p.m. UTC | #3
On Thu, Nov 07, 2024, James Houghton wrote:
> On Wed, Nov 6, 2024 at 3:22 AM kernel test robot <lkp@intel.com> wrote:
> >
> > Hi James,
> >
> > kernel test robot noticed the following build warnings:
> >
> > [auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]
> >
> > url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
> > base:   a27e0515592ec9ca28e0d027f42568c47b314784
> > patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
> > patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
> > config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
> > compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/
> >
> > All warnings (new ones prefixed by >>):
> >
> >    arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
> > >> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
> >     1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
> >          |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> 
> Well, I saw this compiler warning in my latest rebase and thought the
> `(void)` would fix it. I guess the next best way to fix it would be to
> assign to an `int __maybe_unused`. I'll do for a v9, or Sean if you're
> going to take the series (maybe? :)), go ahead and apply whatever fix
> you like.

Heh, actually, the compiler is correct.  Ignoring the return value is a bug.
KVM should instead return immediately, as falling through to the tracepoint will
log bogus information.  E.g. will show a !PRESENT SPTE, instead of whatever the
current SPTE actually is (iter->old_spte will have been updating to the current
value of the SPTE).

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f5b4f1060fff..cc8ae998b7c8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1186,7 +1186,8 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
                 * It is safe for the following cmpxchg to fail. Leave the
                 * Accessed bit set, as the spte is most likely young anyway.
                 */
-               (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
+               if (__tdp_mmu_set_spte_atomic(iter, new_spte))
+                       return;
        }
 
        trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
James Houghton Nov. 11, 2024, 2:45 p.m. UTC | #4
On Fri, Nov 8, 2024 at 5:45 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Nov 07, 2024, James Houghton wrote:
> > On Wed, Nov 6, 2024 at 3:22 AM kernel test robot <lkp@intel.com> wrote:
> > >
> > > Hi James,
> > >
> > > kernel test robot noticed the following build warnings:
> > >
> > > [auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]
> > >
> > > url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
> > > base:   a27e0515592ec9ca28e0d027f42568c47b314784
> > > patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
> > > patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
> > > config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
> > > compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> > > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)
> > >
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <lkp@intel.com>
> > > | Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/
> > >
> > > All warnings (new ones prefixed by >>):
> > >
> > >    arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
> > > >> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
> > >     1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
> > >          |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > >
> >
> > Well, I saw this compiler warning in my latest rebase and thought the
> > `(void)` would fix it. I guess the next best way to fix it would be to
> > assign to an `int __maybe_unused`. I'll do for a v9, or Sean if you're
> > going to take the series (maybe? :)), go ahead and apply whatever fix
> > you like.
>
> Heh, actually, the compiler is correct.  Ignoring the return value is a bug.
> KVM should instead return immediately, as falling through to the tracepoint will
> log bogus information.  E.g. will show a !PRESENT SPTE, instead of whatever the
> current SPTE actually is (iter->old_spte will have been updating to the current
> value of the SPTE).
>
>         trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
>                                        iter->old_spte, new_spte);
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f5b4f1060fff..cc8ae998b7c8 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1186,7 +1186,8 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
>                  * It is safe for the following cmpxchg to fail. Leave the
>                  * Accessed bit set, as the spte is most likely young anyway.
>                  */
> -               (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
> +               if (__tdp_mmu_set_spte_atomic(iter, new_spte))
> +                       return;
>         }
>
>         trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
>

Oh yes, you're right. Thanks Sean! The diff LGTM.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 70c7ed0ef184..84ee08078686 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1455,6 +1455,7 @@  struct kvm_arch {
 	 * tdp_mmu_page set.
 	 *
 	 * For reads, this list is protected by:
+	 *	RCU alone or
 	 *	the MMU lock in read mode + RCU or
 	 *	the MMU lock in write mode
 	 *
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1ed1e4f5d51c..97f747d60fe9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -23,6 +23,7 @@  config KVM_X86
 	select KVM_COMMON
 	select KVM_GENERIC_MMU_NOTIFIER
 	select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+	select KVM_MMU_NOTIFIER_YOUNG_LOCKLESS
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_PFNCACHE
 	select HAVE_KVM_DIRTY_RING_TSO
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 443845bb2e01..26797ccd34d8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1586,8 +1586,11 @@  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (kvm_memslots_have_rmaps(kvm))
+	if (kvm_memslots_have_rmaps(kvm)) {
+		write_lock(&kvm->mmu_lock);
 		young = kvm_rmap_age_gfn_range(kvm, range, false);
+		write_unlock(&kvm->mmu_lock);
+	}
 
 	if (tdp_mmu_enabled)
 		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1599,8 +1602,11 @@  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (kvm_memslots_have_rmaps(kvm))
+	if (kvm_memslots_have_rmaps(kvm)) {
+		write_lock(&kvm->mmu_lock);
 		young = kvm_rmap_age_gfn_range(kvm, range, true);
+		write_unlock(&kvm->mmu_lock);
+	}
 
 	if (tdp_mmu_enabled)
 		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index a24fca3f9e7f..f26d0b60d2dd 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -39,10 +39,11 @@  static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 }
 
 /*
- * SPTEs must be modified atomically if they are shadow-present, leaf
- * SPTEs, and have volatile bits, i.e. has bits that can be set outside
- * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
- * handler, and Accessed and Dirty bits can be set by the CPU.
+ * SPTEs must be modified atomically if they have bits that can be set outside
+ * of the mmu_lock. This can happen for any shadow-present leaf SPTEs, as the
+ * Writable bit can be set by KVM's fast page fault handler, the Accessed and
+ * Dirty bits can be set by the CPU, and the Accessed and W/R/X bits can be
+ * cleared by age_gfn_range().
  *
  * Note, non-leaf SPTEs do have Accessed bits and those bits are
  * technically volatile, but KVM doesn't consume the Accessed bit of
@@ -53,8 +54,7 @@  static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
 {
 	return is_shadow_present_pte(old_spte) &&
-	       is_last_spte(old_spte, level) &&
-	       spte_has_volatile_bits(old_spte);
+	       is_last_spte(old_spte, level);
 }
 
 static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 4508d868f1cd..f5b4f1060fff 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -178,6 +178,15 @@  static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 		     ((_only_valid) && (_root)->role.invalid))) {		\
 		} else
 
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ */
+#define for_each_valid_tdp_mmu_root_rcu(_kvm, _root, _as_id)			\
+	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
+		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
+		    (_root)->role.invalid) {					\
+		} else
+
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
 	__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
 
@@ -1168,16 +1177,16 @@  static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
 	u64 new_spte;
 
 	if (spte_ad_enabled(iter->old_spte)) {
-		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
-							 iter->old_spte,
-							 shadow_accessed_mask,
-							 iter->level);
+		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+						shadow_accessed_mask);
 		new_spte = iter->old_spte & ~shadow_accessed_mask;
 	} else {
 		new_spte = mark_spte_for_access_track(iter->old_spte);
-		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
-							iter->old_spte, new_spte,
-							iter->level);
+		/*
+		 * It is safe for the following cmpxchg to fail. Leave the
+		 * Accessed bit set, as the spte is most likely young anyway.
+		 */
+		(void)__tdp_mmu_set_spte_atomic(iter, new_spte);
 	}
 
 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,