
[v2,13/16] KVM: arm64: nv: Invalidate TLBs based on shadow S2 TTL-like information

Message ID 20240529145628.3272630-14-maz@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: nv: Shadow stage-2 page table handling

Commit Message

Marc Zyngier May 29, 2024, 2:56 p.m. UTC
In order to be able to make S2 TLB invalidations more performant on NV,
let's use a scheme derived from the FEAT_TTL extension.

If bits [56:55] in the leaf descriptor translating the address in the
corresponding shadow S2 are non-zero, they indicate a level which can
be used as an invalidation range. This allows further reduction of the
systematic over-invalidation that takes place otherwise.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/nested.c | 83 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)
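
For illustration, here is a minimal sketch of the scheme described above: the
shadow S2 leaf carries a level tag in its software bits [56:55], and that
level is combined with the translation granule to form a TTL-style hint
(TTL[3:2] = granule, TTL[1:0] = level). The mask name below is an assumption
made for the sketch; the series uses its own definition for those bits.

#include <linux/bitfield.h>

/* Assumed to cover the software bits [56:55] of the shadow descriptor. */
#define EXAMPLE_NV_GUEST_MAP_SZ	GENMASK_ULL(56, 55)

/* tg is one of the TLBI_TTL_TG_* granule encodings. */
static u8 example_ttl_hint(u64 desc, u8 tg)
{
	u8 level = FIELD_GET(EXAMPLE_NV_GUEST_MAP_SZ, desc);

	if (!level)	/* no level recorded, no usable hint */
		return 0;

	/* TTL[3:2] = translation granule, TTL[1:0] = level */
	return (tg << 2) | level;
}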

Comments

Oliver Upton June 3, 2024, 6:36 p.m. UTC | #1
On Wed, May 29, 2024 at 03:56:25PM +0100, Marc Zyngier wrote:
> In order to be able to make S2 TLB invalidations more performant on NV,
> let's use a scheme derived from the FEAT_TTL extension.
> 
> If bits [56:55] in the leaf descriptor translating the address in the
> corresponding shadow S2 are non-zero, they indicate a level which can
> be used as an invalidation range. This allows further reduction of the
> systematic over-invalidation that takes place otherwise.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/nested.c | 83 ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 82 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 8570b5bd0289..5ab5c43c571b 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -4,6 +4,7 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/bitfield.h>
>  #include <linux/kvm.h>
>  #include <linux/kvm_host.h>
>  
> @@ -421,12 +422,92 @@ static unsigned int ttl_to_size(u8 ttl)
>  	return max_size;
>  }
>  
> +/*
> + * Compute the equivalent of the TTL field by parsing the shadow PT.  The
> + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
> + * retrieved from the first entry carrying the level as a tag.
> + */
> +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
> +{

Can you add a lockdep assertion that the MMU lock is held for write
here? At least for me this is far enough away from the 'real' page table
walk that it wasn't clear what locks were held at this point.
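
A sketch of the kind of assertion being requested, assuming the callers do
hold the MMU lock for write (where exactly it lands is up to the author):

	lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);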

> +	u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
> +	kvm_pte_t pte;
> +	u8 ttl, level;
> +
> +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> +	case VTCR_EL2_TG0_4K:
> +		ttl = (TLBI_TTL_TG_4K << 2);
> +		break;
> +	case VTCR_EL2_TG0_16K:
> +		ttl = (TLBI_TTL_TG_16K << 2);
> +		break;
> +	case VTCR_EL2_TG0_64K:
> +	default:	    /* IMPDEF: treat any other value as 64k */
> +		ttl = (TLBI_TTL_TG_64K << 2);
> +		break;
> +	}
> +
> +	tmp = addr;
> +
> +again:
> +	/* Iteratively compute the block sizes for a particular granule size */
> +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> +	case VTCR_EL2_TG0_4K:
> +		if	(sz < SZ_4K)	sz = SZ_4K;
> +		else if (sz < SZ_2M)	sz = SZ_2M;
> +		else if (sz < SZ_1G)	sz = SZ_1G;
> +		else			sz = 0;
> +		break;
> +	case VTCR_EL2_TG0_16K:
> +		if	(sz < SZ_16K)	sz = SZ_16K;
> +		else if (sz < SZ_32M)	sz = SZ_32M;
> +		else			sz = 0;
> +		break;
> +	case VTCR_EL2_TG0_64K:
> +	default:	    /* IMPDEF: treat any other value as 64k */
> +		if	(sz < SZ_64K)	sz = SZ_64K;
> +		else if (sz < SZ_512M)	sz = SZ_512M;
> +		else			sz = 0;
> +		break;
> +	}
> +
> +	if (sz == 0)
> +		return 0;
> +
> +	tmp &= ~(sz - 1);
> +	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
> +		goto again;

Assuming we're virtualizing a larger TG than what's in use at L0 this
may not actually find a valid leaf that exists within the span of a
single virtual TLB entry.

For example, if we're using 4K at L0 and 16K at L1, we could have:

	[ ----- valid 16K entry ------- ]

mapped as:

	[ ----- | ----- | valid | ----- ]

in the shadow S2. kvm_pgtable_get_leaf() will always return the first
splintered page, which could be invalid.

What I'm getting at is: should this use a bespoke table walker that
scans for a valid TTL in the range of [addr, addr + sz)? It may make
sense to back off a bit more aggressively and switch to a conservative,
unscoped TLBI to avoid visiting too many PTEs.
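
A rough sketch of such a range-scanning walk, for illustration only: the
helper's extra parameters and the probe cap are assumptions, and nothing like
this is implemented in the series as posted.

/*
 * Probe the shadow S2 across the span of one virtual TLB entry for any
 * valid leaf carrying a level annotation, giving up after a handful of
 * probes so we do not visit too many PTEs.
 */
static u8 example_scan_for_ttl(struct kvm_s2_mmu *mmu, u64 addr, u64 sz,
			       u64 host_granule, u8 ttl_tg)
{
	u64 start = addr & ~(sz - 1), end = start + sz, ipa;
	int probes = 16;	/* arbitrary look-around budget */

	for (ipa = start; ipa < end && probes--; ipa += host_granule) {
		kvm_pte_t pte;
		u8 level;

		if (kvm_pgtable_get_leaf(mmu->pgt, ipa, &pte, NULL))
			continue;
		if (!(pte & PTE_VALID))
			continue;

		level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
		if (level)
			return ttl_tg | level;
	}

	return 0;	/* caller falls back to an unscoped invalidation */
}
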
Marc Zyngier June 4, 2024, 7:59 a.m. UTC | #2
On Mon, 03 Jun 2024 19:36:57 +0100,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> On Wed, May 29, 2024 at 03:56:25PM +0100, Marc Zyngier wrote:

[...]

> > +/*
> > + * Compute the equivalent of the TTL field by parsing the shadow PT.  The
> > + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
> > + * retrieved from the first entry carrying the level as a tag.
> > + */
> > +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
> > +{
> 
> Can you add a lockdep assertion that the MMU lock is held for write
> here? At least for me this is far enough away from the 'real' page table
> walk that it wasn't clear what locks were held at this point.

Sure thing.

> 
> > +	u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
> > +	kvm_pte_t pte;
> > +	u8 ttl, level;
> > +
> > +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> > +	case VTCR_EL2_TG0_4K:
> > +		ttl = (TLBI_TTL_TG_4K << 2);
> > +		break;
> > +	case VTCR_EL2_TG0_16K:
> > +		ttl = (TLBI_TTL_TG_16K << 2);
> > +		break;
> > +	case VTCR_EL2_TG0_64K:
> > +	default:	    /* IMPDEF: treat any other value as 64k */
> > +		ttl = (TLBI_TTL_TG_64K << 2);
> > +		break;
> > +	}
> > +
> > +	tmp = addr;
> > +
> > +again:
> > +	/* Iteratively compute the block sizes for a particular granule size */
> > +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> > +	case VTCR_EL2_TG0_4K:
> > +		if	(sz < SZ_4K)	sz = SZ_4K;
> > +		else if (sz < SZ_2M)	sz = SZ_2M;
> > +		else if (sz < SZ_1G)	sz = SZ_1G;
> > +		else			sz = 0;
> > +		break;
> > +	case VTCR_EL2_TG0_16K:
> > +		if	(sz < SZ_16K)	sz = SZ_16K;
> > +		else if (sz < SZ_32M)	sz = SZ_32M;
> > +		else			sz = 0;
> > +		break;
> > +	case VTCR_EL2_TG0_64K:
> > +	default:	    /* IMPDEF: treat any other value as 64k */
> > +		if	(sz < SZ_64K)	sz = SZ_64K;
> > +		else if (sz < SZ_512M)	sz = SZ_512M;
> > +		else			sz = 0;
> > +		break;
> > +	}
> > +
> > +	if (sz == 0)
> > +		return 0;
> > +
> > +	tmp &= ~(sz - 1);
> > +	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
> > +		goto again;
> 
> Assuming we're virtualizing a larger TG than what's in use at L0 this
> may not actually find a valid leaf that exists within the span of a
> single virtual TLB entry.
> 
> For example, if we're using 4K at L0 and 16K at L1, we could have:
> 
> 	[ ----- valid 16K entry ------- ]
> 
> mapped as:
> 
> 	[ ----- | ----- | valid | ----- ]
> 
> in the shadow S2. kvm_pgtable_get_leaf() will always return the first
> splintered page, which could be invalid.
> 
> What I'm getting at is: should this use a bespoke table walker that
> scans for a valid TTL in the range of [addr, addr + sz)? It may make
> sense to back off a bit more aggressively and switch to a conservative,
> unscoped TLBI to avoid visiting too many PTEs.

I had something along those lines at some point (circa 2019), and
quickly dropped it as it had a horrible "look-around" behaviour,
especially if the L1 S2 granule size is much larger than L0's (64k vs
4k). As you pointed out, it needs heuristics to limit the look-around,
which I don't find very satisfying.

Which is why the current code limits the search to be in depth only,
hoping for the head descriptor to be valid, and quickly backs off to
do a level-0 invalidation.

My preferred option would be to allow the use of non-valid entries to
cache the level (always using the first L0 entry that would map the L1
descriptor), but this opens another can of worms: you could end up
with page table pages containing only invalid descriptors, except for
the presence of a level annotation, which screws the refcounting. I'd
very much like to see this rather than the look-around option.

Now, it is important to consider how useful this is. I expect modern
hypervisors to use either TTL-hinted (which we emulate even if the HW
doesn't support it) or Range-based invalidation in the vast majority
of the cases, so this would only help SW that hasn't got on with the
program.

Thoughts?

	M.
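
As an aside on the TTL-hinted invalidations mentioned above: the operand
layout the emulation relies on (IPA[47:12] in bits [35:0], TTL in bits
[47:44], as extracted in compute_tlb_inval_range() in the patch below) means
an L1 hypervisor can build a hinted TLBI IPAS2E1IS operand along these lines
(illustrative helper, not code from the series):

static inline u64 example_tlbi_ipa_ttl(u64 ipa, u8 ttl)
{
	/* Bits [35:0]: IPA[47:12]; bits [47:44]: TTL (TG + level). */
	return ((ipa >> 12) & GENMASK_ULL(35, 0)) |
	       FIELD_PREP(GENMASK_ULL(47, 44), ttl);
}
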
Oliver Upton June 4, 2024, 5:49 p.m. UTC | #3
On Tue, Jun 04, 2024 at 08:59:49AM +0100, Marc Zyngier wrote:
> On Mon, 03 Jun 2024 19:36:57 +0100,
> Oliver Upton <oliver.upton@linux.dev> wrote:
> > 
> > On Wed, May 29, 2024 at 03:56:25PM +0100, Marc Zyngier wrote:
> 
> [...]
> 
> > > +/*
> > > + * Compute the equivalent of the TTL field by parsing the shadow PT.  The
> > > + * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
> > > + * retrieved from the first entry carrying the level as a tag.
> > > + */
> > > +static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
> > > +{
> > 
> > Can you add a lockdep assertion that the MMU lock is held for write
> > here? At least for me this is far enough away from the 'real' page table
> > walk that it wasn't clear what locks were held at this point.
> 
> Sure thing.
> 
> > 
> > > +	u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
> > > +	kvm_pte_t pte;
> > > +	u8 ttl, level;
> > > +
> > > +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> > > +	case VTCR_EL2_TG0_4K:
> > > +		ttl = (TLBI_TTL_TG_4K << 2);
> > > +		break;
> > > +	case VTCR_EL2_TG0_16K:
> > > +		ttl = (TLBI_TTL_TG_16K << 2);
> > > +		break;
> > > +	case VTCR_EL2_TG0_64K:
> > > +	default:	    /* IMPDEF: treat any other value as 64k */
> > > +		ttl = (TLBI_TTL_TG_64K << 2);
> > > +		break;
> > > +	}
> > > +
> > > +	tmp = addr;
> > > +
> > > +again:
> > > +	/* Iteratively compute the block sizes for a particular granule size */
> > > +	switch (vtcr & VTCR_EL2_TG0_MASK) {
> > > +	case VTCR_EL2_TG0_4K:
> > > +		if	(sz < SZ_4K)	sz = SZ_4K;
> > > +		else if (sz < SZ_2M)	sz = SZ_2M;
> > > +		else if (sz < SZ_1G)	sz = SZ_1G;
> > > +		else			sz = 0;
> > > +		break;
> > > +	case VTCR_EL2_TG0_16K:
> > > +		if	(sz < SZ_16K)	sz = SZ_16K;
> > > +		else if (sz < SZ_32M)	sz = SZ_32M;
> > > +		else			sz = 0;
> > > +		break;
> > > +	case VTCR_EL2_TG0_64K:
> > > +	default:	    /* IMPDEF: treat any other value as 64k */
> > > +		if	(sz < SZ_64K)	sz = SZ_64K;
> > > +		else if (sz < SZ_512M)	sz = SZ_512M;
> > > +		else			sz = 0;
> > > +		break;
> > > +	}
> > > +
> > > +	if (sz == 0)
> > > +		return 0;
> > > +
> > > +	tmp &= ~(sz - 1);
> > > +	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
> > > +		goto again;
> > 
> > Assuming we're virtualizing a larger TG than what's in use at L0 this
> > may not actually find a valid leaf that exists within the span of a
> > single virtual TLB entry.
> > 
> > For example, if we're using 4K at L0 and 16K at L1, we could have:
> > 
> > 	[ ----- valid 16K entry ------- ]
> > 
> > mapped as:
> > 
> > 	[ ----- | ----- | valid | ----- ]
> > 
> > in the shadow S2. kvm_pgtable_get_leaf() will always return the first
> > splintered page, which could be invalid.
> > 
> > What I'm getting at is: should this use a bespoke table walker that
> > scans for a valid TTL in the range of [addr, addr + sz)? It may make
> > sense to back off a bit more aggressively and switch to a conservative,
> > unscoped TLBI to avoid visiting too many PTEs.
> 
> I had something along those lines at some point (circa 2019), and
> quickly dropped it as it had a horrible "look-around" behaviour,
> especially if the L1 S2 granule size is much larger than L0's (64k vs
> 4k). As you pointed out, it needs heuristics to limit the look-around,
> which I don't find very satisfying.
> 
> Which is why the current code limits the search to be in depth only,
> hoping for the head descriptor to be valid, and quickly backs off to
> do a level-0 invalidation.
> 
> My preferred option would be to allow the use of non-valid entries to
> cache the level (always using the first L0 entry that would map the L1
> descriptor), but this opens another can of worms: you could end up
> with page table pages containing only invalid descriptors, except for
> the presence of a level annotation, which screws the refcounting. I'd
> very much like to see this rather than the look-around option.

Yeah, this seems to be a better solution, albeit more complex on the map
path. Having said that, such an improvement can obviously be layered on
top after the fact, so I don't view this as a requirement for getting
these patches merged.

> Now, it is important to consider how useful this is. I expect modern
> hypervisors to use either TTL-hinted (which we emulate even if the HW
> doesn't support it) or Range-based invalidation in the vast majority
> of the cases, so this would only help SW that hasn't got on with the
> program.

"enterprise edition" hypervisors could be an annoyance, but you do have
a good point that there are tools available to the L1 to obviate TTL
caching as a requirement for good perf.

Patch

diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 8570b5bd0289..5ab5c43c571b 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -4,6 +4,7 @@ 
  * Author: Jintack Lim <jintack.lim@linaro.org>
  */
 
+#include <linux/bitfield.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 
@@ -421,12 +422,92 @@  static unsigned int ttl_to_size(u8 ttl)
 	return max_size;
 }
 
+/*
+ * Compute the equivalent of the TTL field by parsing the shadow PT.  The
+ * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
+ * retrieved from the first entry carrying the level as a tag.
+ */
+static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
+{
+	u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
+	kvm_pte_t pte;
+	u8 ttl, level;
+
+	switch (vtcr & VTCR_EL2_TG0_MASK) {
+	case VTCR_EL2_TG0_4K:
+		ttl = (TLBI_TTL_TG_4K << 2);
+		break;
+	case VTCR_EL2_TG0_16K:
+		ttl = (TLBI_TTL_TG_16K << 2);
+		break;
+	case VTCR_EL2_TG0_64K:
+	default:	    /* IMPDEF: treat any other value as 64k */
+		ttl = (TLBI_TTL_TG_64K << 2);
+		break;
+	}
+
+	tmp = addr;
+
+again:
+	/* Iteratively compute the block sizes for a particular granule size */
+	switch (vtcr & VTCR_EL2_TG0_MASK) {
+	case VTCR_EL2_TG0_4K:
+		if	(sz < SZ_4K)	sz = SZ_4K;
+		else if (sz < SZ_2M)	sz = SZ_2M;
+		else if (sz < SZ_1G)	sz = SZ_1G;
+		else			sz = 0;
+		break;
+	case VTCR_EL2_TG0_16K:
+		if	(sz < SZ_16K)	sz = SZ_16K;
+		else if (sz < SZ_32M)	sz = SZ_32M;
+		else			sz = 0;
+		break;
+	case VTCR_EL2_TG0_64K:
+	default:	    /* IMPDEF: treat any other value as 64k */
+		if	(sz < SZ_64K)	sz = SZ_64K;
+		else if (sz < SZ_512M)	sz = SZ_512M;
+		else			sz = 0;
+		break;
+	}
+
+	if (sz == 0)
+		return 0;
+
+	tmp &= ~(sz - 1);
+	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
+		goto again;
+	if (!(pte & PTE_VALID))
+		goto again;
+	level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
+	if (!level)
+		goto again;
+
+	ttl |= level;
+
+	/*
+	 * We have now found some level information in the shadow S2. Check
+	 * that the resulting range actually includes the original IPA.
+	 */
+	sz = ttl_to_size(ttl);
+	if (addr < (tmp + sz))
+		return ttl;
+
+	return 0;
+}
+
 unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
 {
+	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
 	unsigned long max_size;
 	u8 ttl;
 
-	ttl = FIELD_GET(GENMASK_ULL(47, 44), val);
+	ttl = FIELD_GET(TLBI_TTL_MASK, val);
+
+	if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
+		/* No TTL, check the shadow S2 for a hint */
+		u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
+		ttl = get_guest_mapping_ttl(mmu, addr);
+	}
 
 	max_size = ttl_to_size(ttl);