diff mbox series

[v7,12/12] KVM: arm64: Use TLBI range-based instructions for unmap

Message ID 20230722022251.3446223-13-rananta@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Add support for FEAT_TLBIRANGE | expand

Commit Message

Raghavendra Rao Ananta July 22, 2023, 2:22 a.m. UTC
The current implementation of the stage-2 unmap walker traverses
the given range and, as a part of break-before-make, performs
TLB invalidations with a DSB for every PTE. A multitude of this
combination could cause a performance bottleneck on some systems.

Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
invalidations until the entire walk is finished, and then
use range-based instructions to invalidate the TLBs in one go.
Condition deferred TLB invalidation on the system supporting FWB,
as the optimization is entirely pointless when the unmap walker
needs to perform CMOs.

Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
now serves the stage-2 unmap walker specifically, rather than
acting generic.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
---
 arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
 1 file changed, 58 insertions(+), 9 deletions(-)

Comments

Shaoqin Huang July 24, 2023, 9:34 a.m. UTC | #1
Hi Raghavendra,

On 7/22/23 10:22, Raghavendra Rao Ananta wrote:
> The current implementation of the stage-2 unmap walker traverses
> the given range and, as a part of break-before-make, performs
> TLB invalidations with a DSB for every PTE. A multitude of this
> combination could cause a performance bottleneck on some systems.
> 
> Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> invalidations until the entire walk is finished, and then
> use range-based instructions to invalidate the TLBs in one go.
> Condition deferred TLB invalidation on the system supporting FWB,
> as the optimization is entirely pointless when the unmap walker
> needs to perform CMOs.
> 
> Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> now serves the stage-2 unmap walker specifically, rather than
> acting generic.
> 
> Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> ---
>   arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
>   1 file changed, 58 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 5ef098af1736..cf88933a2ea0 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
>   	smp_store_release(ctx->ptep, new);
>   }
>   
> -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> -			   struct kvm_pgtable_mm_ops *mm_ops)
> +struct stage2_unmap_data {
> +	struct kvm_pgtable *pgt;
> +	bool defer_tlb_flush_init;
> +};
> +
> +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> +{
> +	/*
> +	 * If FEAT_TLBIRANGE is implemented, defer the individual
> +	 * TLB invalidations until the entire walk is finished, and
> +	 * then use the range-based TLBI instructions to do the
> +	 * invalidations. Condition deferred TLB invalidation on the
> +	 * system supporting FWB, as the optimization is entirely
> +	 * pointless when the unmap walker needs to perform CMOs.
> +	 */
> +	return system_supports_tlb_range() && stage2_has_fwb(pgt);
> +}
> +
> +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> +{
> +	bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> +
> +	/*
> +	 * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> +	 * patching and the TLBIs' operations behavior depend on this,
> +	 * track if there's any change in the state during the unmap sequence.
> +	 */
> +	WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> +	return defer_tlb_flush;
> +}
> +
> +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
> +				struct kvm_s2_mmu *mmu,
> +				struct kvm_pgtable_mm_ops *mm_ops)
>   {
> +	struct stage2_unmap_data *unmap_data = ctx->arg;
> +
>   	/*
> -	 * Clear the existing PTE, and perform break-before-make with
> -	 * TLB maintenance if it was valid.
> +	 * Clear the existing PTE, and perform break-before-make if it was
> +	 * valid. Depending on the system support, the TLB maintenance for
> +	 * the same can be deferred until the entire unmap is completed.
>   	 */
>   	if (kvm_pte_valid(ctx->old)) {
>   		kvm_clear_pte(ctx->ptep);
> -		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
> +
> +		if (!stage2_unmap_defer_tlb_flush(unmap_data))
Why not directly check (unmap_data->defer_tlb_flush_init) here?

> +			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
> +					ctx->addr, ctx->level);
Small indent hint. The ctx->addr can align with __kvm_tlb_flush_vmid_ipa.

Thanks,
Shaoqin
>   	}
>   
>   	mm_ops->put_page(ctx->ptep);
> @@ -1070,7 +1108,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
>   static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>   			       enum kvm_pgtable_walk_flags visit)
>   {
> -	struct kvm_pgtable *pgt = ctx->arg;
> +	struct stage2_unmap_data *unmap_data = ctx->arg;
> +	struct kvm_pgtable *pgt = unmap_data->pgt;
>   	struct kvm_s2_mmu *mmu = pgt->mmu;
>   	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
>   	kvm_pte_t *childp = NULL;
> @@ -1098,7 +1137,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>   	 * block entry and rely on the remaining portions being faulted
>   	 * back lazily.
>   	 */
> -	stage2_put_pte(ctx, mmu, mm_ops);
> +	stage2_unmap_put_pte(ctx, mmu, mm_ops);
>   
>   	if (need_flush && mm_ops->dcache_clean_inval_poc)
>   		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
> @@ -1112,13 +1151,23 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>   
>   int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
>   {
> +	int ret;
> +	struct stage2_unmap_data unmap_data = {
> +		.pgt = pgt,
> +		.defer_tlb_flush_init = __stage2_unmap_defer_tlb_flush(pgt),
> +	};
>   	struct kvm_pgtable_walker walker = {
>   		.cb	= stage2_unmap_walker,
> -		.arg	= pgt,
> +		.arg	= &unmap_data,
>   		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
>   	};
>   
> -	return kvm_pgtable_walk(pgt, addr, size, &walker);
> +	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
> +	if (stage2_unmap_defer_tlb_flush(&unmap_data))
> +		/* Perform the deferred TLB invalidations */
> +		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
> +
> +	return ret;
>   }
>   
>   struct stage2_attr_data {
Raghavendra Rao Ananta July 24, 2023, 4:47 p.m. UTC | #2
On Mon, Jul 24, 2023 at 2:35 AM Shaoqin Huang <shahuang@redhat.com> wrote:
>
> Hi Raghavendra,
>
> On 7/22/23 10:22, Raghavendra Rao Ananta wrote:
> > The current implementation of the stage-2 unmap walker traverses
> > the given range and, as a part of break-before-make, performs
> > TLB invalidations with a DSB for every PTE. A multitude of this
> > combination could cause a performance bottleneck on some systems.
> >
> > Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> > invalidations until the entire walk is finished, and then
> > use range-based instructions to invalidate the TLBs in one go.
> > Condition deferred TLB invalidation on the system supporting FWB,
> > as the optimization is entirely pointless when the unmap walker
> > needs to perform CMOs.
> >
> > Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> > now serves the stage-2 unmap walker specifically, rather than
> > acting generic.
> >
> > Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> > ---
> >   arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
> >   1 file changed, 58 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 5ef098af1736..cf88933a2ea0 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
> >       smp_store_release(ctx->ptep, new);
> >   }
> >
> > -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> > -                        struct kvm_pgtable_mm_ops *mm_ops)
> > +struct stage2_unmap_data {
> > +     struct kvm_pgtable *pgt;
> > +     bool defer_tlb_flush_init;
> > +};
> > +
> > +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> > +{
> > +     /*
> > +      * If FEAT_TLBIRANGE is implemented, defer the individual
> > +      * TLB invalidations until the entire walk is finished, and
> > +      * then use the range-based TLBI instructions to do the
> > +      * invalidations. Condition deferred TLB invalidation on the
> > +      * system supporting FWB, as the optimization is entirely
> > +      * pointless when the unmap walker needs to perform CMOs.
> > +      */
> > +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
> > +}
> > +
> > +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> > +{
> > +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> > +
> > +     /*
> > +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> > +      * patching and the TLBIs' operations behavior depend on this,
> > +      * track if there's any change in the state during the unmap sequence.
> > +      */
> > +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> > +     return defer_tlb_flush;
> > +}
> > +
> > +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
> > +                             struct kvm_s2_mmu *mmu,
> > +                             struct kvm_pgtable_mm_ops *mm_ops)
> >   {
> > +     struct stage2_unmap_data *unmap_data = ctx->arg;
> > +
> >       /*
> > -      * Clear the existing PTE, and perform break-before-make with
> > -      * TLB maintenance if it was valid.
> > +      * Clear the existing PTE, and perform break-before-make if it was
> > +      * valid. Depending on the system support, the TLB maintenance for
> > +      * the same can be deferred until the entire unmap is completed.
> >        */
> >       if (kvm_pte_valid(ctx->old)) {
> >               kvm_clear_pte(ctx->ptep);
> > -             kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
> > +
> > +             if (!stage2_unmap_defer_tlb_flush(unmap_data))
> Why not directly check (unmap_data->defer_tlb_flush_init) here?
>
(Re-sending the reply as the previous one was formatted as HTML and
was blocked by many lists)

No particular reason per se; I was just keeping the logic that
determines whether we need to defer the flush separate from the
WARN_ON() part.
Is there any advantage to checking directly in stage2_unmap_put_pte() that I
missed, or is this purely for readability?

> > +                     kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
> > +                                     ctx->addr, ctx->level);
> Small indent hint. The ctx->addr can align with __kvm_tlb_flush_vmid_ipa.
>
Ah, yes. I'll adjust this if I send out a v8.

Thank you.
Raghavendra
> Thanks,
> Shaoqin
> >       }
> >
> >       mm_ops->put_page(ctx->ptep);
> > @@ -1070,7 +1108,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >   static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >                              enum kvm_pgtable_walk_flags visit)
> >   {
> > -     struct kvm_pgtable *pgt = ctx->arg;
> > +     struct stage2_unmap_data *unmap_data = ctx->arg;
> > +     struct kvm_pgtable *pgt = unmap_data->pgt;
> >       struct kvm_s2_mmu *mmu = pgt->mmu;
> >       struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> >       kvm_pte_t *childp = NULL;
> > @@ -1098,7 +1137,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >        * block entry and rely on the remaining portions being faulted
> >        * back lazily.
> >        */
> > -     stage2_put_pte(ctx, mmu, mm_ops);
> > +     stage2_unmap_put_pte(ctx, mmu, mm_ops);
> >
> >       if (need_flush && mm_ops->dcache_clean_inval_poc)
> >               mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
> > @@ -1112,13 +1151,23 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >
> >   int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
> >   {
> > +     int ret;
> > +     struct stage2_unmap_data unmap_data = {
> > +             .pgt = pgt,
> > +             .defer_tlb_flush_init = __stage2_unmap_defer_tlb_flush(pgt),
> > +     };
> >       struct kvm_pgtable_walker walker = {
> >               .cb     = stage2_unmap_walker,
> > -             .arg    = pgt,
> > +             .arg    = &unmap_data,
> >               .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
> >       };
> >
> > -     return kvm_pgtable_walk(pgt, addr, size, &walker);
> > +     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
> > +     if (stage2_unmap_defer_tlb_flush(&unmap_data))
> > +             /* Perform the deferred TLB invalidations */
> > +             kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
> > +
> > +     return ret;
> >   }
> >
> >   struct stage2_attr_data {
>
> --
> Shaoqin
>
Shaoqin Huang July 25, 2023, 2:32 a.m. UTC | #3
On 7/25/23 00:47, Raghavendra Rao Ananta wrote:
> On Mon, Jul 24, 2023 at 2:35 AM Shaoqin Huang <shahuang@redhat.com> wrote:
>>
>> Hi Raghavendra,
>>
>> On 7/22/23 10:22, Raghavendra Rao Ananta wrote:
>>> The current implementation of the stage-2 unmap walker traverses
>>> the given range and, as a part of break-before-make, performs
>>> TLB invalidations with a DSB for every PTE. A multitude of this
>>> combination could cause a performance bottleneck on some systems.
>>>
>>> Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
>>> invalidations until the entire walk is finished, and then
>>> use range-based instructions to invalidate the TLBs in one go.
>>> Condition deferred TLB invalidation on the system supporting FWB,
>>> as the optimization is entirely pointless when the unmap walker
>>> needs to perform CMOs.
>>>
>>> Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
>>> now serves the stage-2 unmap walker specifically, rather than
>>> acting generic.
>>>
>>> Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
>>> ---
>>>    arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
>>>    1 file changed, 58 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>>> index 5ef098af1736..cf88933a2ea0 100644
>>> --- a/arch/arm64/kvm/hyp/pgtable.c
>>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>>> @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
>>>        smp_store_release(ctx->ptep, new);
>>>    }
>>>
>>> -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
>>> -                        struct kvm_pgtable_mm_ops *mm_ops)
>>> +struct stage2_unmap_data {
>>> +     struct kvm_pgtable *pgt;
>>> +     bool defer_tlb_flush_init;
>>> +};
>>> +
>>> +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
>>> +{
>>> +     /*
>>> +      * If FEAT_TLBIRANGE is implemented, defer the individual
>>> +      * TLB invalidations until the entire walk is finished, and
>>> +      * then use the range-based TLBI instructions to do the
>>> +      * invalidations. Condition deferred TLB invalidation on the
>>> +      * system supporting FWB, as the optimization is entirely
>>> +      * pointless when the unmap walker needs to perform CMOs.
>>> +      */
>>> +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
>>> +}
>>> +
>>> +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
>>> +{
>>> +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
>>> +
>>> +     /*
>>> +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
>>> +      * patching and the TLBIs' operations behavior depend on this,
>>> +      * track if there's any change in the state during the unmap sequence.
>>> +      */
>>> +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
>>> +     return defer_tlb_flush;
>>> +}
>>> +
>>> +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
>>> +                             struct kvm_s2_mmu *mmu,
>>> +                             struct kvm_pgtable_mm_ops *mm_ops)
>>>    {
>>> +     struct stage2_unmap_data *unmap_data = ctx->arg;
>>> +
>>>        /*
>>> -      * Clear the existing PTE, and perform break-before-make with
>>> -      * TLB maintenance if it was valid.
>>> +      * Clear the existing PTE, and perform break-before-make if it was
>>> +      * valid. Depending on the system support, the TLB maintenance for
>>> +      * the same can be deferred until the entire unmap is completed.
>>>         */
>>>        if (kvm_pte_valid(ctx->old)) {
>>>                kvm_clear_pte(ctx->ptep);
>>> -             kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
>>> +
>>> +             if (!stage2_unmap_defer_tlb_flush(unmap_data))
>> Why not directly check (unmap_data->defer_tlb_flush_init) here?
>>
> (Re-sending the reply as the previous one was formatted as HTML and
> was blocked by many lists)
> 
> No particular reason per say, but I was just going with the logic of
> determining if we need to defer the flush and the WARN_ON() parts
> separate.
> Any advantage if we directly check in stage2_unmap_put_pte() that I
> missed or is this purely for readability?

Hi Raghavendra,

I was just wondering about the following case: before the stage-2 walk we
decide to defer the TLB flush, but during the walk
stage2_unmap_defer_tlb_flush() triggers the WARN_ON() and returns false
(don't defer), so from then on it flushes the TLB for each ctx->addr.
However, the entries that were unmapped before the WARN_ON() triggered
will never be flushed: when the walk finishes in
kvm_pgtable_stage2_unmap(), stage2_unmap_defer_tlb_flush() triggers the
WARN_ON() again and the range-based flush is not used. Thus some of the
TLB entries are left unflushed.

If we directly check unmap_data->defer_tlb_flush_init, which doesn't
change while walking stage 2, the WARN_ON() should only trigger
in kvm_pgtable_stage2_unmap()->stage2_unmap_defer_tlb_flush().

I'm not sure if that's right; I just think that once we have decided to
use the TLBI range-based flush, the result of
__stage2_unmap_defer_tlb_flush() shouldn't change. Otherwise there will
be some stale TLB entries.

Thanks,
Shaoqin

> 
>>> +                     kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
>>> +                                     ctx->addr, ctx->level);
>> Small indent hint. The ctx->addr can align with __kvm_tlb_flush_vmid_ipa.
>>
> Ah, yes. I'll adjust this if I send out a v8.
> 
> Thank you.
> Raghavendra
>> Thanks,
>> Shaoqin
>>>        }
>>>
>>>        mm_ops->put_page(ctx->ptep);
>>> @@ -1070,7 +1108,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
>>>    static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>>>                               enum kvm_pgtable_walk_flags visit)
>>>    {
>>> -     struct kvm_pgtable *pgt = ctx->arg;
>>> +     struct stage2_unmap_data *unmap_data = ctx->arg;
>>> +     struct kvm_pgtable *pgt = unmap_data->pgt;
>>>        struct kvm_s2_mmu *mmu = pgt->mmu;
>>>        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
>>>        kvm_pte_t *childp = NULL;
>>> @@ -1098,7 +1137,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>>>         * block entry and rely on the remaining portions being faulted
>>>         * back lazily.
>>>         */
>>> -     stage2_put_pte(ctx, mmu, mm_ops);
>>> +     stage2_unmap_put_pte(ctx, mmu, mm_ops);
>>>
>>>        if (need_flush && mm_ops->dcache_clean_inval_poc)
>>>                mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
>>> @@ -1112,13 +1151,23 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>>>
>>>    int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
>>>    {
>>> +     int ret;
>>> +     struct stage2_unmap_data unmap_data = {
>>> +             .pgt = pgt,
>>> +             .defer_tlb_flush_init = __stage2_unmap_defer_tlb_flush(pgt),
>>> +     };
>>>        struct kvm_pgtable_walker walker = {
>>>                .cb     = stage2_unmap_walker,
>>> -             .arg    = pgt,
>>> +             .arg    = &unmap_data,
>>>                .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
>>>        };
>>>
>>> -     return kvm_pgtable_walk(pgt, addr, size, &walker);
>>> +     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
>>> +     if (stage2_unmap_defer_tlb_flush(&unmap_data))
>>> +             /* Perform the deferred TLB invalidations */
>>> +             kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
>>> +
>>> +     return ret;
>>>    }
>>>
>>>    struct stage2_attr_data {
>>
>> --
>> Shaoqin
>>
>
Raghavendra Rao Ananta July 25, 2023, 5:23 p.m. UTC | #4
Hi Shaoqin,

On Mon, Jul 24, 2023 at 7:32 PM Shaoqin Huang <shahuang@redhat.com> wrote:
>
>
>
> On 7/25/23 00:47, Raghavendra Rao Ananta wrote:
> > On Mon, Jul 24, 2023 at 2:35 AM Shaoqin Huang <shahuang@redhat.com> wrote:
> >>
> >> Hi Raghavendra,
> >>
> >> On 7/22/23 10:22, Raghavendra Rao Ananta wrote:
> >>> The current implementation of the stage-2 unmap walker traverses
> >>> the given range and, as a part of break-before-make, performs
> >>> TLB invalidations with a DSB for every PTE. A multitude of this
> >>> combination could cause a performance bottleneck on some systems.
> >>>
> >>> Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> >>> invalidations until the entire walk is finished, and then
> >>> use range-based instructions to invalidate the TLBs in one go.
> >>> Condition deferred TLB invalidation on the system supporting FWB,
> >>> as the optimization is entirely pointless when the unmap walker
> >>> needs to perform CMOs.
> >>>
> >>> Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> >>> now serves the stage-2 unmap walker specifically, rather than
> >>> acting generic.
> >>>
> >>> Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> >>> ---
> >>>    arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
> >>>    1 file changed, 58 insertions(+), 9 deletions(-)
> >>>
> >>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> >>> index 5ef098af1736..cf88933a2ea0 100644
> >>> --- a/arch/arm64/kvm/hyp/pgtable.c
> >>> +++ b/arch/arm64/kvm/hyp/pgtable.c
> >>> @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
> >>>        smp_store_release(ctx->ptep, new);
> >>>    }
> >>>
> >>> -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> >>> -                        struct kvm_pgtable_mm_ops *mm_ops)
> >>> +struct stage2_unmap_data {
> >>> +     struct kvm_pgtable *pgt;
> >>> +     bool defer_tlb_flush_init;
> >>> +};
> >>> +
> >>> +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> >>> +{
> >>> +     /*
> >>> +      * If FEAT_TLBIRANGE is implemented, defer the individual
> >>> +      * TLB invalidations until the entire walk is finished, and
> >>> +      * then use the range-based TLBI instructions to do the
> >>> +      * invalidations. Condition deferred TLB invalidation on the
> >>> +      * system supporting FWB, as the optimization is entirely
> >>> +      * pointless when the unmap walker needs to perform CMOs.
> >>> +      */
> >>> +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
> >>> +}
> >>> +
> >>> +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> >>> +{
> >>> +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> >>> +
> >>> +     /*
> >>> +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> >>> +      * patching and the TLBIs' operations behavior depend on this,
> >>> +      * track if there's any change in the state during the unmap sequence.
> >>> +      */
> >>> +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> >>> +     return defer_tlb_flush;
> >>> +}
> >>> +
> >>> +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
> >>> +                             struct kvm_s2_mmu *mmu,
> >>> +                             struct kvm_pgtable_mm_ops *mm_ops)
> >>>    {
> >>> +     struct stage2_unmap_data *unmap_data = ctx->arg;
> >>> +
> >>>        /*
> >>> -      * Clear the existing PTE, and perform break-before-make with
> >>> -      * TLB maintenance if it was valid.
> >>> +      * Clear the existing PTE, and perform break-before-make if it was
> >>> +      * valid. Depending on the system support, the TLB maintenance for
> >>> +      * the same can be deferred until the entire unmap is completed.
> >>>         */
> >>>        if (kvm_pte_valid(ctx->old)) {
> >>>                kvm_clear_pte(ctx->ptep);
> >>> -             kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
> >>> +
> >>> +             if (!stage2_unmap_defer_tlb_flush(unmap_data))
> >> Why not directly check (unmap_data->defer_tlb_flush_init) here?
> >>
> > (Re-sending the reply as the previous one was formatted as HTML and
> > was blocked by many lists)
> >
> > No particular reason per say, but I was just going with the logic of
> > determining if we need to defer the flush and the WARN_ON() parts
> > separate.
> > Any advantage if we directly check in stage2_unmap_put_pte() that I
> > missed or is this purely for readability?
>
> Hi Raghavendra,
>
> I just wondering if before the stage2 walk, we want to defer the tlb
> flush, but if when walk the stage2, the stage2_unmap_defer_tlb_flush()
> trigger the WARN_ON() and return don't defer the tlb flush, it will
> flush the ctx->addr's tlb. But before the WARN_ON() triggered, these tlb
> will not be flushed, since when walk stage2 done in the
> kvm_pgtable_stage2_unmap(), the stage2_unmap_defer_tlb_flush() still
> trigger the WARN_ON() and don't use the tlb range-based flush. Thus some
> of the tlb are not flushed.
>
Excellent point!
> If we directly check the (unmap_data->defer_tlb_flush_init), this isn't
> change during walking the stage2, so the WARN_ON() should only trigger
> in kvm_pgtable_stage2_unmap()->stage2_unmap_defer_tlb_flush().
>
> I'm not sure if it's right since I just think once we set up use the
> TLBI range-based flush, the result of the
> __stage2_unmap_defer_tlb_flush() shouldn't change. Otherwise there must
> have some stale TLB entry.
>
One solution that I can think of is, if the code triggers the
WARN_ON(), we flush the entire VM's TLB using
kvm_call_hyp(__kvm_tlb_flush_vmid) after the entire walk is finished.
In this special/rare situation, it'll be a little expensive, but we
will at least be correct, leaving no stale TLBs behind. WDYT?

Thank you.
Raghavendra
> Thanks,
> Shaoqin
>
> >
> >>> +                     kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
> >>> +                                     ctx->addr, ctx->level);
> >> Small indent hint. The ctx->addr can align with __kvm_tlb_flush_vmid_ipa.
> >>
> > Ah, yes. I'll adjust this if I send out a v8.
> >
> > Thank you.
> > Raghavendra
> >> Thanks,
> >> Shaoqin
> >>>        }
> >>>
> >>>        mm_ops->put_page(ctx->ptep);
> >>> @@ -1070,7 +1108,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >>>    static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >>>                               enum kvm_pgtable_walk_flags visit)
> >>>    {
> >>> -     struct kvm_pgtable *pgt = ctx->arg;
> >>> +     struct stage2_unmap_data *unmap_data = ctx->arg;
> >>> +     struct kvm_pgtable *pgt = unmap_data->pgt;
> >>>        struct kvm_s2_mmu *mmu = pgt->mmu;
> >>>        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
> >>>        kvm_pte_t *childp = NULL;
> >>> @@ -1098,7 +1137,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >>>         * block entry and rely on the remaining portions being faulted
> >>>         * back lazily.
> >>>         */
> >>> -     stage2_put_pte(ctx, mmu, mm_ops);
> >>> +     stage2_unmap_put_pte(ctx, mmu, mm_ops);
> >>>
> >>>        if (need_flush && mm_ops->dcache_clean_inval_poc)
> >>>                mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
> >>> @@ -1112,13 +1151,23 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
> >>>
> >>>    int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
> >>>    {
> >>> +     int ret;
> >>> +     struct stage2_unmap_data unmap_data = {
> >>> +             .pgt = pgt,
> >>> +             .defer_tlb_flush_init = __stage2_unmap_defer_tlb_flush(pgt),
> >>> +     };
> >>>        struct kvm_pgtable_walker walker = {
> >>>                .cb     = stage2_unmap_walker,
> >>> -             .arg    = pgt,
> >>> +             .arg    = &unmap_data,
> >>>                .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
> >>>        };
> >>>
> >>> -     return kvm_pgtable_walk(pgt, addr, size, &walker);
> >>> +     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
> >>> +     if (stage2_unmap_defer_tlb_flush(&unmap_data))
> >>> +             /* Perform the deferred TLB invalidations */
> >>> +             kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
> >>> +
> >>> +     return ret;
> >>>    }
> >>>
> >>>    struct stage2_attr_data {
> >>
> >> --
> >> Shaoqin
> >>
> >
>
> --
> Shaoqin
>
Shaoqin Huang July 26, 2023, 4:06 a.m. UTC | #5
Hi Raghavendra,

On 7/26/23 01:23, Raghavendra Rao Ananta wrote:
> Hi Shaoqin,
> 
> On Mon, Jul 24, 2023 at 7:32 PM Shaoqin Huang <shahuang@redhat.com> wrote:
>>
>>
>>
>> On 7/25/23 00:47, Raghavendra Rao Ananta wrote:
>>> On Mon, Jul 24, 2023 at 2:35 AM Shaoqin Huang <shahuang@redhat.com> wrote:
>>>>
>>>> Hi Raghavendra,
>>>>
>>>> On 7/22/23 10:22, Raghavendra Rao Ananta wrote:
>>>>> The current implementation of the stage-2 unmap walker traverses
>>>>> the given range and, as a part of break-before-make, performs
>>>>> TLB invalidations with a DSB for every PTE. A multitude of this
>>>>> combination could cause a performance bottleneck on some systems.
>>>>>
>>>>> Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
>>>>> invalidations until the entire walk is finished, and then
>>>>> use range-based instructions to invalidate the TLBs in one go.
>>>>> Condition deferred TLB invalidation on the system supporting FWB,
>>>>> as the optimization is entirely pointless when the unmap walker
>>>>> needs to perform CMOs.
>>>>>
>>>>> Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
>>>>> now serves the stage-2 unmap walker specifically, rather than
>>>>> acting generic.
>>>>>
>>>>> Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
>>>>> ---
>>>>>     arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
>>>>>     1 file changed, 58 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>>>>> index 5ef098af1736..cf88933a2ea0 100644
>>>>> --- a/arch/arm64/kvm/hyp/pgtable.c
>>>>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>>>>> @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
>>>>>         smp_store_release(ctx->ptep, new);
>>>>>     }
>>>>>
>>>>> -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
>>>>> -                        struct kvm_pgtable_mm_ops *mm_ops)
>>>>> +struct stage2_unmap_data {
>>>>> +     struct kvm_pgtable *pgt;
>>>>> +     bool defer_tlb_flush_init;
>>>>> +};
>>>>> +
>>>>> +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
>>>>> +{
>>>>> +     /*
>>>>> +      * If FEAT_TLBIRANGE is implemented, defer the individual
>>>>> +      * TLB invalidations until the entire walk is finished, and
>>>>> +      * then use the range-based TLBI instructions to do the
>>>>> +      * invalidations. Condition deferred TLB invalidation on the
>>>>> +      * system supporting FWB, as the optimization is entirely
>>>>> +      * pointless when the unmap walker needs to perform CMOs.
>>>>> +      */
>>>>> +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
>>>>> +}
>>>>> +
>>>>> +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
>>>>> +{
>>>>> +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
>>>>> +
>>>>> +     /*
>>>>> +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
>>>>> +      * patching and the TLBIs' operations behavior depend on this,
>>>>> +      * track if there's any change in the state during the unmap sequence.
>>>>> +      */
>>>>> +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
>>>>> +     return defer_tlb_flush;
>>>>> +}
>>>>> +
>>>>> +static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
>>>>> +                             struct kvm_s2_mmu *mmu,
>>>>> +                             struct kvm_pgtable_mm_ops *mm_ops)
>>>>>     {
>>>>> +     struct stage2_unmap_data *unmap_data = ctx->arg;
>>>>> +
>>>>>         /*
>>>>> -      * Clear the existing PTE, and perform break-before-make with
>>>>> -      * TLB maintenance if it was valid.
>>>>> +      * Clear the existing PTE, and perform break-before-make if it was
>>>>> +      * valid. Depending on the system support, the TLB maintenance for
>>>>> +      * the same can be deferred until the entire unmap is completed.
>>>>>          */
>>>>>         if (kvm_pte_valid(ctx->old)) {
>>>>>                 kvm_clear_pte(ctx->ptep);
>>>>> -             kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
>>>>> +
>>>>> +             if (!stage2_unmap_defer_tlb_flush(unmap_data))
>>>> Why not directly check (unmap_data->defer_tlb_flush_init) here?
>>>>
>>> (Re-sending the reply as the previous one was formatted as HTML and
>>> was blocked by many lists)
>>>
>>> No particular reason per say, but I was just going with the logic of
>>> determining if we need to defer the flush and the WARN_ON() parts
>>> separate.
>>> Any advantage if we directly check in stage2_unmap_put_pte() that I
>>> missed or is this purely for readability?
>>
>> Hi Raghavendra,
>>
>> I just wondering if before the stage2 walk, we want to defer the tlb
>> flush, but if when walk the stage2, the stage2_unmap_defer_tlb_flush()
>> trigger the WARN_ON() and return don't defer the tlb flush, it will
>> flush the ctx->addr's tlb. But before the WARN_ON() triggered, these tlb
>> will not be flushed, since when walk stage2 done in the
>> kvm_pgtable_stage2_unmap(), the stage2_unmap_defer_tlb_flush() still
>> trigger the WARN_ON() and don't use the tlb range-based flush. Thus some
>> of the tlb are not flushed.
>>
> Excellent point!
>> If we directly check the (unmap_data->defer_tlb_flush_init), this isn't
>> change during walking the stage2, so the WARN_ON() should only trigger
>> in kvm_pgtable_stage2_unmap()->stage2_unmap_defer_tlb_flush().
>>
>> I'm not sure if it's right since I just think once we set up use the
>> TLBI range-based flush, the result of the
>> __stage2_unmap_defer_tlb_flush() shouldn't change. Otherwise there must
>> have some stale TLB entry.
>>
> One solution that I can think of is, if the code triggers the
> WARN_ON(), we flush the entire VM's TLB using
> kvm_call_hyp(__kvm_tlb_flush_vmid) after the entire walk is finished.
> In this special/rare situation, it'll be a little expensive, but we
> will at least be correct, leaving no stale TLBs behind. WDYT?
> 

I think it will be good to have this handling.

Thanks,
Shaoqin

> Thank you.
> Raghavendra
>> Thanks,
>> Shaoqin
>>
>>>
>>>>> +                     kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
>>>>> +                                     ctx->addr, ctx->level);
>>>> Small indent hint. The ctx->addr can align with __kvm_tlb_flush_vmid_ipa.
>>>>
>>> Ah, yes. I'll adjust this if I send out a v8.
>>>
>>> Thank you.
>>> Raghavendra
>>>> Thanks,
>>>> Shaoqin
>>>>>         }
>>>>>
>>>>>         mm_ops->put_page(ctx->ptep);
>>>>> @@ -1070,7 +1108,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
>>>>>     static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>>>>>                                enum kvm_pgtable_walk_flags visit)
>>>>>     {
>>>>> -     struct kvm_pgtable *pgt = ctx->arg;
>>>>> +     struct stage2_unmap_data *unmap_data = ctx->arg;
>>>>> +     struct kvm_pgtable *pgt = unmap_data->pgt;
>>>>>         struct kvm_s2_mmu *mmu = pgt->mmu;
>>>>>         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
>>>>>         kvm_pte_t *childp = NULL;
>>>>> @@ -1098,7 +1137,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>>>>>          * block entry and rely on the remaining portions being faulted
>>>>>          * back lazily.
>>>>>          */
>>>>> -     stage2_put_pte(ctx, mmu, mm_ops);
>>>>> +     stage2_unmap_put_pte(ctx, mmu, mm_ops);
>>>>>
>>>>>         if (need_flush && mm_ops->dcache_clean_inval_poc)
>>>>>                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
>>>>> @@ -1112,13 +1151,23 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
>>>>>
>>>>>     int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
>>>>>     {
>>>>> +     int ret;
>>>>> +     struct stage2_unmap_data unmap_data = {
>>>>> +             .pgt = pgt,
>>>>> +             .defer_tlb_flush_init = __stage2_unmap_defer_tlb_flush(pgt),
>>>>> +     };
>>>>>         struct kvm_pgtable_walker walker = {
>>>>>                 .cb     = stage2_unmap_walker,
>>>>> -             .arg    = pgt,
>>>>> +             .arg    = &unmap_data,
>>>>>                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
>>>>>         };
>>>>>
>>>>> -     return kvm_pgtable_walk(pgt, addr, size, &walker);
>>>>> +     ret = kvm_pgtable_walk(pgt, addr, size, &walker);
>>>>> +     if (stage2_unmap_defer_tlb_flush(&unmap_data))
>>>>> +             /* Perform the deferred TLB invalidations */
>>>>> +             kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
>>>>> +
>>>>> +     return ret;
>>>>>     }
>>>>>
>>>>>     struct stage2_attr_data {
>>>>
>>>> --
>>>> Shaoqin
>>>>
>>>
>>
>> --
>> Shaoqin
>>
>
Marc Zyngier July 27, 2023, 1:12 p.m. UTC | #6
On Sat, 22 Jul 2023 03:22:51 +0100,
Raghavendra Rao Ananta <rananta@google.com> wrote:
> 
> The current implementation of the stage-2 unmap walker traverses
> the given range and, as a part of break-before-make, performs
> TLB invalidations with a DSB for every PTE. A multitude of this
> combination could cause a performance bottleneck on some systems.
> 
> Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> invalidations until the entire walk is finished, and then
> use range-based instructions to invalidate the TLBs in one go.
> Condition deferred TLB invalidation on the system supporting FWB,
> as the optimization is entirely pointless when the unmap walker
> needs to perform CMOs.
> 
> Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> now serves the stage-2 unmap walker specifically, rather than
> acting generic.
> 
> Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> ---
>  arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
>  1 file changed, 58 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 5ef098af1736..cf88933a2ea0 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
>  	smp_store_release(ctx->ptep, new);
>  }
>  
> -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> -			   struct kvm_pgtable_mm_ops *mm_ops)
> +struct stage2_unmap_data {
> +	struct kvm_pgtable *pgt;
> +	bool defer_tlb_flush_init;
> +};
> +
> +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> +{
> +	/*
> +	 * If FEAT_TLBIRANGE is implemented, defer the individual
> +	 * TLB invalidations until the entire walk is finished, and
> +	 * then use the range-based TLBI instructions to do the
> +	 * invalidations. Condition deferred TLB invalidation on the
> +	 * system supporting FWB, as the optimization is entirely
> +	 * pointless when the unmap walker needs to perform CMOs.
> +	 */
> +	return system_supports_tlb_range() && stage2_has_fwb(pgt);
> +}
> +
> +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> +{
> +	bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> +
> +	/*
> +	 * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> +	 * patching and the TLBIs' operations behavior depend on this,
> +	 * track if there's any change in the state during the unmap sequence.
> +	 */
> +	WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> +	return defer_tlb_flush;

I really don't understand what you're testing here. The ability to
defer TLB invalidation is a function of the system capabilities
(range+FWB) and a single flag that is only set on the host for pKVM.

How could that change in the middle of the life of the system? It
further begs the question about the need for the unmap_data data
structure.

It looks to me that we could simply pass the pgt pointer around and be
done with it. Am I missing something obvious?

	M.
Raghavendra Rao Ananta July 31, 2023, 6:26 p.m. UTC | #7
On Thu, Jul 27, 2023 at 6:12 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Sat, 22 Jul 2023 03:22:51 +0100,
> Raghavendra Rao Ananta <rananta@google.com> wrote:
> >
> > The current implementation of the stage-2 unmap walker traverses
> > the given range and, as a part of break-before-make, performs
> > TLB invalidations with a DSB for every PTE. A multitude of this
> > combination could cause a performance bottleneck on some systems.
> >
> > Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> > invalidations until the entire walk is finished, and then
> > use range-based instructions to invalidate the TLBs in one go.
> > Condition deferred TLB invalidation on the system supporting FWB,
> > as the optimization is entirely pointless when the unmap walker
> > needs to perform CMOs.
> >
> > Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> > now serves the stage-2 unmap walker specifically, rather than
> > acting generic.
> >
> > Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> > ---
> >  arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
> >  1 file changed, 58 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 5ef098af1736..cf88933a2ea0 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
> >       smp_store_release(ctx->ptep, new);
> >  }
> >
> > -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> > -                        struct kvm_pgtable_mm_ops *mm_ops)
> > +struct stage2_unmap_data {
> > +     struct kvm_pgtable *pgt;
> > +     bool defer_tlb_flush_init;
> > +};
> > +
> > +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> > +{
> > +     /*
> > +      * If FEAT_TLBIRANGE is implemented, defer the individual
> > +      * TLB invalidations until the entire walk is finished, and
> > +      * then use the range-based TLBI instructions to do the
> > +      * invalidations. Condition deferred TLB invalidation on the
> > +      * system supporting FWB, as the optimization is entirely
> > +      * pointless when the unmap walker needs to perform CMOs.
> > +      */
> > +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
> > +}
> > +
> > +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> > +{
> > +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> > +
> > +     /*
> > +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> > +      * patching and the TLBIs' operations behavior depend on this,
> > +      * track if there's any change in the state during the unmap sequence.
> > +      */
> > +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> > +     return defer_tlb_flush;
>
> I really don't understand what you're testing here. The ability to
> defer TLB invalidation is a function of the system capabilities
> (range+FWB) and a single flag that is only set on the host for pKVM.
>
> How could that change in the middle of the life of the system? if
> further begs the question about the need for the unmap_data data
> structure.
>
> It looks to me that we could simply pass the pgt pointer around and be
> done with it. Am I missing something obvious?
>
From one of the previous comments [1] (used in a different context),
I'm given to understand that since these feature checks are governed
by alternative patching, they can potentially change (at runtime?). Is
that not the case and I have misunderstood the idea in comment [1]
entirely? Is it solely used for optimization purposes and set only
once?
If that's the case, I can get rid of the WARN_ON() and unmap_data.

- Raghavendra

[1]: https://lore.kernel.org/all/ZGPPj1AXS0Uah2Ug@linux.dev/
>         M.
>
> --
> Without deviation from the norm, progress is not possible.
Marc Zyngier Aug. 2, 2023, 11:28 p.m. UTC | #8
On Mon, 31 Jul 2023 19:26:09 +0100,
Raghavendra Rao Ananta <rananta@google.com> wrote:
> 
> On Thu, Jul 27, 2023 at 6:12 AM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Sat, 22 Jul 2023 03:22:51 +0100,
> > Raghavendra Rao Ananta <rananta@google.com> wrote:
> > >
> > > The current implementation of the stage-2 unmap walker traverses
> > > the given range and, as a part of break-before-make, performs
> > > TLB invalidations with a DSB for every PTE. A multitude of this
> > > combination could cause a performance bottleneck on some systems.
> > >
> > > Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> > > invalidations until the entire walk is finished, and then
> > > use range-based instructions to invalidate the TLBs in one go.
> > > Condition deferred TLB invalidation on the system supporting FWB,
> > > as the optimization is entirely pointless when the unmap walker
> > > needs to perform CMOs.
> > >
> > > Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> > > now serves the stage-2 unmap walker specifically, rather than
> > > acting generic.
> > >
> > > Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> > > ---
> > >  arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
> > >  1 file changed, 58 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > index 5ef098af1736..cf88933a2ea0 100644
> > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
> > >       smp_store_release(ctx->ptep, new);
> > >  }
> > >
> > > -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> > > -                        struct kvm_pgtable_mm_ops *mm_ops)
> > > +struct stage2_unmap_data {
> > > +     struct kvm_pgtable *pgt;
> > > +     bool defer_tlb_flush_init;
> > > +};
> > > +
> > > +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> > > +{
> > > +     /*
> > > +      * If FEAT_TLBIRANGE is implemented, defer the individual
> > > +      * TLB invalidations until the entire walk is finished, and
> > > +      * then use the range-based TLBI instructions to do the
> > > +      * invalidations. Condition deferred TLB invalidation on the
> > > +      * system supporting FWB, as the optimization is entirely
> > > +      * pointless when the unmap walker needs to perform CMOs.
> > > +      */
> > > +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
> > > +}
> > > +
> > > +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> > > +{
> > > +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> > > +
> > > +     /*
> > > +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> > > +      * patching and the TLBIs' operations behavior depend on this,
> > > +      * track if there's any change in the state during the unmap sequence.
> > > +      */
> > > +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> > > +     return defer_tlb_flush;
> >
> > I really don't understand what you're testing here. The ability to
> > defer TLB invalidation is a function of the system capabilities
> > (range+FWB) and a single flag that is only set on the host for pKVM.
> >
> > How could that change in the middle of the life of the system? if
> > further begs the question about the need for the unmap_data data
> > structure.
> >
> > It looks to me that we could simply pass the pgt pointer around and be
> > done with it. Am I missing something obvious?
> >
> From one of the previous comments [1] (used in a different context),
> I'm given to understand that since these feature checks are governed
> by alternative patching, they can potentially change (at runtime?). Is
> that not the case and I have misunderstood the idea in comment [1]
> entirely? Is it solely used for optimization purposes and set only
> once?

Alternative patching, just like the static branches used to implement
the capability stuff, is a one way street. At the point where KVM is
initialised, these configurations are set in stone, and there is no
going back.

> If that's the case, I can get rid of the WARN_ON() and unmap_data.

yes, please.

Thanks,

	M.
Raghavendra Rao Ananta Aug. 2, 2023, 11:33 p.m. UTC | #9
On Wed, Aug 2, 2023 at 4:28 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Mon, 31 Jul 2023 19:26:09 +0100,
> Raghavendra Rao Ananta <rananta@google.com> wrote:
> >
> > On Thu, Jul 27, 2023 at 6:12 AM Marc Zyngier <maz@kernel.org> wrote:
> > >
> > > On Sat, 22 Jul 2023 03:22:51 +0100,
> > > Raghavendra Rao Ananta <rananta@google.com> wrote:
> > > >
> > > > The current implementation of the stage-2 unmap walker traverses
> > > > the given range and, as a part of break-before-make, performs
> > > > TLB invalidations with a DSB for every PTE. A multitude of this
> > > > combination could cause a performance bottleneck on some systems.
> > > >
> > > > Hence, if the system supports FEAT_TLBIRANGE, defer the TLB
> > > > invalidations until the entire walk is finished, and then
> > > > use range-based instructions to invalidate the TLBs in one go.
> > > > Condition deferred TLB invalidation on the system supporting FWB,
> > > > as the optimization is entirely pointless when the unmap walker
> > > > needs to perform CMOs.
> > > >
> > > > Rename stage2_put_pte() to stage2_unmap_put_pte() as the function
> > > > now serves the stage-2 unmap walker specifically, rather than
> > > > acting generic.
> > > >
> > > > Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
> > > > ---
> > > >  arch/arm64/kvm/hyp/pgtable.c | 67 +++++++++++++++++++++++++++++++-----
> > > >  1 file changed, 58 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > > > index 5ef098af1736..cf88933a2ea0 100644
> > > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > > @@ -831,16 +831,54 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
> > > >       smp_store_release(ctx->ptep, new);
> > > >  }
> > > >
> > > > -static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
> > > > -                        struct kvm_pgtable_mm_ops *mm_ops)
> > > > +struct stage2_unmap_data {
> > > > +     struct kvm_pgtable *pgt;
> > > > +     bool defer_tlb_flush_init;
> > > > +};
> > > > +
> > > > +static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
> > > > +{
> > > > +     /*
> > > > +      * If FEAT_TLBIRANGE is implemented, defer the individual
> > > > +      * TLB invalidations until the entire walk is finished, and
> > > > +      * then use the range-based TLBI instructions to do the
> > > > +      * invalidations. Condition deferred TLB invalidation on the
> > > > +      * system supporting FWB, as the optimization is entirely
> > > > +      * pointless when the unmap walker needs to perform CMOs.
> > > > +      */
> > > > +     return system_supports_tlb_range() && stage2_has_fwb(pgt);
> > > > +}
> > > > +
> > > > +static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
> > > > +{
> > > > +     bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
> > > > +
> > > > +     /*
> > > > +      * Since __stage2_unmap_defer_tlb_flush() is based on alternative
> > > > +      * patching and the TLBIs' operations behavior depend on this,
> > > > +      * track if there's any change in the state during the unmap sequence.
> > > > +      */
> > > > +     WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
> > > > +     return defer_tlb_flush;
> > >
> > > I really don't understand what you're testing here. The ability to
> > > defer TLB invalidation is a function of the system capabilities
> > > (range+FWB) and a single flag that is only set on the host for pKVM.
> > >
> > > How could that change in the middle of the life of the system? if
> > > further begs the question about the need for the unmap_data data
> > > structure.
> > >
> > > It looks to me that we could simply pass the pgt pointer around and be
> > > done with it. Am I missing something obvious?
> > >
> > From one of the previous comments [1] (used in a different context),
> > I'm given to understand that since these feature checks are governed
> > by alternative patching, they can potentially change (at runtime?). Is
> > that not the case and I have misunderstood the idea in comment [1]
> > entirely? Is it solely used for optimization purposes and set only
> > once?
>
> Alternative patching, just like the static branches used to implement
> the capability stuff, is a one way street. At the point where KVM is
> initialised, these configurations are set in stone, and there is no
> going back.
>
Understood.
> > If that's the case, I can get rid of the WARN_ON() and unmap_data.
>
> yes, please.
>
Sure, I'll get rid of the WARN_ON and 'struct stage2_unmap_data' in v8.

Thanks,
Raghavendra
> Thanks,
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.
diff mbox series

Patch

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 5ef098af1736..cf88933a2ea0 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -831,16 +831,54 @@  static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
 	smp_store_release(ctx->ptep, new);
 }
 
-static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
-			   struct kvm_pgtable_mm_ops *mm_ops)
+struct stage2_unmap_data {
+	struct kvm_pgtable *pgt;
+	bool defer_tlb_flush_init;
+};
+
+static bool __stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
+{
+	/*
+	 * If FEAT_TLBIRANGE is implemented, defer the individual
+	 * TLB invalidations until the entire walk is finished, and
+	 * then use the range-based TLBI instructions to do the
+	 * invalidations. Condition deferred TLB invalidation on the
+	 * system supporting FWB, as the optimization is entirely
+	 * pointless when the unmap walker needs to perform CMOs.
+	 */
+	return system_supports_tlb_range() && stage2_has_fwb(pgt);
+}
+
+static bool stage2_unmap_defer_tlb_flush(struct stage2_unmap_data *unmap_data)
+{
+	bool defer_tlb_flush = __stage2_unmap_defer_tlb_flush(unmap_data->pgt);
+
+	/*
+	 * Since __stage2_unmap_defer_tlb_flush() is based on alternative
+	 * patching and the TLBIs' operations behavior depend on this,
+	 * track if there's any change in the state during the unmap sequence.
+	 */
+	WARN_ON(unmap_data->defer_tlb_flush_init != defer_tlb_flush);
+	return defer_tlb_flush;
+}
+
+static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
+				struct kvm_s2_mmu *mmu,
+				struct kvm_pgtable_mm_ops *mm_ops)
 {
+	struct stage2_unmap_data *unmap_data = ctx->arg;
+
 	/*
-	 * Clear the existing PTE, and perform break-before-make with
-	 * TLB maintenance if it was valid.
+	 * Clear the existing PTE, and perform break-before-make if it was
+	 * valid. Depending on the system support, the TLB maintenance for
+	 * the same can be deferred until the entire unmap is completed.
 	 */
 	if (kvm_pte_valid(ctx->old)) {
 		kvm_clear_pte(ctx->ptep);
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+
+		if (!stage2_unmap_defer_tlb_flush(unmap_data))
+			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
+					ctx->addr, ctx->level);
 	}
 
 	mm_ops->put_page(ctx->ptep);
@@ -1070,7 +1108,8 @@  int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
 static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 			       enum kvm_pgtable_walk_flags visit)
 {
-	struct kvm_pgtable *pgt = ctx->arg;
+	struct stage2_unmap_data *unmap_data = ctx->arg;
+	struct kvm_pgtable *pgt = unmap_data->pgt;
 	struct kvm_s2_mmu *mmu = pgt->mmu;
 	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 	kvm_pte_t *childp = NULL;
@@ -1098,7 +1137,7 @@  static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 	 * block entry and rely on the remaining portions being faulted
 	 * back lazily.
 	 */
-	stage2_put_pte(ctx, mmu, mm_ops);
+	stage2_unmap_put_pte(ctx, mmu, mm_ops);
 
 	if (need_flush && mm_ops->dcache_clean_inval_poc)
 		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
@@ -1112,13 +1151,23 @@  static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 
 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
+	int ret;
+	struct stage2_unmap_data unmap_data = {
+		.pgt = pgt,
+		.defer_tlb_flush_init = __stage2_unmap_defer_tlb_flush(pgt),
+	};
 	struct kvm_pgtable_walker walker = {
 		.cb	= stage2_unmap_walker,
-		.arg	= pgt,
+		.arg	= &unmap_data,
 		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
 	};
 
-	return kvm_pgtable_walk(pgt, addr, size, &walker);
+	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+	if (stage2_unmap_defer_tlb_flush(&unmap_data))
+		/* Perform the deferred TLB invalidations */
+		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
+
+	return ret;
 }
 
 struct stage2_attr_data {