
arm64: mte: avoid TFSR related operations unless in async mode

Message ID 20210701031448.2173-1-pcc@google.com (mailing list archive)
State New, archived
Series: arm64: mte: avoid TFSR related operations unless in async mode

Commit Message

Peter Collingbourne July 1, 2021, 3:14 a.m. UTC
There is no reason to touch TFSR nor issue a DSB unless our task is
in asynchronous mode. Since these operations (especially the DSB)
may be expensive on certain microarchitectures, only perform them
if necessary.
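
For reference, the TCF0 mode test this relies on, sketched in C (the
constant mirrors arch/arm64/include/asm/sysreg.h; the helper itself is
illustrative, not kernel code):

	#define SCTLR_EL1_TCF0_SHIFT	38	/* two-bit TCF0 field in SCTLR_EL1 */

	/*
	 * TCF0 encodings: NONE = 0b00, SYNC = 0b01, ASYNC = 0b10, ASYM = 0b11.
	 * Only ASYNC and ASYM can raise asynchronous faults, and those are
	 * exactly the encodings with the upper bit set, so a single bit test
	 * (one tbz in entry.S) covers both.
	 */
	static bool async_tcf_possible(u64 sctlr_user)
	{
		return sctlr_user & (1UL << (SCTLR_EL1_TCF0_SHIFT + 1));
	}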

Signed-off-by: Peter Collingbourne <pcc@google.com>
Link: https://linux-review.googlesource.com/id/Ib353a63e3d0abc2b0b008e96aa2d9692cfc1b815
---
 arch/arm64/kernel/entry.S | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

Comments

Catalin Marinas July 1, 2021, 5:37 p.m. UTC | #1
On Wed, Jun 30, 2021 at 08:14:48PM -0700, Peter Collingbourne wrote:
> There is no reason to touch TFSR nor issue a DSB unless our task is
> in asynchronous mode. Since these operations (especially the DSB)
> may be expensive on certain microarchitectures, only perform them
> if necessary.

Have you noticed any performance degradation in practice?

> Signed-off-by: Peter Collingbourne <pcc@google.com>
> Link: https://linux-review.googlesource.com/id/Ib353a63e3d0abc2b0b008e96aa2d9692cfc1b815
> ---
>  arch/arm64/kernel/entry.S | 27 +++++++++++++++++++--------
>  1 file changed, 19 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 863d44f73028..c2338414c558 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -133,12 +133,18 @@ alternative_cb_end
>  	.endm
>  
>  	/* Check for MTE asynchronous tag check faults */
> -	.macro check_mte_async_tcf, tmp, ti_flags
> +	.macro check_mte_async_tcf, tmp, ti_flags, thread_sctlr
>  #ifdef CONFIG_ARM64_MTE
>  	.arch_extension lse
>  alternative_if_not ARM64_MTE
>  	b	1f
>  alternative_else_nop_endif
> +	/*
> +	 * Asynchronous tag check faults are only possible in ASYNC (2) or
> +	 * ASYM (3) modes. In each of these modes bit 1 of SCTLR_EL1.TCF0 is
> +	 * set, so skip the check if it is unset.
> +	 */
> +	tbz	\thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
>  	mrs_s	\tmp, SYS_TFSRE0_EL1
>  	tbz	\tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f

I don't think this one is that expensive.

>  	/* Asynchronous TCF occurred for TTBR0 access, set the TI flag */
> @@ -151,11 +157,14 @@ alternative_else_nop_endif
>  	.endm
>  
>  	/* Clear the MTE asynchronous tag check faults */
> -	.macro clear_mte_async_tcf
> +	.macro clear_mte_async_tcf thread_sctlr
>  #ifdef CONFIG_ARM64_MTE
>  alternative_if ARM64_MTE
> +	/* See comment in check_mte_async_tcf above. */
> +	tbz	\thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
>  	dsb	ish
>  	msr_s	SYS_TFSRE0_EL1, xzr
> +1:

Here, maybe, as we have a DSB.
Peter Collingbourne July 1, 2021, 6:11 p.m. UTC | #2
On Thu, Jul 1, 2021 at 10:37 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Wed, Jun 30, 2021 at 08:14:48PM -0700, Peter Collingbourne wrote:
> > There is no reason to touch TFSR nor issue a DSB unless our task is
> > in asynchronous mode. Since these operations (especially the DSB)
> > may be expensive on certain microarchitectures, only perform them
> > if necessary.
>
> Have you noticed any performance degradation in practice?

Yes, we measured a degradation on the hardware that we have access to.
But as I'm sure you can understand I have to be a bit vague about the
numbers here.

> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > Link: https://linux-review.googlesource.com/id/Ib353a63e3d0abc2b0b008e96aa2d9692cfc1b815
> > ---
> >  arch/arm64/kernel/entry.S | 27 +++++++++++++++++++--------
> >  1 file changed, 19 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index 863d44f73028..c2338414c558 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -133,12 +133,18 @@ alternative_cb_end
> >       .endm
> >
> >       /* Check for MTE asynchronous tag check faults */
> > -     .macro check_mte_async_tcf, tmp, ti_flags
> > +     .macro check_mte_async_tcf, tmp, ti_flags, thread_sctlr
> >  #ifdef CONFIG_ARM64_MTE
> >       .arch_extension lse
> >  alternative_if_not ARM64_MTE
> >       b       1f
> >  alternative_else_nop_endif
> > +     /*
> > +      * Asynchronous tag check faults are only possible in ASYNC (2) or
> > +      * ASYM (3) modes. In each of these modes bit 1 of SCTLR_EL1.TCF0 is
> > +      * set, so skip the check if it is unset.
> > +      */
> > +     tbz     \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
> >       mrs_s   \tmp, SYS_TFSRE0_EL1
> >       tbz     \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f
>
> I don't think this one is that expensive.
>
> >       /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */
> > @@ -151,11 +157,14 @@ alternative_else_nop_endif
> >       .endm
> >
> >       /* Clear the MTE asynchronous tag check faults */
> > -     .macro clear_mte_async_tcf
> > +     .macro clear_mte_async_tcf thread_sctlr
> >  #ifdef CONFIG_ARM64_MTE
> >  alternative_if ARM64_MTE
> > +     /* See comment in check_mte_async_tcf above. */
> > +     tbz     \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
> >       dsb     ish
> >       msr_s   SYS_TFSRE0_EL1, xzr
> > +1:
>
> Here, maybe, as we have a DSB.

Yes, disabling clear_mte_async_tcf offered an order of magnitude
larger speedup than disabling check_mte_async_tcf, presumably due to
the DSB. I would reckon though that if we're going to make some of the
code conditional on TCF we might as well make all of it conditional in
order to get the maximum possible benefit.

Nevertheless, isn't it the case that disabling check_mte_async_tcf for
non-ASYNC tasks is necessary for correctness if we want to disable
clear_mte_async_tcf? Imagine that we just disable clear_mte_async_tcf,
and then we get a tag check failing uaccess in a TCF=ASYNC task which
then gets preempted by a TCF=NONE task which will skip clear on kernel
exit. If we don't disable check on kernel entry then I believe that we
will get a false positive tag check fault in the TCF=NONE task the
next time it enters the kernel.
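
To make the ordering concrete, here is a tiny standalone model of that
sequence (purely illustrative; the bool stands in for TFSRE0_EL1.TF0):

	#include <stdbool.h>
	#include <stdio.h>

	static bool tf0;	/* models the per-CPU TFSRE0_EL1.TF0 bit */

	int main(void)
	{
		bool check_gated = false;	/* false = today's unconditional check */

		tf0 = true;	/* TCF=ASYNC task takes a tag check fault in uaccess */

		/*
		 * The ASYNC task is preempted by a TCF=NONE task, which later
		 * exits the kernel: with clear_mte_async_tcf gated on TCF0 the
		 * clear is skipped, so tf0 stays set.
		 */

		/* The TCF=NONE task now re-enters the kernel: */
		if (tf0 && !check_gated)
			printf("false positive fault for the TCF=NONE task\n");
		return 0;
	}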

Peter
Catalin Marinas July 2, 2021, 5:37 p.m. UTC | #3
On Thu, Jul 01, 2021 at 11:11:34AM -0700, Peter Collingbourne wrote:
> On Thu, Jul 1, 2021 at 10:37 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Wed, Jun 30, 2021 at 08:14:48PM -0700, Peter Collingbourne wrote:
> > >       /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */
> > > @@ -151,11 +157,14 @@ alternative_else_nop_endif
> > >       .endm
> > >
> > >       /* Clear the MTE asynchronous tag check faults */
> > > -     .macro clear_mte_async_tcf
> > > +     .macro clear_mte_async_tcf thread_sctlr
> > >  #ifdef CONFIG_ARM64_MTE
> > >  alternative_if ARM64_MTE
> > > +     /* See comment in check_mte_async_tcf above. */
> > > +     tbz     \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
> > >       dsb     ish
> > >       msr_s   SYS_TFSRE0_EL1, xzr
> > > +1:
> >
> > Here, maybe, as we have a DSB.
> 
> Yes, disabling clear_mte_async_tcf offered an order of magnitude
> larger speedup than disabling check_mte_async_tcf, presumably due to
> the DSB. I would reckon though that if we're going to make some of the
> code conditional on TCF we might as well make all of it conditional in
> order to get the maximum possible benefit.

I'd like to avoid a TBZ on sctlr_user if it's not necessary. I reckon
the big CPUs would prefer async mode anyway.

> Nevertheless, isn't it the case that disabling check_mte_async_tcf for
> non-ASYNC tasks is necessary for correctness if we want to disable
> clear_mte_async_tcf? Imagine that we just disable clear_mte_async_tcf,
> and then we get a tag check failing uaccess in a TCF=ASYNC task which
> then gets preempted by a TCF=NONE task which will skip clear on kernel
> exit. If we don't disable check on kernel entry then I believe that we
> will get a false positive tag check fault in the TCF=NONE task the
> next time it enters the kernel.

You are right, only doing one side would cause potential issues.

The uaccess routines honour the SCTLR_EL1.TCF0 setting (this was
corrected in the architecture pseudocode some months ago). If we zero
TFSRE0_EL1 in mte_thread_switch(), it should cover your case. This
shouldn't be expensive since we already have a DSB on that path. I'm not
sure it's better than your proposal but not allowing the TFSRE0_EL1
state to span multiple threads makes reasoning about it a bit easier.
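
Something like this untested sketch (the real mte_thread_switch() in
arch/arm64/kernel/mte.c carries more per-thread state handling than
shown):

	void mte_thread_switch(struct task_struct *next)
	{
		if (!system_supports_mte())
			return;

		/* ... existing per-thread GCR/TCF handling elided ... */

		/*
		 * Discard any fault recorded on behalf of the outgoing task
		 * so TFSRE0_EL1 state never spans threads; the switch path
		 * already has a DSB, so no extra barrier is needed here.
		 */
		write_sysreg_s(0, SYS_TFSRE0_EL1);
	}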

If the above context switch zeroing doesn't work, we could go ahead with
your patch. But since TFSRE0_EL1 != 0 is a rare event and we expect to
run in async mode on some CPUs, we could move the TBZ on sctlr_user in
check_mte_async_tcf after the tbz for the actual TFSRE0_EL1. IOW, only
check it prior to setting the TIF flag.
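
In C-like pseudocode (standing in for the asm, not real kernel code):

	u64 tfsr = read_sysreg_s(SYS_TFSRE0_EL1);

	if (tfsr & SYS_TFSR_EL1_TF0) {		/* rare: a fault was recorded */
		/* consult sctlr_user only on this cold path */
		if (sctlr_user & (1UL << (SCTLR_EL1_TCF0_SHIFT + 1)))
			set_thread_flag(TIF_MTE_ASYNC_FAULT);
	}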

BTW, I think currently on entry we can avoid zeroing TFSRE0_EL1 since we
clear it on return anyway, so one less instruction (irrespective of your
patch).
Peter Collingbourne July 3, 2021, 2:46 a.m. UTC | #4
On Fri, Jul 2, 2021 at 10:37 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Thu, Jul 01, 2021 at 11:11:34AM -0700, Peter Collingbourne wrote:
> > On Thu, Jul 1, 2021 at 10:37 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > On Wed, Jun 30, 2021 at 08:14:48PM -0700, Peter Collingbourne wrote:
> > > >       /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */
> > > > @@ -151,11 +157,14 @@ alternative_else_nop_endif
> > > >       .endm
> > > >
> > > >       /* Clear the MTE asynchronous tag check faults */
> > > > -     .macro clear_mte_async_tcf
> > > > +     .macro clear_mte_async_tcf thread_sctlr
> > > >  #ifdef CONFIG_ARM64_MTE
> > > >  alternative_if ARM64_MTE
> > > > +     /* See comment in check_mte_async_tcf above. */
> > > > +     tbz     \thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
> > > >       dsb     ish
> > > >       msr_s   SYS_TFSRE0_EL1, xzr
> > > > +1:
> > >
> > > Here, maybe, as we have a DSB.
> >
> > Yes, disabling clear_mte_async_tcf offered an order of magnitude
> > larger speedup than disabling check_mte_async_tcf, presumably due to
> > the DSB. I would reckon though that if we're going to make some of the
> > code conditional on TCF we might as well make all of it conditional in
> > order to get the maximum possible benefit.
>
> I'd like to avoid a TBZ on sctlr_user if it's not necessary. I reckon
> the big CPUs would prefer async mode anyway.

This is targeted more at tasks running with TCF=NONE; for those, we
should avoid the MTE-related overheads as much as possible.

I measured the performance impact of this change with TCF=ASYNC tasks
and could barely detect a difference with 30 samples at a 95% CI (and
then only on one cluster). So I don't think we should worry about the
TBZ here.

> > Nevertheless, isn't it the case that disabling check_mte_async_tcf for
> > non-ASYNC tasks is necessary for correctness if we want to disable
> > clear_mte_async_tcf? Imagine that we just disable clear_mte_async_tcf,
> > and then we get a tag check failing uaccess in a TCF=ASYNC task which
> > then gets preempted by a TCF=NONE task which will skip clear on kernel
> > exit. If we don't disable check on kernel entry then I believe that we
> > will get a false positive tag check fault in the TCF=NONE task the
> > next time it enters the kernel.
>
> You are right, only doing one side would cause potential issues.
>
> The uaccess routines honour the SCTLR_EL1.TCF0 setting (this was
> corrected in the architecture pseudocode some months ago). If we zero
> TFSRE0_EL1 in mte_thread_switch(), it should cover your case. This
> shouldn't be expensive since we already have a DSB on that path. I'm not
> sure it's better than your proposal but not allowing the TFSRE0_EL1
> state to span multiple threads makes reasoning about it a bit easier.

I think it's debatable whether it makes it easier to reason about. If
you think of the "clear" operation as "priming" the CPU to check for
tag check faults later, then a code structure where the "priming"
operation clearly always happens before the check operation (because
they use the same condition, and are placed symmetrically to one
another) would seem easier to reason about IMHO.

> If the above context switch zeroing doesn't work, we could go ahead with
> your patch. But since TFSRE0_EL1 != 0 is a rare event and we expect to
> run in async mode on some CPUs, we could move the TBZ on sctlr_user in
> check_mte_async_tcf after the tbz for the actual TFSRE0_EL1. IOW, only
> check it prior to setting the TIF flag.
>
> BTW, I think currently on entry we can avoid zeroing TFSRE0_EL1 since we
> clear it on return anyway, so one less instruction (irrespective of your
> patch).

Nice observation. I'll fold that into v2.

Peter

Patch

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 863d44f73028..c2338414c558 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -133,12 +133,18 @@  alternative_cb_end
 	.endm
 
 	/* Check for MTE asynchronous tag check faults */
-	.macro check_mte_async_tcf, tmp, ti_flags
+	.macro check_mte_async_tcf, tmp, ti_flags, thread_sctlr
 #ifdef CONFIG_ARM64_MTE
 	.arch_extension lse
 alternative_if_not ARM64_MTE
 	b	1f
 alternative_else_nop_endif
+	/*
+	 * Asynchronous tag check faults are only possible in ASYNC (2) or
+	 * ASYM (3) modes. In each of these modes bit 1 of SCTLR_EL1.TCF0 is
+	 * set, so skip the check if it is unset.
+	 */
+	tbz	\thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
 	mrs_s	\tmp, SYS_TFSRE0_EL1
 	tbz	\tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f
 	/* Asynchronous TCF occurred for TTBR0 access, set the TI flag */
@@ -151,11 +157,14 @@  alternative_else_nop_endif
 	.endm
 
 	/* Clear the MTE asynchronous tag check faults */
-	.macro clear_mte_async_tcf
+	.macro clear_mte_async_tcf thread_sctlr
 #ifdef CONFIG_ARM64_MTE
 alternative_if ARM64_MTE
+	/* See comment in check_mte_async_tcf above. */
+	tbz	\thread_sctlr, #(SCTLR_EL1_TCF0_SHIFT + 1), 1f
 	dsb	ish
 	msr_s	SYS_TFSRE0_EL1, xzr
+1:
 alternative_else_nop_endif
 #endif
 	.endm
@@ -231,8 +240,8 @@  alternative_else_nop_endif
 	disable_step_tsk x19, x20
 
 	/* Check for asynchronous tag check faults in user space */
-	check_mte_async_tcf x22, x23
-	apply_ssbd 1, x22, x23
+	ldr	x0, [tsk, THREAD_SCTLR_USER]
+	check_mte_async_tcf x22, x23, x0
 
 #ifdef CONFIG_ARM64_PTR_AUTH
 alternative_if ARM64_HAS_ADDRESS_AUTH
@@ -245,7 +254,6 @@  alternative_if ARM64_HAS_ADDRESS_AUTH
 	 * was disabled on kernel exit then we would have left the kernel IA
 	 * installed so there is no need to install it again.
 	 */
-	ldr	x0, [tsk, THREAD_SCTLR_USER]
 	tbz	x0, SCTLR_ELx_ENIA_SHIFT, 1f
 	__ptrauth_keys_install_kernel_nosync tsk, x20, x22, x23
 	b	2f
@@ -258,6 +266,8 @@  alternative_if ARM64_HAS_ADDRESS_AUTH
 alternative_else_nop_endif
 #endif
 
+	apply_ssbd 1, x22, x23
+
 	mte_set_kernel_gcr x22, x23
 
 	scs_load tsk
@@ -362,6 +372,10 @@  alternative_else_nop_endif
 3:
 	scs_save tsk
 
+	/* Ignore asynchronous tag check faults in the uaccess routines */
+	ldr	x0, [tsk, THREAD_SCTLR_USER]
+	clear_mte_async_tcf x0
+
 #ifdef CONFIG_ARM64_PTR_AUTH
 alternative_if ARM64_HAS_ADDRESS_AUTH
 	/*
@@ -371,7 +385,6 @@  alternative_if ARM64_HAS_ADDRESS_AUTH
 	 *
 	 * No kernel C function calls after this.
 	 */
-	ldr	x0, [tsk, THREAD_SCTLR_USER]
 	tbz	x0, SCTLR_ELx_ENIA_SHIFT, 1f
 	__ptrauth_keys_install_user tsk, x0, x1, x2
 	b	2f
@@ -599,8 +612,6 @@  SYM_CODE_START_LOCAL(ret_to_user)
 	cbnz	x2, work_pending
 finish_ret_to_user:
 	user_enter_irqoff
-	/* Ignore asynchronous tag check faults in the uaccess routines */
-	clear_mte_async_tcf
 	enable_step_tsk x19, x2
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 	bl	stackleak_erase