
arm64: mte: avoid clearing PSTATE.TCO on entry unless necessary

Message ID 20210929194525.3252555-1-pcc@google.com (mailing list archive)
State New, archived
Series arm64: mte: avoid clearing PSTATE.TCO on entry unless necessary

Commit Message

Peter Collingbourne Sept. 29, 2021, 7:45 p.m. UTC
On some microarchitectures, clearing PSTATE.TCO is expensive. Clearing
TCO is only necessary if in-kernel MTE is enabled, or if MTE is
enabled in the userspace process in synchronous (or, soon, asymmetric)
mode, because tag check faults in uaccesses are not reported to
userspace in the none or asynchronous modes. Therefore, adjust the
kernel entry code to clear TCO only if necessary.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Link: https://linux-review.googlesource.com/id/I52d82a580bd0500d420be501af2c35fa8c90729e
---
 arch/arm64/kernel/entry.S | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)
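
In C terms, the condition the patch encodes in the entry assembly boils down
to roughly the following. This is only an illustrative sketch (the helper
name is invented for this summary); the real building blocks are
kasan_hw_tags_enabled() and the task's sctlr_user TCF0 field, as used later
in this thread:

/* Sketch only: when does kernel entry need to clear PSTATE.TCO? */
static inline bool entry_needs_tco_clear(struct task_struct *tsk)
{
	if (!IS_ENABLED(CONFIG_ARM64_MTE))
		return false;

	/* In-kernel MTE (KASAN_HW_TAGS) wants tag check faults reported. */
	if (kasan_hw_tags_enabled())
		return true;

	/* Userspace sync (or asymmetric) mode needs uaccess faults reported. */
	return tsk->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT);
}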

Comments

Catalin Marinas Oct. 5, 2021, 4:46 p.m. UTC | #1
On Wed, Sep 29, 2021 at 12:45:24PM -0700, Peter Collingbourne wrote:
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2f69ae43941d..85ead6bbb38e 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -269,7 +269,28 @@ alternative_else_nop_endif
>  	.else
>  	add	x21, sp, #PT_REGS_SIZE
>  	get_current_task tsk
> +	ldr	x0, [tsk, THREAD_SCTLR_USER]
>  	.endif /* \el == 0 */
> +
> +	/*
> +	 * Re-enable tag checking (TCO set on exception entry). This is only
> +	 * necessary if MTE is enabled in either the kernel or the userspace
> +	 * task in synchronous mode. With MTE disabled in the kernel and
> +	 * disabled or asynchronous in userspace, tag check faults (including in
> +	 * uaccesses) are not reported, therefore there is no need to re-enable
> +	 * checking. This is beneficial on microarchitectures where re-enabling
> +	 * TCO is expensive.
> +	 */
> +#ifdef CONFIG_ARM64_MTE
> +alternative_cb	kasan_hw_tags_enable
> +	tbz	x0, #SCTLR_EL1_TCF0_SHIFT, 1f
> +alternative_cb_end
> +alternative_if ARM64_MTE
> +	SET_PSTATE_TCO(0)
> +alternative_else_nop_endif
> +1:
> +#endif

I think we can get here from an interrupt as well. Can we guarantee that
the sctlr_user is valid? We are not always in a user process context.

Maybe only do the above checks if \el == 0, otherwise just bracket it
with kasan_hw_tags_enable.
Peter Collingbourne Oct. 5, 2021, 7:08 p.m. UTC | #2
On Tue, Oct 5, 2021 at 9:46 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Wed, Sep 29, 2021 at 12:45:24PM -0700, Peter Collingbourne wrote:
> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index 2f69ae43941d..85ead6bbb38e 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -269,7 +269,28 @@ alternative_else_nop_endif
> >       .else
> >       add     x21, sp, #PT_REGS_SIZE
> >       get_current_task tsk
> > +     ldr     x0, [tsk, THREAD_SCTLR_USER]
> >       .endif /* \el == 0 */
> > +
> > +     /*
> > +      * Re-enable tag checking (TCO set on exception entry). This is only
> > +      * necessary if MTE is enabled in either the kernel or the userspace
> > +      * task in synchronous mode. With MTE disabled in the kernel and
> > +      * disabled or asynchronous in userspace, tag check faults (including in
> > +      * uaccesses) are not reported, therefore there is no need to re-enable
> > +      * checking. This is beneficial on microarchitectures where re-enabling
> > +      * TCO is expensive.
> > +      */
> > +#ifdef CONFIG_ARM64_MTE
> > +alternative_cb       kasan_hw_tags_enable
> > +     tbz     x0, #SCTLR_EL1_TCF0_SHIFT, 1f
> > +alternative_cb_end
> > +alternative_if ARM64_MTE
> > +     SET_PSTATE_TCO(0)
> > +alternative_else_nop_endif
> > +1:
> > +#endif
>
> I think we can get here from an interrupt as well. Can we guarantee that
> the sctlr_user is valid? We are not always in a user process context.

Looking through the code in entry.S carefully it doesn't appear that
the tsk pointer is ever used when taking an exception from EL1. The
last user appears to have been removed in commit 3d2403fd10a1 ("arm64:
uaccess: remove set_fs()"). I just did a quick boot test on a couple
of platforms and things seem to work without the "get_current_task
tsk" line.

So I can't be confident that tsk points to a valid task, but at least
it looks like it was the case prior to that commit.

> Maybe only do the above checks if \el == 0, otherwise just bracket it
> with kasan_hw_tags_enable.

Is it possible for us to do a uaccess inside the EL1 -> EL1 exception
handler? That was the one thing I was unsure about and it's why I
opted to do the same check regardless of EL.

If we don't think these uaccesses can happen, or we don't think it
matters that they will either fail or not depending on whether KASAN
is enabled, then I think we can make it dependent on only
kasan_hw_tags_enable.

Peter
Mark Rutland Oct. 7, 2021, 10:20 a.m. UTC | #3
On Tue, Oct 05, 2021 at 12:08:58PM -0700, Peter Collingbourne wrote:
> On Tue, Oct 5, 2021 at 9:46 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Wed, Sep 29, 2021 at 12:45:24PM -0700, Peter Collingbourne wrote:
> > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > > index 2f69ae43941d..85ead6bbb38e 100644
> > > --- a/arch/arm64/kernel/entry.S
> > > +++ b/arch/arm64/kernel/entry.S
> > > @@ -269,7 +269,28 @@ alternative_else_nop_endif
> > >       .else
> > >       add     x21, sp, #PT_REGS_SIZE
> > >       get_current_task tsk
> > > +     ldr     x0, [tsk, THREAD_SCTLR_USER]
> > >       .endif /* \el == 0 */
> > > +
> > > +     /*
> > > +      * Re-enable tag checking (TCO set on exception entry). This is only
> > > +      * necessary if MTE is enabled in either the kernel or the userspace
> > > +      * task in synchronous mode. With MTE disabled in the kernel and
> > > +      * disabled or asynchronous in userspace, tag check faults (including in
> > > +      * uaccesses) are not reported, therefore there is no need to re-enable
> > > +      * checking. This is beneficial on microarchitectures where re-enabling
> > > +      * TCO is expensive.
> > > +      */
> > > +#ifdef CONFIG_ARM64_MTE
> > > +alternative_cb       kasan_hw_tags_enable
> > > +     tbz     x0, #SCTLR_EL1_TCF0_SHIFT, 1f
> > > +alternative_cb_end
> > > +alternative_if ARM64_MTE
> > > +     SET_PSTATE_TCO(0)
> > > +alternative_else_nop_endif
> > > +1:
> > > +#endif
> >
> > I think we can get here from an interrupt as well. Can we guarantee that
> > the sctlr_user is valid? We are not always in a user process context.
> 
> Looking through the code in entry.S carefully it doesn't appear that
> the tsk pointer is ever used when taking an exception from EL1. The
> last user appears to have been removed in commit 3d2403fd10a1 ("arm64:
> uaccess: remove set_fs()"). I just did a quick boot test on a couple
> of platforms and things seem to work without the "get_current_task
> tsk" line.
> 
> So I can't be confident that tsk points to a valid task, but at least
> it looks like it was the case prior to that commit.
> 
> > Maybe only do the above checks if \el == 0, otherwise just bracket it
> > with kasan_hw_tags_enable.
> 
> Is it possible for us to do a uaccess inside the EL1 -> EL1 exception
> handler? That was the one thing I was unsure about and it's why I
> opted to do the same check regardless of EL.

Today we can do so if we take a PMU IRQ from EL1 and try to do an EL0
stack unwind or stack dump, and similar holds for some eBPF stuff.

We also need to ensure that PSTATE.TCO is configured consistently so
that context-switch works, otherwise where a CPU switches between tasks
A and B, where A is preempted by an EL1 IRQ, and B is explicitly
switching via a direct call to schedule(), the state of TCO will not be
as expected (unless we track this per thread, and handle it in the
context switch).

I'd strongly prefer that the state of PSTATE.TCO upon taking an
exception to EL1 is consistent (by the end of the early entry code),
regardless of where that was taken from.

Is it easier to manage this from within entry-common.c?

Thanks,
Mark.
Peter Collingbourne Nov. 5, 2021, 8:18 p.m. UTC | #4
On Thu, Oct 7, 2021 at 3:20 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Oct 05, 2021 at 12:08:58PM -0700, Peter Collingbourne wrote:
> > On Tue, Oct 5, 2021 at 9:46 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > On Wed, Sep 29, 2021 at 12:45:24PM -0700, Peter Collingbourne wrote:
> > > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > > > index 2f69ae43941d..85ead6bbb38e 100644
> > > > --- a/arch/arm64/kernel/entry.S
> > > > +++ b/arch/arm64/kernel/entry.S
> > > > @@ -269,7 +269,28 @@ alternative_else_nop_endif
> > > >       .else
> > > >       add     x21, sp, #PT_REGS_SIZE
> > > >       get_current_task tsk
> > > > +     ldr     x0, [tsk, THREAD_SCTLR_USER]
> > > >       .endif /* \el == 0 */
> > > > +
> > > > +     /*
> > > > +      * Re-enable tag checking (TCO set on exception entry). This is only
> > > > +      * necessary if MTE is enabled in either the kernel or the userspace
> > > > +      * task in synchronous mode. With MTE disabled in the kernel and
> > > > +      * disabled or asynchronous in userspace, tag check faults (including in
> > > > +      * uaccesses) are not reported, therefore there is no need to re-enable
> > > > +      * checking. This is beneficial on microarchitectures where re-enabling
> > > > +      * TCO is expensive.
> > > > +      */
> > > > +#ifdef CONFIG_ARM64_MTE
> > > > +alternative_cb       kasan_hw_tags_enable
> > > > +     tbz     x0, #SCTLR_EL1_TCF0_SHIFT, 1f
> > > > +alternative_cb_end
> > > > +alternative_if ARM64_MTE
> > > > +     SET_PSTATE_TCO(0)
> > > > +alternative_else_nop_endif
> > > > +1:
> > > > +#endif
> > >
> > > I think we can get here from an interrupt as well. Can we guarantee that
> > > the sctlr_user is valid? We are not always in a user process context.
> >
> > Looking through the code in entry.S carefully it doesn't appear that
> > the tsk pointer is ever used when taking an exception from EL1. The
> > last user appears to have been removed in commit 3d2403fd10a1 ("arm64:
> > uaccess: remove set_fs()"). I just did a quick boot test on a couple
> > of platforms and things seem to work without the "get_current_task
> > tsk" line.
> >
> > So I can't be confident that tsk points to a valid task, but at least
> > it looks like it was the case prior to that commit.

The EL1 entries unconditionally call enter_from_kernel_mode() which
(at least in some configurations) unconditionally accesses current, so
I'm reasonably confident that we have a valid current pointer here.

> > > Maybe only do the above checks if \el == 0, otherwise just bracket it
> > > with kasan_hw_tags_enable.
> >
> > Is it possible for us to do a uaccess inside the EL1 -> EL1 exception
> > handler? That was the one thing I was unsure about and it's why I
> > opted to do the same check regardless of EL.
>
> Today we can do so if we take a PMU IRQ from EL1 and try to do an EL0
> stack unwind or stack dump, and similar holds for some eBPF stuff.
>
> We also need to ensure that PSTATE.TCO is configured consistently so
> that context-switch works, otherwise where a CPU switches between tasks
> A and B, where A is preempted by an EL1 IRQ, and B is explicitly
> switching via a direct call to schedule(), the state of TCO will not be
> as expected (unless we track this per thread, and handle it in the
> context switch).

Isn't this already context switched via storing the per-task SPSR?
Otherwise e.g. the TCO controls in __uaccess_disable_tco() and
__uaccess_enable_tco() would have the same problem.
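
(For context, those helpers live in arch/arm64/include/asm/uaccess.h and
toggle PSTATE.TCO directly around a uaccess when in-kernel MTE is in use.
Paraphrased rather than quoted verbatim:

static inline void __uaccess_enable_tco(void)
{
	/* Mask tag check faults while the kernel touches user memory. */
	if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && system_supports_mte())
		asm volatile(SET_PSTATE_TCO(1));
}

static inline void __uaccess_disable_tco(void)
{
	/* Re-enable tag checking once the uaccess is done. */
	if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && system_supports_mte())
		asm volatile(SET_PSTATE_TCO(0));
}

The in-tree versions use an ALTERNATIVE on the ARM64_MTE capability, compiled
in only with CONFIG_KASAN_HW_TAGS, rather than a runtime check, but the
effect is the same.)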

> I'd strongly prefer that the state of PSTATE.TCO upon taking an
> exception to EL1 is consistent (by the end of the early entry code),
> regardless of where that was taken from.
>
> Is it easier to manage this from within entry-common.c?

It doesn't look like we have any common location to add code that runs
on every kernel entry in entry-common.c. I tried adding one (patch
below), but this ended up leading to a performance regression (versus
baseline) on some of the cores on the hardware that I have access to.

Peter

diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 64cfe4a3798f..ad066b09a6b7 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -17,6 +17,23 @@
 #include <asm/mmu.h>
 #include <asm/sysreg.h>

+static void mte_disable_tco_entry(void)
+{
+ /*
+ * Re-enable tag checking (TCO set on exception entry). This is only
+ * necessary if MTE is enabled in either the kernel or the userspace
+ * task in synchronous mode. With MTE disabled in the kernel and
+ * disabled or asynchronous in userspace, tag check faults (including in
+ * uaccesses) are not reported, therefore there is no need to re-enable
+ * checking. This is beneficial on microarchitectures where re-enabling
+ * TCO is expensive.
+ */
+ if (IS_ENABLED(CONFIG_ARM64_MTE) &&
+     (kasan_hw_tags_enabled() ||
+      (current->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))))
+ asm volatile(SET_PSTATE_TCO(0));
+}
+
 /*
  * This is intended to match the logic in irqentry_enter(), handling the kernel
  * mode transitions only.
@@ -39,6 +56,7 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs)
  trace_hardirqs_off_finish();

  mte_check_tfsr_entry();
+ mte_disable_tco_entry();
 }

 /*
@@ -235,6 +253,7 @@ asmlinkage void noinstr enter_from_user_mode(void)
  CT_WARN_ON(ct_state() != CONTEXT_USER);
  user_exit_irqoff();
  trace_hardirqs_off_finish();
+ mte_disable_tco_entry();
 }

 asmlinkage void noinstr exit_to_user_mode(void)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 42c404d508ca..8c63279ffea7 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -339,13 +339,6 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
  msr_s SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif

- /* Re-enable tag checking (TCO set on exception entry) */
-#ifdef CONFIG_ARM64_MTE
-alternative_if ARM64_MTE
- SET_PSTATE_TCO(0)
-alternative_else_nop_endif
-#endif
-
  /*
  * Registers that may be useful after this macro is invoked:
  *
Mark Rutland Nov. 8, 2021, 11:01 a.m. UTC | #5
On Fri, Nov 05, 2021 at 01:18:38PM -0700, Peter Collingbourne wrote:
> On Thu, Oct 7, 2021 at 3:20 AM Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > On Tue, Oct 05, 2021 at 12:08:58PM -0700, Peter Collingbourne wrote:
> > > On Tue, Oct 5, 2021 at 9:46 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > > On Wed, Sep 29, 2021 at 12:45:24PM -0700, Peter Collingbourne wrote:
> > > > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > > > > index 2f69ae43941d..85ead6bbb38e 100644
> > > > > --- a/arch/arm64/kernel/entry.S
> > > > > +++ b/arch/arm64/kernel/entry.S
> > > > > @@ -269,7 +269,28 @@ alternative_else_nop_endif
> > > > >       .else
> > > > >       add     x21, sp, #PT_REGS_SIZE
> > > > >       get_current_task tsk
> > > > > +     ldr     x0, [tsk, THREAD_SCTLR_USER]
> > > > >       .endif /* \el == 0 */
> > > > > +
> > > > > +     /*
> > > > > +      * Re-enable tag checking (TCO set on exception entry). This is only
> > > > > +      * necessary if MTE is enabled in either the kernel or the userspace
> > > > > +      * task in synchronous mode. With MTE disabled in the kernel and
> > > > > +      * disabled or asynchronous in userspace, tag check faults (including in
> > > > > +      * uaccesses) are not reported, therefore there is no need to re-enable
> > > > > +      * checking. This is beneficial on microarchitectures where re-enabling
> > > > > +      * TCO is expensive.
> > > > > +      */
> > > > > +#ifdef CONFIG_ARM64_MTE
> > > > > +alternative_cb       kasan_hw_tags_enable
> > > > > +     tbz     x0, #SCTLR_EL1_TCF0_SHIFT, 1f
> > > > > +alternative_cb_end
> > > > > +alternative_if ARM64_MTE
> > > > > +     SET_PSTATE_TCO(0)
> > > > > +alternative_else_nop_endif
> > > > > +1:
> > > > > +#endif
> > > >
> > > > I think we can get here from an interrupt as well. Can we guarantee that
> > > > the sctlr_user is valid? We are not always in a user process context.
> > >
> > > Looking through the code in entry.S carefully it doesn't appear that
> > > the tsk pointer is ever used when taking an exception from EL1. The
> > > last user appears to have been removed in commit 3d2403fd10a1 ("arm64:
> > > uaccess: remove set_fs()"). I just did a quick boot test on a couple
> > > of platforms and things seem to work without the "get_current_task
> > > tsk" line.
> > >
> > > So I can't be confident that tsk points to a valid task, but at least
> > > it looks like it was the case prior to that commit.
> 
> The EL1 entries unconditionally call enter_from_kernel_mode() which
> (at least in some configurations) unconditionally accesses current, so
> I'm reasonably confident that we have a valid current pointer here.

That's the value in SP_EL0, which is valid so long as we never take an
exception from the entry assembly itself. We restore that from __entry_task in
the EL0 entry paths, and context-switch it when switching threads.

I would like to get rid of `tsk` in the entry assembly, but that requires
moving more of that assembly to C.

> > > > Maybe only do the above checks if \el == 0, otherwise just bracket it
> > > > with kasan_hw_tags_enable.
> > >
> > > Is it possible for us to do a uaccess inside the EL1 -> EL1 exception
> > > handler? That was the one thing I was unsure about and it's why I
> > > opted to do the same check regardless of EL.
> >
> > Today we can do so if we take a PMU IRQ from EL1 and try to do an EL0
> > stack unwind or stack dump, and similar holds for some eBPF stuff.
> >
> > We also need to ensure that PSTATE.TCO is configured consistently so
> > that context-switch works, otherwise where a CPU switches between tasks
> > A and B, where A is preempted by an EL1 IRQ, and B is explicitly
> > switching via a direct call to schedule(), the state of TCO will not be
> > as expected (unless we track this per thread, and handle it in the
> > context switch).
> 
> Isn't this already context switched via storing the per-task SPSR?
> Otherwise e.g. the TCO controls in __uaccess_disable_tco() and
> __uaccess_enable_tco() would have the same problem.

That's the case for pre-emptive scheduling off the back of an interrupt, but
tasks can call into schedule() without an intervening exception, e.g. blocking
on something like a mutex. Those blocking primitives aren't permitted to be
used within a __uaccess_{disable,enable}_tco() critical section.

> > I'd strongly prefer that the state of PSTATE.TCO upon taking an
> > exception to EL1 is consistent (by the end of the early entry code),
> > regardless of where that was taken from.
> >
> > Is it easier to manage this from within entry-common.c?
> 
> It doesn't look like we have any common location to add code that runs
> on every kernel entry in entry-common.c.

I was thinking we could rework the existing mte_check_tfsr_*() callsites into
mte_enter_from_{user,kernel}() hooks, which'd allow us to avoid redundant checks
for MTE support.

> I tried adding one (patch
> below), but this ended up leading to a performance regression (versus
> baseline) on some of the cores on the hardware that I have access to.

How big a regression? On a real workload or a microbenchmark?

Is that true for a simple move of the current logic to C, i.e. for an
unconditional SET_PSTATE_TCO(0) gated by cpus_have_final_cap(ARM64_MTE)?
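
(Concretely, something along these lines, as an illustrative sketch of the
above rather than a worked-out patch, reusing the mte_disable_tco_entry()
name from the patch quoted below:

static void noinstr mte_disable_tco_entry(void)
{
	/* Unconditionally clear TCO on entry, gated only on MTE support. */
	if (cpus_have_final_cap(ARM64_MTE))
		asm volatile(SET_PSTATE_TCO(0));
}

called from both the enter-from-user and enter-from-kernel paths.)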

Thanks,
Mark.

> 
> Peter
> 
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index 64cfe4a3798f..ad066b09a6b7 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -17,6 +17,23 @@
>  #include <asm/mmu.h>
>  #include <asm/sysreg.h>
> 
> +static void mte_disable_tco_entry(void)
> +{
> + /*
> + * Re-enable tag checking (TCO set on exception entry). This is only
> + * necessary if MTE is enabled in either the kernel or the userspace
> + * task in synchronous mode. With MTE disabled in the kernel and
> + * disabled or asynchronous in userspace, tag check faults (including in
> + * uaccesses) are not reported, therefore there is no need to re-enable
> + * checking. This is beneficial on microarchitectures where re-enabling
> + * TCO is expensive.
> + */
> + if (IS_ENABLED(CONFIG_ARM64_MTE) &&
> +     (kasan_hw_tags_enabled() ||
> +      (current->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))))
> + asm volatile(SET_PSTATE_TCO(0));
> +}
> +
>  /*
>   * This is intended to match the logic in irqentry_enter(), handling the kernel
>   * mode transitions only.
> @@ -39,6 +56,7 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs)
>   trace_hardirqs_off_finish();
> 
>   mte_check_tfsr_entry();
> + mte_disable_tco_entry();
>  }
> 
>  /*
> @@ -235,6 +253,7 @@ asmlinkage void noinstr enter_from_user_mode(void)
>   CT_WARN_ON(ct_state() != CONTEXT_USER);
>   user_exit_irqoff();
>   trace_hardirqs_off_finish();
> + mte_disable_tco_entry();
>  }
> 
>  asmlinkage void noinstr exit_to_user_mode(void)
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 42c404d508ca..8c63279ffea7 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -339,13 +339,6 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   msr_s SYS_ICC_PMR_EL1, x20
>  alternative_else_nop_endif
> 
> - /* Re-enable tag checking (TCO set on exception entry) */
> -#ifdef CONFIG_ARM64_MTE
> -alternative_if ARM64_MTE
> - SET_PSTATE_TCO(0)
> -alternative_else_nop_endif
> -#endif
> -
>   /*
>   * Registers that may be useful after this macro is invoked:
>   *
Peter Collingbourne Nov. 10, 2021, 10:07 p.m. UTC | #6
On Mon, Nov 8, 2021 at 3:01 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Fri, Nov 05, 2021 at 01:18:38PM -0700, Peter Collingbourne wrote:
> > On Thu, Oct 7, 2021 at 3:20 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > >
> > > On Tue, Oct 05, 2021 at 12:08:58PM -0700, Peter Collingbourne wrote:
> > > > On Tue, Oct 5, 2021 at 9:46 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > > > On Wed, Sep 29, 2021 at 12:45:24PM -0700, Peter Collingbourne wrote:
> > > > > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > > > > > index 2f69ae43941d..85ead6bbb38e 100644
> > > > > > --- a/arch/arm64/kernel/entry.S
> > > > > > +++ b/arch/arm64/kernel/entry.S
> > > > > > @@ -269,7 +269,28 @@ alternative_else_nop_endif
> > > > > >       .else
> > > > > >       add     x21, sp, #PT_REGS_SIZE
> > > > > >       get_current_task tsk
> > > > > > +     ldr     x0, [tsk, THREAD_SCTLR_USER]
> > > > > >       .endif /* \el == 0 */
> > > > > > +
> > > > > > +     /*
> > > > > > +      * Re-enable tag checking (TCO set on exception entry). This is only
> > > > > > +      * necessary if MTE is enabled in either the kernel or the userspace
> > > > > > +      * task in synchronous mode. With MTE disabled in the kernel and
> > > > > > +      * disabled or asynchronous in userspace, tag check faults (including in
> > > > > > +      * uaccesses) are not reported, therefore there is no need to re-enable
> > > > > > +      * checking. This is beneficial on microarchitectures where re-enabling
> > > > > > +      * TCO is expensive.
> > > > > > +      */
> > > > > > +#ifdef CONFIG_ARM64_MTE
> > > > > > +alternative_cb       kasan_hw_tags_enable
> > > > > > +     tbz     x0, #SCTLR_EL1_TCF0_SHIFT, 1f
> > > > > > +alternative_cb_end
> > > > > > +alternative_if ARM64_MTE
> > > > > > +     SET_PSTATE_TCO(0)
> > > > > > +alternative_else_nop_endif
> > > > > > +1:
> > > > > > +#endif
> > > > >
> > > > > I think we can get here from an interrupt as well. Can we guarantee that
> > > > > the sctlr_user is valid? We are not always in a user process context.
> > > >
> > > > Looking through the code in entry.S carefully it doesn't appear that
> > > > the tsk pointer is ever used when taking an exception from EL1. The
> > > > last user appears to have been removed in commit 3d2403fd10a1 ("arm64:
> > > > uaccess: remove set_fs()"). I just did a quick boot test on a couple
> > > > of platforms and things seem to work without the "get_current_task
> > > > tsk" line.
> > > >
> > > > So I can't be confident that tsk points to a valid task, but at least
> > > > it looks like it was the case prior to that commit.
> >
> > The EL1 entries unconditionally call enter_from_kernel_mode() which
> > (at least in some configurations) unconditionally accesses current, so
> > I'm reasonably confident that we have a valid current pointer here.
>
> That's the value in SP_EL0, which is valid so long as we never take an
> exception from the entry assembly itself. We restore that from __entry_task in
> the EL0 entry paths, and context-switch it when switching threads.
>
> I would like to get rid of `tsk` in the entry assembly, but that requires
> moving more of that assembly to C.
>
> > > > > Maybe only do the above checks if \el == 0, otherwise just bracket it
> > > > > with kasan_hw_tags_enable.
> > > >
> > > > Is it possible for us to do a uaccess inside the EL1 -> EL1 exception
> > > > handler? That was the one thing I was unsure about and it's why I
> > > > opted to do the same check regardless of EL.
> > >
> > > Today we can do so if we take a PMU IRQ from EL1 and try to do an EL0
> > > stack unwind or stack dump, and similar holds for some eBPF stuff.
> > >
> > > We also need to ensure that PSTATE.TCO is configured consistently so
> > > that context-switch works, otherwise where a CPU switches between tasks
> > > A and B, where A is preempted by an EL1 IRQ, and B is explicitly
> > > switching via a direct call to schedule(), the state of TCO will not be
> > > as expected (unless we track this per thread, and handle it in the
> > > context switch).
> >
> > Isn't this already context switched via storing the per-task SPSR?
> > Otherwise e.g. the TCO controls in __uaccess_disable_tco() and
> > __uaccess_enable_tco() would have the same problem.
>
> That's the case for pre-emptive scheduling off the back of an interrupt, but
> tasks can call into schedule() without an intervening exception, e.g. blocking
> on something like a mutex. Those blocking primitives aren't permitted to be
> used within a __uaccess_{disable,enable}_tco() critical section.

I see. I guess you wouldn't even need an EL1 IRQ to expose the bug
here -- you could schedule() directly from A to B where A is NONE and
B is SYNC. I think we can handle this by clearing TCO in
cpu_switch_to() with the same condition as for exception entry; done
in v2.
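
In C terms, the idea is to apply the same condition at thread switch time,
for example (a sketch only: the actual v2 does this in the cpu_switch_to()
assembly, and the helper/callsite names here are purely illustrative):

static void mte_disable_tco_switch(struct task_struct *next)
{
	/*
	 * Mirror the exception-entry condition: clear TCO when in-kernel MTE
	 * is enabled or the incoming task uses sync (or asymmetric) mode.
	 */
	if (IS_ENABLED(CONFIG_ARM64_MTE) &&
	    (kasan_hw_tags_enabled() ||
	     (next->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))))
		asm volatile(SET_PSTATE_TCO(0));
}

called for the incoming task from the context-switch path (e.g.
mte_thread_switch()), so a task that blocks via a direct call to schedule()
resumes with the same TCO state it would have after an exception entry.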

> > > I'd strongly prefer that the state of PSTATE.TCO upon taking an
> > > exception to EL1 is consistent (by the end of the early entry code),
> > > regardless of where that was taken from.
> > >
> > > Is it easier to manage this from within entry-common.c?
> >
> > It doesn't look like we have any common location to add code that runs
> > on every kernel entry in entry-common.c.
>
> I was thinking we could rework the existing mte_check_tfsr_*() callsites into
> mte_enter_from_{user,kernel}() hooks, which'd allow us to avoid redundant checks
> for MTE support.

We currently only check TFSR on entry from EL1 (which makes sense,
since we would not expect an entry from EL0 to have triggered an EL1
tag check fault earlier). So it seems like we'd want slightly
different code for the two ELs anyway.
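
If the hooks were adopted, they might end up looking roughly like this (a
sketch: the mte_enter_from_*() names come from Mark's suggestion and don't
exist in the tree, and mte_disable_tco_entry() is the helper from the patch
earlier in this thread):

static void noinstr mte_enter_from_kernel(void)
{
	/* EL1 -> EL1: an in-kernel async tag check fault may be pending in TFSR_EL1. */
	mte_check_tfsr_entry();
	mte_disable_tco_entry();
}

static void noinstr mte_enter_from_user(void)
{
	/* EL0 -> EL1: no kernel-side TFSR state to check, only TCO to clear. */
	mte_disable_tco_entry();
}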

> > I tried adding one (patch
> > below), but this ended up leading to a performance regression (versus
> > baseline) on some of the cores on the hardware that I have access to.
>
> How big a regression? On a real workload or a microbenchmark?

This was on the microbenchmark from:
https://patchwork.kernel.org/project/linux-arm-kernel/patch/20200801011152.39838-1-pcc@google.com/#23572981

I'm sorry, but I'm not sure that I can get into specifics of how large
the regression was.

> Is that true for a simple move of the current logic to C, i.e. for an
> unconditional SET_PSTATE_TCO(0) gated by cpus_have_final_cap(ARM64_MTE)?

With the patch below I measured a performance regression relative to
baseline (this time on all cores).

Peter

diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 64cfe4a3798f..23dd72b0866c 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -17,6 +17,21 @@
 #include <asm/mmu.h>
 #include <asm/sysreg.h>

+static void mte_disable_tco_entry(void)
+{
+ /*
+ * Re-enable tag checking (TCO set on exception entry). This is only
+ * necessary if MTE is enabled in either the kernel or the userspace
+ * task in synchronous mode. With MTE disabled in the kernel and
+ * disabled or asynchronous in userspace, tag check faults (including in
+ * uaccesses) are not reported, therefore there is no need to re-enable
+ * checking. This is beneficial on microarchitectures where re-enabling
+ * TCO is expensive.
+ */
+ if (IS_ENABLED(CONFIG_ARM64_MTE))
+ asm volatile(SET_PSTATE_TCO(0));
+}
+
 /*
  * This is intended to match the logic in irqentry_enter(), handling the kernel
  * mode transitions only.
@@ -39,6 +54,7 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs)
  trace_hardirqs_off_finish();

  mte_check_tfsr_entry();
+ mte_disable_tco_entry();
 }

 /*
@@ -235,6 +251,7 @@ asmlinkage void noinstr enter_from_user_mode(void)
  CT_WARN_ON(ct_state() != CONTEXT_USER);
  user_exit_irqoff();
  trace_hardirqs_off_finish();
+ mte_disable_tco_entry();
 }

 asmlinkage void noinstr exit_to_user_mode(void)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 42c404d508ca..8c63279ffea7 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -339,13 +339,6 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
  msr_s SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif

- /* Re-enable tag checking (TCO set on exception entry) */
-#ifdef CONFIG_ARM64_MTE
-alternative_if ARM64_MTE
- SET_PSTATE_TCO(0)
-alternative_else_nop_endif
-#endif
-
  /*
  * Registers that may be useful after this macro is invoked:
  *

Patch

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..85ead6bbb38e 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -269,7 +269,28 @@  alternative_else_nop_endif
 	.else
 	add	x21, sp, #PT_REGS_SIZE
 	get_current_task tsk
+	ldr	x0, [tsk, THREAD_SCTLR_USER]
 	.endif /* \el == 0 */
+
+	/*
+	 * Re-enable tag checking (TCO set on exception entry). This is only
+	 * necessary if MTE is enabled in either the kernel or the userspace
+	 * task in synchronous mode. With MTE disabled in the kernel and
+	 * disabled or asynchronous in userspace, tag check faults (including in
+	 * uaccesses) are not reported, therefore there is no need to re-enable
+	 * checking. This is beneficial on microarchitectures where re-enabling
+	 * TCO is expensive.
+	 */
+#ifdef CONFIG_ARM64_MTE
+alternative_cb	kasan_hw_tags_enable
+	tbz	x0, #SCTLR_EL1_TCF0_SHIFT, 1f
+alternative_cb_end
+alternative_if ARM64_MTE
+	SET_PSTATE_TCO(0)
+alternative_else_nop_endif
+1:
+#endif
+
 	mrs	x22, elr_el1
 	mrs	x23, spsr_el1
 	stp	lr, x21, [sp, #S_LR]
@@ -308,13 +329,6 @@  alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
 
-	/* Re-enable tag checking (TCO set on exception entry) */
-#ifdef CONFIG_ARM64_MTE
-alternative_if ARM64_MTE
-	SET_PSTATE_TCO(0)
-alternative_else_nop_endif
-#endif
-
 	/*
 	 * Registers that may be useful after this macro is invoked:
 	 *