diff mbox series

[v13,6/7] arm64: mte: Report async tag faults before suspend

Message ID 20210211153353.29094-7-vincenzo.frascino@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64: ARMv8.5-A: MTE: Add async mode support | expand

Commit Message

Vincenzo Frascino Feb. 11, 2021, 3:33 p.m. UTC
When MTE async mode is enabled TFSR_EL1 contains the accumulative
asynchronous tag check faults for EL1 and EL0.

During the suspend/resume operations the firmware might perform some
operations that could change the state of the register resulting in
a spurious tag check fault report.

Report asynchronous tag faults before suspend and clear the TFSR_EL1
register after resume to prevent this to happen.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
---
 arch/arm64/include/asm/mte.h |  4 ++++
 arch/arm64/kernel/mte.c      | 20 ++++++++++++++++++++
 arch/arm64/kernel/suspend.c  |  3 +++
 3 files changed, 27 insertions(+)

Comments

Lorenzo Pieralisi Feb. 12, 2021, noon UTC | #1
On Thu, Feb 11, 2021 at 03:33:52PM +0000, Vincenzo Frascino wrote:
> When MTE async mode is enabled TFSR_EL1 contains the accumulative
> asynchronous tag check faults for EL1 and EL0.
> 
> During the suspend/resume operations the firmware might perform some
> operations that could change the state of the register resulting in
> a spurious tag check fault report.
> 
> Report asynchronous tag faults before suspend and clear the TFSR_EL1
> register after resume to prevent this to happen.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
> ---
>  arch/arm64/include/asm/mte.h |  4 ++++
>  arch/arm64/kernel/mte.c      | 20 ++++++++++++++++++++
>  arch/arm64/kernel/suspend.c  |  3 +++
>  3 files changed, 27 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> index 43169b978cd3..33e88a470357 100644
> --- a/arch/arm64/include/asm/mte.h
> +++ b/arch/arm64/include/asm/mte.h
> @@ -41,6 +41,7 @@ void mte_sync_tags(pte_t *ptep, pte_t pte);
>  void mte_copy_page_tags(void *kto, const void *kfrom);
>  void flush_mte_state(void);
>  void mte_thread_switch(struct task_struct *next);
> +void mte_suspend_enter(void);
>  void mte_suspend_exit(void);
>  long set_mte_ctrl(struct task_struct *task, unsigned long arg);
>  long get_mte_ctrl(struct task_struct *task);
> @@ -66,6 +67,9 @@ static inline void flush_mte_state(void)
>  static inline void mte_thread_switch(struct task_struct *next)
>  {
>  }
> +static inline void mte_suspend_enter(void)
> +{
> +}
>  static inline void mte_suspend_exit(void)
>  {
>  }
> diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
> index f5aa5bea6dfe..de905102245a 100644
> --- a/arch/arm64/kernel/mte.c
> +++ b/arch/arm64/kernel/mte.c
> @@ -258,12 +258,32 @@ void mte_thread_switch(struct task_struct *next)
>  	mte_check_tfsr_el1();
>  }
>  
> +void mte_suspend_enter(void)
> +{
> +	if (!system_supports_mte())
> +		return;
> +
> +	/*
> +	 * The barriers are required to guarantee that the indirect writes
> +	 * to TFSR_EL1 are synchronized before we report the state.
> +	 */
> +	dsb(nsh);
> +	isb();
> +
> +	/* Report SYS_TFSR_EL1 before suspend entry */
> +	mte_check_tfsr_el1();
> +}
> +
>  void mte_suspend_exit(void)
>  {
>  	if (!system_supports_mte())
>  		return;
>  
>  	update_gcr_el1_excl(gcr_kernel_excl);
> +
> +	/* Clear SYS_TFSR_EL1 after suspend exit */
> +	write_sysreg_s(0, SYS_TFSR_EL1);

AFAICS it is not needed, it is done already in __cpu_setup() (that is
called by cpu_resume on return from cpu_suspend() from firmware).

However, I have a question. We are relying on context switch to set
sctlr_el1_tfc0 right ? If that's the case, till the thread resuming from
low power switches context we are running with SCTLR_EL1_TCF0 not
reflecting the actual value.

Just making sure that I understand it correctly, I need to check the
resume from suspend-to-RAM path, it is something that came up with perf
save/restore already in the past.

Lorenzo

> +
>  }
>  
>  long set_mte_ctrl(struct task_struct *task, unsigned long arg)
> diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
> index a67b37a7a47e..25a02926ad88 100644
> --- a/arch/arm64/kernel/suspend.c
> +++ b/arch/arm64/kernel/suspend.c
> @@ -91,6 +91,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
>  	unsigned long flags;
>  	struct sleep_stack_data state;
>  
> +	/* Report any MTE async fault before going to suspend */
> +	mte_suspend_enter();
> +
>  	/*
>  	 * From this point debug exceptions are disabled to prevent
>  	 * updates to mdscr register (saved and restored along with
> -- 
> 2.30.0
>
Lorenzo Pieralisi Feb. 12, 2021, 12:30 p.m. UTC | #2
On Fri, Feb 12, 2021 at 12:00:15PM +0000, Lorenzo Pieralisi wrote:
> On Thu, Feb 11, 2021 at 03:33:52PM +0000, Vincenzo Frascino wrote:
> > When MTE async mode is enabled TFSR_EL1 contains the accumulative
> > asynchronous tag check faults for EL1 and EL0.
> > 
> > During the suspend/resume operations the firmware might perform some
> > operations that could change the state of the register resulting in
> > a spurious tag check fault report.
> > 
> > Report asynchronous tag faults before suspend and clear the TFSR_EL1
> > register after resume to prevent this to happen.
> > 
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> > Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
> > ---
> >  arch/arm64/include/asm/mte.h |  4 ++++
> >  arch/arm64/kernel/mte.c      | 20 ++++++++++++++++++++
> >  arch/arm64/kernel/suspend.c  |  3 +++
> >  3 files changed, 27 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> > index 43169b978cd3..33e88a470357 100644
> > --- a/arch/arm64/include/asm/mte.h
> > +++ b/arch/arm64/include/asm/mte.h
> > @@ -41,6 +41,7 @@ void mte_sync_tags(pte_t *ptep, pte_t pte);
> >  void mte_copy_page_tags(void *kto, const void *kfrom);
> >  void flush_mte_state(void);
> >  void mte_thread_switch(struct task_struct *next);
> > +void mte_suspend_enter(void);
> >  void mte_suspend_exit(void);
> >  long set_mte_ctrl(struct task_struct *task, unsigned long arg);
> >  long get_mte_ctrl(struct task_struct *task);
> > @@ -66,6 +67,9 @@ static inline void flush_mte_state(void)
> >  static inline void mte_thread_switch(struct task_struct *next)
> >  {
> >  }
> > +static inline void mte_suspend_enter(void)
> > +{
> > +}
> >  static inline void mte_suspend_exit(void)
> >  {
> >  }
> > diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
> > index f5aa5bea6dfe..de905102245a 100644
> > --- a/arch/arm64/kernel/mte.c
> > +++ b/arch/arm64/kernel/mte.c
> > @@ -258,12 +258,32 @@ void mte_thread_switch(struct task_struct *next)
> >  	mte_check_tfsr_el1();
> >  }
> >  
> > +void mte_suspend_enter(void)
> > +{
> > +	if (!system_supports_mte())
> > +		return;
> > +
> > +	/*
> > +	 * The barriers are required to guarantee that the indirect writes
> > +	 * to TFSR_EL1 are synchronized before we report the state.
> > +	 */
> > +	dsb(nsh);
> > +	isb();
> > +
> > +	/* Report SYS_TFSR_EL1 before suspend entry */
> > +	mte_check_tfsr_el1();
> > +}
> > +
> >  void mte_suspend_exit(void)
> >  {
> >  	if (!system_supports_mte())
> >  		return;
> >  
> >  	update_gcr_el1_excl(gcr_kernel_excl);
> > +
> > +	/* Clear SYS_TFSR_EL1 after suspend exit */
> > +	write_sysreg_s(0, SYS_TFSR_EL1);
> 
> AFAICS it is not needed, it is done already in __cpu_setup() (that is
> called by cpu_resume on return from cpu_suspend() from firmware).
> 
> However, I have a question. We are relying on context switch to set
> sctlr_el1_tfc0 right ? If that's the case, till the thread resuming from
> low power switches context we are running with SCTLR_EL1_TCF0 not
> reflecting the actual value.

Forget this, we obviously restore sctlr_el1 on resume (cpu_do_resume()).

With the line above removed:

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>

> Just making sure that I understand it correctly, I need to check the
> resume from suspend-to-RAM path, it is something that came up with perf
> save/restore already in the past.
> 
> Lorenzo
> 
> > +
> >  }
> >  
> >  long set_mte_ctrl(struct task_struct *task, unsigned long arg)
> > diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
> > index a67b37a7a47e..25a02926ad88 100644
> > --- a/arch/arm64/kernel/suspend.c
> > +++ b/arch/arm64/kernel/suspend.c
> > @@ -91,6 +91,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
> >  	unsigned long flags;
> >  	struct sleep_stack_data state;
> >  
> > +	/* Report any MTE async fault before going to suspend */
> > +	mte_suspend_enter();
> > +
> >  	/*
> >  	 * From this point debug exceptions are disabled to prevent
> >  	 * updates to mdscr register (saved and restored along with
> > -- 
> > 2.30.0
> > 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Vincenzo Frascino Feb. 12, 2021, 1:05 p.m. UTC | #3
On 2/12/21 12:30 PM, Lorenzo Pieralisi wrote:
>> However, I have a question. We are relying on context switch to set
>> sctlr_el1_tfc0 right ? If that's the case, till the thread resuming from
>> low power switches context we are running with SCTLR_EL1_TCF0 not
>> reflecting the actual value.
> Forget this, we obviously restore sctlr_el1 on resume (cpu_do_resume()).
> 
> With the line above removed:
> 
> Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
> 

Thanks Lorenzo, I will remove the register write in the next version and add
your tag.
Catalin Marinas Feb. 12, 2021, 1:19 p.m. UTC | #4
On Fri, Feb 12, 2021 at 12:00:15PM +0000, Lorenzo Pieralisi wrote:
> On Thu, Feb 11, 2021 at 03:33:52PM +0000, Vincenzo Frascino wrote:
> > +void mte_suspend_enter(void)
> > +{
> > +	if (!system_supports_mte())
> > +		return;
> > +
> > +	/*
> > +	 * The barriers are required to guarantee that the indirect writes
> > +	 * to TFSR_EL1 are synchronized before we report the state.
> > +	 */
> > +	dsb(nsh);
> > +	isb();
> > +
> > +	/* Report SYS_TFSR_EL1 before suspend entry */
> > +	mte_check_tfsr_el1();
> > +}
> > +
> >  void mte_suspend_exit(void)
> >  {
> >  	if (!system_supports_mte())
> >  		return;
> >  
> >  	update_gcr_el1_excl(gcr_kernel_excl);
> > +
> > +	/* Clear SYS_TFSR_EL1 after suspend exit */
> > +	write_sysreg_s(0, SYS_TFSR_EL1);
> 
> AFAICS it is not needed, it is done already in __cpu_setup() (that is
> called by cpu_resume on return from cpu_suspend() from firmware).
> 
> However, I have a question. We are relying on context switch to set
> sctlr_el1_tfc0 right ? If that's the case, till the thread resuming from
> low power switches context we are running with SCTLR_EL1_TCF0 not
> reflecting the actual value.

I think you have a point here, though not for SCTLR_EL1 as it is already
restored. GCR_EL1 is only updated after some C code has run and may mess
up stack tagging when/if we ever support it. Anyway, something to worry
about later, I think even the boot path gets this wrong.
Catalin Marinas Feb. 12, 2021, 5:24 p.m. UTC | #5
On Thu, Feb 11, 2021 at 03:33:52PM +0000, Vincenzo Frascino wrote:
>  void mte_suspend_exit(void)
>  {
>  	if (!system_supports_mte())
>  		return;
>  
>  	update_gcr_el1_excl(gcr_kernel_excl);
> +
> +	/* Clear SYS_TFSR_EL1 after suspend exit */
> +	write_sysreg_s(0, SYS_TFSR_EL1);
> +
>  }

As per Lorenzo's comment, with this hunk removed:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 43169b978cd3..33e88a470357 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -41,6 +41,7 @@  void mte_sync_tags(pte_t *ptep, pte_t pte);
 void mte_copy_page_tags(void *kto, const void *kfrom);
 void flush_mte_state(void);
 void mte_thread_switch(struct task_struct *next);
+void mte_suspend_enter(void);
 void mte_suspend_exit(void);
 long set_mte_ctrl(struct task_struct *task, unsigned long arg);
 long get_mte_ctrl(struct task_struct *task);
@@ -66,6 +67,9 @@  static inline void flush_mte_state(void)
 static inline void mte_thread_switch(struct task_struct *next)
 {
 }
+static inline void mte_suspend_enter(void)
+{
+}
 static inline void mte_suspend_exit(void)
 {
 }
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index f5aa5bea6dfe..de905102245a 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -258,12 +258,32 @@  void mte_thread_switch(struct task_struct *next)
 	mte_check_tfsr_el1();
 }
 
+void mte_suspend_enter(void)
+{
+	if (!system_supports_mte())
+		return;
+
+	/*
+	 * The barriers are required to guarantee that the indirect writes
+	 * to TFSR_EL1 are synchronized before we report the state.
+	 */
+	dsb(nsh);
+	isb();
+
+	/* Report SYS_TFSR_EL1 before suspend entry */
+	mte_check_tfsr_el1();
+}
+
 void mte_suspend_exit(void)
 {
 	if (!system_supports_mte())
 		return;
 
 	update_gcr_el1_excl(gcr_kernel_excl);
+
+	/* Clear SYS_TFSR_EL1 after suspend exit */
+	write_sysreg_s(0, SYS_TFSR_EL1);
+
 }
 
 long set_mte_ctrl(struct task_struct *task, unsigned long arg)
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index a67b37a7a47e..25a02926ad88 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -91,6 +91,9 @@  int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 	unsigned long flags;
 	struct sleep_stack_data state;
 
+	/* Report any MTE async fault before going to suspend */
+	mte_suspend_enter();
+
 	/*
 	 * From this point debug exceptions are disabled to prevent
 	 * updates to mdscr register (saved and restored along with