Message ID | 1484244924-24786-5-git-send-email-tbaicar@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Jan 12, 2017 at 11:15:18AM -0700, Tyler Baicar wrote: > SEA exceptions are often caused by an uncorrected hardware > error, and are handled when data abort and instruction abort > exception classes have specific values for their Fault Status > Code. > When SEA occurs, before killing the process, go through > the handlers registered in the notification list. > Update fault_info[] with specific SEA faults so that the > new SEA handler is used. > > Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> > Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> > Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> > --- > arch/arm64/include/asm/system_misc.h | 13 ++++++++ > arch/arm64/mm/fault.c | 58 +++++++++++++++++++++++++++++------- > 2 files changed, 61 insertions(+), 10 deletions(-) > > diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h > index 57f110b..e7f3440 100644 > --- a/arch/arm64/include/asm/system_misc.h > +++ b/arch/arm64/include/asm/system_misc.h > @@ -64,4 +64,17 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); > > #endif /* __ASSEMBLY__ */ > > +/* > + * The functions below are used to register and unregister callbacks > + * that are to be invoked when a Synchronous External Abort (SEA) > + * occurs. An SEA is raised by certain fault status codes that have > + * either data or instruction abort as the exception class, and > + * callbacks may be registered to parse or handle such hardware errors. > + * > + * Registered callbacks are run in an interrupt/atomic context. They > + * are not allowed to block or sleep. > + */ > +int register_sea_notifier(struct notifier_block *nb); > +void unregister_sea_notifier(struct notifier_block *nb); I still don't understand why you need notifiers for this. You register precisely one hook in the series. > #endif /* __ASM_SYSTEM_MISC_H */ > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index 05d2bd7..81039c7 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -39,6 +39,22 @@ > #include <asm/pgtable.h> > #include <asm/tlbflush.h> > > +/* > + * GHES SEA handler code may register a notifier call here to > + * handle HW error record passed from platform. > + */ > +static ATOMIC_NOTIFIER_HEAD(sea_handler_chain); > + > +int register_sea_notifier(struct notifier_block *nb) > +{ > + return atomic_notifier_chain_register(&sea_handler_chain, nb); > +} > + > +void unregister_sea_notifier(struct notifier_block *nb) > +{ > + atomic_notifier_chain_unregister(&sea_handler_chain, nb); > +} > + > static const char *fault_name(unsigned int esr); > > #ifdef CONFIG_KPROBES > @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) > return 1; > } > > +/* > + * This abort handler deals with Synchronous External Abort. > + * It calls notifiers, and then returns "fault". > + */ > +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) > +{ > + struct siginfo info; > + > + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); > + > + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", > + fault_name(esr), esr, addr); > + > + info.si_signo = SIGBUS; > + info.si_errno = 0; > + info.si_code = 0; > + info.si_addr = (void __user *)addr; > + arm64_notify_die("", regs, &info, esr); > + > + return 0; > +} > + > static const struct fault_info { > int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); > int sig; > @@ -502,22 +540,22 @@ static const struct fault_info { > { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, > { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, > { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, > - { do_bad, SIGBUS, 0, "synchronous external abort" }, > + { do_sea, SIGBUS, 0, "synchronous external abort" }, > { do_bad, SIGBUS, 0, "unknown 17" }, > { do_bad, SIGBUS, 0, "unknown 18" }, > { do_bad, SIGBUS, 0, "unknown 19" }, > - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, > - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, > - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, > - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, > - { do_bad, SIGBUS, 0, "synchronous parity error" }, > + { do_sea, SIGBUS, 0, "level 0 SEA (translation table walk)" }, > + { do_sea, SIGBUS, 0, "level 1 SEA (translation table walk)" }, > + { do_sea, SIGBUS, 0, "level 2 SEA (translation table walk)" }, > + { do_sea, SIGBUS, 0, "level 3 SEA (translation table walk)" }, Perhaps I wasn't clear enough in my previous review, but please expand the acronym for strings and comments. > + { do_sea, SIGBUS, 0, "synchronous parity or ECC err" }, s/err/error/ Will -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hello Will, On 1/16/2017 4:53 AM, Will Deacon wrote: > On Thu, Jan 12, 2017 at 11:15:18AM -0700, Tyler Baicar wrote: >> SEA exceptions are often caused by an uncorrected hardware >> error, and are handled when data abort and instruction abort >> exception classes have specific values for their Fault Status >> Code. >> When SEA occurs, before killing the process, go through >> the handlers registered in the notification list. >> Update fault_info[] with specific SEA faults so that the >> new SEA handler is used. >> >> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> >> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> >> Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> >> --- >> arch/arm64/include/asm/system_misc.h | 13 ++++++++ >> arch/arm64/mm/fault.c | 58 +++++++++++++++++++++++++++++------- >> 2 files changed, 61 insertions(+), 10 deletions(-) >> >> diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h >> index 57f110b..e7f3440 100644 >> --- a/arch/arm64/include/asm/system_misc.h >> +++ b/arch/arm64/include/asm/system_misc.h >> @@ -64,4 +64,17 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); >> >> #endif /* __ASSEMBLY__ */ >> >> +/* >> + * The functions below are used to register and unregister callbacks >> + * that are to be invoked when a Synchronous External Abort (SEA) >> + * occurs. An SEA is raised by certain fault status codes that have >> + * either data or instruction abort as the exception class, and >> + * callbacks may be registered to parse or handle such hardware errors. >> + * >> + * Registered callbacks are run in an interrupt/atomic context. They >> + * are not allowed to block or sleep. >> + */ >> +int register_sea_notifier(struct notifier_block *nb); >> +void unregister_sea_notifier(struct notifier_block *nb); > I still don't understand why you need notifiers for this. You register > precisely one hook in the series. I didn't see a response to my last comment on the previous series so I just left it in for this series. The notifier usage is consistent with the GHES code for SCI errors which are also only used a single time in the code. If you think making the call directly is a better option I will remove the notifiers. >> #endif /* __ASM_SYSTEM_MISC_H */ >> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >> index 05d2bd7..81039c7 100644 >> --- a/arch/arm64/mm/fault.c >> +++ b/arch/arm64/mm/fault.c >> @@ -39,6 +39,22 @@ >> #include <asm/pgtable.h> >> #include <asm/tlbflush.h> >> >> +/* >> + * GHES SEA handler code may register a notifier call here to >> + * handle HW error record passed from platform. >> + */ >> +static ATOMIC_NOTIFIER_HEAD(sea_handler_chain); >> + >> +int register_sea_notifier(struct notifier_block *nb) >> +{ >> + return atomic_notifier_chain_register(&sea_handler_chain, nb); >> +} >> + >> +void unregister_sea_notifier(struct notifier_block *nb) >> +{ >> + atomic_notifier_chain_unregister(&sea_handler_chain, nb); >> +} >> + >> static const char *fault_name(unsigned int esr); >> >> #ifdef CONFIG_KPROBES >> @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) >> return 1; >> } >> >> +/* >> + * This abort handler deals with Synchronous External Abort. >> + * It calls notifiers, and then returns "fault". >> + */ >> +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) >> +{ >> + struct siginfo info; >> + >> + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); >> + >> + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", >> + fault_name(esr), esr, addr); >> + >> + info.si_signo = SIGBUS; >> + info.si_errno = 0; >> + info.si_code = 0; >> + info.si_addr = (void __user *)addr; >> + arm64_notify_die("", regs, &info, esr); >> + >> + return 0; >> +} >> + >> static const struct fault_info { >> int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); >> int sig; >> @@ -502,22 +540,22 @@ static const struct fault_info { >> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, >> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, >> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, >> - { do_bad, SIGBUS, 0, "synchronous external abort" }, >> + { do_sea, SIGBUS, 0, "synchronous external abort" }, >> { do_bad, SIGBUS, 0, "unknown 17" }, >> { do_bad, SIGBUS, 0, "unknown 18" }, >> { do_bad, SIGBUS, 0, "unknown 19" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous parity error" }, >> + { do_sea, SIGBUS, 0, "level 0 SEA (translation table walk)" }, >> + { do_sea, SIGBUS, 0, "level 1 SEA (translation table walk)" }, >> + { do_sea, SIGBUS, 0, "level 2 SEA (translation table walk)" }, >> + { do_sea, SIGBUS, 0, "level 3 SEA (translation table walk)" }, > Perhaps I wasn't clear enough in my previous review, but please expand the > acronym for strings and comments. Sorry, I will expand these. >> + { do_sea, SIGBUS, 0, "synchronous parity or ECC err" }, > s/err/error/ > > Will Thanks, Tyler
Hi Tyler, On 16/01/17 11:53, Will Deacon wrote: > On Thu, Jan 12, 2017 at 11:15:18AM -0700, Tyler Baicar wrote: >> SEA exceptions are often caused by an uncorrected hardware >> error, and are handled when data abort and instruction abort >> exception classes have specific values for their Fault Status >> Code. >> When SEA occurs, before killing the process, go through >> the handlers registered in the notification list. >> Update fault_info[] with specific SEA faults so that the >> new SEA handler is used. >> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >> index 05d2bd7..81039c7 100644 >> --- a/arch/arm64/mm/fault.c >> +++ b/arch/arm64/mm/fault.c >> @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) >> return 1; >> } >> >> +/* >> + * This abort handler deals with Synchronous External Abort. >> + * It calls notifiers, and then returns "fault". >> + */ >> +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) >> +{ >> + struct siginfo info; >> + >> + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); >> + >> + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", >> + fault_name(esr), esr, addr); >> + >> + info.si_signo = SIGBUS; >> + info.si_errno = 0; >> + info.si_code = 0; >> + info.si_addr = (void __user *)addr; >> + arm64_notify_die("", regs, &info, esr); >> + >> + return 0; >> +} >> + >> static const struct fault_info { >> int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); >> int sig; >> @@ -502,22 +540,22 @@ static const struct fault_info { >> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, >> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, >> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, >> - { do_bad, SIGBUS, 0, "synchronous external abort" }, >> + { do_sea, SIGBUS, 0, "synchronous external abort" }, >> { do_bad, SIGBUS, 0, "unknown 17" }, >> { do_bad, SIGBUS, 0, "unknown 18" }, >> { do_bad, SIGBUS, 0, "unknown 19" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >> - { do_bad, SIGBUS, 0, "synchronous parity error" }, >> + { do_sea, SIGBUS, 0, "level 0 SEA (translation table walk)" }, >> + { do_sea, SIGBUS, 0, "level 1 SEA (translation table walk)" }, >> + { do_sea, SIGBUS, 0, "level 2 SEA (translation table walk)" }, >> + { do_sea, SIGBUS, 0, "level 3 SEA (translation table walk)" }, > > Perhaps I wasn't clear enough in my previous review, but please expand the > acronym for strings and comments. The 'SEA' in this user-string doesn't add anything. Now that these use do_sea() instead of do_bad(), when they are printed won't it be: > Synchronous External Abort: level 3 SEA (translation table walk) (...) at .... Thanks, James -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jan 16, 2017 at 01:09:22PM -0700, Baicar, Tyler wrote: > On 1/16/2017 4:53 AM, Will Deacon wrote: > >On Thu, Jan 12, 2017 at 11:15:18AM -0700, Tyler Baicar wrote: > >>SEA exceptions are often caused by an uncorrected hardware > >>error, and are handled when data abort and instruction abort > >>exception classes have specific values for their Fault Status > >>Code. > >>When SEA occurs, before killing the process, go through > >>the handlers registered in the notification list. > >>Update fault_info[] with specific SEA faults so that the > >>new SEA handler is used. > >> > >>Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> > >>Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> > >>Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> > >>--- > >> arch/arm64/include/asm/system_misc.h | 13 ++++++++ > >> arch/arm64/mm/fault.c | 58 +++++++++++++++++++++++++++++------- > >> 2 files changed, 61 insertions(+), 10 deletions(-) > >> > >>diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h > >>index 57f110b..e7f3440 100644 > >>--- a/arch/arm64/include/asm/system_misc.h > >>+++ b/arch/arm64/include/asm/system_misc.h > >>@@ -64,4 +64,17 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); > >> #endif /* __ASSEMBLY__ */ > >>+/* > >>+ * The functions below are used to register and unregister callbacks > >>+ * that are to be invoked when a Synchronous External Abort (SEA) > >>+ * occurs. An SEA is raised by certain fault status codes that have > >>+ * either data or instruction abort as the exception class, and > >>+ * callbacks may be registered to parse or handle such hardware errors. > >>+ * > >>+ * Registered callbacks are run in an interrupt/atomic context. They > >>+ * are not allowed to block or sleep. > >>+ */ > >>+int register_sea_notifier(struct notifier_block *nb); > >>+void unregister_sea_notifier(struct notifier_block *nb); > >I still don't understand why you need notifiers for this. You register > >precisely one hook in the series. > I didn't see a response to my last comment on the previous series so I just > left it in for this series. > The notifier usage is consistent with the GHES code for SCI errors which are > also only used a single > time in the code. If you think making the call directly is a better option I > will remove the notifiers. Yes, please. It's easy to add the notifier infrastructure back if/when it's actually needed and I don't see why the low-level fault dispatching needs to be consistent with the GHES/SCI code. Will -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Tyler, On 12/01/17 18:15, Tyler Baicar wrote: > SEA exceptions are often caused by an uncorrected hardware > error, and are handled when data abort and instruction abort > exception classes have specific values for their Fault Status > Code. > When SEA occurs, before killing the process, go through > the handlers registered in the notification list. > Update fault_info[] with specific SEA faults so that the > new SEA handler is used. > @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) > return 1; > } > > +/* > + * This abort handler deals with Synchronous External Abort. > + * It calls notifiers, and then returns "fault". > + */ > +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) > +{ > + struct siginfo info; > + > + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); > + > + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", > + fault_name(esr), esr, addr); > + > + info.si_signo = SIGBUS; > + info.si_errno = 0; > + info.si_code = 0; Half of the other do_*() functions in this file read the signo and code from the fault_info table. > + info.si_addr = (void __user *)addr; addr here was read from FAR_EL1, but for some of the classes of exception you have listed below this register isn't updated with the faulting address. The ARM-ARM version 'k' in D1.10.5 "Summary of registers on faults taken to an Exception level that is using Aarch64" has: > The architecture permits that the FAR_ELx is UNKNOWN for Synchronous External > Aborts other than Synchronous External Aborts on Translation Table Walks. In > this case, the ISS.FnV bit returned in ESR_ELx indicates whether FAR_ELx is > valid. This is a problem if we get 'synchronous external abort' or 'synchronous parity error' while a user space process was running. > + arm64_notify_die("", regs, &info, esr); > + > + return 0; > +} > + > static const struct fault_info { > int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); > int sig; Thanks, James -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 1/17/2017 3:23 AM, James Morse wrote: > Hi Tyler, > > On 16/01/17 11:53, Will Deacon wrote: >> On Thu, Jan 12, 2017 at 11:15:18AM -0700, Tyler Baicar wrote: >>> SEA exceptions are often caused by an uncorrected hardware >>> error, and are handled when data abort and instruction abort >>> exception classes have specific values for their Fault Status >>> Code. >>> When SEA occurs, before killing the process, go through >>> the handlers registered in the notification list. >>> Update fault_info[] with specific SEA faults so that the >>> new SEA handler is used. >>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >>> index 05d2bd7..81039c7 100644 >>> --- a/arch/arm64/mm/fault.c >>> +++ b/arch/arm64/mm/fault.c >>> @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) >>> return 1; >>> } >>> >>> +/* >>> + * This abort handler deals with Synchronous External Abort. >>> + * It calls notifiers, and then returns "fault". >>> + */ >>> +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) >>> +{ >>> + struct siginfo info; >>> + >>> + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); >>> + >>> + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", >>> + fault_name(esr), esr, addr); >>> + >>> + info.si_signo = SIGBUS; >>> + info.si_errno = 0; >>> + info.si_code = 0; >>> + info.si_addr = (void __user *)addr; >>> + arm64_notify_die("", regs, &info, esr); >>> + >>> + return 0; >>> +} >>> + >>> static const struct fault_info { >>> int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); >>> int sig; >>> @@ -502,22 +540,22 @@ static const struct fault_info { >>> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, >>> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, >>> { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, >>> - { do_bad, SIGBUS, 0, "synchronous external abort" }, >>> + { do_sea, SIGBUS, 0, "synchronous external abort" }, >>> { do_bad, SIGBUS, 0, "unknown 17" }, >>> { do_bad, SIGBUS, 0, "unknown 18" }, >>> { do_bad, SIGBUS, 0, "unknown 19" }, >>> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >>> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >>> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >>> - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, >>> - { do_bad, SIGBUS, 0, "synchronous parity error" }, >>> + { do_sea, SIGBUS, 0, "level 0 SEA (translation table walk)" }, >>> + { do_sea, SIGBUS, 0, "level 1 SEA (translation table walk)" }, >>> + { do_sea, SIGBUS, 0, "level 2 SEA (translation table walk)" }, >>> + { do_sea, SIGBUS, 0, "level 3 SEA (translation table walk)" }, >> Perhaps I wasn't clear enough in my previous review, but please expand the >> acronym for strings and comments. > > The 'SEA' in this user-string doesn't add anything. Now that these use do_sea() > instead of do_bad(), when they are printed won't it be: >> Synchronous External Abort: level 3 SEA (translation table walk) (...) at .... Good point, yes they will: + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", + fault_name(esr), esr, addr); I can just remove SEA here then. Thanks, Tyler
On 1/17/2017 3:27 AM, Will Deacon wrote: > On Mon, Jan 16, 2017 at 01:09:22PM -0700, Baicar, Tyler wrote: >> On 1/16/2017 4:53 AM, Will Deacon wrote: >>> On Thu, Jan 12, 2017 at 11:15:18AM -0700, Tyler Baicar wrote: >>>> SEA exceptions are often caused by an uncorrected hardware >>>> error, and are handled when data abort and instruction abort >>>> exception classes have specific values for their Fault Status >>>> Code. >>>> When SEA occurs, before killing the process, go through >>>> the handlers registered in the notification list. >>>> Update fault_info[] with specific SEA faults so that the >>>> new SEA handler is used. >>>> >>>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> >>>> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org> >>>> Signed-off-by: Naveen Kaje <nkaje@codeaurora.org> >>>> --- >>>> arch/arm64/include/asm/system_misc.h | 13 ++++++++ >>>> arch/arm64/mm/fault.c | 58 +++++++++++++++++++++++++++++------- >>>> 2 files changed, 61 insertions(+), 10 deletions(-) >>>> >>>> diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h >>>> index 57f110b..e7f3440 100644 >>>> --- a/arch/arm64/include/asm/system_misc.h >>>> +++ b/arch/arm64/include/asm/system_misc.h >>>> @@ -64,4 +64,17 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); >>>> #endif /* __ASSEMBLY__ */ >>>> +/* >>>> + * The functions below are used to register and unregister callbacks >>>> + * that are to be invoked when a Synchronous External Abort (SEA) >>>> + * occurs. An SEA is raised by certain fault status codes that have >>>> + * either data or instruction abort as the exception class, and >>>> + * callbacks may be registered to parse or handle such hardware errors. >>>> + * >>>> + * Registered callbacks are run in an interrupt/atomic context. They >>>> + * are not allowed to block or sleep. >>>> + */ >>>> +int register_sea_notifier(struct notifier_block *nb); >>>> +void unregister_sea_notifier(struct notifier_block *nb); >>> I still don't understand why you need notifiers for this. You register >>> precisely one hook in the series. >> I didn't see a response to my last comment on the previous series so I just >> left it in for this series. >> The notifier usage is consistent with the GHES code for SCI errors which are >> also only used a single >> time in the code. If you think making the call directly is a better option I >> will remove the notifiers. > Yes, please. It's easy to add the notifier infrastructure back if/when it's > actually needed and I don't see why the low-level fault dispatching needs to > be consistent with the GHES/SCI code. > > Will Sounds good, I will remove the notifier in the next patchset. Thanks, Tyler
Hello James, On 1/17/2017 3:31 AM, James Morse wrote: > Hi Tyler, > > On 12/01/17 18:15, Tyler Baicar wrote: >> SEA exceptions are often caused by an uncorrected hardware >> error, and are handled when data abort and instruction abort >> exception classes have specific values for their Fault Status >> Code. >> When SEA occurs, before killing the process, go through >> the handlers registered in the notification list. >> Update fault_info[] with specific SEA faults so that the >> new SEA handler is used. >> @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) >> return 1; >> } >> >> +/* >> + * This abort handler deals with Synchronous External Abort. >> + * It calls notifiers, and then returns "fault". >> + */ >> +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) >> +{ >> + struct siginfo info; >> + >> + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); >> + >> + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", >> + fault_name(esr), esr, addr); >> + >> + info.si_signo = SIGBUS; >> + info.si_errno = 0; >> + info.si_code = 0; > Half of the other do_*() functions in this file read the signo and code from the > fault_info table. > > >> + info.si_addr = (void __user *)addr; > addr here was read from FAR_EL1, but for some of the classes of exception you > have listed below this register isn't updated with the faulting address. > > The ARM-ARM version 'k' in D1.10.5 "Summary of registers on faults taken to an > Exception level that is using Aarch64" has: >> The architecture permits that the FAR_ELx is UNKNOWN for Synchronous External >> Aborts other than Synchronous External Aborts on Translation Table Walks. In >> this case, the ISS.FnV bit returned in ESR_ELx indicates whether FAR_ELx is >> valid. > This is a problem if we get 'synchronous external abort' or 'synchronous parity > error' while a user space process was running. It looks like this would just cause an incorrect address to be printed in the above pr_err. Unless I'm missing something, I don't see arm64_notify_die or anything that gets called from there using the info.si_addr variable. What do you suggest I do here? The firmware should be reporting the physical and virtual address information if it is available in the HEST entry that the kernel will parse. So should I just remove the use of the addr parameter in do_sea? Thanks, Tyler >> + arm64_notify_die("", regs, &info, esr); >> + >> + return 0; >> +} >> + >> static const struct fault_info { >> int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); >> int sig; > > Thanks, > > James > >
Hi Tyler, On 18/01/17 23:26, Baicar, Tyler wrote: > On 1/17/2017 3:31 AM, James Morse wrote: >> On 12/01/17 18:15, Tyler Baicar wrote: >>> SEA exceptions are often caused by an uncorrected hardware >>> error, and are handled when data abort and instruction abort >>> exception classes have specific values for their Fault Status >>> Code. >>> When SEA occurs, before killing the process, go through >>> the handlers registered in the notification list. >>> Update fault_info[] with specific SEA faults so that the >>> new SEA handler is used. >>> @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, >>> struct pt_regs *regs) >>> return 1; >>> } >>> +/* >>> + * This abort handler deals with Synchronous External Abort. >>> + * It calls notifiers, and then returns "fault". >>> + */ >>> +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) >>> +{ >>> + struct siginfo info; >>> + >>> + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); >>> + >>> + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", >>> + fault_name(esr), esr, addr); >>> + >>> + info.si_signo = SIGBUS; >>> + info.si_errno = 0; >>> + info.si_code = 0; >> Half of the other do_*() functions in this file read the signo and code from the >> fault_info table. >> >> >>> + info.si_addr = (void __user *)addr; >> addr here was read from FAR_EL1, but for some of the classes of exception you >> have listed below this register isn't updated with the faulting address. >> >> The ARM-ARM version 'k' in D1.10.5 "Summary of registers on faults taken to an >> Exception level that is using Aarch64" has: >>> The architecture permits that the FAR_ELx is UNKNOWN for Synchronous External >>> Aborts other than Synchronous External Aborts on Translation Table Walks. In >>> this case, the ISS.FnV bit returned in ESR_ELx indicates whether FAR_ELx is >>> valid. >> This is a problem if we get 'synchronous external abort' or 'synchronous parity >> error' while a user space process was running. > It looks like this would just cause an incorrect address to be printed in the > above pr_err. > Unless I'm missing something, I don't see arm64_notify_die or anything that gets > called from > there using the info.si_addr variable. I may be misreading something here... This patch has: > info.si_addr = (void __user *)addr; > arm64_notify_die("", regs, &info, esr); From arch/arm64/kernel/traps.c:arm64_notify_die(): > if (user_mode(regs)) { > current->thread.fault_address = 0; > current->thread.fault_code = err; > force_sig_info(info->si_signo, info, current); > } So if the SEA interrupted userspace, we put maybe-unknown addr into force_sig_info() to deliver a signal to user space. User-space then gets a copy of the info struct containing the maybe-unknown addr. I think this is an existing bug, but if we are separating the synchronous external aborts from the generic do_bad handler, we should probably check the FnV bit. (I think we should still print it out) > What do you suggest I do here? The firmware should be reporting the physical and > virtual > address information if it is available in the HEST entry that the kernel will > parse. Its not just firmware that may trigger this, other SoCs may use it for parity or ECC errors, and they may not always have a valid address in FAR_EL1. I think we should check the FnV bit in the esr variable and set info.si_addr to 0 if the addr we have isn't valid: 'For some implementations, the value of si_addr may be inaccurate.' [0] Thanks, James [0] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 1/19/2017 10:55 AM, James Morse wrote: > Hi Tyler, > > On 18/01/17 23:26, Baicar, Tyler wrote: >> On 1/17/2017 3:31 AM, James Morse wrote: >>> On 12/01/17 18:15, Tyler Baicar wrote: >>>> SEA exceptions are often caused by an uncorrected hardware >>>> error, and are handled when data abort and instruction abort >>>> exception classes have specific values for their Fault Status >>>> Code. >>>> When SEA occurs, before killing the process, go through >>>> the handlers registered in the notification list. >>>> Update fault_info[] with specific SEA faults so that the >>>> new SEA handler is used. >>>> @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, >>>> struct pt_regs *regs) >>>> return 1; >>>> } >>>> +/* >>>> + * This abort handler deals with Synchronous External Abort. >>>> + * It calls notifiers, and then returns "fault". >>>> + */ >>>> +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) >>>> +{ >>>> + struct siginfo info; >>>> + >>>> + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); >>>> + >>>> + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", >>>> + fault_name(esr), esr, addr); >>>> + >>>> + info.si_signo = SIGBUS; >>>> + info.si_errno = 0; >>>> + info.si_code = 0; >>> Half of the other do_*() functions in this file read the signo and code from the >>> fault_info table. >>> >>> >>>> + info.si_addr = (void __user *)addr; >>> addr here was read from FAR_EL1, but for some of the classes of exception you >>> have listed below this register isn't updated with the faulting address. >>> >>> The ARM-ARM version 'k' in D1.10.5 "Summary of registers on faults taken to an >>> Exception level that is using Aarch64" has: >>>> The architecture permits that the FAR_ELx is UNKNOWN for Synchronous External >>>> Aborts other than Synchronous External Aborts on Translation Table Walks. In >>>> this case, the ISS.FnV bit returned in ESR_ELx indicates whether FAR_ELx is >>>> valid. >>> This is a problem if we get 'synchronous external abort' or 'synchronous parity >>> error' while a user space process was running. >> It looks like this would just cause an incorrect address to be printed in the >> above pr_err. >> Unless I'm missing something, I don't see arm64_notify_die or anything that gets >> called from >> there using the info.si_addr variable. > I may be misreading something here... > > This patch has: >> info.si_addr = (void __user *)addr; >> arm64_notify_die("", regs, &info, esr); > From arch/arm64/kernel/traps.c:arm64_notify_die(): >> if (user_mode(regs)) { >> current->thread.fault_address = 0; >> current->thread.fault_code = err; >> force_sig_info(info->si_signo, info, current); >> } > So if the SEA interrupted userspace, we put maybe-unknown addr into > force_sig_info() to deliver a signal to user space. User-space then gets a copy > of the info struct containing the maybe-unknown addr. > > I think this is an existing bug, but if we are separating the synchronous > external aborts from the generic do_bad handler, we should probably check the > FnV bit. (I think we should still print it out) > > >> What do you suggest I do here? The firmware should be reporting the physical and >> virtual >> address information if it is available in the HEST entry that the kernel will >> parse. > Its not just firmware that may trigger this, other SoCs may use it for parity or > ECC errors, and they may not always have a valid address in FAR_EL1. > > I think we should check the FnV bit in the esr variable and set info.si_addr to > 0 if the addr we have isn't valid: > 'For some implementations, the value of si_addr may be inaccurate.' [0] > > > Thanks, > > James > > > [0] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html Hello James, Okay, that makes sense, we don't want userspace to be notified with an incorrect address. I will add the check to verify it's valid. Which bit in the ESR is the FnV bit? I'm not finding the bit breakdown of the ISS that shows it. Thanks, Tyler
Hi Tyler, On 20/01/17 20:35, Baicar, Tyler wrote: > On 1/19/2017 10:55 AM, James Morse wrote: >> On 18/01/17 23:26, Baicar, Tyler wrote: >>> On 1/17/2017 3:31 AM, James Morse wrote: >>>> On 12/01/17 18:15, Tyler Baicar wrote: >>>>> + info.si_addr = (void __user *)addr; >>>> addr here was read from FAR_EL1, but for some of the classes of exception you >>>> have listed below this register isn't updated with the faulting address. >>>> >>>> The ARM-ARM version 'k' in D1.10.5 "Summary of registers on faults taken to an >>>> Exception level that is using Aarch64" has: >>>>> The architecture permits that the FAR_ELx is UNKNOWN for Synchronous External >>>>> Aborts other than Synchronous External Aborts on Translation Table Walks. In >>>>> this case, the ISS.FnV bit returned in ESR_ELx indicates whether FAR_ELx is >>>>> valid. >>>> This is a problem if we get 'synchronous external abort' or 'synchronous parity >>>> error' while a user space process was running. >>> It looks like this would just cause an incorrect address to be printed in the >>> above pr_err. >>> Unless I'm missing something, I don't see arm64_notify_die or anything that gets >>> called from >>> there using the info.si_addr variable. >> I may be misreading something here... >> >> This patch has: >>> info.si_addr = (void __user *)addr; >>> arm64_notify_die("", regs, &info, esr); >> From arch/arm64/kernel/traps.c:arm64_notify_die(): >>> if (user_mode(regs)) { >>> current->thread.fault_address = 0; >>> current->thread.fault_code = err; >>> force_sig_info(info->si_signo, info, current); >>> } >> So if the SEA interrupted userspace, we put maybe-unknown addr into >> force_sig_info() to deliver a signal to user space. User-space then gets a copy >> of the info struct containing the maybe-unknown addr. >> >> I think this is an existing bug, but if we are separating the synchronous >> external aborts from the generic do_bad handler, we should probably check the >> FnV bit. (I think we should still print it out) >> >> >>> What do you suggest I do here? The firmware should be reporting the physical and >>> virtual >>> address information if it is available in the HEST entry that the kernel will >>> parse. >> Its not just firmware that may trigger this, other SoCs may use it for parity or >> ECC errors, and they may not always have a valid address in FAR_EL1. >> >> I think we should check the FnV bit in the esr variable and set info.si_addr to >> 0 if the addr we have isn't valid: >> 'For some implementations, the value of si_addr may be inaccurate.' [0] > Okay, that makes sense, we don't want userspace to be notified with an incorrect > address. > I will add the check to verify it's valid. Which bit in the ESR is the FnV bit? > I'm not finding > the bit breakdown of the ISS that shows it. The bits in ISS vary depending on the EC, so a little digging is required. "D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)" lists the EC values, from there 'Instruction Abort' and 'Data Abort' both list FnV as bit 10. Version 'k' of the ARM-ARM has this on pages D7-1953 to D7-1956. Thanks, James
On 1/23/2017 3:01 AM, James Morse wrote: > Hi Tyler, > > On 20/01/17 20:35, Baicar, Tyler wrote: >> On 1/19/2017 10:55 AM, James Morse wrote: >>> On 18/01/17 23:26, Baicar, Tyler wrote: >>>> On 1/17/2017 3:31 AM, James Morse wrote: >>>>> On 12/01/17 18:15, Tyler Baicar wrote: >>>>>> + info.si_addr = (void __user *)addr; >>>>> addr here was read from FAR_EL1, but for some of the classes of exception you >>>>> have listed below this register isn't updated with the faulting address. >>>>> >>>>> The ARM-ARM version 'k' in D1.10.5 "Summary of registers on faults taken to an >>>>> Exception level that is using Aarch64" has: >>>>>> The architecture permits that the FAR_ELx is UNKNOWN for Synchronous External >>>>>> Aborts other than Synchronous External Aborts on Translation Table Walks. In >>>>>> this case, the ISS.FnV bit returned in ESR_ELx indicates whether FAR_ELx is >>>>>> valid. >>>>> This is a problem if we get 'synchronous external abort' or 'synchronous parity >>>>> error' while a user space process was running. >>>> It looks like this would just cause an incorrect address to be printed in the >>>> above pr_err. >>>> Unless I'm missing something, I don't see arm64_notify_die or anything that gets >>>> called from >>>> there using the info.si_addr variable. >>> I may be misreading something here... >>> >>> This patch has: >>>> info.si_addr = (void __user *)addr; >>>> arm64_notify_die("", regs, &info, esr); >>> From arch/arm64/kernel/traps.c:arm64_notify_die(): >>>> if (user_mode(regs)) { >>>> current->thread.fault_address = 0; >>>> current->thread.fault_code = err; >>>> force_sig_info(info->si_signo, info, current); >>>> } >>> So if the SEA interrupted userspace, we put maybe-unknown addr into >>> force_sig_info() to deliver a signal to user space. User-space then gets a copy >>> of the info struct containing the maybe-unknown addr. >>> >>> I think this is an existing bug, but if we are separating the synchronous >>> external aborts from the generic do_bad handler, we should probably check the >>> FnV bit. (I think we should still print it out) >>> >>> >>>> What do you suggest I do here? The firmware should be reporting the physical and >>>> virtual >>>> address information if it is available in the HEST entry that the kernel will >>>> parse. >>> Its not just firmware that may trigger this, other SoCs may use it for parity or >>> ECC errors, and they may not always have a valid address in FAR_EL1. >>> >>> I think we should check the FnV bit in the esr variable and set info.si_addr to >>> 0 if the addr we have isn't valid: >>> 'For some implementations, the value of si_addr may be inaccurate.' [0] >> Okay, that makes sense, we don't want userspace to be notified with an incorrect >> address. >> I will add the check to verify it's valid. Which bit in the ESR is the FnV bit? >> I'm not finding >> the bit breakdown of the ISS that shows it. > The bits in ISS vary depending on the EC, so a little digging is required. > "D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)" lists the EC values, from > there 'Instruction Abort' and 'Data Abort' both list FnV as bit 10. Version 'k' > of the ARM-ARM has this on pages D7-1953 to D7-1956. Got it! I'll add the check for this in my next patchset. Thanks, Tyler
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index 57f110b..e7f3440 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -64,4 +64,17 @@ extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); #endif /* __ASSEMBLY__ */ +/* + * The functions below are used to register and unregister callbacks + * that are to be invoked when a Synchronous External Abort (SEA) + * occurs. An SEA is raised by certain fault status codes that have + * either data or instruction abort as the exception class, and + * callbacks may be registered to parse or handle such hardware errors. + * + * Registered callbacks are run in an interrupt/atomic context. They + * are not allowed to block or sleep. + */ +int register_sea_notifier(struct notifier_block *nb); +void unregister_sea_notifier(struct notifier_block *nb); + #endif /* __ASM_SYSTEM_MISC_H */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 05d2bd7..81039c7 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -39,6 +39,22 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> +/* + * GHES SEA handler code may register a notifier call here to + * handle HW error record passed from platform. + */ +static ATOMIC_NOTIFIER_HEAD(sea_handler_chain); + +int register_sea_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&sea_handler_chain, nb); +} + +void unregister_sea_notifier(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&sea_handler_chain, nb); +} + static const char *fault_name(unsigned int esr); #ifdef CONFIG_KPROBES @@ -480,6 +496,28 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; } +/* + * This abort handler deals with Synchronous External Abort. + * It calls notifiers, and then returns "fault". + */ +static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) +{ + struct siginfo info; + + atomic_notifier_call_chain(&sea_handler_chain, 0, NULL); + + pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", + fault_name(esr), esr, addr); + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = 0; + info.si_addr = (void __user *)addr; + arm64_notify_die("", regs, &info, esr); + + return 0; +} + static const struct fault_info { int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); int sig; @@ -502,22 +540,22 @@ static const struct fault_info { { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, - { do_bad, SIGBUS, 0, "synchronous external abort" }, + { do_sea, SIGBUS, 0, "synchronous external abort" }, { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error" }, + { do_sea, SIGBUS, 0, "level 0 SEA (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 1 SEA (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 2 SEA (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 3 SEA (translation table walk)" }, + { do_sea, SIGBUS, 0, "synchronous parity or ECC err" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, { do_bad, SIGBUS, 0, "unknown 27" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 0 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 1 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 2 synchronous parity error (translation table walk)" }, + { do_sea, SIGBUS, 0, "level 3 synchronous parity error (translation table walk)" }, { do_bad, SIGBUS, 0, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, { do_bad, SIGBUS, 0, "unknown 34" },