Message ID | 1498275503-137890-1-git-send-email-xiexiuqi@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sat, 24 Jun 2017 11:38:23 +0800 Xie XiuQi <xiexiuqi@huawei.com> wrote: > diff --git a/include/linux/cper.h b/include/linux/cper.h > index 4c671fc..17546bf 100644 > --- a/include/linux/cper.h > +++ b/include/linux/cper.h > @@ -275,6 +275,11 @@ enum { > #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) > #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) > > +#define CPER_ARM_INFO_TYPE_CACHE 0 > +#define CPER_ARM_INFO_TYPE_TLB 1 > +#define CPER_ARM_INFO_TYPE_BUS 2 > +#define CPER_ARM_INFO_TYPE_UARCH 3 > + > /* > * All tables and structs must be byte-packed to match CPER > * specification, since the tables are provided by the system BIOS > diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h > index 429f46f..dd91ba8 100644 > --- a/include/ras/ras_event.h > +++ b/include/ras/ras_event.h > @@ -206,6 +206,85 @@ > __entry->running_state, __entry->psci_state) > ); > > +#define ARM_PROC_ERR_TYPE \ > + EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" ) \ > + EM ( CPER_ARM_INFO_TYPE_TLB, "TLB error" ) \ > + EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" ) \ > + EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" ) These are all defines. As the name suggests, the TRACE_DEFINE_ENUM() is for use with enums, not defines. You can nuke this part. > + > +/* > + * First define the enums in MM_ACTION_RESULT to be exported to userspace > + * via TRACE_DEFINE_ENUM(). > + */ > +#undef EM > +#undef EMe > +#define EM(a, b) TRACE_DEFINE_ENUM(a); > +#define EMe(a, b) TRACE_DEFINE_ENUM(a); > + > +ARM_PROC_ERR_TYPE > + > +/* > + * Now redefine the EM() and EMe() macros to map the enums to the strings > + * that will be printed in the output. > + */ > +#undef EM > +#undef EMe > +#define EM(a, b) { a, b }, > +#define EMe(a, b) { a, b } All the EM* and friends above are not needed. The macro below will translate into numbers not names in the format files in tracefs. -- Steve > + > +#define show_proc_err_flags(flags) __print_flags(flags, "|", \ > + { CPER_ARM_INFO_FLAGS_FIRST, "First error captured" }, \ > + { CPER_ARM_INFO_FLAGS_LAST, "Last error captured" }, \ > + { CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" }, \ > + { CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" }) > + > +TRACE_EVENT(arm_err_info_event, > + > + TP_PROTO(const struct cper_arm_err_info *err), > + > + TP_ARGS(err), > + > + TP_STRUCT__entry( > + __field(u8, type) > + __field(u8, flags) > + __field(u16, multiple_error) > + __field(u64, error_info) > + __field(u64, virt_fault_addr) > + __field(u64, physical_fault_addr) > + ), > + > + TP_fast_assign( > + __entry->type = err->type; > + memset(&__entry->flags, 0, > + sizeof(*__entry) - offsetof(typeof(*__entry), flags)); > + __entry->multiple_error = ~0; > + > + if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR) > + __entry->multiple_error = err->multiple_error; > + > + if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS) > + __entry->flags = err->flags; > + > + if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) > + __entry->error_info = err->error_info; > + > + if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR) > + __entry->virt_fault_addr = err->virt_fault_addr; > + > + if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR) > + __entry->physical_fault_addr = err->physical_fault_addr; > + ), > + > + TP_printk("type: %s; count: %u; flags: %s;" > + " error info: %016llx; virtual address: %016llx;" > + " physical address: %016llx", > + __print_symbolic(__entry->type, ARM_PROC_ERR_TYPE), > + __entry->multiple_error, > + show_proc_err_flags(__entry->flags), > + __entry->error_info, __entry->virt_fault_addr, > + __entry->physical_fault_addr) > +); > + > /* > * Non-Standard Section Report > *
On Sat, Jun 24, 2017 at 11:38:23AM +0800, Xie XiuQi wrote: > Add a new trace event for ARM processor error information, so that > the user will know what error occurred. With this information the > user may take appropriate action. > > These trace events are consistent with the ARM processor error > information table which defined in UEFI 2.6 spec section N.2.4.4.1. > > --- > v5: add trace enabled condition which is lost on v4 back again > put flag after the type to keep multiple_error on a 2 byte boundary > > v4: use __print_flags instead of __print_symbolic, because ARM_PROC_ERR_FLAGS > might have more than on bit set. > setting up default values for __entry to avoid a lot of else branches. > set flags to 0 by default instead of ~0. > fix a typo > rename arm_proc_err to arm_err_info_event > remove "ARM Processor Error: " prefix > rebase on Tyler's patchset v17 "Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64" > > https://patchwork.kernel.org/patch/9806267/ > > v3: no change > > v2: add trace enabled condition as Steven's suggestion. > fix a typo. > > https://patchwork.kernel.org/patch/9653767/ > --- > > Cc: Steven Rostedt <rostedt@goodmis.org> > Cc: Tyler Baicar <tbaicar@codeaurora.org> > Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> > --- > drivers/ras/ras.c | 11 +++++++ > include/linux/cper.h | 5 ++++ > include/ras/ras_event.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 95 insertions(+) > > diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c > index 39701a5..f76ab0f 100644 > --- a/drivers/ras/ras.c > +++ b/drivers/ras/ras.c > @@ -22,7 +22,17 @@ void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id, > > void log_arm_hw_error(struct cper_sec_proc_arm *err) > { > + int i; > + struct cper_arm_err_info *err_info; > + > trace_arm_event(err); > + > + if (!trace_arm_err_info_event_enabled()) > + return; If we're going to check whether the tracepoint is enabled, you need to do that for arm_event TP too. Because from looking at the spec, arm_event dumps Table 260. ARM Processor Error Section and you're dumping Table 261. ARM Processor Error Information Structure which is embedded in the previous table. So this is basically a single error event and the error info structures can describe different incarnations to that error event. And you need to mirror exactly that behavior. Then, when you do that, you need to document somewhere so that userspace knows to open *both* TPs in order to get the full error information. Alternatively, you can extend arm_event to get issued with *each* cper_arm_err_info but that would mean a lot of redundant information being shuffled out to userspace. So I guess that's ARM folks' call.
Hi Steve, Thanks for your comments. On 2017/6/26 21:36, Steven Rostedt wrote: > On Sat, 24 Jun 2017 11:38:23 +0800 > Xie XiuQi <xiexiuqi@huawei.com> wrote: > >> diff --git a/include/linux/cper.h b/include/linux/cper.h >> index 4c671fc..17546bf 100644 >> --- a/include/linux/cper.h >> +++ b/include/linux/cper.h >> @@ -275,6 +275,11 @@ enum { >> #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) >> #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) >> >> +#define CPER_ARM_INFO_TYPE_CACHE 0 >> +#define CPER_ARM_INFO_TYPE_TLB 1 >> +#define CPER_ARM_INFO_TYPE_BUS 2 >> +#define CPER_ARM_INFO_TYPE_UARCH 3 >> + >> /* >> * All tables and structs must be byte-packed to match CPER >> * specification, since the tables are provided by the system BIOS >> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h >> index 429f46f..dd91ba8 100644 >> --- a/include/ras/ras_event.h >> +++ b/include/ras/ras_event.h >> @@ -206,6 +206,85 @@ >> __entry->running_state, __entry->psci_state) >> ); >> >> +#define ARM_PROC_ERR_TYPE \ >> + EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" ) \ >> + EM ( CPER_ARM_INFO_TYPE_TLB, "TLB error" ) \ >> + EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" ) \ >> + EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" ) > > These are all defines. As the name suggests, the TRACE_DEFINE_ENUM() is > for use with enums, not defines. You can nuke this part. > >> + >> +/* >> + * First define the enums in MM_ACTION_RESULT to be exported to userspace >> + * via TRACE_DEFINE_ENUM(). >> + */ >> +#undef EM >> +#undef EMe >> +#define EM(a, b) TRACE_DEFINE_ENUM(a); >> +#define EMe(a, b) TRACE_DEFINE_ENUM(a); >> + >> +ARM_PROC_ERR_TYPE >> + >> +/* >> + * Now redefine the EM() and EMe() macros to map the enums to the strings >> + * that will be printed in the output. >> + */ >> +#undef EM >> +#undef EMe >> +#define EM(a, b) { a, b }, >> +#define EMe(a, b) { a, b } > > All the EM* and friends above are not needed. The macro below will > translate into numbers not names in the format files in tracefs. So, do you mean we just use __print_symbolic like below: #define show_proc_err_type(type) \ __print_symbolic(type, \ { CPER_ARM_INFO_TYPE_CACHE, "cache error" }, \ { CPER_ARM_INFO_TYPE_TLB, "TLB error" }, \ { CPER_ARM_INFO_TYPE_BUS, "bus error" }, \ { CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" }) > > -- Steve > > >> + >> +#define show_proc_err_flags(flags) __print_flags(flags, "|", \ >> + { CPER_ARM_INFO_FLAGS_FIRST, "First error captured" }, \ >> + { CPER_ARM_INFO_FLAGS_LAST, "Last error captured" }, \ >> + { CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" }, \ >> + { CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" }) >> + >> +TRACE_EVENT(arm_err_info_event, >> + >> + TP_PROTO(const struct cper_arm_err_info *err), >> + >> + TP_ARGS(err), >> + >> + TP_STRUCT__entry( >> + __field(u8, type) >> + __field(u8, flags) >> + __field(u16, multiple_error) >> + __field(u64, error_info) >> + __field(u64, virt_fault_addr) >> + __field(u64, physical_fault_addr) >> + ), >> + >> + TP_fast_assign( >> + __entry->type = err->type; >> + memset(&__entry->flags, 0, >> + sizeof(*__entry) - offsetof(typeof(*__entry), flags)); >> + __entry->multiple_error = ~0; >> + >> + if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR) >> + __entry->multiple_error = err->multiple_error; >> + >> + if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS) >> + __entry->flags = err->flags; >> + >> + if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) >> + __entry->error_info = err->error_info; >> + >> + if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR) >> + __entry->virt_fault_addr = err->virt_fault_addr; >> + >> + if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR) >> + __entry->physical_fault_addr = err->physical_fault_addr; >> + ), >> + >> + TP_printk("type: %s; count: %u; flags: %s;" >> + " error info: %016llx; virtual address: %016llx;" >> + " physical address: %016llx", >> + __print_symbolic(__entry->type, ARM_PROC_ERR_TYPE), >> + __entry->multiple_error, >> + show_proc_err_flags(__entry->flags), >> + __entry->error_info, __entry->virt_fault_addr, >> + __entry->physical_fault_addr) >> +); >> + >> /* >> * Non-Standard Section Report >> * > > > . >
Hi Boris, Thanks for your comments. On 2017/6/26 22:06, Borislav Petkov wrote: > On Sat, Jun 24, 2017 at 11:38:23AM +0800, Xie XiuQi wrote: >> Add a new trace event for ARM processor error information, so that >> the user will know what error occurred. With this information the >> user may take appropriate action. >> >> These trace events are consistent with the ARM processor error >> information table which defined in UEFI 2.6 spec section N.2.4.4.1. >> >> --- >> v5: add trace enabled condition which is lost on v4 back again >> put flag after the type to keep multiple_error on a 2 byte boundary >> >> v4: use __print_flags instead of __print_symbolic, because ARM_PROC_ERR_FLAGS >> might have more than on bit set. >> setting up default values for __entry to avoid a lot of else branches. >> set flags to 0 by default instead of ~0. >> fix a typo >> rename arm_proc_err to arm_err_info_event >> remove "ARM Processor Error: " prefix >> rebase on Tyler's patchset v17 "Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64" >> >> https://patchwork.kernel.org/patch/9806267/ >> >> v3: no change >> >> v2: add trace enabled condition as Steven's suggestion. >> fix a typo. >> >> https://patchwork.kernel.org/patch/9653767/ >> --- >> >> Cc: Steven Rostedt <rostedt@goodmis.org> >> Cc: Tyler Baicar <tbaicar@codeaurora.org> >> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> >> --- >> drivers/ras/ras.c | 11 +++++++ >> include/linux/cper.h | 5 ++++ >> include/ras/ras_event.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 95 insertions(+) >> >> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c >> index 39701a5..f76ab0f 100644 >> --- a/drivers/ras/ras.c >> +++ b/drivers/ras/ras.c >> @@ -22,7 +22,17 @@ void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id, >> >> void log_arm_hw_error(struct cper_sec_proc_arm *err) >> { >> + int i; >> + struct cper_arm_err_info *err_info; >> + >> trace_arm_event(err); >> + >> + if (!trace_arm_err_info_event_enabled()) >> + return; > > If we're going to check whether the tracepoint is enabled, you need > to do that for arm_event TP too. Because from looking at the spec, > arm_event dumps > > Table 260. ARM Processor Ejkrror Section > > and you're dumping > > Table 261. ARM Processor Error Information Structure > > which is embedded in the previous table. > > So this is basically a single error event and the error info structures > can describe different incarnations to that error event. > > And you need to mirror exactly that behavior. > > Then, when you do that, you need to document somewhere so that userspace > knows to open *both* TPs in order to get the full error information. > > Alternatively, you can extend arm_event to get issued with *each* > cper_arm_err_info but that would mean a lot of redundant information > being shuffled out to userspace. How about we report the full info via arm_err_info_event which just for someone who want the detail information, and leave arm_event closed. If someone do not care the error detail, who could just open arm_event. It may like this for each err_info in one section: arm_err_info_event: affinity level: 1; MPIDR: 0000001; MIDR: 0000001; running state: 0; PSCI state: 1; type: TLB error; count: 65535; flags: First error captured|Last error captured|Propagated|Overflow; error info: 0000000005244678; virtual address: 0000000000013579; physical address: 0000000000024680 One problem is that may report some redundant information if we have more than one err_info in a section. Does Tyler have any good idea? > > So I guess that's ARM folks' call. >
On Tue, Jun 27, 2017 at 02:51:22PM +0800, Xie XiuQi wrote: > How about we report the full info via arm_err_info_event which just for someone > who want the detail information, and leave arm_event closed. If someone do not > care the error detail, who could just open arm_event. So the way I read the spec is, an error event is being described by the Processor Error section and then it "may contain multiple instances of error information structures associated to a single error event." So you can't leave the arm_event thing closed because it describes the event. If you want to merge the two, then sure, by all means, change arm_event to contain some of the processor error info structure. It wouldn't matter too much as this tracepoint is not fully cast in stone yet. Bottomline is, you want to carry as much information to userspace as possible in order to handle the error properly. But not more - you don't need redundant information because then that bloats the whole machinery around transporting and processing error records and you don't want that in critical situations where you want to act as quickly and as lean as possible. And "handle properly" means any and all actions which the kernel or user needs to do to prolong the system lifetime or be able to reliably schedule maintenance as to replace the faulty hw component. And so on and so on... So it all comes down to what RAS actions you guys wanna do on ARM.
On Tue, 27 Jun 2017 10:41:34 +0800 Xie XiuQi <xiexiuqi@huawei.com> wrote: > Hi Steve, > > Thanks for your comments. > > On 2017/6/26 21:36, Steven Rostedt wrote: > > On Sat, 24 Jun 2017 11:38:23 +0800 > > Xie XiuQi <xiexiuqi@huawei.com> wrote: > > > >> diff --git a/include/linux/cper.h b/include/linux/cper.h > >> index 4c671fc..17546bf 100644 > >> --- a/include/linux/cper.h > >> +++ b/include/linux/cper.h > >> @@ -275,6 +275,11 @@ enum { > >> #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) > >> #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) > >> > >> +#define CPER_ARM_INFO_TYPE_CACHE 0 > >> +#define CPER_ARM_INFO_TYPE_TLB 1 > >> +#define CPER_ARM_INFO_TYPE_BUS 2 > >> +#define CPER_ARM_INFO_TYPE_UARCH 3 > >> + > >> /* > >> * All tables and structs must be byte-packed to match CPER > >> * specification, since the tables are provided by the system BIOS > >> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h > >> index 429f46f..dd91ba8 100644 > >> --- a/include/ras/ras_event.h > >> +++ b/include/ras/ras_event.h > >> @@ -206,6 +206,85 @@ > >> __entry->running_state, __entry->psci_state) > >> ); > >> > >> +#define ARM_PROC_ERR_TYPE \ > >> + EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" ) \ > >> + EM ( CPER_ARM_INFO_TYPE_TLB, "TLB error" ) \ > >> + EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" ) \ > >> + EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" ) > > > > These are all defines. As the name suggests, the TRACE_DEFINE_ENUM() is > > for use with enums, not defines. You can nuke this part. > > > >> + > >> +/* > >> + * First define the enums in MM_ACTION_RESULT to be exported to userspace > >> + * via TRACE_DEFINE_ENUM(). > >> + */ > >> +#undef EM > >> +#undef EMe > >> +#define EM(a, b) TRACE_DEFINE_ENUM(a); > >> +#define EMe(a, b) TRACE_DEFINE_ENUM(a); > >> + > >> +ARM_PROC_ERR_TYPE > >> + > >> +/* > >> + * Now redefine the EM() and EMe() macros to map the enums to the strings > >> + * that will be printed in the output. > >> + */ > >> +#undef EM > >> +#undef EMe > >> +#define EM(a, b) { a, b }, > >> +#define EMe(a, b) { a, b } > > > > All the EM* and friends above are not needed. The macro below will > > translate into numbers not names in the format files in tracefs. > > So, do you mean we just use __print_symbolic like below: > > #define show_proc_err_type(type) \ > __print_symbolic(type, \ > { CPER_ARM_INFO_TYPE_CACHE, "cache error" }, \ > { CPER_ARM_INFO_TYPE_TLB, "TLB error" }, \ > { CPER_ARM_INFO_TYPE_BUS, "bus error" }, \ > { CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" }) > > > Correct. If you want to make sure, just boot the kernel and look at the format file within the tracepoint: /sys/kernel/debug/tracing/events/ras/arm_err_info_event/format And make sure there's no constant names there. -- Steve
On 6/27/2017 1:25 AM, Borislav Petkov wrote: > On Tue, Jun 27, 2017 at 02:51:22PM +0800, Xie XiuQi wrote: >> How about we report the full info via arm_err_info_event which just for someone >> who want the detail information, and leave arm_event closed. If someone do not >> care the error detail, who could just open arm_event. > So the way I read the spec is, an error event is being described by the > Processor Error section and then it "may contain multiple instances of > error information structures associated to a single error event." Hello Xie, I originally included an error information structure in the arm_event, but that won't work in the case of multiple error information structures. The spec says the error must contain at least 1 error information structure, but it could be several. I'm unaware of a way to represent a tracepoint with multiple structures inside of it. I figured the best way to do it would be to have the arm_event TP and then a separate TP for the error information structure which could be triggered several times for the same arm_event. The same thing is true for the context information structure, but there could be 0 or many of those structures. There is also an optional vendor information buffer that can be included, but there is obviously only one of those. That is something that may be easy to add to the arm_event TP...or do that in a separate TP as well. Thanks, Tyler > > So you can't leave the arm_event thing closed because it describes the > event. > > If you want to merge the two, then sure, by all means, change arm_event > to contain some of the processor error info structure. > > It wouldn't matter too much as this tracepoint is not fully cast in > stone yet. > > Bottomline is, you want to carry as much information to userspace as > possible in order to handle the error properly. But not more - you don't > need redundant information because then that bloats the whole machinery > around transporting and processing error records and you don't want that > in critical situations where you want to act as quickly and as lean as > possible. > > And "handle properly" means any and all actions which the kernel or > user needs to do to prolong the system lifetime or be able to reliably > schedule maintenance as to replace the faulty hw component. And so on > and so on... > > So it all comes down to what RAS actions you guys wanna do on ARM. >
On Tue, Jun 27, 2017 at 09:40:18AM -0600, Baicar, Tyler wrote: > ... but there is obviously only one of those. That is something that may > be easy to add to the arm_event TP...or do that in a separate TP as > well. See here: https://lkml.kernel.org/r/20170626140647.anigiqhk3l6ltet7@pd.tnic for possible things you could do.
Hi guys, (CC: Punit) On 26/06/17 15:06, Borislav Petkov wrote: > On Sat, Jun 24, 2017 at 11:38:23AM +0800, Xie XiuQi wrote: >> Add a new trace event for ARM processor error information, so that >> the user will know what error occurred. With this information the >> user may take appropriate action. >> >> These trace events are consistent with the ARM processor error >> information table which defined in UEFI 2.6 spec section N.2.4.4.1. Sorry I've been quiet on this - I'm not familiar with how the trace event stuff works, or what picks it up in user space. >> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c >> index 39701a5..f76ab0f 100644 >> --- a/drivers/ras/ras.c >> +++ b/drivers/ras/ras.c >> @@ -22,7 +22,17 @@ void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id, >> >> void log_arm_hw_error(struct cper_sec_proc_arm *err) >> { >> + int i; >> + struct cper_arm_err_info *err_info; >> + >> trace_arm_event(err); >> + >> + if (!trace_arm_err_info_event_enabled()) >> + return; > > If we're going to check whether the tracepoint is enabled, you need > to do that for arm_event TP too. Because from looking at the spec, > arm_event dumps > > Table 260. ARM Processor Error Section > > and you're dumping > > Table 261. ARM Processor Error Information Structure > > which is embedded in the previous table. > > So this is basically a single error event and the error info structures > can describe different incarnations to that error event. > > And you need to mirror exactly that behavior. > > Then, when you do that, you need to document somewhere so that userspace > knows to open *both* TPs in order to get the full error information. I don't see how the data in Table 261 is usable without the corresponding information in Table 260. For example 261's 'Type: cache error' has to be interpreted with 260's 'Error affinity level: 0', 'MPIDR_EL1: 0x100' to know that this cache error only affected that specific CPU. While I like the idea of just spitting out the CPER records (as we don't need to invent a new format to pass the information to user space), I don't think we can easily do this through tracepoints. One tracepoint per cper header/table would be tricky to parse, wouldn't we have to rely on the cpu-id and timestamps to pair the records back up? > Alternatively, you can extend arm_event to get issued with *each* > cper_arm_err_info but that would mean a lot of redundant information > being shuffled out to userspace. I think this is what we should do, but only keep the number of fields we export to a minimum. If we always use the names in the spec, and user-space always parses the 'format' file, we should be able to add more fields when they turn out to be necessary. (looks like the trace infrastructure makes inventing a new format easy!) On that note, how does user space get the 'error severity' from Table 247, 'section severity', 'flags' and the other stringy bits of table 249? Does it need them? Thanks, James
On Wed, Jun 28, 2017 at 05:42:42PM +0100, James Morse wrote: > > Alternatively, you can extend arm_event to get issued with *each* > > cper_arm_err_info but that would mean a lot of redundant information > > being shuffled out to userspace. > > I think this is what we should do, Yes, that should be easier for userspace. > but only keep the number of fields we export to a minimum. If we > always use the names in the spec, and user-space always parses the > 'format' file, we should be able to add more fields when they turn out > to be necessary. (looks like the trace infrastructure makes inventing > a new format easy!) Right, except if you have userspace consumers already using them, you're potentially breaking them. Unless you teach them all to parse the format file first, from the very beginning. But in general, we try to be very wary when touching tracepoints as they become an ABI of sorts. Also, do try to shovel only the really needed info to userspace - not everything the spec dumps but maybe just the fields that are really necessary for doing hw error recovery.
On Wed, 28 Jun 2017 18:55:14 +0200 Borislav Petkov <bp@suse.de> wrote: > > but only keep the number of fields we export to a minimum. If we > > always use the names in the spec, and user-space always parses the > > 'format' file, we should be able to add more fields when they turn out > > to be necessary. (looks like the trace infrastructure makes inventing > > a new format easy!) > > Right, except if you have userspace consumers already using them, you're > potentially breaking them. Unless you teach them all to parse the format > file first, from the very beginning. But in general, we try to be very > wary when touching tracepoints as they become an ABI of sorts. No no. You can add new fields easily. There should be no breakage, as the userspace tools should be using the traceevent library code. It only breaks when you remove a field. Which reminds me, I really need to get that out into distros *this* year. I've been putting this off for far too long. The ras tool has its own copy of the traceevent library parser. -- Steve > > Also, do try to shovel only the really needed info to userspace - not > everything the spec dumps but maybe just the fields that are really > necessary for doing hw error recovery. >
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 39701a5..f76ab0f 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -22,7 +22,17 @@ void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id, void log_arm_hw_error(struct cper_sec_proc_arm *err) { + int i; + struct cper_arm_err_info *err_info; + trace_arm_event(err); + + if (!trace_arm_err_info_event_enabled()) + return; + + err_info = (struct cper_arm_err_info *)(err + 1); + for (i = 0; i < err->err_info_num; i++, err_info++) + trace_arm_err_info_event(err_info); } static int __init ras_init(void) @@ -42,6 +52,7 @@ static int __init ras_init(void) EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); EXPORT_TRACEPOINT_SYMBOL_GPL(non_standard_event); EXPORT_TRACEPOINT_SYMBOL_GPL(arm_event); +EXPORT_TRACEPOINT_SYMBOL_GPL(arm_err_info_event); int __init parse_ras_param(char *str) { diff --git a/include/linux/cper.h b/include/linux/cper.h index 4c671fc..17546bf 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -275,6 +275,11 @@ enum { #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) +#define CPER_ARM_INFO_TYPE_CACHE 0 +#define CPER_ARM_INFO_TYPE_TLB 1 +#define CPER_ARM_INFO_TYPE_BUS 2 +#define CPER_ARM_INFO_TYPE_UARCH 3 + /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 429f46f..dd91ba8 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -206,6 +206,85 @@ __entry->running_state, __entry->psci_state) ); +#define ARM_PROC_ERR_TYPE \ + EM ( CPER_ARM_INFO_TYPE_CACHE, "cache error" ) \ + EM ( CPER_ARM_INFO_TYPE_TLB, "TLB error" ) \ + EM ( CPER_ARM_INFO_TYPE_BUS, "bus error" ) \ + EMe ( CPER_ARM_INFO_TYPE_UARCH, "micro-architectural error" ) + +/* + * First define the enums in MM_ACTION_RESULT to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +ARM_PROC_ERR_TYPE + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +#define show_proc_err_flags(flags) __print_flags(flags, "|", \ + { CPER_ARM_INFO_FLAGS_FIRST, "First error captured" }, \ + { CPER_ARM_INFO_FLAGS_LAST, "Last error captured" }, \ + { CPER_ARM_INFO_FLAGS_PROPAGATED, "Propagated" }, \ + { CPER_ARM_INFO_FLAGS_OVERFLOW, "Overflow" }) + +TRACE_EVENT(arm_err_info_event, + + TP_PROTO(const struct cper_arm_err_info *err), + + TP_ARGS(err), + + TP_STRUCT__entry( + __field(u8, type) + __field(u8, flags) + __field(u16, multiple_error) + __field(u64, error_info) + __field(u64, virt_fault_addr) + __field(u64, physical_fault_addr) + ), + + TP_fast_assign( + __entry->type = err->type; + memset(&__entry->flags, 0, + sizeof(*__entry) - offsetof(typeof(*__entry), flags)); + __entry->multiple_error = ~0; + + if (err->validation_bits & CPER_ARM_INFO_VALID_MULTI_ERR) + __entry->multiple_error = err->multiple_error; + + if (err->validation_bits & CPER_ARM_INFO_VALID_FLAGS) + __entry->flags = err->flags; + + if (err->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) + __entry->error_info = err->error_info; + + if (err->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR) + __entry->virt_fault_addr = err->virt_fault_addr; + + if (err->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR) + __entry->physical_fault_addr = err->physical_fault_addr; + ), + + TP_printk("type: %s; count: %u; flags: %s;" + " error info: %016llx; virtual address: %016llx;" + " physical address: %016llx", + __print_symbolic(__entry->type, ARM_PROC_ERR_TYPE), + __entry->multiple_error, + show_proc_err_flags(__entry->flags), + __entry->error_info, __entry->virt_fault_addr, + __entry->physical_fault_addr) +); + /* * Non-Standard Section Report *
Add a new trace event for ARM processor error information, so that the user will know what error occurred. With this information the user may take appropriate action. These trace events are consistent with the ARM processor error information table which defined in UEFI 2.6 spec section N.2.4.4.1. --- v5: add trace enabled condition which is lost on v4 back again put flag after the type to keep multiple_error on a 2 byte boundary v4: use __print_flags instead of __print_symbolic, because ARM_PROC_ERR_FLAGS might have more than on bit set. setting up default values for __entry to avoid a lot of else branches. set flags to 0 by default instead of ~0. fix a typo rename arm_proc_err to arm_err_info_event remove "ARM Processor Error: " prefix rebase on Tyler's patchset v17 "Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64" https://patchwork.kernel.org/patch/9806267/ v3: no change v2: add trace enabled condition as Steven's suggestion. fix a typo. https://patchwork.kernel.org/patch/9653767/ --- Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Tyler Baicar <tbaicar@codeaurora.org> Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> --- drivers/ras/ras.c | 11 +++++++ include/linux/cper.h | 5 ++++ include/ras/ras_event.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+)