diff mbox

[V12,09/10] trace, ras: add ARM processor error trace event

Message ID 1488833103-21082-10-git-send-email-tbaicar@codeaurora.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Tyler Baicar March 6, 2017, 8:45 p.m. UTC
Currently there are trace events for the various RAS
errors with the exception of ARM processor type errors.
Add a new trace event for such errors so that the user
will know when they occur. These trace events are
consistent with the ARM processor error section type
defined in UEFI 2.6 spec section N.2.4.4.

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
---
 drivers/acpi/apei/ghes.c    |  8 +++++++-
 drivers/firmware/efi/cper.c |  1 +
 drivers/ras/ras.c           |  1 +
 include/ras/ras_event.h     | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 43 insertions(+), 1 deletion(-)

Comments

Xie XiuQi March 9, 2017, 9:41 a.m. UTC | #1
Hi Tyler Baicar,

On 2017/3/7 4:45, Tyler Baicar wrote:
> Currently there are trace events for the various RAS
> errors with the exception of ARM processor type errors.
> Add a new trace event for such errors so that the user
> will know when they occur. These trace events are
> consistent with the ARM processor error section type
> defined in UEFI 2.6 spec section N.2.4.4.
> 
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> Acked-by: Steven Rostedt <rostedt@goodmis.org>
> ---
>  drivers/acpi/apei/ghes.c    |  8 +++++++-
>  drivers/firmware/efi/cper.c |  1 +
>  drivers/ras/ras.c           |  1 +
>  include/ras/ras_event.h     | 34 ++++++++++++++++++++++++++++++++++
>  4 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 842c0cc..81d7b79 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -514,7 +514,13 @@ static void ghes_do_proc(struct ghes *ghes,
>  		}
>  #endif
>  #ifdef CONFIG_RAS
> -		else if (trace_unknown_sec_event_enabled()) {
> +		else if (!uuid_le_cmp(sec_type, CPER_SEC_PROC_ARM) &&
> +			 trace_arm_event_enabled()) {
> +			struct cper_sec_proc_arm *arm_err;
> +
> +			arm_err = acpi_hest_generic_data_payload(gdata);
> +			trace_arm_event(arm_err);
> +		} else if (trace_unknown_sec_event_enabled()) {
>  			void *unknown_err = acpi_hest_generic_data_payload(gdata);
>  			trace_unknown_sec_event(&sec_type,
>  					fru_id, fru_text, sec_sev,
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index 545a6c2..e9fb56a 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -35,6 +35,7 @@
>  #include <linux/printk.h>
>  #include <linux/bcd.h>
>  #include <acpi/ghes.h>
> +#include <ras/ras_event.h>
>  
>  #define INDENT_SP	" "
>  
> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
> index fb2500b..8ba5a94 100644
> --- a/drivers/ras/ras.c
> +++ b/drivers/ras/ras.c
> @@ -28,3 +28,4 @@ static int __init ras_init(void)
>  #endif
>  EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(unknown_sec_event);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(arm_event);
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index 5861b6f..b36db48 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -162,6 +162,40 @@
>  );
>  
>  /*
> + * ARM Processor Events Report
> + *
> + * This event is generated when hardware detects an ARM processor error
> + * has occurred. UEFI 2.6 spec section N.2.4.4.
> + */
> +TRACE_EVENT(arm_event,
> +
> +	TP_PROTO(const struct cper_sec_proc_arm *proc),
> +
> +	TP_ARGS(proc),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, mpidr)
> +		__field(u64, midr)
> +		__field(u32, running_state)
> +		__field(u32, psci_state)
> +		__field(u8, affinity)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->affinity = proc->affinity_level;
> +		__entry->mpidr = proc->mpidr;
> +		__entry->midr = proc->midr;
> +		__entry->running_state = proc->running_state;
> +		__entry->psci_state = proc->psci_state;
> +	),
> +
> +	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
> +		  "running state: %d; PSCI state: %d",
> +		  __entry->affinity, __entry->mpidr, __entry->midr,
> +		  __entry->running_state, __entry->psci_state)
> +);
> +

I think these fields are not enough; we also need to export the ARM processor
error information (UEFI 2.6 spec section N.2.4.4.1), or at least the error type,
address, etc., so that userspace (such as the rasdaemon tool) can know what
error occurred.

Thanks,
Xie XiuQi

> +/*
>   * Unknown Section Report
>   *
>   * This event is generated when hardware detected a hardware
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tyler Baicar March 10, 2017, 6:23 p.m. UTC | #2
Hello Xie XiuQi,


On 3/9/2017 2:41 AM, Xie XiuQi wrote:
> On 2017/3/7 4:45, Tyler Baicar wrote:
>> Currently there are trace events for the various RAS
>> errors with the exception of ARM processor type errors.
>> Add a new trace event for such errors so that the user
>> will know when they occur. These trace events are
>> consistent with the ARM processor error section type
>> defined in UEFI 2.6 spec section N.2.4.4.
>>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> Acked-by: Steven Rostedt <rostedt@goodmis.org>
>> ---
>>   drivers/acpi/apei/ghes.c    |  8 +++++++-
>>   drivers/firmware/efi/cper.c |  1 +
>>   drivers/ras/ras.c           |  1 +
>>   include/ras/ras_event.h     | 34 ++++++++++++++++++++++++++++++++++
>>   4 files changed, 43 insertions(+), 1 deletion(-)

>> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
>> index 5861b6f..b36db48 100644
>> --- a/include/ras/ras_event.h
>> +++ b/include/ras/ras_event.h
>> @@ -162,6 +162,40 @@
>>   );
>>   
>>   /*
>> + * ARM Processor Events Report
>> + *
>> + * This event is generated when hardware detects an ARM processor error
>> + * has occurred. UEFI 2.6 spec section N.2.4.4.
>> + */
>> +TRACE_EVENT(arm_event,
>> +
>> +	TP_PROTO(const struct cper_sec_proc_arm *proc),
>> +
>> +	TP_ARGS(proc),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u64, mpidr)
>> +		__field(u64, midr)
>> +		__field(u32, running_state)
>> +		__field(u32, psci_state)
>> +		__field(u8, affinity)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->affinity = proc->affinity_level;
>> +		__entry->mpidr = proc->mpidr;
>> +		__entry->midr = proc->midr;
>> +		__entry->running_state = proc->running_state;
>> +		__entry->psci_state = proc->psci_state;
>> +	),
>> +
>> +	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>> +		  "running state: %d; PSCI state: %d",
>> +		  __entry->affinity, __entry->mpidr, __entry->midr,
>> +		  __entry->running_state, __entry->psci_state)
>> +);
>> +
> I think these fields are not enough, we need also export arm processor error
> information (UEFI 2.6 spec section N.2.4.4.1), or at least the error type,
> address, etc. So that the userspace (such as rasdaemon tool) could know what
> error occurred.
This is something I am planning on adding later. It is not clear to
me how to actually do this at this point. If you look at the spec, there
is not a single error information structure. There is at least one, but
possibly a lot. There is also an unknown number of context information
structures. In "Table 260. ARM Processor Error Section" there are
ERR_INFO_NUM and CONTEXT_INFO_NUM which give the number of these
structures. I think there will need to be separate trace events added
for each of these structures because I don't think there is a way to
have a variable number of structures inside of a trace event.

The ARM processor error section also has a vendor specific error info
buffer which will need to be exposed to userspace. This may be something
that can reuse the unknown section type trace event or have its own
trace event.

Thanks,
Tyler
Xie XiuQi March 13, 2017, 2:31 a.m. UTC | #3
Hi Baicar Tyler,

On 2017/3/11 2:23, Baicar, Tyler wrote:
> Hello Xie XiuQi,
> 
> 
> On 3/9/2017 2:41 AM, Xie XiuQi wrote:
>> On 2017/3/7 4:45, Tyler Baicar wrote:
>>> Currently there are trace events for the various RAS
>>> errors with the exception of ARM processor type errors.
>>> Add a new trace event for such errors so that the user
>>> will know when they occur. These trace events are
>>> consistent with the ARM processor error section type
>>> defined in UEFI 2.6 spec section N.2.4.4.
>>>
>>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>>> Acked-by: Steven Rostedt <rostedt@goodmis.org>
>>> ---
>>>   drivers/acpi/apei/ghes.c    |  8 +++++++-
>>>   drivers/firmware/efi/cper.c |  1 +
>>>   drivers/ras/ras.c           |  1 +
>>>   include/ras/ras_event.h     | 34 ++++++++++++++++++++++++++++++++++
>>>   4 files changed, 43 insertions(+), 1 deletion(-)
> 
>>> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
>>> index 5861b6f..b36db48 100644
>>> --- a/include/ras/ras_event.h
>>> +++ b/include/ras/ras_event.h
>>> @@ -162,6 +162,40 @@
>>>   );
>>>     /*
>>> + * ARM Processor Events Report
>>> + *
>>> + * This event is generated when hardware detects an ARM processor error
>>> + * has occurred. UEFI 2.6 spec section N.2.4.4.
>>> + */
>>> +TRACE_EVENT(arm_event,
>>> +
>>> +    TP_PROTO(const struct cper_sec_proc_arm *proc),
>>> +
>>> +    TP_ARGS(proc),
>>> +
>>> +    TP_STRUCT__entry(
>>> +        __field(u64, mpidr)
>>> +        __field(u64, midr)
>>> +        __field(u32, running_state)
>>> +        __field(u32, psci_state)
>>> +        __field(u8, affinity)
>>> +    ),
>>> +
>>> +    TP_fast_assign(
>>> +        __entry->affinity = proc->affinity_level;
>>> +        __entry->mpidr = proc->mpidr;
>>> +        __entry->midr = proc->midr;
>>> +        __entry->running_state = proc->running_state;
>>> +        __entry->psci_state = proc->psci_state;
>>> +    ),
>>> +
>>> +    TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>>> +          "running state: %d; PSCI state: %d",
>>> +          __entry->affinity, __entry->mpidr, __entry->midr,
>>> +          __entry->running_state, __entry->psci_state)
>>> +);
>>> +
>> I think these fields are not enough, we need also export arm processor error
>> information (UEFI 2.6 spec section N.2.4.4.1), or at least the error type,
>> address, etc. So that the userspace (such as rasdaemon tool) could know what
>> error occurred.
>
> This is something I am planning on adding in later. It is not clear to me how to
> actually do this at this point. If you look at the spec, there is not a single
> error information structure. There is at least one, but possibly a lot. There is
> also an unknown amount of context information structures. In "Table 260. ARM Processor
> Error Section" there are ERR_INFO_NUM and CONTEXT_INFO_NUM which give the number of these
> structures. I think there will need to be separate trace events added in for each of
> these structures because I don't think there is a way to have variable amounts of
> structures inside of a trace event.

Yes, I agree.

Additionally, cper_sec_proc_arm has validation bits, which indicate whether or not each of
the fields in this section is valid. How could we show that in this trace event? If a field
is invalid, we would get a wrong value here.

--
Thanks,
Xie XiuQi

> 
> The ARM processor error section also has a vendor specific error info buffer which will need to be exposed to userspace. This may be something that can reuse the unknown section type trace event or have it's own trace event for.
> 
> Thanks,
> Tyler
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tyler Baicar March 14, 2017, 7:29 p.m. UTC | #4
Hello Xie XiuQi,


On 3/12/2017 8:31 PM, Xie XiuQi wrote:
> Hi Baicar Tyler,
>
> On 2017/3/11 2:23, Baicar, Tyler wrote:
>> Hello Xie XiuQi,
>>
>>
>> On 3/9/2017 2:41 AM, Xie XiuQi wrote:
>>> On 2017/3/7 4:45, Tyler Baicar wrote:
>>>> Currently there are trace events for the various RAS
>>>> errors with the exception of ARM processor type errors.
>>>> Add a new trace event for such errors so that the user
>>>> will know when they occur. These trace events are
>>>> consistent with the ARM processor error section type
>>>> defined in UEFI 2.6 spec section N.2.4.4.
>>>>
>>>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>>>> Acked-by: Steven Rostedt <rostedt@goodmis.org>
>>>> ---
>>>>    drivers/acpi/apei/ghes.c    |  8 +++++++-
>>>>    drivers/firmware/efi/cper.c |  1 +
>>>>    drivers/ras/ras.c           |  1 +
>>>>    include/ras/ras_event.h     | 34 ++++++++++++++++++++++++++++++++++
>>>>    4 files changed, 43 insertions(+), 1 deletion(-)
>>>> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
>>>> index 5861b6f..b36db48 100644
>>>> --- a/include/ras/ras_event.h
>>>> +++ b/include/ras/ras_event.h
>>>> @@ -162,6 +162,40 @@
>>>>    );
>>>>      /*
>>>> + * ARM Processor Events Report
>>>> + *
>>>> + * This event is generated when hardware detects an ARM processor error
>>>> + * has occurred. UEFI 2.6 spec section N.2.4.4.
>>>> + */
>>>> +TRACE_EVENT(arm_event,
>>>> +
>>>> +    TP_PROTO(const struct cper_sec_proc_arm *proc),
>>>> +
>>>> +    TP_ARGS(proc),
>>>> +
>>>> +    TP_STRUCT__entry(
>>>> +        __field(u64, mpidr)
>>>> +        __field(u64, midr)
>>>> +        __field(u32, running_state)
>>>> +        __field(u32, psci_state)
>>>> +        __field(u8, affinity)
>>>> +    ),
>>>> +
>>>> +    TP_fast_assign(
>>>> +        __entry->affinity = proc->affinity_level;
>>>> +        __entry->mpidr = proc->mpidr;
>>>> +        __entry->midr = proc->midr;
>>>> +        __entry->running_state = proc->running_state;
>>>> +        __entry->psci_state = proc->psci_state;
>>>> +    ),
>>>> +
>>>> +    TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>>>> +          "running state: %d; PSCI state: %d",
>>>> +          __entry->affinity, __entry->mpidr, __entry->midr,
>>>> +          __entry->running_state, __entry->psci_state)
>>>> +);
>>>> +
>>> I think these fields are not enough, we need also export arm processor error
>>> information (UEFI 2.6 spec section N.2.4.4.1), or at least the error type,
>>> address, etc. So that the userspace (such as rasdaemon tool) could know what
>>> error occurred.
>> This is something I am planning on adding in later. It is not clear to me how to
>> actually do this at this point. If you look at the spec, there is not a single
>> error information structure. There is at least one, but possibly a lot. There is
>> also an unknown amount of context information structures. In "Table 260. ARM Processor
>> Error Section" there are ERR_INFO_NUM and CONTEXT_INFO_NUM which give the number of these
>> structures. I think there will need to be separate trace events added in for each of
>> these structures because I don't think there is a way to have variable amounts of
>> structures inside of a trace event.
> Yes, I agree.
>
> Additional, cper_sec_proc_arm has validation bit, which indicates whether or not each of
> the fields is valid in this section. How could we show it in this trace event? If the filed
> is invalid, we would get a wrong value here.
>
I will add in checks for whether the fields are valid similar to what 
you did for the error info patch.

Thanks,
Tyler
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 842c0cc..81d7b79 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -514,7 +514,13 @@  static void ghes_do_proc(struct ghes *ghes,
 		}
 #endif
 #ifdef CONFIG_RAS
-		else if (trace_unknown_sec_event_enabled()) {
+		else if (!uuid_le_cmp(sec_type, CPER_SEC_PROC_ARM) &&
+			 trace_arm_event_enabled()) {
+			struct cper_sec_proc_arm *arm_err;
+
+			arm_err = acpi_hest_generic_data_payload(gdata);
+			trace_arm_event(arm_err);
+		} else if (trace_unknown_sec_event_enabled()) {
 			void *unknown_err = acpi_hest_generic_data_payload(gdata);
 			trace_unknown_sec_event(&sec_type,
 					fru_id, fru_text, sec_sev,
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 545a6c2..e9fb56a 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -35,6 +35,7 @@ 
 #include <linux/printk.h>
 #include <linux/bcd.h>
 #include <acpi/ghes.h>
+#include <ras/ras_event.h>
 
 #define INDENT_SP	" "
 
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index fb2500b..8ba5a94 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -28,3 +28,4 @@  static int __init ras_init(void)
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(unknown_sec_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(arm_event);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 5861b6f..b36db48 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -162,6 +162,40 @@ 
 );
 
 /*
+ * ARM Processor Events Report
+ *
+ * This event is generated when hardware detects an ARM processor error
+ * has occurred. UEFI 2.6 spec section N.2.4.4.
+ */
+TRACE_EVENT(arm_event,
+
+	TP_PROTO(const struct cper_sec_proc_arm *proc),
+
+	TP_ARGS(proc),
+
+	TP_STRUCT__entry(
+		__field(u64, mpidr)
+		__field(u64, midr)
+		__field(u32, running_state)
+		__field(u32, psci_state)
+		__field(u8, affinity)
+	),
+
+	TP_fast_assign(
+		__entry->affinity = proc->affinity_level;
+		__entry->mpidr = proc->mpidr;
+		__entry->midr = proc->midr;
+		__entry->running_state = proc->running_state;
+		__entry->psci_state = proc->psci_state;
+	),
+
+	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+		  "running state: %d; PSCI state: %d",
+		  __entry->affinity, __entry->mpidr, __entry->midr,
+		  __entry->running_state, __entry->psci_state)
+);
+
+/*
  * Unknown Section Report
  *
  * This event is generated when hardware detected a hardware