diff mbox

[V3,08/10] ras: acpi / apei: generate trace event for unrecognized CPER section

Message ID 1475875882-2604-9-git-send-email-tbaicar@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Tyler Baicar Oct. 7, 2016, 9:31 p.m. UTC
UEFI spec allows for non-standard section in Common Platform Error
Record. This is defined in section N.2.3 of UEFI version 2.5.

Currently if the CPER section's type (UUID) does not match with
any section type that the kernel knows how to parse, trace event
is not generated for such section. And thus user is not able to know
happening of such hardware error, including error record of
non-standard section.

This commit generates a trace event which contains raw error data
for unrecognized CPER section.

Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 18 +++++++++++++++++-
 drivers/ras/ras.c        |  1 +
 include/ras/ras_event.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 1 deletion(-)

Comments

Punit Agrawal Oct. 13, 2016, 10:54 a.m. UTC | #1
Hi Tyler,

One last comment...

Tyler Baicar <tbaicar@codeaurora.org> writes:

> UEFI spec allows for non-standard section in Common Platform Error
> Record. This is defined in section N.2.3 of UEFI version 2.5.
>
> Currently if the CPER section's type (UUID) does not match with
> any section type that the kernel knows how to parse, trace event
> is not generated for such section. And thus user is not able to know
> happening of such hardware error, including error record of
> non-standard section.
>
> This commit generates a trace event which contains raw error data
> for unrecognized CPER section.
>
> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> ---
>  drivers/acpi/apei/ghes.c | 18 +++++++++++++++++-
>  drivers/ras/ras.c        |  1 +
>  include/ras/ras_event.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 36894c8..cb4c7f4 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -49,6 +49,7 @@
>  #include <acpi/ghes.h>
>  #include <acpi/apei.h>
>  #include <asm/tlbflush.h>
> +#include <ras/ras_event.h>
>  
>  #ifdef CONFIG_HAVE_ACPI_APEI_SEA
>  #include <asm/system_misc.h>
> @@ -468,12 +469,21 @@ static void ghes_do_proc(struct ghes *ghes,
>  	int sev, sec_sev;
>  	struct acpi_hest_generic_data *gdata;
>  	uuid_le sec_type;
> +	uuid_le *fru_id;
> +	char *fru_text = "";
>  
>  	sev = ghes_severity(estatus->error_severity);
>  	apei_estatus_for_each_section(estatus, gdata) {
>  		sec_sev = ghes_severity(gdata->error_severity);
>  		sec_type = *(uuid_le *)gdata->section_type;
>  
> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
> +			fru_id = (uuid_le *)gdata->fru_id;
> +		else
> +			fru_id = &NULL_UUID_LE;

fru_id can be initialised at declaration and drop the else here. The
same is already being done for fru_text.

Thanks,
Punit

> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
> +			fru_text = gdata->fru_text;
> +
>  		if (!uuid_le_cmp(sec_type,
>  				 CPER_SEC_PLATFORM_MEM)) {
>  			struct cper_sec_mem_err *mem_err;

[...]
Tyler Baicar Oct. 13, 2016, 8:15 p.m. UTC | #2
Hello Punit,

On 10/13/2016 4:54 AM, Punit Agrawal wrote:
> Hi Tyler,
>
> One last comment...
>
> Tyler Baicar <tbaicar@codeaurora.org> writes:
>
>> UEFI spec allows for non-standard section in Common Platform Error
>> Record. This is defined in section N.2.3 of UEFI version 2.5.
>>
>> Currently if the CPER section's type (UUID) does not match with
>> any section type that the kernel knows how to parse, trace event
>> is not generated for such section. And thus user is not able to know
>> happening of such hardware error, including error record of
>> non-standard section.
>>
>> This commit generates a trace event which contains raw error data
>> for unrecognized CPER section.
>>
>> Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> ---
>>   drivers/acpi/apei/ghes.c | 18 +++++++++++++++++-
>>   drivers/ras/ras.c        |  1 +
>>   include/ras/ras_event.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 63 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 36894c8..cb4c7f4 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -49,6 +49,7 @@
>>   #include <acpi/ghes.h>
>>   #include <acpi/apei.h>
>>   #include <asm/tlbflush.h>
>> +#include <ras/ras_event.h>
>>   
>>   #ifdef CONFIG_HAVE_ACPI_APEI_SEA
>>   #include <asm/system_misc.h>
>> @@ -468,12 +469,21 @@ static void ghes_do_proc(struct ghes *ghes,
>>   	int sev, sec_sev;
>>   	struct acpi_hest_generic_data *gdata;
>>   	uuid_le sec_type;
>> +	uuid_le *fru_id;
>> +	char *fru_text = "";
>>   
>>   	sev = ghes_severity(estatus->error_severity);
>>   	apei_estatus_for_each_section(estatus, gdata) {
>>   		sec_sev = ghes_severity(gdata->error_severity);
>>   		sec_type = *(uuid_le *)gdata->section_type;
>>   
>> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
>> +			fru_id = (uuid_le *)gdata->fru_id;
>> +		else
>> +			fru_id = &NULL_UUID_LE;
> fru_id can be initialised at declaration and drop the else here. The
> same is already being done for fru_text.
Yes, I will make this change in the next version.

Thanks,
Tyler
> Thanks,
> Punit
>
>> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
>> +			fru_text = gdata->fru_text;
>> +
>>   		if (!uuid_le_cmp(sec_type,
>>   				 CPER_SEC_PLATFORM_MEM)) {
>>   			struct cper_sec_mem_err *mem_err;
> [...]
>
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 36894c8..cb4c7f4 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -49,6 +49,7 @@ 
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
 #include <asm/tlbflush.h>
+#include <ras/ras_event.h>
 
 #ifdef CONFIG_HAVE_ACPI_APEI_SEA
 #include <asm/system_misc.h>
@@ -468,12 +469,21 @@  static void ghes_do_proc(struct ghes *ghes,
 	int sev, sec_sev;
 	struct acpi_hest_generic_data *gdata;
 	uuid_le sec_type;
+	uuid_le *fru_id;
+	char *fru_text = "";
 
 	sev = ghes_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
 		sec_sev = ghes_severity(gdata->error_severity);
 		sec_type = *(uuid_le *)gdata->section_type;
 
+		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+			fru_id = (uuid_le *)gdata->fru_id;
+		else
+			fru_id = &NULL_UUID_LE;
+		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+			fru_text = gdata->fru_text;
+
 		if (!uuid_le_cmp(sec_type,
 				 CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err;
@@ -485,7 +495,7 @@  static void ghes_do_proc(struct ghes *ghes,
 			ghes_handle_memory_failure(gdata, sev);
 		}
 #ifdef CONFIG_ACPI_APEI_PCIEAER
-		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+		else if (!uuid_le_cmp(sec_type,
 				      CPER_SEC_PCIE)) {
 			struct cper_sec_pcie *pcie_err;
 
@@ -518,6 +528,12 @@  static void ghes_do_proc(struct ghes *ghes,
 
 		}
 #endif
+		else {
+			void *unknown_err = acpi_hest_generic_data_payload(gdata);
+			trace_unknown_sec_event(&sec_type,
+					fru_id, fru_text, sec_sev,
+					unknown_err, gdata->error_data_length);
+		}
 	}
 }
 
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd36..fb2500b 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,4 @@  subsys_initcall(ras_init);
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(unknown_sec_event);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 1791a12..5861b6f 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -162,6 +162,51 @@  TRACE_EVENT(mc_event,
 );
 
 /*
+ * Unknown Section Report
+ *
+ * This event is generated when hardware detected a hardware
+ * error event, which may be of non-standard section as defined
+ * in UEFI spec appendix "Common Platform Error Record", or may
+ * be of sections for which TRACE_EVENT is not defined.
+ *
+ */
+TRACE_EVENT(unknown_sec_event,
+
+	TP_PROTO(const uuid_le *sec_type,
+		 const uuid_le *fru_id,
+		 const char *fru_text,
+		 const u8 sev,
+		 const u8 *err,
+		 const u32 len),
+
+	TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
+
+	TP_STRUCT__entry(
+		__array(char, sec_type, 16)
+		__array(char, fru_id, 16)
+		__string(fru_text, fru_text)
+		__field(u8, sev)
+		__field(u32, len)
+		__dynamic_array(u8, buf, len)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->sec_type, sec_type, sizeof(uuid_le));
+		memcpy(__entry->fru_id, fru_id, sizeof(uuid_le));
+		__assign_str(fru_text, fru_text);
+		__entry->sev = sev;
+		__entry->len = len;
+		memcpy(__get_dynamic_array(buf), err, len);
+	),
+
+	TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
+		  __entry->sev, __entry->sec_type,
+		  __entry->fru_id, __get_str(fru_text),
+		  __entry->len,
+		  __print_hex(__get_dynamic_array(buf), __entry->len))
+);
+
+/*
  * PCIe AER Trace event
  *
  * These events are generated when hardware detects a corrected or