diff mbox

[V15,09/11] ras: acpi / apei: generate trace event for unrecognized CPER section

Message ID 1492556723-9189-10-git-send-email-tbaicar@codeaurora.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Tyler Baicar April 18, 2017, 11:05 p.m. UTC
UEFI spec allows for non-standard section in Common Platform Error
Record. This is defined in section N.2.3 of UEFI version 2.5.

Currently if the CPER section's type (UUID) does not match with
any section type that the kernel knows how to parse, trace event
is not generated for such section. And thus user is not able to know
happening of such hardware error, including error record of
non-standard section.

This commit generates a trace event which contains raw error data
for unrecognized CPER section.

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Tested-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/acpi/apei/ghes.c | 27 +++++++++++++++++++++++----
 drivers/ras/ras.c        |  1 +
 include/ras/ras_event.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 4 deletions(-)

Comments

Borislav Petkov May 5, 2017, 5:53 p.m. UTC | #1
On Tue, Apr 18, 2017 at 05:05:21PM -0600, Tyler Baicar wrote:
> UEFI spec allows for non-standard section in Common Platform Error
> Record. This is defined in section N.2.3 of UEFI version 2.5.

If the spec calls it non-standard why are we calling it "unknown
section"?

> Currently if the CPER section's type (UUID) does not match with
> any section type that the kernel knows how to parse, trace event
> is not generated for such section. And thus user is not able to know
> happening of such hardware error, including error record of
> non-standard section.

That sentence sounds funny.

> This commit generates a trace event which contains raw error data
> for unrecognized CPER section.

Never write "This commit" or "This patch" in your commit message -
that's a given.

> 
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> Tested-by: Shiju Jose <shiju.jose@huawei.com>
> ---
>  drivers/acpi/apei/ghes.c | 27 +++++++++++++++++++++++----
>  drivers/ras/ras.c        |  1 +
>  include/ras/ras_event.h  | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 69 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index b91123f..3d9f63b 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -45,11 +45,13 @@
>  #include <linux/aer.h>
>  #include <linux/nmi.h>
>  #include <linux/sched/clock.h>
> +#include <linux/uuid.h>
>  
>  #include <acpi/actbl1.h>
>  #include <acpi/ghes.h>
>  #include <acpi/apei.h>
>  #include <asm/tlbflush.h>
> +#include <ras/ras_event.h>
>  
>  #include "apei-internal.h"
>  
> @@ -461,12 +463,21 @@ static void ghes_do_proc(struct ghes *ghes,
>  {
>  	int sev, sec_sev;
>  	struct acpi_hest_generic_data *gdata;
> +	uuid_le sec_type;
> +	uuid_le *fru_id = &NULL_UUID_LE;
> +	char *fru_text = "";
>  
>  	sev = ghes_severity(estatus->error_severity);
>  	apei_estatus_for_each_section(estatus, gdata) {
>  		sec_sev = ghes_severity(gdata->error_severity);
> -		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
> -				 CPER_SEC_PLATFORM_MEM)) {
> +		sec_type = *(uuid_le *)gdata->section_type;
> +
> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
> +			fru_id = (uuid_le *)gdata->fru_id;
> +		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
> +			fru_text = gdata->fru_text;
> +
> +		if (!uuid_le_cmp(sec_type, CPER_SEC_PLATFORM_MEM)) {
>  			struct cper_sec_mem_err *mem_err;
>  			mem_err = acpi_hest_get_payload(gdata);
>  			ghes_edac_report_mem_error(ghes, sev, mem_err);
> @@ -475,8 +486,7 @@ static void ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_memory_failure(gdata, sev);
>  		}
>  #ifdef CONFIG_ACPI_APEI_PCIEAER
> -		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
> -				      CPER_SEC_PCIE)) {
> +		else if (!uuid_le_cmp(sec_type, CPER_SEC_PCIE)) {
>  			struct cper_sec_pcie *pcie_err;
>  			pcie_err = acpi_hest_get_payload(gdata);
>  			if (sev == GHES_SEV_RECOVERABLE &&
> @@ -507,6 +517,15 @@ static void ghes_do_proc(struct ghes *ghes,
>  
>  		}
>  #endif
> +#ifdef CONFIG_RAS
> +		else if (trace_unknown_sec_event_enabled()) {
> +			void *unknown_err = acpi_hest_get_payload(gdata);
> +
> +			trace_unknown_sec_event(&sec_type,
> +					fru_id, fru_text, sec_sev,
> +					unknown_err, gdata->error_data_length);
> +		}
> +#endif

Put that in a function in ras.c along with a prototype for
include/linux/ras.h for the !CONFIG_RAS case so that you can save
yourself the ifdeffery in an already not really easy to read function.
Steven Rostedt May 5, 2017, 6:44 p.m. UTC | #2
Sorry for the late reply. Borislav pinged me to look at this.


On Tue, 18 Apr 2017 17:05:21 -0600
Tyler Baicar <tbaicar@codeaurora.org> wrote:



> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index 1791a12..5861b6f 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -162,6 +162,51 @@
>  );
>  
>  /*
> + * Unknown Section Report
> + *
> + * This event is generated when hardware detected a hardware
> + * error event, which may be of non-standard section as defined
> + * in UEFI spec appendix "Common Platform Error Record", or may
> + * be of sections for which TRACE_EVENT is not defined.
> + *
> + */
> +TRACE_EVENT(unknown_sec_event,
> +
> +	TP_PROTO(const uuid_le *sec_type,
> +		 const uuid_le *fru_id,
> +		 const char *fru_text,
> +		 const u8 sev,
> +		 const u8 *err,
> +		 const u32 len),
> +
> +	TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
> +
> +	TP_STRUCT__entry(
> +		__array(char, sec_type, 16)
> +		__array(char, fru_id, 16)
> +		__string(fru_text, fru_text)
> +		__field(u8, sev)
> +		__field(u32, len)
> +		__dynamic_array(u8, buf, len)
> +	),
> +
> +	TP_fast_assign(
> +		memcpy(__entry->sec_type, sec_type, sizeof(uuid_le));
> +		memcpy(__entry->fru_id, fru_id, sizeof(uuid_le));

My only concern here is that you are using sizeof(uuid_le) into an
array that is hardcoded as 16 bytes. I don't expect the size of uuid_le
to ever change, but if it does, you just created an exploit.

I would suggest having a macro for the size of uuid_le and using it both
here and in include/uapi/linux/uuid.h.

#define UUID_SIZE 16

typedef struct {
	__u8 b[UUID_SIZE];
} uuid_le;

And then we can just use UUID_SIZE safely here:

	__array(char, sec_type, UUID_SIZE)

[...]

	memcpy(__entry->sec_type, sec_type, UUID_SIZE);

Alternatively we could add in the C file that defines the tracepoints:

BUILD_BUG_ON(sizeof(uuid_le) > 16);

But that's hacky.


> +		__assign_str(fru_text, fru_text);
> +		__entry->sev = sev;
> +		__entry->len = len;
> +		memcpy(__get_dynamic_array(buf), err, len);
> +	),
> +
> +	TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
> +		  __entry->sev, __entry->sec_type,

Hmm, I wonder if %pU is defined in the libtraceevent library?

-- Steve

> +		  __entry->fru_id, __get_str(fru_text),
> +		  __entry->len,
> +		  __print_hex(__get_dynamic_array(buf), __entry->len))
> +);
> +
> +/*
>   * PCIe AER Trace event
>   *
>   * These events are generated when hardware detects a corrected or

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index b91123f..3d9f63b 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -45,11 +45,13 @@ 
 #include <linux/aer.h>
 #include <linux/nmi.h>
 #include <linux/sched/clock.h>
+#include <linux/uuid.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
 #include <asm/tlbflush.h>
+#include <ras/ras_event.h>
 
 #include "apei-internal.h"
 
@@ -461,12 +463,21 @@  static void ghes_do_proc(struct ghes *ghes,
 {
 	int sev, sec_sev;
 	struct acpi_hest_generic_data *gdata;
+	uuid_le sec_type;
+	uuid_le *fru_id = &NULL_UUID_LE;
+	char *fru_text = "";
 
 	sev = ghes_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
 		sec_sev = ghes_severity(gdata->error_severity);
-		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
-				 CPER_SEC_PLATFORM_MEM)) {
+		sec_type = *(uuid_le *)gdata->section_type;
+
+		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+			fru_id = (uuid_le *)gdata->fru_id;
+		if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+			fru_text = gdata->fru_text;
+
+		if (!uuid_le_cmp(sec_type, CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err;
 			mem_err = acpi_hest_get_payload(gdata);
 			ghes_edac_report_mem_error(ghes, sev, mem_err);
@@ -475,8 +486,7 @@  static void ghes_do_proc(struct ghes *ghes,
 			ghes_handle_memory_failure(gdata, sev);
 		}
 #ifdef CONFIG_ACPI_APEI_PCIEAER
-		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
-				      CPER_SEC_PCIE)) {
+		else if (!uuid_le_cmp(sec_type, CPER_SEC_PCIE)) {
 			struct cper_sec_pcie *pcie_err;
 			pcie_err = acpi_hest_get_payload(gdata);
 			if (sev == GHES_SEV_RECOVERABLE &&
@@ -507,6 +517,15 @@  static void ghes_do_proc(struct ghes *ghes,
 
 		}
 #endif
+#ifdef CONFIG_RAS
+		else if (trace_unknown_sec_event_enabled()) {
+			void *unknown_err = acpi_hest_get_payload(gdata);
+
+			trace_unknown_sec_event(&sec_type,
+					fru_id, fru_text, sec_sev,
+					unknown_err, gdata->error_data_length);
+		}
+#endif
 	}
 }
 
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd36..fb2500b 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,4 @@  static int __init ras_init(void)
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(unknown_sec_event);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 1791a12..5861b6f 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -162,6 +162,51 @@ 
 );
 
 /*
+ * Unknown Section Report
+ *
+ * This event is generated when hardware detected a hardware
+ * error event, which may be of non-standard section as defined
+ * in UEFI spec appendix "Common Platform Error Record", or may
+ * be of sections for which TRACE_EVENT is not defined.
+ *
+ */
+TRACE_EVENT(unknown_sec_event,
+
+	TP_PROTO(const uuid_le *sec_type,
+		 const uuid_le *fru_id,
+		 const char *fru_text,
+		 const u8 sev,
+		 const u8 *err,
+		 const u32 len),
+
+	TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
+
+	TP_STRUCT__entry(
+		__array(char, sec_type, 16)
+		__array(char, fru_id, 16)
+		__string(fru_text, fru_text)
+		__field(u8, sev)
+		__field(u32, len)
+		__dynamic_array(u8, buf, len)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->sec_type, sec_type, sizeof(uuid_le));
+		memcpy(__entry->fru_id, fru_id, sizeof(uuid_le));
+		__assign_str(fru_text, fru_text);
+		__entry->sev = sev;
+		__entry->len = len;
+		memcpy(__get_dynamic_array(buf), err, len);
+	),
+
+	TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
+		  __entry->sev, __entry->sec_type,
+		  __entry->fru_id, __get_str(fru_text),
+		  __entry->len,
+		  __print_hex(__get_dynamic_array(buf), __entry->len))
+);
+
+/*
  * PCIe AER Trace event
  *
  * These events are generated when hardware detects a corrected or