diff mbox series

edac,ghes,cper: Add Row Extension to Memory Error Record

Message ID 20200727181445.111002-1-alex.kluver@hpe.com (mailing list archive)
State New, archived
Headers show
Series edac,ghes,cper: Add Row Extension to Memory Error Record | expand

Commit Message

Alex Kluver July 27, 2020, 6:14 p.m. UTC
Memory errors could be printed with incorrect row values since the DIMM
size has outgrown the 16 bit row field in the CPER structure. UEFI
Specification Version 2.8 has increased the size of row by allowing it to
use the first 2 bits from a previously reserved space within the structure.

When needed, add the extension bits to the row value printed.

Based on UEFI 2.8 Table 299. Memory Error Record

Tested-by: Russ Anderson <russ.anderson@hpe.com>
Signed-off-by: Alex Kluver <alex.kluver@hpe.com>
---
 drivers/edac/ghes_edac.c    | 10 ++++++++--
 drivers/firmware/efi/cper.c | 11 +++++++++--
 include/linux/cper.h        |  9 +++++++--
 3 files changed, 24 insertions(+), 6 deletions(-)

Comments

Ard Biesheuvel Aug. 14, 2020, 6:33 a.m. UTC | #1
On Mon, 27 Jul 2020 at 20:15, Alex Kluver <alex.kluver@hpe.com> wrote:
>
> Memory errors could be printed with incorrect row values since the DIMM
> size has outgrown the 16 bit row field in the CPER structure. UEFI
> Specification Version 2.8 has increased the size of row by allowing it to
> use the first 2 bits from a previously reserved space within the structure.
>
> When needed, add the extension bits to the row value printed.
>
> Based on UEFI 2.8 Table 299. Memory Error Record
>
> Tested-by: Russ Anderson <russ.anderson@hpe.com>
> Signed-off-by: Alex Kluver <alex.kluver@hpe.com>
> ---
>  drivers/edac/ghes_edac.c    | 10 ++++++++--
>  drivers/firmware/efi/cper.c | 11 +++++++++--
>  include/linux/cper.h        |  9 +++++++--
>  3 files changed, 24 insertions(+), 6 deletions(-)
>

Unless any of the EDAC folks object, I will take this as a fix via the EFI tree.

Thanks,
Ard.


> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index cb3dab56a875..cfa3156300f5 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>                 p += sprintf(p, "rank:%d ", mem_err->rank);
>         if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
>                 p += sprintf(p, "bank:%d ", mem_err->bank);
> -       if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
> -               p += sprintf(p, "row:%d ", mem_err->row);
> +       if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +               u32 row_extended = 0;
> +               if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +                       row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK)
> +                               <<CPER_MEM_EXT_ROW_SHIFT;
> +               row_extended |= mem_err->row;
> +               p += sprintf(p, "row:%d ", row_extended);
> +       }
>         if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
>                 p += sprintf(p, "col:%d ", mem_err->column);
>         if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index f564e15fbc7e..5faaf6ecd659 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -234,8 +234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
>                 n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
>         if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
>                 n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
> -       if (mem->validation_bits & CPER_MEM_VALID_ROW)
> -               n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
> +       if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +               u32 row_extended = 0;
> +               if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +                       row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK)
> +                               <<CPER_MEM_EXT_ROW_SHIFT;
> +               row_extended |= mem->row;
> +               n += scnprintf(msg + n, len - n, "row: %d ", row_extended);
> +       }
>         if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
>                 n += scnprintf(msg + n, len - n, "column: %d ", mem->column);
>         if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
> @@ -292,6 +298,7 @@ void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
>         cmem->requestor_id = mem->requestor_id;
>         cmem->responder_id = mem->responder_id;
>         cmem->target_id = mem->target_id;
> +       cmem->extended = mem->extended;
>         cmem->rank = mem->rank;
>         cmem->mem_array_handle = mem->mem_array_handle;
>         cmem->mem_dev_handle = mem->mem_dev_handle;
> diff --git a/include/linux/cper.h b/include/linux/cper.h
> index 8537e9282a65..c8313753ee49 100644
> --- a/include/linux/cper.h
> +++ b/include/linux/cper.h
> @@ -230,6 +230,10 @@ enum {
>  #define CPER_MEM_VALID_RANK_NUMBER             0x8000
>  #define CPER_MEM_VALID_CARD_HANDLE             0x10000
>  #define CPER_MEM_VALID_MODULE_HANDLE           0x20000
> +#define CPER_MEM_VALID_ROW_EXT                 0x40000
> +
> +#define CPER_MEM_EXT_ROW_MASK                  0x3
> +#define CPER_MEM_EXT_ROW_SHIFT                 16
>
>  #define CPER_PCIE_VALID_PORT_TYPE              0x0001
>  #define CPER_PCIE_VALID_VERSION                        0x0002
> @@ -443,7 +447,7 @@ struct cper_sec_mem_err_old {
>         u8      error_type;
>  };
>
> -/* Memory Error Section (UEFI >= v2.3), UEFI v2.7 sec N.2.5 */
> +/* Memory Error Section (UEFI >= v2.3), UEFI v2.8 sec N.2.5 */
>  struct cper_sec_mem_err {
>         u64     validation_bits;
>         u64     error_status;
> @@ -461,7 +465,7 @@ struct cper_sec_mem_err {
>         u64     responder_id;
>         u64     target_id;
>         u8      error_type;
> -       u8      reserved;
> +       u8      extended;
>         u16     rank;
>         u16     mem_array_handle;       /* "card handle" in UEFI 2.4 */
>         u16     mem_dev_handle;         /* "module handle" in UEFI 2.4 */
> @@ -483,6 +487,7 @@ struct cper_mem_err_compact {
>         u16     rank;
>         u16     mem_array_handle;
>         u16     mem_dev_handle;
> +       u8      extended;
>  };
>
>  /* PCI Express Error Section, UEFI v2.7 sec N.2.7 */
Borislav Petkov Aug. 15, 2020, 9:33 a.m. UTC | #2
On Mon, Jul 27, 2020 at 01:14:45PM -0500, Alex Kluver wrote:
> Memory errors could be printed with incorrect row values since the DIMM
> size has outgrown the 16 bit row field in the CPER structure. UEFI
> Specification Version 2.8 has increased the size of row by allowing it to
> use the first 2 bits from a previously reserved space within the structure.
> 
> When needed, add the extension bits to the row value printed.
> 
> Based on UEFI 2.8 Table 299. Memory Error Record
> 
> Tested-by: Russ Anderson <russ.anderson@hpe.com>
> Signed-off-by: Alex Kluver <alex.kluver@hpe.com>
> ---
>  drivers/edac/ghes_edac.c    | 10 ++++++++--
>  drivers/firmware/efi/cper.c | 11 +++++++++--
>  include/linux/cper.h        |  9 +++++++--
>  3 files changed, 24 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index cb3dab56a875..cfa3156300f5 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>  		p += sprintf(p, "rank:%d ", mem_err->rank);
>  	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
>  		p += sprintf(p, "bank:%d ", mem_err->bank);
> -	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
> -		p += sprintf(p, "row:%d ", mem_err->row);
> +	if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +		u32 row_extended = 0;
> +		if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +			row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK)
> +				<<CPER_MEM_EXT_ROW_SHIFT;
> +		row_extended |= mem_err->row;
> +		p += sprintf(p, "row:%d ", row_extended);
> +	}
>  	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
>  		p += sprintf(p, "col:%d ", mem_err->column);
>  	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index f564e15fbc7e..5faaf6ecd659 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -234,8 +234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
>  		n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
>  	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
>  		n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
> -	if (mem->validation_bits & CPER_MEM_VALID_ROW)
> -		n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
> +	if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +		u32 row_extended = 0;
> +		if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +			row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK)
> +				<<CPER_MEM_EXT_ROW_SHIFT;

This is not very readable.

> +		row_extended |= mem->row;
> +		n += scnprintf(msg + n, len - n, "row: %d ", row_extended);
> +	}

Both those hunks contain duplicated code which kinda wants to be an
inline function in cper.h which returns row_extended and gets called by
both sites. And then the call site can look very simple:

        if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
                row = mem_err->row;

        /* add row extension */
        row |= cper_get_mem_extension();

        p += sprintf(p, "row:%d ", row);

with

/*
 * Sketch of the proposed helper: yields the row-extension bits, already
 * shifted into position, so a caller can OR them into the 16-bit row value.
 * NOTE(review): mem_err is not declared in this sketch; presumably the
 * final version takes the memory error record as a parameter -- confirm.
 */
static inline u32 cper_get_mem_extension(void)
{
	/* Contribute nothing unless the ROW_EXT validation bit is set. */
	if (!(mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT))
		return 0;

	/* Mask the extension bits out of 'extended' and shift by CPER_MEM_EXT_ROW_SHIFT. */
	return (mem_err->extended & CPER_MEM_EXT_ROW_MASK) << CPER_MEM_EXT_ROW_SHIFT;
}

Something along those lines...

Thx.
Alex Kluver Aug. 17, 2020, 1:48 p.m. UTC | #3
Yes, I am working on a resubmit. The updated patch will be resubmitted in a series of patches that include other updates to the cper memory record.

Thanks,
--Alex Kluver



-----Original Message-----
From: Borislav Petkov <bp@alien8.de> 
Sent: Saturday, August 15, 2020 4:34 AM
To: Kluver, Alex <alex.kluver@hpe.com>
Cc: linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; ardb@kernel.org; mchehab@kernel.org; Anderson, Russ <russ.anderson@hpe.com>; Sivanich, Dimitri <dimitri.sivanich@hpe.com>
Subject: Re: [PATCH] edac,ghes,cper: Add Row Extension to Memory Error Record

On Mon, Jul 27, 2020 at 01:14:45PM -0500, Alex Kluver wrote:
> Memory errors could be printed with incorrect row values since the 
> DIMM size has outgrown the 16 bit row field in the CPER structure. 
> UEFI Specification Version 2.8 has increased the size of row by 
> allowing it to use the first 2 bits from a previously reserved space within the structure.
> 
> When needed, add the extension bits to the row value printed.
> 
> Based on UEFI 2.8 Table 299. Memory Error Record
> 
> Tested-by: Russ Anderson <russ.anderson@hpe.com>
> Signed-off-by: Alex Kluver <alex.kluver@hpe.com>
> ---
>  drivers/edac/ghes_edac.c    | 10 ++++++++--
>  drivers/firmware/efi/cper.c | 11 +++++++++--
>  include/linux/cper.h        |  9 +++++++--
>  3 files changed, 24 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
> index cb3dab56a875..cfa3156300f5 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
>  		p += sprintf(p, "rank:%d ", mem_err->rank);
>  	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
>  		p += sprintf(p, "bank:%d ", mem_err->bank);
> -	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
> -		p += sprintf(p, "row:%d ", mem_err->row);
> +	if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +		u32 row_extended = 0;
> +		if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +			row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK)
> +				<<CPER_MEM_EXT_ROW_SHIFT;
> +		row_extended |= mem_err->row;
> +		p += sprintf(p, "row:%d ", row_extended);
> +	}
>  	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
>  		p += sprintf(p, "col:%d ", mem_err->column);
>  	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index f564e15fbc7e..5faaf6ecd659 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -234,8 +234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
>  		n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
>  	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
>  		n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
> -	if (mem->validation_bits & CPER_MEM_VALID_ROW)
> -		n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
> +	if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +		u32 row_extended = 0;
> +		if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +			row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK)
> +				<<CPER_MEM_EXT_ROW_SHIFT;

This is not very readable.

> +		row_extended |= mem->row;
> +		n += scnprintf(msg + n, len - n, "row: %d ", row_extended);
> +	}

Both those hunks contain duplicated code which kinda wants to be an inline function in cper.h which returns row_extended and gets called by both sites. And then the call site can look very simple:

        if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
                row = mem_err->row;

        /* add row extension */
        row |= cper_get_mem_extension();

        p += sprintf(p, "row:%d ", row);

with

static inline u32 cper_get_mem_extension(void)
{
	if (!(mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT))
		return 0;

	return (mem_err->extended & CPER_MEM_EXT_ROW_MASK) << CPER_MEM_EXT_ROW_SHIFT;
}

Something along those lines...

Thx.

--
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
diff mbox series

Patch

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index cb3dab56a875..cfa3156300f5 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -337,8 +337,14 @@  void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 		p += sprintf(p, "rank:%d ", mem_err->rank);
 	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
 		p += sprintf(p, "bank:%d ", mem_err->bank);
-	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
-		p += sprintf(p, "row:%d ", mem_err->row);
+	if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
+		u32 row_extended = 0;
+		if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT)
+			row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK)
+				<<CPER_MEM_EXT_ROW_SHIFT;
+		row_extended |= mem_err->row;
+		p += sprintf(p, "row:%d ", row_extended);
+	}
 	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
 		p += sprintf(p, "col:%d ", mem_err->column);
 	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index f564e15fbc7e..5faaf6ecd659 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -234,8 +234,14 @@  static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
 		n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
 	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
 		n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
-	if (mem->validation_bits & CPER_MEM_VALID_ROW)
-		n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
+	if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
+		u32 row_extended = 0;
+		if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT)
+			row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK)
+				<<CPER_MEM_EXT_ROW_SHIFT;
+		row_extended |= mem->row;
+		n += scnprintf(msg + n, len - n, "row: %d ", row_extended);
+	}
 	if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
 		n += scnprintf(msg + n, len - n, "column: %d ", mem->column);
 	if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
@@ -292,6 +298,7 @@  void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
 	cmem->requestor_id = mem->requestor_id;
 	cmem->responder_id = mem->responder_id;
 	cmem->target_id = mem->target_id;
+	cmem->extended = mem->extended;
 	cmem->rank = mem->rank;
 	cmem->mem_array_handle = mem->mem_array_handle;
 	cmem->mem_dev_handle = mem->mem_dev_handle;
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 8537e9282a65..c8313753ee49 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -230,6 +230,10 @@  enum {
 #define CPER_MEM_VALID_RANK_NUMBER		0x8000
 #define CPER_MEM_VALID_CARD_HANDLE		0x10000
 #define CPER_MEM_VALID_MODULE_HANDLE		0x20000
+#define CPER_MEM_VALID_ROW_EXT			0x40000
+
+#define CPER_MEM_EXT_ROW_MASK			0x3
+#define CPER_MEM_EXT_ROW_SHIFT			16
 
 #define CPER_PCIE_VALID_PORT_TYPE		0x0001
 #define CPER_PCIE_VALID_VERSION			0x0002
@@ -443,7 +447,7 @@  struct cper_sec_mem_err_old {
 	u8	error_type;
 };
 
-/* Memory Error Section (UEFI >= v2.3), UEFI v2.7 sec N.2.5 */
+/* Memory Error Section (UEFI >= v2.3), UEFI v2.8 sec N.2.5 */
 struct cper_sec_mem_err {
 	u64	validation_bits;
 	u64	error_status;
@@ -461,7 +465,7 @@  struct cper_sec_mem_err {
 	u64	responder_id;
 	u64	target_id;
 	u8	error_type;
-	u8	reserved;
+	u8	extended;
 	u16	rank;
 	u16	mem_array_handle;	/* "card handle" in UEFI 2.4 */
 	u16	mem_dev_handle;		/* "module handle" in UEFI 2.4 */
@@ -483,6 +487,7 @@  struct cper_mem_err_compact {
 	u16	rank;
 	u16	mem_array_handle;
 	u16	mem_dev_handle;
+	u8      extended;
 };
 
 /* PCI Express Error Section, UEFI v2.7 sec N.2.7 */