Message ID | 20200727181445.111002-1-alex.kluver@hpe.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | edac,ghes,cper: Add Row Extension to Memory Error Record | expand |
On Mon, 27 Jul 2020 at 20:15, Alex Kluver <alex.kluver@hpe.com> wrote: > > Memory errors could be printed with incorrect row values since the DIMM > size has outgrown the 16 bit row field in the CPER structure. UEFI > Specification Version 2.8 has increased the size of row by allowing it to > use the first 2 bits from a previously reserved space within the structure. > > When needed, add the extension bits to the row value printed. > > Based on UEFI 2.8 Table 299. Memory Error Record > > Tested-by: Russ Anderson <russ.anderson@hpe.com> > Signed-off-by: Alex Kluver <alex.kluver@hpe.com> > --- > drivers/edac/ghes_edac.c | 10 ++++++++-- > drivers/firmware/efi/cper.c | 11 +++++++++-- > include/linux/cper.h | 9 +++++++-- > 3 files changed, 24 insertions(+), 6 deletions(-) > Unless any of the EDAC folks object, I will take this as a fix via the EFI tree. Thanks, Ard. > diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c > index cb3dab56a875..cfa3156300f5 100644 > --- a/drivers/edac/ghes_edac.c > +++ b/drivers/edac/ghes_edac.c > @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) > p += sprintf(p, "rank:%d ", mem_err->rank); > if (mem_err->validation_bits & CPER_MEM_VALID_BANK) > p += sprintf(p, "bank:%d ", mem_err->bank); > - if (mem_err->validation_bits & CPER_MEM_VALID_ROW) > - p += sprintf(p, "row:%d ", mem_err->row); > + if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { > + u32 row_extended = 0; > + if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT) > + row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK) > + <<CPER_MEM_EXT_ROW_SHIFT; > + row_extended |= mem_err->row; > + p += sprintf(p, "row:%d ", row_extended); > + } > if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) > p += sprintf(p, "col:%d ", mem_err->column); > if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) > diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c > index 
f564e15fbc7e..5faaf6ecd659 100644 > --- a/drivers/firmware/efi/cper.c > +++ b/drivers/firmware/efi/cper.c > @@ -234,8 +234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) > n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank); > if (mem->validation_bits & CPER_MEM_VALID_DEVICE) > n += scnprintf(msg + n, len - n, "device: %d ", mem->device); > - if (mem->validation_bits & CPER_MEM_VALID_ROW) > - n += scnprintf(msg + n, len - n, "row: %d ", mem->row); > + if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { > + u32 row_extended = 0; > + if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT) > + row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK) > + <<CPER_MEM_EXT_ROW_SHIFT; > + row_extended |= mem->row; > + n += scnprintf(msg + n, len - n, "row: %d ", row_extended); > + } > if (mem->validation_bits & CPER_MEM_VALID_COLUMN) > n += scnprintf(msg + n, len - n, "column: %d ", mem->column); > if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) > @@ -292,6 +298,7 @@ void cper_mem_err_pack(const struct cper_sec_mem_err *mem, > cmem->requestor_id = mem->requestor_id; > cmem->responder_id = mem->responder_id; > cmem->target_id = mem->target_id; > + cmem->extended = mem->extended; > cmem->rank = mem->rank; > cmem->mem_array_handle = mem->mem_array_handle; > cmem->mem_dev_handle = mem->mem_dev_handle; > diff --git a/include/linux/cper.h b/include/linux/cper.h > index 8537e9282a65..c8313753ee49 100644 > --- a/include/linux/cper.h > +++ b/include/linux/cper.h > @@ -230,6 +230,10 @@ enum { > #define CPER_MEM_VALID_RANK_NUMBER 0x8000 > #define CPER_MEM_VALID_CARD_HANDLE 0x10000 > #define CPER_MEM_VALID_MODULE_HANDLE 0x20000 > +#define CPER_MEM_VALID_ROW_EXT 0x40000 > + > +#define CPER_MEM_EXT_ROW_MASK 0x3 > +#define CPER_MEM_EXT_ROW_SHIFT 16 > > #define CPER_PCIE_VALID_PORT_TYPE 0x0001 > #define CPER_PCIE_VALID_VERSION 0x0002 > @@ -443,7 +447,7 @@ struct cper_sec_mem_err_old { > u8 error_type; > }; > > -/* Memory 
Error Section (UEFI >= v2.3), UEFI v2.7 sec N.2.5 */ > +/* Memory Error Section (UEFI >= v2.3), UEFI v2.8 sec N.2.5 */ > struct cper_sec_mem_err { > u64 validation_bits; > u64 error_status; > @@ -461,7 +465,7 @@ struct cper_sec_mem_err { > u64 responder_id; > u64 target_id; > u8 error_type; > - u8 reserved; > + u8 extended; > u16 rank; > u16 mem_array_handle; /* "card handle" in UEFI 2.4 */ > u16 mem_dev_handle; /* "module handle" in UEFI 2.4 */ > @@ -483,6 +487,7 @@ struct cper_mem_err_compact { > u16 rank; > u16 mem_array_handle; > u16 mem_dev_handle; > + u8 extended; > }; > > /* PCI Express Error Section, UEFI v2.7 sec N.2.7 */
On Mon, Jul 27, 2020 at 01:14:45PM -0500, Alex Kluver wrote: > Memory errors could be printed with incorrect row values since the DIMM > size has outgrown the 16 bit row field in the CPER structure. UEFI > Specification Version 2.8 has increased the size of row by allowing it to > use the first 2 bits from a previously reserved space within the structure. > > When needed, add the extension bits to the row value printed. > > Based on UEFI 2.8 Table 299. Memory Error Record > > Tested-by: Russ Anderson <russ.anderson@hpe.com> > Signed-off-by: Alex Kluver <alex.kluver@hpe.com> > --- > drivers/edac/ghes_edac.c | 10 ++++++++-- > drivers/firmware/efi/cper.c | 11 +++++++++-- > include/linux/cper.h | 9 +++++++-- > 3 files changed, 24 insertions(+), 6 deletions(-) > > diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c > index cb3dab56a875..cfa3156300f5 100644 > --- a/drivers/edac/ghes_edac.c > +++ b/drivers/edac/ghes_edac.c > @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) > p += sprintf(p, "rank:%d ", mem_err->rank); > if (mem_err->validation_bits & CPER_MEM_VALID_BANK) > p += sprintf(p, "bank:%d ", mem_err->bank); > - if (mem_err->validation_bits & CPER_MEM_VALID_ROW) > - p += sprintf(p, "row:%d ", mem_err->row); > + if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { > + u32 row_extended = 0; > + if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT) > + row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK) > + <<CPER_MEM_EXT_ROW_SHIFT; > + row_extended |= mem_err->row; > + p += sprintf(p, "row:%d ", row_extended); > + } > if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) > p += sprintf(p, "col:%d ", mem_err->column); > if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) > diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c > index f564e15fbc7e..5faaf6ecd659 100644 > --- a/drivers/firmware/efi/cper.c > +++ b/drivers/firmware/efi/cper.c > @@ -234,8 
+234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
>          n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
>      if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
>          n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
> -    if (mem->validation_bits & CPER_MEM_VALID_ROW)
> -        n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
> +    if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
> +        u32 row_extended = 0;
> +        if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +            row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK)
> +                    <<CPER_MEM_EXT_ROW_SHIFT;

This is not very readable.

> +        row_extended |= mem->row;
> +        n += scnprintf(msg + n, len - n, "row: %d ", row_extended);
> +    }

Both those hunks contain duplicated code which kinda wants to be an inline function in cper.h which returns row_extended and gets called by both sites.

And then the call site can look very simple:

    if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
        row = mem_err->row;

    /* add row extension */
    row |= cper_get_mem_extension();

    p += sprintf(p, "row:%d ", row);

with

static inline u32 cper_get_mem_extension(void)
{
    if (!(mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT))
        return 0;

    return (mem_err->extended & CPER_MEM_EXT_ROW_MASK) << CPER_MEM_EXT_ROW_SHIFT;
}

Something along those lines...

Thx.
Yes, I am working on a resubmit. The updated patch will be resubmitted in a series of patches that include other updates to the cper memory record. Thanks, --Alex Kluver -----Original Message----- From: Borislav Petkov <bp@alien8.de> Sent: Saturday, August 15, 2020 4:34 AM To: Kluver, Alex <alex.kluver@hpe.com> Cc: linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; ardb@kernel.org; mchehab@kernel.org; Anderson, Russ <russ.anderson@hpe.com>; Sivanich, Dimitri <dimitri.sivanich@hpe.com> Subject: Re: [PATCH] edac,ghes,cper: Add Row Extension to Memory Error Record On Mon, Jul 27, 2020 at 01:14:45PM -0500, Alex Kluver wrote: > Memory errors could be printed with incorrect row values since the > DIMM size has outgrown the 16 bit row field in the CPER structure. > UEFI Specification Version 2.8 has increased the size of row by > allowing it to use the first 2 bits from a previously reserved space within the structure. > > When needed, add the extension bits to the row value printed. > > Based on UEFI 2.8 Table 299. 
Memory Error Record > > Tested-by: Russ Anderson <russ.anderson@hpe.com> > Signed-off-by: Alex Kluver <alex.kluver@hpe.com> > --- > drivers/edac/ghes_edac.c | 10 ++++++++-- > drivers/firmware/efi/cper.c | 11 +++++++++-- > include/linux/cper.h | 9 +++++++-- > 3 files changed, 24 insertions(+), 6 deletions(-) > > diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index > cb3dab56a875..cfa3156300f5 100644 > --- a/drivers/edac/ghes_edac.c > +++ b/drivers/edac/ghes_edac.c > @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) > p += sprintf(p, "rank:%d ", mem_err->rank); > if (mem_err->validation_bits & CPER_MEM_VALID_BANK) > p += sprintf(p, "bank:%d ", mem_err->bank); > - if (mem_err->validation_bits & CPER_MEM_VALID_ROW) > - p += sprintf(p, "row:%d ", mem_err->row); > + if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { > + u32 row_extended = 0; > + if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT) > + row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK) > + <<CPER_MEM_EXT_ROW_SHIFT; > + row_extended |= mem_err->row; > + p += sprintf(p, "row:%d ", row_extended); > + } > if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) > p += sprintf(p, "col:%d ", mem_err->column); > if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) diff > --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c > index f564e15fbc7e..5faaf6ecd659 100644 > --- a/drivers/firmware/efi/cper.c > +++ b/drivers/firmware/efi/cper.c > @@ -234,8 +234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) > n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank); > if (mem->validation_bits & CPER_MEM_VALID_DEVICE) > n += scnprintf(msg + n, len - n, "device: %d ", mem->device); > - if (mem->validation_bits & CPER_MEM_VALID_ROW) > - n += scnprintf(msg + n, len - n, "row: %d ", mem->row); > + if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { 
> +        u32 row_extended = 0;
> +        if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT)
> +            row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK)
> +                    <<CPER_MEM_EXT_ROW_SHIFT;

This is not very readable.

> +        row_extended |= mem->row;
> +        n += scnprintf(msg + n, len - n, "row: %d ", row_extended);
> +    }

Both those hunks contain duplicated code which kinda wants to be an inline function in cper.h which returns row_extended and gets called by both sites.

And then the call site can look very simple:

    if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
        row = mem_err->row;

    /* add row extension */
    row |= cper_get_mem_extension();

    p += sprintf(p, "row:%d ", row);

with

static inline u32 cper_get_mem_extension(void)
{
    if (!(mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT))
        return 0;

    return (mem_err->extended & CPER_MEM_EXT_ROW_MASK) << CPER_MEM_EXT_ROW_SHIFT;
}

Something along those lines...

Thx.

--
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index cb3dab56a875..cfa3156300f5 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -337,8 +337,14 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) p += sprintf(p, "rank:%d ", mem_err->rank); if (mem_err->validation_bits & CPER_MEM_VALID_BANK) p += sprintf(p, "bank:%d ", mem_err->bank); - if (mem_err->validation_bits & CPER_MEM_VALID_ROW) - p += sprintf(p, "row:%d ", mem_err->row); + if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { + u32 row_extended = 0; + if (mem_err->validation_bits & CPER_MEM_VALID_ROW_EXT) + row_extended = (mem_err->extended & CPER_MEM_EXT_ROW_MASK) + <<CPER_MEM_EXT_ROW_SHIFT; + row_extended |= mem_err->row; + p += sprintf(p, "row:%d ", row_extended); + } if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) p += sprintf(p, "col:%d ", mem_err->column); if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index f564e15fbc7e..5faaf6ecd659 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -234,8 +234,14 @@ static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank); if (mem->validation_bits & CPER_MEM_VALID_DEVICE) n += scnprintf(msg + n, len - n, "device: %d ", mem->device); - if (mem->validation_bits & CPER_MEM_VALID_ROW) - n += scnprintf(msg + n, len - n, "row: %d ", mem->row); + if (mem->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) { + u32 row_extended = 0; + if (mem->validation_bits & CPER_MEM_VALID_ROW_EXT) + row_extended = (mem->extended & CPER_MEM_EXT_ROW_MASK) + <<CPER_MEM_EXT_ROW_SHIFT; + row_extended |= mem->row; + n += scnprintf(msg + n, len - n, "row: %d ", row_extended); + } if (mem->validation_bits & CPER_MEM_VALID_COLUMN) n += scnprintf(msg + n, len - n, "column: %d ", mem->column); if 
(mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) @@ -292,6 +298,7 @@ void cper_mem_err_pack(const struct cper_sec_mem_err *mem, cmem->requestor_id = mem->requestor_id; cmem->responder_id = mem->responder_id; cmem->target_id = mem->target_id; + cmem->extended = mem->extended; cmem->rank = mem->rank; cmem->mem_array_handle = mem->mem_array_handle; cmem->mem_dev_handle = mem->mem_dev_handle; diff --git a/include/linux/cper.h b/include/linux/cper.h index 8537e9282a65..c8313753ee49 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -230,6 +230,10 @@ enum { #define CPER_MEM_VALID_RANK_NUMBER 0x8000 #define CPER_MEM_VALID_CARD_HANDLE 0x10000 #define CPER_MEM_VALID_MODULE_HANDLE 0x20000 +#define CPER_MEM_VALID_ROW_EXT 0x40000 + +#define CPER_MEM_EXT_ROW_MASK 0x3 +#define CPER_MEM_EXT_ROW_SHIFT 16 #define CPER_PCIE_VALID_PORT_TYPE 0x0001 #define CPER_PCIE_VALID_VERSION 0x0002 @@ -443,7 +447,7 @@ struct cper_sec_mem_err_old { u8 error_type; }; -/* Memory Error Section (UEFI >= v2.3), UEFI v2.7 sec N.2.5 */ +/* Memory Error Section (UEFI >= v2.3), UEFI v2.8 sec N.2.5 */ struct cper_sec_mem_err { u64 validation_bits; u64 error_status; @@ -461,7 +465,7 @@ struct cper_sec_mem_err { u64 responder_id; u64 target_id; u8 error_type; - u8 reserved; + u8 extended; u16 rank; u16 mem_array_handle; /* "card handle" in UEFI 2.4 */ u16 mem_dev_handle; /* "module handle" in UEFI 2.4 */ @@ -483,6 +487,7 @@ struct cper_mem_err_compact { u16 rank; u16 mem_array_handle; u16 mem_dev_handle; + u8 extended; }; /* PCI Express Error Section, UEFI v2.7 sec N.2.7 */