diff mbox series

[v2,3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

Message ID 20241001005234.61409-4-Smita.KoralahalliChannabasappa@amd.com
State New
Headers show
Series acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors | expand

Commit Message

Koralahalli Channabasappa, Smita Oct. 1, 2024, 12:52 a.m. UTC
UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.

Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
Severity, Device ID, Device Serial number and CXL RAS capability struct in
struct cxl_cper_prot_err. Include this struct as a member of struct
cxl_cper_work_data.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
v2:
	Defined array of structures for Device ID and Serial number
	comparison.
	p_err -> rec/p_rec.
---
 drivers/acpi/apei/ghes.c        |  10 +++
 drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
 include/cxl/event.h             |  26 ++++++++
 3 files changed, 151 insertions(+)

Comments

Ira Weiny Oct. 1, 2024, 3:47 p.m. UTC | #1
Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> 
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> v2:
> 	Defined array of structures for Device ID and Serial number
> 	comparison.
> 	p_err -> rec/p_rec.
> ---
>  drivers/acpi/apei/ghes.c        |  10 +++
>  drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
>  include/cxl/event.h             |  26 ++++++++
>  3 files changed, 151 insertions(+)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index ada93cfde9ba..9dcf0f78458f 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>  	schedule_work(cxl_cper_work);
>  }
>  
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> +	struct cxl_cper_work_data wd;
> +
> +	if (cxl_cper_handle_prot_err_info(gdata, &wd.p_rec))
> +		return;
> +}

This is a bit odd.  One could call cxl_cper_handle_prot_err_info()
directly from ghes_do_proc() in this patch.  Then add
cxl_cper_handle_prot_err() in patch 4/4 but either way is fine with me.

Reviewed-by: Ira Weiny <ira.weiny@intel.com>

> +
>  int cxl_cper_register_work(struct work_struct *work)
>  {
>  	if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>  
>  			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> +		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> +			cxl_cper_handle_prot_err(gdata);
>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..08da7764c066 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/cper.h>
> +#include <acpi/ghes.h>
>  #include "cper_cxl.h"
>  
>  #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> @@ -44,6 +45,66 @@ enum {
>  	USP,	/* CXL Upstream Switch Port */
>  };
>  
> +struct agent_info {
> +	const char *string;
> +	bool req_sn;
> +	bool req_sbdf;
> +};
> +
> +static const struct agent_info agent_info[] = {
> +	[RCD] = {
> +		.string = "Restricted CXL Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[RCH_DP] = {
> +		.string = "Restricted CXL Host Downstream Port",
> +		.req_sbdf = false,
> +		.req_sn = false,
> +	},
> +	[DEVICE] = {
> +		.string = "CXL Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[LD] = {
> +		.string = "CXL Logical Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[FMLD] = {
> +		.string = "CXL Fabric Manager managed Logical Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[RP] = {
> +		.string = "CXL Root Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +	[DSP] = {
> +		.string = "CXL Downstream Switch Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +	[USP] = {
> +		.string = "CXL Upstream Switch Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +};
> +
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> +	switch (cper_severity) {
> +	case CPER_SEV_RECOVERABLE:
> +	case CPER_SEV_FATAL:
> +		return CXL_AER_UNCORRECTABLE;
> +	default:
> +		return CXL_AER_CORRECTABLE;
> +	}
> +}
> +
>  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>  {
>  	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +237,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>  			       sizeof(cxl_ras->header_log), 0);
>  	}
>  }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_prot_err *rec)
> +{
> +	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> +	u8 *dvsec_start, *cap_start;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> +		pr_err(FW_WARN "No Device ID\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * The device ID or agent address is required for CXL RCD, CXL
> +	 * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> +	 * CXL Downstream Switch Port and CXL Upstream Switch Port.
> +	 */
> +	if (!(agent_info[prot_err->agent_type].req_sbdf)) {
> +		pr_err(FW_WARN "Invalid agent type\n");
> +		return -EINVAL;
> +	}
> +
> +	rec->segment = prot_err->agent_addr.segment;
> +	rec->bus = prot_err->agent_addr.bus;
> +	rec->device = prot_err->agent_addr.device;
> +	rec->function = prot_err->agent_addr.function;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> +		pr_err(FW_WARN "Invalid Protocol Error log\n");
> +		return -EINVAL;
> +	}
> +
> +	dvsec_start = (u8 *)(prot_err + 1);
> +	cap_start = dvsec_start + prot_err->dvsec_len;
> +	rec->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> +
> +	/*
> +	 * Set device serial number unconditionally.
> +	 *
> +	 * Print a warning message if it is not valid. The device serial
> +	 * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> +	 * Manager Managed LD.
> +	 */
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> +	    !(agent_info[prot_err->agent_type].req_sn))
> +		pr_warn(FW_WARN "No Device Serial number\n");
> +
> +	rec->lower_dw = prot_err->dev_serial_num.lower_dw;
> +	rec->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> +	rec->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> +	return 0;
> +}
> diff --git a/include/cxl/event.h b/include/cxl/event.h
> index 57b4630568f6..5b316150556a 100644
> --- a/include/cxl/event.h
> +++ b/include/cxl/event.h
> @@ -158,11 +158,37 @@ struct cxl_ras_capability_regs {
>  	u32 header_log[16];
>  };
>  
> +enum cxl_aer_err_type {
> +	CXL_AER_UNCORRECTABLE,
> +	CXL_AER_CORRECTABLE,
> +};
> +
> +struct cxl_cper_prot_err {
> +	struct cxl_ras_capability_regs cxl_ras;
> +
> +	/* Device ID */
> +	u8 function;
> +	u8 device;
> +	u8 bus;
> +	u16 segment;
> +
> +	/* Device Serial Number */
> +	u32 lower_dw;
> +	u32 upper_dw;
> +
> +	int severity;
> +};
> +
>  struct cxl_cper_work_data {
>  	enum cxl_event_type event_type;
>  	struct cxl_cper_event_rec rec;
> +	struct cxl_cper_prot_err p_rec;
>  };
>  
> +struct acpi_hest_generic_data;
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_prot_err *rec);
> +
>  #ifdef CONFIG_ACPI_APEI_GHES
>  int cxl_cper_register_work(struct work_struct *work);
>  int cxl_cper_unregister_work(struct work_struct *work);
> -- 
> 2.17.1
>
Fan Ni Oct. 1, 2024, 5:41 p.m. UTC | #2
On Tue, Oct 01, 2024 at 12:52:33AM +0000, Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> 
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> v2:
> 	Defined array of structures for Device ID and Serial number
> 	comparison.
> 	p_err -> rec/p_rec.
> ---
>  drivers/acpi/apei/ghes.c        |  10 +++
>  drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
>  include/cxl/event.h             |  26 ++++++++
>  3 files changed, 151 insertions(+)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index ada93cfde9ba..9dcf0f78458f 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>  	schedule_work(cxl_cper_work);
>  }
>  
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> +	struct cxl_cper_work_data wd;
> +
> +	if (cxl_cper_handle_prot_err_info(gdata, &wd.p_rec))
> +		return;
> +}

Why we need a if here? It seems the function will return anyway.

Fan
> +
>  int cxl_cper_register_work(struct work_struct *work)
>  {
>  	if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>  
>  			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> +		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> +			cxl_cper_handle_prot_err(gdata);
>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..08da7764c066 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/cper.h>
> +#include <acpi/ghes.h>
>  #include "cper_cxl.h"
>  
>  #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> @@ -44,6 +45,66 @@ enum {
>  	USP,	/* CXL Upstream Switch Port */
>  };
>  
> +struct agent_info {
> +	const char *string;
> +	bool req_sn;
> +	bool req_sbdf;
> +};
> +
> +static const struct agent_info agent_info[] = {
> +	[RCD] = {
> +		.string = "Restricted CXL Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[RCH_DP] = {
> +		.string = "Restricted CXL Host Downstream Port",
> +		.req_sbdf = false,
> +		.req_sn = false,
> +	},
> +	[DEVICE] = {
> +		.string = "CXL Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[LD] = {
> +		.string = "CXL Logical Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[FMLD] = {
> +		.string = "CXL Fabric Manager managed Logical Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[RP] = {
> +		.string = "CXL Root Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +	[DSP] = {
> +		.string = "CXL Downstream Switch Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +	[USP] = {
> +		.string = "CXL Upstream Switch Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +};
> +
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> +	switch (cper_severity) {
> +	case CPER_SEV_RECOVERABLE:
> +	case CPER_SEV_FATAL:
> +		return CXL_AER_UNCORRECTABLE;
> +	default:
> +		return CXL_AER_CORRECTABLE;
> +	}
> +}
> +
>  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>  {
>  	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +237,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>  			       sizeof(cxl_ras->header_log), 0);
>  	}
>  }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_prot_err *rec)
> +{
> +	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> +	u8 *dvsec_start, *cap_start;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> +		pr_err(FW_WARN "No Device ID\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * The device ID or agent address is required for CXL RCD, CXL
> +	 * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> +	 * CXL Downstream Switch Port and CXL Upstream Switch Port.
> +	 */
> +	if (!(agent_info[prot_err->agent_type].req_sbdf)) {
> +		pr_err(FW_WARN "Invalid agent type\n");
> +		return -EINVAL;
> +	}
> +
> +	rec->segment = prot_err->agent_addr.segment;
> +	rec->bus = prot_err->agent_addr.bus;
> +	rec->device = prot_err->agent_addr.device;
> +	rec->function = prot_err->agent_addr.function;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> +		pr_err(FW_WARN "Invalid Protocol Error log\n");
> +		return -EINVAL;
> +	}
> +
> +	dvsec_start = (u8 *)(prot_err + 1);
> +	cap_start = dvsec_start + prot_err->dvsec_len;
> +	rec->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> +
> +	/*
> +	 * Set device serial number unconditionally.
> +	 *
> +	 * Print a warning message if it is not valid. The device serial
> +	 * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> +	 * Manager Managed LD.
> +	 */
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> +	    !(agent_info[prot_err->agent_type].req_sn))
> +		pr_warn(FW_WARN "No Device Serial number\n");
> +
> +	rec->lower_dw = prot_err->dev_serial_num.lower_dw;
> +	rec->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> +	rec->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> +	return 0;
> +}
> diff --git a/include/cxl/event.h b/include/cxl/event.h
> index 57b4630568f6..5b316150556a 100644
> --- a/include/cxl/event.h
> +++ b/include/cxl/event.h
> @@ -158,11 +158,37 @@ struct cxl_ras_capability_regs {
>  	u32 header_log[16];
>  };
>  
> +enum cxl_aer_err_type {
> +	CXL_AER_UNCORRECTABLE,
> +	CXL_AER_CORRECTABLE,
> +};
> +
> +struct cxl_cper_prot_err {
> +	struct cxl_ras_capability_regs cxl_ras;
> +
> +	/* Device ID */
> +	u8 function;
> +	u8 device;
> +	u8 bus;
> +	u16 segment;
> +
> +	/* Device Serial Number */
> +	u32 lower_dw;
> +	u32 upper_dw;
> +
> +	int severity;
> +};
> +
>  struct cxl_cper_work_data {
>  	enum cxl_event_type event_type;
>  	struct cxl_cper_event_rec rec;
> +	struct cxl_cper_prot_err p_rec;
>  };
>  
> +struct acpi_hest_generic_data;
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_prot_err *rec);
> +
>  #ifdef CONFIG_ACPI_APEI_GHES
>  int cxl_cper_register_work(struct work_struct *work);
>  int cxl_cper_unregister_work(struct work_struct *work);
> -- 
> 2.17.1
>
Dan Williams Oct. 2, 2024, 11:47 p.m. UTC | #3
Smita Koralahalli wrote:
> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> 
> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> struct cxl_cper_prot_err. Include this struct as a member of struct
> cxl_cper_work_data.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> v2:
> 	Defined array of structures for Device ID and Serial number
> 	comparison.
> 	p_err -> rec/p_rec.
> ---
>  drivers/acpi/apei/ghes.c        |  10 +++
>  drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
>  include/cxl/event.h             |  26 ++++++++
>  3 files changed, 151 insertions(+)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index ada93cfde9ba..9dcf0f78458f 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>  	schedule_work(cxl_cper_work);
>  }
>  
> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> +	struct cxl_cper_work_data wd;
> +
> +	if (cxl_cper_handle_prot_err_info(gdata, &wd.p_rec))
> +		return;
> +}
> +
>  int cxl_cper_register_work(struct work_struct *work)
>  {
>  	if (cxl_cper_work)
> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>  
>  			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
> +		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> +			cxl_cper_handle_prot_err(gdata);

I would prefer this follow the format of cxl_cper_post_event and pass a
'struct cxl_cper_sec_prot_err *' directly.

>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..08da7764c066 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/cper.h>
> +#include <acpi/ghes.h>
>  #include "cper_cxl.h"
>  
>  #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> @@ -44,6 +45,66 @@ enum {
>  	USP,	/* CXL Upstream Switch Port */
>  };
>  
> +struct agent_info {
> +	const char *string;
> +	bool req_sn;
> +	bool req_sbdf;
> +};
> +
> +static const struct agent_info agent_info[] = {
> +	[RCD] = {
> +		.string = "Restricted CXL Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[RCH_DP] = {
> +		.string = "Restricted CXL Host Downstream Port",
> +		.req_sbdf = false,
> +		.req_sn = false,
> +	},
> +	[DEVICE] = {
> +		.string = "CXL Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[LD] = {
> +		.string = "CXL Logical Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[FMLD] = {
> +		.string = "CXL Fabric Manager managed Logical Device",
> +		.req_sbdf = true,
> +		.req_sn = true,
> +	},
> +	[RP] = {
> +		.string = "CXL Root Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +	[DSP] = {
> +		.string = "CXL Downstream Switch Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +	[USP] = {
> +		.string = "CXL Upstream Switch Port",
> +		.req_sbdf = true,
> +		.req_sn = false,
> +	},
> +};
> +
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> +	switch (cper_severity) {
> +	case CPER_SEV_RECOVERABLE:
> +	case CPER_SEV_FATAL:
> +		return CXL_AER_UNCORRECTABLE;
> +	default:
> +		return CXL_AER_CORRECTABLE;
> +	}

Why does the CPER severity need to be converted to a new CXL specific
enum value?

> +}
> +
>  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>  {
>  	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +237,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>  			       sizeof(cxl_ras->header_log), 0);
>  	}
>  }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_prot_err *rec)
> +{
> +	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);

Late feedback, but cper_sec_prot_err is too generic of a name. Lets make
if cxl_cper_sec_prot_err similar to cxl_cper_event_rec.


> +	u8 *dvsec_start, *cap_start;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> +		pr_err(FW_WARN "No Device ID\n");

This should be pr_err_ratelimited().

This feedback likely also applies to the existing support, but I think
protocol errors are even more likely than component errors to be bursty
and persistent.

This error message and all the others should clarify that they are
coming from the CXL CPER code with something like:

    #define pr_fmt(fmt) "cxl/cper: " fmt

...at the top of the file.

> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * The device ID or agent address is required for CXL RCD, CXL
> +	 * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
> +	 * CXL Downstream Switch Port and CXL Upstream Switch Port.
> +	 */
> +	if (!(agent_info[prot_err->agent_type].req_sbdf)) {
> +		pr_err(FW_WARN "Invalid agent type\n");
> +		return -EINVAL;
> +	}

All CPER records without a device-id have already been dropped above, so
why reject agent-types that do not require a device-id here?

I think this agent_info[] scheme makes the code more difficult to read
especially since agent_info() is only consulted a couple times. Just put
a "switch (prot_err->agent_type)" in the code directly and skip the
indirection.

> +
> +	rec->segment = prot_err->agent_addr.segment;
> +	rec->bus = prot_err->agent_addr.bus;
> +	rec->device = prot_err->agent_addr.device;
> +	rec->function = prot_err->agent_addr.function;
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> +		pr_err(FW_WARN "Invalid Protocol Error log\n");
> +		return -EINVAL;
> +	}
> +
> +	dvsec_start = (u8 *)(prot_err + 1);
> +	cap_start = dvsec_start + prot_err->dvsec_len;
> +	rec->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;

Where is the validation that the size of the RAS field matches
expectations? I.e. what if the BIOS builds a bad error record?

> +
> +	/*
> +	 * Set device serial number unconditionally.
> +	 *
> +	 * Print a warning message if it is not valid. The device serial
> +	 * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
> +	 * Manager Managed LD.
> +	 */
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> +	    !(agent_info[prot_err->agent_type].req_sn))
> +		pr_warn(FW_WARN "No Device Serial number\n");
> +
> +	rec->lower_dw = prot_err->dev_serial_num.lower_dw;
> +	rec->upper_dw = prot_err->dev_serial_num.upper_dw;

Serial numbers are u64s, so if any conversion is to be done here it
should be from upper+lower to a u64, but then again see below on my
question about why a new cxl_cper_prot_err is being added.

> +
> +	rec->severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> +	return 0;
> +}
> diff --git a/include/cxl/event.h b/include/cxl/event.h
> index 57b4630568f6..5b316150556a 100644
> --- a/include/cxl/event.h
> +++ b/include/cxl/event.h
> @@ -158,11 +158,37 @@ struct cxl_ras_capability_regs {
>  	u32 header_log[16];
>  };
>  
> +enum cxl_aer_err_type {
> +	CXL_AER_UNCORRECTABLE,
> +	CXL_AER_CORRECTABLE,
> +};
> +
> +struct cxl_cper_prot_err {
> +	struct cxl_ras_capability_regs cxl_ras;
> +
> +	/* Device ID */
> +	u8 function;
> +	u8 device;
> +	u8 bus;
> +	u16 segment;
> +
> +	/* Device Serial Number */
> +	u32 lower_dw;
> +	u32 upper_dw;
> +
> +	int severity;
> +};

Hmm, 'struct cxl_cper_event_rec' follows the raw format of the record
from the specification, and 'struct cxl_cper_sec_prot_err' (formerly
cper_sec_prot_err) already exists, so why is this new intermediate data
structure needed?

> +
>  struct cxl_cper_work_data {
>  	enum cxl_event_type event_type;
>  	struct cxl_cper_event_rec rec;
> +	struct cxl_cper_prot_err p_rec;

>  };
>  
> +struct acpi_hest_generic_data;
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_prot_err *rec);
> +
>  #ifdef CONFIG_ACPI_APEI_GHES
>  int cxl_cper_register_work(struct work_struct *work);
>  int cxl_cper_unregister_work(struct work_struct *work);
> -- 
> 2.17.1
>
Koralahalli Channabasappa, Smita Oct. 3, 2024, 7:15 p.m. UTC | #4
On 10/2/2024 4:47 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>> struct cxl_cper_prot_err. Include this struct as a member of struct
>> cxl_cper_work_data.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> v2:
>> 	Defined array of structures for Device ID and Serial number
>> 	comparison.
>> 	p_err -> rec/p_rec.
>> ---
>>   drivers/acpi/apei/ghes.c        |  10 +++
>>   drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
>>   include/cxl/event.h             |  26 ++++++++
>>   3 files changed, 151 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index ada93cfde9ba..9dcf0f78458f 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>>   	schedule_work(cxl_cper_work);
>>   }
>>   
>> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>> +{
>> +	struct cxl_cper_work_data wd;
>> +
>> +	if (cxl_cper_handle_prot_err_info(gdata, &wd.p_rec))
>> +		return;
>> +}
>> +
>>   int cxl_cper_register_work(struct work_struct *work)
>>   {
>>   	if (cxl_cper_work)
>> @@ -791,6 +799,8 @@ static bool ghes_do_proc(struct ghes *ghes,
>>   			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>>   
>>   			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
>> +		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
>> +			cxl_cper_handle_prot_err(gdata);
> 
> I would prefer this follow the format of cxl_cper_post_event and pass a
> 'struct cxl_cper_sec_prot_err *' directly.

Sure.

> 
>>   		} else {
>>   			void *err = acpi_hest_get_payload(gdata);
>>   
>> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
>> index 4fd8d783993e..08da7764c066 100644
>> --- a/drivers/firmware/efi/cper_cxl.c
>> +++ b/drivers/firmware/efi/cper_cxl.c
>> @@ -8,6 +8,7 @@
>>    */
>>   
>>   #include <linux/cper.h>
>> +#include <acpi/ghes.h>
>>   #include "cper_cxl.h"
>>   
>>   #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
>> @@ -44,6 +45,66 @@ enum {
>>   	USP,	/* CXL Upstream Switch Port */
>>   };
>>   
>> +struct agent_info {
>> +	const char *string;
>> +	bool req_sn;
>> +	bool req_sbdf;
>> +};
>> +
>> +static const struct agent_info agent_info[] = {
>> +	[RCD] = {
>> +		.string = "Restricted CXL Device",
>> +		.req_sbdf = true,
>> +		.req_sn = true,
>> +	},
>> +	[RCH_DP] = {
>> +		.string = "Restricted CXL Host Downstream Port",
>> +		.req_sbdf = false,
>> +		.req_sn = false,
>> +	},
>> +	[DEVICE] = {
>> +		.string = "CXL Device",
>> +		.req_sbdf = true,
>> +		.req_sn = true,
>> +	},
>> +	[LD] = {
>> +		.string = "CXL Logical Device",
>> +		.req_sbdf = true,
>> +		.req_sn = true,
>> +	},
>> +	[FMLD] = {
>> +		.string = "CXL Fabric Manager managed Logical Device",
>> +		.req_sbdf = true,
>> +		.req_sn = true,
>> +	},
>> +	[RP] = {
>> +		.string = "CXL Root Port",
>> +		.req_sbdf = true,
>> +		.req_sn = false,
>> +	},
>> +	[DSP] = {
>> +		.string = "CXL Downstream Switch Port",
>> +		.req_sbdf = true,
>> +		.req_sn = false,
>> +	},
>> +	[USP] = {
>> +		.string = "CXL Upstream Switch Port",
>> +		.req_sbdf = true,
>> +		.req_sn = false,
>> +	},
>> +};
>> +
>> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
>> +{
>> +	switch (cper_severity) {
>> +	case CPER_SEV_RECOVERABLE:
>> +	case CPER_SEV_FATAL:
>> +		return CXL_AER_UNCORRECTABLE;
>> +	default:
>> +		return CXL_AER_CORRECTABLE;
>> +	}
> 
> Why does the CPER severity need to be converted to a new CXL specific
> enum value?

I was just following up the same convention here as done for AER..
cper_severity_to_aer(). I can change if there is no value in doing it.

> 
>> +}
>> +
>>   void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>>   {
>>   	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
>> @@ -176,3 +237,57 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>>   			       sizeof(cxl_ras->header_log), 0);
>>   	}
>>   }
>> +
>> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
>> +				  struct cxl_cper_prot_err *rec)
>> +{
>> +	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> 
> Late feedback, but cper_sec_prot_err is too generic of a name. Lets make
> if cxl_cper_sec_prot_err similar to cxl_cper_event_rec.

okay

> 
> 
>> +	u8 *dvsec_start, *cap_start;
>> +
>> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
>> +		pr_err(FW_WARN "No Device ID\n");
> 
> This should be pr_err_ratelimited().
> 
> This feedback likely also applies to the existing support, but I think
> protocol errors are even more likely than component errors to be bursty
> and persistent.
> 
> This error message and all the others should clarify that they are
> coming from the CXL CPER code with something like:
> 
>      #define pr_fmt(fmt) "cxl/cper: " fmt
> 
> ...at the top of the file.
> 

okay

>> +		return -EINVAL;
>> +	}
>> +
>> +	/*
>> +	 * The device ID or agent address is required for CXL RCD, CXL
>> +	 * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
>> +	 * CXL Downstream Switch Port and CXL Upstream Switch Port.
>> +	 */
>> +	if (!(agent_info[prot_err->agent_type].req_sbdf)) {
>> +		pr_err(FW_WARN "Invalid agent type\n");
>> +		return -EINVAL;
>> +	}
> 
> All CPER records without a device-id have already been dropped above, so
> why reject agent-types that do not require a device-id here?
> 
> I think this agent_info[] scheme makes the code more difficult to read
> especially since agent_info() is only consulted a couple times. Just put
> a "switch (prot_err->agent_type)" in the code directly and skip the
> indirection.

Hmm, I initially thought I would do switch case and then changed it to 
if-else thinking that would look cleaner.

What would you suggest? Just incorporate switch case similar to 
cper_print_prot_err() here as well or clean up switch case in 
cper_print_prot_err() and reuse the agent_info[] there?

This agent_info[] would include 3 fields then, two as above and another 
for valid_cap. Maybe unify sbdf with valid_cap..

> 
>> +
>> +	rec->segment = prot_err->agent_addr.segment;
>> +	rec->bus = prot_err->agent_addr.bus;
>> +	rec->device = prot_err->agent_addr.device;
>> +	rec->function = prot_err->agent_addr.function;
>> +
>> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
>> +		pr_err(FW_WARN "Invalid Protocol Error log\n");
>> +		return -EINVAL;
>> +	}
>> +
>> +	dvsec_start = (u8 *)(prot_err + 1);
>> +	cap_start = dvsec_start + prot_err->dvsec_len;
>> +	rec->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
> 
> Where is the validation that the size of the RAS field matches
> expectations? I.e. what if the BIOS builds a bad error record?

Will include check for size.

> 
>> +
>> +	/*
>> +	 * Set device serial number unconditionally.
>> +	 *
>> +	 * Print a warning message if it is not valid. The device serial
>> +	 * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
>> +	 * Manager Managed LD.
>> +	 */
>> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
>> +	    !(agent_info[prot_err->agent_type].req_sn))
>> +		pr_warn(FW_WARN "No Device Serial number\n");
>> +
>> +	rec->lower_dw = prot_err->dev_serial_num.lower_dw;
>> +	rec->upper_dw = prot_err->dev_serial_num.upper_dw;
> 
> Serial numbers are u64s, so if any conversion is to be done here it
> should be from upper+lower to a u64, but then again see below on my
> question about why a new cxl_cper_prot_err is being added.
> 
>> +
>> +	rec->severity = cper_severity_cxl_aer(gdata->error_severity);
>> +
>> +	return 0;
>> +}
>> diff --git a/include/cxl/event.h b/include/cxl/event.h
>> index 57b4630568f6..5b316150556a 100644
>> --- a/include/cxl/event.h
>> +++ b/include/cxl/event.h
>> @@ -158,11 +158,37 @@ struct cxl_ras_capability_regs {
>>   	u32 header_log[16];
>>   };
>>   
>> +enum cxl_aer_err_type {
>> +	CXL_AER_UNCORRECTABLE,
>> +	CXL_AER_CORRECTABLE,
>> +};
>> +
>> +struct cxl_cper_prot_err {
>> +	struct cxl_ras_capability_regs cxl_ras;
>> +
>> +	/* Device ID */
>> +	u8 function;
>> +	u8 device;
>> +	u8 bus;
>> +	u16 segment;
>> +
>> +	/* Device Serial Number */
>> +	u32 lower_dw;
>> +	u32 upper_dw;
>> +
>> +	int severity;
>> +};
> 
> Hmm, 'struct cxl_cper_event_rec' follows the raw format of the record
> from the specification, and 'struct cxl_cper_sec_prot_err' (formerly
> cper_sec_prot_err) already exists, so why is this new intermediate data
> structure needed?

Yeah, the intention was to extract only necessary info to be consumed by 
cxl_pci driver and to be passed to cxl_cper_fifo.

Going forward, while handling protocol errors separately, I can send the 
entire cxl_cper_sec_prot_err.

Thanks
Smita

> 
>> +
>>   struct cxl_cper_work_data {
>>   	enum cxl_event_type event_type;
>>   	struct cxl_cper_event_rec rec;
>> +	struct cxl_cper_prot_err p_rec;
> 
>>   };
>>   
>> +struct acpi_hest_generic_data;
>> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
>> +				  struct cxl_cper_prot_err *rec);
>> +
>>   #ifdef CONFIG_ACPI_APEI_GHES
>>   int cxl_cper_register_work(struct work_struct *work);
>>   int cxl_cper_unregister_work(struct work_struct *work);
>> -- 
>> 2.17.1
>>
> 
> 
>
Koralahalli Channabasappa, Smita Oct. 3, 2024, 7:19 p.m. UTC | #5
On 10/1/2024 10:41 AM, Fan Ni wrote:
> On Tue, Oct 01, 2024 at 12:52:33AM +0000, Smita Koralahalli wrote:
>> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
>>
>> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
>> Severity, Device ID, Device Serial number and CXL RAS capability struct in
>> struct cxl_cper_prot_err. Include this struct as a member of struct
>> cxl_cper_work_data.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> ---
>> v2:
>> 	Defined array of structures for Device ID and Serial number
>> 	comparison.
>> 	p_err -> rec/p_rec.
>> ---
>>   drivers/acpi/apei/ghes.c        |  10 +++
>>   drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
>>   include/cxl/event.h             |  26 ++++++++
>>   3 files changed, 151 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index ada93cfde9ba..9dcf0f78458f 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -717,6 +717,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>>   	schedule_work(cxl_cper_work);
>>   }
>>   
>> +static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
>> +{
>> +	struct cxl_cper_work_data wd;
>> +
>> +	if (cxl_cper_handle_prot_err_info(gdata, &wd.p_rec))
>> +		return;
>> +}
> 
> Why we need a if here? It seems the function will return anyway.
> 
> Fan

Yeah, it doesn't make sense here without 4/4.

Thanks
Smita

[snip]
Dan Williams Oct. 3, 2024, 11:21 p.m. UTC | #6
Smita Koralahalli wrote:
> On 10/2/2024 4:47 PM, Dan Williams wrote:
> > Smita Koralahalli wrote:
> >> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> >>
> >> Add GHES support to detect CXL CPER Protocol Error Record and Cache Error
> >> Severity, Device ID, Device Serial number and CXL RAS capability struct in
> >> struct cxl_cper_prot_err. Include this struct as a member of struct
> >> cxl_cper_work_data.
> >>
> >> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> >> ---
> >> v2:
> >> 	Defined array of structures for Device ID and Serial number
> >> 	comparison.
> >> 	p_err -> rec/p_rec.
> >> ---
> >>   drivers/acpi/apei/ghes.c        |  10 +++
> >>   drivers/firmware/efi/cper_cxl.c | 115 ++++++++++++++++++++++++++++++++
> >>   include/cxl/event.h             |  26 ++++++++
> >>   3 files changed, 151 insertions(+)
> >>
[..]
> >> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> >> +{
> >> +	switch (cper_severity) {
> >> +	case CPER_SEV_RECOVERABLE:
> >> +	case CPER_SEV_FATAL:
> >> +		return CXL_AER_UNCORRECTABLE;
> >> +	default:
> >> +		return CXL_AER_CORRECTABLE;
> >> +	}
> > 
> > Why does the CPER severity need to be converted to a new CXL specific
> > enum value?
> 
> I was just following up the same convention here as done for AER..
> cper_severity_to_aer(). I can change if there is no value in doing it.

I think because PCIe and CXL are both using AER as the transport they
can both use cper_severity_to_aer(), or at a minimum do not introduce
'enum cxl_aer_err_type' and instead use the existing AER_ values.

[..]
> > All CPER records without a device-id have already been dropped above, so
> > why reject agent-types that do not require a device-id here?
> > 
> > I think this agent_info[] scheme makes the code more difficult to read
> > especially since agent_info() is only consulted a couple times. Just put
> > a "switch (prot_err->agent_type)" in the code directly and skip the
> > indirection.
> 
> Hmm, I initially thought I would do switch case and then changed it to 
> if-else thinking that would look cleaner.
> 
> What would you suggest? Just incorporate switch case similar to 
> cper_print_prot_err() here as well or clean up switch case in 
> cper_print_prot_err() and reuse the agent_info[] there?

Even though it is more lines, I find cper_print_prot_err() easier to
read because I do not need to switch back and forth to the agent_info
definition.

> This agent_info[] would include 3 fields then, two as above and another 
> for valid_cap. Maybe unify sbdf with valid_cap..

agent_info[] ends up being code-logic masquerading as a data-structure.
Keep the logic all in one coherent readable block.

[..]
> > Hmm, 'struct cxl_cper_event_rec' follows the raw format of the record
> > from the specification, and 'struct cxl_cper_sec_prot_err' (formerly
> > cper_sec_prot_err) already exists, so why is this new intermediate data
> > structure needed?
> 
> Yeah, the intention was to extract only necessary info to be consumed by 
> cxl_pci driver and to be passed to cxl_cper_fifo.
> 
> Going forward, while handling protocol errors separately, I can send the 
> entire cxl_cper_sec_prot_err.

I think it defensible to send all of it. Recall that the real consumer
is not cxl_pci it is rasdaemon in userspace. The kernel is not in a good
position to filter what rasdaemon expects to find in the record based on
the EFI specification.
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ada93cfde9ba..9dcf0f78458f 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -717,6 +717,14 @@  static void cxl_cper_post_event(enum cxl_event_type event_type,
 	schedule_work(cxl_cper_work);
 }
 
+static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
+{
+	struct cxl_cper_work_data wd;
+
+	if (cxl_cper_handle_prot_err_info(gdata, &wd.p_rec))
+		return;
+}
+
 int cxl_cper_register_work(struct work_struct *work)
 {
 	if (cxl_cper_work)
@@ -791,6 +799,8 @@  static bool ghes_do_proc(struct ghes *ghes,
 			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
 
 			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+			cxl_cper_handle_prot_err(gdata);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 4fd8d783993e..08da7764c066 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@ 
  */
 
 #include <linux/cper.h>
+#include <acpi/ghes.h>
 #include "cper_cxl.h"
 
 #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
@@ -44,6 +45,66 @@  enum {
 	USP,	/* CXL Upstream Switch Port */
 };
 
+struct agent_info {
+	const char *string;
+	bool req_sn;
+	bool req_sbdf;
+};
+
+static const struct agent_info agent_info[] = {
+	[RCD] = {
+		.string = "Restricted CXL Device",
+		.req_sbdf = true,
+		.req_sn = true,
+	},
+	[RCH_DP] = {
+		.string = "Restricted CXL Host Downstream Port",
+		.req_sbdf = false,
+		.req_sn = false,
+	},
+	[DEVICE] = {
+		.string = "CXL Device",
+		.req_sbdf = true,
+		.req_sn = true,
+	},
+	[LD] = {
+		.string = "CXL Logical Device",
+		.req_sbdf = true,
+		.req_sn = true,
+	},
+	[FMLD] = {
+		.string = "CXL Fabric Manager managed Logical Device",
+		.req_sbdf = true,
+		.req_sn = true,
+	},
+	[RP] = {
+		.string = "CXL Root Port",
+		.req_sbdf = true,
+		.req_sn = false,
+	},
+	[DSP] = {
+		.string = "CXL Downstream Switch Port",
+		.req_sbdf = true,
+		.req_sn = false,
+	},
+	[USP] = {
+		.string = "CXL Upstream Switch Port",
+		.req_sbdf = true,
+		.req_sn = false,
+	},
+};
+
+static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
+{
+	switch (cper_severity) {
+	case CPER_SEV_RECOVERABLE:
+	case CPER_SEV_FATAL:
+		return CXL_AER_UNCORRECTABLE;
+	default:
+		return CXL_AER_CORRECTABLE;
+	}
+}
+
 void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
 {
 	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -176,3 +237,57 @@  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
 			       sizeof(cxl_ras->header_log), 0);
 	}
 }
+
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+				  struct cxl_cper_prot_err *rec)
+{
+	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+	u8 *dvsec_start, *cap_start;
+
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
+		pr_err(FW_WARN "No Device ID\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * The device ID or agent address is required for CXL RCD, CXL
+	 * SLD, CXL LD, CXL Fabric Manager Managed LD, CXL Root Port,
+	 * CXL Downstream Switch Port and CXL Upstream Switch Port.
+	 */
+	if (!(agent_info[prot_err->agent_type].req_sbdf)) {
+		pr_err(FW_WARN "Invalid agent type\n");
+		return -EINVAL;
+	}
+
+	rec->segment = prot_err->agent_addr.segment;
+	rec->bus = prot_err->agent_addr.bus;
+	rec->device = prot_err->agent_addr.device;
+	rec->function = prot_err->agent_addr.function;
+
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+		pr_err(FW_WARN "Invalid Protocol Error log\n");
+		return -EINVAL;
+	}
+
+	dvsec_start = (u8 *)(prot_err + 1);
+	cap_start = dvsec_start + prot_err->dvsec_len;
+	rec->cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;
+
+	/*
+	 * Set device serial number unconditionally.
+	 *
+	 * Print a warning message if it is not valid. The device serial
+	 * number is required for CXL RCD, CXL SLD, CXL LD and CXL Fabric
+	 * Manager Managed LD.
+	 */
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
+	    !(agent_info[prot_err->agent_type].req_sn))
+		pr_warn(FW_WARN "No Device Serial number\n");
+
+	rec->lower_dw = prot_err->dev_serial_num.lower_dw;
+	rec->upper_dw = prot_err->dev_serial_num.upper_dw;
+
+	rec->severity = cper_severity_cxl_aer(gdata->error_severity);
+
+	return 0;
+}
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 57b4630568f6..5b316150556a 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -158,11 +158,37 @@  struct cxl_ras_capability_regs {
 	u32 header_log[16];
 };
 
+enum cxl_aer_err_type {
+	CXL_AER_UNCORRECTABLE,
+	CXL_AER_CORRECTABLE,
+};
+
+struct cxl_cper_prot_err {
+	struct cxl_ras_capability_regs cxl_ras;
+
+	/* Device ID */
+	u8 function;
+	u8 device;
+	u8 bus;
+	u16 segment;
+
+	/* Device Serial Number */
+	u32 lower_dw;
+	u32 upper_dw;
+
+	int severity;
+};
+
 struct cxl_cper_work_data {
 	enum cxl_event_type event_type;
 	struct cxl_cper_event_rec rec;
+	struct cxl_cper_prot_err p_rec;
 };
 
+struct acpi_hest_generic_data;
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+				  struct cxl_cper_prot_err *rec);
+
 #ifdef CONFIG_ACPI_APEI_GHES
 int cxl_cper_register_work(struct work_struct *work);
 int cxl_cper_unregister_work(struct work_struct *work);