diff mbox

[RFC,v3,2/3] acpi: apei: Do not panic() on PCIe errors reported through GHES

Message ID 20180425203957.18224-3-mr.nuke.me@gmail.com (mailing list archive)
State RFC, archived
Headers show

Commit Message

Alex G. April 25, 2018, 8:39 p.m. UTC
The policy was to panic() when GHES said that an error is "Fatal".
This logic is wrong for several reasons, as it doesn't take into
account what caused the error.

PCIe fatal errors indicate that the link to a device is either
unstable or unusable. They don't indicate that the machine is on fire,
and they are not severe enough that we need to panic(). Instead of
relying on crackmonkey firmware, evaluate the error severity based on
what caused the error (GHES subsections).

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/acpi/apei/ghes.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

Comments

Borislav Petkov April 26, 2018, 11:19 a.m. UTC | #1
On Wed, Apr 25, 2018 at 03:39:50PM -0500, Alexandru Gagniuc wrote:
> @@ -932,7 +971,7 @@ static void __process_error(struct ghes *ghes)
>  static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
>  {
>  	struct ghes *ghes;
> -	int sev, ret = NMI_DONE;
> +	int sev, asev, ret = NMI_DONE;
>  
>  	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
>  		return ret;
> @@ -945,8 +984,9 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
>  			ret = NMI_HANDLED;
>  		}
>  
> +		asev = ghes_actual_severity(ghes);
>  		sev = ghes_severity(ghes->estatus->error_severity);

So renaming ghes_deferrable_severity() to ghes_actual_severity() is not
a big change. And that's not what I meant.

I'd like to see here:

		 sev = ghes_severity(ghes);

and inside you do all the required mapping/severity processing/etc. And
you can rename the current ghes_severity() to ghes_map_cper_severity()
or whatever...

> -		if (sev >= GHES_SEV_PANIC) {
> +		if ((sev >= GHES_SEV_PANIC) && (asev >= GHES_SEV_PANIC)) {

... so that this change doesn't happen and there are not two severities
queried but a single one.
Alex G. April 26, 2018, 5:44 p.m. UTC | #2
Hi Borislav,

On 04/26/2018 06:19 AM, Borislav Petkov wrote:
> On Wed, Apr 25, 2018 at 03:39:50PM -0500, Alexandru Gagniuc wrote:
>> @@ -932,7 +971,7 @@ static void __process_error(struct ghes *ghes)
>>   static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
>>   {
>>   	struct ghes *ghes;
>> -	int sev, ret = NMI_DONE;
>> +	int sev, asev, ret = NMI_DONE;
>>   
>>   	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
>>   		return ret;
>> @@ -945,8 +984,9 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
>>   			ret = NMI_HANDLED;
>>   		}
>>   
>> +		asev = ghes_actual_severity(ghes);
>>   		sev = ghes_severity(ghes->estatus->error_severity);
> 
> So renaming ghes_deferrable_severity() to ghes_actual_severity() is not
> a big change. And that's not what I meant.

I'm sorry I misunderstood you.

> I'd like to see here:
> 
> 		 sev = ghes_severity(ghes);

		 sev = ghes_severity(ghes);


> and inside you do all the required mapping/severity processing/etc. And
> you can rename the current ghes_severity() to ghes_map_cper_severity()
> or whatever...

I agree that the current ghes_severity() name is vague. I'll get it done 
properly in v4 (next week).

>> -		if (sev >= GHES_SEV_PANIC) {
>> +		if ((sev >= GHES_SEV_PANIC) && (asev >= GHES_SEV_PANIC)) {
> 
> ... so that this change doesn't happen and there are not two severities
> queried but a single one.

Having two severities is a result of the wonky GHES data structure. Nothing 
says we have to use the severity field in the header, if you're okay 
with just ignoring it.

Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f9b53a6f55f3..8ccb9cc10fc8 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -425,8 +425,7 @@  static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
  * GHES_SEV_RECOVERABLE -> AER_NONFATAL
  * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
  *     These both need to be reported and recovered from by the AER driver.
- * GHES_SEV_PANIC does not make it to this handling since the kernel must
- *     panic.
+ * GHES_SEV_PANIC -> AER_FATAL
  */
 static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 {
@@ -459,6 +458,46 @@  static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+/* GHES_SEV_* level of a PCIe section; AER-handleable errors must not panic. */
+static int ghes_sec_pcie_severity(struct acpi_hest_generic_data *gdata)
+{
+	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO &&
+	    IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER))
+		return GHES_SEV_RECOVERABLE;	/* GHES scale, not CPER_SEV_* */
+
+	return ghes_severity(gdata->error_severity);
+}
+/*
+ * The severity field in the status block is often more severe than needed,
+ * making it an unreliable metric. Instead, rate each section by how well
+ * its kind of error can be handled and return the worst rating found.
+ *   - SEC_PCIE: rated by ghes_sec_pcie_severity().
+ *   - other sections: their own severity, mapped through ghes_severity().
+ */
+static int ghes_actual_severity(struct ghes *ghes)
+{
+	const struct acpi_hest_generic_status *estatus = ghes->estatus;
+	struct acpi_hest_generic_data *gdata;
+	int worst_sev = GHES_SEV_NO;
+
+	apei_estatus_for_each_section(estatus, gdata) {
+		const guid_t *sec_type = (guid_t *)gdata->section_type;
+		/* Default rating; overridden for section types we know. */
+		int sec_sev = ghes_severity(gdata->error_severity);
+
+		if (guid_equal(sec_type, &CPER_SEC_PCIE))
+			sec_sev = ghes_sec_pcie_severity(gdata);
+
+		if (sec_sev > worst_sev)
+			worst_sev = sec_sev;
+	}
+
+	return worst_sev;
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -932,7 +971,7 @@  static void __process_error(struct ghes *ghes)
 static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
 	struct ghes *ghes;
-	int sev, ret = NMI_DONE;
+	int sev, asev, ret = NMI_DONE;
 
 	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
 		return ret;
@@ -945,8 +984,9 @@  static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 			ret = NMI_HANDLED;
 		}
 
+		asev = ghes_actual_severity(ghes);
 		sev = ghes_severity(ghes->estatus->error_severity);
-		if (sev >= GHES_SEV_PANIC) {
+		if ((sev >= GHES_SEV_PANIC) && (asev >= GHES_SEV_PANIC)) {
 			oops_begin();
 			ghes_print_queued_estatus();
 			__ghes_panic(ghes);