diff mbox series

Dump cper error table in mce_panic

Message ID 20201104065057.40442-1-yaoaili126@163.com (mailing list archive)
State New, archived
Headers show
Series Dump cper error table in mce_panic | expand

Commit Message

yaoaili126@163.com Nov. 4, 2020, 6:50 a.m. UTC
From: Aili Yao <yaoaili@kingsoft.com>

For X86_MCE, When there is a fatal ue error, BIOS will prepare one
detailed cper error table before raising MCE, this cper table is meant
to supply addtional error information and not to race with mce handler
to panic.

Usually possible unexpected cper process from NMI watchdog race panic
with MCE panic is not a problem, the panic process will coordinate with
each core. But When the CPER is not processed in the first kernel and
leave it to the second kernel, this is a problem, lead to a kdump fail.

Now in this patch, the mce_panic will race with unexpected NMI to dump
the cper error log and get it cleaned, this will prevent the cper table
leak to the second kernel, which will fix the kdump fail problem, and
also guarrante the cper log is collected which it's meant to.

Anyway,For x86_mce platform, the ghes module is still needed not to
panic for fatal memory UE as it's MCE handler's work.

Signed-off-by: Aili Yao <yaoaili@kingsoft.com>
---
 arch/x86/kernel/cpu/mce/core.c     |  2 +
 arch/x86/kernel/cpu/mce/internal.h |  5 ++
 drivers/acpi/apei/ghes.c           | 79 ++++++++++++++++++++++++++++++
 include/acpi/ghes.h                |  2 +
 4 files changed, 88 insertions(+)

Comments

kernel test robot Nov. 4, 2020, 10:16 a.m. UTC | #1
Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/x86/core]
[also build test ERROR on pm/linux-next v5.10-rc2 next-20201103]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/yaoaili126-163-com/Dump-cper-error-table-in-mce_panic/20201104-150937
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 238c91115cd05c71447ea071624a4c9fe661f970
config: x86_64-randconfig-s021-20201104 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-76-gf680124b-dirty
        # https://github.com/0day-ci/linux/commit/b11831c841cb8046a9e01300f5d91985c293e045
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review yaoaili126-163-com/Dump-cper-error-table-in-mce_panic/20201104-150937
        git checkout b11831c841cb8046a9e01300f5d91985c293e045
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: arch/x86/kernel/cpu/mce/core.o: in function `mce_panic':
>> arch/x86/kernel/cpu/mce/core.c:346: undefined reference to `ghes_in_mce_cper_entry_check'

vim +346 arch/x86/kernel/cpu/mce/core.c

   297	
   298	static void mce_panic(const char *msg, struct mce *final, char *exp)
   299	{
   300		int apei_err = 0;
   301		struct llist_node *pending;
   302		struct mce_evt_llist *l;
   303	
   304		if (!fake_panic) {
   305			/*
   306			 * Make sure only one CPU runs in machine check panic
   307			 */
   308			if (atomic_inc_return(&mce_panicked) > 1)
   309				wait_for_panic();
   310			barrier();
   311	
   312			bust_spinlocks(1);
   313			console_verbose();
   314		} else {
   315			/* Don't log too much for fake panic */
   316			if (atomic_inc_return(&mce_fake_panicked) > 1)
   317				return;
   318		}
   319		pending = mce_gen_pool_prepare_records();
   320		/* First print corrected ones that are still unlogged */
   321		llist_for_each_entry(l, pending, llnode) {
   322			struct mce *m = &l->mce;
   323			if (!(m->status & MCI_STATUS_UC)) {
   324				print_mce(m);
   325				if (!apei_err)
   326					apei_err = apei_write_mce(m);
   327			}
   328		}
   329		/* Now print uncorrected but with the final one last */
   330		llist_for_each_entry(l, pending, llnode) {
   331			struct mce *m = &l->mce;
   332			if (!(m->status & MCI_STATUS_UC))
   333				continue;
   334			if (!final || mce_cmp(m, final)) {
   335				print_mce(m);
   336				if (!apei_err)
   337					apei_err = apei_write_mce(m);
   338			}
   339		}
   340		if (final) {
   341			print_mce(final);
   342			if (!apei_err)
   343				apei_err = apei_write_mce(final);
   344		}
   345		/* Print possible additional cper error info, get cper cleared */
 > 346		ghes_in_mce_cper_entry_check();
   347		if (cpu_missing)
   348			pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
   349		if (exp)
   350			pr_emerg(HW_ERR "Machine check: %s\n", exp);
   351		if (!fake_panic) {
   352			if (panic_timeout == 0)
   353				panic_timeout = mca_cfg.panic_timeout;
   354			panic(msg);
   355		} else
   356			pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
   357	}
   358	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
James Morse Nov. 6, 2020, 7:35 p.m. UTC | #2
Hello!

On 04/11/2020 06:50, yaoaili126@163.com wrote:
> From: Aili Yao <yaoaili@kingsoft.com>
> 
> For X86_MCE, When there is a fatal ue error, BIOS will prepare one
> detailed cper error table before raising MCE,

(outside GHES-ASSIST), Its not supposed to do this.

There is an example flow described in 18.4.1 "Example: Firmware First Handling Using NMI
Notification" of ACPI v6.3:
https://uefi.org/sites/default/files/resources/ACPI_Spec_6_3_A_Oct_6_2020.pdf


The machine-check is the notification from hardware, which in step 1 of the above should
go to firmware. You should only see an NMI, which is step 8.
Step 7 is to clear the error from hardware, so triggering a machine-check is pointless.
(but I agree no firmware ever follows this!)


You appear to have something that behaves as GHES-ASSIST. Can you post the decompiled dump
of your HEST table? (decompiled, no binaries!) If its large, you can post it to me off
list and I'll copy the relevant bits here...


> this cper table is meant
> to supply addtional error information and not to race with mce handler
> to panic.

This is a description of GHES_ASSIST. See 18.7 "GHES_ASSIST Error Reporting" of the above pdf.


> Usually possible unexpected cper process from NMI watchdog race panic
> with MCE panic is not a problem, the panic process will coordinate with
> each core. But When the CPER is not processed in the first kernel and
> leave it to the second kernel, this is a problem, lead to a kdump fail.

> Now in this patch, the mce_panic will race with unexpected NMI to dump
> the cper error log and get it cleaned, this will prevent the cper table
> leak to the second kernel, which will fix the kdump fail problem, and
> also guarrante the cper log is collected which it's meant to.

> Anyway,For x86_mce platform, the ghes module is still needed not to
> panic for fatal memory UE as it's MCE handler's work.

If and only if those GHES are marked as GHES_ASSIST.

If they are not, then you have a fully fledged firwmare-first system.

Could you share what your system is describing it as in the HEST so we can work out what
is going on here?!

We need to work this out first.


Thanks,

James
yaoaili [么爱利] Nov. 18, 2020, 3:12 a.m. UTC | #3
Hi, Thanks for your comments!

On Fri, 6 Nov 2020 19:35:32 +0000
James Morse <james.morse@arm.com> wrote:

> You appear to have something that behaves as GHES-ASSIST. Can you post the decompiled dump
> of your HEST table? (decompiled, no binaries!) If its large, you can post it to me off
> list and I'll copy the relevant bits here...
> 
I think we we can reach a consensus, from and follow Intel Document #563361 23.1 :
Feature Name MCA 2.0 Recovery (as per EMCA Gen2 architecture)
Feature Description
Software layer assisted recovery from uncorrected data errors as defined by the EMCA Gen2
specification. EMCA Gen2 is a capability that allows firmware to intercept errors triggered via Machine
Check Architecture (corrected and uncorrected errors) enabling a Firmware First Model (FFM) of error
handling and possible recovery.
Use Case
Enhanced error reporting to support Firmware First Model (FFM) with following attributes:
1. Allows the SMM code to intercept the MCE/CMCI.
2. Allows the SMM code to write the MCA Status/Add/Misc registers.
3. Allows the SMM code to generate MCEs.
4. Allows the DSM based pointer for enhanced error logs.
5. Additional IA32_MCG_CAP bit for eMCA support

> 
> If and only if those GHES are marked as GHES_ASSIST.
> 
> If they are not, then you have a fully fledged firwmare-first system.
> 

Yeah, This should be GHES_ASSIST, But For x86, BIOS don't supply a hest table for it as BIOS will trigger
MCE, It's out of APEI scope.
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..22efa708ef53 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -341,6 +341,8 @@  static void mce_panic(const char *msg, struct mce *final, char *exp)
 		if (!apei_err)
 			apei_err = apei_write_mce(final);
 	}
+	/* Print possible additional cper error info, get cper cleared */
+	ghes_in_mce_cper_entry_check();
 	if (cpu_missing)
 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 	if (exp)
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 88dcc79cfb07..3aea48400af3 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -71,6 +71,7 @@  int apei_write_mce(struct mce *m);
 ssize_t apei_read_mce(struct mce *m, u64 *record_id);
 int apei_check_mce(void);
 int apei_clear_mce(u64 record_id);
+extern int ghes_in_mce_cper_entry_check(void);
 #else
 static inline int apei_write_mce(struct mce *m)
 {
@@ -88,6 +89,10 @@  static inline int apei_clear_mce(u64 record_id)
 {
 	return -EINVAL;
 }
+static inline int ghes_in_mce_cper_entry_check(void)
+{
+	return 0;
+}
 #endif
 
 /*
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fce7ade2aba9..2c4274a0bec0 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -1147,9 +1147,88 @@  static void ghes_nmi_remove(struct ghes *ghes)
 	 */
 	synchronize_rcu();
 }
+
+/*
+ * Only called by mce_panic, Return value will be ignored, just for debug
+ * purpose; when mce_panic is called, there may be meanwhile other hw error
+ * triggered through NMI, this function may lead that NMI unhandled,
+ * as we are in panic, collecting log will be sufficient.
+ */
+int ghes_in_mce_cper_entry_check(void)
+{
+	int ret = -ENOENT;
+	struct ghes *ghes;
+	struct list_head *rcu_list = &ghes_nmi;
+	enum fixed_addresses fixmap_idx = FIX_APEI_GHES_NMI;
+	struct acpi_hest_generic_status *estatus, tmp_header;
+	struct ghes_estatus_node *estatus_node;
+	u32 len, node_len;
+	u64 buf_paddr;
+
+	/* if NMI handler already in process, let NMI do its job */
+	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
+		return 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, rcu_list, list) {
+		int rc;
+
+		rc = __ghes_peek_estatus(ghes, &tmp_header, &buf_paddr, fixmap_idx);
+		if (rc) {
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = rc;
+			continue;
+		}
+
+		rc = __ghes_check_estatus(ghes, &tmp_header);
+		if (rc) {
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = rc;
+			continue;
+		}
+
+		len = cper_estatus_len(&tmp_header);
+		node_len = GHES_ESTATUS_NODE_LEN(len);
+		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
+		if (!estatus_node) {
+			/* Going to panic, No need to keep the error. */
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = -ENOMEM;
+			goto done;
+		}
+
+		estatus_node->ghes = ghes;
+		estatus_node->generic = ghes->generic;
+		estatus_node->task_work.func = NULL;
+		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+
+		if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
+			ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+			gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+			ret = -ENOENT;
+			continue;
+		}
+
+		/*
+		 * As we are going to panic, and preemt the possible NMI handing,
+		 * dump all the info and get it cleared.
+		 */
+		ghes_print_queued_estatus();
+		__ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);
+		ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+
+		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+		      node_len);
+	}
+done:
+	rcu_read_unlock();
+	atomic_dec(&ghes_in_nmi);
+	return ret;
+}
 #else /* CONFIG_HAVE_ACPI_APEI_NMI */
 static inline void ghes_nmi_add(struct ghes *ghes) { }
 static inline void ghes_nmi_remove(struct ghes *ghes) { }
+int ghes_in_mce_cper_entry_check(void) {}
 #endif /* CONFIG_HAVE_ACPI_APEI_NMI */
 
 static void ghes_nmi_init_cxt(void)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 34fb3431a8f3..be1ee0e993d2 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -145,4 +145,6 @@  int ghes_notify_sea(void);
 static inline int ghes_notify_sea(void) { return -ENOENT; }
 #endif
 
+int ghes_in_mce_cper_entry_check(void);
+
 #endif /* GHES_H */