diff mbox series

[v5,2/2] x86/mce: Add support for Extended Physical Address MCA changes

Message ID 20220412154038.261750-3-Smita.KoralahalliChannabasappa@amd.com (mailing list archive)
State New, archived
Headers show
Series x86/mce: Support extended MCA_ADDR address on SMCA systems | expand

Commit Message

Smita Koralahalli April 12, 2022, 3:40 p.m. UTC
Newer AMD CPUs support more physical address bits.

That is the MCA_ADDR registers on Scalable MCA systems contain the
ErrorAddr in bits [56:0] instead of [55:0]. Hence the existing LSB field
from bits [61:56] in MCA_ADDR must be moved around to accommodate the
larger ErrorAddr size.

MCA_CONFIG[McaLsbInStatusSupported] indicates this change. If set, the
LSB field will be found in MCA_STATUS rather than MCA_ADDR.

Each logical CPU has unique MCA bank in hardware and is not shared with
other logical CPUs. Additionally on SMCA systems, each feature bit may be
different for each bank within same logical CPU.

Check for MCA_CONFIG[McaLsbInStatusSupported] for each MCA bank and for
each CPU.

Additionally, all MCA banks do not support maximum ErrorAddr bits in
MCA_ADDR. Some banks might support fewer bits but the remaining bits are
marked as reserved.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Link: https://lkml.kernel.org/r/20220225193342.215780-4-Smita.KoralahalliChannabasappa@amd.com
---
v2:
	Declared lsb_in_status in existing mce_bank[] struct.
	Moved struct mce_bank[] declaration from core.c -> internal.h
v3:
	Rebased on the latest tip tree. No functional changes.
v4:
	No change.
v5:
	Extend comment for smca_extract_err_addr if AddrLsb is found in
	MCA_STATUS registers.
---
 arch/x86/kernel/cpu/mce/amd.c      | 11 ++++++++++
 arch/x86/kernel/cpu/mce/core.c     | 11 ++++------
 arch/x86/kernel/cpu/mce/internal.h | 32 +++++++++++++++++++++++++++---
 3 files changed, 44 insertions(+), 10 deletions(-)

Comments

Borislav Petkov April 13, 2022, 10:21 a.m. UTC | #1
On Tue, Apr 12, 2022 at 10:40:38AM -0500, Smita Koralahalli wrote:
> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> index f809eacac523..4f2744324d9b 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -722,6 +722,17 @@ bool amd_mce_is_memory_error(struct mce *m)
>  	return m->bank == 4 && xec == 0x8;
>  }
>  
> +void smca_feature_init(void)
> +{
> +	unsigned int bank;
> +	u64 mca_cfg;
> +
> +	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
> +		rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(bank), mca_cfg);
> +		this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(mca_cfg & BIT(8));
> +	}
> +}

We have smca_configure() for SMCA banks init and there it even reads
MCx_CONFIG.

Do you guys not see this?

Or integrating new stuff into the existing code doesn't really matter -
just bolt it on wherever it works?!

> diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
> index 64dbae6b8a09..0f4934fb3d93 100644
> --- a/arch/x86/kernel/cpu/mce/internal.h
> +++ b/arch/x86/kernel/cpu/mce/internal.h
> @@ -177,6 +177,22 @@ struct mce_vendor_flags {
>  
>  extern struct mce_vendor_flags mce_flags;
>  
> +struct mce_bank {
> +	u64			ctl;			/* subevents to enable */
> +
> +	__u64 init			: 1,		/* initialise bank? */
> +
> +	/*
> +	 * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: This bit indicates
> +	 * the LSB field is found in MCA_STATUS, when set.
> +	 */
> +	      lsb_in_status		: 1,
> +
> +	      __reserved_1		: 62;
> +};

Put those comments over the members, while at it:

diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 0f4934fb3d93..770a31120fd2 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -178,17 +178,18 @@ struct mce_vendor_flags {
 extern struct mce_vendor_flags mce_flags;
 
 struct mce_bank {
-       u64                     ctl;                    /* subevents to enable */
+       /* subevents to enable */
+       u64                     ctl;
 
-       __u64 init                      : 1,            /* initialise bank? */
+       /* initialise bank? */
+       __u64 init              : 1,
 
        /*
         * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: This bit indicates
         * the LSB field is found in MCA_STATUS, when set.
         */
-             lsb_in_status             : 1,
-
-             __reserved_1              : 62;
+             lsb_in_status     : 1,
+             __reserved_1      : 62;
 };

> +DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
> +
>  enum mca_msr {
>  	MCA_CTL,
>  	MCA_STATUS,
> @@ -190,7 +206,9 @@ extern bool filter_mce(struct mce *m);
>  #ifdef CONFIG_X86_MCE_AMD
>  extern bool amd_filter_mce(struct mce *m);
>  
> -/* Extract [55:<lsb>] where lsb is the LS-*valid* bit of the address bits. */
> +/* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
> + * [56:0], else in bits [55:0] of MCA_ADDR.
> + */

verify_comment_style: Warning: Multi-line comment needs to start text on the second line:
 [+/* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits]

Documentation/process/maintainer-tip.rst, Section "Comment style".
Yazen Ghannam April 13, 2022, 2:10 p.m. UTC | #2
On Wed, Apr 13, 2022 at 12:21:41PM +0200, Borislav Petkov wrote:
> On Tue, Apr 12, 2022 at 10:40:38AM -0500, Smita Koralahalli wrote:
> > diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> > index f809eacac523..4f2744324d9b 100644
> > --- a/arch/x86/kernel/cpu/mce/amd.c
> > +++ b/arch/x86/kernel/cpu/mce/amd.c
> > @@ -722,6 +722,17 @@ bool amd_mce_is_memory_error(struct mce *m)
> >  	return m->bank == 4 && xec == 0x8;
> >  }
> >  
> > +void smca_feature_init(void)
> > +{
> > +	unsigned int bank;
> > +	u64 mca_cfg;
> > +
> > +	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
> > +		rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(bank), mca_cfg);
> > +		this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(mca_cfg & BIT(8));
> > +	}
> > +}
> 
> We have smca_configure() for SMCA banks init and there it even reads
> MCx_CONFIG.
> 
> Do you guys not see this?
> 
> Or integrating new stuff into the existing code doesn't really matter -
> just bolt it on wherever it works?!
>

This function gets called from __mcheck_cpu_init_early() so that the info is
available before the MCA banks are polled in __mcheck_cpu_init_generic().

Maybe we can avoid some confusion by renaming this new function? It doesn't do
any feature initialization, so maybe smca_feature_detect_early() would be
better? The goal is to get just enough info that's needed before the bulk of
generic and vendor MCA initiliazation happens.

Or do you recommend unifying this with smca_configure()?

Thanks,
Yazen
Borislav Petkov April 13, 2022, 2:54 p.m. UTC | #3
On Wed, Apr 13, 2022 at 02:10:30PM +0000, Yazen Ghannam wrote:
> This function gets called from __mcheck_cpu_init_early() so that the info is
> available before the MCA banks are polled in __mcheck_cpu_init_generic().

Would that work?

I've moved first bank polling into __mcheck_cpu_init_clear_banks()
because, well, this function is clearing the banks so it might as well
poll them first. First bank polling in a init_generic function doesn't
make too much sense anyway.

And __mcheck_cpu_check_banks() functionality is moved into
__mcheck_cpu_init_clear_banks() because, well, silly.

On a quick scan, I don't see problems with such move but the devil is in
the detail.

Hmm?

---

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 99e3ff9607a3..345e068215c4 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1732,21 +1732,8 @@ static void __mcheck_cpu_cap_init(void)
 
 static void __mcheck_cpu_init_generic(void)
 {
-	enum mcp_flags m_fl = 0;
-	mce_banks_t all_banks;
 	u64 cap;
 
-	if (!mca_cfg.bootlog)
-		m_fl = MCP_DONTLOG;
-
-	/*
-	 * Log the machine checks left over from the previous reset. Log them
-	 * only, do not start processing them. That will happen in mcheck_late_init()
-	 * when all consumers have been registered on the notifier chain.
-	 */
-	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
-
 	cr4_set_bits(X86_CR4_MCE);
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -1757,33 +1744,21 @@ static void __mcheck_cpu_init_generic(void)
 static void __mcheck_cpu_init_clear_banks(void)
 {
 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+	enum mcp_flags m_fl = 0;
+	mce_banks_t all_banks;
+	u64 msrval;
 	int i;
 
-	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
-		struct mce_bank *b = &mce_banks[i];
-
-		if (!b->init)
-			continue;
-		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
-		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
-	}
-}
+	if (!mca_cfg.bootlog)
+		m_fl = MCP_DONTLOG;
 
-/*
- * Do a final check to see if there are any unused/RAZ banks.
- *
- * This must be done after the banks have been initialized and any quirks have
- * been applied.
- *
- * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
- * Otherwise, a user who disables a bank will not be able to re-enable it
- * without a system reboot.
- */
-static void __mcheck_cpu_check_banks(void)
-{
-	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-	u64 msrval;
-	int i;
+	/*
+	 * Log the machine checks left over from the previous reset. Log them
+	 * only, do not start processing them. That will happen in mcheck_late_init()
+	 * when all consumers have been registered on the notifier chain.
+	 */
+	bitmap_fill(all_banks, MAX_NR_BANKS);
+	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
 
 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		struct mce_bank *b = &mce_banks[i];
@@ -1791,6 +1766,9 @@ static void __mcheck_cpu_check_banks(void)
 		if (!b->init)
 			continue;
 
+		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+
 		rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
 		b->init = !!msrval;
 	}
@@ -2159,7 +2137,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_clear_banks();
-	__mcheck_cpu_check_banks();
 	__mcheck_cpu_setup_timer();
 }
Luck, Tony April 13, 2022, 3:59 p.m. UTC | #4
On Wed, Apr 13, 2022 at 04:54:00PM +0200, Borislav Petkov wrote:
> +	if (!mca_cfg.bootlog)
> +		m_fl = MCP_DONTLOG;
>  
> -/*
> - * Do a final check to see if there are any unused/RAZ banks.
> - *
> - * This must be done after the banks have been initialized and any quirks have
> - * been applied.
> - *
> - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
> - * Otherwise, a user who disables a bank will not be able to re-enable it
> - * without a system reboot.
> - */
> -static void __mcheck_cpu_check_banks(void)
> -{
> -	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> -	u64 msrval;
> -	int i;
> +	/*
> +	 * Log the machine checks left over from the previous reset. Log them
> +	 * only, do not start processing them. That will happen in mcheck_late_init()
> +	 * when all consumers have been registered on the notifier chain.
> +	 */
> +	bitmap_fill(all_banks, MAX_NR_BANKS);
> +	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);

If MCP_DONTLOG bit is set, then this does little. It will find
banks with valid records, NOT log them, clear them. But then we
loop and clear all banks.

So maybe do:

	if (mca_cfg.bootlog) {
		bitmap_fill(all_banks, MAX_NR_BANKS);
		machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
	}


>  	__mcheck_cpu_init_clear_banks();

This will a new name to indicate that it is logging as well as init & clear.

Otherwise seeems fine.

-Tony
Borislav Petkov April 13, 2022, 4:19 p.m. UTC | #5
On Wed, Apr 13, 2022 at 08:59:41AM -0700, Luck, Tony wrote:
> If MCP_DONTLOG bit is set, then this does little. It will find
> banks with valid records, NOT log them, clear them. But then we
> loop and clear all banks.
> 
> So maybe do:
> 
> 	if (mca_cfg.bootlog) {
> 		bitmap_fill(all_banks, MAX_NR_BANKS);
> 		machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
> 	}

Ack, thx.

> This will a new name to indicate that it is logging as well as init & clear.

Ok.

> Otherwise seeems fine.

Thanks, here it is (untested yet).

---
From: Borislav Petkov <bp@suse.de>
Date: Wed, 13 Apr 2022 18:11:01 +0200
Subject: [PATCH] x86/mce: Cleanup bank processing on init

Unify the bank preparation into __mcheck_cpu_init_clear_banks(), rename
that function to what it does now - prepares banks. Do this so that
generic and vendor banks init goes first so that settings done during
that init can take effect before the first bank polling takes place.

Move __mcheck_cpu_check_banks() into __mcheck_cpu_init_prepare_banks()
as it already loops over the banks.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/mce.h     |  3 +-
 arch/x86/kernel/cpu/mce/core.c | 64 ++++++++++------------------------
 2 files changed, 19 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cc73061e7255..5450df861ec5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -252,8 +252,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 enum mcp_flags {
 	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
 	MCP_UC		= BIT(1),	/* log uncorrected errors */
-	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
-	MCP_QUEUE_LOG	= BIT(3),	/* only queue to genpool */
+	MCP_QUEUE_LOG	= BIT(2),	/* only queue to genpool */
 };
 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 99e3ff9607a3..6e49dda09a2a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -724,9 +724,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 log_it:
 		error_seen = true;
 
-		if (flags & MCP_DONTLOG)
-			goto clear_it;
-
 		mce_read_aux(&m, i);
 		m.severity = mce_severity(&m, NULL, NULL, false);
 		/*
@@ -1693,7 +1690,7 @@ static void __mcheck_cpu_mce_banks_init(void)
 		/*
 		 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
 		 * the required vendor quirks before
-		 * __mcheck_cpu_init_clear_banks() does the final bank setup.
+		 * __mcheck_cpu_init_prepare_banks() does the final bank setup.
 		 */
 		b->ctl = -1ULL;
 		b->init = true;
@@ -1732,21 +1729,8 @@ static void __mcheck_cpu_cap_init(void)
 
 static void __mcheck_cpu_init_generic(void)
 {
-	enum mcp_flags m_fl = 0;
-	mce_banks_t all_banks;
 	u64 cap;
 
-	if (!mca_cfg.bootlog)
-		m_fl = MCP_DONTLOG;
-
-	/*
-	 * Log the machine checks left over from the previous reset. Log them
-	 * only, do not start processing them. That will happen in mcheck_late_init()
-	 * when all consumers have been registered on the notifier chain.
-	 */
-	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
-
 	cr4_set_bits(X86_CR4_MCE);
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -1754,36 +1738,22 @@ static void __mcheck_cpu_init_generic(void)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 }
 
-static void __mcheck_cpu_init_clear_banks(void)
+static void __mcheck_cpu_init_prepare_banks(void)
 {
 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+	mce_banks_t all_banks;
+	u64 msrval;
 	int i;
 
-	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
-		struct mce_bank *b = &mce_banks[i];
-
-		if (!b->init)
-			continue;
-		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
-		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+	/*
+	 * Log the machine checks left over from the previous reset. Log them
+	 * only, do not start processing them. That will happen in mcheck_late_init()
+	 * when all consumers have been registered on the notifier chain.
+	 */
+	if (mca_cfg.bootlog) {
+		bitmap_fill(all_banks, MAX_NR_BANKS);
+		machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
 	}
-}
-
-/*
- * Do a final check to see if there are any unused/RAZ banks.
- *
- * This must be done after the banks have been initialized and any quirks have
- * been applied.
- *
- * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
- * Otherwise, a user who disables a bank will not be able to re-enable it
- * without a system reboot.
- */
-static void __mcheck_cpu_check_banks(void)
-{
-	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-	u64 msrval;
-	int i;
 
 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		struct mce_bank *b = &mce_banks[i];
@@ -1791,6 +1761,9 @@ static void __mcheck_cpu_check_banks(void)
 		if (!b->init)
 			continue;
 
+		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+
 		rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
 		b->init = !!msrval;
 	}
@@ -2158,8 +2131,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_early(c);
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
-	__mcheck_cpu_init_clear_banks();
-	__mcheck_cpu_check_banks();
+	__mcheck_cpu_init_prepare_banks();
 	__mcheck_cpu_setup_timer();
 }
 
@@ -2327,7 +2299,7 @@ static void mce_syscore_resume(void)
 {
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
-	__mcheck_cpu_init_clear_banks();
+	__mcheck_cpu_init_prepare_banks();
 }
 
 static struct syscore_ops mce_syscore_ops = {
@@ -2345,7 +2317,7 @@ static void mce_cpu_restart(void *data)
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 		return;
 	__mcheck_cpu_init_generic();
-	__mcheck_cpu_init_clear_banks();
+	__mcheck_cpu_init_prepare_banks();
 	__mcheck_cpu_init_timer();
 }
Yazen Ghannam April 13, 2022, 7:40 p.m. UTC | #6
On Wed, Apr 13, 2022 at 06:19:11PM +0200, Borislav Petkov wrote:

...

>  static void __mcheck_cpu_init_generic(void)
>  {
> -	enum mcp_flags m_fl = 0;
> -	mce_banks_t all_banks;
>  	u64 cap;
>  
> -	if (!mca_cfg.bootlog)
> -		m_fl = MCP_DONTLOG;
> -
> -	/*
> -	 * Log the machine checks left over from the previous reset. Log them
> -	 * only, do not start processing them. That will happen in mcheck_late_init()
> -	 * when all consumers have been registered on the notifier chain.
> -	 */
> -	bitmap_fill(all_banks, MAX_NR_BANKS);
> -	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
> -
>  	cr4_set_bits(X86_CR4_MCE);

I think the init logic breaks here. MCE now gets enabled before clearing old
errors. So it's possible that the old errors get overwritten by new ones.

>  
>  	rdmsrl(MSR_IA32_MCG_CAP, cap);
> @@ -1754,36 +1738,22 @@ static void __mcheck_cpu_init_generic(void)
>  		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
>  }
>  
> -static void __mcheck_cpu_init_clear_banks(void)
> +static void __mcheck_cpu_init_prepare_banks(void)
>  {
>  	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> +	mce_banks_t all_banks;
> +	u64 msrval;
>  	int i;
>  
> -	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
> -		struct mce_bank *b = &mce_banks[i];
> -
> -		if (!b->init)
> -			continue;
> -		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
> -		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
> +	/*
> +	 * Log the machine checks left over from the previous reset. Log them
> +	 * only, do not start processing them. That will happen in mcheck_late_init()
> +	 * when all consumers have been registered on the notifier chain.
> +	 */
> +	if (mca_cfg.bootlog) {
> +		bitmap_fill(all_banks, MAX_NR_BANKS);
> +		machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
>  	}
> -}
> -
> -/*
> - * Do a final check to see if there are any unused/RAZ banks.
> - *
> - * This must be done after the banks have been initialized and any quirks have
> - * been applied.
> - *
> - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
> - * Otherwise, a user who disables a bank will not be able to re-enable it
> - * without a system reboot.
> - */
> -static void __mcheck_cpu_check_banks(void)
> -{
> -	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
> -	u64 msrval;
> -	int i;
>  
>  	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
>  		struct mce_bank *b = &mce_banks[i];
> @@ -1791,6 +1761,9 @@ static void __mcheck_cpu_check_banks(void)
>  		if (!b->init)
>  			continue;
>  
> +		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
> +		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);

Same idea here. STATUS should be cleared before turning on reporting in a bank
using MCA_CTL.

> +
>  		rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
>  		b->init = !!msrval;
>  	}
> @@ -2158,8 +2131,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
>  	__mcheck_cpu_init_early(c);
>  	__mcheck_cpu_init_generic();
>  	__mcheck_cpu_init_vendor(c);
> -	__mcheck_cpu_init_clear_banks();
> -	__mcheck_cpu_check_banks();
> +	__mcheck_cpu_init_prepare_banks();

I think moving __mcheck_cpu_init_generic() after
__mcheck_cpu_init_prepare_banks() and swapping the MCA_STATUS and MCA_CTL
writes above would be correct.

In summary:
1) __mcheck_cpu_init_vendor()
2) __mcheck_cpu_init_prepare_banks()
  a) Read and clear MCA_STATUS.
  b) Initialize MCA_CTL.
3) __mcheck_cpu_init_generic()

I realize this is still different than the original flow. But it seems to
maintain the goal of this patch. And it actually matches the AMD documentation
more closely than the original flow.

One downside though is that the system goes longer with CR4.MCE cleared. So
there's greater risk of encountering a shutdown due to a machine check error.

Thanks,
Yazen
Borislav Petkov April 14, 2022, 9:11 a.m. UTC | #7
On Wed, Apr 13, 2022 at 07:40:39PM +0000, Yazen Ghannam wrote:
> I think the init logic breaks here. MCE now gets enabled before clearing old
> errors. So it's possible that the old errors get overwritten by new ones.

Err, I don't understand. CR4.MCE bit description has:

"Regardless of whether machine-check exceptions are enabled, the
processor records enabled-errors when they occur."

I'm guessing enabled errors are those for which the respective bits in
the MCi_CTL banks are set. And I think the CPU comes out of reset with
those bits set.

So the overwriting will happen regardless.

The only difference here is that "[s]etting MCE to 1 enables the
machine-check exception mechanism." So you'll get a #MC raised vs
shutdown on a fatal error.

Or am I missing an angle?

> > @@ -1791,6 +1761,9 @@ static void __mcheck_cpu_check_banks(void)
> >  		if (!b->init)
> >  			continue;
> >  
> > +		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
> > +		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
> 
> Same idea here. STATUS should be cleared before turning on reporting in a bank
> using MCA_CTL.

Look at the current code. Called in this order:

__mcheck_cpu_init_clear_banks:
        wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
        wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
__mcheck_cpu_check_banks
	rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
        b->init = !!msrval;

STATUS *is* cleared after MCA_CTL now too.

If this ordering is wrong - and it sounds like it is - then this needs
to be a separate patch and Cc: <stable@vger.kernel.org> and needs to go
in now.

> One downside though is that the system goes longer with CR4.MCE cleared. So
> there's greater risk of encountering a shutdown due to a machine check error.

Yeah, I don't think the couple of msecs matter.

Thx.
Yazen Ghannam April 15, 2022, 2:56 p.m. UTC | #8
On Thu, Apr 14, 2022 at 11:11:01AM +0200, Borislav Petkov wrote:
> On Wed, Apr 13, 2022 at 07:40:39PM +0000, Yazen Ghannam wrote:
> > I think the init logic breaks here. MCE now gets enabled before clearing old
> > errors. So it's possible that the old errors get overwritten by new ones.
> 
> Err, I don't understand. CR4.MCE bit description has:
> 
> "Regardless of whether machine-check exceptions are enabled, the
> processor records enabled-errors when they occur."
> 
> I'm guessing enabled errors are those for which the respective bits in
> the MCi_CTL banks are set. And I think the CPU comes out of reset with
> those bits set.
> 
> So the overwriting will happen regardless.
> 
> The only difference here is that "[s]etting MCE to 1 enables the
> machine-check exception mechanism." So you'll get a #MC raised vs
> shutdown on a fatal error.
> 
> Or am I missing an angle?
>

I agree with you about the CR4.MCE statement. But MCi_CTL needs to be set by
system software. The reset value is '0' at least on AMD systems.

Here's a example scenario.

1) OS has fully booted.
   a) MCi_CTL, MCG_CTL, CR4.MCE are all enabled.
2) Fatal MCA error occurs causing hardware-initialzed reset. No OS handling.
   a) MCA state info is warm reset persistent.
   b) MCi_STATUS, MCi_ADDR, etc. have valid info.
   c) MCi_CTL, MCG_CTL, CR4.MCE are all set to reset value: '0'.
3) OS, or optionally BIOS, polls MCA banks and logs any valid errors.
   a) Since MCi_CTL, etc. are cleared due to reset, any errors detected are
      from before the reset.
4) MCi_STATUS is cleared to discard old error information.
5) MCA is initiliazed (MCi_CTL, MCG_CTL, CR4.MCE, etc.). Any error detected
   now is a new error from this session.
 
> > > @@ -1791,6 +1761,9 @@ static void __mcheck_cpu_check_banks(void)
> > >  		if (!b->init)
> > >  			continue;
> > >  
> > > +		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
> > > +		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
> > 
> > Same idea here. STATUS should be cleared before turning on reporting in a bank
> > using MCA_CTL.
> 
> Look at the current code. Called in this order:
> 
> __mcheck_cpu_init_clear_banks:
>         wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
>         wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
> __mcheck_cpu_check_banks
> 	rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
>         b->init = !!msrval;
> 
> STATUS *is* cleared after MCA_CTL now too.
> 
> If this ordering is wrong - and it sounds like it is - then this needs
> to be a separate patch and Cc: <stable@vger.kernel.org> and needs to go
> in now.
>

I agree. The Intel SDM and AMD APM have the following procedure, in summary.

1) Set MCG_CTL
2) Set MCi_CTL for all banks
3) Read MCi_STATUS and log valid errors.
4) Clear MCi_STATUS
5) Set CR4.MCE

I don't know of a reason why STATUS needs to be cleared after MCi_CTL is set.
The only thing I can think of is that enabling MCi_CTL may cause spurious info
logged in MCi_STATUS, and that needs to be cleared out. I'm asking AMD folks
about it.

Of course, this contradicts the flow I outlined above, and also the flow given
in the AMD Processor Programming Reference (PPR). I wonder if the
architectural documents have gotten stale compared to current guidelines. I'm
asking about this too.

Tony,
Do you have any thoughts on this?

> > One downside though is that the system goes longer with CR4.MCE cleared. So
> > there's greater risk of encountering a shutdown due to a machine check error.
> 
> Yeah, I don't think the couple of msecs matter.
>

Okay, yeah then maybe there should be another small patch to bring the init
flow closer to x86 documentation.

Thanks,
Yazen
Luck, Tony April 15, 2022, 4:37 p.m. UTC | #9
On Fri, Apr 15, 2022 at 02:56:54PM +0000, Yazen Ghannam wrote:
> 3) OS, or optionally BIOS, polls MCA banks and logs any valid errors.
>    a) Since MCi_CTL, etc. are cleared due to reset, any errors detected are
>       from before the reset.

On Intel not quite any error. H/w can still log to a bank but MCi_STATUS.EN bit
will be zero. We've also had some BIOS code that did things that logged errors
and then left them for the OS to find during boot.

But this sequence does give more confidence that errors found in banks duing
boot are "old".

> I agree. The Intel SDM and AMD APM have the following procedure, in summary.
> 
> 1) Set MCG_CTL
> 2) Set MCi_CTL for all banks
> 3) Read MCi_STATUS and log valid errors.
> 4) Clear MCi_STATUS
> 5) Set CR4.MCE

Yes. That's what the pseudo-code in Intel SDM Example 15-1 says :-(
> 
> I don't know of a reason why STATUS needs to be cleared after MCi_CTL is set.
> The only thing I can think of is that enabling MCi_CTL may cause spurious info
> logged in MCi_STATUS, and that needs to be cleared out. I'm asking AMD folks
> about it.
> 
> Of course, this contradicts the flow I outlined above, and also the flow given
> in the AMD Processor Programming Reference (PPR). I wonder if the
> architectural documents have gotten stale compared to current guidelines. I'm
> asking about this too.

I will ask architects about this sequence too.

-Tony
Yazen Ghannam June 9, 2022, 7:19 p.m. UTC | #10
On Fri, Apr 15, 2022 at 09:37:57AM -0700, Luck, Tony wrote:
> On Fri, Apr 15, 2022 at 02:56:54PM +0000, Yazen Ghannam wrote:
> > 3) OS, or optionally BIOS, polls MCA banks and logs any valid errors.
> >    a) Since MCi_CTL, etc. are cleared due to reset, any errors detected are
> >       from before the reset.
> 
> On Intel not quite any error. H/w can still log to a bank but MCi_STATUS.EN bit
> will be zero. We've also had some BIOS code that did things that logged errors
> and then left them for the OS to find during boot.
> 
> But this sequence does give more confidence that errors found in banks duing
> boot are "old".
> 
> > I agree. The Intel SDM and AMD APM have the following procedure, in summary.
> > 
> > 1) Set MCG_CTL
> > 2) Set MCi_CTL for all banks
> > 3) Read MCi_STATUS and log valid errors.
> > 4) Clear MCi_STATUS
> > 5) Set CR4.MCE
> 
> Yes. That's what the pseudo-code in Intel SDM Example 15-1 says :-(
> > 
> > I don't know of a reason why STATUS needs to be cleared after MCi_CTL is set.
> > The only thing I can think of is that enabling MCi_CTL may cause spurious info
> > logged in MCi_STATUS, and that needs to be cleared out. I'm asking AMD folks
> > about it.
> > 
> > Of course, this contradicts the flow I outlined above, and also the flow given
> > in the AMD Processor Programming Reference (PPR). I wonder if the
> > architectural documents have gotten stale compared to current guidelines. I'm
> > asking about this too.
> 
> I will ask architects about this sequence too.
>

Hi everyone,
It looks like the discrepancy between the Linux code and the x86 documents
isn't a major concern for AMD systems. However, it is highly recommended that
the banks are polled before enabling MCA to find any errors from before OS
boot. It is possible that BIOS may enable MCA before the OS on some systems,
but this isn't always the case.

Tony,
Did you get any feedback regarding the sequence above?

Also, please see the patch below which is based on Boris' patch from earlier
in this thread.

Thanks,
Yazen

-------

From dc4f5b862080daae1aae22f1ec460d9c4c8b6d20 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Thu, 19 May 2022 17:25:47 +0000
Subject: [PATCH] x86/mce: Remove __mcheck_cpu_init_early()

The __mcheck_cpu_init_early() function was introduced so that some
vendor-specific features are detected before the first MCA polling event
done in __mcheck_cpu_init_generic().

Currently, __mcheck_cpu_init_early() is only used on AMD-based systems and
additional code will be needed to support various system configurations.

However, the current and future vendor-specific code should be done during
vendor init. This keeps all the vendor code in a common location and
simplifies the generic init flow.

Move all the __mcheck_cpu_init_early() code into mce_amd_feature_init().
Also, move __mcheck_cpu_init_generic() after
__mcheck_cpu_init_prepare_banks() so that MCA is enabled after the first
MCA polling event.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/kernel/cpu/mce/amd.c  |  4 ++++
 arch/x86/kernel/cpu/mce/core.c | 20 +++-----------------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501e0fa3..f65224a2b02d 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -681,6 +681,10 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 	u32 low = 0, high = 0, address = 0;
 	int offset = -1;
 
+	mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
+	mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
+	mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
+	mce_flags.amd_threshold	 = 1;
 
 	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
 		if (mce_flags.smca)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5f406d135d32..9efd6d010e2d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1906,19 +1906,6 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 	return 0;
 }
 
-/*
- * Init basic CPU features needed for early decoding of MCEs.
- */
-static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
-{
-	if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
-		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
-		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
-		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
-		mce_flags.amd_threshold	 = 1;
-	}
-}
-
 static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
 {
 	struct mca_config *cfg = &mca_cfg;
@@ -2139,10 +2126,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 
 	mca_cfg.initialized = 1;
 
-	__mcheck_cpu_init_early(c);
-	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_prepare_banks();
+	__mcheck_cpu_init_generic();
 	__mcheck_cpu_setup_timer();
 }
 
@@ -2308,9 +2294,9 @@ static void mce_syscore_shutdown(void)
  */
 static void mce_syscore_resume(void)
 {
-	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
 	__mcheck_cpu_init_prepare_banks();
+	__mcheck_cpu_init_generic();
 }
 
 static struct syscore_ops mce_syscore_ops = {
@@ -2327,8 +2313,8 @@ static void mce_cpu_restart(void *data)
 {
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 		return;
-	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_prepare_banks();
+	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_timer();
 }
Borislav Petkov June 27, 2022, 3:56 p.m. UTC | #11
On Thu, Jun 09, 2022 at 07:19:29PM +0000, Yazen Ghannam wrote:
> Also, please see the patch below which is based on Boris' patch from earlier
> in this thread.

Looks good to me. You can send me one against latest tip:ras/core
whenever you can.
Yazen Ghannam July 12, 2022, 1:51 p.m. UTC | #12
On Mon, Jun 27, 2022 at 05:56:34PM +0200, Borislav Petkov wrote:
> On Thu, Jun 09, 2022 at 07:19:29PM +0000, Yazen Ghannam wrote:
> > Also, please see the patch below which is based on Boris' patch from earlier
> > in this thread.
> 
> Looks good to me. You can send me one against latest tip:ras/core
> whenever you can.
>

Thanks. Do you want a set with yours, mine, and Smita's patches all together?

-Yazen
Borislav Petkov July 12, 2022, 2:08 p.m. UTC | #13
On Tue, Jul 12, 2022 at 01:51:25PM +0000, Yazen Ghannam wrote:
> Thanks. Do you want a set with yours, mine, and Smita's patches all together?

Sure, whatever you have ready.

No hurry, though, we'll probably have merge window next week...

Thx.
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index f809eacac523..4f2744324d9b 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -722,6 +722,17 @@  bool amd_mce_is_memory_error(struct mce *m)
 	return m->bank == 4 && xec == 0x8;
 }
 
+void smca_feature_init(void)
+{
+	unsigned int bank;
+	u64 mca_cfg;
+
+	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
+		rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(bank), mca_cfg);
+		this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(mca_cfg & BIT(8));
+	}
+}
+
 static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
 {
 	struct mce m;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 39614c19da25..99e3ff9607a3 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -67,13 +67,7 @@  DEFINE_PER_CPU(unsigned, mce_exception_count);
 
 DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
 
-struct mce_bank {
-	u64			ctl;			/* subevents to enable */
-
-	__u64 init			: 1,		/* initialise bank? */
-	      __reserved_1		: 63;
-};
-static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
 
 #define ATTR_LEN               16
 /* One object for each MCE bank, shared by all CPUs */
@@ -1935,6 +1929,9 @@  static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
 		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
 		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
 		mce_flags.amd_threshold	 = 1;
+
+		if (mce_flags.smca)
+			smca_feature_init();
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 64dbae6b8a09..0f4934fb3d93 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -177,6 +177,22 @@  struct mce_vendor_flags {
 
 extern struct mce_vendor_flags mce_flags;
 
+struct mce_bank {
+	u64			ctl;			/* subevents to enable */
+
+	__u64 init			: 1,		/* initialise bank? */
+
+	/*
+	 * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: This bit indicates
+	 * the LSB field is found in MCA_STATUS, when set.
+	 */
+	      lsb_in_status		: 1,
+
+	      __reserved_1		: 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
 enum mca_msr {
 	MCA_CTL,
 	MCA_STATUS,
@@ -190,7 +206,9 @@  extern bool filter_mce(struct mce *m);
 #ifdef CONFIG_X86_MCE_AMD
 extern bool amd_filter_mce(struct mce *m);
 
-/* Extract [55:<lsb>] where lsb is the LS-*valid* bit of the address bits. */
+/* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0], else in bits [55:0] of MCA_ADDR.
+ */
 static __always_inline void smca_extract_err_addr(struct mce *m)
 {
 	u8 lsb;
@@ -198,14 +216,22 @@  static __always_inline void smca_extract_err_addr(struct mce *m)
 	if (!mce_flags.smca)
 		return;
 
-	lsb = (m->addr >> 56) & 0x3f;
+	if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+		lsb = (m->status >> 24) & 0x3f;
 
-	m->addr &= GENMASK_ULL(55, lsb);
+		m->addr &= GENMASK_ULL(56, lsb);
+	} else {
+		lsb = (m->addr >> 56) & 0x3f;
+
+		m->addr &= GENMASK_ULL(55, lsb);
+	}
 }
 
+void smca_feature_init(void);
 #else
 static inline bool amd_filter_mce(struct mce *m) { return false; }
 static inline void smca_extract_err_addr(struct mce *m) { }
+static inline void smca_feature_init(void) { }
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE