diff mbox series

x86/mce: drop cpu_missing since we have more capable mce_missing_cpus

Message ID 20211108082832.142436-1-zhangzl2013@126.com (mailing list archive)
State New, archived
Headers show
Series x86/mce: drop cpu_missing since we have more capable mce_missing_cpus | expand

Commit Message

Zhaolong Zhang Nov. 8, 2021, 8:28 a.m. UTC
Move the mce_missing_cpus check into mce_panic() as well, because we don't want
to lose the missing-CPU information in case mca_cfg.tolerant > 1 and no_way_out
is set.

Signed-off-by: Zhaolong Zhang <zhangzl2013@126.com>
---
 arch/x86/kernel/cpu/mce/core.c | 38 ++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 16 deletions(-)

Comments

Borislav Petkov Nov. 8, 2021, 9:31 a.m. UTC | #1
On Mon, Nov 08, 2021 at 04:28:32PM +0800, Zhaolong Zhang wrote:
> move mce_missing_cpus checking into mce_panic() as well, because we don't want
> to lose the cpu missing information in case mca_cfg.tolerant > 1 and there is
> no_way_out.
> 
> Signed-off-by: Zhaolong Zhang <zhangzl2013@126.com>
> ---
>  arch/x86/kernel/cpu/mce/core.c | 38 ++++++++++++++++++++--------------
>  1 file changed, 22 insertions(+), 16 deletions(-)

I was actually expecting to see something like this:

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6ed365337a3b..30de00fe0d7a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = {
 
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static unsigned long mce_need_notify;
-static int cpu_missing;
 
 /*
  * MCA banks polled by the period polling timer for corrected events.
@@ -314,8 +313,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
 		if (!apei_err)
 			apei_err = apei_write_mce(final);
 	}
-	if (cpu_missing)
-		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 	if (exp)
 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
 	if (!fake_panic) {
@@ -891,7 +888,6 @@ static int mce_timed_out(u64 *t, const char *msg)
 					 cpumask_pr_args(&mce_missing_cpus));
 			mce_panic(msg, NULL, NULL);
 		}
-		cpu_missing = 1;
 		return 1;
 	}
 	*t -= SPINUNIT;
@@ -2702,7 +2698,6 @@ struct dentry *mce_get_debugfs_dir(void)
 
 static void mce_reset(void)
 {
-	cpu_missing = 0;
 	atomic_set(&mce_fake_panicked, 0);
 	atomic_set(&mce_executing, 0);
 	atomic_set(&mce_callin, 0);
Zhaolong Zhang Nov. 8, 2021, 10:13 a.m. UTC | #2
At 2021-11-08 17:31:52, "Borislav Petkov" <bp@alien8.de> wrote:
>On Mon, Nov 08, 2021 at 04:28:32PM +0800, Zhaolong Zhang wrote:
>> move mce_missing_cpus checking into mce_panic() as well, because we don't want
>> to lose the cpu missing information in case mca_cfg.tolerant > 1 and there is
>> no_way_out.
>> 
>> Signed-off-by: Zhaolong Zhang <zhangzl2013@126.com>
>> ---
>>  arch/x86/kernel/cpu/mce/core.c | 38 ++++++++++++++++++++--------------
>>  1 file changed, 22 insertions(+), 16 deletions(-)
>
>I was actually expecting to see something like this:

Hi Boris,

I was concerned that if I simply remove the cpu_missing code, we will lose the log in the
situation where mca_cfg.tolerant > 1 and no_way_out is set afterwards.

Do you think we can safely ignore that situation?

Regards,
Zhaolong


>
>diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>index 6ed365337a3b..30de00fe0d7a 100644
>--- a/arch/x86/kernel/cpu/mce/core.c
>+++ b/arch/x86/kernel/cpu/mce/core.c
>@@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = {
> 
> static DEFINE_PER_CPU(struct mce, mces_seen);
> static unsigned long mce_need_notify;
>-static int cpu_missing;
> 
> /*
>  * MCA banks polled by the period polling timer for corrected events.
>@@ -314,8 +313,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
> 		if (!apei_err)
> 			apei_err = apei_write_mce(final);
> 	}
>-	if (cpu_missing)
>-		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
> 	if (exp)
> 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
> 	if (!fake_panic) {
>@@ -891,7 +888,6 @@ static int mce_timed_out(u64 *t, const char *msg)
> 					 cpumask_pr_args(&mce_missing_cpus));
> 			mce_panic(msg, NULL, NULL);
> 		}
>-		cpu_missing = 1;
> 		return 1;
> 	}
> 	*t -= SPINUNIT;
>@@ -2702,7 +2698,6 @@ struct dentry *mce_get_debugfs_dir(void)
> 
> static void mce_reset(void)
> {
>-	cpu_missing = 0;
> 	atomic_set(&mce_fake_panicked, 0);
> 	atomic_set(&mce_executing, 0);
> 	atomic_set(&mce_callin, 0);
>
>-- 
>Regards/Gruss,
>    Boris.
>
>https://people.kernel.org/tglx/notes-about-netiquette
Borislav Petkov Nov. 8, 2021, 10:31 a.m. UTC | #3
On Mon, Nov 08, 2021 at 06:13:04PM +0800, Zhaolong Zhang wrote:
> I was concerning that if I simply remove the cpu_missing code, we will lose the log in the
> situation where mca_cfg.tolerant > 1 and no_way_out is set afterwards.
> 
> Do you think we can safely ignore that situation?

Well, how likely is it to have such a situation in practice?
Zhaolong Zhang Nov. 8, 2021, 12:47 p.m. UTC | #4
At 2021-11-08 18:31:38, "Borislav Petkov" <bp@alien8.de> wrote:
>On Mon, Nov 08, 2021 at 06:13:04PM +0800, Zhaolong Zhang wrote:
>> I was concerning that if I simply remove the cpu_missing code, we will lose the log in the
>> situation where mca_cfg.tolerant > 1 and no_way_out is set afterwards.
>> 
>> Do you think we can safely ignore that situation?
>
>Well, how likely is to have such a situation in practice?

It is difficult to answer...
But since the current code deals with this situation, I think I should cover it too,
although it is only a log message.

Regards,
Zhaolong
Zhaolong Zhang Nov. 9, 2021, 8:31 a.m. UTC | #5
At 2021-11-08 20:47:59, "Zhaolong Zhang" <zhangzl2013@126.com> wrote:
>At 2021-11-08 18:31:38, "Borislav Petkov" <bp@alien8.de> wrote:
>>On Mon, Nov 08, 2021 at 06:13:04PM +0800, Zhaolong Zhang wrote:
>>> I was concerning that if I simply remove the cpu_missing code, we will lose the log in the
>>> situation where mca_cfg.tolerant > 1 and no_way_out is set afterwards.
>>> 
>>> Do you think we can safely ignore that situation?
>>
>>Well, how likely is to have such a situation in practice?
>
>It is difficult to answer...
>But since current code is dealing with this situation, I think I should cover it too,
>although it is only a piece of log.

Hi Boris,

I reconsidered the situation.
If there is a non-recoverable MCE as well, just let it print that reason. There is indeed no need to
bring up the timeout message: since tolerant was set to a high level in order to ignore the timeout,
we can ignore the timeout message as well.

So simply dropping the cpu_missing variable, as you suggested, should work.

I am not sure whether it should be authored by you or suggested by you.
Anyway, I will post a new patch exactly as you suggested. Please pick it or ignore it as appropriate :)

Thanks,
Zhaolong
Borislav Petkov Nov. 9, 2021, 9:07 a.m. UTC | #6
On Tue, Nov 09, 2021 at 04:31:23PM +0800, Zhaolong Zhang wrote:
> If there is a non-recoverable mce as well, just let it print that
> reason. No need to bring the timeout message indeed. Because since
> the tolerant was set to a high level to ignore the timeout, we can
> eventually ignore them.

Here's how I see it:

	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */

So on normal deployments, no one should fiddle with tolerant levels - so
you'll be running at tolerance level 0 by default and all should print
out. Same for level 1.

Levels 2 and 3 are, to me at least, purely for testing *only*. And,
actually, that error message should be issued regardless of the
tolerance level - only the panicking should be controlled by that. IOW,
that code should do:

        if ((s64)*t < SPINUNIT) {
                if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
                        pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
                                 cpumask_pr_args(&mce_missing_cpus));
                if (mca_cfg.tolerant <= 1)
                        mce_panic(msg, NULL, NULL);
                return 1;
        }

because, regardless of tolerance level, saying that some cores didn't
respond is important info.

You could do that as a separate patch, on top, if you feel like it.

> I am not sure whether it should be authored by you or suggested by
> you.

Suggested is fine.

> Anyway, I will post a new patch exactly as you suggested. Please pick
> it or ignore it as appropriate :)

Thx.
Luck, Tony Nov. 9, 2021, 4:06 p.m. UTC | #7
>        if ((s64)*t < SPINUNIT) {
>                if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
>                        pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
>                                 cpumask_pr_args(&mce_missing_cpus));
>                if (mca_cfg.tolerant <= 1)
>                        mce_panic(msg, NULL, NULL);
>                return 1;
>        }

Just a note that skipping the mce_panic() here isn't going to help much. With some CPUs
stuck not responding to #MC the system is going to lock up or crash for other timeouts in
the next few seconds.

-Tony
Borislav Petkov Nov. 9, 2021, 7:48 p.m. UTC | #8
On Tue, Nov 09, 2021 at 04:06:48PM +0000, Luck, Tony wrote:
> Just a note that skipping the mce_panic() here isn't going to help
> much. With some CPUs stuck not responding to #MC the system is going
> to lock up or crash for other timeouts in the next few seconds.

Yeh, I spent a couple of minutes today staring at this ->tolerant
thing and wondering why we need it at all. I wouldn't mind ripping it
altogether unless you're using it for testing or so.
Luck, Tony Nov. 9, 2021, 7:50 p.m. UTC | #9
>> Just a note that skipping the mce_panic() here isn't going to help
>> much. With some CPUs stuck not responding to #MC the system is going
>> to lock up or crash for other timeouts in the next few seconds.
>
> Yeh, I spent a couple of minutes today staring at this ->tolerant
> thing and wondering why we need it at all. I wouldn't mind ripping it
> altogether unless you're using it for testing or so.

I think it might have been useful before recoverable machine checks. But
now it just seems to cause confusion. I do not ever use it. I would not be
sad to see it go.

-Tony
Borislav Petkov Nov. 9, 2021, 8:21 p.m. UTC | #10
On Tue, Nov 09, 2021 at 07:50:57PM +0000, Luck, Tony wrote:
> I think it might have been useful before recoverable machine checks. But
> now it just seems to cause confusion. I do not ever use it. I would not be
> sad to see it go.

Yeah,

what do we do with the sysfs knob? It probably is an ABI:

/sys/devices/system/machinecheck/machinecheck1/tolerant
/sys/devices/system/machinecheck/machinecheck2/tolerant
...
Luck, Tony Nov. 9, 2021, 8:44 p.m. UTC | #11
> what do we do with the sysfs knob? It probably is an ABI:
>
> /sys/devices/system/machinecheck/machinecheck1/tolerant
> /sys/devices/system/machinecheck/machinecheck2/tolerant

$ git grep tolerant -- Documentation/ABI/
$

An undocumented ABI! Well, not documented with all the other sysfs bits.

It does appear in:
Documentation/x86/x86_64/machinecheck.rst

Of course, like a lot of documentation, it isn't accurate. It wasn't
updated to describe what happens with recoverable errors.
Final paragraph says:

        Note this only makes a difference if the CPU allows recovery
        from a machine check exception. Current x86 CPUs generally do not.

Recovery was first introduced in the Nehalem generation which ark.intel.com
says was launched in Q1'2010. So over a decade.

Choices:
1) Leave the file there, but remove the code that uses the value
2) Delete the file too

Option 1 doesn't break any scripts that look for the file, but may make
people shout louder when they find it no longer does anything.

Option 2 is the more honest approach.


-Tony
Borislav Petkov Nov. 9, 2021, 9:30 p.m. UTC | #12
On Tue, Nov 09, 2021 at 08:44:41PM +0000, Luck, Tony wrote:
> > what do we do with the sysfs knob? It probably is an ABI:
> >
> > /sys/devices/system/machinecheck/machinecheck1/tolerant
> > /sys/devices/system/machinecheck/machinecheck2/tolerant
> 
> $ git grep tolerant -- Documentation/ABI/
> $
> 
> An undocumented ABI! Well, not documented with all the other sysfs bits.
> 
> It does appear in:
> Documentation/x86/x86_64/machinecheck.rst

Yeah, we have some spreading of documentation which is not necessarily
helpful.

> Of course, like a lot of documentation, it isn't accurate. It wasn't
> updated to describe what happens with recoverable errors. Final
> paragraph says:
>
>         Note this only makes a difference if the CPU allows recovery
>         from a machine check exception. Current x86 CPUs generally do
>         not.
>
> Recovery was first introduced in the Nehalem generation which
> ark.intel.com says was launched in Q1'2010. So over a decade.
>
> Choices: 1) Leave the file there, but remove the code that uses the
> value 2) Delete the file too
>
> Option 1 doesn't break any scripts that look for the file, but may
> make people shout louder when they find it no longer does anything.
>
> Option 2 is the more honest approach.

Ack, we can try 2 and see who cries.
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 50a3e455cded..0bb59e68a457 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -99,7 +99,6 @@  struct mca_config mca_cfg __read_mostly = {
 
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static unsigned long mce_need_notify;
-static int cpu_missing;
 
 /*
  * MCA banks polled by the period polling timer for corrected events.
@@ -253,6 +252,12 @@  static atomic_t mce_panicked;
 static int fake_panic;
 static atomic_t mce_fake_panicked;
 
+/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
 /* Panic in progress. Enable interrupts and wait for final IPI */
 static void wait_for_panic(void)
 {
@@ -314,8 +319,13 @@  static void mce_panic(const char *msg, struct mce *final, char *exp)
 		if (!apei_err)
 			apei_err = apei_write_mce(final);
 	}
-	if (cpu_missing)
-		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
+	/*
+	 * cpu_online_mask == &mce_missing_cpus means it is reset and no timeout happens.
+	 */
+	if (!cpumask_equal(cpu_online_mask, &mce_missing_cpus) &&
+	    cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+		pr_emerg(HW_ERR "CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+			 cpumask_pr_args(&mce_missing_cpus));
 	if (exp)
 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
 	if (!fake_panic) {
@@ -880,12 +890,6 @@  static atomic_t mce_executing;
  */
 static atomic_t mce_callin;
 
-/*
- * Track which CPUs entered the MCA broadcast synchronization and which not in
- * order to print holdouts.
- */
-static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
-
 /*
  * Check if a timeout waiting for other CPUs happened.
  */
@@ -904,12 +908,8 @@  static int mce_timed_out(u64 *t, const char *msg)
 		goto out;
 	if ((s64)*t < SPINUNIT) {
 		if (mca_cfg.tolerant <= 1) {
-			if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
-				pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
-					 cpumask_pr_args(&mce_missing_cpus));
 			mce_panic(msg, NULL, NULL);
 		}
-		cpu_missing = 1;
 		return 1;
 	}
 	*t -= SPINUNIT;
@@ -1079,8 +1079,10 @@  static int mce_end(int order)
 
 	if (!timeout)
 		goto reset;
-	if (order < 0)
+	if (order < 0) {
+		timeout = 0;
 		goto reset;
+	}
 
 	/*
 	 * Allow others to run.
@@ -1128,7 +1130,12 @@  static int mce_end(int order)
 reset:
 	atomic_set(&global_nwo, 0);
 	atomic_set(&mce_callin, 0);
-	cpumask_setall(&mce_missing_cpus);
+	/*
+	 * Don't reset mce_missing_cpus after mce_timed_out() hit a timeout, so
+	 * that mce_panic() can report the right thing.
+	 */
+	if (!((s64)timeout < SPINUNIT))
+		cpumask_setall(&mce_missing_cpus);
 	barrier();
 
 	/*
@@ -2720,7 +2727,6 @@  struct dentry *mce_get_debugfs_dir(void)
 
 static void mce_reset(void)
 {
-	cpu_missing = 0;
 	atomic_set(&mce_fake_panicked, 0);
 	atomic_set(&mce_executing, 0);
 	atomic_set(&mce_callin, 0);