diff mbox series

[v3,3/5] x86/mce: Introduce mce_handle_storm() to deal with begin/end of storms

Message ID 20230317172042.117201-4-tony.luck@intel.com (mailing list archive)
State New, archived
Headers show
Series Handle corrected machine check interrupt storms | expand

Commit Message

Tony Luck March 17, 2023, 5:20 p.m. UTC
From: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Intel and AMD need to take different actions when a storm begins or
ends. Prepare for the storm code moving from intel.c into core.c by
adding a function that checks CPU vendor to pick the right action.

No functional changes.

[Tony: Changed from function pointer to regular function]

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/mce/internal.h |  3 +++
 arch/x86/kernel/cpu/mce/core.c     |  9 +++++++++
 arch/x86/kernel/cpu/mce/intel.c    | 12 ++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)

Comments

Yazen Ghannam March 23, 2023, 3:22 p.m. UTC | #1
On Fri, Mar 17, 2023 at 10:20:40AM -0700, Tony Luck wrote:
> From: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> 
> Intel and AMD need to take different actions when a storm begins or
> ends. Prepare for the storm code moving from intel.c into core.c by
> adding a function that checks CPU vendor to pick the right action.
> 
> No functional changes.
> 
> [Tony: Changed from function pointer to regular function]
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/kernel/cpu/mce/internal.h |  3 +++
>  arch/x86/kernel/cpu/mce/core.c     |  9 +++++++++
>  arch/x86/kernel/cpu/mce/intel.c    | 12 ++++++++++--
>  3 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
> index 72fbec8f6c3c..f37816b4d4cf 100644
> --- a/arch/x86/kernel/cpu/mce/internal.h
> +++ b/arch/x86/kernel/cpu/mce/internal.h
> @@ -43,12 +43,14 @@ extern mce_banks_t mce_banks_ce_disabled;
>  void track_cmci_storm(int bank, u64 status);
>  
>  #ifdef CONFIG_X86_MCE_INTEL
> +void mce_intel_handle_storm(int bank, bool on);
>  void cmci_disable_bank(int bank);
>  void intel_init_cmci(void);
>  void intel_init_lmce(void);
>  void intel_clear_lmce(void);
>  bool intel_filter_mce(struct mce *m);
>  #else
> +static inline void mce_intel_handle_storm(int bank, bool on) { }
>  static inline void cmci_disable_bank(int bank) { }
>  static inline void intel_init_cmci(void) { }
>  static inline void intel_init_lmce(void) { }
> @@ -57,6 +59,7 @@ static inline bool intel_filter_mce(struct mce *m) { return false; }
>  #endif
>  
>  void mce_timer_kick(bool storm);
> +void mce_handle_storm(int bank, bool on);
>  
>  #ifdef CONFIG_ACPI_APEI
>  int apei_write_mce(struct mce *m);
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 776d4724b1e0..f4d2a7ba29f7 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -1985,6 +1985,15 @@ static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
>  	intel_clear_lmce();
>  }
>  
> +void mce_handle_storm(int bank, bool on)
> +{
> +	switch (boot_cpu_data.x86_vendor) {
> +	case X86_VENDOR_INTEL:
> +		mce_intel_handle_storm(bank, on);
> +		break;
> +	}
> +}
> +
>  static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
>  {
>  	switch (c->x86_vendor) {
> diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
> index 4106877de028..4238b73c2143 100644
> --- a/arch/x86/kernel/cpu/mce/intel.c
> +++ b/arch/x86/kernel/cpu/mce/intel.c
> @@ -152,6 +152,14 @@ static void cmci_set_threshold(int bank, int thresh)
>  	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>  }
>  
> +void mce_intel_handle_storm(int bank, bool on)
> +{
> +	if (on)
> +		cmci_set_threshold(bank, cmci_threshold[bank]);
> +	else
> +		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);

I think these conditions are reversed. When storm handling is 'on' we should
use CMCI_STORM_THRESHOLD, and when off use the saved bank threshold.

> +}
> +
>  static void cmci_storm_begin(int bank)
>  {
>  	__set_bit(bank, this_cpu_ptr(mce_poll_banks));
> @@ -211,13 +219,13 @@ void track_cmci_storm(int bank, u64 status)
>  		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0))
>  			return;
>  		pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank);
> -		cmci_set_threshold(bank, cmci_threshold[bank]);
> +		mce_handle_storm(bank, true);

Should be 'false' when the storm subsides.

>  		cmci_storm_end(bank);
>  	} else {
>  		if (hweight64(history) < STORM_BEGIN_THRESHOLD)
>  			return;
>  		pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank);
> -		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
> +		mce_handle_storm(bank, false);

Should be 'true' when the storm starts.

>  		cmci_storm_begin(bank);
>  	}
>  }
> --

Thanks,
Yazen
Tony Luck March 23, 2023, 6 p.m. UTC | #2
On Thu, Mar 23, 2023 at 11:22:22AM -0400, Yazen Ghannam wrote:
> On Fri, Mar 17, 2023 at 10:20:40AM -0700, Tony Luck wrote:
> > +void mce_intel_handle_storm(int bank, bool on)
> > +{
> > +	if (on)
> > +		cmci_set_threshold(bank, cmci_threshold[bank]);
> > +	else
> > +		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
> 
> I think these conditions are reversed. When storm handling is 'on' we should
> use CMCI_STORM_THRESHOLD, and when off use the saved bank threshold.
> 
> > +}
> > +
> >  static void cmci_storm_begin(int bank)
> >  {
> >  	__set_bit(bank, this_cpu_ptr(mce_poll_banks));
> > @@ -211,13 +219,13 @@ void track_cmci_storm(int bank, u64 status)
> >  		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0))
> >  			return;
> >  		pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank);
> > -		cmci_set_threshold(bank, cmci_threshold[bank]);
> > +		mce_handle_storm(bank, true);
> 
> Should be 'false' when the storm subsides.
> 
> >  		cmci_storm_end(bank);
> >  	} else {
> >  		if (hweight64(history) < STORM_BEGIN_THRESHOLD)
> >  			return;
> >  		pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank);
> > -		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
> > +		mce_handle_storm(bank, false);
> 
> Should be 'true' when the storm starts.
> 
> >  		cmci_storm_begin(bank);
> >  	}
> >  }

There's a saying that two wrongs do not make a right (but three lefts do).

My code was working, but only because the second mistake cancelled
out the first.

Changing them both as you suggest (diff below) and the code still
works, and makes sense too!

Thanks

-Tony

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 74b560476424..c3e1bb790680 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -677,13 +677,13 @@ void track_cmci_storm(int bank, u64 status)
 		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0))
 			return;
 		pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank);
-		mce_handle_storm(bank, true);
+		mce_handle_storm(bank, false);
 		cmci_storm_end(bank);
 	} else {
 		if (hweight64(history) < STORM_BEGIN_THRESHOLD)
 			return;
 		pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank);
-		mce_handle_storm(bank, false);
+		mce_handle_storm(bank, true);
 		cmci_storm_begin(bank);
 	}
 }
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 6cc9aa97c092..20c2143a68c1 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -134,9 +134,9 @@ static void cmci_set_threshold(int bank, int thresh)
 void mce_intel_handle_storm(int bank, bool on)
 {
 	if (on)
-		cmci_set_threshold(bank, cmci_threshold[bank]);
-	else
 		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+	else
+		cmci_set_threshold(bank, cmci_threshold[bank]);
 }
 
 /*
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 72fbec8f6c3c..f37816b4d4cf 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -43,12 +43,14 @@  extern mce_banks_t mce_banks_ce_disabled;
 void track_cmci_storm(int bank, u64 status);
 
 #ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_handle_storm(int bank, bool on);
 void cmci_disable_bank(int bank);
 void intel_init_cmci(void);
 void intel_init_lmce(void);
 void intel_clear_lmce(void);
 bool intel_filter_mce(struct mce *m);
 #else
+static inline void mce_intel_handle_storm(int bank, bool on) { }
 static inline void cmci_disable_bank(int bank) { }
 static inline void intel_init_cmci(void) { }
 static inline void intel_init_lmce(void) { }
@@ -57,6 +59,7 @@  static inline bool intel_filter_mce(struct mce *m) { return false; }
 #endif
 
 void mce_timer_kick(bool storm);
+void mce_handle_storm(int bank, bool on);
 
 #ifdef CONFIG_ACPI_APEI
 int apei_write_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 776d4724b1e0..f4d2a7ba29f7 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1985,6 +1985,15 @@  static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
 	intel_clear_lmce();
 }
 
+void mce_handle_storm(int bank, bool on)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		mce_intel_handle_storm(bank, on);
+		break;
+	}
+}
+
 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 4106877de028..4238b73c2143 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -152,6 +152,14 @@  static void cmci_set_threshold(int bank, int thresh)
 	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
 
+void mce_intel_handle_storm(int bank, bool on)
+{
+	if (on)
+		cmci_set_threshold(bank, cmci_threshold[bank]);
+	else
+		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+}
+
 static void cmci_storm_begin(int bank)
 {
 	__set_bit(bank, this_cpu_ptr(mce_poll_banks));
@@ -211,13 +219,13 @@  void track_cmci_storm(int bank, u64 status)
 		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0))
 			return;
 		pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank);
-		cmci_set_threshold(bank, cmci_threshold[bank]);
+		mce_handle_storm(bank, true);
 		cmci_storm_end(bank);
 	} else {
 		if (hweight64(history) < STORM_BEGIN_THRESHOLD)
 			return;
 		pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank);
-		cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+		mce_handle_storm(bank, false);
 		cmci_storm_begin(bank);
 	}
 }