Message ID | 20230317172042.117201-4-tony.luck@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Handle corrected machine check interrupt storms | expand |
On Fri, Mar 17, 2023 at 10:20:40AM -0700, Tony Luck wrote: > From: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > > Intel and AMD need to take different actions when a storm begins or > ends. Prepare for the storm code moving from intel.c into core.c by > adding a function that checks CPU vendor to pick the right action. > > No functional changes. > > [Tony: Changed from function pointer to regular function] > > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > Signed-off-by: Tony Luck <tony.luck@intel.com> > --- > arch/x86/kernel/cpu/mce/internal.h | 3 +++ > arch/x86/kernel/cpu/mce/core.c | 9 +++++++++ > arch/x86/kernel/cpu/mce/intel.c | 12 ++++++++++-- > 3 files changed, 22 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h > index 72fbec8f6c3c..f37816b4d4cf 100644 > --- a/arch/x86/kernel/cpu/mce/internal.h > +++ b/arch/x86/kernel/cpu/mce/internal.h > @@ -43,12 +43,14 @@ extern mce_banks_t mce_banks_ce_disabled; > void track_cmci_storm(int bank, u64 status); > > #ifdef CONFIG_X86_MCE_INTEL > +void mce_intel_handle_storm(int bank, bool on); > void cmci_disable_bank(int bank); > void intel_init_cmci(void); > void intel_init_lmce(void); > void intel_clear_lmce(void); > bool intel_filter_mce(struct mce *m); > #else > +static inline void mce_intel_handle_storm(int bank, bool on) { } > static inline void cmci_disable_bank(int bank) { } > static inline void intel_init_cmci(void) { } > static inline void intel_init_lmce(void) { } > @@ -57,6 +59,7 @@ static inline bool intel_filter_mce(struct mce *m) { return false; } > #endif > > void mce_timer_kick(bool storm); > +void mce_handle_storm(int bank, bool on); > > #ifdef CONFIG_ACPI_APEI > int apei_write_mce(struct mce *m); > diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c > index 776d4724b1e0..f4d2a7ba29f7 100644 > --- a/arch/x86/kernel/cpu/mce/core.c > +++ b/arch/x86/kernel/cpu/mce/core.c > @@ -1985,6 +1985,15 @@ static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) > intel_clear_lmce(); > } > > +void mce_handle_storm(int bank, bool on) > +{ > + switch (boot_cpu_data.x86_vendor) { > + case X86_VENDOR_INTEL: > + mce_intel_handle_storm(bank, on); > + break; > + } > +} > + > static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) > { > switch (c->x86_vendor) { > diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c > index 4106877de028..4238b73c2143 100644 > --- a/arch/x86/kernel/cpu/mce/intel.c > +++ b/arch/x86/kernel/cpu/mce/intel.c > @@ -152,6 +152,14 @@ static void cmci_set_threshold(int bank, int thresh) > raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); > } > > +void mce_intel_handle_storm(int bank, bool on) > +{ > + if (on) > + cmci_set_threshold(bank, cmci_threshold[bank]); > + else > + cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); I think these conditions are reversed. When storm handling is 'on' we should use CMCI_STORM_THRESHOLD, and when off use the saved bank threshold. > +} > + > static void cmci_storm_begin(int bank) > { > __set_bit(bank, this_cpu_ptr(mce_poll_banks)); > @@ -211,13 +219,13 @@ void track_cmci_storm(int bank, u64 status) > if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0)) > return; > pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank); > - cmci_set_threshold(bank, cmci_threshold[bank]); > + mce_handle_storm(bank, true); Should be 'false' when the storm subsides. > cmci_storm_end(bank); > } else { > if (hweight64(history) < STORM_BEGIN_THRESHOLD) > return; > pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank); > - cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); > + mce_handle_storm(bank, false); Should be 'true' when the storm starts. > cmci_storm_begin(bank); > } > } > -- Thanks, Yazen
On Thu, Mar 23, 2023 at 11:22:22AM -0400, Yazen Ghannam wrote: > On Fri, Mar 17, 2023 at 10:20:40AM -0700, Tony Luck wrote: > > +void mce_intel_handle_storm(int bank, bool on) > > +{ > > + if (on) > > + cmci_set_threshold(bank, cmci_threshold[bank]); > > + else > > + cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); > > I think these conditions are reversed. When storm handling is 'on' we should > use CMCI_STORM_THRESHOLD, and when off use the saved bank threshold. > > > +} > > + > > static void cmci_storm_begin(int bank) > > { > > __set_bit(bank, this_cpu_ptr(mce_poll_banks)); > > @@ -211,13 +219,13 @@ void track_cmci_storm(int bank, u64 status) > > if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0)) > > return; > > pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank); > > - cmci_set_threshold(bank, cmci_threshold[bank]); > > + mce_handle_storm(bank, true); > > Should be 'false' when the storm subsides. > > > cmci_storm_end(bank); > > } else { > > if (hweight64(history) < STORM_BEGIN_THRESHOLD) > > return; > > pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank); > > - cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); > > + mce_handle_storm(bank, false); > > Should be 'true' when the storm starts. > > > cmci_storm_begin(bank); > > } > > } There's a saying that two wrongs do not make a right (but three lefts do). My code was working, but only because the second mistake cancelled out the first. Changing them both as you suggest (diff below) and the code still works, and makes sense too! Thanks -Tony diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 74b560476424..c3e1bb790680 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -677,13 +677,13 @@ void track_cmci_storm(int bank, u64 status) if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0)) return; pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank); - mce_handle_storm(bank, true); + mce_handle_storm(bank, false); cmci_storm_end(bank); } else { if (hweight64(history) < STORM_BEGIN_THRESHOLD) return; pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank); - mce_handle_storm(bank, false); + mce_handle_storm(bank, true); cmci_storm_begin(bank); } } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 6cc9aa97c092..20c2143a68c1 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -134,9 +134,9 @@ static void cmci_set_threshold(int bank, int thresh) void mce_intel_handle_storm(int bank, bool on) { if (on) - cmci_set_threshold(bank, cmci_threshold[bank]); - else cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); + else + cmci_set_threshold(bank, cmci_threshold[bank]); } /*
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 72fbec8f6c3c..f37816b4d4cf 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -43,12 +43,14 @@ extern mce_banks_t mce_banks_ce_disabled; void track_cmci_storm(int bank, u64 status); #ifdef CONFIG_X86_MCE_INTEL +void mce_intel_handle_storm(int bank, bool on); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); #else +static inline void mce_intel_handle_storm(int bank, bool on) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } @@ -57,6 +59,7 @@ static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(bool storm); +void mce_handle_storm(int bank, bool on); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 776d4724b1e0..f4d2a7ba29f7 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1985,6 +1985,15 @@ static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) intel_clear_lmce(); } +void mce_handle_storm(int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_handle_storm(bank, on); + break; + } +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 4106877de028..4238b73c2143 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -152,6 +152,14 @@ static void cmci_set_threshold(int bank, int thresh) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +void mce_intel_handle_storm(int bank, bool on) +{ + if (on) + cmci_set_threshold(bank, cmci_threshold[bank]); + else + cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); +} + static void cmci_storm_begin(int bank) { __set_bit(bank, this_cpu_ptr(mce_poll_banks)); @@ -211,13 +219,13 @@ void track_cmci_storm(int bank, u64 status) if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD - 1, 0)) return; pr_notice("CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), bank); - cmci_set_threshold(bank, cmci_threshold[bank]); + mce_handle_storm(bank, true); cmci_storm_end(bank); } else { if (hweight64(history) < STORM_BEGIN_THRESHOLD) return; pr_notice("CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), bank); - cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); + mce_handle_storm(bank, false); cmci_storm_begin(bank); } }