Message ID | 20240404151359.47970-2-yazen.ghannam@amd.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | MCA Updates | expand |
On Thu, Apr 04, 2024 at 10:13:44AM -0500, Yazen Ghannam wrote: > diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c > index b5cc557cfc37..7a857b33f515 100644 > --- a/arch/x86/kernel/cpu/mce/core.c > +++ b/arch/x86/kernel/cpu/mce/core.c > @@ -117,20 +117,32 @@ static struct irq_work mce_irq_work; > */ > BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); > > -/* Do initial initialization of a struct mce */ > -void mce_setup(struct mce *m) > +void mce_setup_common(struct mce *m) Since we're touching this... mce_setup() is a perfectly wrong name for what it does. So let's clean it up. Diff ontop below. * mce_prep_record() - the name says what the function does. * mce_prep_record_per_cpu() - "per_cpu" as this is a common kernel concept and we do use per_cpu data in there. Please do this in two patches: - the first one renames mce_setup() only without adding the additional functionality - the second one does the split Thx. diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index dfd2e9699bd7..491f3d78c46a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -221,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_setup(struct mce *m); +void mce_prep_record(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9a0133ef7e20..14bf8c232e45 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -780,7 +780,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; - mce_setup(&m); + mce_prep_record(&m); m.status = status; m.misc = misc; diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 7f7309ff67d0..8f509c8a4e98 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -44,7 +44,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); + mce_prep_record(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; @@ -97,7 +97,7 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (ctx_info->reg_arr_size < 48) return -EINVAL; - mce_setup(&m); + mce_prep_record(&m); m.extcpu = -1; m.socketid = -1; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index c0ce2de7fb51..a89508327b0d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -117,7 +117,7 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -void mce_setup_common(struct mce *m) +void mce_prep_record_common(struct mce *m) { memset(m, 0, sizeof(struct mce)); @@ -128,21 +128,21 @@ void mce_setup_common(struct mce *m) m->time = __ktime_get_real_seconds(); } -void mce_setup_for_cpu(unsigned int cpu, struct mce *m) +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) { - m->cpu = cpu; - m->extcpu = cpu; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->microcode = cpu_data(m->extcpu).microcode; - m->ppin = cpu_data(m->extcpu).ppin; - m->socketid = cpu_data(m->extcpu).topo.pkg_id; + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(m->extcpu).topo.initial_apicid; + m->microcode = cpu_data(m->extcpu).microcode; + m->ppin = cpu_data(m->extcpu).ppin; + m->socketid = cpu_data(m->extcpu).topo.pkg_id; } /* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record(struct mce *m) { - mce_setup_common(m); - mce_setup_for_cpu(smp_processor_id(), m); + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); @@ -448,11 +448,11 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(m); instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index e86e53695828..43c7f3b71df5 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -261,8 +261,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); -void mce_setup_common(struct mce *m); -void mce_setup_for_cpu(unsigned int cpu, struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m);
On 4/16/24 06:02, Borislav Petkov wrote: > On Thu, Apr 04, 2024 at 10:13:44AM -0500, Yazen Ghannam wrote: >> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c >> index b5cc557cfc37..7a857b33f515 100644 >> --- a/arch/x86/kernel/cpu/mce/core.c >> +++ b/arch/x86/kernel/cpu/mce/core.c >> @@ -117,20 +117,32 @@ static struct irq_work mce_irq_work; >> */ >> BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); >> >> -/* Do initial initialization of a struct mce */ >> -void mce_setup(struct mce *m) >> +void mce_setup_common(struct mce *m) > > Since we're touching this... > > mce_setup() is a perfectly wrong name for what it does. So let's clean > it up. Diff ontop below. > > * mce_prep_record() - the name says what the function does. > > * mce_prep_record_per_cpu() - "per_cpu" as this is a common kernel > concept and we do use per_cpu data in there. > > Please do this in two patches: > > - the first one renames mce_setup() only without adding the additional > functionality > > - the second one does the split > > Thx. > Okay, will do. Should I send another revision of this entire set? Or should I split out the mce_setup() patches? Thanks, Yazen
On Wed, Apr 17, 2024 at 09:50:58AM -0400, Yazen Ghannam wrote: > Should I send another revision of this entire set? Or should I split out > the mce_setup() patches? I leave it up to you. If it makes sense to keep it all together then wait for me to go through the rest first and send a whole new set or if you want to break this out, that's fine too. Thx.
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b5cc557cfc37..7a857b33f515 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -117,20 +117,32 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_setup_common(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_setup_for_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(m->extcpu).topo.initial_apicid; + m->microcode = cpu_data(m->extcpu).microcode; + m->ppin = cpu_data(m->extcpu).ppin; + m->socketid = cpu_data(m->extcpu).topo.pkg_id; +} + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + mce_setup_common(m); + mce_setup_for_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 01f8f03969e6..e86e53695828 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -261,6 +261,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); +void mce_setup_common(struct mce *m); +void mce_setup_for_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m);
Generally, MCA information for an error is gathered on the CPU that reported the error. In this case, CPU-specific information from the running CPU will be correct. However, this will be incorrect if the MCA information is gathered while running on a CPU that didn't report the error. One example is creating an MCA record using mce_setup() for errors reported from ACPI. Split mce_setup() so that there is a helper function to gather common, i.e. not CPU-specific, information and another helper for CPU-specific information. Leave mce_setup() defined as-is for the common case when running on the reporting CPU. Get MCG_CAP in the global helper even though the register is per-CPU. This value is not already cached per-CPU like other values. And it does not assist with any per-CPU decoding or handling. Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> --- Notes: Link: https://lkml.kernel.org/r/20231118193248.1296798-3-yazen.ghannam@amd.com v1->v2: * Change helper names and pass-in CPU number (Boris) arch/x86/kernel/cpu/mce/core.c | 34 ++++++++++++++++++++---------- arch/x86/kernel/cpu/mce/internal.h | 2 ++ 2 files changed, 25 insertions(+), 11 deletions(-)