Message ID | e8029c58c7d4b5094ec274c78dee01d390317d4d.1449861203.git.tony.luck@intel.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Thu, Dec 10, 2015 at 4:14 PM, Tony Luck <tony.luck@intel.com> wrote: > Extend the severity checking code to add a new context IN_KERN_RECOV > which is used to indicate that the machine check was triggered by code > in the kernel with a fixup entry. > > Add code to check for this situation and respond by altering the return > IP to the fixup address and changing the regs->ax so that the recovery > code knows the physical address of the error. Note that we also set bit > 63 because 0x0 is a legal physical address. > > Major re-work to the tail code in do_machine_check() to make all this > readable/maintainable. One functional change is that tolerant=3 no longer > stops recovery actions. Revert to only skipping sending SIGBUS to the > current process. This is IMO much, much nicer than the old code. Thanks! --Andy
On Thu, Dec 10, 2015 at 04:14:44PM -0800, Tony Luck wrote: > Extend the severity checking code to add a new context IN_KERN_RECOV > which is used to indicate that the machine check was triggered by code > in the kernel with a fixup entry. > > Add code to check for this situation and respond by altering the return > IP to the fixup address and changing the regs->ax so that the recovery > code knows the physical address of the error. Note that we also set bit > 63 because 0x0 is a legal physical address. > > Major re-work to the tail code in do_machine_check() to make all this > readable/maintainable. One functional change is that tolerant=3 no longer > stops recovery actions. Revert to only skipping sending SIGBUS to the > current process. > > Signed-off-by: Tony Luck <tony.luck@intel.com> > --- > arch/x86/kernel/cpu/mcheck/mce-severity.c | 22 +++++++++- > arch/x86/kernel/cpu/mcheck/mce.c | 69 ++++++++++++++++--------------- > 2 files changed, 55 insertions(+), 36 deletions(-) > > diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c > index 9c682c222071..ac7fbb0689fb 100644 > --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c > +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c > @@ -12,6 +12,7 @@ > #include <linux/kernel.h> > #include <linux/seq_file.h> > #include <linux/init.h> > +#include <linux/module.h> > #include <linux/debugfs.h> > #include <asm/mce.h> > > @@ -29,7 +30,7 @@ > * panic situations) > */ > > -enum context { IN_KERNEL = 1, IN_USER = 2 }; > +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; > enum ser { SER_REQUIRED = 1, NO_SER = 2 }; > enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; > > @@ -48,6 +49,7 @@ static struct severity { > #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } > #define KERNEL .context = IN_KERNEL > #define USER .context = IN_USER > +#define KERNEL_RECOV .context = IN_KERNEL_RECOV > #define SER .ser = SER_REQUIRED > #define NOSER .ser = NO_SER > #define EXCP .excp = EXCP_CONTEXT > @@ -87,6 +89,10 @@ static struct severity { > EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) > ), > MCESEV( > + PANIC, "In kernel and no restart IP", > + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) > + ), > + MCESEV( > DEFERRED, "Deferred error", > NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) > ), > @@ -123,6 +129,11 @@ static struct severity { > MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) > ), > MCESEV( > + AR, "Action required: data load error recoverable area of kernel", ... in ... > + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), > + KERNEL_RECOV > + ), > + MCESEV( > AR, "Action required: data load error in a user process", > SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), > USER > @@ -170,6 +181,9 @@ static struct severity { > ) /* always matches. keep at end */ > }; > > +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ > + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) > + > /* > * If mcgstatus indicated that ip/cs on the stack were > * no good, then "m->cs" will be zero and we will have > @@ -183,7 +197,11 @@ static struct severity { > */ > static int error_context(struct mce *m) > { > - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; > + if ((m->cs & 3) == 3) > + return IN_USER; > + if (mc_recoverable(m->mcgstatus) && search_mcexception_tables(m->ip)) > + return IN_KERNEL_RECOV; > + return IN_KERNEL; > } > > /* > diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c > index 9d014b82a124..f2f568ad6409 100644 > --- a/arch/x86/kernel/cpu/mcheck/mce.c > +++ b/arch/x86/kernel/cpu/mcheck/mce.c > @@ -31,6 +31,7 @@ > #include <linux/types.h> > #include <linux/slab.h> > #include <linux/init.h> > +#include <linux/module.h> > #include <linux/kmod.h> > #include <linux/poll.h> > #include <linux/nmi.h> > @@ -958,6 +959,20 @@ static void mce_clear_state(unsigned long *toclear) > } > } > > +static int do_memory_failure(struct mce *m) > +{ > + int flags = MF_ACTION_REQUIRED; > + int ret; > + > + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); > + if (!(m->mcgstatus & MCG_STATUS_RIPV)) > + flags |= MF_MUST_KILL; > + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); > + if (ret) > + pr_err("Memory error not recovered"); > + return ret; > +} > + > /* > * The actual machine check handler. This only handles real > * exceptions when something got corrupted coming in through int 18. > @@ -995,8 +1010,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) > DECLARE_BITMAP(toclear, MAX_NR_BANKS); > DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); > char *msg = "Unknown"; > - u64 recover_paddr = ~0ull; > - int flags = MF_ACTION_REQUIRED; > int lmce = 0; > > ist_enter(regs); > @@ -1123,22 +1136,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) > } > > /* > - * At insane "tolerant" levels we take no action. Otherwise > - * we only die if we have no other choice. For less serious > - * issues we try to recover, or limit damage to the current > - * process. > + * If tolerant is at an insane level we drop requests to kill > + * processes and continue even when there is no way out ^ | . Fullstop here. > */ > - if (cfg->tolerant < 3) { > - if (no_way_out) > - mce_panic("Fatal machine check on current CPU", &m, msg); > - if (worst == MCE_AR_SEVERITY) { > - recover_paddr = m.addr; > - if (!(m.mcgstatus & MCG_STATUS_RIPV)) > - flags |= MF_MUST_KILL; > - } else if (kill_it) { > - force_sig(SIGBUS, current); > - } > - } > + if (cfg->tolerant == 3) Btw, I don't see where we limit the input values for that tolerant setting, i.e., user could easily enter something > 3. I think we should add a check in a separate patch to not allow anything except [0-3]. > + kill_it = 0; > + else if (no_way_out) > + mce_panic("Fatal machine check on current CPU", &m, msg); > > if (worst > 0) > mce_report_event(regs); > @@ -1146,25 +1150,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) > out: > sync_core(); > > - if (recover_paddr == ~0ull) > - goto done; > + /* Fault was in user mode and we need to take some action */ > + if ((m.cs & 3) == 3 && (worst == MCE_AR_SEVERITY || kill_it)) { > + ist_begin_non_atomic(regs); > + local_irq_enable(); > > - pr_err("Uncorrected hardware memory error in user-access at %llx", > - recover_paddr); > - /* > - * We must call memory_failure() here even if the current process is > - * doomed. We still need to mark the page as poisoned and alert any > - * other users of the page. > - */ > - ist_begin_non_atomic(regs); > - local_irq_enable(); > - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { > - pr_err("Memory error not recovered"); > - force_sig(SIGBUS, current); > + if (kill_it || do_memory_failure(&m)) > + force_sig(SIGBUS, current); > + local_irq_disable(); > + ist_end_non_atomic(); > } > - local_irq_disable(); > - ist_end_non_atomic(); > -done: > + > + /* Fault was in recoverable area of the kernel */ > + if ((m.cs & 3) != 3 && worst == MCE_AR_SEVERITY) > + if (!fixup_mcexception(regs, m.addr)) > + mce_panic("Failed kernel mode recovery", &m, NULL); ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Does that always imply a failed kernel mode recovery? I don't see (m.cs == 0 and MCE_AR_SEVERITY) MCEs always meaning that a recovery should be attempted there. I think this should simply say mce_panic("Fatal machine check on current CPU", &m, msg); Also, how about taking out that worst and kill_it check. It is a bit more readable this way IMO: --- out: sync_core(); if (worst < MCE_AR_SEVERITY && !kill_it) goto out_ist; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { ist_begin_non_atomic(regs); local_irq_enable(); if (kill_it || do_memory_failure(&m)) force_sig(SIGBUS, current); local_irq_disable(); ist_end_non_atomic(); } else { if (!fixup_mcexception(regs, m.addr)) mce_panic("Fatal machine check on current CPU", &m, NULL); } out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); --- Hmm...
>> + /* Fault was in recoverable area of the kernel */ >> + if ((m.cs & 3) != 3 && worst == MCE_AR_SEVERITY) >> + if (!fixup_mcexception(regs, m.addr)) >> + mce_panic("Failed kernel mode recovery", &m, NULL); > ^^^^^^^^^^^^^^^^^^^^^^^^^^^ > > Does that always imply a failed kernel mode recovery? I don't see > > (m.cs == 0 and MCE_AR_SEVERITY) > > MCEs always meaning that a recovery should be attempted there. I think > this should simply say > > mce_panic("Fatal machine check on current CPU", &m, msg); I don't think this can ever happen. If we were in kernel mode and decided that the severity was AR_SEVERITY ... then search_mcexception_table() found an entry for the IP where the machine check happened. The only way for fixup_exception to fail is if search_mcexception_table() now suddenly doesn't find the entry it found earlier. But if this "can't happen" thing actually does happen ... I'd like the panic message to be different from other mce_panic() so you'll know to blame me. Applied all the other suggestions. -Tony
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c222071..ac7fbb0689fb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/seq_file.h> #include <linux/init.h> +#include <linux/module.h> #include <linux/debugfs.h> #include <asm/mce.h> @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -87,6 +89,10 @@ static struct severity { EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), @@ -123,6 +129,11 @@ static struct severity { MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( + AR, "Action required: data load error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && search_mcexception_tables(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9d014b82a124..f2f568ad6409 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -31,6 +31,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/init.h> +#include <linux/module.h> #include <linux/kmod.h> #include <linux/poll.h> #include <linux/nmi.h> @@ -958,6 +959,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -995,8 +1010,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; ist_enter(regs); @@ -1123,22 +1136,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1146,25 +1150,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3 && (worst == MCE_AR_SEVERITY || kill_it)) { + ist_begin_non_atomic(regs); + local_irq_enable(); - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); } - local_irq_disable(); - ist_end_non_atomic(); -done: + + /* Fault was in recoverable area of the kernel */ + if ((m.cs & 3) != 3 && worst == MCE_AR_SEVERITY) + if (!fixup_mcexception(regs, m.addr)) + mce_panic("Failed kernel mode recovery", &m, NULL); + ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check);
Extend the severity checking code to add a new context IN_KERN_RECOV which is used to indicate that the machine check was triggered by code in the kernel with a fixup entry. Add code to check for this situation and respond by altering the return IP to the fixup address and changing the regs->ax so that the recovery code knows the physical address of the error. Note that we also set bit 63 because 0x0 is a legal physical address. Major re-work to the tail code in do_machine_check() to make all this readable/maintainable. One functional change is that tolerant=3 no longer stops recovery actions. Revert to only skipping sending SIGBUS to the current process. Signed-off-by: Tony Luck <tony.luck@intel.com> --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 22 +++++++++- arch/x86/kernel/cpu/mcheck/mce.c | 69 ++++++++++++++++--------------- 2 files changed, 55 insertions(+), 36 deletions(-)