
[v3,1/9] x86/mce: handle LMCE locally

Message ID 20170330062003.9119-2-haozhong.zhang@intel.com (mailing list archive)
State New, archived

Commit Message

Haozhong Zhang March 30, 2017, 6:19 a.m. UTC
LMCE is delivered to only one CPU thread, so the MCE handler, barriers
and softirq handler should proceed without waiting for other CPUs when
handling an LMCE. Note that an LMCE is still broadcast to all vCPUs as a
regular MCE on Intel CPUs right now.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>

Changes in v3:
 * Do not add nowait argument to currently unused mce_barrier().
 * Do not access the global variable 'severity_cpu' when handling LMCE.
 * Rename per-cpu flag 'mce_in_process' to 'nonlocal_mce_in_progress'
   to clarify it's for non-local MCE.
 * Add code comments for tricky points.
---
 xen/arch/x86/cpu/mcheck/barrier.c  |  12 ++--
 xen/arch/x86/cpu/mcheck/barrier.h  |  12 +++-
 xen/arch/x86/cpu/mcheck/mcaction.c |   4 +-
 xen/arch/x86/cpu/mcheck/mce.c      | 122 ++++++++++++++++++++++++++++---------
 xen/arch/x86/cpu/mcheck/mce.h      |   2 +
 xen/arch/x86/cpu/mcheck/x86_mca.h  |   4 +-
 6 files changed, 118 insertions(+), 38 deletions(-)
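
For context, the new code distinguishes a local MCE from a broadcast one by testing bit 3 (MCG_STATUS_LMCE) of IA32_MCG_STATUS; the bit definition is added to x86_mca.h at the end of this patch. A minimal stand-alone sketch of that check (the rdmsr stub below is a placeholder, not Xen's mca_rdmsr()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MCG_STATUS_RIPV 0x0000000000000001ULL
#define MCG_STATUS_MCIP 0x0000000000000004ULL
#define MCG_STATUS_LMCE 0x0000000000000008ULL  /* Intel only, bit 3 */

/* Stand-in for reading MSR_IA32_MCG_STATUS in the real handler. */
static uint64_t fake_rdmsr_mcg_status(void)
{
    return MCG_STATUS_MCIP | MCG_STATUS_RIPV | MCG_STATUS_LMCE;
}

int main(void)
{
    uint64_t gstatus = fake_rdmsr_mcg_status();
    bool lmce = (gstatus & MCG_STATUS_LMCE) != 0;

    /* An LMCE was delivered to this CPU only, so the #MC handler and
     * the later softirq can skip the rendezvous barriers (nowait). */
    printf("LMCE=%d -> %s MCE barriers\n", lmce,
           lmce ? "skip" : "synchronize at");
    return 0;
}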

Comments

Jan Beulich March 30, 2017, 2:35 p.m. UTC | #1
>>> On 30.03.17 at 08:19, <haozhong.zhang@intel.com> wrote:
> --- a/xen/arch/x86/cpu/mcheck/mce.c
> +++ b/xen/arch/x86/cpu/mcheck/mce.c
> @@ -42,6 +42,13 @@ DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
>  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
>  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
>  
> +/*
> + * Flag to indicate that at least one non-local MCE on this CPU has
> + * not been completed handled. It's set by mcheck_cmn_handler() and

completely

> + * cleared by mce_softirq().
> + */
> +static DEFINE_PER_CPU(bool, nonlocal_mce_in_progress);

Does a boolean really suffice here? I.e. can't there be a 2nd one
while the first one is still being dealt with?

> @@ -186,7 +193,29 @@ static struct mce_softirq_barrier mce_trap_bar;
>   */
>  static DEFINE_SPINLOCK(mce_logout_lock);
>  
> +/*
> + * 'severity_cpu' is used in both mcheck_cmn_handler() and mce_softirq().
> + *
> + * When handling broadcasting MCE, MCE barriers take effect to prevent
> + * 'severity_cpu' being modified in one function on one CPU and accessed
> + * in another on a different CPU.
> + *
> + * When handling LMCE, mce_barrier_enter() and mce_barrier_exit() are
> + * effectively NOP, so it's possible that mcheck_cmn_handler() is handling
> + * a LMCE on CPUx while mce_softirq() is handling another LMCE on CPUy.
> + * If both are modifying 'severity_cpu', they may interfere with each
> + * other. Therefore, it's better for mcheck_cmn_handler() and mce_softirq()
> + * to avoid accessing 'severity_cpu' when handling LMCE, unless other
> + * approaches are taken to avoid the above interference.
> + */
>  static atomic_t severity_cpu = ATOMIC_INIT(-1);
> +/*
> + * The following two global variables are used only in mcheck_cmn_handler().

In which case the better approach would be to move them into
that function. In no event are these "global" variables - they're
static after all.
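
A sketch of what that suggestion could look like, using the identifiers from the full patch below (an illustration only, not part of the posted patch):

void mcheck_cmn_handler(const struct cpu_user_regs *regs)
{
    /* Used only by this handler, so they can be function-scope statics
     * instead of file-scope ones. */
    static atomic_t found_error = ATOMIC_INIT(0);
    static cpumask_t mce_fatal_cpus;

    /* ... rest of the handler unchanged ... */
}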

> @@ -1700,38 +1736,61 @@ static void mce_softirq(void)
>  {
>      int cpu = smp_processor_id();
>      unsigned int workcpu;
> +    /*
> +     * On platforms supporting broadcasting MCE, if there is a
> +     * non-local MCE waiting for process on this CPU, it implies other
> +     * CPUs received the same MCE as well. Therefore, mce_softirq()
> +     * will be launched on other CPUs as well and compete with this
> +     * mce_softirq() to handle all MCE's on all CPUs. In that case, we
> +     * should use MCE barriers to sync with other CPUs. Otherwise, we
> +     * do not need to wait for other CPUs.
> +     */
> +    bool nowait = !this_cpu(nonlocal_mce_in_progress);
>  
>      mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
>  
> -    mce_barrier_enter(&mce_inside_bar);
> +    mce_barrier_enter(&mce_inside_bar, nowait);
>  
>      /*
> -     * Everybody is here. Now let's see who gets to do the
> -     * recovery work. Right now we just see if there's a CPU
> -     * that did not have any problems, and pick that one.
> -     *
> -     * First, just set a default value: the last CPU who reaches this
> -     * will overwrite the value and become the default.
> +     * When LMCE is being handled and no non-local MCE is waiting for
> +     * process, mce_softirq() does not need to set 'severity_cpu'. And
> +     * it should not, because the modification in this function may
> +     * interfere with the simultaneous modification in mce_cmn_handler()
> +     * on another CPU.
>       */
> -
> -    atomic_set(&severity_cpu, cpu);
> -
> -    mce_barrier_enter(&mce_severity_bar);
> -    if (!mctelem_has_deferred(cpu))
> +    if (!nowait) {
> +        /*
> +         * Everybody is here. Now let's see who gets to do the
> +         * recovery work. Right now we just see if there's a CPU
> +         * that did not have any problems, and pick that one.
> +         *
> +         * First, just set a default value: the last CPU who reaches this
> +         * will overwrite the value and become the default.
> +         */
>          atomic_set(&severity_cpu, cpu);
> -    mce_barrier_exit(&mce_severity_bar);
>  
> -    /* We choose severity_cpu for further processing */
> -    if (atomic_read(&severity_cpu) == cpu) {
> +        mce_barrier_enter(&mce_severity_bar, nowait);
> +        if (!mctelem_has_deferred(cpu))
> +            atomic_set(&severity_cpu, cpu);
> +        mce_barrier_exit(&mce_severity_bar, nowait);

You're in a !nowait conditional - please pass false explicitly to
enter/exit.
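
In other words, inside the !nowait branch those calls would become, roughly:

        mce_barrier_enter(&mce_severity_bar, false);
        if (!mctelem_has_deferred(cpu))
            atomic_set(&severity_cpu, cpu);
        mce_barrier_exit(&mce_severity_bar, false);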

> @@ -1740,7 +1799,14 @@ static void mce_softirq(void)
>          }
>      }
>  
> -    mce_barrier_exit(&mce_inside_bar);
> +    mce_barrier_exit(&mce_inside_bar, nowait);
> +
> +    /*
> +     * At this point, either the only LMCE has been handled, or all MCE's
> +     * on this CPU waiting for process have been handled on this CPU (if

"for processing" or "to be processed" I think.

> --- a/xen/arch/x86/cpu/mcheck/mce.h
> +++ b/xen/arch/x86/cpu/mcheck/mce.h
> @@ -109,12 +109,14 @@ struct mca_summary {
>      int         eipv;   /* meaningful on #MC */
>      bool        uc;     /* UC flag */
>      bool        pcc;    /* PCC flag */
> +    bool        lmce;   /* LMCE flag (Intel only) */
>      bool        recoverable; /* software error recoverable flag */
>  };
>  
>  DECLARE_PER_CPU(struct mca_banks *, poll_bankmask);
>  DECLARE_PER_CPU(struct mca_banks *, no_cmci_banks);
>  DECLARE_PER_CPU(struct mca_banks *, mce_clear_banks);
> +DECLARE_PER_CPU(bool, mce_in_process);

This should have been deleted afaict.

Jan
Haozhong Zhang March 31, 2017, 2:34 a.m. UTC | #2
On 03/30/17 08:35 -0600, Jan Beulich wrote:
> >>> On 30.03.17 at 08:19, <haozhong.zhang@intel.com> wrote:
> > --- a/xen/arch/x86/cpu/mcheck/mce.c
> > +++ b/xen/arch/x86/cpu/mcheck/mce.c
> > @@ -42,6 +42,13 @@ DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
> >  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
> >  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
> >  
> > +/*
> > + * Flag to indicate that at least one non-local MCE on this CPU has
> > + * not been completed handled. It's set by mcheck_cmn_handler() and
> 
> completely
> 
> > + * cleared by mce_softirq().
> > + */
> > +static DEFINE_PER_CPU(bool, nonlocal_mce_in_progress);
> 
> Does a boolean really suffice here? I.e. can't there be a 2nd one
> while the first one is still being dealt with?

It's to indicate the existence of a non-local MCE, and mce_softirq()
will call mctelem_process_deferred() to handle all MCE's if it's true,
so a boolean flag suffices.

Haozhong
Jan Beulich March 31, 2017, 7:24 a.m. UTC | #3
>>> On 31.03.17 at 04:34, <haozhong.zhang@intel.com> wrote:
> On 03/30/17 08:35 -0600, Jan Beulich wrote:
>> >>> On 30.03.17 at 08:19, <haozhong.zhang@intel.com> wrote:
>> > --- a/xen/arch/x86/cpu/mcheck/mce.c
>> > +++ b/xen/arch/x86/cpu/mcheck/mce.c
>> > @@ -42,6 +42,13 @@ DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
>> >  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
>> >  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
>> >  
>> > +/*
>> > + * Flag to indicate that at least one non-local MCE on this CPU has
>> > + * not been completed handled. It's set by mcheck_cmn_handler() and
>> > + * cleared by mce_softirq().
>> > + */
>> > +static DEFINE_PER_CPU(bool, nonlocal_mce_in_progress);
>> 
>> Does a boolean really suffice here? I.e. can't there be a 2nd one
>> while the first one is still being dealt with?
> 
> It's to indicate the existence of a non-local MCE, and mce_softirq()
> will call mctelem_process_deferred() to handle all MCE's if it's true,
> so a boolean flag suffices.

I don't buy this, I'm sorry. What if a 2nd #MC occurs at the
instruction boundary of setting the variable to false (in
mce_softirq())? A subsequent mce_softirq() would then see
the variable being false, wouldn't it?

Jan
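
To illustrate the window Jan describes, and one hypothetical way to close it with a per-CPU counter instead of a boolean (this sketch is illustrative only and is not from the thread or the posted series; DEFINE_PER_CPU, this_cpu() and the atomic_* helpers are the existing Xen primitives):

/*
 * The lost-update window on CPUx:
 *
 *   mce_softirq()                          2nd #MC
 *   -------------                          -------
 *   processes all deferred telemetry
 *                                          mcheck_cmn_handler() sets
 *                                          nonlocal_mce_in_progress = true
 *   this_cpu(nonlocal_mce_in_progress)
 *       = false;                           <-- the new event's mark is lost
 *   (the next mce_softirq() then wrongly takes the nowait path)
 *
 * A per-CPU counter avoids this, because the softirq only subtracts
 * what it has actually seen:
 */
static DEFINE_PER_CPU(atomic_t, nonlocal_mce_count);

/* In mcheck_cmn_handler(), for a non-local MCE: */
    atomic_inc(&this_cpu(nonlocal_mce_count));

/* In mce_softirq(): */
    int seen = atomic_read(&this_cpu(nonlocal_mce_count));
    bool nowait = (seen == 0);
    /* ... process deferred telemetry ... */
    atomic_sub(seen, &this_cpu(nonlocal_mce_count)); /* a late inc survives */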
Haozhong Zhang April 6, 2017, 4:50 a.m. UTC | #4
On 03/31/17 01:24 -0600, Jan Beulich wrote:
> >>> On 31.03.17 at 04:34, <haozhong.zhang@intel.com> wrote:
> > On 03/30/17 08:35 -0600, Jan Beulich wrote:
> >> >>> On 30.03.17 at 08:19, <haozhong.zhang@intel.com> wrote:
> >> > --- a/xen/arch/x86/cpu/mcheck/mce.c
> >> > +++ b/xen/arch/x86/cpu/mcheck/mce.c
> >> > @@ -42,6 +42,13 @@ DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
> >> >  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
> >> >  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
> >> >  
> >> > +/*
> >> > + * Flag to indicate that at least one non-local MCE on this CPU has
> >> > + * not been completed handled. It's set by mcheck_cmn_handler() and
> >> > + * cleared by mce_softirq().
> >> > + */
> >> > +static DEFINE_PER_CPU(bool, nonlocal_mce_in_progress);
> >> 
> >> Does a boolean really suffice here? I.e. can't there be a 2nd one
> >> while the first one is still being dealt with?
> > 
> > It's to indicate the existence of a non-local MCE, and mce_softirq()
> > will call mctelem_process_deferred() to handle all MCE's if it's true,
> > so a boolean flag suffices.
> 
> I don't buy this, I'm sorry. What if a 2nd #MC occurs at the
> instruction boundary of setting the variable to false (in
> mce_softirq())? A subsequent mce_softirq() would then see
> the variable being false, wouldn't it?
> 

You are right. I missed the point that a #MC can happen during
mce_softirq(), so patch 1 is problematic in that case. Let me rethink
this.

Thanks,
Haozhong

Patch

diff --git a/xen/arch/x86/cpu/mcheck/barrier.c b/xen/arch/x86/cpu/mcheck/barrier.c
index 5dce1fb..0b3b091 100644
--- a/xen/arch/x86/cpu/mcheck/barrier.c
+++ b/xen/arch/x86/cpu/mcheck/barrier.c
@@ -16,11 +16,11 @@  void mce_barrier_dec(struct mce_softirq_barrier *bar)
     atomic_dec(&bar->val);
 }
 
-void mce_barrier_enter(struct mce_softirq_barrier *bar)
+void mce_barrier_enter(struct mce_softirq_barrier *bar, bool nowait)
 {
     int gen;
 
-    if (!mce_broadcast)
+    if ( !mce_broadcast || nowait )
         return;
     atomic_inc(&bar->ingen);
     gen = atomic_read(&bar->outgen);
@@ -34,11 +34,11 @@  void mce_barrier_enter(struct mce_softirq_barrier *bar)
     }
 }
 
-void mce_barrier_exit(struct mce_softirq_barrier *bar)
+void mce_barrier_exit(struct mce_softirq_barrier *bar, bool nowait)
 {
     int gen;
 
-    if ( !mce_broadcast )
+    if ( !mce_broadcast || nowait )
         return;
     atomic_inc(&bar->outgen);
     gen = atomic_read(&bar->ingen);
@@ -54,6 +54,6 @@  void mce_barrier_exit(struct mce_softirq_barrier *bar)
 
 void mce_barrier(struct mce_softirq_barrier *bar)
 {
-    mce_barrier_enter(bar);
-    mce_barrier_exit(bar);
+    mce_barrier_enter(bar, false);
+    mce_barrier_exit(bar, false);
 }
diff --git a/xen/arch/x86/cpu/mcheck/barrier.h b/xen/arch/x86/cpu/mcheck/barrier.h
index 87f7550..f2613e6 100644
--- a/xen/arch/x86/cpu/mcheck/barrier.h
+++ b/xen/arch/x86/cpu/mcheck/barrier.h
@@ -25,6 +25,14 @@  void mce_barrier_init(struct mce_softirq_barrier *);
 void mce_barrier_dec(struct mce_softirq_barrier *);
 
 /*
+ * If nowait is true, mce_barrier_enter/exit() will return immediately
+ * without touching the barrier. It's used when handling a LMCE which
+ * is received on only one CPU and thus does not invoke
+ * mce_barrier_enter/exit() calls on all CPUs.
+ *
+ * If nowait is false, mce_barrier_enter/exit() will handle the given
+ * barrier as below.
+ *
  * Increment the generation number and the value. The generation number
  * is incremented when entering a barrier. This way, it can be checked
  * on exit if a CPU is trying to re-enter the barrier. This can happen
@@ -36,8 +44,8 @@  void mce_barrier_dec(struct mce_softirq_barrier *);
  * These barrier functions should always be paired, so that the
  * counter value will reach 0 again after all CPUs have exited.
  */
-void mce_barrier_enter(struct mce_softirq_barrier *);
-void mce_barrier_exit(struct mce_softirq_barrier *);
+void mce_barrier_enter(struct mce_softirq_barrier *, bool nowait);
+void mce_barrier_exit(struct mce_softirq_barrier *, bool nowait);
 
 void mce_barrier(struct mce_softirq_barrier *);
 
diff --git a/xen/arch/x86/cpu/mcheck/mcaction.c b/xen/arch/x86/cpu/mcheck/mcaction.c
index dab9eac..ca17d22 100644
--- a/xen/arch/x86/cpu/mcheck/mcaction.c
+++ b/xen/arch/x86/cpu/mcheck/mcaction.c
@@ -96,7 +96,9 @@  mc_memerr_dhandler(struct mca_binfo *binfo,
 
                 bank->mc_addr = gfn << PAGE_SHIFT |
                   (bank->mc_addr & (PAGE_SIZE -1 ));
-                if (fill_vmsr_data(bank, d, global->mc_gstatus,
+                /* TODO: support injecting LMCE */
+                if (fill_vmsr_data(bank, d,
+                                   global->mc_gstatus & ~MCG_STATUS_LMCE,
                                    vmce_vcpuid == VMCE_INJECT_BROADCAST))
                 {
                     mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index 11d0e23..c1a59bf 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -42,6 +42,13 @@  DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
 
+/*
+ * Flag to indicate that at least one non-local MCE on this CPU has
+ * not been completed handled. It's set by mcheck_cmn_handler() and
+ * cleared by mce_softirq().
+ */
+static DEFINE_PER_CPU(bool, nonlocal_mce_in_progress);
+
 static void intpose_init(void);
 static void mcinfo_clear(struct mc_info *);
 struct mca_banks *mca_allbanks;
@@ -186,7 +193,29 @@  static struct mce_softirq_barrier mce_trap_bar;
  */
 static DEFINE_SPINLOCK(mce_logout_lock);
 
+/*
+ * 'severity_cpu' is used in both mcheck_cmn_handler() and mce_softirq().
+ *
+ * When handling broadcasting MCE, MCE barriers take effect to prevent
+ * 'severity_cpu' being modified in one function on one CPU and accessed
+ * in another on a different CPU.
+ *
+ * When handling LMCE, mce_barrier_enter() and mce_barrier_exit() are
+ * effectively NOP, so it's possible that mcheck_cmn_handler() is handling
+ * a LMCE on CPUx while mce_softirq() is handling another LMCE on CPUy.
+ * If both are modifying 'severity_cpu', they may interfere with each
+ * other. Therefore, it's better for mcheck_cmn_handler() and mce_softirq()
+ * to avoid accessing 'severity_cpu' when handling LMCE, unless other
+ * approaches are taken to avoid the above interference.
+ */
 static atomic_t severity_cpu = ATOMIC_INIT(-1);
+/*
+ * The following two global variables are used only in mcheck_cmn_handler().
+ * Because there can be at most one MCE (including LMCE) outstanding
+ * in the hardware platform, we don't need to worry about the above
+ * interference, where two code paths handing different MCE's access
+ * the same global variables.
+ */
 static atomic_t found_error = ATOMIC_INIT(0);
 static cpumask_t mce_fatal_cpus;
 
@@ -395,6 +424,7 @@  mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
         sp->errcnt = errcnt;
         sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
         sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
+        sp->lmce = (gstatus & MCG_STATUS_LMCE) != 0;
         sp->uc = uc;
         sp->pcc = pcc;
         sp->recoverable = recover;
@@ -458,6 +488,7 @@  void mcheck_cmn_handler(const struct cpu_user_regs *regs)
     uint64_t gstatus;
     mctelem_cookie_t mctc = NULL;
     struct mca_summary bs;
+    bool lmce;
 
     mce_spin_lock(&mce_logout_lock);
 
@@ -467,6 +498,10 @@  void mcheck_cmn_handler(const struct cpu_user_regs *regs)
     }
     mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);
 
+    lmce = bs.lmce;
+    if (!lmce)
+        this_cpu(nonlocal_mce_in_progress) = true;
+
     if (bs.errcnt) {
         /*
          * Uncorrected errors must be dealt with in softirq context.
@@ -488,8 +523,9 @@  void mcheck_cmn_handler(const struct cpu_user_regs *regs)
         }
         atomic_set(&found_error, 1);
 
-        /* The last CPU will be take check/clean-up etc */
-        atomic_set(&severity_cpu, smp_processor_id());
+        /* The last CPU will be take check/clean-up etc. */
+        if (!lmce)
+            atomic_set(&severity_cpu, smp_processor_id());
 
         mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
                 *((unsigned long*)clear_bank), smp_processor_id());
@@ -501,16 +537,16 @@  void mcheck_cmn_handler(const struct cpu_user_regs *regs)
     }
     mce_spin_unlock(&mce_logout_lock);
 
-    mce_barrier_enter(&mce_trap_bar);
+    mce_barrier_enter(&mce_trap_bar, lmce);
     if ( mctc != NULL && mce_urgent_action(regs, mctc))
         cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
-    mce_barrier_exit(&mce_trap_bar);
+    mce_barrier_exit(&mce_trap_bar, lmce);
 
     /*
      * Wait until everybody has processed the trap.
      */
-    mce_barrier_enter(&mce_trap_bar);
-    if (atomic_read(&severity_cpu) == smp_processor_id())
+    mce_barrier_enter(&mce_trap_bar, lmce);
+    if (lmce || atomic_read(&severity_cpu) == smp_processor_id())
     {
         /* According to SDM, if no error bank found on any cpus,
          * something unexpected happening, we can't do any
@@ -527,16 +563,16 @@  void mcheck_cmn_handler(const struct cpu_user_regs *regs)
         }
         atomic_set(&found_error, 0);
     }
-    mce_barrier_exit(&mce_trap_bar);
+    mce_barrier_exit(&mce_trap_bar, lmce);
 
     /* Clear flags after above fatal check */
-    mce_barrier_enter(&mce_trap_bar);
+    mce_barrier_enter(&mce_trap_bar, lmce);
     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
     if ((gstatus & MCG_STATUS_MCIP) != 0) {
         mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
         mca_wrmsr(MSR_IA32_MCG_STATUS, 0);
     }
-    mce_barrier_exit(&mce_trap_bar);
+    mce_barrier_exit(&mce_trap_bar, lmce);
 
     raise_softirq(MACHINE_CHECK_SOFTIRQ);
 }
@@ -1700,38 +1736,61 @@  static void mce_softirq(void)
 {
     int cpu = smp_processor_id();
     unsigned int workcpu;
+    /*
+     * On platforms supporting broadcasting MCE, if there is a
+     * non-local MCE waiting for process on this CPU, it implies other
+     * CPUs received the same MCE as well. Therefore, mce_softirq()
+     * will be launched on other CPUs as well and compete with this
+     * mce_softirq() to handle all MCE's on all CPUs. In that case, we
+     * should use MCE barriers to sync with other CPUs. Otherwise, we
+     * do not need to wait for other CPUs.
+     */
+    bool nowait = !this_cpu(nonlocal_mce_in_progress);
 
     mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
 
-    mce_barrier_enter(&mce_inside_bar);
+    mce_barrier_enter(&mce_inside_bar, nowait);
 
     /*
-     * Everybody is here. Now let's see who gets to do the
-     * recovery work. Right now we just see if there's a CPU
-     * that did not have any problems, and pick that one.
-     *
-     * First, just set a default value: the last CPU who reaches this
-     * will overwrite the value and become the default.
+     * When LMCE is being handled and no non-local MCE is waiting for
+     * process, mce_softirq() does not need to set 'severity_cpu'. And
+     * it should not, because the modification in this function may
+     * interfere with the simultaneous modification in mce_cmn_handler()
+     * on another CPU.
      */
-
-    atomic_set(&severity_cpu, cpu);
-
-    mce_barrier_enter(&mce_severity_bar);
-    if (!mctelem_has_deferred(cpu))
+    if (!nowait) {
+        /*
+         * Everybody is here. Now let's see who gets to do the
+         * recovery work. Right now we just see if there's a CPU
+         * that did not have any problems, and pick that one.
+         *
+         * First, just set a default value: the last CPU who reaches this
+         * will overwrite the value and become the default.
+         */
         atomic_set(&severity_cpu, cpu);
-    mce_barrier_exit(&mce_severity_bar);
 
-    /* We choose severity_cpu for further processing */
-    if (atomic_read(&severity_cpu) == cpu) {
+        mce_barrier_enter(&mce_severity_bar, nowait);
+        if (!mctelem_has_deferred(cpu))
+            atomic_set(&severity_cpu, cpu);
+        mce_barrier_exit(&mce_severity_bar, nowait);
+    }
+
+    /*
+     * Either no non-local MCE is being handled, or this CPU is chose as
+     * severity_cpu for further processing ...
+     */
+    if (nowait || atomic_read(&severity_cpu) == cpu) {
 
         mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
 
         /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
          * vMCE MSRs virtualization buffer
          */
-        for_each_online_cpu(workcpu) {
-            mctelem_process_deferred(workcpu, mce_delayed_action);
-        }
+        if (nowait)
+            mctelem_process_deferred(cpu, mce_delayed_action);
+        else
+            for_each_online_cpu(workcpu)
+                mctelem_process_deferred(workcpu, mce_delayed_action);
 
         /* Step2: Send Log to DOM0 through vIRQ */
         if (dom0_vmce_enabled()) {
@@ -1740,7 +1799,14 @@  static void mce_softirq(void)
         }
     }
 
-    mce_barrier_exit(&mce_inside_bar);
+    mce_barrier_exit(&mce_inside_bar, nowait);
+
+    /*
+     * At this point, either the only LMCE has been handled, or all MCE's
+     * on this CPU waiting for process have been handled on this CPU (if
+     * it was chose as severity_cpu) or by mce_softirq() on another CPU.
+     */
+    this_cpu(nonlocal_mce_in_progress) = false;
 }
 
 /* Machine Check owner judge algorithm:
diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
index 10e5ceb..3a9134e 100644
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -109,12 +109,14 @@  struct mca_summary {
     int         eipv;   /* meaningful on #MC */
     bool        uc;     /* UC flag */
     bool        pcc;    /* PCC flag */
+    bool        lmce;   /* LMCE flag (Intel only) */
     bool        recoverable; /* software error recoverable flag */
 };
 
 DECLARE_PER_CPU(struct mca_banks *, poll_bankmask);
 DECLARE_PER_CPU(struct mca_banks *, no_cmci_banks);
 DECLARE_PER_CPU(struct mca_banks *, mce_clear_banks);
+DECLARE_PER_CPU(bool, mce_in_process);
 
 extern bool cmci_support;
 extern bool is_mc_panic;
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
index 34d1921..de03f82 100644
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -42,7 +42,9 @@ 
 #define MCG_STATUS_RIPV         0x0000000000000001ULL
 #define MCG_STATUS_EIPV         0x0000000000000002ULL
 #define MCG_STATUS_MCIP         0x0000000000000004ULL
-/* Bits 3-63 are reserved */
+#define MCG_STATUS_LMCE         0x0000000000000008ULL  /* Intel specific */
+/* Bits 3-63 are reserved on CPU not supporting LMCE */
+/* Bits 4-63 are reserved on CPU supporting LMCE */
 
 /* Bitfield of MSR_K8_MCi_STATUS registers */
 /* MCA error code */