diff mbox

[3/4] x86/xstate: fix fault behavior on XRSTORS

Message ID 56AB4D1402000078000CC543@prv-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Beulich Jan. 29, 2016, 10:29 a.m. UTC
XRSTORS unconditionally faults when xcomp_bv has bit 63 clear. Instead
of just fixing this issue, overhaul the fault recovery code, which -
one of the many mistakes made when xstate support got introduced - was
blindly mirroring that accompanying FXRSTOR, neglecting the fact that
XRSTOR{,S} aren't all-or-nothing instructions. The new code, first of
all, does all the recovery actions in C, simplifying the inline
assembly used. And it does its work in a multi-stage fashion: Upon
first seeing a fault, state fixups get applied strictly based on what
architecturally may cause #GP. When seeing another fault despite the
fixups done, state gets fully reset. A third fault would then lead to
crashing the domain (instead of hanging the hypervisor in an infinite
loop of recurring faults).

Reported-by: Harmandeep Kaur <write.harmandeep@gmail.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
x86/xstate: fix fault behavior on XRSTORS

XRSTORS unconditionally faults when xcomp_bv has bit 63 clear. Instead
of just fixing this issue, overhaul the fault recovery code, which -
one of the many mistakes made when xstate support got introduced - was
blindly mirroring that accompanying FXRSTOR, neglecting the fact that
XRSTOR{,S} aren't all-or-nothing instructions. The new code, first of
all, does all the recovery actions in C, simplifying the inline
assembly used. And it does its work in a multi-stage fashion: Upon
first seeing a fault, state fixups get applied strictly based on what
architecturally may cause #GP. When seeing another fault despite the
fixups done, state gets fully reset. A third fault would then lead to
crashing the domain (instead of hanging the hypervisor in an infinite
loop of recurring faults).

Reported-by: Harmandeep Kaur <write.harmandeep@gmail.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -29,6 +29,8 @@ unsigned int *__read_mostly xstate_sizes
 static unsigned int __read_mostly xstate_features;
 static unsigned int __read_mostly xstate_comp_offsets[sizeof(xfeature_mask)*8];
 
+static uint32_t __read_mostly mxcsr_mask = MXCSR_DEFAULT;
+
 /* Cached xcr0 for fast read */
 static DEFINE_PER_CPU(uint64_t, xcr0);
 
@@ -342,6 +344,7 @@ void xrstor(struct vcpu *v, uint64_t mas
     uint32_t hmask = mask >> 32;
     uint32_t lmask = mask;
     struct xsave_struct *ptr = v->arch.xsave_area;
+    unsigned int faults, prev_faults;
 
     /*
      * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
@@ -361,35 +364,85 @@ void xrstor(struct vcpu *v, uint64_t mas
     /*
      * XRSTOR can fault if passed a corrupted data block. We handle this
      * possibility, which may occur if the block was passed to us by control
-     * tools or through VCPUOP_initialise, by silently clearing the block.
+     * tools or through VCPUOP_initialise, by silently adjusting state.
      */
-    switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
+    for ( prev_faults = faults = 0; ; prev_faults = faults )
     {
+        switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
+        {
 #define XRSTOR(pfx) \
         alternative_io("1: .byte " pfx "0x0f,0xae,0x2f\n" \
+                       "3:\n" \
                        "   .section .fixup,\"ax\"\n" \
-                       "2: mov %[size],%%ecx\n" \
-                       "   xor %[lmask_out],%[lmask_out]\n" \
-                       "   rep stosb\n" \
-                       "   lea %[mem],%[ptr]\n" \
-                       "   mov %[lmask_in],%[lmask_out]\n" \
-                       "   jmp 1b\n" \
+                       "2: inc%z[faults] %[faults]\n" \
+                       "   jmp 3b\n" \
                        "   .previous\n" \
                        _ASM_EXTABLE(1b, 2b), \
                        ".byte " pfx "0x0f,0xc7,0x1f\n", \
                        X86_FEATURE_XSAVES, \
-                       ASM_OUTPUT2([ptr] "+&D" (ptr), [lmask_out] "+&a" (lmask)), \
-                       [mem] "m" (*ptr), [lmask_in] "g" (lmask), \
-                       [hmask] "d" (hmask), [size] "m" (xsave_cntxt_size) \
-                       : "ecx")
-
-    default:
-        XRSTOR("0x48,");
-        break;
-    case 4: case 2:
-        XRSTOR("");
-        break;
+                       ASM_OUTPUT2([mem] "+m" (*ptr), [faults] "+g" (faults)), \
+                       [lmask] "a" (lmask), [hmask] "d" (hmask), \
+                       [ptr] "D" (ptr))
+
+        default:
+            XRSTOR("0x48,");
+            break;
+        case 4: case 2:
+            XRSTOR("");
+            break;
 #undef XRSTOR
+        }
+        if ( likely(faults == prev_faults) )
+            break;
+#ifndef NDEBUG
+        gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
+                faults, ptr->fpu_sse.mxcsr);
+        gprintk(XENLOG_WARNING, "xs=%016lx xc=%016lx\n",
+                ptr->xsave_hdr.xstate_bv, ptr->xsave_hdr.xcomp_bv);
+        gprintk(XENLOG_WARNING, "r0=%016lx r1=%016lx\n",
+                ptr->xsave_hdr.reserved[0], ptr->xsave_hdr.reserved[1]);
+        gprintk(XENLOG_WARNING, "r2=%016lx r3=%016lx\n",
+                ptr->xsave_hdr.reserved[2], ptr->xsave_hdr.reserved[3]);
+        gprintk(XENLOG_WARNING, "r4=%016lx r5=%016lx\n",
+                ptr->xsave_hdr.reserved[4], ptr->xsave_hdr.reserved[5]);
+#endif
+        switch ( faults )
+        {
+        case 1:
+            /* Stage 1: Reset state to be loaded. */
+            ptr->xsave_hdr.xstate_bv &= ~mask;
+            /*
+             * Also try to eliminate fault reasons, even if this shouldn't be
+             * needed here (other code should ensure the sanity of the data).
+             */
+            if ( ((mask & XSTATE_SSE) ||
+                  ((mask & XSTATE_YMM) &&
+                   !(ptr->xsave_hdr.xcomp_bv & XSTATE_COMPACTION_ENABLED))) )
+                ptr->fpu_sse.mxcsr &= mxcsr_mask;
+            if ( cpu_has_xsaves || cpu_has_xsavec )
+            {
+                ptr->xsave_hdr.xcomp_bv &= this_cpu(xcr0) | this_cpu(xss);
+                ptr->xsave_hdr.xstate_bv &= ptr->xsave_hdr.xcomp_bv;
+                ptr->xsave_hdr.xcomp_bv |= XSTATE_COMPACTION_ENABLED;
+            }
+            else
+            {
+                ptr->xsave_hdr.xstate_bv &= this_cpu(xcr0);
+                ptr->xsave_hdr.xcomp_bv = 0;
+            }
+            memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
+            continue;
+        case 2:
+            /* Stage 2: Reset all state. */
+            ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
+            ptr->xsave_hdr.xstate_bv = 0;
+            ptr->xsave_hdr.xcomp_bv = cpu_has_xsaves
+                                      ? XSTATE_COMPACTION_ENABLED : 0;
+            continue;
+        default:
+            domain_crash(current->domain);
+            break;
+        }
     }
 }
 
@@ -496,6 +549,8 @@ void xstate_init(struct cpuinfo_x86 *c)
 
     if ( bsp )
     {
+        static typeof(current->arch.xsave_area->fpu_sse) __initdata ctxt;
+
         xfeature_mask = feature_mask;
         /*
          * xsave_cntxt_size is the max size required by enabled features.
@@ -504,6 +559,10 @@ void xstate_init(struct cpuinfo_x86 *c)
         xsave_cntxt_size = _xstate_ctxt_size(feature_mask);
         printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
             __func__, xsave_cntxt_size, xfeature_mask);
+
+        asm ( "fxsave %0" : "=m" (ctxt) );
+        if ( ctxt.mxcsr_mask )
+            mxcsr_mask = ctxt.mxcsr_mask;
     }
     else
     {

Comments

Andrew Cooper Jan. 29, 2016, 4:48 p.m. UTC | #1
On 29/01/16 10:29, Jan Beulich wrote:
> XRSTORS unconditionally faults when xcomp_bv has bit 63 clear. Instead
> of just fixing this issue, overhaul the fault recovery code, which -
> one of the many mistakes made when xstate support got introduced - was
> blindly mirroring that accompanying FXRSTOR, neglecting the fact that
> XRSTOR{,S} aren't all-or-nothing instructions. The new code, first of
> all, does all the recovery actions in C, simplifying the inline
> assembly used. And it does its work in a multi-stage fashion: Upon
> first seeing a fault, state fixups get applied strictly based on what
> architecturally may cause #GP. When seeing another fault despite the
> fixups done, state gets fully reset. A third fault would then lead to
> crashing the domain (instead of hanging the hypervisor in an infinite
> loop of recurring faults).
>
> Reported-by: Harmandeep Kaur <write.harmandeep@gmail.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/xstate.c
> +++ b/xen/arch/x86/xstate.c
> @@ -29,6 +29,8 @@ unsigned int *__read_mostly xstate_sizes
>  static unsigned int __read_mostly xstate_features;
>  static unsigned int __read_mostly xstate_comp_offsets[sizeof(xfeature_mask)*8];
>  
> +static uint32_t __read_mostly mxcsr_mask = MXCSR_DEFAULT;

The default mxcsr_mask value is 0xFFBF, which is different to the
default mxcsr value.


> +
>  /* Cached xcr0 for fast read */
>  static DEFINE_PER_CPU(uint64_t, xcr0);
>  
> @@ -342,6 +344,7 @@ void xrstor(struct vcpu *v, uint64_t mas
>      uint32_t hmask = mask >> 32;
>      uint32_t lmask = mask;
>      struct xsave_struct *ptr = v->arch.xsave_area;
> +    unsigned int faults, prev_faults;
>  
>      /*
>       * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
> @@ -361,35 +364,85 @@ void xrstor(struct vcpu *v, uint64_t mas
>      /*
>       * XRSTOR can fault if passed a corrupted data block. We handle this
>       * possibility, which may occur if the block was passed to us by control
> -     * tools or through VCPUOP_initialise, by silently clearing the block.
> +     * tools or through VCPUOP_initialise, by silently adjusting state.
>       */
> -    switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
> +    for ( prev_faults = faults = 0; ; prev_faults = faults )
>      {
> +        switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
> +        {
>  #define XRSTOR(pfx) \
>          alternative_io("1: .byte " pfx "0x0f,0xae,0x2f\n" \
> +                       "3:\n" \
>                         "   .section .fixup,\"ax\"\n" \
> -                       "2: mov %[size],%%ecx\n" \
> -                       "   xor %[lmask_out],%[lmask_out]\n" \
> -                       "   rep stosb\n" \
> -                       "   lea %[mem],%[ptr]\n" \
> -                       "   mov %[lmask_in],%[lmask_out]\n" \
> -                       "   jmp 1b\n" \
> +                       "2: inc%z[faults] %[faults]\n" \
> +                       "   jmp 3b\n" \
>                         "   .previous\n" \
>                         _ASM_EXTABLE(1b, 2b), \
>                         ".byte " pfx "0x0f,0xc7,0x1f\n", \
>                         X86_FEATURE_XSAVES, \
> -                       ASM_OUTPUT2([ptr] "+&D" (ptr), [lmask_out] "+&a" (lmask)), \
> -                       [mem] "m" (*ptr), [lmask_in] "g" (lmask), \
> -                       [hmask] "d" (hmask), [size] "m" (xsave_cntxt_size) \
> -                       : "ecx")
> -
> -    default:
> -        XRSTOR("0x48,");
> -        break;
> -    case 4: case 2:
> -        XRSTOR("");
> -        break;
> +                       ASM_OUTPUT2([mem] "+m" (*ptr), [faults] "+g" (faults)), \
> +                       [lmask] "a" (lmask), [hmask] "d" (hmask), \
> +                       [ptr] "D" (ptr))
> +
> +        default:
> +            XRSTOR("0x48,");

Not relevant to this patch specifically, but this would be clearer if it
actually used the rex64 mnemonic

> +            break;
> +        case 4: case 2:
> +            XRSTOR("");
> +            break;
>  #undef XRSTOR
> +        }
> +        if ( likely(faults == prev_faults) )
> +            break;
> +#ifndef NDEBUG
> +        gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
> +                faults, ptr->fpu_sse.mxcsr);
> +        gprintk(XENLOG_WARNING, "xs=%016lx xc=%016lx\n",
> +                ptr->xsave_hdr.xstate_bv, ptr->xsave_hdr.xcomp_bv);
> +        gprintk(XENLOG_WARNING, "r0=%016lx r1=%016lx\n",
> +                ptr->xsave_hdr.reserved[0], ptr->xsave_hdr.reserved[1]);
> +        gprintk(XENLOG_WARNING, "r2=%016lx r3=%016lx\n",
> +                ptr->xsave_hdr.reserved[2], ptr->xsave_hdr.reserved[3]);
> +        gprintk(XENLOG_WARNING, "r4=%016lx r5=%016lx\n",
> +                ptr->xsave_hdr.reserved[4], ptr->xsave_hdr.reserved[5]);
> +#endif
> +        switch ( faults )
> +        {
> +        case 1:
> +            /* Stage 1: Reset state to be loaded. */
> +            ptr->xsave_hdr.xstate_bv &= ~mask;
> +            /*
> +             * Also try to eliminate fault reasons, even if this shouldn't be
> +             * needed here (other code should ensure the sanity of the data).
> +             */
> +            if ( ((mask & XSTATE_SSE) ||
> +                  ((mask & XSTATE_YMM) &&
> +                   !(ptr->xsave_hdr.xcomp_bv & XSTATE_COMPACTION_ENABLED))) )
> +                ptr->fpu_sse.mxcsr &= mxcsr_mask;
> +            if ( cpu_has_xsaves || cpu_has_xsavec )
> +            {
> +                ptr->xsave_hdr.xcomp_bv &= this_cpu(xcr0) | this_cpu(xss);
> +                ptr->xsave_hdr.xstate_bv &= ptr->xsave_hdr.xcomp_bv;
> +                ptr->xsave_hdr.xcomp_bv |= XSTATE_COMPACTION_ENABLED;
> +            }
> +            else
> +            {
> +                ptr->xsave_hdr.xstate_bv &= this_cpu(xcr0);
> +                ptr->xsave_hdr.xcomp_bv = 0;
> +            }
> +            memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
> +            continue;

Newline here please.

> +        case 2:
> +            /* Stage 2: Reset all state. */
> +            ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
> +            ptr->xsave_hdr.xstate_bv = 0;
> +            ptr->xsave_hdr.xcomp_bv = cpu_has_xsaves
> +                                      ? XSTATE_COMPACTION_ENABLED : 0;
> +            continue;

And here.

> +        default:
> +            domain_crash(current->domain);
> +            break;

This breaks out of the switch statement, not out of the loop.  It looks
like it will end in an infinite loop of calling domain_crash().

~Andrew

> +        }
>      }
>  }
>  
> @@ -496,6 +549,8 @@ void xstate_init(struct cpuinfo_x86 *c)
>  
>      if ( bsp )
>      {
> +        static typeof(current->arch.xsave_area->fpu_sse) __initdata ctxt;
> +
>          xfeature_mask = feature_mask;
>          /*
>           * xsave_cntxt_size is the max size required by enabled features.
> @@ -504,6 +559,10 @@ void xstate_init(struct cpuinfo_x86 *c)
>          xsave_cntxt_size = _xstate_ctxt_size(feature_mask);
>          printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
>              __func__, xsave_cntxt_size, xfeature_mask);
> +
> +        asm ( "fxsave %0" : "=m" (ctxt) );
> +        if ( ctxt.mxcsr_mask )
> +            mxcsr_mask = ctxt.mxcsr_mask;
>      }
>      else
>      {
>
>
Jan Beulich Jan. 29, 2016, 5 p.m. UTC | #2
>>> On 29.01.16 at 17:48, <andrew.cooper3@citrix.com> wrote:
> On 29/01/16 10:29, Jan Beulich wrote:
>> --- a/xen/arch/x86/xstate.c
>> +++ b/xen/arch/x86/xstate.c
>> @@ -29,6 +29,8 @@ unsigned int *__read_mostly xstate_sizes
>>  static unsigned int __read_mostly xstate_features;
>>  static unsigned int __read_mostly xstate_comp_offsets[sizeof(xfeature_mask)*8];
>>  
>> +static uint32_t __read_mostly mxcsr_mask = MXCSR_DEFAULT;
> 
> The default mxcsr_mask value is 0xFFBF, which is different to the
> default mxcsr value.

Oh, right, of course. Clearly I have no old enough box in regular
use anymore to actually notice the difference.

>> -    case 4: case 2:
>> -        XRSTOR("");
>> -        break;
>> +                       ASM_OUTPUT2([mem] "+m" (*ptr), [faults] "+g" (faults)), \
>> +                       [lmask] "a" (lmask), [hmask] "d" (hmask), \
>> +                       [ptr] "D" (ptr))
>> +
>> +        default:
>> +            XRSTOR("0x48,");
> 
> Not relevant to this patch specifically, but this would be clearer if it
> actually used the rex64 mnemonic

Prefixing a .byte with rex64 is, well, odd?

>> +            break;
>> +        case 4: case 2:
>> +            XRSTOR("");
>> +            break;
>>  #undef XRSTOR
>> +        }
>> +        if ( likely(faults == prev_faults) )
>> +            break;
>> +#ifndef NDEBUG
>> +        gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
>> +                faults, ptr->fpu_sse.mxcsr);
>> +        gprintk(XENLOG_WARNING, "xs=%016lx xc=%016lx\n",
>> +                ptr->xsave_hdr.xstate_bv, ptr->xsave_hdr.xcomp_bv);
>> +        gprintk(XENLOG_WARNING, "r0=%016lx r1=%016lx\n",
>> +                ptr->xsave_hdr.reserved[0], ptr->xsave_hdr.reserved[1]);
>> +        gprintk(XENLOG_WARNING, "r2=%016lx r3=%016lx\n",
>> +                ptr->xsave_hdr.reserved[2], ptr->xsave_hdr.reserved[3]);
>> +        gprintk(XENLOG_WARNING, "r4=%016lx r5=%016lx\n",
>> +                ptr->xsave_hdr.reserved[4], ptr->xsave_hdr.reserved[5]);
>> +#endif
>> +        switch ( faults )
>> +        {
>> +        case 1:
>> +            /* Stage 1: Reset state to be loaded. */
>> +            ptr->xsave_hdr.xstate_bv &= ~mask;
>> +            /*
>> +             * Also try to eliminate fault reasons, even if this shouldn't be
>> +             * needed here (other code should ensure the sanity of the data).
>> +             */
>> +            if ( ((mask & XSTATE_SSE) ||
>> +                  ((mask & XSTATE_YMM) &&
>> +                   !(ptr->xsave_hdr.xcomp_bv & XSTATE_COMPACTION_ENABLED))) )
>> +                ptr->fpu_sse.mxcsr &= mxcsr_mask;
>> +            if ( cpu_has_xsaves || cpu_has_xsavec )
>> +            {
>> +                ptr->xsave_hdr.xcomp_bv &= this_cpu(xcr0) | this_cpu(xss);
>> +                ptr->xsave_hdr.xstate_bv &= ptr->xsave_hdr.xcomp_bv;
>> +                ptr->xsave_hdr.xcomp_bv |= XSTATE_COMPACTION_ENABLED;
>> +            }
>> +            else
>> +            {
>> +                ptr->xsave_hdr.xstate_bv &= this_cpu(xcr0);
>> +                ptr->xsave_hdr.xcomp_bv = 0;
>> +            }
>> +            memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
>> +            continue;
> 
> Newline here please.
> 
>> +        case 2:
>> +            /* Stage 2: Reset all state. */
>> +            ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>> +            ptr->xsave_hdr.xstate_bv = 0;
>> +            ptr->xsave_hdr.xcomp_bv = cpu_has_xsaves
>> +                                      ? XSTATE_COMPACTION_ENABLED : 0;
>> +            continue;
> 
> And here.
> 
>> +        default:
>> +            domain_crash(current->domain);
>> +            break;
> 
> This breaks out of the switch statement, not out of the loop.  It looks
> like it will end in an infinite loop of calling domain_crash().

Oh, indeed, I forgot that: After all, the whole point of the
"continue"s was for there to be a break after the switch()'s end.

Jan
diff mbox

Patch

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -29,6 +29,8 @@  unsigned int *__read_mostly xstate_sizes
 static unsigned int __read_mostly xstate_features;
 static unsigned int __read_mostly xstate_comp_offsets[sizeof(xfeature_mask)*8];
 
+static uint32_t __read_mostly mxcsr_mask = MXCSR_DEFAULT;
+
 /* Cached xcr0 for fast read */
 static DEFINE_PER_CPU(uint64_t, xcr0);
 
@@ -342,6 +344,7 @@  void xrstor(struct vcpu *v, uint64_t mas
     uint32_t hmask = mask >> 32;
     uint32_t lmask = mask;
     struct xsave_struct *ptr = v->arch.xsave_area;
+    unsigned int faults, prev_faults;
 
     /*
      * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
@@ -361,35 +364,85 @@  void xrstor(struct vcpu *v, uint64_t mas
     /*
      * XRSTOR can fault if passed a corrupted data block. We handle this
      * possibility, which may occur if the block was passed to us by control
-     * tools or through VCPUOP_initialise, by silently clearing the block.
+     * tools or through VCPUOP_initialise, by silently adjusting state.
      */
-    switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
+    for ( prev_faults = faults = 0; ; prev_faults = faults )
     {
+        switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
+        {
 #define XRSTOR(pfx) \
         alternative_io("1: .byte " pfx "0x0f,0xae,0x2f\n" \
+                       "3:\n" \
                        "   .section .fixup,\"ax\"\n" \
-                       "2: mov %[size],%%ecx\n" \
-                       "   xor %[lmask_out],%[lmask_out]\n" \
-                       "   rep stosb\n" \
-                       "   lea %[mem],%[ptr]\n" \
-                       "   mov %[lmask_in],%[lmask_out]\n" \
-                       "   jmp 1b\n" \
+                       "2: inc%z[faults] %[faults]\n" \
+                       "   jmp 3b\n" \
                        "   .previous\n" \
                        _ASM_EXTABLE(1b, 2b), \
                        ".byte " pfx "0x0f,0xc7,0x1f\n", \
                        X86_FEATURE_XSAVES, \
-                       ASM_OUTPUT2([ptr] "+&D" (ptr), [lmask_out] "+&a" (lmask)), \
-                       [mem] "m" (*ptr), [lmask_in] "g" (lmask), \
-                       [hmask] "d" (hmask), [size] "m" (xsave_cntxt_size) \
-                       : "ecx")
-
-    default:
-        XRSTOR("0x48,");
-        break;
-    case 4: case 2:
-        XRSTOR("");
-        break;
+                       ASM_OUTPUT2([mem] "+m" (*ptr), [faults] "+g" (faults)), \
+                       [lmask] "a" (lmask), [hmask] "d" (hmask), \
+                       [ptr] "D" (ptr))
+
+        default:
+            XRSTOR("0x48,");
+            break;
+        case 4: case 2:
+            XRSTOR("");
+            break;
 #undef XRSTOR
+        }
+        if ( likely(faults == prev_faults) )
+            break;
+#ifndef NDEBUG
+        gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
+                faults, ptr->fpu_sse.mxcsr);
+        gprintk(XENLOG_WARNING, "xs=%016lx xc=%016lx\n",
+                ptr->xsave_hdr.xstate_bv, ptr->xsave_hdr.xcomp_bv);
+        gprintk(XENLOG_WARNING, "r0=%016lx r1=%016lx\n",
+                ptr->xsave_hdr.reserved[0], ptr->xsave_hdr.reserved[1]);
+        gprintk(XENLOG_WARNING, "r2=%016lx r3=%016lx\n",
+                ptr->xsave_hdr.reserved[2], ptr->xsave_hdr.reserved[3]);
+        gprintk(XENLOG_WARNING, "r4=%016lx r5=%016lx\n",
+                ptr->xsave_hdr.reserved[4], ptr->xsave_hdr.reserved[5]);
+#endif
+        switch ( faults )
+        {
+        case 1:
+            /* Stage 1: Reset state to be loaded. */
+            ptr->xsave_hdr.xstate_bv &= ~mask;
+            /*
+             * Also try to eliminate fault reasons, even if this shouldn't be
+             * needed here (other code should ensure the sanity of the data).
+             */
+            if ( ((mask & XSTATE_SSE) ||
+                  ((mask & XSTATE_YMM) &&
+                   !(ptr->xsave_hdr.xcomp_bv & XSTATE_COMPACTION_ENABLED))) )
+                ptr->fpu_sse.mxcsr &= mxcsr_mask;
+            if ( cpu_has_xsaves || cpu_has_xsavec )
+            {
+                ptr->xsave_hdr.xcomp_bv &= this_cpu(xcr0) | this_cpu(xss);
+                ptr->xsave_hdr.xstate_bv &= ptr->xsave_hdr.xcomp_bv;
+                ptr->xsave_hdr.xcomp_bv |= XSTATE_COMPACTION_ENABLED;
+            }
+            else
+            {
+                ptr->xsave_hdr.xstate_bv &= this_cpu(xcr0);
+                ptr->xsave_hdr.xcomp_bv = 0;
+            }
+            memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
+            continue;
+        case 2:
+            /* Stage 2: Reset all state. */
+            ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
+            ptr->xsave_hdr.xstate_bv = 0;
+            ptr->xsave_hdr.xcomp_bv = cpu_has_xsaves
+                                      ? XSTATE_COMPACTION_ENABLED : 0;
+            continue;
+        default:
+            domain_crash(current->domain);
+            break;
+        }
     }
 }
 
@@ -496,6 +549,8 @@  void xstate_init(struct cpuinfo_x86 *c)
 
     if ( bsp )
     {
+        static typeof(current->arch.xsave_area->fpu_sse) __initdata ctxt;
+
         xfeature_mask = feature_mask;
         /*
          * xsave_cntxt_size is the max size required by enabled features.
@@ -504,6 +559,10 @@  void xstate_init(struct cpuinfo_x86 *c)
         xsave_cntxt_size = _xstate_ctxt_size(feature_mask);
         printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
             __func__, xsave_cntxt_size, xfeature_mask);
+
+        asm ( "fxsave %0" : "=m" (ctxt) );
+        if ( ctxt.mxcsr_mask )
+            mxcsr_mask = ctxt.mxcsr_mask;
     }
     else
     {