Message ID: 56AB4D1402000078000CC543@prv-mh.provo.novell.com
State: New, archived
On 29/01/16 10:29, Jan Beulich wrote:
> XRSTORS unconditionally faults when xcomp_bv has bit 63 clear. Instead
> of just fixing this issue, overhaul the fault recovery code, which -
> one of the many mistakes made when xstate support got introduced - was
> blindly mirroring that accompanying FXRSTOR, neglecting the fact that
> XRSTOR{,S} aren't all-or-nothing instructions. The new code, first of
> all, does all the recovery actions in C, simplifying the inline
> assembly used. And it does its work in a multi-stage fashion: Upon
> first seeing a fault, state fixups get applied strictly based on what
> architecturally may cause #GP. When seeing another fault despite the
> fixups done, state gets fully reset. A third fault would then lead to
> crashing the domain (instead of hanging the hypervisor in an infinite
> loop of recurring faults).
>
> Reported-by: Harmandeep Kaur <write.harmandeep@gmail.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/xstate.c
> +++ b/xen/arch/x86/xstate.c
> @@ -29,6 +29,8 @@ unsigned int *__read_mostly xstate_sizes
>  static unsigned int __read_mostly xstate_features;
>  static unsigned int __read_mostly xstate_comp_offsets[sizeof(xfeature_mask)*8];
>
> +static uint32_t __read_mostly mxcsr_mask = MXCSR_DEFAULT;

The default mxcsr_mask value is 0xFFBF, which is different to the
default mxcsr value.

> +
>  /* Cached xcr0 for fast read */
>  static DEFINE_PER_CPU(uint64_t, xcr0);
>
> @@ -342,6 +344,7 @@ void xrstor(struct vcpu *v, uint64_t mas
>      uint32_t hmask = mask >> 32;
>      uint32_t lmask = mask;
>      struct xsave_struct *ptr = v->arch.xsave_area;
> +    unsigned int faults, prev_faults;
>
>      /*
>       * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
> @@ -361,35 +364,85 @@ void xrstor(struct vcpu *v, uint64_t mas
>      /*
>       * XRSTOR can fault if passed a corrupted data block. We handle this
>       * possibility, which may occur if the block was passed to us by control
> -     * tools or through VCPUOP_initialise, by silently clearing the block.
> +     * tools or through VCPUOP_initialise, by silently adjusting state.
>       */
> -    switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
> +    for ( prev_faults = faults = 0; ; prev_faults = faults )
>      {
> +        switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) )
> +        {
>  #define XRSTOR(pfx) \
>      alternative_io("1: .byte " pfx "0x0f,0xae,0x2f\n" \
> +                   "3:\n" \
>                     " .section .fixup,\"ax\"\n" \
> -                   "2: mov %[size],%%ecx\n" \
> -                   " xor %[lmask_out],%[lmask_out]\n" \
> -                   " rep stosb\n" \
> -                   " lea %[mem],%[ptr]\n" \
> -                   " mov %[lmask_in],%[lmask_out]\n" \
> -                   " jmp 1b\n" \
> +                   "2: inc%z[faults] %[faults]\n" \
> +                   " jmp 3b\n" \
>                     " .previous\n" \
>                     _ASM_EXTABLE(1b, 2b), \
>                     ".byte " pfx "0x0f,0xc7,0x1f\n", \
>                     X86_FEATURE_XSAVES, \
> -                   ASM_OUTPUT2([ptr] "+&D" (ptr), [lmask_out] "+&a" (lmask)), \
> -                   [mem] "m" (*ptr), [lmask_in] "g" (lmask), \
> -                   [hmask] "d" (hmask), [size] "m" (xsave_cntxt_size) \
> -                   : "ecx")
> -
> -    default:
> -        XRSTOR("0x48,");
> -        break;
> -    case 4: case 2:
> -        XRSTOR("");
> -        break;
> +                   ASM_OUTPUT2([mem] "+m" (*ptr), [faults] "+g" (faults)), \
> +                   [lmask] "a" (lmask), [hmask] "d" (hmask), \
> +                   [ptr] "D" (ptr))
> +
> +        default:
> +            XRSTOR("0x48,");

Not relevant to this patch specifically, but this would be clearer if it
actually used the rex64 mnemonic.

> +            break;
> +        case 4: case 2:
> +            XRSTOR("");
> +            break;
>  #undef XRSTOR
> +        }
> +        if ( likely(faults == prev_faults) )
> +            break;
> +#ifndef NDEBUG
> +        gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
> +                faults, ptr->fpu_sse.mxcsr);
> +        gprintk(XENLOG_WARNING, "xs=%016lx xc=%016lx\n",
> +                ptr->xsave_hdr.xstate_bv, ptr->xsave_hdr.xcomp_bv);
> +        gprintk(XENLOG_WARNING, "r0=%016lx r1=%016lx\n",
> +                ptr->xsave_hdr.reserved[0], ptr->xsave_hdr.reserved[1]);
> +        gprintk(XENLOG_WARNING, "r2=%016lx r3=%016lx\n",
> +                ptr->xsave_hdr.reserved[2], ptr->xsave_hdr.reserved[3]);
> +        gprintk(XENLOG_WARNING, "r4=%016lx r5=%016lx\n",
> +                ptr->xsave_hdr.reserved[4], ptr->xsave_hdr.reserved[5]);
> +#endif
> +        switch ( faults )
> +        {
> +        case 1:
> +            /* Stage 1: Reset state to be loaded. */
> +            ptr->xsave_hdr.xstate_bv &= ~mask;
> +            /*
> +             * Also try to eliminate fault reasons, even if this shouldn't be
> +             * needed here (other code should ensure the sanity of the data).
> +             */
> +            if ( ((mask & XSTATE_SSE) ||
> +                  ((mask & XSTATE_YMM) &&
> +                   !(ptr->xsave_hdr.xcomp_bv & XSTATE_COMPACTION_ENABLED))) )
> +                ptr->fpu_sse.mxcsr &= mxcsr_mask;
> +            if ( cpu_has_xsaves || cpu_has_xsavec )
> +            {
> +                ptr->xsave_hdr.xcomp_bv &= this_cpu(xcr0) | this_cpu(xss);
> +                ptr->xsave_hdr.xstate_bv &= ptr->xsave_hdr.xcomp_bv;
> +                ptr->xsave_hdr.xcomp_bv |= XSTATE_COMPACTION_ENABLED;
> +            }
> +            else
> +            {
> +                ptr->xsave_hdr.xstate_bv &= this_cpu(xcr0);
> +                ptr->xsave_hdr.xcomp_bv = 0;
> +            }
> +            memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
> +            continue;

Newline here please.

> +        case 2:
> +            /* Stage 2: Reset all state. */
> +            ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
> +            ptr->xsave_hdr.xstate_bv = 0;
> +            ptr->xsave_hdr.xcomp_bv = cpu_has_xsaves
> +                                      ? XSTATE_COMPACTION_ENABLED : 0;
> +            continue;

And here.

> +        default:
> +            domain_crash(current->domain);
> +            break;

This breaks out of the switch statement, not out of the loop. It looks
like it will end in an infinite loop of calling domain_crash().

~Andrew

> +        }
>      }
>  }
>
> @@ -496,6 +549,8 @@ void xstate_init(struct cpuinfo_x86 *c)
>
>      if ( bsp )
>      {
> +        static typeof(current->arch.xsave_area->fpu_sse) __initdata ctxt;
> +
>          xfeature_mask = feature_mask;
>          /*
>           * xsave_cntxt_size is the max size required by enabled features.
> @@ -504,6 +559,10 @@ void xstate_init(struct cpuinfo_x86 *c)
>          xsave_cntxt_size = _xstate_ctxt_size(feature_mask);
>          printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
>                 __func__, xsave_cntxt_size, xfeature_mask);
> +
> +        asm ( "fxsave %0" : "=m" (ctxt) );
> +        if ( ctxt.mxcsr_mask )
> +            mxcsr_mask = ctxt.mxcsr_mask;
>      }
>      else
>      {
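[Andrew's last observation is a classic C control-flow pitfall worth spelling out: a "break" inside a switch arm only leaves the switch, so when the switch sits inside a for (;;) loop the loop body simply runs again. A tiny standalone demonstration, not Xen code — the iteration guard exists only so the demo terminates:]

```c
#include <stdio.h>

int main(void)
{
    unsigned int faults = 3, iterations = 0;

    for ( ; ; )
    {
        switch ( faults )
        {
        default:
            /* Mirrors the patch's "default: domain_crash(...); break;". */
            printf("would call domain_crash() (iteration %u)\n", ++iterations);
            break;              /* Exits the switch, NOT the loop. */
        }

        if ( iterations == 5 )  /* Guard so this demo terminates. */
        {
            puts("without this guard, the loop would spin forever");
            return 0;
        }
    }
}
```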
>>> On 29.01.16 at 17:48, <andrew.cooper3@citrix.com> wrote:
> On 29/01/16 10:29, Jan Beulich wrote:
>> --- a/xen/arch/x86/xstate.c
>> +++ b/xen/arch/x86/xstate.c
>> @@ -29,6 +29,8 @@ unsigned int *__read_mostly xstate_sizes
>>  static unsigned int __read_mostly xstate_features;
>>  static unsigned int __read_mostly xstate_comp_offsets[sizeof(xfeature_mask)*8];
>>
>> +static uint32_t __read_mostly mxcsr_mask = MXCSR_DEFAULT;
>
> The default mxcsr_mask value is 0xFFBF, which is different to the
> default mxcsr value.

Oh, right, of course. Clearly I have no old enough box in regular use
anymore to actually notice the difference.

>> -    case 4: case 2:
>> -        XRSTOR("");
>> -        break;
>> +                   ASM_OUTPUT2([mem] "+m" (*ptr), [faults] "+g" (faults)), \
>> +                   [lmask] "a" (lmask), [hmask] "d" (hmask), \
>> +                   [ptr] "D" (ptr))
>> +
>> +        default:
>> +            XRSTOR("0x48,");
>
> Not relevant to this patch specifically, but this would be clearer if it
> actually used the rex64 mnemonic.

Prefixing a .byte with rex64 is, well, odd?

>> +            break;
>> +        case 4: case 2:
>> +            XRSTOR("");
>> +            break;
>>  #undef XRSTOR
>> +        }
>> +        if ( likely(faults == prev_faults) )
>> +            break;
>> +#ifndef NDEBUG
>> +        gprintk(XENLOG_WARNING, "fault#%u: mxcsr=%08x\n",
>> +                faults, ptr->fpu_sse.mxcsr);
>> +        gprintk(XENLOG_WARNING, "xs=%016lx xc=%016lx\n",
>> +                ptr->xsave_hdr.xstate_bv, ptr->xsave_hdr.xcomp_bv);
>> +        gprintk(XENLOG_WARNING, "r0=%016lx r1=%016lx\n",
>> +                ptr->xsave_hdr.reserved[0], ptr->xsave_hdr.reserved[1]);
>> +        gprintk(XENLOG_WARNING, "r2=%016lx r3=%016lx\n",
>> +                ptr->xsave_hdr.reserved[2], ptr->xsave_hdr.reserved[3]);
>> +        gprintk(XENLOG_WARNING, "r4=%016lx r5=%016lx\n",
>> +                ptr->xsave_hdr.reserved[4], ptr->xsave_hdr.reserved[5]);
>> +#endif
>> +        switch ( faults )
>> +        {
>> +        case 1:
>> +            /* Stage 1: Reset state to be loaded. */
>> +            ptr->xsave_hdr.xstate_bv &= ~mask;
>> +            /*
>> +             * Also try to eliminate fault reasons, even if this shouldn't be
>> +             * needed here (other code should ensure the sanity of the data).
>> +             */
>> +            if ( ((mask & XSTATE_SSE) ||
>> +                  ((mask & XSTATE_YMM) &&
>> +                   !(ptr->xsave_hdr.xcomp_bv & XSTATE_COMPACTION_ENABLED))) )
>> +                ptr->fpu_sse.mxcsr &= mxcsr_mask;
>> +            if ( cpu_has_xsaves || cpu_has_xsavec )
>> +            {
>> +                ptr->xsave_hdr.xcomp_bv &= this_cpu(xcr0) | this_cpu(xss);
>> +                ptr->xsave_hdr.xstate_bv &= ptr->xsave_hdr.xcomp_bv;
>> +                ptr->xsave_hdr.xcomp_bv |= XSTATE_COMPACTION_ENABLED;
>> +            }
>> +            else
>> +            {
>> +                ptr->xsave_hdr.xstate_bv &= this_cpu(xcr0);
>> +                ptr->xsave_hdr.xcomp_bv = 0;
>> +            }
>> +            memset(ptr->xsave_hdr.reserved, 0, sizeof(ptr->xsave_hdr.reserved));
>> +            continue;
>
> Newline here please.

>> +        case 2:
>> +            /* Stage 2: Reset all state. */
>> +            ptr->fpu_sse.mxcsr = MXCSR_DEFAULT;
>> +            ptr->xsave_hdr.xstate_bv = 0;
>> +            ptr->xsave_hdr.xcomp_bv = cpu_has_xsaves
>> +                                      ? XSTATE_COMPACTION_ENABLED : 0;
>> +            continue;
>
> And here.

>> +        default:
>> +            domain_crash(current->domain);
>> +            break;
>
> This breaks out of the switch statement, not out of the loop. It looks
> like it will end in an infinite loop of calling domain_crash().

Oh, indeed, I forgot that: After all, the whole point of the "continue"s
was for there to be a break after the switch()'s end.

Jan
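[For illustration, a minimal standalone sketch of the loop structure Jan describes — an assumed shape, not the actual follow-up patch: each "continue" retries the restore after a fixup stage, and a break placed after the switch's end is what finally leaves the loop once domain_crash() has been called. try_xrstor(), fix_header() and reset_all() are hypothetical stand-ins for the inline assembly and Xen internals:]

```c
#include <stdio.h>

static void try_xrstor(unsigned int *faults)
{
    static unsigned int attempts;
    if ( ++attempts <= 3 )      /* Pretend every attempt here faults. */
        ++*faults;              /* What the .fixup stub does on #GP. */
}

static void fix_header(void) { puts("stage 1: sanitise header fields"); }
static void reset_all(void)  { puts("stage 2: reset all state"); }

int main(void)
{
    unsigned int faults, prev_faults;

    for ( prev_faults = faults = 0; ; prev_faults = faults )
    {
        try_xrstor(&faults);
        if ( faults == prev_faults )
            break;              /* No new fault: the restore succeeded. */

        switch ( faults )
        {
        case 1:
            fix_header();
            continue;           /* Retry the restore. */

        case 2:
            reset_all();
            continue;           /* Retry once more from clean state. */

        default:
            puts("stage 3: domain_crash()");
            break;              /* Leaves only the switch... */
        }
        break;                  /* ...so this break exits the loop afterwards. */
    }
    return 0;
}
```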