From patchwork Mon Apr 27 11:15:50 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jan Beulich X-Patchwork-Id: 11511885 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 41BC51392 for ; Mon, 27 Apr 2020 11:16:38 +0000 (UTC) Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.kernel.org (Postfix) with ESMTPS id 2836E20575 for ; Mon, 27 Apr 2020 11:16:38 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 2836E20575 Authentication-Results: mail.kernel.org; dmarc=none (p=none dis=none) header.from=suse.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=xen-devel-bounces@lists.xenproject.org Received: from localhost ([127.0.0.1] helo=lists.xenproject.org) by lists.xenproject.org with esmtp (Exim 4.92) (envelope-from ) id 1jT1jv-0006Fw-2r; Mon, 27 Apr 2020 11:15:55 +0000 Received: from all-amaz-eas1.inumbo.com ([34.197.232.57] helo=us1-amaz-eas2.inumbo.com) by lists.xenproject.org with esmtp (Exim 4.92) (envelope-from ) id 1jT1ju-0006Fh-17 for xen-devel@lists.xenproject.org; Mon, 27 Apr 2020 11:15:54 +0000 X-Inumbo-ID: 7496dcd1-8878-11ea-9763-12813bfff9fa Received: from mx2.suse.de (unknown [195.135.220.15]) by us1-amaz-eas2.inumbo.com (Halon) with ESMTPS id 7496dcd1-8878-11ea-9763-12813bfff9fa; Mon, 27 Apr 2020 11:15:52 +0000 (UTC) X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.220.254]) by mx2.suse.de (Postfix) with ESMTP id 0619DAD6B; Mon, 27 Apr 2020 11:15:50 +0000 (UTC) Subject: [PATCH v7 08/11] x86emul: support FXSAVE/FXRSTOR From: Jan Beulich To: "xen-devel@lists.xenproject.org" References: Message-ID: <47c22359-cd7e-22d8-f74b-c45d8dc47b5d@suse.com> Date: Mon, 27 Apr 2020 13:15:50 +0200 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Thunderbird/68.7.0 MIME-Version: 1.0 In-Reply-To: Content-Language: en-US X-BeenThere: xen-devel@lists.xenproject.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Xen developer discussion List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Cc: Andrew Cooper , Wei Liu , Roger Pau Monne Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" Note that FPU selector handling as well as MXCSR mask saving for now does not honor differences between host and guest visible featuresets. Since operation of the insns on CR4.OSFXSR is implementation dependent, the easiest solution is used: Simply don't look at the bit in the first place. Signed-off-by: Jan Beulich --- v7: New. --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -767,6 +767,12 @@ static void zap_fpsel(unsigned int *env, } } +static void zap_xfpsel(unsigned int *env) +{ + env[3] &= ~0xffff; + env[5] &= ~0xffff; +} + #ifdef __x86_64__ # define STKVAL_DISP 64 static const struct { @@ -2517,6 +2523,91 @@ int main(int argc, char **argv) else printf("skipped\n"); + printf("%-40s", "Testing fxsave 4(%ecx)..."); + if ( stack_exec && cpu_has_fxsr ) + { + const uint16_t nine = 9; + + memset(res + 0x80, 0xcc, 0x400); + if ( cpu_has_sse2 ) + asm volatile ( "pcmpeqd %xmm7, %xmm7\n\t" + "pxor %xmm6, %xmm6\n\t" + "psubw %xmm7, %xmm6" ); + asm volatile ( "fninit\n\t" + "fld1\n\t" + "fidivs %1\n\t" + "fxsave %0" + : "=m" (res[0x100]) : "m" (nine) : "memory" ); + zap_xfpsel(&res[0x100]); + instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x41; instr[3] = 0x04; + regs.eip = (unsigned long)&instr[0]; + regs.ecx = (unsigned long)(res + 0x7f); + memset(res + 0x100 + 0x74, 0x33, 0x30); + memset(res + 0x80 + 0x74, 0x33, 0x30); + rc = x86_emulate(&ctxt, &emulops); + zap_xfpsel(&res[0x80]); + if ( (rc != X86EMUL_OKAY) || + memcmp(res + 0x80, res + 0x100, 0x200) || + (regs.eip != (unsigned long)&instr[4]) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing fxrstor -4(%ecx)..."); + if ( stack_exec && cpu_has_fxsr ) + { + const uint16_t eleven = 11; + + memset(res + 0x80, 0xcc, 0x400); + asm volatile ( "fxsave %0" : "=m" (res[0x80]) :: "memory" ); + zap_xfpsel(&res[0x80]); + if ( cpu_has_sse2 ) + asm volatile ( "pxor %xmm7, %xmm6\n\t" + "pxor %xmm7, %xmm3\n\t" + "pxor %xmm7, %xmm0\n\t" + "pxor %xmm7, %xmm7" ); + asm volatile ( "fninit\n\t" + "fld1\n\t" + "fidivs %0\n\t" + :: "m" (eleven) ); + instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x49; instr[3] = 0xfc; + regs.eip = (unsigned long)&instr[0]; + regs.ecx = (unsigned long)(res + 0x81); + rc = x86_emulate(&ctxt, &emulops); + asm volatile ( "fxsave %0" : "=m" (res[0x100]) :: "memory" ); + if ( (rc != X86EMUL_OKAY) || + memcmp(res + 0x100, res + 0x80, 0x200) || + (regs.eip != (unsigned long)&instr[4]) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + +#ifdef __x86_64__ + printf("%-40s", "Testing fxsaveq 8(%edx)..."); + if ( stack_exec && cpu_has_fxsr ) + { + memset(res + 0x80, 0xcc, 0x400); + asm volatile ( "fxsaveq %0" : "=m" (res[0x100]) :: "memory" ); + instr[0] = 0x48; instr[1] = 0x0f; instr[2] = 0xae; instr[3] = 0x42; instr[4] = 0x08; + regs.eip = (unsigned long)&instr[0]; + regs.edx = (unsigned long)(res + 0x7e); + memset(res + 0x100 + 0x74, 0x33, 0x30); + memset(res + 0x80 + 0x74, 0x33, 0x30); + rc = x86_emulate(&ctxt, &emulops); + if ( (rc != X86EMUL_OKAY) || + memcmp(res + 0x80, res + 0x100, 0x200) || + (regs.eip != (unsigned long)&instr[5]) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); +#endif + printf("%-40s", "Testing movq %mm3,(%ecx)..."); if ( stack_exec && cpu_has_mmx ) { --- a/tools/tests/x86_emulator/x86-emulate.c +++ b/tools/tests/x86_emulator/x86-emulate.c @@ -30,6 +30,13 @@ struct cpuid_policy cp; static char fpu_save_area[4096] __attribute__((__aligned__((64)))); static bool use_xsave; +/* + * Re-use the area above also as scratch space for the emulator itself. + * (When debugging the emulator, care needs to be taken when inserting + * printf() or alike function calls into regions using this.) + */ +#define FXSAVE_AREA ((struct x86_fxsr *)fpu_save_area) + void emul_save_fpu_state(void) { if ( use_xsave ) --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -860,6 +860,11 @@ struct x86_emulate_state { blk_fld, /* FLDENV, FRSTOR */ blk_fst, /* FNSTENV, FNSAVE */ #endif +#if !defined(X86EMUL_NO_FPU) && !defined(X86EMUL_NO_MMX) && \ + !defined(X86EMUL_NO_SIMD) + blk_fxrstor, + blk_fxsave, +#endif blk_movdir, } blk; uint8_t modrm, modrm_mod, modrm_reg, modrm_rm; @@ -953,6 +958,29 @@ typedef union { uint32_t data32[16]; } mmval_t; +struct x86_fxsr { + uint16_t fcw; + uint16_t fsw; + uint16_t ftw:8, :8; + uint16_t fop; + union { + struct { + uint32_t offs; + uint32_t sel:16, :16; + }; + uint64_t addr; + } fip, fdp; + uint32_t mxcsr; + uint32_t mxcsr_mask; + struct { + uint8_t data[10]; + uint8_t _[6]; + } fpreg[8]; + uint64_t __attribute__ ((aligned(16))) xmm[16][2]; + uint64_t _[6]; + uint64_t avl[6]; +}; + /* * While proper alignment gets specified above, this doesn't get honored by * the compiler for automatic variables. Use this helper to instantiate a @@ -1910,6 +1938,7 @@ amd_like(const struct x86_emulate_ctxt * #define vcpu_has_cmov() (ctxt->cpuid->basic.cmov) #define vcpu_has_clflush() (ctxt->cpuid->basic.clflush) #define vcpu_has_mmx() (ctxt->cpuid->basic.mmx) +#define vcpu_has_fxsr() (ctxt->cpuid->basic.fxsr) #define vcpu_has_sse() (ctxt->cpuid->basic.sse) #define vcpu_has_sse2() (ctxt->cpuid->basic.sse2) #define vcpu_has_sse3() (ctxt->cpuid->basic.sse3) @@ -8125,6 +8154,30 @@ x86_emulate( case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */ switch ( modrm_reg & 7 ) { +#if !defined(X86EMUL_NO_FPU) && !defined(X86EMUL_NO_MMX) && \ + !defined(X86EMUL_NO_SIMD) + case 0: /* fxsave */ + case 1: /* fxrstor */ + generate_exception_if(vex.pfx, EXC_UD); + vcpu_must_have(fxsr); + generate_exception_if(ea.type != OP_MEM, EXC_UD); + generate_exception_if(!is_aligned(ea.mem.seg, ea.mem.off, 16, + ctxt, ops), + EXC_GP, 0); + fail_if(!ops->blk); + /* + * This could also be X86EMUL_FPU_mmx, but it shouldn't be + * X86EMUL_FPU_xmm, as we don't want CR4.OSFXSR checked. + */ + get_fpu(X86EMUL_FPU_fpu); + state->blk = modrm_reg & 1 ? blk_fxrstor : blk_fxsave; + if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL, + sizeof(struct x86_fxsr), &_regs.eflags, + state, ctxt)) != X86EMUL_OKAY ) + goto done; + break; +#endif /* X86EMUL_NO_{FPU,MMX,SIMD} */ + #ifndef X86EMUL_NO_SIMD case 2: /* ldmxcsr */ generate_exception_if(vex.pfx, EXC_UD); @@ -11611,6 +11664,8 @@ int x86_emul_blk( struct x86_emulate_state *state, struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_OKAY; + switch ( state->blk ) { bool zf; @@ -11815,7 +11870,79 @@ int x86_emul_blk( if ( ptr ) memcpy(ptr, fpstate.freg, sizeof(fpstate.freg)); break; -#endif + +# if !defined(X86EMUL_NO_MMX) && !defined(X86EMUL_NO_SIMD) + + case blk_fxrstor: + { + struct x86_fxsr *fxsr = FXSAVE_AREA; + + ASSERT(!data); + ASSERT(bytes == sizeof(*fxsr)); + + if ( mode_64bit() ) + { + /* We could probably also copy the entire area. */ + memcpy(fxsr, ptr, offsetof(struct x86_fxsr, _)); + memset(fxsr->_, 0, sizeof(*fxsr) - offsetof(struct x86_fxsr, _)); + } + else + { + memcpy(fxsr, ptr, offsetof(struct x86_fxsr, xmm[8])); + /* + * We can't issue a suitable FXRSTOR loading just XMM0...XMM7. Zeroing + * the high registers to be loaded isn't entirely correct, but the + * alternative would be quite a bit more involved: Synthesize the insn + * from FRSTOR, LDMXCSR, and individual XMM register loads. + */ + memset(fxsr->xmm[8], 0, + sizeof(*fxsr) - offsetof(struct x86_fxsr, xmm[8])); + } + + generate_exception_if(fxsr->mxcsr & ~mxcsr_mask, EXC_GP, 0); + +# ifdef __x86_64__ + if ( state->rex_prefix & REX_W ) + { + /* + * The only way to force fxrstorq on a wide range of gas versions. + * On older versions the rex64 prefix works only if we force an + * addressing mode that doesn't require extended registers. + */ + asm volatile ( ".byte 0x48; fxrstor (%0)" + :: "R" (fxsr), "m" (*fxsr) ); + } + else +# endif + asm volatile ( "fxrstor %0" :: "m" (*fxsr) ); + break; + } + + case blk_fxsave: + ASSERT(!data); + ASSERT(bytes == sizeof(struct x86_fxsr)); + +# ifdef __x86_64__ + if ( !mode_64bit() ) + { + struct x86_fxsr *fxsr = FXSAVE_AREA; + + asm volatile ( "fxsave %0" : "=m" (*fxsr) ); + memcpy(ptr, fxsr, offsetof(struct x86_fxsr, xmm[8])); + } + else if ( state->rex_prefix & REX_W ) + { + /* See above for why operand/constraints are this way. */ + asm volatile ( ".byte 0x48; fxsave (%0)" + :: "R" (ptr) : "memory" ); + } + else +# endif + asm volatile ( "fxsave (%0)" :: "r" (ptr) : "memory" ); + break; + +# endif /* X86EMUL_NO_{MMX,SIMD} */ +#endif /* X86EMUL_NO_FPU */ case blk_movdir: switch ( bytes ) @@ -11870,7 +11997,8 @@ int x86_emul_blk( return X86EMUL_UNHANDLEABLE; } - return X86EMUL_OKAY; + done: + return rc; } static void __init __maybe_unused build_assertions(void) --- a/xen/arch/x86/x86_emulate.c +++ b/xen/arch/x86/x86_emulate.c @@ -42,6 +42,8 @@ } \ }) +#define FXSAVE_AREA current->arch.fpu_ctxt + #ifndef CONFIG_HVM # define X86EMUL_NO_FPU # define X86EMUL_NO_MMX