Message ID | 95252da8-777b-9527-6f5b-1e1a5994f845@suse.com |
---|---|
State | Superseded |
Series | x86emul: remaining AVX512 support |
On 01/07/2019 12:18, Jan Beulich wrote:
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -9100,6 +9100,133 @@ x86_emulate(
> [...]
> +        for ( i = 0; op_mask; ++i )
> +        {
> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];

No signed. However, surely this needs to be int64_t anyway, to function
correctly in a 32bit build of the test harness?

The SDM says VPGATHERQD is encodable in 32bit with quadword indices.

~Andrew

> [...]
On 04/07/2019 15:10, Andrew Cooper wrote:
> On 01/07/2019 12:18, Jan Beulich wrote:
>> [...]
>> +            rc = ops->read(ea.mem.seg,
>> +                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
>> [...]
> [...]

Actually, what the SDM says is:

"The scaled index may require more bits to represent than the address
bits used by the processor (e.g., in 32-bit mode, if the scale is
greater than one). In this case, the most significant bits beyond the
number of address bits are ignored."

That reads as if it means "ea.mem.off + (u32)(idx << state->sib_scale)".
However, given the overall truncation, I'm not sure how to confirm what
the real behaviour is.

~Andrew
On 04.07.2019 16:10, Andrew Cooper wrote:
> On 01/07/2019 12:18, Jan Beulich wrote:
>> [...]
>> +        for ( i = 0; op_mask; ++i )
>> +        {
>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>
> No signed.

Hmm - would you mind this remaining consistent with the AVX counterpart
code? (As an aside, I continue to think it is a bad thing not to have an
explicit "signed" when we actually mean signed quantities, seeing the
still large number of plain short/int/long uses that actually should be
unsigned.)

> However, surely this needs to be int64_t anyway, to function
> correctly in a 32bit build of the test harness?

No, only 32 bits (or fewer, when the scale factor is larger than 1) will
be used for address calculation. And again this is no different from the
pre-existing AVX code.

> The SDM says VPGATHERQD is encodable in 32bit with quadword indices.

Sure, truncating to just 32-bit values due to 32-bit addressing.

Jan
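To make Jan's point concrete: with 32-bit addressing, only the low 32 bits of the scaled index can ever reach the final linear address, so keeping even a quadword index in a 32-bit `long` loses nothing. A minimal standalone sketch, not part of the patch; the base address and index values are made up for illustration:

```c
#include <assert.h>
#include <stdint.h>

/* Model of a 32-bit effective-address computation: wraps mod 2^32. */
static uint32_t ea32(uint32_t base, uint64_t idx, unsigned int scale)
{
    return base + (uint32_t)(idx << scale);
}

int main(void)
{
    uint64_t idx = 0xdeadbeef00001000ULL; /* quadword index, e.g. VPGATHERQD */
    uint32_t lo = (uint32_t)idx;          /* what a 32-bit "long" would keep */
    unsigned int scale;

    /* Low 32 bits of (idx << scale) depend only on the low 32 bits of idx. */
    for ( scale = 0; scale <= 3; ++scale )
        assert(ea32(0x08000000, idx, scale) == ea32(0x08000000, lo, scale));

    return 0;
}
```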
On 04.07.2019 16:16, Andrew Cooper wrote:
> On 04/07/2019 15:10, Andrew Cooper wrote:
>> On 01/07/2019 12:18, Jan Beulich wrote:
>>> [...]
>>> +            rc = ops->read(ea.mem.seg,
>>> +                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
>
> Actually, what the SDM says is:
>
> "The scaled index may require more bits to represent than the address
> bits used by the processor (e.g., in 32-bit mode, if the scale is
> greater than one). In this case, the most significant bits beyond the
> number of address bits are ignored."
>
> That reads as if it means "ea.mem.off + (u32)(idx << state->sib_scale)".

Why "reads as if"? What else could a 32-bit address computation look
like? (In essence truncate_ea() will truncate to 32 bits anyway when
32-bit addressing is in use, so the inner truncation is simply
redundant.)

Jan
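The redundancy Jan describes is plain modular arithmetic: reducing the final sum mod 2^32 gives the same result whether or not the scaled index was truncated first. A small sketch, with a hypothetical truncate32() standing in for what truncate_ea() does under 32-bit addressing (the real macro also handles other address sizes):

```c
#include <assert.h>
#include <stdint.h>

/* Stand-in for truncate_ea() with a 32-bit address size. */
static uint32_t truncate32(uint64_t ea)
{
    return (uint32_t)ea;
}

int main(void)
{
    uint64_t off = 0x12345678;       /* plays the role of ea.mem.off */
    int64_t idx = -0x123456789abcLL; /* arbitrary signed index */
    unsigned int scale = 2;
    uint64_t scaled = (uint64_t)idx << scale;

    /* Truncating only at the end ... */
    uint32_t outer = truncate32(off + scaled);
    /* ... equals truncating the scaled index first, per the SDM wording. */
    uint32_t inner = truncate32(off + (uint32_t)scaled);

    assert(outer == inner); /* addition commutes with reduction mod 2^32 */
    return 0;
}
```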
On 04/07/2019 15:25, Jan Beulich wrote:
> On 04.07.2019 16:16, Andrew Cooper wrote:
>> [...]
>> That reads as if it means "ea.mem.off + (u32)(idx << state->sib_scale)".
>
> Why "reads as if"? What else could a 32-bit address computation look
> like? (In essence truncate_ea() will truncate to 32 bits anyway when
> 32-bit addressing is in use, so the inner truncation is simply
> redundant.)

Ok - I think it will DTRT.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
On 04/07/2019 15:22, Jan Beulich wrote:
> On 04.07.2019 16:10, Andrew Cooper wrote:
>> [...]
>>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>> No signed.
>
> Hmm - would you mind this remaining consistent with the AVX counterpart
> code? (As an aside, I continue to think it is a bad thing not to have an
> explicit "signed" when we actually mean signed quantities, seeing the
> still large number of plain short/int/long uses that actually should be
> unsigned.)

That was conclusively objected to by multiple other committers, for a
number of reasons. It is unfortunate that some examples slipped in, but
as the coding style is not changing, they should be taken out.

~Andrew
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -18,7 +18,7 @@ CFLAGS += $(CFLAGS_xeninclude)
 
 SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er
 FMA := fma4 fma
-SG := avx2-sg
+SG := avx2-sg avx512f-sg avx512vl-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
 OPMASK := avx512f avx512dq avx512bw
@@ -66,6 +66,14 @@ xop-flts := $(avx-flts)
 avx512f-vecs := 64 16 32
 avx512f-ints := 4 8
 avx512f-flts := 4 8
+avx512f-sg-vecs := 64
+avx512f-sg-idxs := 4 8
+avx512f-sg-ints := $(avx512f-ints)
+avx512f-sg-flts := $(avx512f-flts)
+avx512vl-sg-vecs := 16 32
+avx512vl-sg-idxs := $(avx512f-sg-idxs)
+avx512vl-sg-ints := $(avx512f-ints)
+avx512vl-sg-flts := $(avx512f-flts)
 avx512bw-vecs := $(avx512f-vecs)
 avx512bw-ints := 1 2
 avx512bw-flts :=
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -176,6 +176,8 @@ static const struct test avx512f_all[] =
     INSN(fnmsub213, 66, 0f38, af, el, sd, el),
     INSN(fnmsub231, 66, 0f38, be, vl, sd, vl),
     INSN(fnmsub231, 66, 0f38, bf, el, sd, el),
+    INSN(gatherd, 66, 0f38, 92, vl, sd, el),
+    INSN(gatherq, 66, 0f38, 93, vl, sd, el),
     INSN(getexp, 66, 0f38, 42, vl, sd, vl),
     INSN(getexp, 66, 0f38, 43, el, sd, el),
     INSN(getmant, 66, 0f3a, 26, vl, sd, vl),
@@ -229,6 +231,8 @@ static const struct test avx512f_all[] =
     INSN(permt2, 66, 0f38, 7e, vl, dq, vl),
     INSN(permt2, 66, 0f38, 7f, vl, sd, vl),
     INSN(pexpand, 66, 0f38, 89, vl, dq, el),
+    INSN(pgatherd, 66, 0f38, 90, vl, dq, el),
+    INSN(pgatherq, 66, 0f38, 91, vl, dq, el),
     INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl),
     INSN(pmaxu, 66, 0f38, 3f, vl, dq, vl),
     INSN(pmins, 66, 0f38, 39, vl, dq, vl),
--- a/tools/tests/x86_emulator/simd-sg.c
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -35,13 +35,78 @@ typedef long long __attribute__((vector_
 #define ITEM_COUNT (VEC_SIZE / ELEM_SIZE < IVEC_SIZE / IDX_SIZE ? \
                     VEC_SIZE / ELEM_SIZE : IVEC_SIZE / IDX_SIZE)
 
-#if VEC_SIZE == 16
-# define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
-#else
-# define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
-#endif
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# if ELEM_SIZE == 4
+#  if IDX_SIZE == 4 || defined(__AVX512VL__)
+#   define to_mask(msk) B(ptestmd, , (vsi_t)(msk), (vsi_t)(msk), ~0)
+#   define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
+#  else
+#   define widen(x) __builtin_ia32_pmovzxdq512_mask((vsi_t)(x), (idi_t){}, ~0)
+#   define to_mask(msk) __builtin_ia32_ptestmq512(widen(msk), widen(msk), ~0)
+#   define eq(x, y) (__builtin_ia32_pcmpeqq512_mask(widen(x), widen(y), ~0) == ALL_TRUE)
+#  endif
+#  define BG_(dt, it, reg, mem, idx, msk, scl) \
+    __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+# else
+#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+#  define BG_(dt, it, reg, mem, idx, msk, scl) \
+    __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+# endif
+/*
+ * Instead of replicating the main IDX_SIZE conditional below three times, use
+ * a double layer of macro invocations, allowing for substitution of the
+ * respective relevant macro argument tokens.
+ */
+# define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# if VEC_MAX < 64
+/*
+ * The sub-512-bit built-ins have an extra "3" infix, presumably because the
+ * 512-bit names were chosen without the AVX512VL extension in mind (and hence
+ * making the latter collide with the AVX2 ones).
+ */
+#  define si 3si
+#  define di 3di
+# endif
+# if VEC_MAX == 16
+#  define v8df v2df
+#  define v8di v2di
+#  define v16sf v4sf
+#  define v16si v4si
+# elif VEC_MAX == 32
+#  define v8df v4df
+#  define v8di v4di
+#  define v16sf v8sf
+#  define v16si v8si
+# endif
+# if IDX_SIZE == 4
+#  if INT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+#  elif INT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+#  elif FLOAT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+#  elif FLOAT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+#  endif
+# elif IDX_SIZE == 8
+#  if INT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+#  elif INT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+#  elif FLOAT_SIZE == 4
+#   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+#  elif FLOAT_SIZE == 8
+#   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+#  endif
+# endif
+#elif defined(__AVX2__)
+# if VEC_SIZE == 16
+#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vec_t){} == 0)
+# else
+#  define to_bool(cmp) __builtin_ia32_ptestc256(cmp, (vec_t){} == 0)
+# endif
 
-#if defined(__AVX2__)
 # if VEC_MAX == 16
 #  if IDX_SIZE == 4
 #   if INT_SIZE == 4
@@ -111,6 +176,10 @@ typedef long long __attribute__((vector_
 # endif
 #endif
 
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
 #define GLUE_(x, y) x ## y
 #define GLUE(x, y) GLUE_(x, y)
@@ -119,6 +188,7 @@ typedef long long __attribute__((vector_
 #define PUT8(n)  PUT4(n),  PUT4((n) + 4)
 #define PUT16(n) PUT8(n),  PUT8((n) + 8)
 #define PUT32(n) PUT16(n), PUT16((n) + 16)
+#define PUT64(n) PUT32(n), PUT32((n) + 32)
 
 const typeof((vec_t){}[0]) array[] = {
     GLUE(PUT, VEC_MAX)(1),
@@ -174,7 +244,7 @@ int sg_test(void)
     y = gather(full, array + ITEM_COUNT, -idx, full, ELEM_SIZE);
 
 #if ITEM_COUNT == ELEM_COUNT
-    if ( !to_bool(y == x - 1) )
+    if ( !eq(y, x - 1) )
         return __LINE__;
 #else
     for ( i = 0; i < ITEM_COUNT; ++i )
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -22,6 +22,8 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
+#include "avx512f-sg.h"
+#include "avx512vl-sg.h"
 #include "avx512bw.h"
 #include "avx512dq.h"
 #include "avx512er.h"
@@ -90,11 +92,13 @@ static bool simd_check_avx512f(void)
     return cpu_has_avx512f;
 }
 #define simd_check_avx512f_opmask simd_check_avx512f
+#define simd_check_avx512f_sg simd_check_avx512f
 
 static bool simd_check_avx512f_vl(void)
 {
     return cpu_has_avx512f && cpu_has_avx512vl;
 }
+#define simd_check_avx512vl_sg simd_check_avx512f_vl
 
 static bool simd_check_avx512dq(void)
 {
@@ -291,6 +295,14 @@ static const struct {
     SIMD(AVX512F u32x16, avx512f, 64u4),
     SIMD(AVX512F s64x8, avx512f, 64i8),
     SIMD(AVX512F u64x8, avx512f, 64u8),
+    SIMD(AVX512F S/G f32[16x32], avx512f_sg, 64x4f4),
+    SIMD(AVX512F S/G f64[ 8x32], avx512f_sg, 64x4f8),
+    SIMD(AVX512F S/G f32[ 8x64], avx512f_sg, 64x8f4),
+    SIMD(AVX512F S/G f64[ 8x64], avx512f_sg, 64x8f8),
+    SIMD(AVX512F S/G i32[16x32], avx512f_sg, 64x4i4),
+    SIMD(AVX512F S/G i64[ 8x32], avx512f_sg, 64x4i8),
+    SIMD(AVX512F S/G i32[ 8x64], avx512f_sg, 64x8i4),
+    SIMD(AVX512F S/G i64[ 8x64], avx512f_sg, 64x8i8),
     AVX512VL(VL f32x4, avx512f, 16f4),
     AVX512VL(VL f64x2, avx512f, 16f8),
     AVX512VL(VL f32x8, avx512f, 32f4),
@@ -303,6 +315,22 @@ static const struct {
     AVX512VL(VL u64x2, avx512f, 16u8),
     AVX512VL(VL s64x4, avx512f, 32i8),
     AVX512VL(VL u64x4, avx512f, 32u8),
+    SIMD(AVX512VL S/G f32[4x32], avx512vl_sg, 16x4f4),
+    SIMD(AVX512VL S/G f64[2x32], avx512vl_sg, 16x4f8),
+    SIMD(AVX512VL S/G f32[2x64], avx512vl_sg, 16x8f4),
+    SIMD(AVX512VL S/G f64[2x64], avx512vl_sg, 16x8f8),
+    SIMD(AVX512VL S/G f32[8x32], avx512vl_sg, 32x4f4),
+    SIMD(AVX512VL S/G f64[4x32], avx512vl_sg, 32x4f8),
+    SIMD(AVX512VL S/G f32[4x64], avx512vl_sg, 32x8f4),
+    SIMD(AVX512VL S/G f64[4x64], avx512vl_sg, 32x8f8),
+    SIMD(AVX512VL S/G i32[4x32], avx512vl_sg, 16x4i4),
+    SIMD(AVX512VL S/G i64[2x32], avx512vl_sg, 16x4i8),
+    SIMD(AVX512VL S/G i32[2x64], avx512vl_sg, 16x8i4),
+    SIMD(AVX512VL S/G i64[2x64], avx512vl_sg, 16x8i8),
+    SIMD(AVX512VL S/G i32[8x32], avx512vl_sg, 32x4i4),
+    SIMD(AVX512VL S/G i64[4x32], avx512vl_sg, 32x4i8),
+    SIMD(AVX512VL S/G i32[4x64], avx512vl_sg, 32x8i4),
+    SIMD(AVX512VL S/G i64[4x64], avx512vl_sg, 32x8i8),
     SIMD(AVX512BW s8x64, avx512bw, 64i1),
     SIMD(AVX512BW u8x64, avx512bw, 64u1),
     SIMD(AVX512BW s16x32, avx512bw, 64i2),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -499,7 +499,7 @@ static const struct ext0f38_table {
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
-    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
+    [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
     [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0x9a] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -9100,6 +9100,133 @@ x86_emulate(
         put_stub(stub);
 
         if ( rc != X86EMUL_OKAY )
+            goto done;
+
+        state->simd_size = simd_none;
+        break;
+    }
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x90): /* vpgatherd{d,q} mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x91): /* vpgatherq{d,q} mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x92): /* vgatherdp{s,d} mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x93): /* vgatherqp{s,d} mem,[xyz]mm{k} */
+    {
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+        bool done = false;
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+                               evex.reg != 0xf ||
+                               modrm_reg == state->sib_index),
+                              EXC_UD);
+        avx512_vlen_check(false);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        /* Read destination and index registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa{32,64} */
+        /*
+         * The register writeback below has to retain masked-off elements, but
+         * needs to clear upper portions in the index-wider-than-data cases.
+         * Therefore read (and write below) the full register. The alternative
+         * would have been to fiddle with the mask register used.
+         */
+        pevex->opmsk = 0;
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+        pevex->w = b & 1;
+        /* Switch to sib_index as source. */
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the destination and mask values. */
+        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+        op_bytes = 4 << evex.w;
+        memset((void *)mmvalp + n * op_bytes, 0, 64 - n * op_bytes);
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = ops->read(ea.mem.seg,
+                           truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+                           (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                /*
+                 * If we've made some progress and the access did not fault,
+                 * force a retry instead. This is for example necessary to
+                 * cope with the limited capacity of HVM's MMIO cache.
+                 */
+                if ( rc != X86EMUL_EXCEPTION && done )
+                    rc = X86EMUL_RETRY;
+                break;
+            }
+
+            op_mask &= ~(1 << i);
+            done = true;
+
+#ifdef __XEN__
+            if ( op_mask && local_events_need_delivery() )
+            {
+                rc = X86EMUL_RETRY;
+                break;
+            }
+#endif
+        }
+
+        /* Write destination and mask registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x6f; /* vmovdqa{32,64} */
+        pevex->opmsk = 0;
+        /* Use modrm_reg as destination and (%rax) as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        /*
+         * kmovw: This is VEX-encoded, so we can't use pevex. Avoid copy_VEX()
+         * etc as well, since we can easily use the 2-byte VEX form here.
+         */
+        opc -= EVEX_PFX_BYTES;
+        opc[0] = 0xc5;
+        opc[1] = 0xf8;
+        opc[2] = 0x90;
+        /* Use (%rax) as source. */
+        opc[3] = evex.opmsk << 3;
+        opc[4] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+        put_stub(stub);
+
+        if ( rc != X86EMUL_OKAY )
             goto done;
 
         state->simd_size = simd_none;
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Suppress general register update upon failures. Split out ModR/M
    handling changes as well as independent test harness ones into
    prereq patches. Re-base.
v8: Re-base.
v7: Fix ByteOp register decode. Re-base.
v6: New.
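For readers following the retry logic in the emulation loop above: architecturally, a gather clears each mask bit as its element completes, so after a fault or a forced X86EMUL_RETRY the surviving mask describes exactly the outstanding elements, and the instruction can simply be re-executed. A simplified, self-contained model of that loop, with plain in-range array reads in place of ops->read(); all names here are illustrative, not Xen APIs:

```c
#include <stdint.h>

/*
 * Model of a restartable masked gather: dst and mask play the roles of the
 * architectural destination and opmask registers. On a retry the caller
 * re-invokes this with the partially cleared mask, so elements that already
 * completed are not read again. Indices are assumed to be in range.
 */
static void gather32(uint32_t *dst, uint16_t *mask,
                     const uint32_t *mem, const int32_t *idx,
                     unsigned int nelem)
{
    unsigned int i;

    for ( i = 0; *mask && i < nelem; ++i )
    {
        if ( !(*mask & (1u << i)) )
            continue; /* masked-off: element keeps its previous value */

        /* In the emulator this is ops->read() and may fail mid-way. */
        dst[i] = mem[(uint32_t)idx[i]];

        *mask &= ~(1u << i); /* completed: won't be re-read on restart */
    }
}
```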