@@ -19,7 +19,8 @@ CFLAGS += $(CFLAGS_xeninclude)

SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
+GF := sse2-gf avx2-gf avx512bw-gf
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)

OPMASK := avx512f avx512dq avx512bw
@@ -142,12 +143,17 @@ $(1)-cflags := \
$(foreach flt,$($(1)-flts), \
"-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
endef
+define simd-gf-defs
+$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
+ "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
define opmask-defs
$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
endef

$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))

first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -197,7 +203,10 @@ $(addsuffix .c,$(FMA)):

$(addsuffix .c,$(SG)):
ln -sf simd-sg.c $@

-$(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
+$(addsuffix .c,$(GF)):
+ ln -sf simd-gf.c $@
+
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h

xop.h avx512f.h: simd-fma.c
@@ -591,6 +591,12 @@ static const struct test avx512_vpopcntd
INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
};

+static const struct test gfni_all[] = {
+ INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
+ INSN(gf2p8affineqb, 66, 0f3a, ce, vl, q, vl),
+ INSN(gf2p8mulb, 66, 0f38, cf, vl, b, vl),
+};
+
/*
* The uses of b in this table are simply (one of) the shortest form(s) of
* saying "no broadcast" without introducing a 128-bit granularity enumerator.
@@ -987,6 +993,7 @@ void evex_disp8_test(void *instr, struct

if ( cpu_has_avx512f )
{
+ RUN(gfni, all);
RUN(vaes, all);
RUN(vpclmulqdq, all);
}
@@ -371,6 +371,7 @@ OVR(cvttsd2siq);
OVR(cvttss2si);
OVR(cvttss2sil);
OVR(cvttss2siq);
+OVR(gf2p8mulb);
OVR(movddup);
OVR(movntdq);
OVR(movntdqa);
@@ -0,0 +1,80 @@
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(gf_test);
+
+#if VEC_SIZE == 16
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a)
+#elif VEC_SIZE == 32
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a)
+#elif VEC_SIZE == 64
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a)
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)
+# define transform(m, dir, x, c) ({ \
+ vec_t t_; \
+ asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], %[dst]" \
+ : [dst] "=v" (t_) \
+ : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE / 8) ); \
+ t_; \
+})
+#else
+# if defined(__AVX2__)
+# define bcstq(x) ({ \
+ vdi_t t_; \
+ asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \
+ t_; \
+})
+# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+# define bcstq(x) ((vdi_t){x, x})
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y))
+# define transform(m, dir, x, c) ({ \
+ vdi_t m_ = bcstq(m); \
+ touch(m_); \
+ ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \
+})
+#endif
+
+const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL;
+
+int gf_test(void)
+{
+ unsigned int i;
+ vec_t src, one;
+
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ {
+ src[i] = i;
+ one[i] = 1;
+ }
+
+ /* Special case for first iteration. */
+ one[0] = 0;
+
+ do {
+ vec_t inv = transform(ident, inv, src, 0);
+
+ touch(src);
+ touch(inv);
+ if ( !eq(mul(src, inv), one) ) return __LINE__;
+
+ touch(src);
+ touch(inv);
+ if ( !eq(mul(inv, src), one) ) return __LINE__;
+
+ one[0] = 1;
+
+ src += ELEM_COUNT;
+ i += ELEM_COUNT;
+ } while ( i < 256 );
+
+ return 0;
+}
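
For reference, the invariant gf_test() checks can be reproduced in plain
scalar code: GF2P8MULB multiplies in GF(2^8) modulo the AES polynomial
x^8 + x^4 + x^3 + x + 1 (0x11b), and the affine-inverse transform with the
identity matrix (the ident constant above, one matrix row per byte) yields
each byte's multiplicative inverse. A minimal standalone sketch, not part
of the patch (gf_mul/gf_inv are illustrative names):

#include <assert.h>
#include <stdint.h>

/* Shift-and-reduce multiplication in GF(2^8) modulo 0x11b, the (fixed)
 * polynomial GF2P8MULB reduces by. */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
    uint8_t r = 0;

    while ( b )
    {
        if ( b & 1 )
            r ^= a;
        a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
        b >>= 1;
    }

    return r;
}

/* Brute-force inverse; 0 maps to 0, matching the test's special-casing
 * of its first element. */
static uint8_t gf_inv(uint8_t x)
{
    unsigned int y;

    for ( y = 1; y < 256; ++y )
        if ( gf_mul(x, y) == 1 )
            return y;

    return 0;
}

int main(void)
{
    unsigned int i;

    /* The property gf_test() verifies vector-wide: x * inv(x) == 1 for
     * all non-zero x. */
    for ( i = 1; i < 256; ++i )
        assert(gf_mul(i, gf_inv(i)) == 1);

    return 0;
}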
@@ -11,12 +11,14 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "3dnow.h"
#include "sse.h"
#include "sse2.h"
+#include "sse2-gf.h"
#include "sse4.h"
#include "avx.h"
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
#include "avx2-sg.h"
+#include "avx2-gf.h"
#include "xop.h"
#include "avx512f-opmask.h"
#include "avx512dq-opmask.h"
@@ -25,6 +27,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512f-sg.h"
#include "avx512vl-sg.h"
#include "avx512bw.h"
+#include "avx512bw-gf.h"
#include "avx512dq.h"
#include "avx512er.h"
#include "avx512vbmi.h"
@@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
return cpu_has_avx512_vbmi && cpu_has_avx512vl;
}

+static bool simd_check_sse2_gf(void)
+{
+ return cpu_has_gfni && cpu_has_sse2;
+}
+
+static bool simd_check_avx2_gf(void)
+{
+ return cpu_has_gfni && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_gf(void)
+{
+ return cpu_has_gfni && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_gf_vl(void)
+{
+ return cpu_has_gfni && cpu_has_avx512vl;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
@@ -395,6 +418,12 @@ static const struct {
AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2),
AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2),
AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2),
+ SIMD(GFNI (legacy), sse2_gf, 16),
+ SIMD(GFNI (VEX/x16), avx2_gf, 16),
+ SIMD(GFNI (VEX/x32), avx2_gf, 32),
+ SIMD(GFNI (EVEX/x64), avx512bw_gf, 64),
+ AVX512VL(VL+GFNI (x16), avx512bw_gf, 16),
+ AVX512VL(VL+GFNI (x32), avx512bw_gf, 32),
#undef AVX512VL_
#undef AVX512VL
#undef SIMD_
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_gfni cp.feat.gfni
#define cpu_has_vaes (cp.feat.vaes && xcr0_mask(6))
#define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
#define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
@@ -540,6 +540,7 @@ static const struct ext0f38_table {
[0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+ [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xf0] = { .two_op = 1 },
@@ -619,6 +620,7 @@ static const struct ext0f3a_table {
[0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
[0xcc] = { .simd_size = simd_other },
+ [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = {},
};
@@ -1890,6 +1892,7 @@ in_protmode(
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_gfni() (ctxt->cpuid->feat.gfni)
#define vcpu_has_vaes() (ctxt->cpuid->feat.vaes)
#define vcpu_has_vpclmulqdq() (ctxt->cpuid->feat.vpclmulqdq)
#define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
@@ -9640,6 +9643,21 @@ x86_emulate(
host_and_vcpu_must_have(avx512er);
goto simd_zmm_scalar_sae;

+ case X86EMUL_OPC_66(0x0f38, 0xcf): /* gf2p8mulb xmm/m128,xmm */
+ host_and_vcpu_must_have(gfni);
+ goto simd_0f38_common;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xcf): /* vgf2p8mulb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_avx;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(evex.w || evex.brs, EXC_UD);
+ elem_bytes = 1;
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10383,6 +10401,24 @@ x86_emulate(
op_bytes = 16;
goto simd_0f3a_common;

+ case X86EMUL_OPC_66(0x0f3a, 0xce): /* gf2p8affineqb $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0xcf): /* gf2p8affineinvqb $imm8,xmm/m128,xmm */
+ host_and_vcpu_must_have(gfni);
+ goto simd_0f3a_common;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(!vex.w, EXC_UD);
+ goto simd_0f_imm8_avx;
+
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(!evex.w, EXC_UD);
+ fault_suppression = false;
+ goto avx512f_imm8_no_sae;
+
case X86EMUL_OPC_66(0x0f3a, 0xdf): /* aeskeygenassist $imm8,xmm/m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(aesni);
@@ -111,6 +111,7 @@
/* CPUID level 0x00000007:0.ecx */
#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_gfni boot_cpu_has(X86_FEATURE_GFNI)
#define cpu_has_vaes boot_cpu_has(X86_FEATURE_VAES)
#define cpu_has_vpclmulqdq boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
#define cpu_has_avx512_vnni boot_cpu_has(X86_FEATURE_AVX512_VNNI)
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) /
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(GFNI, 6*32+ 8) /*A Galois Field Instrs */
XEN_CPUFEATURE(VAES, 6*32+ 9) /*A Vector AES Instrs */
XEN_CPUFEATURE(VPCLMULQDQ, 6*32+10) /*A Vector Carry-less Multiplication Instrs */
XEN_CPUFEATURE(AVX512_VNNI, 6*32+11) /*A Vector Neural Network Instrs */
@@ -201,7 +201,7 @@ def crunch_numbers(state):
# SSE2 was re-specified as core instructions for 64bit. Also ISA
# extensions dealing with vectors of integers are added here rather
# than to SSE.
- SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA],
+ SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA, GFNI],

# Other SSEn each depend on their predecessor versions.
SSE3: [SSSE3],
As to the feature dependency adjustment, while strictly speaking SSE is a
sufficient prereq (to have XMM registers), vectors of bytes and qwords were
introduced only with SSE2. gcc, for example, uses a similar connection in its
respective intrinsics header.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base. Drop stale part of description.
v8: Add {evex}-producing vgf2p8mulb alias to simd.h. Add missing simd.h
    dependency. Re-base.
v7: New.
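
To illustrate the gcc connection mentioned above: the compiler exposes the
legacy-encoded GFNI forms via <immintrin.h> on __m128i, i.e. on an SSE2
vector type, which is why SSE2 (rather than plain SSE) is made the prereq
here. A minimal sketch, assuming a GFNI-capable compiler and CPU (build with
e.g. gcc -mgfni); the expected output is c1, per the {57} x {83} worked
example in FIPS-197:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* __m128i is an SSE2 type; the legacy (66 0f38 cf) encoding of
     * gf2p8mulb operates on it directly. */
    __m128i x = _mm_set1_epi8(0x57);
    __m128i y = _mm_set1_epi8((char)0x83);
    __m128i prod = _mm_gf2p8mul_epi8(x, y);
    unsigned char out[16];

    _mm_storeu_si128((__m128i *)out, prod);
    printf("%02x\n", out[0]);

    return 0;
}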