[v9,19/23] x86emul: support GFNI insns

Message ID: a742e4c5-212c-38e9-e4a2-787946b0e30b@suse.com
State: Superseded
Series: x86emul: remaining AVX512 support

Commit Message

Jan Beulich July 1, 2019, 11:26 a.m. UTC
As to the feature dependency adjustment: while strictly speaking SSE is
a sufficient prereq (to have XMM registers), vectors of bytes and qwords
were introduced only with SSE2. gcc, for example, uses a similar
connection in its respective intrinsics header.
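
(Illustrative sketch, not part of the patch: assuming gcc and <immintrin.h>,
the legacy-encoded GFNI intrinsics take __m128i operands, i.e. the SSE2
integer vector type, which is the connection referred to above; the function
name gf_mul16 below is purely hypothetical.)

/* Minimal example; builds with e.g. "gcc -msse2 -mgfni -c". */
#include <immintrin.h>

__m128i gf_mul16(__m128i x, __m128i y)
{
    /* GF(2^8) multiply of 16 byte elements (legacy-encoded gf2p8mulb). */
    return _mm_gf2p8mul_epi8(x, y);
}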

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Re-base. Drop stale part of description.
v8: Add {evex}-producing vgf2p8mulb alias to simd.h. Add missing simd.h
     dependency. Re-base.
v7: New.

Comments

Andrew Cooper July 4, 2019, 3:10 p.m. UTC | #1
On 01/07/2019 12:26, Jan Beulich wrote:
> As to the feature dependency adjustment: while strictly speaking SSE is
> a sufficient prereq (to have XMM registers), vectors of bytes and qwords
> were introduced only with SSE2. gcc, for example, uses a similar
> connection in its respective intrinsics header.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Patch

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,7 +19,8 @@  CFLAGS += $(CFLAGS_xeninclude)
  SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
  FMA := fma4 fma
  SG := avx2-sg avx512f-sg avx512vl-sg
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
+GF := sse2-gf avx2-gf avx512bw-gf
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
  
  OPMASK := avx512f avx512dq avx512bw
  
@@ -142,12 +143,17 @@  $(1)-cflags := \
  	   $(foreach flt,$($(1)-flts), \
  	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
  endef
+define simd-gf-defs
+$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
+	         "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
  define opmask-defs
  $(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
  endef
  
  $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
  $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
  $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
  
  first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -197,7 +203,10 @@  $(addsuffix .c,$(FMA)):
  $(addsuffix .c,$(SG)):
  	ln -sf simd-sg.c $@
  
-$(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
+$(addsuffix .c,$(GF)):
+	ln -sf simd-gf.c $@
+
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
  
  xop.h avx512f.h: simd-fma.c
  
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,12 @@  static const struct test avx512_vpopcntd
      INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
  };
  
+static const struct test gfni_all[] = {
+    INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
+    INSN(gf2p8affineqb,    66, 0f3a, ce, vl, q, vl),
+    INSN(gf2p8mulb,        66, 0f38, cf, vl, b, vl),
+};
+
  /*
   * The uses of b in this table are simply (one of) the shortest form(s) of
   * saying "no broadcast" without introducing a 128-bit granularity enumerator.
@@ -987,6 +993,7 @@  void evex_disp8_test(void *instr, struct
  
      if ( cpu_has_avx512f )
      {
+        RUN(gfni, all);
          RUN(vaes, all);
          RUN(vpclmulqdq, all);
      }
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -371,6 +371,7 @@  OVR(cvttsd2siq);
  OVR(cvttss2si);
  OVR(cvttss2sil);
  OVR(cvttss2siq);
+OVR(gf2p8mulb);
  OVR(movddup);
  OVR(movntdq);
  OVR(movntdqa);
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-gf.c
@@ -0,0 +1,80 @@ 
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(gf_test);
+
+#if VEC_SIZE == 16
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a)
+#elif VEC_SIZE == 32
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a)
+#elif VEC_SIZE == 64
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a)
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)
+# define transform(m, dir, x, c) ({ \
+    vec_t t_; \
+    asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], %[dst]" \
+          : [dst] "=v" (t_) \
+          : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE / 8) ); \
+    t_; \
+})
+#else
+# if defined(__AVX2__)
+#  define bcstq(x) ({ \
+    vdi_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \
+    t_; \
+})
+#  define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+#  define bcstq(x) ((vdi_t){x, x})
+#  define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y))
+# define transform(m, dir, x, c) ({ \
+    vdi_t m_ = bcstq(m); \
+    touch(m_); \
+    ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \
+})
+#endif
+
+const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL;
+
+int gf_test(void)
+{
+    unsigned int i;
+    vec_t src, one;
+
+    for ( i = 0; i < ELEM_COUNT; ++i )
+    {
+        src[i] = i;
+        one[i] = 1;
+    }
+
+    /* Special case for first iteration. */
+    one[0] = 0;
+
+    do {
+        vec_t inv = transform(ident, inv, src, 0);
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(src, inv), one) ) return __LINE__;
+
+        touch(src);
+        touch(inv);
+        if ( !eq(mul(inv, src), one) ) return __LINE__;
+
+        one[0] = 1;
+
+        src += ELEM_COUNT;
+        i += ELEM_COUNT;
+    } while ( i < 256 );
+
+    return 0;
+}
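
(Illustrative sketch, not part of the patch: a scalar reference of the
GF(2^8) multiplication gf2p8mulb performs per byte, using the reduction
polynomial x^8 + x^4 + x^3 + x + 1 specified for GFNI. The test above
obtains each element's multiplicative inverse via vgf2p8affineinvqb with
the identity bit-matrix "ident" and checks that mul(src, inv) yields 1 for
every non-zero element; gf2p8_mul_ref is a hypothetical helper name.)

#include <stdint.h>

static uint8_t gf2p8_mul_ref(uint8_t a, uint8_t b)
{
    uint8_t r = 0;

    while ( b )
    {
        if ( b & 1 )
            r ^= a;            /* add (XOR over GF(2)) the current multiple */
        b >>= 1;
        /* Double a, reducing modulo x^8 + x^4 + x^3 + x + 1 (0x11b). */
        a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
    }

    return r;
}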
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,12 +11,14 @@  asm ( ".pushsection .test, \"ax\", @prog
  #include "3dnow.h"
  #include "sse.h"
  #include "sse2.h"
+#include "sse2-gf.h"
  #include "sse4.h"
  #include "avx.h"
  #include "fma4.h"
  #include "fma.h"
  #include "avx2.h"
  #include "avx2-sg.h"
+#include "avx2-gf.h"
  #include "xop.h"
  #include "avx512f-opmask.h"
  #include "avx512dq-opmask.h"
@@ -25,6 +27,7 @@  asm ( ".pushsection .test, \"ax\", @prog
  #include "avx512f-sg.h"
  #include "avx512vl-sg.h"
  #include "avx512bw.h"
+#include "avx512bw-gf.h"
  #include "avx512dq.h"
  #include "avx512er.h"
  #include "avx512vbmi.h"
@@ -138,6 +141,26 @@  static bool simd_check_avx512vbmi_vl(voi
      return cpu_has_avx512_vbmi && cpu_has_avx512vl;
  }
  
+static bool simd_check_sse2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_sse2;
+}
+
+static bool simd_check_avx2_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_gf(void)
+{
+    return cpu_has_gfni && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_gf_vl(void)
+{
+    return cpu_has_gfni && cpu_has_avx512vl;
+}
+
  static void simd_set_regs(struct cpu_user_regs *regs)
  {
      if ( cpu_has_mmx )
@@ -395,6 +418,12 @@  static const struct {
      AVX512VL(_VBMI+VL u16x8, avx512vbmi,    16u2),
      AVX512VL(_VBMI+VL s16x16, avx512vbmi,   32i2),
      AVX512VL(_VBMI+VL u16x16, avx512vbmi,   32u2),
+    SIMD(GFNI (legacy),       sse2_gf,        16),
+    SIMD(GFNI (VEX/x16),      avx2_gf,        16),
+    SIMD(GFNI (VEX/x32),      avx2_gf,        32),
+    SIMD(GFNI (EVEX/x64), avx512bw_gf,        64),
+    AVX512VL(VL+GFNI (x16), avx512bw_gf,      16),
+    AVX512VL(VL+GFNI (x32), avx512bw_gf,      32),
  #undef AVX512VL_
  #undef AVX512VL
  #undef SIMD_
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@  static inline bool xcr0_mask(uint64_t ma
  #define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
  #define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_gfni       cp.feat.gfni
  #define cpu_has_vaes      (cp.feat.vaes && xcr0_mask(6))
  #define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
  #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -540,6 +540,7 @@  static const struct ext0f38_table {
      [0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
      [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xf0] = { .two_op = 1 },
@@ -619,6 +620,7 @@  static const struct ext0f3a_table {
      [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
      [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
      [0xcc] = { .simd_size = simd_other },
+    [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
      [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
      [0xf0] = {},
  };
@@ -1890,6 +1892,7 @@  in_protmode(
  #define vcpu_has_avx512vl()    (ctxt->cpuid->feat.avx512vl)
  #define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
  #define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_gfni()        (ctxt->cpuid->feat.gfni)
  #define vcpu_has_vaes()        (ctxt->cpuid->feat.vaes)
  #define vcpu_has_vpclmulqdq()  (ctxt->cpuid->feat.vpclmulqdq)
  #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
@@ -9640,6 +9643,21 @@  x86_emulate(
          host_and_vcpu_must_have(avx512er);
          goto simd_zmm_scalar_sae;
  
+    case X86EMUL_OPC_66(0x0f38, 0xcf):      /* gf2p8mulb xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f38_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xcf):  /* vgf2p8mulb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(evex.w || evex.brs, EXC_UD);
+        elem_bytes = 1;
+        goto avx512f_no_sae;
+
      case X86EMUL_OPC_VEX_66(0x0f38, 0xdc):  /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0xdd):  /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
      case X86EMUL_OPC_VEX_66(0x0f38, 0xde):  /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10383,6 +10401,24 @@  x86_emulate(
          op_bytes = 16;
          goto simd_0f3a_common;
  
+    case X86EMUL_OPC_66(0x0f3a, 0xce):      /* gf2p8affineqb $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0xcf):      /* gf2p8affineinvqb $imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(gfni);
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xce):  /* vgf2p8affineqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf):  /* vgf2p8affineinvqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(gfni);
+        generate_exception_if(!evex.w, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm8_no_sae;
+
      case X86EMUL_OPC_66(0x0f3a, 0xdf):     /* aeskeygenassist $imm8,xmm/m128,xmm */
      case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
          host_and_vcpu_must_have(aesni);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@ 
  /* CPUID level 0x00000007:0.ecx */
  #define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
  #define cpu_has_avx512_vbmi2    boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_gfni            boot_cpu_has(X86_FEATURE_GFNI)
  #define cpu_has_vaes            boot_cpu_has(X86_FEATURE_VAES)
  #define cpu_has_vpclmulqdq      boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
  #define cpu_has_avx512_vnni     boot_cpu_has(X86_FEATURE_AVX512_VNNI)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@  XEN_CPUFEATURE(UMIP,          6*32+ 2) /
  XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
  XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
  XEN_CPUFEATURE(AVX512_VBMI2,  6*32+ 6) /*A  Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(GFNI,          6*32+ 8) /*A  Galois Field Instrs */
  XEN_CPUFEATURE(VAES,          6*32+ 9) /*A  Vector AES Instrs */
  XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /*A  Vector Carry-less Multiplication Instrs */
  XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -201,7 +201,7 @@  def crunch_numbers(state):
          # SSE2 was re-specified as core instructions for 64bit.  Also ISA
          # extensions dealing with vectors of integers are added here rather
          # than to SSE.
-        SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA],
+        SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA, GFNI],
  
          # Other SSEn each depend on their predecessor versions.
          SSE3: [SSSE3],