diff mbox series

[v9,07/23] x86emul: support AVX512F scatter insns

Message ID 34deb8ec-fe37-0c99-edcf-c28bae0620c6@suse.com (mailing list archive)
State Superseded
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich July 1, 2019, 11:20 a.m. UTC
This completes support of AVX512F in the insn emulator.

Note that the test harness needs a little bit of trickery to get around
the not fully consistent naming of the AVX512VL gather and scatter
compiler built-ins. To suppress expansion of the "di" and "si" tokens,
BS() constructs them by token concatenation, which is different from
BG().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: I couldn't really decide whether to duplicate code or merge scatter
      into gather emulation.
---
v9: Suppress general register update upon failures.
v7: Re-base.
v6: New.

Comments

Andrew Cooper July 4, 2019, 2:19 p.m. UTC | #1
On 01/07/2019 12:20, Jan Beulich wrote:
> This completes support of AVX512F in the insn emulator.
>
> Note that in the test harness there's a little bit of trickery needed to
> get around the not fully consistent naming of AVX512VL gather and
> scatter compiler built-ins. To suppress expansion of the "di" and "si"
> tokens they get constructed by token concatenation in BS(), which is
> different from BG().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>, subject to the
resolution of the related comments on patch 5.
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -270,6 +270,8 @@  static const struct test avx512f_all[] =
      INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
      INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
      INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pscatterd,    66, 0f38, a0,    vl,     dq, el),
+    INSN(pscatterq,    66, 0f38, a1,    vl,     dq, el),
      INSN(pshufd,       66,   0f, 70,    vl,      d, vl),
      INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
      INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
@@ -305,6 +307,8 @@  static const struct test avx512f_all[] =
      INSN(rsqrt14,      66, 0f38, 4f,    el,     sd, el),
      INSN(scalef,       66, 0f38, 2c,    vl,     sd, vl),
      INSN(scalef,       66, 0f38, 2d,    el,     sd, el),
+    INSN(scatterd,     66, 0f38, a2,    vl,     sd, el),
+    INSN(scatterq,     66, 0f38, a3,    vl,     sd, el),
      INSN_PFP(shuf,           0f, c6),
      INSN_FP(sqrt,            0f, 51),
      INSN_FP(sub,             0f, 5c),
--- a/tools/tests/x86_emulator/simd-sg.c
+++ b/tools/tests/x86_emulator/simd-sg.c
@@ -48,10 +48,14 @@  typedef long long __attribute__((vector_
  #  endif
  #  define BG_(dt, it, reg, mem, idx, msk, scl) \
      __builtin_ia32_gather##it##dt(reg, mem, idx, to_mask(msk), scl)
+#  define BS_(dt, it, mem, idx, reg, msk, scl) \
+    __builtin_ia32_scatter##it##dt(mem, to_mask(msk), idx, reg, scl)
  # else
  #  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
  #  define BG_(dt, it, reg, mem, idx, msk, scl) \
      __builtin_ia32_gather##it##dt(reg, mem, idx, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), scl)
+#  define BS_(dt, it, mem, idx, reg, msk, scl) \
+    __builtin_ia32_scatter##it##dt(mem, B(ptestmq, , (vdi_t)(msk), (vdi_t)(msk), ~0), idx, reg, scl)
  # endif
  /*
   * Instead of replicating the main IDX_SIZE conditional below three times, use
@@ -59,6 +63,7 @@  typedef long long __attribute__((vector_
   * respective relevant macro argument tokens.
   */
  # define BG(dt, it, reg, mem, idx, msk, scl) BG_(dt, it, reg, mem, idx, msk, scl)
+# define BS(dt, it, mem, idx, reg, msk, scl) BS_(dt, it##i, mem, idx, reg, msk, scl)
  # if VEC_MAX < 64
  /*
   * The sub-512-bit built-ins have an extra "3" infix, presumably because the
@@ -82,22 +87,30 @@  typedef long long __attribute__((vector_
  # if IDX_SIZE == 4
  #  if INT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16si, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16si, s, mem, idx, reg, msk, scl)
  #  elif INT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, si, (vdi_t)(reg), mem, idx, msk, scl))
+#   define scatter(mem, idx, reg, msk, scl) BS(v8di, s, mem, idx, (vdi_t)(reg), msk, scl)
  #  elif FLOAT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16sf, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, s, mem, idx, reg, msk, scl)
  #  elif FLOAT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) BG(v8df, si, reg, mem, idx, msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v8df, s, mem, idx, reg, msk, scl)
  #  endif
  # elif IDX_SIZE == 8
  #  if INT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16si, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16si, d, mem, (idi_t)(idx), reg, msk, scl)
  #  elif INT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) (vec_t)(BG(v8di, di, (vdi_t)(reg), mem, (idi_t)(idx), msk, scl))
+#   define scatter(mem, idx, reg, msk, scl) BS(v8di, d, mem, (idi_t)(idx), (vdi_t)(reg), msk, scl)
  #  elif FLOAT_SIZE == 4
  #   define gather(reg, mem, idx, msk, scl) BG(v16sf, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v16sf, d, mem, (idi_t)(idx), reg, msk, scl)
  #  elif FLOAT_SIZE == 8
  #   define gather(reg, mem, idx, msk, scl) BG(v8df, di, reg, mem, (idi_t)(idx), msk, scl)
+#   define scatter(mem, idx, reg, msk, scl) BS(v8df, d, mem, (idi_t)(idx), reg, msk, scl)
  #  endif
  # endif
  #elif defined(__AVX2__)
@@ -195,6 +208,8 @@  const typeof((vec_t){}[0]) array[] = {
      GLUE(PUT, VEC_MAX)(VEC_MAX + 1)
  };
  
+typeof((vec_t){}[0]) out[VEC_MAX * 2];
+
  int sg_test(void)
  {
      unsigned int i;
@@ -275,5 +290,41 @@  int sg_test(void)
  # endif
  #endif
  
+#ifdef scatter
+
+    for ( i = 0; i < sizeof(out) / sizeof(*out); ++i )
+        out[i] = 0;
+
+    for ( i = 0; i < ITEM_COUNT; ++i )
+        x[i] = i + 1;
+
+    touch(x);
+
+    scatter(out, (idx_t){}, x, (vec_t){ 1 } != 0, 1);
+    if ( out[0] != 1 )
+        return __LINE__;
+    for ( i = 1; i < ITEM_COUNT; ++i )
+        if ( out[i] )
+            return __LINE__;
+
+    scatter(out, (idx_t){}, x, full, 1);
+    if ( out[0] != ITEM_COUNT )
+        return __LINE__;
+    for ( i = 1; i < ITEM_COUNT; ++i )
+        if ( out[i] )
+            return __LINE__;
+
+    scatter(out, idx, x, full, ELEM_SIZE);
+    for ( i = 1; i <= ITEM_COUNT; ++i )
+        if ( out[i] != i )
+            return __LINE__;
+
+    scatter(out, inv, x, full, ELEM_SIZE);
+    for ( i = 1; i <= ITEM_COUNT; ++i )
+        if ( out[i] != ITEM_COUNT + 1 - i )
+            return __LINE__;
+
+#endif
+
      return 0;
  }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -508,6 +508,7 @@  static const struct ext0f38_table {
      [0x9d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0x9e] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0x9f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xa0 ... 0xa3] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xa9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xaa] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -9312,6 +9313,105 @@  x86_emulate(
              avx512_vlen_check(true);
          goto simd_zmm;
  
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa0): /* vpscatterd{d,q} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa1): /* vpscatterq{d,q} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa2): /* vscatterdp{s,d} [xyz]mm,mem{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa3): /* vscatterqp{s,d} [xyz]mm,mem{k} */
+    {
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+        bool done = false;
+
+        ASSERT(ea.type == OP_MEM);
+        fail_if(!ops->write);
+        generate_exception_if((!evex.opmsk || evex.brs || evex.z ||
+                               evex.reg != 0xf ||
+                               modrm_reg == state->sib_index),
+                              EXC_UD);
+        avx512_vlen_check(false);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        /* Read source and index registers. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        opc[0] = 0x7f; /* vmovdqa{32,64} */
+        /* Use (%rax) as destination and modrm_reg as source. */
+        pevex->b = 1;
+        opc[1] = (modrm_reg & 7) << 3;
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (*mmvalp) : "a" (mmvalp));
+
+        pevex->pfx = vex_f3; /* vmovdqu{32,64} */
+        pevex->w = b & 1;
+        /* Switch to sib_index as source. */
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        opc[1] = (state->sib_index & 7) << 3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the mask value. */
+        n = 1 << (2 + evex.lr - ((b & 1) | evex.w));
+        op_bytes = 4 << evex.w;
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = ops->write(ea.mem.seg,
+                            truncate_ea(ea.mem.off + (idx << state->sib_scale)),
+                            (void *)mmvalp + i * op_bytes, op_bytes, ctxt);
+            if ( rc != X86EMUL_OKAY )
+            {
+                /* See comment in gather emulation. */
+                if ( rc != X86EMUL_EXCEPTION && done )
+                    rc = X86EMUL_RETRY;
+                break;
+            }
+
+            op_mask &= ~(1 << i);
+            done = true;
+
+#ifdef __XEN__
+            if ( op_mask && local_events_need_delivery() )
+            {
+                rc = X86EMUL_RETRY;
+                break;
+            }
+#endif
+        }
+
+        /* Write mask register. See comment in gather emulation. */
+        opc = get_stub(stub);
+        opc[0] = 0xc5;
+        opc[1] = 0xf8;
+        opc[2] = 0x90;
+        /* Use (%rax) as source. */
+        opc[3] = evex.opmsk << 3;
+        opc[4] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+        put_stub(stub);
+
+        if ( rc != X86EMUL_OKAY )
+            goto done;
+
+        state->simd_size = simd_none;
+        break;
+    }
+
      case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
      case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
      case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */