diff mbox series

[v9,08/23] x86emul: support AVX512PF insns

Message ID 4365e23d-c2aa-dc10-46d0-df38d9c36322@suse.com (mailing list archive)
State Superseded
Headers show
Series x86emul: remaining AVX512 support | expand

Commit Message

Jan Beulich July 1, 2019, 11:20 a.m. UTC
Some adjustments are necessary to the EVEX Disp8 scaling test code to
account for the zero byte reads/writes, which get issued for the test
harness only.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v9: Suppress general register update upon failures. Re-base.
v8: #GP/#SS don't arise here. Add previously missed change to
     emul_test_init().
v7: Re-base.
v6: New.

Comments

Andrew Cooper July 4, 2019, 2:44 p.m. UTC | #1
On 01/07/2019 12:20, Jan Beulich wrote:
> +        /* Clear untouched parts of the mask value. */
> +        n = 1 << (4 - ((b & 1) | evex.w));
> +        op_mask &= (1 << n) - 1;
> +
> +        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
> +        {
> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
> +
> +            if ( !(op_mask & (1 << i)) )
> +                continue;

It occurs from my recent foray into UBSAN that op_mask is 64 bits wide,
although it looks like n can be at maximum 16 in this specific case.

If nothing else, using (1u << i) would reduce the size of the UBSAN
build, but I expect we're soon going to have subtle bugs when we get to
the int8 instructions.

Are there current S/G instructions which can take 32 iterations?

~Andrew
Jan Beulich July 4, 2019, 2:50 p.m. UTC | #2
On 04.07.2019 16:44, Andrew Cooper wrote:
> On 01/07/2019 12:20, Jan Beulich wrote:
>> +        /* Clear untouched parts of the mask value. */
>> +        n = 1 << (4 - ((b & 1) | evex.w));
>> +        op_mask &= (1 << n) - 1;
>> +
>> +        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
>> +        {
>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>> +
>> +            if ( !(op_mask & (1 << i)) )
>> +                continue;
> 
> It occurs from my recent foray into UBSAN that op_mask is 64 bits wide,
> although it looks like n can be at maximum 16 in this specific case.
> 
> If nothing else, using (1u << i) would reduce the size of the UBSAN
> build, but I expect we're soon going to have subtle bugs when we get to
> the int8 instructions.
> 
> Are there current S/G instructions which can take 32 iterations?

No, S/G insns currently only act on vector elements 32 or 64 bits
in size, which means 16 or 8 elements per vector max.

Jan
Andrew Cooper July 4, 2019, 6:29 p.m. UTC | #3
On 04/07/2019 15:50, Jan Beulich wrote:
> On 04.07.2019 16:44, Andrew Cooper wrote:
>> On 01/07/2019 12:20, Jan Beulich wrote:
>>> +        /* Clear untouched parts of the mask value. */
>>> +        n = 1 << (4 - ((b & 1) | evex.w));
>>> +        op_mask &= (1 << n) - 1;
>>> +
>>> +        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
>>> +        {
>>> +            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
>>> +
>>> +            if ( !(op_mask & (1 << i)) )
>>> +                continue;
>> It occurs from my recent foray into UBSAN that op_mask is 64 bits wide,
>> although it looks like n can be at maximum 16 in this specific case.
>>
>> If nothing else, using (1u << 1) would reduce the size of the UBSAN
>> build, but I expect we're soon going to have subtle bugs when we get to
>> the int8 instructions.
>>
>> Are there current S/G instructions which can take 32 iterations?
> No, S/G insns currently only act on vector elements 32 or 64 bits
> in size, which means 16 or 8 elements per vector max.

In which case we're fine for now.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
diff mbox series

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -520,6 +520,17 @@  static const struct test avx512er_512[]
      INSN(rsqrt28, 66, 0f38, cd, el, sd, el),
  };
  
+static const struct test avx512pf_512[] = {
+    INSNX(gatherpf0d,  66, 0f38, c6, 1, vl, sd, el),
+    INSNX(gatherpf0q,  66, 0f38, c7, 1, vl, sd, el),
+    INSNX(gatherpf1d,  66, 0f38, c6, 2, vl, sd, el),
+    INSNX(gatherpf1q,  66, 0f38, c7, 2, vl, sd, el),
+    INSNX(scatterpf0d, 66, 0f38, c6, 5, vl, sd, el),
+    INSNX(scatterpf0q, 66, 0f38, c7, 5, vl, sd, el),
+    INSNX(scatterpf1d, 66, 0f38, c6, 6, vl, sd, el),
+    INSNX(scatterpf1q, 66, 0f38, c7, 6, vl, sd, el),
+};
+
  static const struct test avx512_vbmi_all[] = {
      INSN(permb,         66, 0f38, 8d, vl, b, vl),
      INSN(permi2b,       66, 0f38, 75, vl, b, vl),
@@ -580,7 +591,7 @@  static bool record_access(enum x86_segme
  static int read(enum x86_segment seg, unsigned long offset, void *p_data,
                  unsigned int bytes, struct x86_emulate_ctxt *ctxt)
  {
-    if ( !record_access(seg, offset, bytes) )
+    if ( !record_access(seg, offset, bytes + !bytes) )
          return X86EMUL_UNHANDLEABLE;
      memset(p_data, 0, bytes);
      return X86EMUL_OKAY;
@@ -589,7 +600,7 @@  static int read(enum x86_segment seg, un
  static int write(enum x86_segment seg, unsigned long offset, void *p_data,
                   unsigned int bytes, struct x86_emulate_ctxt *ctxt)
  {
-    if ( !record_access(seg, offset, bytes) )
+    if ( !record_access(seg, offset, bytes + !bytes) )
          return X86EMUL_UNHANDLEABLE;
      return X86EMUL_OKAY;
  }
@@ -597,7 +608,7 @@  static int write(enum x86_segment seg, u
  static void test_one(const struct test *test, enum vl vl,
                       unsigned char *instr, struct x86_emulate_ctxt *ctxt)
  {
-    unsigned int vsz, esz, i;
+    unsigned int vsz, esz, i, n;
      int rc;
      bool sg = strstr(test->mnemonic, "gather") ||
                strstr(test->mnemonic, "scatter");
@@ -725,10 +736,20 @@  static void test_one(const struct test *
      for ( i = 0; i < (test->scale == SC_vl ? vsz : esz); ++i )
           if ( accessed[i] )
               goto fail;
-    for ( ; i < (test->scale == SC_vl ? vsz : esz) + (sg ? esz : vsz); ++i )
+
+    n = test->scale == SC_vl ? vsz : esz;
+    if ( !sg )
+        n += vsz;
+    else if ( !strstr(test->mnemonic, "pf") )
+        n += esz;
+    else
+        ++n;
+
+    for ( ; i < n; ++i )
           if ( accessed[i] != (sg ? (vsz / esz) >> (test->opc & 1 & !evex.w)
                                   : 1) )
               goto fail;
+
      for ( ; i < ARRAY_SIZE(accessed); ++i )
           if ( accessed[i] )
               goto fail;
@@ -887,6 +908,8 @@  void evex_disp8_test(void *instr, struct
      RUN(avx512dq, no128);
      RUN(avx512dq, 512);
      RUN(avx512er, 512);
+#define cpu_has_avx512pf cpu_has_avx512f
+    RUN(avx512pf, 512);
      RUN(avx512_vbmi, all);
      RUN(avx512_vbmi2, all);
  }
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -73,6 +73,7 @@  bool emul_test_init(void)
       */
      cp.basic.movbe = true;
      cp.feat.adx = true;
+    cp.feat.avx512pf = cp.feat.avx512f;
      cp.feat.rdpid = true;
      cp.extd.clzero = true;
  
@@ -135,12 +136,14 @@  int emul_test_cpuid(
          res->c |= 1U << 22;
  
      /*
-     * The emulator doesn't itself use ADCX/ADOX/RDPID, so we can always run
-     * the respective tests.
+     * The emulator doesn't itself use ADCX/ADOX/RDPID nor the S/G prefetch
+     * insns, so we can always run the respective tests.
       */
      if ( leaf == 7 && subleaf == 0 )
      {
          res->b |= 1U << 19;
+        if ( res->b & (1U << 16) )
+            res->b |= 1U << 26;
          res->c |= 1U << 22;
      }
  
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -525,6 +525,7 @@  static const struct ext0f38_table {
      [0xbd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
      [0xbe] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
      [0xbf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xc6 ... 0xc7] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
      [0xc8] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
      [0xc9] = { .simd_size = simd_other },
      [0xca] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
@@ -1871,6 +1872,7 @@  in_protmode(
  #define vcpu_has_smap()        (ctxt->cpuid->feat.smap)
  #define vcpu_has_clflushopt()  (ctxt->cpuid->feat.clflushopt)
  #define vcpu_has_clwb()        (ctxt->cpuid->feat.clwb)
+#define vcpu_has_avx512pf()    (ctxt->cpuid->feat.avx512pf)
  #define vcpu_has_avx512er()    (ctxt->cpuid->feat.avx512er)
  #define vcpu_has_sha()         (ctxt->cpuid->feat.sha)
  #define vcpu_has_avx512bw()    (ctxt->cpuid->feat.avx512bw)
@@ -9410,6 +9412,97 @@  x86_emulate(
  
          state->simd_size = simd_none;
          break;
+    }
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xc6):
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xc7):
+    {
+#ifndef __XEN__
+        typeof(evex) *pevex;
+        union {
+            int32_t dw[16];
+            int64_t qw[8];
+        } index;
+#endif
+
+        ASSERT(ea.type == OP_MEM);
+        generate_exception_if((!cpu_has_avx512f || !evex.opmsk || evex.brs ||
+                               evex.z || evex.reg != 0xf || evex.lr != 2),
+                              EXC_UD);
+
+        switch ( modrm_reg & 7 )
+        {
+        case 1: /* vgatherpf0{d,q}p{s,d} mem{k} */
+        case 2: /* vgatherpf1{d,q}p{s,d} mem{k} */
+        case 5: /* vscatterpf0{d,q}p{s,d} mem{k} */
+        case 6: /* vscatterpf1{d,q}p{s,d} mem{k} */
+            vcpu_must_have(avx512pf);
+            break;
+        default:
+            generate_exception(EXC_UD);
+        }
+
+        get_fpu(X86EMUL_FPU_zmm);
+
+#ifndef __XEN__
+        /*
+         * For the test harness perform zero byte memory accesses, such that
+         * in particular correct Disp8 scaling can be verified.
+         */
+        fail_if((modrm_reg & 4) && !ops->write);
+
+        /* Read index register. */
+        opc = init_evex(stub);
+        pevex = copy_EVEX(opc, evex);
+        pevex->opcx = vex_0f;
+        /* vmovdqu{32,64} */
+        opc[0] = 0x7f;
+        pevex->pfx = vex_f3;
+        pevex->w = b & 1;
+        /* Use (%rax) as destination and sib_index as source. */
+        pevex->b = 1;
+        opc[1] = (state->sib_index & 7) << 3;
+        pevex->r = !mode_64bit() || !(state->sib_index & 0x08);
+        pevex->R = !mode_64bit() || !(state->sib_index & 0x10);
+        pevex->RX = 1;
+        opc[2] = 0xc3;
+
+        invoke_stub("", "", "=m" (index) : "a" (&index));
+        put_stub(stub);
+
+        /* Clear untouched parts of the mask value. */
+        n = 1 << (4 - ((b & 1) | evex.w));
+        op_mask &= (1 << n) - 1;
+
+        for ( i = 0; rc == X86EMUL_OKAY && op_mask; ++i )
+        {
+            signed long idx = b & 1 ? index.qw[i] : index.dw[i];
+
+            if ( !(op_mask & (1 << i)) )
+                continue;
+
+            rc = (modrm_reg & 4
+                  ? ops->write
+                  : ops->read)(ea.mem.seg,
+                               truncate_ea(ea.mem.off +
+                                           (idx << state->sib_scale)),
+                               NULL, 0, ctxt);
+            if ( rc == X86EMUL_EXCEPTION )
+            {
+                /* Squash memory access related exceptions. */
+                x86_emul_reset_event(ctxt);
+                rc = X86EMUL_OKAY;
+            }
+
+            op_mask &= ~(1 << i);
+        }
+
+        if ( rc != X86EMUL_OKAY )
+            goto done;
+#endif
+
+        state->simd_size = simd_none;
+        break;
      }
  
      case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */