
[PULL,13/25] target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder

Message ID 20241015141711.528342-14-pbonzini@redhat.com (mailing list archive)
State New, archived
Series [PULL,01/25] target/i386: Don't construct a all-zero entry for CPUID[0xD 0x3f]

Commit Message

Paolo Bonzini Oct. 15, 2024, 2:16 p.m. UTC
The gen_cmpxchg8b and gen_cmpxchg16b functions even have the correct
prototype already; the only thing that needs to be done is removing the
gen_lea_modrm() call.

This moves the last LOCK-enabled instructions to the new decoder.  It is
now possible to assume that gen_multi0F is called only after checking
that PREFIX_LOCK was not specified.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.h     |   2 +
 target/i386/tcg/translate.c      | 121 +------------------------------
 target/i386/tcg/decode-new.c.inc |  34 ++++++---
 target/i386/tcg/emit.c.inc       |  96 ++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 129 deletions(-)
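
For reference, the architectural operation that the converted generators implement can be sketched in plain C (illustration only; the function name and signature below are hypothetical, not QEMU code): CMPXCHG8B compares EDX:EAX with the 64-bit memory operand; on a match it stores ECX:EBX and sets ZF, otherwise it clears ZF and loads the old memory value into EDX:EAX. CMPXCHG16B behaves the same way with 128-bit values and RDX:RAX / RCX:RBX.

#include <stdbool.h>
#include <stdint.h>

/*
 * Data-flow model of CMPXCHG8B m64 (non-atomic sketch; the real
 * instruction with a LOCK prefix performs this read-modify-write
 * atomically).  Name and signature are made up for illustration.
 */
bool cmpxchg8b_model(uint64_t *m64, uint32_t *eax, uint32_t *edx,
                     uint32_t ebx, uint32_t ecx)
{
    uint64_t expected = ((uint64_t)*edx << 32) | *eax;   /* EDX:EAX */
    uint64_t desired  = ((uint64_t)ecx  << 32) | ebx;    /* ECX:EBX */
    uint64_t old = *m64;

    if (old == expected) {
        *m64 = desired;              /* ZF = 1, memory updated */
        return true;
    }
    *eax = (uint32_t)old;            /* ZF = 0, old value -> EDX:EAX */
    *edx = (uint32_t)(old >> 32);
    return false;
}

This also shows why the writeback in gen_CMPXCHG8B below can be unconditional on 32-bit targets (on success the old value already equals EDX:EAX), while on x86_64 it has to be conditional so that a successful compare leaves the source registers untouched and a failing one zero-extends the 32-bit result into RAX/RDX.
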

Comments

Philippe Mathieu-Daudé Oct. 16, 2024, 4:37 p.m. UTC | #1
Hi,

On 15/10/24 11:16, Paolo Bonzini wrote:
> The gen_cmpxchg8b and gen_cmpxchg16b functions even have the correct
> prototype already; the only thing that needs to be done is removing the
> gen_lea_modrm() call.
> 
> This moves the last LOCK-enabled instructions to the new decoder.  It is
> now possible to assume that gen_multi0F is called only after checking
> that PREFIX_LOCK was not specified.
> 
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   target/i386/tcg/decode-new.h     |   2 +
>   target/i386/tcg/translate.c      | 121 +------------------------------
>   target/i386/tcg/decode-new.c.inc |  34 ++++++---
>   target/i386/tcg/emit.c.inc       |  96 ++++++++++++++++++++++++
>   4 files changed, 124 insertions(+), 129 deletions(-)


> +static void gen_CMPXCHG8B(DisasContext *s, X86DecodedInsn *decode)
> +{
> +    TCGv_i64 cmp, val, old;
> +    TCGv Z;
> +
> +    cmp = tcg_temp_new_i64();
> +    val = tcg_temp_new_i64();
> +    old = tcg_temp_new_i64();
> +
> +    /* Construct the comparison values from the register pair. */
> +    tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
> +    tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
> +
> +    /* Only require atomic with LOCK; non-parallel handled in generator. */
> +    if (s->prefix & PREFIX_LOCK) {
> +        tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ);
> +    } else {
> +        tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
> +                                      s->mem_index, MO_TEUQ);
> +    }
> +
> +    /* Set tmp0 to match the required value of Z. */
> +    tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
> +    Z = tcg_temp_new();
> +    tcg_gen_trunc_i64_tl(Z, cmp);
> +
> +    /*
> +     * Extract the result values for the register pair.
> +     * For 32-bit, we may do this unconditionally, because on success (Z=1),
> +     * the old value matches the previous value in EDX:EAX.  For x86_64,
> +     * the store must be conditional, because we must leave the source
> +     * registers unchanged on success, and zero-extend the writeback
> +     * on failure (Z=0).
> +     */
> +    if (TARGET_LONG_BITS == 32) {
> +        tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
> +    } else {
> +        TCGv zero = tcg_constant_tl(0);
> +
> +        tcg_gen_extr_i64_tl(s->T0, s->T1, old);
> +        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
> +                           s->T0, cpu_regs[R_EAX]);
> +        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
> +                           s->T1, cpu_regs[R_EDX]);
> +    }
> +
> +    /* Update Z. */
> +    gen_compute_eflags(s);
> +    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
> +}

On s390x the cdrom-test generates:

tcg/s390x/tcg-target.c.inc:1284:tgen_cmp2: code should not be reached
Paolo Bonzini Oct. 17, 2024, 9:14 a.m. UTC | #2
On Wed, Oct 16, 2024 at 6:37 PM Philippe Mathieu-Daudé
<philmd@linaro.org> wrote:
> On s390x the cdrom-test generates:
>
> tcg/s390x/tcg-target.c.inc:1284:tgen_cmp2: code should not be reached

Backend bug, sent "[PATCH] tcg/s390x: fix constraint for 32-bit
TSTEQ/TSTNE" to fix it.

Paolo

Patch

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index bebc77bd54b..7f23d373ea7 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -114,6 +114,8 @@  typedef enum X86CPUIDFeature {
     X86_FEAT_CLWB,
     X86_FEAT_CMOV,
     X86_FEAT_CMPCCXADD,
+    X86_FEAT_CX8,
+    X86_FEAT_CX16,
     X86_FEAT_F16C,
     X86_FEAT_FMA,
     X86_FEAT_FSGSBASE,
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index be68cde1baa..1d3b5f35c39 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2289,104 +2289,6 @@  static void gen_sty_env_A0(DisasContext *s, int offset, bool align)
     tcg_gen_qemu_st_i128(t, s->tmp0, mem_index, mop);
 }
 
-static void gen_cmpxchg8b(DisasContext *s, X86DecodedInsn *decode)
-{
-    TCGv_i64 cmp, val, old;
-    TCGv Z;
-
-    gen_lea_modrm(s, decode);
-
-    cmp = tcg_temp_new_i64();
-    val = tcg_temp_new_i64();
-    old = tcg_temp_new_i64();
-
-    /* Construct the comparison values from the register pair. */
-    tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
-    tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
-
-    /* Only require atomic with LOCK; non-parallel handled in generator. */
-    if (s->prefix & PREFIX_LOCK) {
-        tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ);
-    } else {
-        tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
-                                      s->mem_index, MO_TEUQ);
-    }
-
-    /* Set tmp0 to match the required value of Z. */
-    tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
-    Z = tcg_temp_new();
-    tcg_gen_trunc_i64_tl(Z, cmp);
-
-    /*
-     * Extract the result values for the register pair.
-     * For 32-bit, we may do this unconditionally, because on success (Z=1),
-     * the old value matches the previous value in EDX:EAX.  For x86_64,
-     * the store must be conditional, because we must leave the source
-     * registers unchanged on success, and zero-extend the writeback
-     * on failure (Z=0).
-     */
-    if (TARGET_LONG_BITS == 32) {
-        tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
-    } else {
-        TCGv zero = tcg_constant_tl(0);
-
-        tcg_gen_extr_i64_tl(s->T0, s->T1, old);
-        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
-                           s->T0, cpu_regs[R_EAX]);
-        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
-                           s->T1, cpu_regs[R_EDX]);
-    }
-
-    /* Update Z. */
-    gen_compute_eflags(s);
-    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
-}
-
-#ifdef TARGET_X86_64
-static void gen_cmpxchg16b(DisasContext *s, X86DecodedInsn *decode)
-{
-    MemOp mop = MO_TE | MO_128 | MO_ALIGN;
-    TCGv_i64 t0, t1;
-    TCGv_i128 cmp, val;
-
-    gen_lea_modrm(s, decode);
-
-    cmp = tcg_temp_new_i128();
-    val = tcg_temp_new_i128();
-    tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
-    tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
-
-    /* Only require atomic with LOCK; non-parallel handled in generator. */
-    if (s->prefix & PREFIX_LOCK) {
-        tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
-    } else {
-        tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
-    }
-
-    tcg_gen_extr_i128_i64(s->T0, s->T1, val);
-
-    /* Determine success after the fact. */
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-    tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]);
-    tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]);
-    tcg_gen_or_i64(t0, t0, t1);
-
-    /* Update Z. */
-    gen_compute_eflags(s);
-    tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0);
-    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1);
-
-    /*
-     * Extract the result values for the register pair.  We may do this
-     * unconditionally, because on success (Z=1), the old value matches
-     * the previous value in RDX:RAX.
-     */
-    tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0);
-    tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1);
-}
-#endif
-
 #include "emit.c.inc"
 
 static void gen_x87(DisasContext *s, X86DecodedInsn *decode)
@@ -2962,29 +2864,10 @@  static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode)
 
     /* now check op code */
     switch (b) {
-    case 0x1c7: /* cmpxchg8b */
+    case 0x1c7: /* RDSEED, RDPID with f3 prefix */
         mod = (modrm >> 6) & 3;
         switch ((modrm >> 3) & 7) {
-        case 1: /* CMPXCHG8, CMPXCHG16 */
-            if (mod == 3) {
-                goto illegal_op;
-            }
-#ifdef TARGET_X86_64
-            if (dflag == MO_64) {
-                if (!(s->cpuid_ext_features & CPUID_EXT_CX16)) {
-                    goto illegal_op;
-                }
-                gen_cmpxchg16b(s, decode);
-                break;
-            }
-#endif
-            if (!(s->cpuid_features & CPUID_CX8)) {
-                goto illegal_op;
-            }
-            gen_cmpxchg8b(s, decode);
-            break;
-
-        case 7: /* RDSEED, RDPID with f3 prefix */
+        case 7:
             if (mod != 3 ||
                 (s->prefix & (PREFIX_LOCK | PREFIX_REPNZ))) {
                 goto illegal_op;
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index ee3ba16116e..fe3bfed147a 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -288,6 +288,25 @@  static void decode_group8(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
     }
 }
 
+static void decode_group9(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry group9_reg =
+        X86_OP_ENTRY0(multi0F);  /* unconverted */
+    static const X86OpEntry cmpxchg8b =
+        X86_OP_ENTRY1(CMPXCHG8B,  M,q,  lock p_00 cpuid(CX8));
+    static const X86OpEntry cmpxchg16b =
+        X86_OP_ENTRY1(CMPXCHG16B, M,dq, lock p_00 cpuid(CX16));
+
+    int modrm = get_modrm(s, env);
+    int op = (modrm >> 3) & 7;
+
+    if ((modrm >> 6) == 3) {
+        *entry = group9_reg;
+    } else if (op == 1) {
+        *entry = REX_W(s) ? cmpxchg16b : cmpxchg8b;
+    }
+}
+
 static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry group15_reg[8] = {
@@ -1203,7 +1222,7 @@  static const X86OpEntry opcodes_0F[256] = {
     [0xc4] = X86_OP_ENTRY4(PINSRW,     V,dq,H,dq,E,w,       vex5 mmx p_00_66),
     [0xc5] = X86_OP_ENTRY3(PEXTRW,     G,d, U,dq,I,b,       vex5 mmx p_00_66),
     [0xc6] = X86_OP_ENTRY4(VSHUF,      V,x, H,x, W,x,       vex4 p_00_66),
-    [0xc7] = X86_OP_ENTRY1(multi0F,    nop,v,               nolea), /* unconverted */
+    [0xc7] = X86_OP_GROUP0(group9),
 
     [0xd0] = X86_OP_ENTRY3(VADDSUB,   V,x, H,x, W,x,        vex2 cpuid(SSE3) p_66_f2),
     [0xd1] = X86_OP_ENTRY3(PSRLW_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
@@ -2245,8 +2264,12 @@  static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return (s->cpuid_features & CPUID_CMOV);
     case X86_FEAT_CLFLUSH:
         return (s->cpuid_features & CPUID_CLFLUSH);
+    case X86_FEAT_CX8:
+        return (s->cpuid_features & CPUID_CX8);
     case X86_FEAT_FXSR:
         return (s->cpuid_features & CPUID_FXSR);
+    case X86_FEAT_CX16:
+        return (s->cpuid_ext_features & CPUID_EXT_CX16);
     case X86_FEAT_F16C:
         return (s->cpuid_ext_features & CPUID_EXT_F16C);
     case X86_FEAT_FMA:
@@ -2726,15 +2749,6 @@  static void disas_insn(DisasContext *s, CPUState *cpu)
         break;
     }
 
-    /*
-     * hack for old decoder: 0F C7 has both instructions that accept LOCK
-     * and instructions that don't, but also needs X86_SPECIAL_NoLoadEA.
-     * Keep this here until CMPXCHG8B/CMPXCHG16B is separated from the
-     * other unconverted opcodes.
-     */
-    if (decode.e.gen == gen_multi0F) {
-        accept_lock = true;
-    }
     if ((s->prefix & PREFIX_LOCK) && !accept_lock) {
         goto illegal_op;
     }
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 29de8bba6f7..fd17a9b1eca 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1788,6 +1788,102 @@  static void gen_CMPXCHG(DisasContext *s, X86DecodedInsn *decode)
     decode->cc_op = CC_OP_SUBB + ot;
 }
 
+static void gen_CMPXCHG16B(DisasContext *s, X86DecodedInsn *decode)
+{
+#ifdef TARGET_X86_64
+    MemOp mop = MO_TE | MO_128 | MO_ALIGN;
+    TCGv_i64 t0, t1;
+    TCGv_i128 cmp, val;
+
+    cmp = tcg_temp_new_i128();
+    val = tcg_temp_new_i128();
+    tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
+
+    /* Only require atomic with LOCK; non-parallel handled in generator. */
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
+    } else {
+        tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
+    }
+
+    tcg_gen_extr_i128_i64(s->T0, s->T1, val);
+
+    /* Determine success after the fact. */
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]);
+    tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]);
+    tcg_gen_or_i64(t0, t0, t1);
+
+    /* Update Z. */
+    gen_compute_eflags(s);
+    tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0);
+    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1);
+
+    /*
+     * Extract the result values for the register pair.  We may do this
+     * unconditionally, because on success (Z=1), the old value matches
+     * the previous value in RDX:RAX.
+     */
+    tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0);
+    tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1);
+#else
+    abort();
+#endif
+}
+
+static void gen_CMPXCHG8B(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 cmp, val, old;
+    TCGv Z;
+
+    cmp = tcg_temp_new_i64();
+    val = tcg_temp_new_i64();
+    old = tcg_temp_new_i64();
+
+    /* Construct the comparison values from the register pair. */
+    tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
+
+    /* Only require atomic with LOCK; non-parallel handled in generator. */
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ);
+    } else {
+        tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
+                                      s->mem_index, MO_TEUQ);
+    }
+
+    /* Set tmp0 to match the required value of Z. */
+    tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
+    Z = tcg_temp_new();
+    tcg_gen_trunc_i64_tl(Z, cmp);
+
+    /*
+     * Extract the result values for the register pair.
+     * For 32-bit, we may do this unconditionally, because on success (Z=1),
+     * the old value matches the previous value in EDX:EAX.  For x86_64,
+     * the store must be conditional, because we must leave the source
+     * registers unchanged on success, and zero-extend the writeback
+     * on failure (Z=0).
+     */
+    if (TARGET_LONG_BITS == 32) {
+        tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
+    } else {
+        TCGv zero = tcg_constant_tl(0);
+
+        tcg_gen_extr_i64_tl(s->T0, s->T1, old);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
+                           s->T0, cpu_regs[R_EAX]);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
+                           s->T1, cpu_regs[R_EDX]);
+    }
+
+    /* Update Z. */
+    gen_compute_eflags(s);
+    tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
+}
+
 static void gen_CPUID(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);