diff mbox series

[12/12] target/ppc: Use gvec to decode XVTSTDC[DS]P

Message ID 20220923214754.217819-13-lucas.araujo@eldorado.org.br (mailing list archive)
State New, archived
Headers show
Series VMX/VSX instructions with gvec | expand

Commit Message

Lucas Mateus Martins Araujo e Castro Sept. 23, 2022, 9:47 p.m. UTC
From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>

Use gvec to translate XVTSTDCSP and XVTSTDCDP.

xvtstdcsp:
rept    loop    patch10             patch12
8       12500   2,70288900          1,24050300 (-54.1%)
25      4000    2,65665700          1,14078900 (-57.1%)
100     1000    2,82795400          1,53337200 (-45.8%)
500     200     3,62225400          3,91718000 (+8.1%)
2500    40      6,45658000         12,60683700 (+95.3%)
8000    12     17,48091900         44,15384000 (+152.6%)

xvtstdcdp:
rept    loop    patch10             patch12
8       12500    1,56435900         1,24554800 (-20.4%)
25      4000     1,53789500         1,14177800 (-25.8%)
100     1000     1,67964600         1,54280000 (-8.1%)
500     200      2,46777100         3,96816000 (+60.8%)
2500    40       5,21938900        12,79937800 (+145.2%)
8000    12      15,97600500        45,44233000 (+184.4%)

Overall, these instructions are the hardest ones for which to measure
performance, as the cost of the helper implementation is affected by the
immediate. So, for example, in a worst-case scenario (high REPT,
LOOP = 1, immediate 127) the gvec implementation took 13x longer, and in
a best-case scenario (low REPT, high LOOP, only 1 bit set in the
immediate) the execution took 21.8% of the time with gvec (-78.2%).
The figures above are the sum over every possible immediate.

Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
---
 target/ppc/translate/vsx-impl.c.inc | 73 ++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 2 deletions(-)

Comments

Richard Henderson Sept. 24, 2022, 8:16 p.m. UTC | #1
On 9/23/22 21:47, Lucas Mateus Castro(alqotel) wrote:
> +static void do_xvtstdc_vec(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t imm)
> +{
> +    TCGv_vec match = tcg_const_ones_vec_matching(t);
> +    TCGv_vec temp;
> +    TCGv_vec mask;
> +    uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
> +    uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
> +    uint64_t frc_msk = ~(exp_msk | sgn_msk);
> +    mask = tcg_constant_vec_matching(t, vece, 0);
> +    tcg_gen_mov_vec(t, mask);
> +    if (imm & (0x3 << 0)) {
> +        /* test if Denormal */
> +        temp = tcg_temp_new_vec_matching(t);
> +        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
> +        tcg_gen_and_vec(vece, t, b, mask);
> +        mask = tcg_constant_vec_matching(t, vece, frc_msk);
> +        tcg_gen_cmp_vec(TCG_COND_LE, vece, temp, t, mask);
> +        mask = tcg_constant_vec_matching(t, vece, 0);
> +        tcg_gen_cmpsel_vec(TCG_COND_NE, vece, temp, t, mask, temp, mask);
> +
> +        tcg_gen_mov_vec(t, mask);
> +        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
> +        if (imm & (0x1)) {
> +            /* test if negative */
> +            tcg_gen_cmpsel_vec(TCG_COND_GTU, vece, t, b, mask, temp, t);
> +        }
> +        if (imm & (0x2)) {
> +            /* test if positive */
> +            tcg_gen_cmpsel_vec(TCG_COND_LTU, vece, t, b, mask, temp, t);
> +        }
> +        tcg_temp_free_vec(temp);
> +    }
> +    if (imm & (1 << 2)) {
> +        /* test if -0 */
> +        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
> +        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
> +    }
> +    if (imm & (1 << 3)) {
> +        /* test if +0 */
> +        mask = tcg_constant_vec_matching(t, vece, 0);
> +        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
> +    }
> +    if (imm & (1 << 4)) {
> +        /* test if -Inf */
> +        mask = tcg_constant_vec_matching(t, vece, exp_msk | sgn_msk);
> +        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
> +    }
> +    if (imm & (1 << 5)) {
> +        /* test if +Inf */
> +        mask = tcg_constant_vec_matching(t, vece, exp_msk);
> +        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
> +    }
> +    if (imm & (1 << 6)) {
> +        /* test if NaN */
> +        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
> +        tcg_gen_and_vec(vece, b, b, mask);
> +        mask = tcg_constant_vec_matching(t, vece, exp_msk);
> +        tcg_gen_cmpsel_vec(TCG_COND_GT, vece, t, b, mask, match, t);
> +    }
> +    tcg_temp_free_vec(match);
> +}

While each case is fairly clever, I don't think that stringing them together like this is 
a good idea.  I think you should only handle the easy cases inline, and defer random (and 
probably rarely used) bit combinations to the helper function.

For instance,

static void gen_is_pos_inf(unsigned vece, TCGv_vec t, TCGv_vec b)
{
     tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
         tcg_constant_vec_matching(t, vece, exp_mask));
}

static void gen_is_any_inf(unsigned vece, TCGv_vec t, TCGv_vec b)
{
     tcg_gen_and_vec(vece, t, b,
         tcg_constant_vec_matching(t, vece, ~sgn_mask));
     tcg_gen_cmp_vec(TCG_COND_EQ, vece, t, b,
         tcg_constant_vec_matching(t, vece, exp_mask));
}

static bool do_xvtstdc(...)
{
     switch (a->imm) {
     case (1 << 4): /* -Inf */
         tcg_gen_gvec_2(..., &op_is_neg_inf);
         break;
     case (1 << 5): /* +Inf */
         tcg_gen_gvec_2(..., &op_is_pos_inf);
         break;
     case (1 << 4) | (1 << 5): /* -Inf | +Inf */
         tcg_gen_gvec_2(..., &op_is_any_inf);
         break;
     ...
     default:
         tcg_gen_gvec_2_ool(..., 16, 16, a->imm, gen_helper_XVTSTDCXX);
     }
}

Or something of that nature.

I'll also note that you don't need CMPSEL -- all cases are mutually exclusive, so OR works 
just as well.

r~
diff mbox series

Patch

diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index c3c179723b..dc95e8fdf4 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -1121,16 +1121,85 @@  GEN_VSX_HELPER_X2(xscvhpdp, 0x16, 0x15, 0x10, PPC2_ISA300)
 GEN_VSX_HELPER_R2(xscvsdqp, 0x04, 0x1A, 0x0A, PPC2_ISA300)
 GEN_VSX_HELPER_X2(xscvspdp, 0x12, 0x14, 0, PPC2_VSX)
 
+static void do_xvtstdc_vec(unsigned vece, TCGv_vec t, TCGv_vec b, int64_t imm)
+{
+    TCGv_vec match = tcg_const_ones_vec_matching(t);
+    TCGv_vec temp;
+    TCGv_vec mask;
+    uint64_t exp_msk = (vece == MO_32) ? (uint32_t)EXP_MASK_SP : EXP_MASK_DP;
+    uint64_t sgn_msk = (vece == MO_32) ? (uint32_t)SGN_MASK_SP : SGN_MASK_DP;
+    uint64_t frc_msk = ~(exp_msk | sgn_msk);
+    mask = tcg_constant_vec_matching(t, vece, 0);
+    tcg_gen_mov_vec(t, mask);
+    if (imm & (0x3 << 0)) {
+        /* test if Denormal */
+        temp = tcg_temp_new_vec_matching(t);
+        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+        tcg_gen_and_vec(vece, t, b, mask);
+        mask = tcg_constant_vec_matching(t, vece, frc_msk);
+        tcg_gen_cmp_vec(TCG_COND_LE, vece, temp, t, mask);
+        mask = tcg_constant_vec_matching(t, vece, 0);
+        tcg_gen_cmpsel_vec(TCG_COND_NE, vece, temp, t, mask, temp, mask);
+
+        tcg_gen_mov_vec(t, mask);
+        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+        if (imm & (0x1)) {
+            /* test if negative */
+            tcg_gen_cmpsel_vec(TCG_COND_GTU, vece, t, b, mask, temp, t);
+        }
+        if (imm & (0x2)) {
+            /* test if positive */
+            tcg_gen_cmpsel_vec(TCG_COND_LTU, vece, t, b, mask, temp, t);
+        }
+        tcg_temp_free_vec(temp);
+    }
+    if (imm & (1 << 2)) {
+        /* test if -0 */
+        mask = tcg_constant_vec_matching(t, vece, sgn_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 3)) {
+        /* test if +0 */
+        mask = tcg_constant_vec_matching(t, vece, 0);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 4)) {
+        /* test if -Inf */
+        mask = tcg_constant_vec_matching(t, vece, exp_msk | sgn_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 5)) {
+        /* test if +Inf */
+        mask = tcg_constant_vec_matching(t, vece, exp_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_EQ, vece, t, b, mask, match, t);
+    }
+    if (imm & (1 << 6)) {
+        /* test if NaN */
+        mask = tcg_constant_vec_matching(t, vece, ~sgn_msk);
+        tcg_gen_and_vec(vece, b, b, mask);
+        mask = tcg_constant_vec_matching(t, vece, exp_msk);
+        tcg_gen_cmpsel_vec(TCG_COND_GT, vece, t, b, mask, match, t);
+    }
+    tcg_temp_free_vec(match);
+}
+
 static bool do_xvtstdc(DisasContext *ctx, arg_XX2_uim *a, unsigned vece)
 {
+    static const TCGOpcode vecop_list[] = {
+        INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+    };
     static const GVecGen2i op[] = {
         {
             .fnoi = gen_helper_XVTSTDCSP,
-            .vece = MO_32
+            .fniv = do_xvtstdc_vec,
+            .vece = MO_32,
+            .opt_opc = vecop_list
         },
         {
             .fnoi = gen_helper_XVTSTDCDP,
-            .vece = MO_64
+            .fniv = do_xvtstdc_vec,
+            .vece = MO_64,
+            .opt_opc = vecop_list
         },
     };