
[v4,6/9] target-ppc: add lxvh8x instruction

Message ID 1475040687-27523-7-git-send-email-nikunj@linux.vnet.ibm.com
State New, archived

Commit Message

Nikunj A. Dadhania Sept. 28, 2016, 5:31 a.m. UTC
lxvh8x:  Load VSX Vector Halfword*8

Big-Endian Storage
+-------+-------+-------+-------+-------+-------+-------+-------+
| 00 01 | 10 11 | 20 21 | 30 31 | 40 41 | 50 51 | 60 61 | 70 71 |
+-------+-------+-------+-------+-------+-------+-------+-------+

Little-Endian Storage
+-------+-------+-------+-------+-------+-------+-------+-------+
| 01 00 | 11 10 | 21 20 | 31 30 | 41 40 | 51 50 | 61 60 | 71 70 |
+-------+-------+-------+-------+-------+-------+-------+-------+

In both modes, the vector load results in:
+-------+-------+-------+-------+-------+-------+-------+-------+
| 00 01 | 10 11 | 20 21 | 30 31 | 40 41 | 50 51 | 60 61 | 70 71 |
+-------+-------+-------+-------+-------+-------+-------+-------+

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target-ppc/helper.h                 |  1 +
 target-ppc/mem_helper.c             |  6 ++++++
 target-ppc/translate/vsx-impl.inc.c | 28 ++++++++++++++++++++++++++++
 target-ppc/translate/vsx-ops.inc.c  |  1 +
 4 files changed, 36 insertions(+)
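
To see the little-endian handling concretely: in LE mode the patch does a
big-endian 8-byte load and then swaps the two bytes of each halfword. A
minimal standalone sketch of that swap, mirroring the helper_bswap16x4()
added by this patch (the main() harness is illustrative only, not part of
the patch):

#include <inttypes.h>
#include <stdio.h>

/* Swap the two bytes of each of the four halfwords in a 64-bit value. */
static uint64_t bswap16x4(uint64_t x)
{
    uint64_t m = 0x00ff00ff00ff00ffull;
    return ((x & m) << 8) | ((x >> 8) & m);
}

int main(void)
{
    /* Big-endian 8-byte load of the first doubleword of the
       little-endian storage image shown above: 01 00 11 10 21 20 31 30 */
    uint64_t x = 0x0100111021203130ull;
    printf("%016" PRIx64 "\n", bswap16x4(x));   /* 0001101120213031 */
    return 0;
}

The swap restores halfword significance, giving the same register image as
the big-endian case.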

Comments

Richard Henderson Sept. 28, 2016, 4:12 p.m. UTC | #1
On 09/27/2016 10:31 PM, Nikunj A Dadhania wrote:
> +DEF_HELPER_1(bswap16x4, i64, i64)

DEF_HELPER_FLAGS_1(bswap16x4, TCG_CALL_NO_RWG_SE, i64, i64)

> +    uint64_t m = 0x00ff00ff00ff00ffull;
> +    return ((x & m) << 8) | ((x >> 8) & m);

... although I suppose this is only 5 instructions, and could reasonably be
done inline too.  Especially if you shared the one 64-bit constant across the
two bswaps.


> +    if (ctx->le_mode) {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
> +        gen_helper_bswap16x4(xth, xth);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
> +        gen_helper_bswap16x4(xtl, xtl);
> +    } else {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
> +    }

Better to not duplicate this.

  tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
  tcg_gen_addi_tl(EA, EA, 8);
  tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
  if (ctx->le_mode) {
    gen_helper_bswap16x4(xth, xth);
    gen_helper_bswap16x4(xtl, xtl);
  }


r~
Nikunj A. Dadhania Sept. 28, 2016, 5:11 p.m. UTC | #2
Richard Henderson <rth@twiddle.net> writes:

> On 09/27/2016 10:31 PM, Nikunj A Dadhania wrote:
>> +DEF_HELPER_1(bswap16x4, i64, i64)
>
> DEF_HELPER_FLAGS_1(bswap16x4, TCG_CALL_NO_RWG_SE, i64, i64)
>
>> +    uint64_t m = 0x00ff00ff00ff00ffull;
>> +    return ((x & m) << 8) | ((x >> 8) & m);
>
> ... although I suppose this is only 5 instructions, and could reasonably be
> done inline too.  Especially if you shared the one 64-bit constant across the
> two bswaps.

Something like this:

static void gen_bswap16x4(TCGv_i64 val)
{
    TCGv_i64 mask = tcg_const_i64(0x00FF00FF00FF00FF);
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();

    /* val = ((val & mask) << 8) | ((val >> 8) & mask) */
    tcg_gen_and_i64(t0, val, mask);
    tcg_gen_shli_i64(t0, t0, 8);
    tcg_gen_shri_i64(t1, val, 8);
    tcg_gen_and_i64(t1, t1, mask);
    tcg_gen_or_i64(val, t0, t1);

    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(mask);
}

>
>
>> +    if (ctx->le_mode) {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>> +        gen_helper_bswap16x4(xth, xth);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>> +        gen_helper_bswap16x4(xtl, xtl);
>> +    } else {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>> +    }
>
> Better to not duplicate this.
>
>   tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>   tcg_gen_addi_tl(EA, EA, 8);
>   tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>   if (ctx->le_mode) {
>     gen_helper_bswap16x4(xth, xth);
>     gen_helper_bswap16x4(xtl, xtl);
>   }

Sure, much better, thanks.

Regards
Nikunj
Richard Henderson Sept. 28, 2016, 5:22 p.m. UTC | #3
On 09/28/2016 10:11 AM, Nikunj A Dadhania wrote:
> Richard Henderson <rth@twiddle.net> writes:
> 
>> On 09/27/2016 10:31 PM, Nikunj A Dadhania wrote:
>>> +DEF_HELPER_1(bswap16x4, i64, i64)
>>
>> DEF_HELPER_FLAGS_1(bswap16x4, TCG_CALL_NO_RWG_SE, i64, i64)
>>
>>> +    uint64_t m = 0x00ff00ff00ff00ffull;
>>> +    return ((x & m) << 8) | ((x >> 8) & m);
>>
>> ... although I suppose this is only 5 instructions, and could reasonably be
>> done inline too.  Especially if you shared the one 64-bit constant across the
>> two bswaps.
> 
> Something like this:
> 
> static void gen_bswap16x4(TCGv_i64 val)
> {
>     TCGv_i64 mask = tcg_const_i64(0x00FF00FF00FF00FF);
>     TCGv_i64 t0 = tcg_temp_new_i64();
>     TCGv_i64 t1 = tcg_temp_new_i64();
> 
>     /* val = ((val & mask) << 8) | ((val >> 8) & mask) */
>     tcg_gen_and_i64(t0, val, mask);
>     tcg_gen_shli_i64(t0, t0, 8);
>     tcg_gen_shri_i64(t1, val, 8);
>     tcg_gen_and_i64(t1, t1, mask);
>     tcg_gen_or_i64(val, t0, t1);
> 
>     tcg_temp_free_i64(t0);
>     tcg_temp_free_i64(t1);
>     tcg_temp_free_i64(mask);
> }

Like that, except that since you always perform this twice, you should share
the expensive constant load.  Recall also that you need temporaries for the
store, so

static void gen_bswap16x8(TCGv_i64 outh, TCGv_i64 outl,
                          TCGv_i64 inh, TCGv_i64 inl)


r~
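
For reference, a possible body for the suggested gen_bswap16x8(), sharing
the single mask constant across both doublewords (a sketch assembled from
the gen_bswap16x4() draft above, not necessarily the version finally
committed):

static void gen_bswap16x8(TCGv_i64 outh, TCGv_i64 outl,
                          TCGv_i64 inh, TCGv_i64 inl)
{
    TCGv_i64 mask = tcg_const_i64(0x00FF00FF00FF00FFULL);
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();

    /* outh = ((inh & mask) << 8) | ((inh >> 8) & mask) */
    tcg_gen_and_i64(t0, inh, mask);
    tcg_gen_shli_i64(t0, t0, 8);
    tcg_gen_shri_i64(t1, inh, 8);
    tcg_gen_and_i64(t1, t1, mask);
    tcg_gen_or_i64(outh, t0, t1);

    /* outl = ((inl & mask) << 8) | ((inl >> 8) & mask) */
    tcg_gen_and_i64(t0, inl, mask);
    tcg_gen_shli_i64(t0, t0, 8);
    tcg_gen_shri_i64(t1, inl, 8);
    tcg_gen_and_i64(t1, t1, mask);
    tcg_gen_or_i64(outl, t0, t1);

    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(mask);
}

With separate in/out arguments, the load path can swap directly into
xth/xtl, and the companion store can swap into scratch temporaries without
clobbering the source VSR.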

Patch

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index a1c2962..9689000 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -298,6 +298,7 @@ DEF_HELPER_2(mtvscr, void, env, avr)
 DEF_HELPER_3(lvebx, void, env, avr, tl)
 DEF_HELPER_3(lvehx, void, env, avr, tl)
 DEF_HELPER_3(lvewx, void, env, avr, tl)
+DEF_HELPER_1(bswap16x4, i64, i64)
 DEF_HELPER_3(stvebx, void, env, avr, tl)
 DEF_HELPER_3(stvehx, void, env, avr, tl)
 DEF_HELPER_3(stvewx, void, env, avr, tl)
diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
index 6548715..29c7b5b 100644
--- a/target-ppc/mem_helper.c
+++ b/target-ppc/mem_helper.c
@@ -285,6 +285,12 @@ STVE(stvewx, cpu_stl_data_ra, bswap32, u32)
 #undef I
 #undef LVE
 
+uint64_t helper_bswap16x4(uint64_t x)
+{
+    uint64_t m = 0x00ff00ff00ff00ffull;
+    return ((x & m) << 8) | ((x >> 8) & m);
+}
+
 #undef HI_IDX
 #undef LO_IDX
 
diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c
index 9fdab5f..51f3dcb 100644
--- a/target-ppc/translate/vsx-impl.inc.c
+++ b/target-ppc/translate/vsx-impl.inc.c
@@ -107,6 +107,34 @@ static void gen_lxvw4x(DisasContext *ctx)
     tcg_temp_free(EA);
 }
 
+static void gen_lxvh8x(DisasContext *ctx)
+{
+    TCGv EA;
+    TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode));
+    TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode));
+
+    if (unlikely(!ctx->vsx_enabled)) {
+        gen_exception(ctx, POWERPC_EXCP_VSXU);
+        return;
+    }
+    gen_set_access_type(ctx, ACCESS_INT);
+    EA = tcg_temp_new();
+    gen_addr_reg_index(ctx, EA);
+
+    if (ctx->le_mode) {
+        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
+        gen_helper_bswap16x4(xth, xth);
+        tcg_gen_addi_tl(EA, EA, 8);
+        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
+        gen_helper_bswap16x4(xtl, xtl);
+    } else {
+        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
+        tcg_gen_addi_tl(EA, EA, 8);
+        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
+    }
+    tcg_temp_free(EA);
+}
+
 #define VSX_STORE_SCALAR(name, operation)                     \
 static void gen_##name(DisasContext *ctx)                     \
 {                                                             \
diff --git a/target-ppc/translate/vsx-ops.inc.c b/target-ppc/translate/vsx-ops.inc.c
index d5f5b87..c52e6ff 100644
--- a/target-ppc/translate/vsx-ops.inc.c
+++ b/target-ppc/translate/vsx-ops.inc.c
@@ -7,6 +7,7 @@ GEN_HANDLER_E(lxsspx, 0x1F, 0x0C, 0x10, 0, PPC_NONE, PPC2_VSX207),
 GEN_HANDLER_E(lxvd2x, 0x1F, 0x0C, 0x1A, 0, PPC_NONE, PPC2_VSX),
 GEN_HANDLER_E(lxvdsx, 0x1F, 0x0C, 0x0A, 0, PPC_NONE, PPC2_VSX),
 GEN_HANDLER_E(lxvw4x, 0x1F, 0x0C, 0x18, 0, PPC_NONE, PPC2_VSX),
+GEN_HANDLER_E(lxvh8x, 0x1F, 0x0C, 0x19, 0, PPC_NONE, PPC2_ISA300),
 
 GEN_HANDLER_E(stxsdx, 0x1F, 0xC, 0x16, 0, PPC_NONE, PPC2_VSX),
 GEN_HANDLER_E(stxsibx, 0x1F, 0xD, 0x1C, 0, PPC_NONE, PPC2_ISA300),