Message ID | 1475040687-27523-7-git-send-email-nikunj@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 09/27/2016 10:31 PM, Nikunj A Dadhania wrote:
> +DEF_HELPER_1(bswap16x4, i64, i64)

DEF_HELPER_FLAGS_1(bswap16x4, TCG_CALL_NO_RWG_SE, i64, i64)

> +    uint64_t m = 0x00ff00ff00ff00ffull;
> +    return ((x & m) << 8) | ((x >> 8) & m);

... although I suppose this is only 5 instructions, and could reasonably be
done inline too. Especially if you shared the one 64-bit constant across the
two bswaps.

> +    if (ctx->le_mode) {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
> +        gen_helper_bswap16x4(xth, xth);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
> +        gen_helper_bswap16x4(xtl, xtl);
> +    } else {
> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
> +        tcg_gen_addi_tl(EA, EA, 8);
> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
> +    }

Better to not duplicate this.

    tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
    tcg_gen_addi_tl(EA, EA, 8);
    tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
    if (ctx->le_mode) {
        gen_helper_bswap16x4(xth, xth);
        gen_helper_bswap16x4(xtl, xtl);
    }

r~
Richard Henderson <rth@twiddle.net> writes:

> On 09/27/2016 10:31 PM, Nikunj A Dadhania wrote:
>> +DEF_HELPER_1(bswap16x4, i64, i64)
>
> DEF_HELPER_FLAGS_1(bswap16x4, TCG_CALL_NO_RWG_SE, i64, i64)
>
>> +    uint64_t m = 0x00ff00ff00ff00ffull;
>> +    return ((x & m) << 8) | ((x >> 8) & m);
>
> ... although I suppose this is only 5 instructions, and could reasonably be
> done inline too. Especially if you shared the one 64-bit constant across the
> two bswaps.

Something like this:

static void gen_bswap16x4(TCGv_i64 val)
{
    TCGv_i64 mask = tcg_const_i64(0x00FF00FF00FF00FF);
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();

    /* val = ((val & mask) << 8) | ((val >> 8) & mask) */
    tcg_gen_and_i64(t0, val, mask);
    tcg_gen_shri_i64(t0, t0, 8);
    tcg_gen_shli_i64(t1, val, 8);
    tcg_gen_and_i64(t1, t1, mask);
    tcg_gen_or_i64(val, t0, t1);

    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(mask);
}

>
>
>> +    if (ctx->le_mode) {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>> +        gen_helper_bswap16x4(xth, xth);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>> +        gen_helper_bswap16x4(xtl, xtl);
>> +    } else {
>> +        tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>> +        tcg_gen_addi_tl(EA, EA, 8);
>> +        tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>> +    }
>
> Better to not duplicate this.
>
>     tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ);
>     tcg_gen_addi_tl(EA, EA, 8);
>     tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ);
>     if (ctx->le_mode) {
>         gen_helper_bswap16x4(xth, xth);
>         gen_helper_bswap16x4(xtl, xtl);
>     }

Sure, much better, thanks.

Regards
Nikunj
On 09/28/2016 10:11 AM, Nikunj A Dadhania wrote:
> Richard Henderson <rth@twiddle.net> writes:
>
>> On 09/27/2016 10:31 PM, Nikunj A Dadhania wrote:
>>> +DEF_HELPER_1(bswap16x4, i64, i64)
>>
>> DEF_HELPER_FLAGS_1(bswap16x4, TCG_CALL_NO_RWG_SE, i64, i64)
>>
>>> +    uint64_t m = 0x00ff00ff00ff00ffull;
>>> +    return ((x & m) << 8) | ((x >> 8) & m);
>>
>> ... although I suppose this is only 5 instructions, and could reasonably be
>> done inline too. Especially if you shared the one 64-bit constant across the
>> two bswaps.
>
> Something like this:
>
> static void gen_bswap16x4(TCGv_i64 val)
> {
>     TCGv_i64 mask = tcg_const_i64(0x00FF00FF00FF00FF);
>     TCGv_i64 t0 = tcg_temp_new_i64();
>     TCGv_i64 t1 = tcg_temp_new_i64();
>
>     /* val = ((val & mask) << 8) | ((val >> 8) & mask) */
>     tcg_gen_and_i64(t0, val, mask);
>     tcg_gen_shri_i64(t0, t0, 8);
>     tcg_gen_shli_i64(t1, val, 8);
>     tcg_gen_and_i64(t1, t1, mask);
>     tcg_gen_or_i64(val, t0, t1);
>
>     tcg_temp_free_i64(t0);
>     tcg_temp_free_i64(t1);
>     tcg_temp_free_i64(mask);
> }

Like that, except that since you always perform this twice, you should share
the expensive constant load. Recall also that you need temporaries for the
store, so

static void gen_bswap16x8(TCGv_i64 outh, TCGv_i64 outl,
                          TCGv_i64 inh, TCGv_i64 inl)

r~
diff --git a/target-ppc/helper.h b/target-ppc/helper.h index a1c2962..9689000 100644 --- a/target-ppc/helper.h +++ b/target-ppc/helper.h @@ -298,6 +298,7 @@ DEF_HELPER_2(mtvscr, void, env, avr) DEF_HELPER_3(lvebx, void, env, avr, tl) DEF_HELPER_3(lvehx, void, env, avr, tl) DEF_HELPER_3(lvewx, void, env, avr, tl) +DEF_HELPER_1(bswap16x4, i64, i64) DEF_HELPER_3(stvebx, void, env, avr, tl) DEF_HELPER_3(stvehx, void, env, avr, tl) DEF_HELPER_3(stvewx, void, env, avr, tl) diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c index 6548715..29c7b5b 100644 --- a/target-ppc/mem_helper.c +++ b/target-ppc/mem_helper.c @@ -285,6 +285,12 @@ STVE(stvewx, cpu_stl_data_ra, bswap32, u32) #undef I #undef LVE +uint64_t helper_bswap16x4(uint64_t x) +{ + uint64_t m = 0x00ff00ff00ff00ffull; + return ((x & m) << 8) | ((x >> 8) & m); +} + #undef HI_IDX #undef LO_IDX diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c index 9fdab5f..51f3dcb 100644 --- a/target-ppc/translate/vsx-impl.inc.c +++ b/target-ppc/translate/vsx-impl.inc.c @@ -107,6 +107,34 @@ static void gen_lxvw4x(DisasContext *ctx) tcg_temp_free(EA); } +static void gen_lxvh8x(DisasContext *ctx) +{ + TCGv EA; + TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); + TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); + + if (unlikely(!ctx->vsx_enabled)) { + gen_exception(ctx, POWERPC_EXCP_VSXU); + return; + } + gen_set_access_type(ctx, ACCESS_INT); + EA = tcg_temp_new(); + gen_addr_reg_index(ctx, EA); + + if (ctx->le_mode) { + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ); + gen_helper_bswap16x4(xth, xth); + tcg_gen_addi_tl(EA, EA, 8); + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ); + gen_helper_bswap16x4(xtl, xtl); + } else { + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_BEQ); + tcg_gen_addi_tl(EA, EA, 8); + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_BEQ); + } + tcg_temp_free(EA); +} + #define VSX_STORE_SCALAR(name, operation) \ static void gen_##name(DisasContext *ctx) \ { \ diff --git 
a/target-ppc/translate/vsx-ops.inc.c b/target-ppc/translate/vsx-ops.inc.c index d5f5b87..c52e6ff 100644 --- a/target-ppc/translate/vsx-ops.inc.c +++ b/target-ppc/translate/vsx-ops.inc.c @@ -7,6 +7,7 @@ GEN_HANDLER_E(lxsspx, 0x1F, 0x0C, 0x10, 0, PPC_NONE, PPC2_VSX207), GEN_HANDLER_E(lxvd2x, 0x1F, 0x0C, 0x1A, 0, PPC_NONE, PPC2_VSX), GEN_HANDLER_E(lxvdsx, 0x1F, 0x0C, 0x0A, 0, PPC_NONE, PPC2_VSX), GEN_HANDLER_E(lxvw4x, 0x1F, 0x0C, 0x18, 0, PPC_NONE, PPC2_VSX), +GEN_HANDLER_E(lxvh8x, 0x1F, 0x0C, 0x19, 0, PPC_NONE, PPC2_ISA300), GEN_HANDLER_E(stxsdx, 0x1F, 0xC, 0x16, 0, PPC_NONE, PPC2_VSX), GEN_HANDLER_E(stxsibx, 0x1F, 0xD, 0x1C, 0, PPC_NONE, PPC2_ISA300),
lxvh8x: Load VSX Vector Halfword*8

Big-Endian Storage
+-------+-------+-------+-------+-------+-------+-------+-------+
| 00 01 | 10 11 | 20 21 | 30 31 | 40 41 | 50 51 | 60 61 | 70 71 |
+-------+-------+-------+-------+-------+-------+-------+-------+

Little-Endian Storage
+-------+-------+-------+-------+-------+-------+-------+-------+
| 01 00 | 11 10 | 21 20 | 31 30 | 41 40 | 51 50 | 61 60 | 71 70 |
+-------+-------+-------+-------+-------+-------+-------+-------+

Vector load results in:
+-------+-------+-------+-------+-------+-------+-------+-------+
| 00 01 | 10 11 | 20 21 | 30 31 | 40 41 | 50 51 | 60 61 | 70 71 |
+-------+-------+-------+-------+-------+-------+-------+-------+

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target-ppc/helper.h                 |  1 +
 target-ppc/mem_helper.c             |  6 ++++++
 target-ppc/translate/vsx-impl.inc.c | 28 ++++++++++++++++++++++++++++
 target-ppc/translate/vsx-ops.inc.c  |  1 +
 4 files changed, 36 insertions(+)