Message ID | 20221010191356.83659-6-lucas.araujo@eldorado.org.br (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | VMX/VSX instructions with gvec | expand |
On 10/10/22 12:13, Lucas Mateus Castro(alqotel) wrote:
> From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>
>
> Moved VPRTYBW and VPRTYBD to use gvec, and moved both of them and
> VPRTYBQ to decodetree. VPRTYBW and VPRTYBD now also use .fni4 and
> .fni8, respectively.
>
> vprtybw:
> rept loop master patch
> 8 12500 0,00991200 0,00626300 (-36.8%)
> 25 4000 0,01040600 0,00550600 (-47.1%)
> 100 1000 0,01084500 0,00601100 (-44.6%)
> 500 200 0,01490600 0,01394100 (-6.5%)
> 2500 40 0,03285100 0,05143000 (+56.6%)
> 8000 12 0,08971500 0,14662500 (+63.4%)
>
> vprtybd:
> rept loop master patch
> 8 12500 0,00665800 0,00652800 (-2.0%)
> 25 4000 0,00589300 0,00670400 (+13.8%)
> 100 1000 0,00646800 0,00743900 (+15.0%)
> 500 200 0,01065800 0,01586400 (+48.8%)
> 2500 40 0,03497000 0,07180100 (+105.3%)
> 8000 12 0,09242200 0,21566600 (+133.3%)
>
> vprtybq:
> rept loop master patch
> 8 12500 0,00656200 0,00665800 (+1.5%)
> 25 4000 0,00620500 0,00644900 (+3.9%)
> 100 1000 0,00707500 0,00764900 (+8.1%)
> 500 200 0,01203500 0,01349500 (+12.1%)
> 2500 40 0,03505700 0,04123100 (+17.6%)
> 8000 12 0,09590600 0,11586700 (+20.8%)
>
> I wasn't expecting such a performance loss in both VPRTYBD and VPRTYBQ,
> I'm not sure if it's worth moving those instructions. Comparing the
> assembly of the helper with the TCGop they are pretty similar, so
> I'm not sure why vprtybd took so much more time.
> > Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br> > --- > target/ppc/helper.h | 4 +- > target/ppc/insn32.decode | 4 ++ > target/ppc/int_helper.c | 25 +-------- > target/ppc/translate/vmx-impl.c.inc | 80 +++++++++++++++++++++++++++-- > target/ppc/translate/vmx-ops.c.inc | 3 -- > 5 files changed, 83 insertions(+), 33 deletions(-) > > diff --git a/target/ppc/helper.h b/target/ppc/helper.h > index b2e910b089..a06193bc67 100644 > --- a/target/ppc/helper.h > +++ b/target/ppc/helper.h > @@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) > DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) > DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) > DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) > -DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) > -DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) > -DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) > +DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32) > DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) > DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) > DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) > diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode > index 2658dd3395..aa4968e6b9 100644 > --- a/target/ppc/insn32.decode > +++ b/target/ppc/insn32.decode > @@ -529,6 +529,10 @@ VCTZDM 000100 ..... ..... ..... 11111000100 @VX > VPDEPD 000100 ..... ..... ..... 10111001101 @VX > VPEXTD 000100 ..... ..... ..... 10110001101 @VX > > +VPRTYBD 000100 ..... 01001 ..... 11000000010 @VX_tb > +VPRTYBQ 000100 ..... 01010 ..... 11000000010 @VX_tb > +VPRTYBW 000100 ..... 01000 ..... 11000000010 @VX_tb > + > ## Vector Permute and Formatting Instruction > > VEXTDUBVLX 000100 ..... ..... ..... ..... 
011000 @VA > diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c > index c7fd0d1faa..c6ce4665fa 100644 > --- a/target/ppc/int_helper.c > +++ b/target/ppc/int_helper.c > @@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env) > env->vscr_sat.u32[0] = 1; > } > > -/* vprtybw */ > -void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) > -{ > - int i; > - for (i = 0; i < ARRAY_SIZE(r->u32); i++) { > - uint64_t res = b->u32[i] ^ (b->u32[i] >> 16); > - res ^= res >> 8; > - r->u32[i] = res & 1; > - } > -} > - > -/* vprtybd */ > -void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b) > -{ > - int i; > - for (i = 0; i < ARRAY_SIZE(r->u64); i++) { > - uint64_t res = b->u64[i] ^ (b->u64[i] >> 32); > - res ^= res >> 16; > - res ^= res >> 8; > - r->u64[i] = res & 1; > - } > -} > - > /* vprtybq */ > -void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) > +void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) > { > uint64_t res = b->u64[0] ^ b->u64[1]; > res ^= res >> 32; > diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc > index b9a9e83ab3..23601942bc 100644 > --- a/target/ppc/translate/vmx-impl.c.inc > +++ b/target/ppc/translate/vmx-impl.c.inc > @@ -1659,9 +1659,83 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); > GEN_VXFORM_NOA_ENV(vrfin, 5, 8); > GEN_VXFORM_NOA_ENV(vrfip, 5, 10); > GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); > -GEN_VXFORM_NOA(vprtybw, 1, 24); > -GEN_VXFORM_NOA(vprtybd, 1, 24); > -GEN_VXFORM_NOA(vprtybq, 1, 24); > + > +static void gen_vprtyb_vec(unsigned vece, TCGv_vec t, TCGv_vec b) > +{ > + int i; > + TCGv_vec tmp = tcg_temp_new_vec_matching(b); > + /* MO_32 is 2, so 2 iteractions for MO_32 and 3 for MO_64 */ > + for (i = 0; i < vece; i++) { > + tcg_gen_shri_vec(vece, tmp, b, (4 << (vece - i))); > + tcg_gen_xor_vec(vece, b, tmp, b); > + } > + tcg_gen_and_vec(vece, t, b, tcg_constant_vec_matching(t, vece, 1)); > + tcg_temp_free_vec(tmp); > +} > + > +/* vprtybw */ > +static void gen_vprtyb_i32(TCGv_i32 t, TCGv_i32 b) > +{ > + 
TCGv_i32 tmp = tcg_temp_new_i32();
> + tcg_gen_shri_i32(tmp, b, 16);
> + tcg_gen_xor_i32(b, tmp, b);
> + tcg_gen_shri_i32(tmp, b, 8);
> + tcg_gen_xor_i32(b, tmp, b);
> + tcg_gen_and_i32(t, b, tcg_constant_i32(1));
> + tcg_temp_free_i32(tmp);

tcg_gen_ctpop_i32(t, b);
tcg_gen_andi_i32(t, t, 1);

> +}
> +
> +/* vprtybd */
> +static void gen_vprtyb_i64(TCGv_i64 t, TCGv_i64 b)
> +{
> + TCGv_i64 tmp = tcg_temp_new_i64();
> + tcg_gen_shri_i64(tmp, b, 32);
> + tcg_gen_xor_i64(b, tmp, b);
> + tcg_gen_shri_i64(tmp, b, 16);
> + tcg_gen_xor_i64(b, tmp, b);
> + tcg_gen_shri_i64(tmp, b, 8);
> + tcg_gen_xor_i64(b, tmp, b);
> + tcg_gen_and_i64(t, b, tcg_constant_i64(1));
> + tcg_temp_free_i64(tmp);

Similarly.

Otherwise,
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~
diff --git a/target/ppc/helper.h b/target/ppc/helper.h index b2e910b089..a06193bc67 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -193,9 +193,7 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr) DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr) -DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr) -DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr) +DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32) diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode index 2658dd3395..aa4968e6b9 100644 --- a/target/ppc/insn32.decode +++ b/target/ppc/insn32.decode @@ -529,6 +529,10 @@ VCTZDM 000100 ..... ..... ..... 11111000100 @VX VPDEPD 000100 ..... ..... ..... 10111001101 @VX VPEXTD 000100 ..... ..... ..... 10110001101 @VX +VPRTYBD 000100 ..... 01001 ..... 11000000010 @VX_tb +VPRTYBQ 000100 ..... 01010 ..... 11000000010 @VX_tb +VPRTYBW 000100 ..... 01000 ..... 11000000010 @VX_tb + ## Vector Permute and Formatting Instruction VEXTDUBVLX 000100 ..... ..... ..... ..... 
011000 @VA diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index c7fd0d1faa..c6ce4665fa 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -492,31 +492,8 @@ static inline void set_vscr_sat(CPUPPCState *env) env->vscr_sat.u32[0] = 1; } -/* vprtybw */ -void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b) -{ - int i; - for (i = 0; i < ARRAY_SIZE(r->u32); i++) { - uint64_t res = b->u32[i] ^ (b->u32[i] >> 16); - res ^= res >> 8; - r->u32[i] = res & 1; - } -} - -/* vprtybd */ -void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b) -{ - int i; - for (i = 0; i < ARRAY_SIZE(r->u64); i++) { - uint64_t res = b->u64[i] ^ (b->u64[i] >> 32); - res ^= res >> 16; - res ^= res >> 8; - r->u64[i] = res & 1; - } -} - /* vprtybq */ -void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) +void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v) { uint64_t res = b->u64[0] ^ b->u64[1]; res ^= res >> 32; diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index b9a9e83ab3..23601942bc 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -1659,9 +1659,83 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); GEN_VXFORM_NOA_ENV(vrfin, 5, 8); GEN_VXFORM_NOA_ENV(vrfip, 5, 10); GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); -GEN_VXFORM_NOA(vprtybw, 1, 24); -GEN_VXFORM_NOA(vprtybd, 1, 24); -GEN_VXFORM_NOA(vprtybq, 1, 24); + +static void gen_vprtyb_vec(unsigned vece, TCGv_vec t, TCGv_vec b) +{ + int i; + TCGv_vec tmp = tcg_temp_new_vec_matching(b); + /* MO_32 is 2, so 2 iteractions for MO_32 and 3 for MO_64 */ + for (i = 0; i < vece; i++) { + tcg_gen_shri_vec(vece, tmp, b, (4 << (vece - i))); + tcg_gen_xor_vec(vece, b, tmp, b); + } + tcg_gen_and_vec(vece, t, b, tcg_constant_vec_matching(t, vece, 1)); + tcg_temp_free_vec(tmp); +} + +/* vprtybw */ +static void gen_vprtyb_i32(TCGv_i32 t, TCGv_i32 b) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_shri_i32(tmp, b, 16); + tcg_gen_xor_i32(b, tmp, b); + tcg_gen_shri_i32(tmp, b, 8); + 
tcg_gen_xor_i32(b, tmp, b); + tcg_gen_and_i32(t, b, tcg_constant_i32(1)); + tcg_temp_free_i32(tmp); +} + +/* vprtybd */ +static void gen_vprtyb_i64(TCGv_i64 t, TCGv_i64 b) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + tcg_gen_shri_i64(tmp, b, 32); + tcg_gen_xor_i64(b, tmp, b); + tcg_gen_shri_i64(tmp, b, 16); + tcg_gen_xor_i64(b, tmp, b); + tcg_gen_shri_i64(tmp, b, 8); + tcg_gen_xor_i64(b, tmp, b); + tcg_gen_and_i64(t, b, tcg_constant_i64(1)); + tcg_temp_free_i64(tmp); +} + +static bool do_vx_vprtyb(DisasContext *ctx, arg_VX_tb *a, unsigned vece) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, 0 + }; + + static const GVecGen2 op[] = { + { + .fniv = gen_vprtyb_vec, + .fni4 = gen_vprtyb_i32, + .opt_opc = vecop_list, + .vece = MO_32 + }, + { + .fniv = gen_vprtyb_vec, + .fni8 = gen_vprtyb_i64, + .opt_opc = vecop_list, + .vece = MO_64 + }, + { + .fno = gen_helper_VPRTYBQ, + .vece = MO_128 + }, + }; + + REQUIRE_INSNS_FLAGS2(ctx, ISA300); + REQUIRE_VECTOR(ctx); + + tcg_gen_gvec_2(avr_full_offset(a->vrt), avr_full_offset(a->vrb), + 16, 16, &op[vece - MO_32]); + + return true; +} + +TRANS(VPRTYBW, do_vx_vprtyb, MO_32) +TRANS(VPRTYBD, do_vx_vprtyb, MO_64) +TRANS(VPRTYBQ, do_vx_vprtyb, MO_128) static void gen_vsplt(DisasContext *ctx, int vece) { diff --git a/target/ppc/translate/vmx-ops.c.inc b/target/ppc/translate/vmx-ops.c.inc index 27908533dd..46a620a232 100644 --- a/target/ppc/translate/vmx-ops.c.inc +++ b/target/ppc/translate/vmx-ops.c.inc @@ -106,9 +106,6 @@ GEN_VXFORM_300(vsrv, 2, 28), GEN_VXFORM_300(vslv, 2, 29), GEN_VXFORM(vslo, 6, 16), GEN_VXFORM(vsro, 6, 17), -GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300), -GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300), -GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300), GEN_VXFORM(xpnd04_1, 0, 22), GEN_VXFORM_300(bcdsr, 0, 23),