Message ID | 1474023111-11992-3-git-send-email-nikunj@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: > Load 8byte at a time and manipulate. > > Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> > --- > target-ppc/helper.h | 1 + > target-ppc/mem_helper.c | 5 +++++ > target-ppc/translate/vsx-impl.inc.c | 19 +++++-------------- > 3 files changed, 11 insertions(+), 14 deletions(-) > > diff --git a/target-ppc/helper.h b/target-ppc/helper.h > index 966f2ce..9f6705d 100644 > --- a/target-ppc/helper.h > +++ b/target-ppc/helper.h > @@ -297,6 +297,7 @@ DEF_HELPER_2(mtvscr, void, env, avr) > DEF_HELPER_3(lvebx, void, env, avr, tl) > DEF_HELPER_3(lvehx, void, env, avr, tl) > DEF_HELPER_3(lvewx, void, env, avr, tl) > +DEF_HELPER_1(deposit32x2, i64, i64) > DEF_HELPER_3(stvebx, void, env, avr, tl) > DEF_HELPER_3(stvehx, void, env, avr, tl) > DEF_HELPER_3(stvewx, void, env, avr, tl) > diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c > index 6548715..86e493e 100644 > --- a/target-ppc/mem_helper.c > +++ b/target-ppc/mem_helper.c > @@ -285,6 +285,11 @@ STVE(stvewx, cpu_stl_data_ra, bswap32, u32) > #undef I > #undef LVE > > +uint64_t helper_deposit32x2(uint64_t x) > +{ > + return deposit64((x >> 32), 32, 32, (x)); > +} It seems a shame to drop out to a helper for something this simple. How hard would it be to implement this.. wordswap, I guess you'd call it.. in tcg ops? I'm also not particularly fond of the deposit32x2 name, though a better one doesn't quickly come to mind. 
> + > #undef HI_IDX > #undef LO_IDX > > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c > index eee6052..df278df 100644 > --- a/target-ppc/translate/vsx-impl.inc.c > +++ b/target-ppc/translate/vsx-impl.inc.c > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) > static void gen_lxvw4x(DisasContext *ctx) > { > TCGv EA; > - TCGv_i64 tmp; > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); > if (unlikely(!ctx->vsx_enabled)) { > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) > } > gen_set_access_type(ctx, ACCESS_INT); > EA = tcg_temp_new(); > - tmp = tcg_temp_new_i64(); > > gen_addr_reg_index(ctx, EA); > - gen_qemu_ld32u_i64(ctx, tmp, EA); > - tcg_gen_addi_tl(EA, EA, 4); > - gen_qemu_ld32u_i64(ctx, xth, EA); > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); > - > - tcg_gen_addi_tl(EA, EA, 4); > - gen_qemu_ld32u_i64(ctx, tmp, EA); > - tcg_gen_addi_tl(EA, EA, 4); > - gen_qemu_ld32u_i64(ctx, xtl, EA); > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); > - > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); > + gen_helper_deposit32x2(xth, xth); > + tcg_gen_addi_tl(EA, EA, 8); > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); > + gen_helper_deposit32x2(xtl, xtl); > tcg_temp_free(EA); > - tcg_temp_free_i64(tmp); > } > > #define VSX_STORE_SCALAR(name, operation) \
On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: > On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: > > Load 8byte at a time and manipulate. > > > > Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> > > --- > > target-ppc/helper.h | 1 + > > target-ppc/mem_helper.c | 5 +++++ > > target-ppc/translate/vsx-impl.inc.c | 19 +++++-------------- > > 3 files changed, 11 insertions(+), 14 deletions(-) > > > > diff --git a/target-ppc/helper.h b/target-ppc/helper.h > > index 966f2ce..9f6705d 100644 > > --- a/target-ppc/helper.h > > +++ b/target-ppc/helper.h > > @@ -297,6 +297,7 @@ DEF_HELPER_2(mtvscr, void, env, avr) > > DEF_HELPER_3(lvebx, void, env, avr, tl) > > DEF_HELPER_3(lvehx, void, env, avr, tl) > > DEF_HELPER_3(lvewx, void, env, avr, tl) > > +DEF_HELPER_1(deposit32x2, i64, i64) > > DEF_HELPER_3(stvebx, void, env, avr, tl) > > DEF_HELPER_3(stvehx, void, env, avr, tl) > > DEF_HELPER_3(stvewx, void, env, avr, tl) > > diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c > > index 6548715..86e493e 100644 > > --- a/target-ppc/mem_helper.c > > +++ b/target-ppc/mem_helper.c > > @@ -285,6 +285,11 @@ STVE(stvewx, cpu_stl_data_ra, bswap32, u32) > > #undef I > > #undef LVE > > > > +uint64_t helper_deposit32x2(uint64_t x) > > +{ > > + return deposit64((x >> 32), 32, 32, (x)); > > +} > > It seems a shame to drop out to a helper for something this simple. > How hard would it be to implement this.. wordswap, I guess you'd call > it.. in tcg ops? > > I'm also not particularly fond of the deposit32x2 name, though a > better one doesn't quickly come to mind. 
> > > + > > #undef HI_IDX > > #undef LO_IDX > > > > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c > > index eee6052..df278df 100644 > > --- a/target-ppc/translate/vsx-impl.inc.c > > +++ b/target-ppc/translate/vsx-impl.inc.c > > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) > > static void gen_lxvw4x(DisasContext *ctx) > > { > > TCGv EA; > > - TCGv_i64 tmp; > > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); > > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); > > if (unlikely(!ctx->vsx_enabled)) { > > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) > > } > > gen_set_access_type(ctx, ACCESS_INT); > > EA = tcg_temp_new(); > > - tmp = tcg_temp_new_i64(); > > > > gen_addr_reg_index(ctx, EA); > > - gen_qemu_ld32u_i64(ctx, tmp, EA); > > - tcg_gen_addi_tl(EA, EA, 4); > > - gen_qemu_ld32u_i64(ctx, xth, EA); > > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); > > - > > - tcg_gen_addi_tl(EA, EA, 4); > > - gen_qemu_ld32u_i64(ctx, tmp, EA); > > - tcg_gen_addi_tl(EA, EA, 4); > > - gen_qemu_ld32u_i64(ctx, xtl, EA); > > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); > > - > > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); > > + gen_helper_deposit32x2(xth, xth); > > + tcg_gen_addi_tl(EA, EA, 8); > > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); > > + gen_helper_deposit32x2(xtl, xtl); ..and I think this is wrong for BE mode. The deposit32x2 will get the words in the right order, but the bytes within each word will be wrong because of the LE mode load on a BE setup. > > tcg_temp_free(EA); > > - tcg_temp_free_i64(tmp); > > } > > > > #define VSX_STORE_SCALAR(name, operation) \ >
David Gibson <david@gibson.dropbear.id.au> writes: > [ Unknown signature status ] > On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: >> Load 8byte at a time and manipulate. >> >> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> >> --- >> target-ppc/helper.h | 1 + >> target-ppc/mem_helper.c | 5 +++++ >> target-ppc/translate/vsx-impl.inc.c | 19 +++++-------------- >> 3 files changed, 11 insertions(+), 14 deletions(-) >> >> diff --git a/target-ppc/helper.h b/target-ppc/helper.h >> index 966f2ce..9f6705d 100644 >> --- a/target-ppc/helper.h >> +++ b/target-ppc/helper.h >> @@ -297,6 +297,7 @@ DEF_HELPER_2(mtvscr, void, env, avr) >> DEF_HELPER_3(lvebx, void, env, avr, tl) >> DEF_HELPER_3(lvehx, void, env, avr, tl) >> DEF_HELPER_3(lvewx, void, env, avr, tl) >> +DEF_HELPER_1(deposit32x2, i64, i64) >> DEF_HELPER_3(stvebx, void, env, avr, tl) >> DEF_HELPER_3(stvehx, void, env, avr, tl) >> DEF_HELPER_3(stvewx, void, env, avr, tl) >> diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c >> index 6548715..86e493e 100644 >> --- a/target-ppc/mem_helper.c >> +++ b/target-ppc/mem_helper.c >> @@ -285,6 +285,11 @@ STVE(stvewx, cpu_stl_data_ra, bswap32, u32) >> #undef I >> #undef LVE >> >> +uint64_t helper_deposit32x2(uint64_t x) >> +{ >> + return deposit64((x >> 32), 32, 32, (x)); >> +} > > It seems a shame to drop out to a helper for something this simple. > How hard would it be to implement this.. wordswap, I guess you'd call > it.. in tcg ops? There is a tcg_ops for deposit64, I need to do the shifting and call it. I will change that. > I'm also not particularly fond of the deposit32x2 name, though a > better one doesn't quickly come to mind. Regards Nikunj
David Gibson <david@gibson.dropbear.id.au> writes: > [ Unknown signature status ] > On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: >> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: >> > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c >> > index eee6052..df278df 100644 >> > --- a/target-ppc/translate/vsx-impl.inc.c >> > +++ b/target-ppc/translate/vsx-impl.inc.c >> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) >> > static void gen_lxvw4x(DisasContext *ctx) >> > { >> > TCGv EA; >> > - TCGv_i64 tmp; >> > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); >> > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); >> > if (unlikely(!ctx->vsx_enabled)) { >> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) >> > } >> > gen_set_access_type(ctx, ACCESS_INT); >> > EA = tcg_temp_new(); >> > - tmp = tcg_temp_new_i64(); >> > >> > gen_addr_reg_index(ctx, EA); >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> > - tcg_gen_addi_tl(EA, EA, 4); >> > - gen_qemu_ld32u_i64(ctx, xth, EA); >> > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); >> > - >> > - tcg_gen_addi_tl(EA, EA, 4); >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> > - tcg_gen_addi_tl(EA, EA, 4); >> > - gen_qemu_ld32u_i64(ctx, xtl, EA); >> > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); >> > - >> > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); >> > + gen_helper_deposit32x2(xth, xth); >> > + tcg_gen_addi_tl(EA, EA, 8); >> > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); >> > + gen_helper_deposit32x2(xtl, xtl); > > ..and I think this is wrong for BE mode. The deposit32x2 will get the > words in the right order, but the bytes within each word will be wrong > because of the LE mode load on a BE setup. Since lxvw4x/stxvw4x is available on POWER8. I tried running my test code on BE and LE Fedora24 VM. TCG Results match the POWER8 hardware. The order within the word is not changed. Snippet of the test code at the end of email. 
Can share full code if needed (maybe will do it in kvm-unit-test) Fedora24VM BE: [fedora@cloudimg ~]$ uname -a Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64 #1 SMP Tue May 24 12:24:54 UTC 2016 ppc64 ppc64 ppc64 GNU/Linux [fedora@cloudimg ~]$ ./lxv_x VRT32 = 00010203 20212223 30313233 40414243 [fedora@cloudimg ~]$ ./stxv_x E0E1E2E3 E4E5E6E7 F0F1F2F3 F4F5F6F7 TCG Result BE: ============== $ ./ppc64-linux-user/qemu-ppc64 -cpu POWER9 lxv_x VRT32 = 00010203 20212223 30313233 40414243 $ ./ppc64-linux-user/qemu-ppc64 -cpu POWER9 stxv_x E0E1E2E3 E4E5E6E7 F0F1F2F3 F4F5F6F7 Fedora24VM LE: ============== [fedora@cloudimg ~]$ uname -a Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64le #1 SMP Tue May 24 12:23:26 UTC 2016 ppc64le ppc64le ppc64le GNU/Linux [fedora@cloudimg ~]$ ./lxv_x VRT32 = 40414243 30313233 20212223 00010203 [fedora@cloudimg ~]$ ./stxv_x F4F5F6F7 F0F1F2F3 E4E5E6E7 E0E1E2E3 TCG Result LE: ============== $ ./ppc64le-linux-user/qemu-ppc64le -cpu POWER9 lxv_x VRT32 = 40414243 30313233 20212223 00010203 $ ./ppc64le-linux-user/qemu-ppc64le -cpu POWER9 stxv_x F4F5F6F7 F0F1F2F3 E4E5E6E7 E0E1E2E3 Regards, Nikunj vsx.h: ====== #define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t)) typedef union { __vector uint32_t v; uint32_t a[U32_SIZE]; } vuint32_t; static void vec_put_u32(__vector uint32_t v) { int i; vuint32_t u; for (u.v = v, i = 0; i < U32_SIZE; ++i) { printf("%08x ", u.a[i]); } printf("\n"); } static void print4x4(uint32_t *p) { int i; if (!p) return; for(i = 0; i < 4; i++) printf(" %08X ", p[i]); printf("\n"); } lxv_x.c: ======== uint32_t rb32[4] = {0x00010203, 0x20212223, 0x30313233, 0x40414243}; vuint32_t vrt32; asm("lxvw4x %x0, 0, %1 \n\t" \ : "=ws"(vrt32) : "r"(&rb32)); printf("VRT32 = "); vec_put_u32(vrt32); stxv_x.c: ========= vuint32_t vrt32; vrt32.a[0] = 0xE0E1E2E3; vrt32.a[1] = 0xE4E5E6E7; vrt32.a[2] = 0xF0F1F2F3; vrt32.a[3] = 0xF4F5F6F7; asm("stxvw4x %x0, 0, %1 \n\t" \ : : "ws"(vrt32.v), "r"(&rb32)); print4x4(rb32);
On Mon, Sep 19, 2016 at 04:06:40PM +0530, Nikunj A Dadhania wrote: > David Gibson <david@gibson.dropbear.id.au> writes: > > [ Unknown signature status ] > > On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: > >> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: > >> > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c > >> > index eee6052..df278df 100644 > >> > --- a/target-ppc/translate/vsx-impl.inc.c > >> > +++ b/target-ppc/translate/vsx-impl.inc.c > >> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) > >> > static void gen_lxvw4x(DisasContext *ctx) > >> > { > >> > TCGv EA; > >> > - TCGv_i64 tmp; > >> > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); > >> > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); > >> > if (unlikely(!ctx->vsx_enabled)) { > >> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) > >> > } > >> > gen_set_access_type(ctx, ACCESS_INT); > >> > EA = tcg_temp_new(); > >> > - tmp = tcg_temp_new_i64(); > >> > > >> > gen_addr_reg_index(ctx, EA); > >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); > >> > - tcg_gen_addi_tl(EA, EA, 4); > >> > - gen_qemu_ld32u_i64(ctx, xth, EA); > >> > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); > >> > - > >> > - tcg_gen_addi_tl(EA, EA, 4); > >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); > >> > - tcg_gen_addi_tl(EA, EA, 4); > >> > - gen_qemu_ld32u_i64(ctx, xtl, EA); > >> > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); > >> > - > >> > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); > >> > + gen_helper_deposit32x2(xth, xth); > >> > + tcg_gen_addi_tl(EA, EA, 8); > >> > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); > >> > + gen_helper_deposit32x2(xtl, xtl); > > > > ..and I think this is wrong for BE mode. The deposit32x2 will get the > > words in the right order, but the bytes within each word will be wrong > > because of the LE mode load on a BE setup. > > Since lxvw4x/stxvw4x is available on POWER8. 
I tried running my test > code on BE and LE Fedora24 VM. TCG Results match the POWER8 hardware. > The order within the word is not changed. Snippet of the test code at > the end of email. Can share full code if needed (maybe will do it in > kvm-unit-test) Ugh.. now I'm confused. I would not have expected the results you've seen from these tests. But I still can't understand *how* the emulation could be correct: IIUC MO_LEQ would mean it loads the 8 bytes as a single 64-bit LE integer. Which should be the same as loading one 32-bit LE integer into the low half of the target register, then a 32-bit LE integer into the high half of the target register. As I said above, the deposit32x2 will swap the order of the two ints, but it won't byteswap the individual int32s which should have been BE in memory. Can you find the flaw in my reasoning? > Fedora24VM BE: > > [fedora@cloudimg ~]$ uname -a > Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64 #1 SMP Tue May 24 12:24:54 UTC 2016 ppc64 ppc64 ppc64 GNU/Linux > [fedora@cloudimg ~]$ ./lxv_x > VRT32 = 00010203 20212223 30313233 40414243 > > [fedora@cloudimg ~]$ ./stxv_x > E0E1E2E3 E4E5E6E7 F0F1F2F3 F4F5F6F7 > > > TCG Result BE: > ============== > $ ./ppc64-linux-user/qemu-ppc64 -cpu POWER9 lxv_x > VRT32 = 00010203 20212223 30313233 40414243 > > $ ./ppc64-linux-user/qemu-ppc64 -cpu POWER9 stxv_x > E0E1E2E3 E4E5E6E7 F0F1F2F3 F4F5F6F7 > > > Fedora24VM LE: > ============== > [fedora@cloudimg ~]$ uname -a > Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64le #1 SMP Tue May 24 12:23:26 UTC 2016 ppc64le ppc64le ppc64le GNU/Linux > [fedora@cloudimg ~]$ ./lxv_x > VRT32 = 40414243 30313233 20212223 00010203 > > [fedora@cloudimg ~]$ ./stxv_x > F4F5F6F7 F0F1F2F3 E4E5E6E7 E0E1E2E3 > > TCG Result LE: > ============== > $ ./ppc64le-linux-user/qemu-ppc64le -cpu POWER9 lxv_x > VRT32 = 40414243 30313233 20212223 00010203 > > $ ./ppc64le-linux-user/qemu-ppc64le -cpu POWER9 stxv_x > F4F5F6F7 F0F1F2F3 E4E5E6E7 E0E1E2E3 > > Regards, > Nikunj > > > 
vsx.h: > ====== > #define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t)) > > typedef union { > __vector uint32_t v; > uint32_t a[U32_SIZE]; > } vuint32_t; I am a little suspicious that whatever the compiler does to convert the vector to an array via this union might be undoing a byte reverse. I'd be more confident if you used VSX instructions to extract and store separately one of the 32-bit subwords of the vector. > > static void vec_put_u32(__vector uint32_t v) { > int i; > vuint32_t u; > > for (u.v = v, i = 0; i < U32_SIZE; ++i) { > printf("%08x ", u.a[i]); > } > > printf("\n"); > } > > static void print4x4(uint32_t *p) > { > int i; > if (!p) > return; > for(i = 0; i < 4; i++) > printf(" %08X ", p[i]); > printf("\n"); > } > > lxv_x.c: > ======== > uint32_t rb32[4] = {0x00010203, 0x20212223, 0x30313233, 0x40414243}; > vuint32_t vrt32; > > asm("lxvw4x %x0, 0, %1 \n\t" \ > : "=ws"(vrt32) : "r"(&rb32)); > printf("VRT32 = "); vec_put_u32(vrt32); > > stxv_x.c: > ========= > vuint32_t vrt32; > > vrt32.a[0] = 0xE0E1E2E3; > vrt32.a[1] = 0xE4E5E6E7; > vrt32.a[2] = 0xF0F1F2F3; > vrt32.a[3] = 0xF4F5F6F7; > > asm("stxvw4x %x0, 0, %1 \n\t" \ > : : "ws"(vrt32.v), "r"(&rb32)); > print4x4(rb32); >
David Gibson <david@gibson.dropbear.id.au> writes: > [ Unknown signature status ] > On Mon, Sep 19, 2016 at 04:06:40PM +0530, Nikunj A Dadhania wrote: >> David Gibson <david@gibson.dropbear.id.au> writes: >> > [ Unknown signature status ] >> > On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: >> >> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: >> >> > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c >> >> > index eee6052..df278df 100644 >> >> > --- a/target-ppc/translate/vsx-impl.inc.c >> >> > +++ b/target-ppc/translate/vsx-impl.inc.c >> >> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) >> >> > static void gen_lxvw4x(DisasContext *ctx) >> >> > { >> >> > TCGv EA; >> >> > - TCGv_i64 tmp; >> >> > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); >> >> > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); >> >> > if (unlikely(!ctx->vsx_enabled)) { >> >> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) >> >> > } >> >> > gen_set_access_type(ctx, ACCESS_INT); >> >> > EA = tcg_temp_new(); >> >> > - tmp = tcg_temp_new_i64(); >> >> > >> >> > gen_addr_reg_index(ctx, EA); >> >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> >> > - tcg_gen_addi_tl(EA, EA, 4); >> >> > - gen_qemu_ld32u_i64(ctx, xth, EA); >> >> > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); >> >> > - >> >> > - tcg_gen_addi_tl(EA, EA, 4); >> >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> >> > - tcg_gen_addi_tl(EA, EA, 4); >> >> > - gen_qemu_ld32u_i64(ctx, xtl, EA); >> >> > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); >> >> > - >> >> > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); >> >> > + gen_helper_deposit32x2(xth, xth); >> >> > + tcg_gen_addi_tl(EA, EA, 8); >> >> > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); >> >> > + gen_helper_deposit32x2(xtl, xtl); >> > >> > ..and I think this is wrong for BE mode. 
The deposit32x2 will get the >> > words in the right order, but the bytes within each word will be wrong >> > because of the LE mode load on a BE setup. >> >> Since lxvw4x/stxvw4x is available on POWER8. I tried running my test >> code on BE and LE Fedora24 VM. TCG Results match the POWER8 hardware. >> The order within the word is not changed. Snippet of the test code at >> the end of email. Can share full code if needed (maybe will do it in >> kvm-unit-test) > > Ugh.. now I'm confused. I would not have expected the results you've > seen from these tests. But I still can't understand *how* the > emulation could be correct: IIUC MO_LEQ would mean it loads the 8 > bytes as a single 64-bit LE integer. In both cases (LE and BE) we do an LE read ... > Which should be the same as > loading one 32-bit LE integer into the low half of the target > register, then a 32-bit LE integer into the high half ot the target > register. .. The 64-bit integer read is not the same in these cases. The input itself would be stored in the byte order of the respective format (LE or BE). Input rb32[]: 00010203 20212223 30313233 40414243 LE: helper_deposit32x2: 2021222300010203 helper_deposit32x2: 4041424330313233 BE: helper_deposit32x2: 2322212003020100 helper_deposit32x2: 4342414033323130 > > As I said above, the deposit32x2 will swap the order of the two ints, > but it won't byteswap the individual int32s which should have been BE > in memory. > > Can you find the flaw in my reasoning? One anomaly that I see in BE code generation: it also generates a stxvw4x after lxvw4x. I am not sure why. 
>>>>>>>>>>>>>>>> BE BE BE >>>>>>>>>>>>>> Input rb32[]: 00010203 20212223 30313233 40414243 gen_lxvw4x: called helper_deposit32x2: 2322212003020100 helper_deposit32x2: 4342414033323130 gen_stxvw4x: called helper_deposit32x2: 0302010023222120 helper_deposit32x2: 3332313043424140 Output VRT32: 00010203 20212223 30313233 40414243 >> vsx.h: >> ====== >> #define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t)) >> >> typedef union { >> __vector uint32_t v; >> uint32_t a[U32_SIZE]; >> } vuint32_t; > > I am a little suspicious that whatever the compiler does to convert > the vector to an array via this union might be undoing a byte reverse. > > I'd be more confident if you used VSX instructions to extract and > store separately one of the 32-bit subwords of the vector. I will try to figure those instructions. Regards Nikunj
On Tue, Sep 20, 2016 at 10:40:03PM +0530, Nikunj A Dadhania wrote: > David Gibson <david@gibson.dropbear.id.au> writes: > > > [ Unknown signature status ] > > On Mon, Sep 19, 2016 at 04:06:40PM +0530, Nikunj A Dadhania wrote: > >> David Gibson <david@gibson.dropbear.id.au> writes: > >> > [ Unknown signature status ] > >> > On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: > >> >> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: > >> >> > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c > >> >> > index eee6052..df278df 100644 > >> >> > --- a/target-ppc/translate/vsx-impl.inc.c > >> >> > +++ b/target-ppc/translate/vsx-impl.inc.c > >> >> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) > >> >> > static void gen_lxvw4x(DisasContext *ctx) > >> >> > { > >> >> > TCGv EA; > >> >> > - TCGv_i64 tmp; > >> >> > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); > >> >> > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); > >> >> > if (unlikely(!ctx->vsx_enabled)) { > >> >> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) > >> >> > } > >> >> > gen_set_access_type(ctx, ACCESS_INT); > >> >> > EA = tcg_temp_new(); > >> >> > - tmp = tcg_temp_new_i64(); > >> >> > > >> >> > gen_addr_reg_index(ctx, EA); > >> >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); > >> >> > - tcg_gen_addi_tl(EA, EA, 4); > >> >> > - gen_qemu_ld32u_i64(ctx, xth, EA); > >> >> > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); > >> >> > - > >> >> > - tcg_gen_addi_tl(EA, EA, 4); > >> >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); > >> >> > - tcg_gen_addi_tl(EA, EA, 4); > >> >> > - gen_qemu_ld32u_i64(ctx, xtl, EA); > >> >> > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); > >> >> > - > >> >> > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); > >> >> > + gen_helper_deposit32x2(xth, xth); > >> >> > + tcg_gen_addi_tl(EA, EA, 8); > >> >> > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); > >> >> > + gen_helper_deposit32x2(xtl, xtl); > >> > > >> > 
..and I think this is wrong for BE mode. The deposit32x2 will get the > >> > words in the right order, but the bytes within each word will be wrong > >> > because of the LE mode load on a BE setup. > >> > >> Since lxvw4x/stxvw4x is available on POWER8. I tried running my test > >> code on BE and LE Fedora24 VM. TCG Results match the POWER8 hardware. > >> The order within the word is not changed. Snippet of the test code at > >> the end of email. Can share full code if needed (maybe will do it in > >> kvm-unit-test) > > > > Ugh.. now I'm confused. I would not have expected the results you've > > seen from these tests. But I still can't understand *how* the > > emulation could be correct: IIUC MO_LEQ would mean it loads the 8 > > bytes as a single 64-bit LE integer. > > For both the case LE/BE we do a LE read ... .. and I can't see how that can be right for the BE case. > > Which should be the same as > > loading one 32-bit LE integer into the low half of the target > > register, then a 32-bit LE integer into the high half ot the target > > register. > > .. The 64-bit integer read is not same in these cases. The input itself > would be in the order of the format. > > Input rb32[]: 00010203 20212223 30313233 40414243 > > LE: > helper_deposit32x2: 2021222300010203 > helper_deposit32x2: 4041424330313233 > > BE > helper_deposit32x2: 2322212003020100 > helper_deposit32x2: 4342414033323130 Sorry.. I can't really follow the above, because I'm not sure if you're displaying the bytes within each word in significance order, or increasing-address order. > > > > As I said above, the deposit32x2 will swap the order of the two ints, > > but it won't byteswap the individual int32s which should have been BE > > in memory. > > > > Can you find the flaw in my reasoning? > > One anomaly that I see in BE code generation: it also generates a > stxvw4x after lxvw4x. I am not sure why. Ah... 
see I'm wondering if it's using the stxvw4x to store back to the union which you then get the results from. If that's so it could explain the results, since the bug I suspect is in lxvw4x would be cancelled out by the corresponding bug in stxvw4x, which is exactly why I'd prefer the approach to testing mentioned below. > > >>>>>>>>>>>>>>>> BE BE BE >>>>>>>>>>>>>> > Input rb32[]: 00010203 20212223 30313233 40414243 > > gen_lxvw4x: called > helper_deposit32x2: 2322212003020100 > helper_deposit32x2: 4342414033323130 > gen_stxvw4x: called > helper_deposit32x2: 0302010023222120 > helper_deposit32x2: 3332313043424140 > Output VRT32: 00010203 20212223 30313233 40414243 > > >> vsx.h: > >> ====== > >> #define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t)) > >> > >> typedef union { > >> __vector uint32_t v; > >> uint32_t a[U32_SIZE]; > >> } vuint32_t; > > > > I am a little suspicious that whatever the compiler does to convert > > the vector to an array via this union might be undoing a byte reverse. > > > > I'd be more confident if you used VSX instructions to extract and > > store separately one of the 32-bit subwords of the vector. > > I will try to figure those instructions. Ok, thanks.
David Gibson <david@gibson.dropbear.id.au> writes: > [ Unknown signature status ] > On Tue, Sep 20, 2016 at 10:40:03PM +0530, Nikunj A Dadhania wrote: >> David Gibson <david@gibson.dropbear.id.au> writes: >> >> > [ Unknown signature status ] >> > On Mon, Sep 19, 2016 at 04:06:40PM +0530, Nikunj A Dadhania wrote: >> >> David Gibson <david@gibson.dropbear.id.au> writes: >> >> > [ Unknown signature status ] >> >> > On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: >> >> >> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: >> >> >> > diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c >> >> >> > index eee6052..df278df 100644 >> >> >> > --- a/target-ppc/translate/vsx-impl.inc.c >> >> >> > +++ b/target-ppc/translate/vsx-impl.inc.c >> >> >> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) >> >> >> > static void gen_lxvw4x(DisasContext *ctx) >> >> >> > { >> >> >> > TCGv EA; >> >> >> > - TCGv_i64 tmp; >> >> >> > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); >> >> >> > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); >> >> >> > if (unlikely(!ctx->vsx_enabled)) { >> >> >> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) >> >> >> > } >> >> >> > gen_set_access_type(ctx, ACCESS_INT); >> >> >> > EA = tcg_temp_new(); >> >> >> > - tmp = tcg_temp_new_i64(); >> >> >> > >> >> >> > gen_addr_reg_index(ctx, EA); >> >> >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> >> >> > - tcg_gen_addi_tl(EA, EA, 4); >> >> >> > - gen_qemu_ld32u_i64(ctx, xth, EA); >> >> >> > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); >> >> >> > - >> >> >> > - tcg_gen_addi_tl(EA, EA, 4); >> >> >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> >> >> > - tcg_gen_addi_tl(EA, EA, 4); >> >> >> > - gen_qemu_ld32u_i64(ctx, xtl, EA); >> >> >> > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); >> >> >> > - >> >> >> > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); >> >> >> > + gen_helper_deposit32x2(xth, xth); >> >> >> > + tcg_gen_addi_tl(EA, EA, 8); >> 
>> >> > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); >> >> >> > + gen_helper_deposit32x2(xtl, xtl); >> >> > >> >> > ..and I think this is wrong for BE mode. The deposit32x2 will get the >> >> > words in the right order, but the bytes within each word will be wrong >> >> > because of the LE mode load on a BE setup. >> >> >> >> Since lxvw4x/stxvw4x is available on POWER8. I tried running my test >> >> code on BE and LE Fedora24 VM. TCG Results match the POWER8 hardware. >> >> The order within the word is not changed. Snippet of the test code at >> >> the end of email. Can share full code if needed (maybe will do it in >> >> kvm-unit-test) >> > >> > Ugh.. now I'm confused. I would not have expected the results you've >> > seen from these tests. But I still can't understand *how* the >> > emulation could be correct: IIUC MO_LEQ would mean it loads the 8 >> > bytes as a single 64-bit LE integer. >> >> For both the case LE/BE we do a LE read ... > > .. and I can't see how that can be right for the BE case. > >> > Which should be the same as >> > loading one 32-bit LE integer into the low half of the target >> > register, then a 32-bit LE integer into the high half ot the target >> > register. >> >> .. The 64-bit integer read is not same in these cases. The input itself >> would be in the order of the format. >> >> Input rb32[]: 00010203 20212223 30313233 40414243 >> >> LE: >> helper_deposit32x2: 2021222300010203 >> helper_deposit32x2: 4041424330313233 >> >> BE >> helper_deposit32x2: 2322212003020100 >> helper_deposit32x2: 4342414033323130 > > > Sorry.. I can't really follow the above, because I'm not sure if > you're displaying the bytes within each word in significance order, or > increasing-address order. 
Ah, that's just a print inside the helper to check what MO_LEQ returned: uint64_t helper_deposit32x2(uint64_t x) { fprintf(stderr, "%s: %016lx\n", __func__, x); return deposit64((x >> 32), 32, 32, (x)); } > >> > >> > As I said above, the deposit32x2 will swap the order of the two ints, >> > but it won't byteswap the individual int32s which should have been BE >> > in memory. >> > >> > Can you find the flaw in my reasoning? >> >> One anomaly that I see in BE code generation: it also generates a >> stxvw4x after lxvw4x. I am not sure why. > > Ah... see I'm wondering if it's using the stxvw4x to store back to the > union which you then get the results from. If that's so it could > explain the results, since the bug I suspect is in lxvw4x would be > cancelled out by the corresponding bug in stxv4wx, which is exactly > why I'd prefer the approach to testing mentioned below. Yes, I am investigating it. >> >>>>>>>>>>>>>>>> BE BE BE >>>>>>>>>>>>>> >> Input rb32[]: 00010203 20212223 30313233 40414243 >> >> gen_lxvw4x: called >> helper_deposit32x2: 2322212003020100 >> helper_deposit32x2: 4342414033323130 >> gen_stxvw4x: called >> helper_deposit32x2: 0302010023222120 >> helper_deposit32x2: 3332313043424140 >> Output VRT32: 00010203 20212223 30313233 40414243 >> >> >> vsx.h: >> >> ====== >> >> #define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t)) >> >> >> >> typedef union { >> >> __vector uint32_t v; >> >> uint32_t a[U32_SIZE]; >> >> } vuint32_t; >> > >> > I am a little suspicious that whatever the compiler does to convert >> > the vector to an array via this union might be undoing a byte reverse. >> > >> > I'd be more confident if you used VSX instructions to extract and >> > store separately one of the 32-bit subwords of the vector. >> >> I will try to figure those instructions. > > Ok, thanks. > Regards, Nikunj
diff --git a/target-ppc/helper.h b/target-ppc/helper.h index 966f2ce..9f6705d 100644 --- a/target-ppc/helper.h +++ b/target-ppc/helper.h @@ -297,6 +297,7 @@ DEF_HELPER_2(mtvscr, void, env, avr) DEF_HELPER_3(lvebx, void, env, avr, tl) DEF_HELPER_3(lvehx, void, env, avr, tl) DEF_HELPER_3(lvewx, void, env, avr, tl) +DEF_HELPER_1(deposit32x2, i64, i64) DEF_HELPER_3(stvebx, void, env, avr, tl) DEF_HELPER_3(stvehx, void, env, avr, tl) DEF_HELPER_3(stvewx, void, env, avr, tl) diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c index 6548715..86e493e 100644 --- a/target-ppc/mem_helper.c +++ b/target-ppc/mem_helper.c @@ -285,6 +285,11 @@ STVE(stvewx, cpu_stl_data_ra, bswap32, u32) #undef I #undef LVE +uint64_t helper_deposit32x2(uint64_t x) +{ + return deposit64((x >> 32), 32, 32, (x)); +} + #undef HI_IDX #undef LO_IDX diff --git a/target-ppc/translate/vsx-impl.inc.c b/target-ppc/translate/vsx-impl.inc.c index eee6052..df278df 100644 --- a/target-ppc/translate/vsx-impl.inc.c +++ b/target-ppc/translate/vsx-impl.inc.c @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) static void gen_lxvw4x(DisasContext *ctx) { TCGv EA; - TCGv_i64 tmp; TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); if (unlikely(!ctx->vsx_enabled)) { @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) } gen_set_access_type(ctx, ACCESS_INT); EA = tcg_temp_new(); - tmp = tcg_temp_new_i64(); gen_addr_reg_index(ctx, EA); - gen_qemu_ld32u_i64(ctx, tmp, EA); - tcg_gen_addi_tl(EA, EA, 4); - gen_qemu_ld32u_i64(ctx, xth, EA); - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); - - tcg_gen_addi_tl(EA, EA, 4); - gen_qemu_ld32u_i64(ctx, tmp, EA); - tcg_gen_addi_tl(EA, EA, 4); - gen_qemu_ld32u_i64(ctx, xtl, EA); - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); - + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); + gen_helper_deposit32x2(xth, xth); + tcg_gen_addi_tl(EA, EA, 8); + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); + gen_helper_deposit32x2(xtl, 
xtl); tcg_temp_free(EA); - tcg_temp_free_i64(tmp); } #define VSX_STORE_SCALAR(name, operation) \
Load 8byte at a time and manipulate. Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> --- target-ppc/helper.h | 1 + target-ppc/mem_helper.c | 5 +++++ target-ppc/translate/vsx-impl.inc.c | 19 +++++-------------- 3 files changed, 11 insertions(+), 14 deletions(-)