diff mbox series

[v2,02/13] target/arm: Fix SQDMULH (by element) with Q=0

Message ID 20240625183536.1672454-3-richard.henderson@linaro.org (mailing list archive)
State New, archived
Headers show
Series target/arm: AdvSIMD conversion, part 2 | expand

Commit Message

Richard Henderson June 25, 2024, 6:35 p.m. UTC
The inner loop, bounded by eltspersegment, must not be
larger than the outer loop, bounded by elements.

Cc: qemu-stable@nongnu.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_helper.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

Comments

Michael Tokarev July 2, 2024, 6:48 a.m. UTC | #1
25.06.2024 21:35, Richard Henderson wrote:
> The inner loop, bounded by eltspersegment, must not be
> larger than the outer loop, bounded by elements.
> 
> Cc: qemu-stable@nongnu.org
> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/arm/tcg/vec_helper.c | 24 ++++++++++++++++--------
>   1 file changed, 16 insertions(+), 8 deletions(-)

If my understanding is correct, this one

Fixes: f80701cb44d3 ("target/arm: Convert SQDMULH, SQRDMULH to decodetree")

and before this commit, there was no issue.

Is my understanding correct?

Thanks,

/mjt

> diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
> index 7b34cc98af..d477479bb1 100644
> --- a/target/arm/tcg/vec_helper.c
> +++ b/target/arm/tcg/vec_helper.c
> @@ -317,10 +317,12 @@ void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
>       intptr_t i, j, opr_sz = simd_oprsz(desc);
>       int idx = simd_data(desc);
>       int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
> +    intptr_t elements = opr_sz / 2;
> +    intptr_t eltspersegment = MIN(16 / 2, elements);
>   
> -    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
> +    for (i = 0; i < elements; i += 16 / 2) {
>           int16_t mm = m[i];
> -        for (j = 0; j < 16 / 2; ++j) {
> +        for (j = 0; j < eltspersegment; ++j) {
>               d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
>           }
>       }
> @@ -333,10 +335,12 @@ void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
>       intptr_t i, j, opr_sz = simd_oprsz(desc);
>       int idx = simd_data(desc);
>       int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
> +    intptr_t elements = opr_sz / 2;
> +    intptr_t eltspersegment = MIN(16 / 2, elements);
>   
> -    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
> +    for (i = 0; i < elements; i += 16 / 2) {
>           int16_t mm = m[i];
> -        for (j = 0; j < 16 / 2; ++j) {
> +        for (j = 0; j < eltspersegment; ++j) {
>               d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
>           }
>       }
> @@ -512,10 +516,12 @@ void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
>       intptr_t i, j, opr_sz = simd_oprsz(desc);
>       int idx = simd_data(desc);
>       int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
> +    intptr_t elements = opr_sz / 4;
> +    intptr_t eltspersegment = MIN(16 / 4, elements);
>   
> -    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
> +    for (i = 0; i < elements; i += 16 / 4) {
>           int32_t mm = m[i];
> -        for (j = 0; j < 16 / 4; ++j) {
> +        for (j = 0; j < eltspersegment; ++j) {
>               d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
>           }
>       }
> @@ -528,10 +534,12 @@ void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
>       intptr_t i, j, opr_sz = simd_oprsz(desc);
>       int idx = simd_data(desc);
>       int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
> +    intptr_t elements = opr_sz / 4;
> +    intptr_t eltspersegment = MIN(16 / 4, elements);
>   
> -    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
> +    for (i = 0; i < elements; i += 16 / 4) {
>           int32_t mm = m[i];
> -        for (j = 0; j < 16 / 4; ++j) {
> +        for (j = 0; j < eltspersegment; ++j) {
>               d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
>           }
>       }
Richard Henderson July 2, 2024, 2:37 p.m. UTC | #2
On 7/1/24 23:48, Michael Tokarev wrote:
> 25.06.2024 21:35, Richard Henderson wrote:
>> The inner loop, bounded by eltspersegment, must not be
>> larger than the outer loop, bounded by elements.
>>
>> Cc: qemu-stable@nongnu.org
>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   target/arm/tcg/vec_helper.c | 24 ++++++++++++++++--------
>>   1 file changed, 16 insertions(+), 8 deletions(-)
> 
> If my understanding is correct, this one
> 
> Fixes: f80701cb44d3 ("target/arm: Convert SQDMULH, SQRDMULH to decodetree")
> 
> and before this commit, there was no issue.
> 
> Is my understanding correct?

Yes.  So, not as old a bug as I thought.


r~
diff mbox series

Patch

diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 7b34cc98af..d477479bb1 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -317,10 +317,12 @@  void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
     intptr_t i, j, opr_sz = simd_oprsz(desc);
     int idx = simd_data(desc);
     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
+    intptr_t elements = opr_sz / 2;
+    intptr_t eltspersegment = MIN(16 / 2, elements);
 
-    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
+    for (i = 0; i < elements; i += 16 / 2) {
         int16_t mm = m[i];
-        for (j = 0; j < 16 / 2; ++j) {
+        for (j = 0; j < eltspersegment; ++j) {
             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
         }
     }
@@ -333,10 +335,12 @@  void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
     intptr_t i, j, opr_sz = simd_oprsz(desc);
     int idx = simd_data(desc);
     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
+    intptr_t elements = opr_sz / 2;
+    intptr_t eltspersegment = MIN(16 / 2, elements);
 
-    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
+    for (i = 0; i < elements; i += 16 / 2) {
         int16_t mm = m[i];
-        for (j = 0; j < 16 / 2; ++j) {
+        for (j = 0; j < eltspersegment; ++j) {
             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
         }
     }
@@ -512,10 +516,12 @@  void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
     intptr_t i, j, opr_sz = simd_oprsz(desc);
     int idx = simd_data(desc);
     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
+    intptr_t elements = opr_sz / 4;
+    intptr_t eltspersegment = MIN(16 / 4, elements);
 
-    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
+    for (i = 0; i < elements; i += 16 / 4) {
         int32_t mm = m[i];
-        for (j = 0; j < 16 / 4; ++j) {
+        for (j = 0; j < eltspersegment; ++j) {
             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
         }
     }
@@ -528,10 +534,12 @@  void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
     intptr_t i, j, opr_sz = simd_oprsz(desc);
     int idx = simd_data(desc);
     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
+    intptr_t elements = opr_sz / 4;
+    intptr_t eltspersegment = MIN(16 / 4, elements);
 
-    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
+    for (i = 0; i < elements; i += 16 / 4) {
         int32_t mm = m[i];
-        for (j = 0; j < 16 / 4; ++j) {
+        for (j = 0; j < eltspersegment; ++j) {
             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
         }
     }