
[RFC,v3,2/2] target/riscv: rvv: improve performance of RISC-V vector loads and stores on large amounts of data.

Message ID 20241014220153.196183-3-paolo.savini@embecosm.com
State New, archived
Series: target/riscv: add endianness checks and atomicity guarantees.

Commit Message

Paolo Savini Oct. 14, 2024, 10:01 p.m. UTC
This patch optimizes the emulation of unit-stride RVV load/store instructions
when the data loaded/stored per iteration amounts to 64 bytes or more.
The optimization consists of calling __builtin_memcpy on 16-byte (128-bit)
chunks between the memory address of the simulated vector register and the
destination memory address, and vice versa.
This is done only if we have direct access to the host machine's RAM, the
host is little endian, and it supports atomic 128-bit memory operations.

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
---
 target/riscv/vector_helper.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

Comments

Richard Henderson Oct. 14, 2024, 11:11 p.m. UTC | #1
On 10/14/24 15:01, Paolo Savini wrote:
> This patch optimizes the emulation of unit-stride RVV load/store instructions
> when the data loaded/stored per iteration amounts to 64 bytes or more.
> The optimization consists of calling __builtin_memcpy on 16-byte (128-bit)
> chunks between the memory address of the simulated vector register and the
> destination memory address, and vice versa.
> This is done only if we have direct access to the host machine's RAM, the
> host is little endian, and it supports atomic 128-bit memory operations.
> 
> Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
> ---
>   target/riscv/vector_helper.c | 14 +++++++++++++-
>   1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 75c24653f0..b3d0be8e39 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -488,7 +488,19 @@ vext_group_ldst_host(CPURISCVState *env, void *vd, uint32_t byte_end,
>       }
>   
>       fn = fns[is_load][group_size];
> -    fn(vd, byte_offset, host + byte_offset);
> +
> +    /* x86 and AMD processors provide strong guarantees of atomicity for
> +     * 16-byte memory operations if the memory operands are 16-byte aligned */
> +    if (!HOST_BIG_ENDIAN && (byte_offset + 16 < byte_end) && ((byte_offset % 16) == 0) &&
> +        ((cpuinfo & (CPUINFO_ATOMIC_VMOVDQA | CPUINFO_ATOMIC_VMOVDQU)) != 0)) {
> +      group_size = MO_128;
> +      if (is_load)
> +        __builtin_memcpy((uint8_t *)(vd + byte_offset), (uint8_t *)(host + byte_offset), 16);
> +      else
> +        __builtin_memcpy((uint8_t *)(host + byte_offset), (uint8_t *)(vd + byte_offset), 16);
> +    } else {

This will not compile on anything other than x86.
Moreover, your comment about vmovdqa bears no relation to __builtin_memcpy.


r~
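
For context, both review points can be addressed by building the fast path on
QEMU's generic 16-byte atomic helpers instead of the x86-only CPUINFO_* bits:
the guards compile on every host, and the helpers actually guarantee the
atomicity that __builtin_memcpy does not. Below is a minimal sketch, not a
tested patch, assuming atomic16_read_ro()/atomic16_set(), Int128 and the
HAVE_ATOMIC128_RO/RW guards from the host atomic128-ldst.h headers can be made
visible in vector_helper.c (on hosts without 16-byte atomics the guards
evaluate to false, so the existing per-group path still runs):

    /*
     * Sketch of a portable 16-byte fast path. Like the original patch,
     * it assumes host + byte_offset is 16-byte aligned whenever
     * byte_offset is a multiple of 16.
     */
    if (!HOST_BIG_ENDIAN && (byte_offset % 16) == 0 &&
        byte_offset + 16 <= byte_end &&
        (is_load ? HAVE_ATOMIC128_RO : HAVE_ATOMIC128_RW)) {
        if (is_load) {
            /* One atomic 16-byte read from guest RAM... */
            Int128 t = atomic16_read_ro((Int128 *)(host + byte_offset));
            /* ...then a plain copy into the vcpu-private register file. */
            memcpy(vd + byte_offset, &t, 16);
        } else {
            Int128 t;
            memcpy(&t, vd + byte_offset, 16);
            /* One atomic 16-byte write to guest RAM. */
            atomic16_set((Int128 *)(host + byte_offset), t);
        }
        return 16;
    }

Only the guest-RAM side of the copy needs to be atomic: the vector register
buffer vd is private to the vcpu, so a plain memcpy suffices on that side.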

Patch

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 75c24653f0..b3d0be8e39 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -488,7 +488,19 @@  vext_group_ldst_host(CPURISCVState *env, void *vd, uint32_t byte_end,
     }
 
     fn = fns[is_load][group_size];
-    fn(vd, byte_offset, host + byte_offset);
+
+    /* x86 and AMD processors provide strong guarantees of atomicity for
+     * 16-byte memory operations if the memory operands are 16-byte aligned */
+    if (!HOST_BIG_ENDIAN && (byte_offset + 16 < byte_end) && ((byte_offset % 16) == 0) &&
+        ((cpuinfo & (CPUINFO_ATOMIC_VMOVDQA | CPUINFO_ATOMIC_VMOVDQU)) != 0)) {
+      group_size = MO_128;
+      if (is_load)
+        __builtin_memcpy((uint8_t *)(vd + byte_offset), (uint8_t *)(host + byte_offset), 16);
+      else
+        __builtin_memcpy((uint8_t *)(host + byte_offset), (uint8_t *)(vd + byte_offset), 16);
+    } else {
+      fn(vd, byte_offset, host + byte_offset);
+    }
 
     return 1 << group_size;
 }