diff mbox series

[V3,2/5] riscv: atomic: Optimize acquire and release for AMO operations

Message ID 20220420144417.2453958-3-guoren@kernel.org (mailing list archive)
State New, archived
Headers show
Series riscv: atomic: Optimize AMO instructions usage | expand

Commit Message

Guo Ren April 20, 2022, 2:44 p.m. UTC
From: Guo Ren <guoren@linux.alibaba.com>

The current acquire & release implementations from atomic-arch-
fallback.h use __atomic_acquire/release_fence(), which emits an
extra "fence r, rw" / "fence rw, w" instruction after/before the
AMO instruction. RISC-V AMO instructions can encode the acquire
and release semantics in the instruction itself, which saves a
fence instruction. Here is the relevant text from the RISC-V ISA,
section 10.4 "Atomic Memory Operations":

To help implement multiprocessor synchronization, the AMOs
optionally provide release consistency semantics.
 - .aq:   If the aq bit is set, then no later memory operations
          in this RISC-V hart can be observed to take place
          before the AMO.
 - .rl:   If the rl bit is set, then other RISC-V harts will not
          observe the AMO before memory accesses preceding the
          AMO in this RISC-V hart.
 - .aqrl: Setting both the aq and the rl bit on an AMO makes the
          sequence sequentially consistent, meaning that it cannot
          be reordered with earlier or later memory operations
          from the same hart.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Dan Lustig <dlustig@nvidia.com>
---
 arch/riscv/include/asm/atomic.h  | 64 ++++++++++++++++++++++++++++++++
 arch/riscv/include/asm/cmpxchg.h | 12 ++----
 2 files changed, 68 insertions(+), 8 deletions(-)

Comments

Guo Ren April 22, 2022, 3:43 a.m. UTC | #1
Ping Boqun & Daniel & Andrea,

Do you have any comments on the patch? It reverts commit 0123f4d76ca6
("riscv/spinlock: Strengthen implementations with fences").

But I think it is worth considering, because reducing the number of
fence instructions would improve performance on our hardware.

In RISC-V ISA manual
     - .aq:   If the aq bit is set, then no later memory operations
              in this RISC-V hart can be observed to take place
              before the AMO.
     - .rl:   If the rl bit is set, then other RISC-V harts will not
              observe the AMO before memory accesses preceding the
              AMO in this RISC-V hart.
     - .aqrl: Setting both the aq and the rl bit on an AMO makes the
              sequence sequentially consistent, meaning that it cannot
              be reordered with earlier or later memory operations
              from the same hart.
On Wed, Apr 20, 2022 at 10:44 PM <guoren@kernel.org> wrote:
>
> From: Guo Ren <guoren@linux.alibaba.com>
>
> Current acquire & release implementations from atomic-arch-
> fallback.h are using __atomic_acquire/release_fence(), it cause
> another extra "fence r, rw/fence rw,w" instruction after/before
> AMO instruction. RISC-V AMO instructions could combine acquire
> and release in the instruction self which could reduce a fence
> instruction. Here is from RISC-V ISA 10.4 Atomic Memory
> Operations:
>
> To help implement multiprocessor synchronization, the AMOs
> optionally provide release consistency semantics.
>  - .aq:   If the aq bit is set, then no later memory operations
>           in this RISC-V hart can be observed to take place
>           before the AMO.
>  - .rl:   If the rl bit is set, then other RISC-V harts will not
>           observe the AMO before memory accesses preceding the
>           AMO in this RISC-V hart.
>  - .aqrl: Setting both the aq and the rl bit on an AMO makes the
>           sequence sequentially consistent, meaning that it cannot
>           be reordered with earlier or later memory operations
>           from the same hart.
>
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> Cc: Palmer Dabbelt <palmer@dabbelt.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Andrea Parri <parri.andrea@gmail.com>
> Cc: Dan Lustig <dlustig@nvidia.com>
> ---
>  arch/riscv/include/asm/atomic.h  | 64 ++++++++++++++++++++++++++++++++
>  arch/riscv/include/asm/cmpxchg.h | 12 ++----
>  2 files changed, 68 insertions(+), 8 deletions(-)
>
> diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
> index ac9bdf4fc404..20ce8b83bc18 100644
> --- a/arch/riscv/include/asm/atomic.h
> +++ b/arch/riscv/include/asm/atomic.h
> @@ -99,6 +99,30 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i,          \
>         return ret;                                                     \
>  }                                                                      \
>  static __always_inline                                                 \
> +c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i,            \
> +                                            atomic##prefix##_t *v)     \
> +{                                                                      \
> +       register c_type ret;                                            \
> +       __asm__ __volatile__ (                                          \
> +               "       amo" #asm_op "." #asm_type ".aq %1, %2, %0"     \
> +               : "+A" (v->counter), "=r" (ret)                         \
> +               : "r" (I)                                               \
> +               : "memory");                                            \
> +       return ret;                                                     \
> +}                                                                      \
> +static __always_inline                                                 \
> +c_type arch_atomic##prefix##_fetch_##op##_release(c_type i,            \
> +                                            atomic##prefix##_t *v)     \
> +{                                                                      \
> +       register c_type ret;                                            \
> +       __asm__ __volatile__ (                                          \
> +               "       amo" #asm_op "." #asm_type ".rl %1, %2, %0"     \
> +               : "+A" (v->counter), "=r" (ret)                         \
> +               : "r" (I)                                               \
> +               : "memory");                                            \
> +       return ret;                                                     \
> +}                                                                      \
> +static __always_inline                                                 \
>  c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)       \
>  {                                                                      \
>         register c_type ret;                                            \
> @@ -118,6 +142,18 @@ c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i,               \
>          return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;        \
>  }                                                                      \
>  static __always_inline                                                 \
> +c_type arch_atomic##prefix##_##op##_return_acquire(c_type i,           \
> +                                             atomic##prefix##_t *v)    \
> +{                                                                      \
> +        return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I;        \
> +}                                                                      \
> +static __always_inline                                                 \
> +c_type arch_atomic##prefix##_##op##_return_release(c_type i,           \
> +                                             atomic##prefix##_t *v)    \
> +{                                                                      \
> +        return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I;        \
> +}                                                                      \
> +static __always_inline                                                 \
>  c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)    \
>  {                                                                      \
>          return arch_atomic##prefix##_fetch_##op(i, v) c_op I;          \
> @@ -140,22 +176,38 @@ ATOMIC_OPS(sub, add, +, -i)
>
>  #define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
>  #define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
> +#define arch_atomic_add_return_acquire arch_atomic_add_return_acquire
> +#define arch_atomic_sub_return_acquire arch_atomic_sub_return_acquire
> +#define arch_atomic_add_return_release arch_atomic_add_return_release
> +#define arch_atomic_sub_return_release arch_atomic_sub_return_release
>  #define arch_atomic_add_return         arch_atomic_add_return
>  #define arch_atomic_sub_return         arch_atomic_sub_return
>
>  #define arch_atomic_fetch_add_relaxed  arch_atomic_fetch_add_relaxed
>  #define arch_atomic_fetch_sub_relaxed  arch_atomic_fetch_sub_relaxed
> +#define arch_atomic_fetch_add_acquire  arch_atomic_fetch_add_acquire
> +#define arch_atomic_fetch_sub_acquire  arch_atomic_fetch_sub_acquire
> +#define arch_atomic_fetch_add_release  arch_atomic_fetch_add_release
> +#define arch_atomic_fetch_sub_release  arch_atomic_fetch_sub_release
>  #define arch_atomic_fetch_add          arch_atomic_fetch_add
>  #define arch_atomic_fetch_sub          arch_atomic_fetch_sub
>
>  #ifndef CONFIG_GENERIC_ATOMIC64
>  #define arch_atomic64_add_return_relaxed       arch_atomic64_add_return_relaxed
>  #define arch_atomic64_sub_return_relaxed       arch_atomic64_sub_return_relaxed
> +#define arch_atomic64_add_return_acquire       arch_atomic64_add_return_acquire
> +#define arch_atomic64_sub_return_acquire       arch_atomic64_sub_return_acquire
> +#define arch_atomic64_add_return_release       arch_atomic64_add_return_release
> +#define arch_atomic64_sub_return_release       arch_atomic64_sub_return_release
>  #define arch_atomic64_add_return               arch_atomic64_add_return
>  #define arch_atomic64_sub_return               arch_atomic64_sub_return
>
>  #define arch_atomic64_fetch_add_relaxed        arch_atomic64_fetch_add_relaxed
>  #define arch_atomic64_fetch_sub_relaxed        arch_atomic64_fetch_sub_relaxed
> +#define arch_atomic64_fetch_add_acquire        arch_atomic64_fetch_add_acquire
> +#define arch_atomic64_fetch_sub_acquire        arch_atomic64_fetch_sub_acquire
> +#define arch_atomic64_fetch_add_release        arch_atomic64_fetch_add_release
> +#define arch_atomic64_fetch_sub_release        arch_atomic64_fetch_sub_release
>  #define arch_atomic64_fetch_add                arch_atomic64_fetch_add
>  #define arch_atomic64_fetch_sub                arch_atomic64_fetch_sub
>  #endif
> @@ -178,6 +230,12 @@ ATOMIC_OPS(xor, xor, i)
>  #define arch_atomic_fetch_and_relaxed  arch_atomic_fetch_and_relaxed
>  #define arch_atomic_fetch_or_relaxed   arch_atomic_fetch_or_relaxed
>  #define arch_atomic_fetch_xor_relaxed  arch_atomic_fetch_xor_relaxed
> +#define arch_atomic_fetch_and_acquire  arch_atomic_fetch_and_acquire
> +#define arch_atomic_fetch_or_acquire   arch_atomic_fetch_or_acquire
> +#define arch_atomic_fetch_xor_acquire  arch_atomic_fetch_xor_acquire
> +#define arch_atomic_fetch_and_release  arch_atomic_fetch_and_release
> +#define arch_atomic_fetch_or_release   arch_atomic_fetch_or_release
> +#define arch_atomic_fetch_xor_release  arch_atomic_fetch_xor_release
>  #define arch_atomic_fetch_and          arch_atomic_fetch_and
>  #define arch_atomic_fetch_or           arch_atomic_fetch_or
>  #define arch_atomic_fetch_xor          arch_atomic_fetch_xor
> @@ -186,6 +244,12 @@ ATOMIC_OPS(xor, xor, i)
>  #define arch_atomic64_fetch_and_relaxed        arch_atomic64_fetch_and_relaxed
>  #define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or_relaxed
>  #define arch_atomic64_fetch_xor_relaxed        arch_atomic64_fetch_xor_relaxed
> +#define arch_atomic64_fetch_and_acquire        arch_atomic64_fetch_and_acquire
> +#define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or_acquire
> +#define arch_atomic64_fetch_xor_acquire        arch_atomic64_fetch_xor_acquire
> +#define arch_atomic64_fetch_and_release        arch_atomic64_fetch_and_release
> +#define arch_atomic64_fetch_or_release arch_atomic64_fetch_or_release
> +#define arch_atomic64_fetch_xor_release        arch_atomic64_fetch_xor_release
>  #define arch_atomic64_fetch_and                arch_atomic64_fetch_and
>  #define arch_atomic64_fetch_or         arch_atomic64_fetch_or
>  #define arch_atomic64_fetch_xor                arch_atomic64_fetch_xor
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 12debce235e5..1af8db92250b 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -52,16 +52,14 @@
>         switch (size) {                                                 \
>         case 4:                                                         \
>                 __asm__ __volatile__ (                                  \
> -                       "       amoswap.w %0, %2, %1\n"                 \
> -                       RISCV_ACQUIRE_BARRIER                           \
> +                       "       amoswap.w.aq %0, %2, %1\n"              \
>                         : "=r" (__ret), "+A" (*__ptr)                   \
>                         : "r" (__new)                                   \
>                         : "memory");                                    \
>                 break;                                                  \
>         case 8:                                                         \
>                 __asm__ __volatile__ (                                  \
> -                       "       amoswap.d %0, %2, %1\n"                 \
> -                       RISCV_ACQUIRE_BARRIER                           \
> +                       "       amoswap.d.aq %0, %2, %1\n"              \
>                         : "=r" (__ret), "+A" (*__ptr)                   \
>                         : "r" (__new)                                   \
>                         : "memory");                                    \
> @@ -87,16 +85,14 @@
>         switch (size) {                                                 \
>         case 4:                                                         \
>                 __asm__ __volatile__ (                                  \
> -                       RISCV_RELEASE_BARRIER                           \
> -                       "       amoswap.w %0, %2, %1\n"                 \
> +                       "       amoswap.w.rl %0, %2, %1\n"              \
>                         : "=r" (__ret), "+A" (*__ptr)                   \
>                         : "r" (__new)                                   \
>                         : "memory");                                    \
>                 break;                                                  \
>         case 8:                                                         \
>                 __asm__ __volatile__ (                                  \
> -                       RISCV_RELEASE_BARRIER                           \
> -                       "       amoswap.d %0, %2, %1\n"                 \
> +                       "       amoswap.d.rl %0, %2, %1\n"              \
>                         : "=r" (__ret), "+A" (*__ptr)                   \
>                         : "r" (__new)                                   \
>                         : "memory");                                    \
> --
> 2.25.1
>


--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index ac9bdf4fc404..20ce8b83bc18 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -99,6 +99,30 @@  c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i,		\
 	return ret;							\
 }									\
 static __always_inline							\
+c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i,		\
+					     atomic##prefix##_t *v)	\
+{									\
+	register c_type ret;						\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type ".aq %1, %2, %0"	\
+		: "+A" (v->counter), "=r" (ret)				\
+		: "r" (I)						\
+		: "memory");						\
+	return ret;							\
+}									\
+static __always_inline							\
+c_type arch_atomic##prefix##_fetch_##op##_release(c_type i,		\
+					     atomic##prefix##_t *v)	\
+{									\
+	register c_type ret;						\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type ".rl %1, %2, %0"	\
+		: "+A" (v->counter), "=r" (ret)				\
+		: "r" (I)						\
+		: "memory");						\
+	return ret;							\
+}									\
+static __always_inline							\
 c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)	\
 {									\
 	register c_type ret;						\
@@ -118,6 +142,18 @@  c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i,		\
         return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;	\
 }									\
 static __always_inline							\
+c_type arch_atomic##prefix##_##op##_return_acquire(c_type i,		\
+					      atomic##prefix##_t *v)	\
+{									\
+        return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I;	\
+}									\
+static __always_inline							\
+c_type arch_atomic##prefix##_##op##_return_release(c_type i,		\
+					      atomic##prefix##_t *v)	\
+{									\
+        return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I;	\
+}									\
+static __always_inline							\
 c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)	\
 {									\
         return arch_atomic##prefix##_fetch_##op(i, v) c_op I;		\
@@ -140,22 +176,38 @@  ATOMIC_OPS(sub, add, +, -i)
 
 #define arch_atomic_add_return_relaxed	arch_atomic_add_return_relaxed
 #define arch_atomic_sub_return_relaxed	arch_atomic_sub_return_relaxed
+#define arch_atomic_add_return_acquire	arch_atomic_add_return_acquire
+#define arch_atomic_sub_return_acquire	arch_atomic_sub_return_acquire
+#define arch_atomic_add_return_release	arch_atomic_add_return_release
+#define arch_atomic_sub_return_release	arch_atomic_sub_return_release
 #define arch_atomic_add_return		arch_atomic_add_return
 #define arch_atomic_sub_return		arch_atomic_sub_return
 
 #define arch_atomic_fetch_add_relaxed	arch_atomic_fetch_add_relaxed
 #define arch_atomic_fetch_sub_relaxed	arch_atomic_fetch_sub_relaxed
+#define arch_atomic_fetch_add_acquire	arch_atomic_fetch_add_acquire
+#define arch_atomic_fetch_sub_acquire	arch_atomic_fetch_sub_acquire
+#define arch_atomic_fetch_add_release	arch_atomic_fetch_add_release
+#define arch_atomic_fetch_sub_release	arch_atomic_fetch_sub_release
 #define arch_atomic_fetch_add		arch_atomic_fetch_add
 #define arch_atomic_fetch_sub		arch_atomic_fetch_sub
 
 #ifndef CONFIG_GENERIC_ATOMIC64
 #define arch_atomic64_add_return_relaxed	arch_atomic64_add_return_relaxed
 #define arch_atomic64_sub_return_relaxed	arch_atomic64_sub_return_relaxed
+#define arch_atomic64_add_return_acquire	arch_atomic64_add_return_acquire
+#define arch_atomic64_sub_return_acquire	arch_atomic64_sub_return_acquire
+#define arch_atomic64_add_return_release	arch_atomic64_add_return_release
+#define arch_atomic64_sub_return_release	arch_atomic64_sub_return_release
 #define arch_atomic64_add_return		arch_atomic64_add_return
 #define arch_atomic64_sub_return		arch_atomic64_sub_return
 
 #define arch_atomic64_fetch_add_relaxed	arch_atomic64_fetch_add_relaxed
 #define arch_atomic64_fetch_sub_relaxed	arch_atomic64_fetch_sub_relaxed
+#define arch_atomic64_fetch_add_acquire	arch_atomic64_fetch_add_acquire
+#define arch_atomic64_fetch_sub_acquire	arch_atomic64_fetch_sub_acquire
+#define arch_atomic64_fetch_add_release	arch_atomic64_fetch_add_release
+#define arch_atomic64_fetch_sub_release	arch_atomic64_fetch_sub_release
 #define arch_atomic64_fetch_add		arch_atomic64_fetch_add
 #define arch_atomic64_fetch_sub		arch_atomic64_fetch_sub
 #endif
@@ -178,6 +230,12 @@  ATOMIC_OPS(xor, xor, i)
 #define arch_atomic_fetch_and_relaxed	arch_atomic_fetch_and_relaxed
 #define arch_atomic_fetch_or_relaxed	arch_atomic_fetch_or_relaxed
 #define arch_atomic_fetch_xor_relaxed	arch_atomic_fetch_xor_relaxed
+#define arch_atomic_fetch_and_acquire	arch_atomic_fetch_and_acquire
+#define arch_atomic_fetch_or_acquire	arch_atomic_fetch_or_acquire
+#define arch_atomic_fetch_xor_acquire	arch_atomic_fetch_xor_acquire
+#define arch_atomic_fetch_and_release	arch_atomic_fetch_and_release
+#define arch_atomic_fetch_or_release	arch_atomic_fetch_or_release
+#define arch_atomic_fetch_xor_release	arch_atomic_fetch_xor_release
 #define arch_atomic_fetch_and		arch_atomic_fetch_and
 #define arch_atomic_fetch_or		arch_atomic_fetch_or
 #define arch_atomic_fetch_xor		arch_atomic_fetch_xor
@@ -186,6 +244,12 @@  ATOMIC_OPS(xor, xor, i)
 #define arch_atomic64_fetch_and_relaxed	arch_atomic64_fetch_and_relaxed
 #define arch_atomic64_fetch_or_relaxed	arch_atomic64_fetch_or_relaxed
 #define arch_atomic64_fetch_xor_relaxed	arch_atomic64_fetch_xor_relaxed
+#define arch_atomic64_fetch_and_acquire	arch_atomic64_fetch_and_acquire
+#define arch_atomic64_fetch_or_acquire	arch_atomic64_fetch_or_acquire
+#define arch_atomic64_fetch_xor_acquire	arch_atomic64_fetch_xor_acquire
+#define arch_atomic64_fetch_and_release	arch_atomic64_fetch_and_release
+#define arch_atomic64_fetch_or_release	arch_atomic64_fetch_or_release
+#define arch_atomic64_fetch_xor_release	arch_atomic64_fetch_xor_release
 #define arch_atomic64_fetch_and		arch_atomic64_fetch_and
 #define arch_atomic64_fetch_or		arch_atomic64_fetch_or
 #define arch_atomic64_fetch_xor		arch_atomic64_fetch_xor
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 12debce235e5..1af8db92250b 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -52,16 +52,14 @@ 
 	switch (size) {							\
 	case 4:								\
 		__asm__ __volatile__ (					\
-			"	amoswap.w %0, %2, %1\n"			\
-			RISCV_ACQUIRE_BARRIER				\
+			"	amoswap.w.aq %0, %2, %1\n"		\
 			: "=r" (__ret), "+A" (*__ptr)			\
 			: "r" (__new)					\
 			: "memory");					\
 		break;							\
 	case 8:								\
 		__asm__ __volatile__ (					\
-			"	amoswap.d %0, %2, %1\n"			\
-			RISCV_ACQUIRE_BARRIER				\
+			"	amoswap.d.aq %0, %2, %1\n"		\
 			: "=r" (__ret), "+A" (*__ptr)			\
 			: "r" (__new)					\
 			: "memory");					\
@@ -87,16 +85,14 @@ 
 	switch (size) {							\
 	case 4:								\
 		__asm__ __volatile__ (					\
-			RISCV_RELEASE_BARRIER				\
-			"	amoswap.w %0, %2, %1\n"			\
+			"	amoswap.w.rl %0, %2, %1\n"		\
 			: "=r" (__ret), "+A" (*__ptr)			\
 			: "r" (__new)					\
 			: "memory");					\
 		break;							\
 	case 8:								\
 		__asm__ __volatile__ (					\
-			RISCV_RELEASE_BARRIER				\
-			"	amoswap.d %0, %2, %1\n"			\
+			"	amoswap.d.rl %0, %2, %1\n"		\
 			: "=r" (__ret), "+A" (*__ptr)			\
 			: "r" (__new)					\
 			: "memory");					\