diff mbox series

[V11,03/17] riscv: Use Zicbop in arch_xchg when available

Message ID 20230910082911.3378782-4-guoren@kernel.org (mailing list archive)
State New, archived
Headers show
Series riscv: Add Native/Paravirt qspinlock support | expand

Commit Message

Guo Ren Sept. 10, 2023, 8:28 a.m. UTC
From: Guo Ren <guoren@linux.alibaba.com>

Cache-block prefetch instructions are HINTs to the hardware to
indicate that software intends to perform a particular type of
memory access in the near future. Enable ARCH_HAS_PREFETCHW and
improve the arch_xchg for qspinlock xchg_tail.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
---
 arch/riscv/Kconfig                 | 15 +++++++++++++++
 arch/riscv/include/asm/cmpxchg.h   |  4 +++-
 arch/riscv/include/asm/hwcap.h     |  1 +
 arch/riscv/include/asm/insn-def.h  |  5 +++++
 arch/riscv/include/asm/processor.h | 13 +++++++++++++
 arch/riscv/kernel/cpufeature.c     |  1 +
 6 files changed, 38 insertions(+), 1 deletion(-)

Comments

Leonardo Bras Sept. 13, 2023, 8:49 a.m. UTC | #1
On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Cache-block prefetch instructions are HINTs to the hardware to
> indicate that software intends to perform a particular type of
> memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> improve the arch_xchg for qspinlock xchg_tail.
> 
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> ---
>  arch/riscv/Kconfig                 | 15 +++++++++++++++
>  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
>  arch/riscv/include/asm/hwcap.h     |  1 +
>  arch/riscv/include/asm/insn-def.h  |  5 +++++
>  arch/riscv/include/asm/processor.h | 13 +++++++++++++
>  arch/riscv/kernel/cpufeature.c     |  1 +
>  6 files changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e9ae6fa232c3..2c346fe169c1 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
>  
>  	   If you don't know what to do here, say Y.
>  
> +config RISCV_ISA_ZICBOP
> +	bool "Zicbop extension support for cache block prefetch"
> +	depends on MMU
> +	depends on RISCV_ALTERNATIVE
> +	default y
> +	help
> +	   Adds support to dynamically detect the presence of the ZICBOP
> +	   extension (Cache Block Prefetch Operations) and enable its
> +	   usage.
> +
> +	   The Zicbop extension can be used to prefetch cache block for
> +	   read/write/instruction fetch.
> +
> +	   If you don't know what to do here, say Y.
> +
>  config TOOLCHAIN_HAS_ZIHINTPAUSE
>  	bool
>  	default y
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 702725727671..56eff7a9d2d2 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -11,6 +11,7 @@
>  
>  #include <asm/barrier.h>
>  #include <asm/fence.h>
> +#include <asm/processor.h>
>  
>  #define __arch_xchg_masked(prepend, append, r, p, n)			\
>  ({									\
> @@ -25,6 +26,7 @@
>  									\
>  	__asm__ __volatile__ (						\
>  	       prepend							\
> +	       PREFETCHW_ASM(%5)					\
>  	       "0:	lr.w %0, %2\n"					\
>  	       "	and  %1, %0, %z4\n"				\
>  	       "	or   %1, %1, %z3\n"				\
> @@ -32,7 +34,7 @@
>  	       "	bnez %1, 0b\n"					\
>  	       append							\
>  	       : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
> -	       : "rJ" (__newx), "rJ" (~__mask)				\
> +	       : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)		\
>  	       : "memory");						\
>  									\
>  	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index b7b58258f6c7..78b7b8b53778 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -58,6 +58,7 @@
>  #define RISCV_ISA_EXT_ZICSR		40
>  #define RISCV_ISA_EXT_ZIFENCEI		41
>  #define RISCV_ISA_EXT_ZIHPM		42
> +#define RISCV_ISA_EXT_ZICBOP		43
>  
>  #define RISCV_ISA_EXT_MAX		64
>  
> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> index 6960beb75f32..dc590d331894 100644
> --- a/arch/riscv/include/asm/insn-def.h
> +++ b/arch/riscv/include/asm/insn-def.h
> @@ -134,6 +134,7 @@
>  
>  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
>  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
>  
>  #define HFENCE_VVMA(vaddr, asid)				\
>  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> @@ -196,4 +197,8 @@
>  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
>  	       RS1(base), SIMM12(4))
>  
> +#define CBO_prefetchw(base)					\
> +	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
> +	       RD(x0), RS1(base), RS2(x0))
> +

I understand that here you create the instruction via bitfield, following 
the ISA, and this enables using instructions not available on the 
toolchain.

It took me some time to find the document with this instruction, so please 
add this to the commit msg:

https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
Page 23.

IIUC, the instruction is "prefetch.w".

Maybe I am missing something, but in the document the rs2 field 
(PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.

rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.

Is the above correct, or am I missing something?


Thanks!
Leo

>  #endif /* __ASM_INSN_DEF_H */
> diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> index de9da852f78d..7ad3a24212e8 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -12,6 +12,8 @@
>  #include <vdso/processor.h>
>  
>  #include <asm/ptrace.h>
> +#include <asm/insn-def.h>
> +#include <asm/hwcap.h>
>  
>  #ifdef CONFIG_64BIT
>  #define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
>  #define KSTK_EIP(tsk)		(ulong)(task_pt_regs(tsk)->epc)
>  #define KSTK_ESP(tsk)		(ulong)(task_pt_regs(tsk)->sp)
>  
> +#define ARCH_HAS_PREFETCHW
> +#define PREFETCHW_ASM(base)	ALTERNATIVE(__nops(1), \
> +					    CBO_prefetchw(base), \
> +					    0, \
> +					    RISCV_ISA_EXT_ZICBOP, \
> +					    CONFIG_RISCV_ISA_ZICBOP)
> +static inline void prefetchw(const void *ptr)
> +{
> +	asm volatile(PREFETCHW_ASM(%0)
> +		: : "r" (ptr) : "memory");
> +}
>  
>  /* Do necessary setup to start up a newly executed thread. */
>  extern void start_thread(struct pt_regs *regs,
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index ef7b4fd9e876..e0b897db0b97 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
>  	__RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
>  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>  	__RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> +	__RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
>  	__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
>  	__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
>  	__RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> -- 
> 2.36.1
>
Andrew Jones Sept. 14, 2023, 1:47 p.m. UTC | #2
On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Cache-block prefetch instructions are HINTs to the hardware to
> indicate that software intends to perform a particular type of
> memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> improve the arch_xchg for qspinlock xchg_tail.
> 
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> ---
>  arch/riscv/Kconfig                 | 15 +++++++++++++++
>  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
>  arch/riscv/include/asm/hwcap.h     |  1 +
>  arch/riscv/include/asm/insn-def.h  |  5 +++++
>  arch/riscv/include/asm/processor.h | 13 +++++++++++++
>  arch/riscv/kernel/cpufeature.c     |  1 +
>  6 files changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e9ae6fa232c3..2c346fe169c1 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
>  
>  	   If you don't know what to do here, say Y.
>  
> +config RISCV_ISA_ZICBOP
> +	bool "Zicbop extension support for cache block prefetch"
> +	depends on MMU
> +	depends on RISCV_ALTERNATIVE
> +	default y
> +	help
> +	   Adds support to dynamically detect the presence of the ZICBOP
> +	   extension (Cache Block Prefetch Operations) and enable its
> +	   usage.
> +
> +	   The Zicbop extension can be used to prefetch cache block for
> +	   read/write/instruction fetch.
> +
> +	   If you don't know what to do here, say Y.
> +
>  config TOOLCHAIN_HAS_ZIHINTPAUSE
>  	bool
>  	default y
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 702725727671..56eff7a9d2d2 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -11,6 +11,7 @@
>  
>  #include <asm/barrier.h>
>  #include <asm/fence.h>
> +#include <asm/processor.h>
>  
>  #define __arch_xchg_masked(prepend, append, r, p, n)			\
>  ({									\
> @@ -25,6 +26,7 @@
>  									\
>  	__asm__ __volatile__ (						\
>  	       prepend							\
> +	       PREFETCHW_ASM(%5)					\
>  	       "0:	lr.w %0, %2\n"					\
>  	       "	and  %1, %0, %z4\n"				\
>  	       "	or   %1, %1, %z3\n"				\
> @@ -32,7 +34,7 @@
>  	       "	bnez %1, 0b\n"					\
>  	       append							\
>  	       : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
> -	       : "rJ" (__newx), "rJ" (~__mask)				\
> +	       : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)		\
>  	       : "memory");						\
>  									\
>  	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index b7b58258f6c7..78b7b8b53778 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -58,6 +58,7 @@
>  #define RISCV_ISA_EXT_ZICSR		40
>  #define RISCV_ISA_EXT_ZIFENCEI		41
>  #define RISCV_ISA_EXT_ZIHPM		42
> +#define RISCV_ISA_EXT_ZICBOP		43
>  
>  #define RISCV_ISA_EXT_MAX		64
>  
> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> index 6960beb75f32..dc590d331894 100644
> --- a/arch/riscv/include/asm/insn-def.h
> +++ b/arch/riscv/include/asm/insn-def.h
> @@ -134,6 +134,7 @@
>  
>  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
>  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)

This should be named RV_OPCODE_OP_IMM and be placed in
numerical order with the others, i.e. above SYSTEM.

>  
>  #define HFENCE_VVMA(vaddr, asid)				\
>  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> @@ -196,4 +197,8 @@
>  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
>  	       RS1(base), SIMM12(4))
>  
> +#define CBO_prefetchw(base)					\

Please name this 'PREFETCH_w' and it should take an immediate parameter,
even if we intend to pass 0 for it.

> +	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
> +	       RD(x0), RS1(base), RS2(x0))

prefetch.w is not an R-type instruction, it's an S-type. While the bit
shifts are the same, the names are different. We need to add S-type
names while defining this instruction. Then, this define would be

 #define PREFETCH_w(base, imm) \
     INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
            RS1(base), __RS2(3))

When the assembler as insn_r I hope it will validate that
(imm & 0xfe0) == imm

> +
>  #endif /* __ASM_INSN_DEF_H */
> diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> index de9da852f78d..7ad3a24212e8 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -12,6 +12,8 @@
>  #include <vdso/processor.h>
>  
>  #include <asm/ptrace.h>
> +#include <asm/insn-def.h>
> +#include <asm/hwcap.h>
>  
>  #ifdef CONFIG_64BIT
>  #define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
>  #define KSTK_EIP(tsk)		(ulong)(task_pt_regs(tsk)->epc)
>  #define KSTK_ESP(tsk)		(ulong)(task_pt_regs(tsk)->sp)
>  
> +#define ARCH_HAS_PREFETCHW
> +#define PREFETCHW_ASM(base)	ALTERNATIVE(__nops(1), \
> +					    CBO_prefetchw(base), \
> +					    0, \
> +					    RISCV_ISA_EXT_ZICBOP, \
> +					    CONFIG_RISCV_ISA_ZICBOP)
> +static inline void prefetchw(const void *ptr)
> +{
> +	asm volatile(PREFETCHW_ASM(%0)
> +		: : "r" (ptr) : "memory");
> +}
>  
>  /* Do necessary setup to start up a newly executed thread. */
>  extern void start_thread(struct pt_regs *regs,
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index ef7b4fd9e876..e0b897db0b97 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
>  	__RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
>  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>  	__RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> +	__RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),

zicbop should be above zicboz (extensions alphabetical within their
category).

>  	__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
>  	__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
>  	__RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> -- 
> 2.36.1
>

Thanks,
drew
Andrew Jones Sept. 14, 2023, 2:25 p.m. UTC | #3
On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> From: Guo Ren <guoren@linux.alibaba.com>
> 
> Cache-block prefetch instructions are HINTs to the hardware to
> indicate that software intends to perform a particular type of
> memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> improve the arch_xchg for qspinlock xchg_tail.
> 
> Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren@kernel.org>
> ---
>  arch/riscv/Kconfig                 | 15 +++++++++++++++
>  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
>  arch/riscv/include/asm/hwcap.h     |  1 +
>  arch/riscv/include/asm/insn-def.h  |  5 +++++
>  arch/riscv/include/asm/processor.h | 13 +++++++++++++
>  arch/riscv/kernel/cpufeature.c     |  1 +
>  6 files changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e9ae6fa232c3..2c346fe169c1 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
>  
>  	   If you don't know what to do here, say Y.
>  
> +config RISCV_ISA_ZICBOP

Even if we're not concerned with looping over blocks yet, I think we
should introduce zicbop block size DT parsing at the same time we bring
zicbop support to the kernel (it's just more copy+paste from zicbom and
zicboz). It's a bit annoying that the CMO spec doesn't state that block
sizes should be the same for m/z/p. And, the fact that m/z/p are all
separate extensions leads us to needing to parse block sizes for all
three, despite the fact that in practice they'll probably be the same.

Thanks,
drew
Andrew Jones Sept. 14, 2023, 2:47 p.m. UTC | #4
On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> > 
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > improve the arch_xchg for qspinlock xchg_tail.
> > 
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Signed-off-by: Guo Ren <guoren@kernel.org>
> > ---
> >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> >  arch/riscv/include/asm/hwcap.h     |  1 +
> >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> >  arch/riscv/kernel/cpufeature.c     |  1 +
> >  6 files changed, 38 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e9ae6fa232c3..2c346fe169c1 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> >  
> >  	   If you don't know what to do here, say Y.
> >  
> > +config RISCV_ISA_ZICBOP
> 
> Even if we're not concerned with looping over blocks yet, I think we
> should introduce zicbop block size DT parsing at the same time we bring
> zicbop support to the kernel (it's just more copy+paste from zicbom and
> zicboz). It's a bit annoying that the CMO spec doesn't state that block
> sizes should be the same for m/z/p. And, the fact that m/z/p are all
> separate extensions leads us to needing to parse block sizes for all
> three, despite the fact that in practice they'll probably be the same.

Although, I saw on a different mailing list that Andrei Warkentin
interpreted section 2.7 "Software Discovery" of the spec, which states

"""
The initial set of CMO extensions requires the following information to be
discovered by software:

* The size of the cache block for management and prefetch instructions
* The size of the cache block for zero instructions
* CBIE support at each privilege level

Other general cache characteristics may also be specified in the discovery
mechanism.
"""

as management and prefetch having the same block size and only zero
potentially having a different size. That looks like a reasonable
interpretation to me, too. So, we could maybe proceed with assuming we
can use zicbom_block_size for prefetch, for now. If a platform comes along
that interpreted the spec differently, requiring prefetch block size to
be specified separately, then we'll cross that bridge when we get there.

Thanks,
drew
Leonardo Bras Sept. 15, 2023, 8:22 a.m. UTC | #5
On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> > 
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > improve the arch_xchg for qspinlock xchg_tail.
> > 
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Signed-off-by: Guo Ren <guoren@kernel.org>
> > ---
> >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> >  arch/riscv/include/asm/hwcap.h     |  1 +
> >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> >  arch/riscv/kernel/cpufeature.c     |  1 +
> >  6 files changed, 38 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e9ae6fa232c3..2c346fe169c1 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> >  
> >  	   If you don't know what to do here, say Y.
> >  
> > +config RISCV_ISA_ZICBOP
> > +	bool "Zicbop extension support for cache block prefetch"
> > +	depends on MMU
> > +	depends on RISCV_ALTERNATIVE
> > +	default y
> > +	help
> > +	   Adds support to dynamically detect the presence of the ZICBOP
> > +	   extension (Cache Block Prefetch Operations) and enable its
> > +	   usage.
> > +
> > +	   The Zicbop extension can be used to prefetch cache block for
> > +	   read/write/instruction fetch.
> > +
> > +	   If you don't know what to do here, say Y.
> > +
> >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> >  	bool
> >  	default y
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 702725727671..56eff7a9d2d2 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -11,6 +11,7 @@
> >  
> >  #include <asm/barrier.h>
> >  #include <asm/fence.h>
> > +#include <asm/processor.h>
> >  
> >  #define __arch_xchg_masked(prepend, append, r, p, n)			\
> >  ({									\
> > @@ -25,6 +26,7 @@
> >  									\
> >  	__asm__ __volatile__ (						\
> >  	       prepend							\
> > +	       PREFETCHW_ASM(%5)					\
> >  	       "0:	lr.w %0, %2\n"					\
> >  	       "	and  %1, %0, %z4\n"				\
> >  	       "	or   %1, %1, %z3\n"				\
> > @@ -32,7 +34,7 @@
> >  	       "	bnez %1, 0b\n"					\
> >  	       append							\
> >  	       : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
> > -	       : "rJ" (__newx), "rJ" (~__mask)				\
> > +	       : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)		\
> >  	       : "memory");						\
> >  									\
> >  	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index b7b58258f6c7..78b7b8b53778 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -58,6 +58,7 @@
> >  #define RISCV_ISA_EXT_ZICSR		40
> >  #define RISCV_ISA_EXT_ZIFENCEI		41
> >  #define RISCV_ISA_EXT_ZIHPM		42
> > +#define RISCV_ISA_EXT_ZICBOP		43
> >  
> >  #define RISCV_ISA_EXT_MAX		64
> >  
> > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > index 6960beb75f32..dc590d331894 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -134,6 +134,7 @@
> >  
> >  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
> >  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> > +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
> 
> This should be named RV_OPCODE_OP_IMM and be placed in
> numerical order with the others, i.e. above SYSTEM.
> 
> >  
> >  #define HFENCE_VVMA(vaddr, asid)				\
> >  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> > @@ -196,4 +197,8 @@
> >  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
> >  	       RS1(base), SIMM12(4))
> >  
> > +#define CBO_prefetchw(base)					\
> 
> Please name this 'PREFETCH_w' and it should take an immediate parameter,
> even if we intend to pass 0 for it.

It makes sense.

The mnemonic in the previously mentioned documentation is:

prefetch.w offset(base)

So yeah, makes sense to have both offset and base as parameters for 
CBO_prefetchw (or PREFETCH_w, I have no strong preference).

> 
> > +	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
> > +	       RD(x0), RS1(base), RS2(x0))
> 
> prefetch.w is not an R-type instruction, it's an S-type. While the bit
> shifts are the same, the names are different. We need to add S-type
> names while defining this instruction. 

That is correct, it is supposed to look like a store instruction (S-type), 
even though documentation don't explicitly state that.

Even though it works fine with the R-type definition, code documentation 
would be wrong, and future changes could break it.

> Then, this define would be
> 
>  #define PREFETCH_w(base, imm) \
>      INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
>             RS1(base), __RS2(3))

s/OPCODE_OP_IMM/OPCODE_PREFETCH
0x4 vs 0x13

RS2 == 0x3 is correct (PREFETCH.W instead of PREFETCH.I)


So IIUC, it should be:

INSN_S(OPCODE_PREFETCH, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
       RS1(base), __RS2(3)

Thanks,
Leo


> 
> When the assembler as insn_r I hope it will validate that
> (imm & 0xfe0) == imm
> 
> > +
> >  #endif /* __ASM_INSN_DEF_H */
> > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > index de9da852f78d..7ad3a24212e8 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -12,6 +12,8 @@
> >  #include <vdso/processor.h>
> >  
> >  #include <asm/ptrace.h>
> > +#include <asm/insn-def.h>
> > +#include <asm/hwcap.h>
> >  
> >  #ifdef CONFIG_64BIT
> >  #define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> >  #define KSTK_EIP(tsk)		(ulong)(task_pt_regs(tsk)->epc)
> >  #define KSTK_ESP(tsk)		(ulong)(task_pt_regs(tsk)->sp)
> >  
> > +#define ARCH_HAS_PREFETCHW
> > +#define PREFETCHW_ASM(base)	ALTERNATIVE(__nops(1), \
> > +					    CBO_prefetchw(base), \
> > +					    0, \
> > +					    RISCV_ISA_EXT_ZICBOP, \
> > +					    CONFIG_RISCV_ISA_ZICBOP)
> > +static inline void prefetchw(const void *ptr)
> > +{
> > +	asm volatile(PREFETCHW_ASM(%0)
> > +		: : "r" (ptr) : "memory");
> > +}
> >  
> >  /* Do necessary setup to start up a newly executed thread. */
> >  extern void start_thread(struct pt_regs *regs,
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index ef7b4fd9e876..e0b897db0b97 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> >  	__RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> >  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> >  	__RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > +	__RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> 
> zicbop should be above zicboz (extensions alphabetical within their
> category).
> 
> >  	__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> >  	__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> >  	__RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > -- 
> > 2.36.1
> >
> 
> Thanks,
> drew
>
Andrew Jones Sept. 15, 2023, 11:07 a.m. UTC | #6
On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > From: Guo Ren <guoren@linux.alibaba.com>
...
> > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > index 6960beb75f32..dc590d331894 100644
> > > --- a/arch/riscv/include/asm/insn-def.h
> > > +++ b/arch/riscv/include/asm/insn-def.h
> > > @@ -134,6 +134,7 @@
> > >  
> > >  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
> > >  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> > > +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
> > 
> > This should be named RV_OPCODE_OP_IMM and be placed in
> > numerical order with the others, i.e. above SYSTEM.
> > 
> > >  
> > >  #define HFENCE_VVMA(vaddr, asid)				\
> > >  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> > > @@ -196,4 +197,8 @@
> > >  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
> > >  	       RS1(base), SIMM12(4))
> > >  
> > > +#define CBO_prefetchw(base)					\
> > 
> > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > even if we intend to pass 0 for it.
> 
> It makes sense.
> 
> The mnemonic in the previously mentioned documentation is:
> 
> prefetch.w offset(base)
> 
> So yeah, makes sense to have both offset and base as parameters for 
> CBO_prefetchw (or PREFETCH_w, I have no strong preference).

I have a strong preference :-)

PREFETCH_w is consistent with the naming we already have for e.g.
cbo.clean, which is CBO_clean. The instruction we're picking a name
for now is prefetch.w, not cbo.prefetchw.

> 
> > 
> > > +	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
> > > +	       RD(x0), RS1(base), RS2(x0))
> > 
> > prefetch.w is not an R-type instruction, it's an S-type. While the bit
> > shifts are the same, the names are different. We need to add S-type
> > names while defining this instruction. 
> 
> That is correct, it is supposed to look like a store instruction (S-type), 
> even though documentation don't explicitly state that.
> 
> Even though it works fine with the R-type definition, code documentation 
> would be wrong, and future changes could break it.
> 
> > Then, this define would be
> > 
> >  #define PREFETCH_w(base, imm) \

I should have suggested 'offset' instead of 'imm' for the second parameter
name.

> >      INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
> >             RS1(base), __RS2(3))
> 
> s/OPCODE_OP_IMM/OPCODE_PREFETCH
> 0x4 vs 0x13

There's no major opcode named "PREFETCH" and the spec says that the major
opcode used for prefetch instructions is OP-IMM. That's why we want to
name this OPCODE_OP_IMM. I'm not sure where the 0x4 you're referring to
comes from. A 32-bit instruction has the lowest two bits set (figure 1.1
of the unpriv spec) and table 27.1 of the unpriv spec shows OP-IMM is
0b00100xx, so we have 0b0010011. Keeping the naming of the opcode macros
consistent with the spec also keeps them consistent with the .insn
directive where we could even use the names directly, i.e.

 .insn s OP_IMM, 6, x3, 0(a0)

> > 
> > When the assembler as insn_r I hope it will validate that

I meant insn_s here, which would be the macro for '.insn s'

> > (imm & 0xfe0) == imm

I played with it. It won't do what we want for prefetch, only
what works for s-type instructions in general, i.e. it allows
+/-2047 offsets and fails for everything else. That's good enough.
We can just mask off the low 5 bits here in our macro

 #define PREFETCH_w(base, offset) \
    INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5((offset) & ~0x1f), \
           __IMM_4_0(0), RS1(base), __RS2(3))

Thanks,
drew
Conor Dooley Sept. 15, 2023, 11:26 a.m. UTC | #7
On Fri, Sep 15, 2023 at 01:07:40PM +0200, Andrew Jones wrote:
> On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> > On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> ...
> > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > index 6960beb75f32..dc590d331894 100644
> > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > @@ -134,6 +134,7 @@
> > > >  
> > > >  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
> > > >  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> > > > +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
> > > 
> > > This should be named RV_OPCODE_OP_IMM and be placed in
> > > numerical order with the others, i.e. above SYSTEM.
> > > 
> > > >  
> > > >  #define HFENCE_VVMA(vaddr, asid)				\
> > > >  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> > > > @@ -196,4 +197,8 @@
> > > >  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
> > > >  	       RS1(base), SIMM12(4))
> > > >  
> > > > +#define CBO_prefetchw(base)					\
> > > 
> > > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > > even if we intend to pass 0 for it.
> > 
> > It makes sense.
> > 
> > The mnemonic in the previously mentioned documentation is:
> > 
> > prefetch.w offset(base)
> > 
> > So yeah, makes sense to have both offset and base as parameters for 
> > CBO_prefetchw (or PREFETCH_w, I have no strong preference).
> 
> I have a strong preference :-)
> 
> PREFETCH_w is consistent with the naming we already have for e.g.
> cbo.clean, which is CBO_clean. The instruction we're picking a name
> for now is prefetch.w, not cbo.prefetchw.

btw, the CBO_foo stuff was named that way as we were using them in
alternatives originally as an argument, that manifested as:
"cbo." __stringify(_op) " (a0)\n\t"
That was later changed to
CBO_##_op(a0)
but the then un-needed (AFAICT) capitalisation was kept to avoid
touching the callsites of the alternative. Maybe you remember better
than I do drew, since the idea was yours & I forgot I even wrote that
pattch.
If this isn't being used in a similar manner, then the w has no reason
to be in the odd lowercase form.

Cheers,
Conor.

> 
> > 
> > > 
> > > > +	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
> > > > +	       RD(x0), RS1(base), RS2(x0))
> > > 
> > > prefetch.w is not an R-type instruction, it's an S-type. While the bit
> > > shifts are the same, the names are different. We need to add S-type
> > > names while defining this instruction. 
> > 
> > That is correct, it is supposed to look like a store instruction (S-type), 
> > even though documentation don't explicitly state that.
> > 
> > Even though it works fine with the R-type definition, code documentation 
> > would be wrong, and future changes could break it.
> > 
> > > Then, this define would be
> > > 
> > >  #define PREFETCH_w(base, imm) \
> 
> I should have suggested 'offset' instead of 'imm' for the second parameter
> name.
> 
> > >      INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
> > >             RS1(base), __RS2(3))
> > 
> > s/OPCODE_OP_IMM/OPCODE_PREFETCH
> > 0x4 vs 0x13
> 
> There's no major opcode named "PREFETCH" and the spec says that the major
> opcode used for prefetch instructions is OP-IMM. That's why we want to
> name this OPCODE_OP_IMM. I'm not sure where the 0x4 you're referring to
> comes from. A 32-bit instruction has the lowest two bits set (figure 1.1
> of the unpriv spec) and table 27.1 of the unpriv spec shows OP-IMM is
> 0b00100xx, so we have 0b0010011. Keeping the naming of the opcode macros
> consistent with the spec also keeps them consistent with the .insn
> directive where we could even use the names directly, i.e.
> 
>  .insn s OP_IMM, 6, x3, 0(a0)
> 
> > > 
> > > When the assembler as insn_r I hope it will validate that
> 
> I meant insn_s here, which would be the macro for '.insn s'
> 
> > > (imm & 0xfe0) == imm
> 
> I played with it. It won't do what we want for prefetch, only
> what works for s-type instructions in general, i.e. it allows
> +/-2047 offsets and fails for everything else. That's good enough.
> We can just mask off the low 5 bits here in our macro
> 
>  #define PREFETCH_w(base, offset) \
>     INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5((offset) & ~0x1f), \
>            __IMM_4_0(0), RS1(base), __RS2(3))
> 
> Thanks,
> drew
Conor Dooley Sept. 15, 2023, 11:37 a.m. UTC | #8
Yo,

On Thu, Sep 14, 2023 at 04:47:18PM +0200, Andrew Jones wrote:
> On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > > 
> > > Cache-block prefetch instructions are HINTs to the hardware to
> > > indicate that software intends to perform a particular type of
> > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > improve the arch_xchg for qspinlock xchg_tail.
> > > 
> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > ---
> > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index e9ae6fa232c3..2c346fe169c1 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > >  
> > >  	   If you don't know what to do here, say Y.
> > >  
> > > +config RISCV_ISA_ZICBOP
> > 
> > Even if we're not concerned with looping over blocks yet, I think we
> > should introduce zicbop block size DT parsing at the same time we bring
> > zicbop support to the kernel (it's just more copy+paste from zicbom and
> > zicboz). It's a bit annoying that the CMO spec doesn't state that block
> > sizes should be the same for m/z/p. And, the fact that m/z/p are all
> > separate extensions leads us to needing to parse block sizes for all
> > three, despite the fact that in practice they'll probably be the same.
> 
> Although, I saw on a different mailing list that Andrei Warkentin
> interpreted section 2.7 "Software Discovery" of the spec, which states
> 
> """
> The initial set of CMO extensions requires the following information to be
> discovered by software:
> 
> * The size of the cache block for management and prefetch instructions
> * The size of the cache block for zero instructions
> * CBIE support at each privilege level
> 
> Other general cache characteristics may also be specified in the discovery
> mechanism.
> """
> 
> as management and prefetch having the same block size and only zero
> potentially having a different size. That looks like a reasonable
> interpretation to me, too.

TBH, I don't really care what ambiguous wording the spec has used, we
have the opportunity to make better decisions if we please. I hate the
fact that the specs are often not abundantly clear about things like this.

> So, we could maybe proceed with assuming we
> can use zicbom_block_size for prefetch, for now. If a platform comes along
> that interpreted the spec differently, requiring prefetch block size to
> be specified separately, then we'll cross that bridge when we get there.

That said, I think I suggested originally having the zicboz stuff default
to the zicbom size too, so I'd be happy with prefetch stuff working
exclusively that way until someone comes along looking for different sizes.
The binding should be updated though since

  riscv,cbom-block-size:
    $ref: /schemas/types.yaml#/definitions/uint32
    description:
      The blocksize in bytes for the Zicbom cache operations.

would no longer be a complete description.

While thinking about new wording though, it feels really clunky to describe
it like:
	The block size in bytes for the Zicbom cache operations, Zicbop
	cache operations will default to this block size where not
	explicitly defined.

since there's then no way to actually define the block size if it is
different. Unless you've got some magic wording, I'd rather document
riscv,cbop-block-size, even if we are going to use riscv,cbom-block-size
as the default.

Cheers,
Conor.
Andrew Jones Sept. 15, 2023, 12:14 p.m. UTC | #9
On Fri, Sep 15, 2023 at 12:37:50PM +0100, Conor Dooley wrote:
> Yo,
> 
> On Thu, Sep 14, 2023 at 04:47:18PM +0200, Andrew Jones wrote:
> > On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > 
> > > > Cache-block prefetch instructions are HINTs to the hardware to
> > > > indicate that software intends to perform a particular type of
> > > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > > improve the arch_xchg for qspinlock xchg_tail.
> > > > 
> > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > ---
> > > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > index e9ae6fa232c3..2c346fe169c1 100644
> > > > --- a/arch/riscv/Kconfig
> > > > +++ b/arch/riscv/Kconfig
> > > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > > >  
> > > >  	   If you don't know what to do here, say Y.
> > > >  
> > > > +config RISCV_ISA_ZICBOP
> > > 
> > > Even if we're not concerned with looping over blocks yet, I think we
> > > should introduce zicbop block size DT parsing at the same time we bring
> > > zicbop support to the kernel (it's just more copy+paste from zicbom and
> > > zicboz). It's a bit annoying that the CMO spec doesn't state that block
> > > sizes should be the same for m/z/p. And, the fact that m/z/p are all
> > > separate extensions leads us to needing to parse block sizes for all
> > > three, despite the fact that in practice they'll probably be the same.
> > 
> > Although, I saw on a different mailing list that Andrei Warkentin
> > interpreted section 2.7 "Software Discovery" of the spec, which states
> > 
> > """
> > The initial set of CMO extensions requires the following information to be
> > discovered by software:
> > 
> > * The size of the cache block for management and prefetch instructions
> > * The size of the cache block for zero instructions
> > * CBIE support at each privilege level
> > 
> > Other general cache characteristics may also be specified in the discovery
> > mechanism.
> > """
> > 
> > as management and prefetch having the same block size and only zero
> > potentially having a different size. That looks like a reasonable
> > interpretation to me, too.
> 
> TBH, I don't really care what ambiguous wording the spec has used, we
> have the opportunity to make better decisions if we please. I hate the
> fact that the specs are often not abundantly clear about things like this.
> 
> > So, we could maybe proceed with assuming we
> > can use zicbom_block_size for prefetch, for now. If a platform comes along
> > that interpreted the spec differently, requiring prefetch block size to
> > be specified separately, then we'll cross that bridge when we get there.
> 
> That said, I think I suggested originally having the zicboz stuff default
> to the zicbom size too, so I'd be happy with prefetch stuff working
> exclusively that way until someone comes along looking for different sizes.
> The binding should be updated though since
> 
>   riscv,cbom-block-size:
>     $ref: /schemas/types.yaml#/definitions/uint32
>     description:
>       The blocksize in bytes for the Zicbom cache operations.
> 
> would no longer be a complete description.
> 
> While thinking about new wording though, it feels really clunky to describe
> it like:
> 	The block size in bytes for the Zicbom cache operations, Zicbop
> 	cache operations will default to this block size where not
> 	explicitly defined.
> 
> since there's then no way to actually define the block size if it is
> different. Unless you've got some magic wording, I'd rather document
> riscv,cbop-block-size, even if we are going to use riscv,cbom-block-size
> as the default.
>

Sounds good to me, but if it's documented, then we should probably
implement its parsing. Then, at that point, I wonder if it makes sense to
have the fallback at all, or if it's not better just to require all the
DTs to be explicit (even if redundant).

Thanks,
drew
Andrew Jones Sept. 15, 2023, 12:22 p.m. UTC | #10
On Fri, Sep 15, 2023 at 12:26:20PM +0100, Conor Dooley wrote:
> On Fri, Sep 15, 2023 at 01:07:40PM +0200, Andrew Jones wrote:
> > On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> > > On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > ...
> > > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > > index 6960beb75f32..dc590d331894 100644
> > > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > > @@ -134,6 +134,7 @@
> > > > >  
> > > > >  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
> > > > >  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> > > > > +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
> > > > 
> > > > This should be named RV_OPCODE_OP_IMM and be placed in
> > > > numerical order with the others, i.e. above SYSTEM.
> > > > 
> > > > >  
> > > > >  #define HFENCE_VVMA(vaddr, asid)				\
> > > > >  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> > > > > @@ -196,4 +197,8 @@
> > > > >  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
> > > > >  	       RS1(base), SIMM12(4))
> > > > >  
> > > > > +#define CBO_prefetchw(base)					\
> > > > 
> > > > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > > > even if we intend to pass 0 for it.
> > > 
> > > It makes sense.
> > > 
> > > The mnemonic in the previously mentioned documentation is:
> > > 
> > > prefetch.w offset(base)
> > > 
> > > So yeah, makes sense to have both offset and base as parameters for 
> > > CBO_prefetchw (or PREFETCH_w, I have no strong preference).
> > 
> > I have a strong preference :-)
> > 
> > PREFETCH_w is consistent with the naming we already have for e.g.
> > cbo.clean, which is CBO_clean. The instruction we're picking a name
> > for now is prefetch.w, not cbo.prefetchw.
> 
> btw, the CBO_foo stuff was named that way as we were using them in
> alternatives originally as an argument, that manifested as:
> "cbo." __stringify(_op) " (a0)\n\t"
> That was later changed to
> CBO_##_op(a0)
> but the then un-needed (AFAICT) capitalisation was kept to avoid
> touching the callsites of the alternative. Maybe you remember better
> than I do drew, since the idea was yours & I forgot I even wrote that
> pattch.

And I forgot anything I may have suggested about it :-)

> If this isn't being used in a similar manner, then the w has no reason
> to be in the odd lowercase form.

Other than to be consistent... However, the CBO_* instructions are not
consistent with the rest of macros. If we don't need lowercase for any
reason, then my preference would be to bite the bullet and change all the
callsites of CBO_* macros and then introduce this new instruction as
PREFETCH_W

Thanks,
drew
Guo Ren Sept. 15, 2023, 12:36 p.m. UTC | #11
On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras <leobras@redhat.com> wrote:
>
> On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > From: Guo Ren <guoren@linux.alibaba.com>
> >
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > improve the arch_xchg for qspinlock xchg_tail.
> >
> > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > Signed-off-by: Guo Ren <guoren@kernel.org>
> > ---
> >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> >  arch/riscv/include/asm/hwcap.h     |  1 +
> >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> >  arch/riscv/kernel/cpufeature.c     |  1 +
> >  6 files changed, 38 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e9ae6fa232c3..2c346fe169c1 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> >
> >          If you don't know what to do here, say Y.
> >
> > +config RISCV_ISA_ZICBOP
> > +     bool "Zicbop extension support for cache block prefetch"
> > +     depends on MMU
> > +     depends on RISCV_ALTERNATIVE
> > +     default y
> > +     help
> > +        Adds support to dynamically detect the presence of the ZICBOP
> > +        extension (Cache Block Prefetch Operations) and enable its
> > +        usage.
> > +
> > +        The Zicbop extension can be used to prefetch cache block for
> > +        read/write/instruction fetch.
> > +
> > +        If you don't know what to do here, say Y.
> > +
> >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> >       bool
> >       default y
> > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > index 702725727671..56eff7a9d2d2 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -11,6 +11,7 @@
> >
> >  #include <asm/barrier.h>
> >  #include <asm/fence.h>
> > +#include <asm/processor.h>
> >
> >  #define __arch_xchg_masked(prepend, append, r, p, n)                 \
> >  ({                                                                   \
> > @@ -25,6 +26,7 @@
> >                                                                       \
> >       __asm__ __volatile__ (                                          \
> >              prepend                                                  \
> > +            PREFETCHW_ASM(%5)                                        \
> >              "0:      lr.w %0, %2\n"                                  \
> >              "        and  %1, %0, %z4\n"                             \
> >              "        or   %1, %1, %z3\n"                             \
> > @@ -32,7 +34,7 @@
> >              "        bnez %1, 0b\n"                                  \
> >              append                                                   \
> >              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
> > -            : "rJ" (__newx), "rJ" (~__mask)                          \
> > +            : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)         \
> >              : "memory");                                             \
> >                                                                       \
> >       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index b7b58258f6c7..78b7b8b53778 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -58,6 +58,7 @@
> >  #define RISCV_ISA_EXT_ZICSR          40
> >  #define RISCV_ISA_EXT_ZIFENCEI               41
> >  #define RISCV_ISA_EXT_ZIHPM          42
> > +#define RISCV_ISA_EXT_ZICBOP         43
> >
> >  #define RISCV_ISA_EXT_MAX            64
> >
> > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > index 6960beb75f32..dc590d331894 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -134,6 +134,7 @@
> >
> >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> >  #define RV_OPCODE_SYSTEM     RV_OPCODE(115)
> > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> >
> >  #define HFENCE_VVMA(vaddr, asid)                             \
> >       INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),              \
> > @@ -196,4 +197,8 @@
> >       INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),              \
> >              RS1(base), SIMM12(4))
> >
> > +#define CBO_prefetchw(base)                                  \
> > +     INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > +            RD(x0), RS1(base), RS2(x0))
> > +
>
> I understand that here you create the instruction via bitfield, following
> the ISA, and this enables using instructions not available on the
> toolchain.
>
> It took me some time to find the document with this instruction, so please
> add this to the commit msg:
>
> https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
> Page 23.
>
> IIUC, the instruction is "prefetch.w".
>
> Maybe I am missing something, but in the document the rs2 field
> (PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.
>
> rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.
>
> Is the above correct, or am I missing something?
Oh, you are right. My fault, thx for pointing out. It should be:
+       INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
+              RD(x0), RS1(base), RS2(x3))

>
>
> Thanks!
> Leo
>
> >  #endif /* __ASM_INSN_DEF_H */
> > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > index de9da852f78d..7ad3a24212e8 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -12,6 +12,8 @@
> >  #include <vdso/processor.h>
> >
> >  #include <asm/ptrace.h>
> > +#include <asm/insn-def.h>
> > +#include <asm/hwcap.h>
> >
> >  #ifdef CONFIG_64BIT
> >  #define DEFAULT_MAP_WINDOW   (UL(1) << (MMAP_VA_BITS - 1))
> > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> >  #define KSTK_EIP(tsk)                (ulong)(task_pt_regs(tsk)->epc)
> >  #define KSTK_ESP(tsk)                (ulong)(task_pt_regs(tsk)->sp)
> >
> > +#define ARCH_HAS_PREFETCHW
> > +#define PREFETCHW_ASM(base)  ALTERNATIVE(__nops(1), \
> > +                                         CBO_prefetchw(base), \
> > +                                         0, \
> > +                                         RISCV_ISA_EXT_ZICBOP, \
> > +                                         CONFIG_RISCV_ISA_ZICBOP)
> > +static inline void prefetchw(const void *ptr)
> > +{
> > +     asm volatile(PREFETCHW_ASM(%0)
> > +             : : "r" (ptr) : "memory");
> > +}
> >
> >  /* Do necessary setup to start up a newly executed thread. */
> >  extern void start_thread(struct pt_regs *regs,
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index ef7b4fd9e876..e0b897db0b97 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> >       __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> >       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> >       __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > +     __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> >       __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> >       __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> >       __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > --
> > 2.36.1
> >
>
Conor Dooley Sept. 15, 2023, 12:42 p.m. UTC | #12
> > If this isn't being used in a similar manner, then the w has no reason
> > to be in the odd lowercase form.
> 
> Other than to be consistent... However, the CBO_* instructions are not
> consistent with the rest of macros. If we don't need lowercase for any
> reason, then my preference would be to bite the bullet and change all the
> callsites of CBO_* macros and then introduce this new instruction as
> PREFETCH_W

Aye, I probably should've done it to begin with. Maybe there was some
other consideration at the time.
Conor Dooley Sept. 15, 2023, 12:53 p.m. UTC | #13
On Fri, Sep 15, 2023 at 02:14:40PM +0200, Andrew Jones wrote:
> On Fri, Sep 15, 2023 at 12:37:50PM +0100, Conor Dooley wrote:
> > On Thu, Sep 14, 2023 at 04:47:18PM +0200, Andrew Jones wrote:
> > > On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > > From: Guo Ren <guoren@linux.alibaba.com>

> > > So, we could maybe proceed with assuming we
> > > can use zicbom_block_size for prefetch, for now. If a platform comes along
> > > that interpreted the spec differently, requiring prefetch block size to
> > > be specified separately, then we'll cross that bridge when we get there.
> > 
> > That said, I think I suggested originally having the zicboz stuff default
> > to the zicbom size too, so I'd be happy with prefetch stuff working
> > exclusively that way until someone comes along looking for different sizes.
> > The binding should be updated though since
> > 
> >   riscv,cbom-block-size:
> >     $ref: /schemas/types.yaml#/definitions/uint32
> >     description:
> >       The blocksize in bytes for the Zicbom cache operations.
> > 
> > would no longer be a complete description.
> > 
> > While thinking about new wording though, it feels really clunky to describe
> > it like:
> > 	The block size in bytes for the Zicbom cache operations, Zicbop
> > 	cache operations will default to this block size where not
> > 	explicitly defined.
> > 
> > since there's then no way to actually define the block size if it is
> > different. Unless you've got some magic wording, I'd rather document
> > riscv,cbop-block-size, even if we are going to use riscv,cbom-block-size
> > as the default.
> >
> 
> Sounds good to me, but if it's documented, then we should probably
> implement its parsing. Then, at that point, I wonder if it makes sense to
> have the fallback at all, or if it's not better just to require all the
> DTs to be explicit (even if redundant).

Sure, why not I guess.
Leonardo Bras Sept. 15, 2023, 8:32 p.m. UTC | #14
On Fri, Sep 15, 2023 at 01:07:40PM +0200, Andrew Jones wrote:
> On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> > On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> ...
> > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > index 6960beb75f32..dc590d331894 100644
> > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > @@ -134,6 +134,7 @@
> > > >  
> > > >  #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
> > > >  #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
> > > > +#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
> > > 
> > > This should be named RV_OPCODE_OP_IMM and be placed in
> > > numerical order with the others, i.e. above SYSTEM.
> > > 
> > > >  
> > > >  #define HFENCE_VVMA(vaddr, asid)				\
> > > >  	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
> > > > @@ -196,4 +197,8 @@
> > > >  	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
> > > >  	       RS1(base), SIMM12(4))
> > > >  
> > > > +#define CBO_prefetchw(base)					\
> > > 
> > > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > > even if we intend to pass 0 for it.
> > 
> > It makes sense.
> > 
> > The mnemonic in the previously mentioned documentation is:
> > 
> > prefetch.w offset(base)
> > 
> > So yeah, makes sense to have both offset and base as parameters for 
> > CBO_prefetchw (or PREFETCH_w, I have no strong preference).
> 
> I have a strong preference :-)
> 
> PREFETCH_w is consistent with the naming we already have for e.g.
> cbo.clean, which is CBO_clean. The instruction we're picking a name
> for now is prefetch.w, not cbo.prefetchw.
> 
> > 
> > > 
> > > > +	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
> > > > +	       RD(x0), RS1(base), RS2(x0))
> > > 
> > > prefetch.w is not an R-type instruction, it's an S-type. While the bit
> > > shifts are the same, the names are different. We need to add S-type
> > > names while defining this instruction. 
> > 
> > That is correct, it is supposed to look like a store instruction (S-type), 
> > even though documentation don't explicitly state that.
> > 
> > Even though it works fine with the R-type definition, code documentation 
> > would be wrong, and future changes could break it.
> > 
> > > Then, this define would be
> > > 
> > >  #define PREFETCH_w(base, imm) \
> 
> I should have suggested 'offset' instead of 'imm' for the second parameter
> name.
> 
> > >      INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
> > >             RS1(base), __RS2(3))
> > 
> > s/OPCODE_OP_IMM/OPCODE_PREFETCH
> > 0x4 vs 0x13
> 
> There's no major opcode named "PREFETCH" and the spec says that the major
> opcode used for prefetch instructions is OP-IMM. That's why we want to
> name this OPCODE_OP_IMM. I'm not sure where the 0x4 you're referring to
> comes from

Oh, you are right.

Sorry about this, I misinterpreted table 24.1 from the 
Unprivileged ISA (20191213). 

Yeap, everything make sense now, and the define below is not actually 
needed:

> > > > +#define RV_OPCODE_PREFETCH     RV_OPCODE(19)

Thanks!
Leo


> . A 32-bit instruction has the lowest two bits set (figure 1.1
> of the unpriv spec) and table 27.1 of the unpriv spec shows OP-IMM is
> 0b00100xx, so we have 0b0010011. Keeping the naming of the opcode macros
> consistent with the spec also keeps them consistent with the .insn
> directive where we could even use the names directly, i.e.
> 
>  .insn s OP_IMM, 6, x3, 0(a0)
> 
> > > 
> > > When the assembler as insn_r I hope it will validate that
> 
> I meant insn_s here, which would be the macro for '.insn s'
> 
> > > (imm & 0xfe0) == imm
> 
> I played with it. It won't do what we want for prefetch, only
> what works for s-type instructions in general, i.e. it allows
> +/-2047 offsets and fails for everything else. That's good enough.
> We can just mask off the low 5 bits here in our macro
> 
>  #define PREFETCH_w(base, offset) \
>     INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5((offset) & ~0x1f), \
>            __IMM_4_0(0), RS1(base), __RS2(3))
> 
> Thanks,
> drew
>
Conor Dooley Sept. 16, 2023, 12:05 a.m. UTC | #15
On Fri, Sep 15, 2023 at 01:42:56PM +0100, Conor Dooley wrote:
> 
> > > If this isn't being used in a similar manner, then the w has no reason
> > > to be in the odd lowercase form.
> > 
> > Other than to be consistent... However, the CBO_* instructions are not
> > consistent with the rest of macros. If we don't need lowercase for any
> > reason, then my preference would be to bite the bullet and change all the
> > callsites of CBO_* macros and then introduce this new instruction as
> > PREFETCH_W
> 
> Aye, I probably should've done it to begin with. Maybe there was some
> other consideration at the time.

FWIW, I sent a patch for this earlier today. I figure you saw it Drew,
but nonetheless:
https://lore.kernel.org/all/20230915-aloe-dollar-994937477776@spud/
Leonardo Bras Sept. 16, 2023, 1:25 a.m. UTC | #16
On Fri, Sep 15, 2023 at 08:36:31PM +0800, Guo Ren wrote:
> On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > From: Guo Ren <guoren@linux.alibaba.com>
> > >
> > > Cache-block prefetch instructions are HINTs to the hardware to
> > > indicate that software intends to perform a particular type of
> > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > improve the arch_xchg for qspinlock xchg_tail.
> > >
> > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > ---
> > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index e9ae6fa232c3..2c346fe169c1 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > >
> > >          If you don't know what to do here, say Y.
> > >
> > > +config RISCV_ISA_ZICBOP
> > > +     bool "Zicbop extension support for cache block prefetch"
> > > +     depends on MMU
> > > +     depends on RISCV_ALTERNATIVE
> > > +     default y
> > > +     help
> > > +        Adds support to dynamically detect the presence of the ZICBOP
> > > +        extension (Cache Block Prefetch Operations) and enable its
> > > +        usage.
> > > +
> > > +        The Zicbop extension can be used to prefetch cache block for
> > > +        read/write/instruction fetch.
> > > +
> > > +        If you don't know what to do here, say Y.
> > > +
> > >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > >       bool
> > >       default y
> > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > index 702725727671..56eff7a9d2d2 100644
> > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > @@ -11,6 +11,7 @@
> > >
> > >  #include <asm/barrier.h>
> > >  #include <asm/fence.h>
> > > +#include <asm/processor.h>
> > >
> > >  #define __arch_xchg_masked(prepend, append, r, p, n)                 \
> > >  ({                                                                   \
> > > @@ -25,6 +26,7 @@
> > >                                                                       \
> > >       __asm__ __volatile__ (                                          \
> > >              prepend                                                  \
> > > +            PREFETCHW_ASM(%5)                                        \
> > >              "0:      lr.w %0, %2\n"                                  \
> > >              "        and  %1, %0, %z4\n"                             \
> > >              "        or   %1, %1, %z3\n"                             \
> > > @@ -32,7 +34,7 @@
> > >              "        bnez %1, 0b\n"                                  \
> > >              append                                                   \
> > >              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
> > > -            : "rJ" (__newx), "rJ" (~__mask)                          \
> > > +            : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)         \
> > >              : "memory");                                             \
> > >                                                                       \
> > >       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
> > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > index b7b58258f6c7..78b7b8b53778 100644
> > > --- a/arch/riscv/include/asm/hwcap.h
> > > +++ b/arch/riscv/include/asm/hwcap.h
> > > @@ -58,6 +58,7 @@
> > >  #define RISCV_ISA_EXT_ZICSR          40
> > >  #define RISCV_ISA_EXT_ZIFENCEI               41
> > >  #define RISCV_ISA_EXT_ZIHPM          42
> > > +#define RISCV_ISA_EXT_ZICBOP         43
> > >
> > >  #define RISCV_ISA_EXT_MAX            64
> > >
> > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > index 6960beb75f32..dc590d331894 100644
> > > --- a/arch/riscv/include/asm/insn-def.h
> > > +++ b/arch/riscv/include/asm/insn-def.h
> > > @@ -134,6 +134,7 @@
> > >
> > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > >  #define RV_OPCODE_SYSTEM     RV_OPCODE(115)
> > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > >
> > >  #define HFENCE_VVMA(vaddr, asid)                             \
> > >       INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),              \
> > > @@ -196,4 +197,8 @@
> > >       INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),              \
> > >              RS1(base), SIMM12(4))
> > >
> > > +#define CBO_prefetchw(base)                                  \
> > > +     INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > +            RD(x0), RS1(base), RS2(x0))
> > > +
> >
> > I understand that here you create the instruction via bitfield, following
> > the ISA, and this enables using instructions not available on the
> > toolchain.
> >
> > It took me some time to find the document with this instruction, so please
> > add this to the commit msg:
> >
> > https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
> > Page 23.
> >
> > IIUC, the instruction is "prefetch.w".
> >
> > Maybe I am missing something, but in the document the rs2 field
> > (PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.
> >
> > rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.
> >
> > Is the above correct, or am I missing something?
> Oh, you are right. My fault, thx for pointing out. It should be:
> +       INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> +              RD(x0), RS1(base), RS2(x3))

Now I am curious to check if / how will this impact performance. :)
(Please let me know)


> 
> >
> >
> > Thanks!
> > Leo
> >
> > >  #endif /* __ASM_INSN_DEF_H */
> > > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > > index de9da852f78d..7ad3a24212e8 100644
> > > --- a/arch/riscv/include/asm/processor.h
> > > +++ b/arch/riscv/include/asm/processor.h
> > > @@ -12,6 +12,8 @@
> > >  #include <vdso/processor.h>
> > >
> > >  #include <asm/ptrace.h>
> > > +#include <asm/insn-def.h>
> > > +#include <asm/hwcap.h>
> > >
> > >  #ifdef CONFIG_64BIT
> > >  #define DEFAULT_MAP_WINDOW   (UL(1) << (MMAP_VA_BITS - 1))
> > > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > >  #define KSTK_EIP(tsk)                (ulong)(task_pt_regs(tsk)->epc)
> > >  #define KSTK_ESP(tsk)                (ulong)(task_pt_regs(tsk)->sp)
> > >
> > > +#define ARCH_HAS_PREFETCHW
> > > +#define PREFETCHW_ASM(base)  ALTERNATIVE(__nops(1), \
> > > +                                         CBO_prefetchw(base), \
> > > +                                         0, \
> > > +                                         RISCV_ISA_EXT_ZICBOP, \
> > > +                                         CONFIG_RISCV_ISA_ZICBOP)
> > > +static inline void prefetchw(const void *ptr)
> > > +{
> > > +     asm volatile(PREFETCHW_ASM(%0)
> > > +             : : "r" (ptr) : "memory");
> > > +}
> > >
> > >  /* Do necessary setup to start up a newly executed thread. */
> > >  extern void start_thread(struct pt_regs *regs,
> > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > index ef7b4fd9e876..e0b897db0b97 100644
> > > --- a/arch/riscv/kernel/cpufeature.c
> > > +++ b/arch/riscv/kernel/cpufeature.c
> > > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > >       __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > >       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > >       __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > > +     __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > >       __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > >       __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > >       __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > > --
> > > 2.36.1
> > >
> >
> 
> 
> -- 
> Best Regards
>  Guo Ren
>
Guo Ren Sept. 17, 2023, 2:34 p.m. UTC | #17
On Sat, Sep 16, 2023 at 9:25 AM Leonardo Bras <leobras@redhat.com> wrote:
>
> On Fri, Sep 15, 2023 at 08:36:31PM +0800, Guo Ren wrote:
> > On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras <leobras@redhat.com> wrote:
> > >
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > >
> > > > Cache-block prefetch instructions are HINTs to the hardware to
> > > > indicate that software intends to perform a particular type of
> > > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > > improve the arch_xchg for qspinlock xchg_tail.
> > > >
> > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > ---
> > > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > index e9ae6fa232c3..2c346fe169c1 100644
> > > > --- a/arch/riscv/Kconfig
> > > > +++ b/arch/riscv/Kconfig
> > > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > > >
> > > >          If you don't know what to do here, say Y.
> > > >
> > > > +config RISCV_ISA_ZICBOP
> > > > +     bool "Zicbop extension support for cache block prefetch"
> > > > +     depends on MMU
> > > > +     depends on RISCV_ALTERNATIVE
> > > > +     default y
> > > > +     help
> > > > +        Adds support to dynamically detect the presence of the ZICBOP
> > > > +        extension (Cache Block Prefetch Operations) and enable its
> > > > +        usage.
> > > > +
> > > > +        The Zicbop extension can be used to prefetch cache block for
> > > > +        read/write/instruction fetch.
> > > > +
> > > > +        If you don't know what to do here, say Y.
> > > > +
> > > >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > > >       bool
> > > >       default y
> > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > index 702725727671..56eff7a9d2d2 100644
> > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > @@ -11,6 +11,7 @@
> > > >
> > > >  #include <asm/barrier.h>
> > > >  #include <asm/fence.h>
> > > > +#include <asm/processor.h>
> > > >
> > > >  #define __arch_xchg_masked(prepend, append, r, p, n)                 \
> > > >  ({                                                                   \
> > > > @@ -25,6 +26,7 @@
> > > >                                                                       \
> > > >       __asm__ __volatile__ (                                          \
> > > >              prepend                                                  \
> > > > +            PREFETCHW_ASM(%5)                                        \
> > > >              "0:      lr.w %0, %2\n"                                  \
> > > >              "        and  %1, %0, %z4\n"                             \
> > > >              "        or   %1, %1, %z3\n"                             \
> > > > @@ -32,7 +34,7 @@
> > > >              "        bnez %1, 0b\n"                                  \
> > > >              append                                                   \
> > > >              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
> > > > -            : "rJ" (__newx), "rJ" (~__mask)                          \
> > > > +            : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)         \
> > > >              : "memory");                                             \
> > > >                                                                       \
> > > >       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
> > > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > index b7b58258f6c7..78b7b8b53778 100644
> > > > --- a/arch/riscv/include/asm/hwcap.h
> > > > +++ b/arch/riscv/include/asm/hwcap.h
> > > > @@ -58,6 +58,7 @@
> > > >  #define RISCV_ISA_EXT_ZICSR          40
> > > >  #define RISCV_ISA_EXT_ZIFENCEI               41
> > > >  #define RISCV_ISA_EXT_ZIHPM          42
> > > > +#define RISCV_ISA_EXT_ZICBOP         43
> > > >
> > > >  #define RISCV_ISA_EXT_MAX            64
> > > >
> > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > index 6960beb75f32..dc590d331894 100644
> > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > @@ -134,6 +134,7 @@
> > > >
> > > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > > >  #define RV_OPCODE_SYSTEM     RV_OPCODE(115)
> > > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > > >
> > > >  #define HFENCE_VVMA(vaddr, asid)                             \
> > > >       INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),              \
> > > > @@ -196,4 +197,8 @@
> > > >       INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),              \
> > > >              RS1(base), SIMM12(4))
> > > >
> > > > +#define CBO_prefetchw(base)                                  \
> > > > +     INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > > +            RD(x0), RS1(base), RS2(x0))
> > > > +
> > >
> > > I understand that here you create the instruction via bitfield, following
> > > the ISA, and this enables using instructions not available on the
> > > toolchain.
> > >
> > > It took me some time to find the document with this instruction, so please
> > > add this to the commit msg:
> > >
> > > https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
> > > Page 23.
> > >
> > > IIUC, the instruction is "prefetch.w".
> > >
> > > Maybe I am missing something, but in the document the rs2 field
> > > (PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.
> > >
> > > rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.
> > >
> > > Is the above correct, or am I missing something?
> > Oh, you are right. My fault, thx for pointing out. It should be:
> > +       INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > +              RD(x0), RS1(base), RS2(x3))
>
> Now I am curious to check if / how will this impact performance. :)
> (Please let me know)
Ref:
commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word
for write prior to stxr")
commit 86d231459d6d ("bpf: cpumap memory prefetchw optimizations for
struct page")

>
>
> >
> > >
> > >
> > > Thanks!
> > > Leo
> > >
> > > >  #endif /* __ASM_INSN_DEF_H */
> > > > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > > > index de9da852f78d..7ad3a24212e8 100644
> > > > --- a/arch/riscv/include/asm/processor.h
> > > > +++ b/arch/riscv/include/asm/processor.h
> > > > @@ -12,6 +12,8 @@
> > > >  #include <vdso/processor.h>
> > > >
> > > >  #include <asm/ptrace.h>
> > > > +#include <asm/insn-def.h>
> > > > +#include <asm/hwcap.h>
> > > >
> > > >  #ifdef CONFIG_64BIT
> > > >  #define DEFAULT_MAP_WINDOW   (UL(1) << (MMAP_VA_BITS - 1))
> > > > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > > >  #define KSTK_EIP(tsk)                (ulong)(task_pt_regs(tsk)->epc)
> > > >  #define KSTK_ESP(tsk)                (ulong)(task_pt_regs(tsk)->sp)
> > > >
> > > > +#define ARCH_HAS_PREFETCHW
> > > > +#define PREFETCHW_ASM(base)  ALTERNATIVE(__nops(1), \
> > > > +                                         CBO_prefetchw(base), \
> > > > +                                         0, \
> > > > +                                         RISCV_ISA_EXT_ZICBOP, \
> > > > +                                         CONFIG_RISCV_ISA_ZICBOP)
> > > > +static inline void prefetchw(const void *ptr)
> > > > +{
> > > > +     asm volatile(PREFETCHW_ASM(%0)
> > > > +             : : "r" (ptr) : "memory");
> > > > +}
> > > >
> > > >  /* Do necessary setup to start up a newly executed thread. */
> > > >  extern void start_thread(struct pt_regs *regs,
> > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > index ef7b4fd9e876..e0b897db0b97 100644
> > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > > >       __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > > >       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > > >       __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > > > +     __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > > >       __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > > >       __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > > >       __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > > > --
> > > > 2.36.1
> > > >
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
>
Leonardo Bras Sept. 19, 2023, 5:13 a.m. UTC | #18
On Sun, Sep 17, 2023 at 10:34:36PM +0800, Guo Ren wrote:
> On Sat, Sep 16, 2023 at 9:25 AM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > On Fri, Sep 15, 2023 at 08:36:31PM +0800, Guo Ren wrote:
> > > On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras <leobras@redhat.com> wrote:
> > > >
> > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > >
> > > > > Cache-block prefetch instructions are HINTs to the hardware to
> > > > > indicate that software intends to perform a particular type of
> > > > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > > > improve the arch_xchg for qspinlock xchg_tail.
> > > > >
> > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > > ---
> > > > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > > > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > > > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > > > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > > > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > > > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > > > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > index e9ae6fa232c3..2c346fe169c1 100644
> > > > > --- a/arch/riscv/Kconfig
> > > > > +++ b/arch/riscv/Kconfig
> > > > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > > > >
> > > > >          If you don't know what to do here, say Y.
> > > > >
> > > > > +config RISCV_ISA_ZICBOP
> > > > > +     bool "Zicbop extension support for cache block prefetch"
> > > > > +     depends on MMU
> > > > > +     depends on RISCV_ALTERNATIVE
> > > > > +     default y
> > > > > +     help
> > > > > +        Adds support to dynamically detect the presence of the ZICBOP
> > > > > +        extension (Cache Block Prefetch Operations) and enable its
> > > > > +        usage.
> > > > > +
> > > > > +        The Zicbop extension can be used to prefetch cache block for
> > > > > +        read/write/instruction fetch.
> > > > > +
> > > > > +        If you don't know what to do here, say Y.
> > > > > +
> > > > >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > > > >       bool
> > > > >       default y
> > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > index 702725727671..56eff7a9d2d2 100644
> > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > @@ -11,6 +11,7 @@
> > > > >
> > > > >  #include <asm/barrier.h>
> > > > >  #include <asm/fence.h>
> > > > > +#include <asm/processor.h>
> > > > >
> > > > >  #define __arch_xchg_masked(prepend, append, r, p, n)                 \
> > > > >  ({                                                                   \
> > > > > @@ -25,6 +26,7 @@
> > > > >                                                                       \
> > > > >       __asm__ __volatile__ (                                          \
> > > > >              prepend                                                  \
> > > > > +            PREFETCHW_ASM(%5)                                        \
> > > > >              "0:      lr.w %0, %2\n"                                  \
> > > > >              "        and  %1, %0, %z4\n"                             \
> > > > >              "        or   %1, %1, %z3\n"                             \
> > > > > @@ -32,7 +34,7 @@
> > > > >              "        bnez %1, 0b\n"                                  \
> > > > >              append                                                   \
> > > > >              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
> > > > > -            : "rJ" (__newx), "rJ" (~__mask)                          \
> > > > > +            : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)         \
> > > > >              : "memory");                                             \
> > > > >                                                                       \
> > > > >       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
> > > > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > index b7b58258f6c7..78b7b8b53778 100644
> > > > > --- a/arch/riscv/include/asm/hwcap.h
> > > > > +++ b/arch/riscv/include/asm/hwcap.h
> > > > > @@ -58,6 +58,7 @@
> > > > >  #define RISCV_ISA_EXT_ZICSR          40
> > > > >  #define RISCV_ISA_EXT_ZIFENCEI               41
> > > > >  #define RISCV_ISA_EXT_ZIHPM          42
> > > > > +#define RISCV_ISA_EXT_ZICBOP         43
> > > > >
> > > > >  #define RISCV_ISA_EXT_MAX            64
> > > > >
> > > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > > index 6960beb75f32..dc590d331894 100644
> > > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > > @@ -134,6 +134,7 @@
> > > > >
> > > > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > > > >  #define RV_OPCODE_SYSTEM     RV_OPCODE(115)
> > > > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > > > >
> > > > >  #define HFENCE_VVMA(vaddr, asid)                             \
> > > > >       INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),              \
> > > > > @@ -196,4 +197,8 @@
> > > > >       INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),              \
> > > > >              RS1(base), SIMM12(4))
> > > > >
> > > > > +#define CBO_prefetchw(base)                                  \
> > > > > +     INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > > > +            RD(x0), RS1(base), RS2(x0))
> > > > > +
> > > >
> > > > I understand that here you create the instruction via bitfield, following
> > > > the ISA, and this enables using instructions not available on the
> > > > toolchain.
> > > >
> > > > It took me some time to find the document with this instruction, so please
> > > > add this to the commit msg:
> > > >
> > > > https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
> > > > Page 23.
> > > >
> > > > IIUC, the instruction is "prefetch.w".
> > > >
> > > > Maybe I am missing something, but in the document the rs2 field
> > > > (PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.
> > > >
> > > > rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.
> > > >
> > > > Is the above correct, or am I missing something?
> > > Oh, you are right. My fault, thx for pointing out. It should be:
> > > +       INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > +              RD(x0), RS1(base), RS2(x3))
> >
> > Now I am curious to check if / how will this impact performance. :)
> > (Please let me know)
> Ref:
> commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word
> for write prior to stxr")
> commit 86d231459d6d ("bpf: cpumap memory prefetchw optimizations for
> struct page")

Oh, I understand that prefetch.w is very useful for performance :)

What I meant is that previously this patch was issuing a prefetch.i,
and now it's issuing a prefetch.w (as intended). 

What got me curious is how much would it impact the performance to change 
the prefetch.i to prefetch.w. :)

Thanks!
Leo


> 
> >
> >
> > >
> > > >
> > > >
> > > > Thanks!
> > > > Leo
> > > >
> > > > >  #endif /* __ASM_INSN_DEF_H */
> > > > > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > > > > index de9da852f78d..7ad3a24212e8 100644
> > > > > --- a/arch/riscv/include/asm/processor.h
> > > > > +++ b/arch/riscv/include/asm/processor.h
> > > > > @@ -12,6 +12,8 @@
> > > > >  #include <vdso/processor.h>
> > > > >
> > > > >  #include <asm/ptrace.h>
> > > > > +#include <asm/insn-def.h>
> > > > > +#include <asm/hwcap.h>
> > > > >
> > > > >  #ifdef CONFIG_64BIT
> > > > >  #define DEFAULT_MAP_WINDOW   (UL(1) << (MMAP_VA_BITS - 1))
> > > > > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > > > >  #define KSTK_EIP(tsk)                (ulong)(task_pt_regs(tsk)->epc)
> > > > >  #define KSTK_ESP(tsk)                (ulong)(task_pt_regs(tsk)->sp)
> > > > >
> > > > > +#define ARCH_HAS_PREFETCHW
> > > > > +#define PREFETCHW_ASM(base)  ALTERNATIVE(__nops(1), \
> > > > > +                                         CBO_prefetchw(base), \
> > > > > +                                         0, \
> > > > > +                                         RISCV_ISA_EXT_ZICBOP, \
> > > > > +                                         CONFIG_RISCV_ISA_ZICBOP)
> > > > > +static inline void prefetchw(const void *ptr)
> > > > > +{
> > > > > +     asm volatile(PREFETCHW_ASM(%0)
> > > > > +             : : "r" (ptr) : "memory");
> > > > > +}
> > > > >
> > > > >  /* Do necessary setup to start up a newly executed thread. */
> > > > >  extern void start_thread(struct pt_regs *regs,
> > > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > > index ef7b4fd9e876..e0b897db0b97 100644
> > > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > > > >       __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > > > >       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > > > >       __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > > > > +     __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > > > >       __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > > > >       __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > > > >       __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > > > > --
> > > > > 2.36.1
> > > > >
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
> >
> 
> 
> -- 
> Best Regards
>  Guo Ren
>
Guo Ren Sept. 19, 2023, 7:53 a.m. UTC | #19
On Tue, Sep 19, 2023 at 1:13 PM Leonardo Bras <leobras@redhat.com> wrote:
>
> On Sun, Sep 17, 2023 at 10:34:36PM +0800, Guo Ren wrote:
> > On Sat, Sep 16, 2023 at 9:25 AM Leonardo Bras <leobras@redhat.com> wrote:
> > >
> > > On Fri, Sep 15, 2023 at 08:36:31PM +0800, Guo Ren wrote:
> > > > On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras <leobras@redhat.com> wrote:
> > > > >
> > > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > > >
> > > > > > Cache-block prefetch instructions are HINTs to the hardware to
> > > > > > indicate that software intends to perform a particular type of
> > > > > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > > > > improve the arch_xchg for qspinlock xchg_tail.
> > > > > >
> > > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > > > ---
> > > > > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > > > > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > > > > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > > > > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > > > > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > > > > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > > > > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > > index e9ae6fa232c3..2c346fe169c1 100644
> > > > > > --- a/arch/riscv/Kconfig
> > > > > > +++ b/arch/riscv/Kconfig
> > > > > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > > > > >
> > > > > >          If you don't know what to do here, say Y.
> > > > > >
> > > > > > +config RISCV_ISA_ZICBOP
> > > > > > +     bool "Zicbop extension support for cache block prefetch"
> > > > > > +     depends on MMU
> > > > > > +     depends on RISCV_ALTERNATIVE
> > > > > > +     default y
> > > > > > +     help
> > > > > > +        Adds support to dynamically detect the presence of the ZICBOP
> > > > > > +        extension (Cache Block Prefetch Operations) and enable its
> > > > > > +        usage.
> > > > > > +
> > > > > > +        The Zicbop extension can be used to prefetch cache block for
> > > > > > +        read/write/instruction fetch.
> > > > > > +
> > > > > > +        If you don't know what to do here, say Y.
> > > > > > +
> > > > > >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > > > > >       bool
> > > > > >       default y
> > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > index 702725727671..56eff7a9d2d2 100644
> > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > @@ -11,6 +11,7 @@
> > > > > >
> > > > > >  #include <asm/barrier.h>
> > > > > >  #include <asm/fence.h>
> > > > > > +#include <asm/processor.h>
> > > > > >
> > > > > >  #define __arch_xchg_masked(prepend, append, r, p, n)                 \
> > > > > >  ({                                                                   \
> > > > > > @@ -25,6 +26,7 @@
> > > > > >                                                                       \
> > > > > >       __asm__ __volatile__ (                                          \
> > > > > >              prepend                                                  \
> > > > > > +            PREFETCHW_ASM(%5)                                        \
> > > > > >              "0:      lr.w %0, %2\n"                                  \
> > > > > >              "        and  %1, %0, %z4\n"                             \
> > > > > >              "        or   %1, %1, %z3\n"                             \
> > > > > > @@ -32,7 +34,7 @@
> > > > > >              "        bnez %1, 0b\n"                                  \
> > > > > >              append                                                   \
> > > > > >              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
> > > > > > -            : "rJ" (__newx), "rJ" (~__mask)                          \
> > > > > > +            : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)         \
> > > > > >              : "memory");                                             \
> > > > > >                                                                       \
> > > > > >       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
> > > > > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > > index b7b58258f6c7..78b7b8b53778 100644
> > > > > > --- a/arch/riscv/include/asm/hwcap.h
> > > > > > +++ b/arch/riscv/include/asm/hwcap.h
> > > > > > @@ -58,6 +58,7 @@
> > > > > >  #define RISCV_ISA_EXT_ZICSR          40
> > > > > >  #define RISCV_ISA_EXT_ZIFENCEI               41
> > > > > >  #define RISCV_ISA_EXT_ZIHPM          42
> > > > > > +#define RISCV_ISA_EXT_ZICBOP         43
> > > > > >
> > > > > >  #define RISCV_ISA_EXT_MAX            64
> > > > > >
> > > > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > > > index 6960beb75f32..dc590d331894 100644
> > > > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > > > @@ -134,6 +134,7 @@
> > > > > >
> > > > > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > > > > >  #define RV_OPCODE_SYSTEM     RV_OPCODE(115)
> > > > > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > > > > >
> > > > > >  #define HFENCE_VVMA(vaddr, asid)                             \
> > > > > >       INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),              \
> > > > > > @@ -196,4 +197,8 @@
> > > > > >       INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),              \
> > > > > >              RS1(base), SIMM12(4))
> > > > > >
> > > > > > +#define CBO_prefetchw(base)                                  \
> > > > > > +     INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > > > > +            RD(x0), RS1(base), RS2(x0))
> > > > > > +
> > > > >
> > > > > I understand that here you create the instruction via bitfield, following
> > > > > the ISA, and this enables using instructions not available on the
> > > > > toolchain.
> > > > >
> > > > > It took me some time to find the document with this instruction, so please
> > > > > add this to the commit msg:
> > > > >
> > > > > https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
> > > > > Page 23.
> > > > >
> > > > > IIUC, the instruction is "prefetch.w".
> > > > >
> > > > > Maybe I am missing something, but in the document the rs2 field
> > > > > (PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.
> > > > >
> > > > > rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.
> > > > >
> > > > > Is the above correct, or am I missing something?
> > > > Oh, you are right. My fault, thx for pointing out. It should be:
> > > > +       INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > > +              RD(x0), RS1(base), RS2(x3))
> > >
> > > Now I am curious to check if / how will this impact performance. :)
> > > (Please let me know)
> > Ref:
> > commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word
> > for write prior to stxr")
> > commit 86d231459d6d ("bpf: cpumap memory prefetchw optimizations for
> > struct page")
>
> Oh, I understand that prefetch.w is very useful for performance :)
>
> What I meant is that previously this patch was issuing a prefetch.i,
> and now it's issuing a prefetch.w (as intended).
>
> What got me curious is how much would it impact the performance to change
> the prefetch.i to prefetch.w. :)
The current SOPHO sg2042 hardware platform didn't support prefetch.w
instruction. So there is no performance result I could share with you.

Our next generation of processors would support ZICBOP.

>
> Thanks!
> Leo
>
>
> >
> > >
> > >
> > > >
> > > > >
> > > > >
> > > > > Thanks!
> > > > > Leo
> > > > >
> > > > > >  #endif /* __ASM_INSN_DEF_H */
> > > > > > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > > > > > index de9da852f78d..7ad3a24212e8 100644
> > > > > > --- a/arch/riscv/include/asm/processor.h
> > > > > > +++ b/arch/riscv/include/asm/processor.h
> > > > > > @@ -12,6 +12,8 @@
> > > > > >  #include <vdso/processor.h>
> > > > > >
> > > > > >  #include <asm/ptrace.h>
> > > > > > +#include <asm/insn-def.h>
> > > > > > +#include <asm/hwcap.h>
> > > > > >
> > > > > >  #ifdef CONFIG_64BIT
> > > > > >  #define DEFAULT_MAP_WINDOW   (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > > > > >  #define KSTK_EIP(tsk)                (ulong)(task_pt_regs(tsk)->epc)
> > > > > >  #define KSTK_ESP(tsk)                (ulong)(task_pt_regs(tsk)->sp)
> > > > > >
> > > > > > +#define ARCH_HAS_PREFETCHW
> > > > > > +#define PREFETCHW_ASM(base)  ALTERNATIVE(__nops(1), \
> > > > > > +                                         CBO_prefetchw(base), \
> > > > > > +                                         0, \
> > > > > > +                                         RISCV_ISA_EXT_ZICBOP, \
> > > > > > +                                         CONFIG_RISCV_ISA_ZICBOP)
> > > > > > +static inline void prefetchw(const void *ptr)
> > > > > > +{
> > > > > > +     asm volatile(PREFETCHW_ASM(%0)
> > > > > > +             : : "r" (ptr) : "memory");
> > > > > > +}
> > > > > >
> > > > > >  /* Do necessary setup to start up a newly executed thread. */
> > > > > >  extern void start_thread(struct pt_regs *regs,
> > > > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > > > index ef7b4fd9e876..e0b897db0b97 100644
> > > > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > > > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > > > > >       __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > > > > >       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > > > > >       __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > > > > > +     __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > > > > >       __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > > > > >       __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > > > > >       __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > > > > > --
> > > > > > 2.36.1
> > > > > >
> > > > >
> > > >
> > > >
> > > > --
> > > > Best Regards
> > > >  Guo Ren
> > > >
> > >
> >
> >
> > --
> > Best Regards
> >  Guo Ren
> >
>
Leonardo Bras Sept. 19, 2023, 2:38 p.m. UTC | #20
On Tue, Sep 19, 2023 at 03:53:22PM +0800, Guo Ren wrote:
> On Tue, Sep 19, 2023 at 1:13 PM Leonardo Bras <leobras@redhat.com> wrote:
> >
> > On Sun, Sep 17, 2023 at 10:34:36PM +0800, Guo Ren wrote:
> > > On Sat, Sep 16, 2023 at 9:25 AM Leonardo Bras <leobras@redhat.com> wrote:
> > > >
> > > > On Fri, Sep 15, 2023 at 08:36:31PM +0800, Guo Ren wrote:
> > > > > On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras <leobras@redhat.com> wrote:
> > > > > >
> > > > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guoren@kernel.org wrote:
> > > > > > > From: Guo Ren <guoren@linux.alibaba.com>
> > > > > > >
> > > > > > > Cache-block prefetch instructions are HINTs to the hardware to
> > > > > > > indicate that software intends to perform a particular type of
> > > > > > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > > > > > improve the arch_xchg for qspinlock xchg_tail.
> > > > > > >
> > > > > > > Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
> > > > > > > Signed-off-by: Guo Ren <guoren@kernel.org>
> > > > > > > ---
> > > > > > >  arch/riscv/Kconfig                 | 15 +++++++++++++++
> > > > > > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > > > > > >  arch/riscv/include/asm/hwcap.h     |  1 +
> > > > > > >  arch/riscv/include/asm/insn-def.h  |  5 +++++
> > > > > > >  arch/riscv/include/asm/processor.h | 13 +++++++++++++
> > > > > > >  arch/riscv/kernel/cpufeature.c     |  1 +
> > > > > > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > > > index e9ae6fa232c3..2c346fe169c1 100644
> > > > > > > --- a/arch/riscv/Kconfig
> > > > > > > +++ b/arch/riscv/Kconfig
> > > > > > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > > > > > >
> > > > > > >          If you don't know what to do here, say Y.
> > > > > > >
> > > > > > > +config RISCV_ISA_ZICBOP
> > > > > > > +     bool "Zicbop extension support for cache block prefetch"
> > > > > > > +     depends on MMU
> > > > > > > +     depends on RISCV_ALTERNATIVE
> > > > > > > +     default y
> > > > > > > +     help
> > > > > > > +        Adds support to dynamically detect the presence of the ZICBOP
> > > > > > > +        extension (Cache Block Prefetch Operations) and enable its
> > > > > > > +        usage.
> > > > > > > +
> > > > > > > +        The Zicbop extension can be used to prefetch cache block for
> > > > > > > +        read/write/instruction fetch.
> > > > > > > +
> > > > > > > +        If you don't know what to do here, say Y.
> > > > > > > +
> > > > > > >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > > > > > >       bool
> > > > > > >       default y
> > > > > > > diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> > > > > > > index 702725727671..56eff7a9d2d2 100644
> > > > > > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > > > > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > > > > > @@ -11,6 +11,7 @@
> > > > > > >
> > > > > > >  #include <asm/barrier.h>
> > > > > > >  #include <asm/fence.h>
> > > > > > > +#include <asm/processor.h>
> > > > > > >
> > > > > > >  #define __arch_xchg_masked(prepend, append, r, p, n)                 \
> > > > > > >  ({                                                                   \
> > > > > > > @@ -25,6 +26,7 @@
> > > > > > >                                                                       \
> > > > > > >       __asm__ __volatile__ (                                          \
> > > > > > >              prepend                                                  \
> > > > > > > +            PREFETCHW_ASM(%5)                                        \
> > > > > > >              "0:      lr.w %0, %2\n"                                  \
> > > > > > >              "        and  %1, %0, %z4\n"                             \
> > > > > > >              "        or   %1, %1, %z3\n"                             \
> > > > > > > @@ -32,7 +34,7 @@
> > > > > > >              "        bnez %1, 0b\n"                                  \
> > > > > > >              append                                                   \
> > > > > > >              : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))       \
> > > > > > > -            : "rJ" (__newx), "rJ" (~__mask)                          \
> > > > > > > +            : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)         \
> > > > > > >              : "memory");                                             \
> > > > > > >                                                                       \
> > > > > > >       r = (__typeof__(*(p)))((__retx & __mask) >> __s);               \
> > > > > > > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > > > > > > index b7b58258f6c7..78b7b8b53778 100644
> > > > > > > --- a/arch/riscv/include/asm/hwcap.h
> > > > > > > +++ b/arch/riscv/include/asm/hwcap.h
> > > > > > > @@ -58,6 +58,7 @@
> > > > > > >  #define RISCV_ISA_EXT_ZICSR          40
> > > > > > >  #define RISCV_ISA_EXT_ZIFENCEI               41
> > > > > > >  #define RISCV_ISA_EXT_ZIHPM          42
> > > > > > > +#define RISCV_ISA_EXT_ZICBOP         43
> > > > > > >
> > > > > > >  #define RISCV_ISA_EXT_MAX            64
> > > > > > >
> > > > > > > diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> > > > > > > index 6960beb75f32..dc590d331894 100644
> > > > > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > > > > @@ -134,6 +134,7 @@
> > > > > > >
> > > > > > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > > > > > >  #define RV_OPCODE_SYSTEM     RV_OPCODE(115)
> > > > > > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > > > > > >
> > > > > > >  #define HFENCE_VVMA(vaddr, asid)                             \
> > > > > > >       INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),              \
> > > > > > > @@ -196,4 +197,8 @@
> > > > > > >       INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),              \
> > > > > > >              RS1(base), SIMM12(4))
> > > > > > >
> > > > > > > +#define CBO_prefetchw(base)                                  \
> > > > > > > +     INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > > > > > +            RD(x0), RS1(base), RS2(x0))
> > > > > > > +
> > > > > >
> > > > > > I understand that here you create the instruction via bitfield, following
> > > > > > the ISA, and this enables using instructions not available on the
> > > > > > toolchain.
> > > > > >
> > > > > > It took me some time to find the document with this instruction, so please
> > > > > > add this to the commit msg:
> > > > > >
> > > > > > https://github.com/riscv/riscv-CMOs/blob/master/specifications/cmobase-v1.0.pdf
> > > > > > Page 23.
> > > > > >
> > > > > > IIUC, the instruction is "prefetch.w".
> > > > > >
> > > > > > Maybe I am missing something, but in the document the rs2 field
> > > > > > (PREFETCH.W) contains a 0x3, while the above looks to have a 0 instead.
> > > > > >
> > > > > > rs2 field = 0x0 would be a prefetch.i (instruction prefetch) instead.
> > > > > >
> > > > > > Is the above correct, or am I missing something?
> > > > > Oh, you are right. My fault, thx for pointing out. It should be:
> > > > > +       INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),             \
> > > > > +              RD(x0), RS1(base), RS2(x3))
> > > >
> > > > Now I am curious to check if / how will this impact performance. :)
> > > > (Please let me know)
> > > Ref:
> > > commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word
> > > for write prior to stxr")
> > > commit 86d231459d6d ("bpf: cpumap memory prefetchw optimizations for
> > > struct page")
> >
> > Oh, I understand that prefetch.w is very useful for performance :)
> >
> > What I meant is that previously this patch was issuing a prefetch.i,
> > and now it's issuing a prefetch.w (as intended).
> >
> > What got me curious is how much would it impact the performance to change
> > the prefetch.i to prefetch.w. :)
> The current SOPHO sg2042 hardware platform didn't support prefetch.w
> instruction. So there is no performance result I could share with you.
> 
> Our next generation of processors would support ZICBOP.

Oh, okay then.

Thanks for sharing this info!
Leo

> 
> >
> > Thanks!
> > Leo
> >
> >
> > >
> > > >
> > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > Thanks!
> > > > > > Leo
> > > > > >
> > > > > > >  #endif /* __ASM_INSN_DEF_H */
> > > > > > > diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
> > > > > > > index de9da852f78d..7ad3a24212e8 100644
> > > > > > > --- a/arch/riscv/include/asm/processor.h
> > > > > > > +++ b/arch/riscv/include/asm/processor.h
> > > > > > > @@ -12,6 +12,8 @@
> > > > > > >  #include <vdso/processor.h>
> > > > > > >
> > > > > > >  #include <asm/ptrace.h>
> > > > > > > +#include <asm/insn-def.h>
> > > > > > > +#include <asm/hwcap.h>
> > > > > > >
> > > > > > >  #ifdef CONFIG_64BIT
> > > > > > >  #define DEFAULT_MAP_WINDOW   (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > > @@ -103,6 +105,17 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
> > > > > > >  #define KSTK_EIP(tsk)                (ulong)(task_pt_regs(tsk)->epc)
> > > > > > >  #define KSTK_ESP(tsk)                (ulong)(task_pt_regs(tsk)->sp)
> > > > > > >
> > > > > > > +#define ARCH_HAS_PREFETCHW
> > > > > > > +#define PREFETCHW_ASM(base)  ALTERNATIVE(__nops(1), \
> > > > > > > +                                         CBO_prefetchw(base), \
> > > > > > > +                                         0, \
> > > > > > > +                                         RISCV_ISA_EXT_ZICBOP, \
> > > > > > > +                                         CONFIG_RISCV_ISA_ZICBOP)
> > > > > > > +static inline void prefetchw(const void *ptr)
> > > > > > > +{
> > > > > > > +     asm volatile(PREFETCHW_ASM(%0)
> > > > > > > +             : : "r" (ptr) : "memory");
> > > > > > > +}
> > > > > > >
> > > > > > >  /* Do necessary setup to start up a newly executed thread. */
> > > > > > >  extern void start_thread(struct pt_regs *regs,
> > > > > > > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > > > > > > index ef7b4fd9e876..e0b897db0b97 100644
> > > > > > > --- a/arch/riscv/kernel/cpufeature.c
> > > > > > > +++ b/arch/riscv/kernel/cpufeature.c
> > > > > > > @@ -159,6 +159,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
> > > > > > >       __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
> > > > > > >       __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> > > > > > >       __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
> > > > > > > +     __RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
> > > > > > >       __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
> > > > > > >       __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
> > > > > > >       __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),
> > > > > > > --
> > > > > > > 2.36.1
> > > > > > >
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Best Regards
> > > > >  Guo Ren
> > > > >
> > > >
> > >
> > >
> > > --
> > > Best Regards
> > >  Guo Ren
> > >
> >
> 
> 
> -- 
> Best Regards
>  Guo Ren
>
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e9ae6fa232c3..2c346fe169c1 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -617,6 +617,21 @@  config RISCV_ISA_ZICBOZ
 
 	   If you don't know what to do here, say Y.
 
+config RISCV_ISA_ZICBOP
+	bool "Zicbop extension support for cache block prefetch"
+	depends on MMU
+	depends on RISCV_ALTERNATIVE
+	default y
+	help
+	   Adds support to dynamically detect the presence of the ZICBOP
+	   extension (Cache Block Prefetch Operations) and enable its
+	   usage.
+
+	   The Zicbop extension can be used to prefetch cache block for
+	   read/write/instruction fetch.
+
+	   If you don't know what to do here, say Y.
+
 config TOOLCHAIN_HAS_ZIHINTPAUSE
 	bool
 	default y
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 702725727671..56eff7a9d2d2 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -11,6 +11,7 @@ 
 
 #include <asm/barrier.h>
 #include <asm/fence.h>
+#include <asm/processor.h>
 
 #define __arch_xchg_masked(prepend, append, r, p, n)			\
 ({									\
@@ -25,6 +26,7 @@ 
 									\
 	__asm__ __volatile__ (						\
 	       prepend							\
+	       PREFETCHW_ASM(%5)					\
 	       "0:	lr.w %0, %2\n"					\
 	       "	and  %1, %0, %z4\n"				\
 	       "	or   %1, %1, %z3\n"				\
@@ -32,7 +34,7 @@ 
 	       "	bnez %1, 0b\n"					\
 	       append							\
 	       : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
-	       : "rJ" (__newx), "rJ" (~__mask)				\
+	       : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)		\
 	       : "memory");						\
 									\
 	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index b7b58258f6c7..78b7b8b53778 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -58,6 +58,7 @@ 
 #define RISCV_ISA_EXT_ZICSR		40
 #define RISCV_ISA_EXT_ZIFENCEI		41
 #define RISCV_ISA_EXT_ZIHPM		42
+#define RISCV_ISA_EXT_ZICBOP		43
 
 #define RISCV_ISA_EXT_MAX		64
 
diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
index 6960beb75f32..dc590d331894 100644
--- a/arch/riscv/include/asm/insn-def.h
+++ b/arch/riscv/include/asm/insn-def.h
@@ -134,6 +134,7 @@ 
 
 #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
 #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
+#define RV_OPCODE_PREFETCH	RV_OPCODE(19)
 
 #define HFENCE_VVMA(vaddr, asid)				\
 	INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),		\
@@ -196,4 +197,8 @@ 
 	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
 	       RS1(base), SIMM12(4))
 
+#define CBO_prefetchw(base)					\
+	INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0),		\
+	       RD(x0), RS1(base), RS2(x0))
+
 #endif /* __ASM_INSN_DEF_H */
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index de9da852f78d..7ad3a24212e8 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -12,6 +12,8 @@ 
 #include <vdso/processor.h>
 
 #include <asm/ptrace.h>
+#include <asm/insn-def.h>
+#include <asm/hwcap.h>
 
 #ifdef CONFIG_64BIT
 #define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
@@ -103,6 +105,17 @@  static inline void arch_thread_struct_whitelist(unsigned long *offset,
 #define KSTK_EIP(tsk)		(ulong)(task_pt_regs(tsk)->epc)
 #define KSTK_ESP(tsk)		(ulong)(task_pt_regs(tsk)->sp)
 
+#define ARCH_HAS_PREFETCHW
+#define PREFETCHW_ASM(base)	ALTERNATIVE(__nops(1), \
+					    CBO_prefetchw(base), \
+					    0, \
+					    RISCV_ISA_EXT_ZICBOP, \
+					    CONFIG_RISCV_ISA_ZICBOP)
+static inline void prefetchw(const void *ptr)
+{
+	asm volatile(PREFETCHW_ASM(%0)
+		: : "r" (ptr) : "memory");
+}
 
 /* Do necessary setup to start up a newly executed thread. */
 extern void start_thread(struct pt_regs *regs,
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index ef7b4fd9e876..e0b897db0b97 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -159,6 +159,7 @@  const struct riscv_isa_ext_data riscv_isa_ext[] = {
 	__RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
 	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
 	__RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ),
+	__RISCV_ISA_EXT_DATA(zicbop, RISCV_ISA_EXT_ZICBOP),
 	__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
 	__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
 	__RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI),