
[v2,4/6] RISC-V: Detect unaligned vector accesses supported.

Message ID 20240613191616.2101821-5-jesse@rivosinc.com (mailing list archive)
State Superseded
Series RISC-V: Detect and report speed of unaligned vector accesses

Checks

Context Check Description
conchuod/vmtest-fixes-PR fail merge-conflict

Commit Message

Jesse Taube June 13, 2024, 7:16 p.m. UTC
Run an unaligned vector access to test if the system supports
vector unaligned access. Add the result to a new key in hwprobe.
This is useful for usermode to know if vector misaligned accesses are
supported and if they are faster or slower than equivalent byte accesses.

Signed-off-by: Jesse Taube <jesse@rivosinc.com>
---
V1 -> V2:
 - Add Kconfig options
 - Add insn_is_vector
 - Add handle_vector_misaligned_load
 - Fix build
 - Separate vector from scalar misaligned access
 - This patch was almost completely rewritten
---
 arch/riscv/Kconfig                         |  41 +++++++
 arch/riscv/include/asm/cpufeature.h        |   7 +-
 arch/riscv/include/asm/entry-common.h      |  11 --
 arch/riscv/include/asm/hwprobe.h           |   2 +-
 arch/riscv/include/asm/vector.h            |   1 +
 arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
 arch/riscv/kernel/Makefile                 |   4 +-
 arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
 arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
 arch/riscv/kernel/unaligned_access_speed.c |   9 +-
 arch/riscv/kernel/vector.c                 |   2 +-
 11 files changed, 221 insertions(+), 21 deletions(-)
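
For userspace consumers, a minimal sketch of querying the new key is
below. This is not part of the patch: it assumes the uapi header from
this series is installed and uses a raw syscall rather than a libc
wrapper; the key and value names are the ones the patch introduces.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/hwprobe.h>	/* struct riscv_hwprobe + key/value macros */
	#include <asm/unistd.h>		/* __NR_riscv_hwprobe */

	int main(void)
	{
		struct riscv_hwprobe pair = {
			.key = RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF,
		};

		/* cpusetsize == 0 and cpus == NULL query all online cpus */
		if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
			return 1;

		switch (pair.value) {
		case RISCV_HWPROBE_VEC_MISALIGNED_FAST:
			puts("vector misaligned accesses: fast");
			break;
		case RISCV_HWPROBE_VEC_MISALIGNED_SLOW:
			puts("vector misaligned accesses: slow");
			break;
		case RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED:
			puts("vector misaligned accesses: unsupported");
			break;
		default:
			puts("vector misaligned accesses: unknown");
		}
		return 0;
	}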

Comments

Conor Dooley June 14, 2024, 8:36 a.m. UTC | #1
On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -19,7 +19,8 @@
>  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>  
> -DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>  
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  static cpumask_t fast_misaligned_access;
> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>  
>  	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>  		for_each_online_cpu(cpu) {
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> +#endif
> +#ifdef CONFIG_RISCV_MISALIGNED
>  			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> +#endif

Can you IS_ENABLED()-ify these two as well please?
Conor Dooley June 14, 2024, 8:40 a.m. UTC | #2
On Fri, Jun 14, 2024 at 09:36:55AM +0100, Conor Dooley wrote:
> On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > @@ -19,7 +19,8 @@
> >  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> >  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> >  
> > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> >  
> >  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >  static cpumask_t fast_misaligned_access;
> > @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
> >  
> >  	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
> >  		for_each_online_cpu(cpu) {
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> > +#endif
> > +#ifdef CONFIG_RISCV_MISALIGNED
> >  			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> > +#endif
> 
> Can you IS_ENABLED()-ify these two as well please?

Ah, you can't cos the variable doesn't exist in the other case.
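
(The distinction matters because IS_ENABLED() only discards the dead
branch after the compiler has parsed it, so every symbol in that branch
must be declared in both configurations, whereas the preprocessor drops
#ifdef'd text before it is ever parsed. A minimal sketch, reusing the
per-cpu variable from this patch:

	/* Compiles only if vector_misaligned_access is declared even
	 * when the option is off -- the dead branch is still parsed: */
	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED))
		per_cpu(vector_misaligned_access, cpu) =
			RISCV_HWPROBE_VEC_MISALIGNED_FAST;

	/* Tolerates a conditional declaration -- the text is removed
	 * before compilation: */
	#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
		per_cpu(vector_misaligned_access, cpu) =
			RISCV_HWPROBE_VEC_MISALIGNED_FAST;
	#endif

hence the #ifdef in the patch.)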
Jesse Taube June 14, 2024, 2:28 p.m. UTC | #3
On 6/14/24 04:40, Conor Dooley wrote:
> On Fri, Jun 14, 2024 at 09:36:55AM +0100, Conor Dooley wrote:
>> On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
>>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>>> @@ -19,7 +19,8 @@
>>>   #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>>>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>>   
>>> -DEFINE_PER_CPU(long, misaligned_access_speed);
>>> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
>>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>   
>>>   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>>   static cpumask_t fast_misaligned_access;
>>> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>>>   
>>>   	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>>>   		for_each_online_cpu(cpu) {
>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>>> +#endif
>>> +#ifdef CONFIG_RISCV_MISALIGNED
>>>   			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
>>> +#endif
>>
>> Can you IS_ENABLED()-ify these two as well please?
> 
> Ah, you can't cos the variable doesn't exist in the other case.

Yeah, kinda just dealing with how it was originally written; ideally we
would use IS_ENABLED(). I don't really want to have a 500+ line patch
IS_ENABLED()-ifying the original code as well. I can do that if
necessary though.

Thanks,
Jesse Taube
Conor Dooley June 14, 2024, 2:32 p.m. UTC | #4
On Fri, Jun 14, 2024 at 10:28:17AM -0400, Jesse Taube wrote:
> 
> 
> On 6/14/24 04:40, Conor Dooley wrote:
> > On Fri, Jun 14, 2024 at 09:36:55AM +0100, Conor Dooley wrote:
> > > On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > > > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > > > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > > > @@ -19,7 +19,8 @@
> > > >   #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> > > >   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> > > > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > > > +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> > > > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > >   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> > > >   static cpumask_t fast_misaligned_access;
> > > > @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
> > > >   	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
> > > >   		for_each_online_cpu(cpu) {
> > > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > > +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> > > > +#endif
> > > > +#ifdef CONFIG_RISCV_MISALIGNED
> > > >   			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> > > > +#endif
> > > 
> > > Can you IS_ENABLED()-ify these two as well please?
> > 
> > Ah, you can't cos the variable doesn't exist in the other case.
> 
> Yeah, kinda just dealing with how it was originally written; ideally we
> would use IS_ENABLED(). I don't really want to have a 500+ line patch
> IS_ENABLED()-ifying the original code as well. I can do that if necessary
> though.

No, dw about it. I made a mistake.
Charlie Jenkins June 17, 2024, 4:39 p.m. UTC | #5
On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> Run an unaligned vector access to test if the system supports
> vector unaligned access. Add the result to a new key in hwprobe.
> This is useful for usermode to know if vector misaligned accesses are
> supported and if they are faster or slower than equivalent byte accesses.
> 
> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> ---
> V1 -> V2:
>  - Add Kconfig options
>  - Add insn_is_vector
>  - Add handle_vector_misaligned_load
>  - Fix build
>  - Separate vector from scalar misaligned access
>  - This patch was almost completely rewritten
> ---
>  arch/riscv/Kconfig                         |  41 +++++++
>  arch/riscv/include/asm/cpufeature.h        |   7 +-
>  arch/riscv/include/asm/entry-common.h      |  11 --
>  arch/riscv/include/asm/hwprobe.h           |   2 +-
>  arch/riscv/include/asm/vector.h            |   1 +
>  arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
>  arch/riscv/kernel/Makefile                 |   4 +-
>  arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
>  arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
>  arch/riscv/kernel/unaligned_access_speed.c |   9 +-
>  arch/riscv/kernel/vector.c                 |   2 +-
>  11 files changed, 221 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index b94176e25be1..f12df0ca6c18 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
>  	help
>  	  Embed support for emulating misaligned loads and stores.
>  
> +config RISCV_VECTOR_MISALIGNED
> +	bool
> +	depends on RISCV_ISA_V
> +	help
> +	  Enable detecting support for vector misaligned loads and stores.
> +
>  choice
>  	prompt "Unaligned Accesses Support"
>  	default RISCV_PROBE_UNALIGNED_ACCESS
> @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
>  
>  endchoice
>  
> +choice
> +	prompt "Vector unaligned Accesses Support"
> +	depends on RISCV_ISA_V
> +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> +	help
> +	  This determines the level of support for vector unaligned accesses. This
> +	  information is used by the kernel to perform optimizations. It is also
> +	  exposed to user space via the hwprobe syscall. The hardware will be
> +	  probed at boot by default.
> +
> +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> +	bool "Detect support for vector unaligned accesses"
> +	select RISCV_VECTOR_MISALIGNED
> +	help
> +	  During boot, the kernel will detect if the system supports vector
> +	  unaligned accesses.
> +
> +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> +	bool "Probe speed of vector unaligned accesses"
> +	select RISCV_VECTOR_MISALIGNED
> +	help
> +	  During boot, the kernel will run a series of tests to determine the
> +	  speed of vector unaligned accesses if they are supported. This probing
> +	  will dynamically determine the speed of vector unaligned accesses on
> +	  the underlying system if they are supported.
> +
> +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> +	bool "Assume the system does not support vector unaligned memory accesses"
> +	help
> +	  Assume that the system does not support vector unaligned memory accesses.
> +	  The kernel and userspace programs may run them successfully on systems
> +	  that do support vector unaligned memory accesses.
> +
> +endchoice
> +
>  endmenu # "Platform type"
>  
>  menu "Kernel features"
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index 347805446151..d0ea5921ab20 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
>  
>  void riscv_user_isa_enable(void);
>  
> -#if defined(CONFIG_RISCV_MISALIGNED)
>  bool check_unaligned_access_emulated_all_cpus(void);
> +#if defined(CONFIG_RISCV_MISALIGNED)
>  void unaligned_emulation_finish(void);
>  bool unaligned_ctl_available(void);
>  DECLARE_PER_CPU(long, misaligned_access_speed);
> @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
>  }
>  #endif
>  
> +bool check_vector_unaligned_access_emulated_all_cpus(void);
> +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> +DECLARE_PER_CPU(long, vector_misaligned_access);
> +#endif
> +
>  #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>  DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
>  
> diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> index 2293e535f865..7b32d2b08bb6 100644
> --- a/arch/riscv/include/asm/entry-common.h
> +++ b/arch/riscv/include/asm/entry-common.h
> @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
>  void handle_page_fault(struct pt_regs *regs);
>  void handle_break(struct pt_regs *regs);
>  
> -#ifdef CONFIG_RISCV_MISALIGNED
>  int handle_misaligned_load(struct pt_regs *regs);
>  int handle_misaligned_store(struct pt_regs *regs);
> -#else
> -static inline int handle_misaligned_load(struct pt_regs *regs)
> -{
> -	return -1;
> -}
> -static inline int handle_misaligned_store(struct pt_regs *regs)
> -{
> -	return -1;
> -}
> -#endif
>  
>  #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> index 150a9877b0af..ef01c182af2b 100644
> --- a/arch/riscv/include/asm/hwprobe.h
> +++ b/arch/riscv/include/asm/hwprobe.h
> @@ -8,7 +8,7 @@
>  
>  #include <uapi/asm/hwprobe.h>
>  
> -#define RISCV_HWPROBE_MAX_KEY 7
> +#define RISCV_HWPROBE_MAX_KEY 8
>  
>  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>  {
> diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> index be7d309cca8a..99b0f91db9ee 100644
> --- a/arch/riscv/include/asm/vector.h
> +++ b/arch/riscv/include/asm/vector.h
> @@ -21,6 +21,7 @@
>  
>  extern unsigned long riscv_v_vsize;
>  int riscv_v_setup_vsize(void);
> +bool insn_is_vector(u32 insn_buf);
>  bool riscv_v_first_use_handler(struct pt_regs *regs);
>  void kernel_vector_begin(void);
>  void kernel_vector_end(void);
> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> index 023b7771d1b7..2fee870e41bb 100644
> --- a/arch/riscv/include/uapi/asm/hwprobe.h
> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> @@ -75,6 +75,11 @@ struct riscv_hwprobe {
>  #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
>  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
>  #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
> +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
>  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>  
>  /* Flags */
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index 5b243d46f4b1..62ac19c029f1 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -62,8 +62,8 @@ obj-y	+= probes/
>  obj-y	+= tests/
>  obj-$(CONFIG_MMU) += vdso.o vdso/
>  
> -obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
> -obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
> +obj-y	+= traps_misaligned.o
> +obj-y	+= unaligned_access_speed.o

These files only need to be compiled if either CONFIG_RISCV_MISALIGNED
or CONFIG_RISCV_VECTOR_MISALIGNED is selected. Can you refactor this
code to replace CONFIG_RISCV_MISALIGNED with
CONFIG_RISCV_SCALAR_MISALIGNED and then have
CONFIG_RISCV_SCALAR_MISALIGNED and CONFIG_RISCV_VECTOR_MISALIGNED select
CONFIG_RISCV_MISALIGNED in the Kconfig?

- Charlie
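
A rough sketch of the Kconfig shape being suggested here, for
illustration only (names per Charlie's proposal, not the actual patch):

	config RISCV_MISALIGNED
		bool

	config RISCV_SCALAR_MISALIGNED
		bool
		select RISCV_MISALIGNED
		help
		  Embed support for emulating misaligned loads and stores.

	config RISCV_VECTOR_MISALIGNED
		bool
		select RISCV_MISALIGNED
		depends on RISCV_ISA_V
		help
		  Enable detecting support for vector misaligned loads and
		  stores.

with the Makefile then keying both objects off the common symbol:

	obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
	obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o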

>  obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
>  
>  obj-$(CONFIG_FPU)		+= fpu.o
> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> index e910e2971984..c40df314058b 100644
> --- a/arch/riscv/kernel/sys_hwprobe.c
> +++ b/arch/riscv/kernel/sys_hwprobe.c
> @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>  }
>  #endif
>  
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +	int cpu;
> +	u64 perf = -1ULL;
> +
> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +
> +	/* Return if supported or not even if speed wasn't probed */
> +	for_each_cpu(cpu, cpus) {
> +		int this_perf = per_cpu(vector_misaligned_access, cpu);
> +
> +		if (perf == -1ULL)
> +			perf = this_perf;
> +
> +		if (perf != this_perf) {
> +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +			break;
> +		}
> +	}
> +
> +	if (perf == -1ULL)
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +	return perf;
> +}
> +#else
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +
> +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +}
> +#endif
> +
>  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>  			     const struct cpumask *cpus)
>  {
> @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>  		pair->value = hwprobe_misaligned(cpus);
>  		break;
>  
> +	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> +		pair->value = hwprobe_vec_misaligned(cpus);
> +		break;
> +
>  	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>  		pair->value = 0;
>  		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> index 8fadbe00dd62..6f0264a8c9de 100644
> --- a/arch/riscv/kernel/traps_misaligned.c
> +++ b/arch/riscv/kernel/traps_misaligned.c
> @@ -16,6 +16,7 @@
>  #include <asm/entry-common.h>
>  #include <asm/hwprobe.h>
>  #include <asm/cpufeature.h>
> +#include <asm/vector.h>
>  
>  #define INSN_MATCH_LB			0x3
>  #define INSN_MASK_LB			0x707f
> @@ -322,12 +323,37 @@ union reg_data {
>  	u64 data_u64;
>  };
>  
> -static bool unaligned_ctl __read_mostly;
> -
>  /* sysctl hooks */
>  int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
>  
> -int handle_misaligned_load(struct pt_regs *regs)
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static int handle_vector_misaligned_load(struct pt_regs *regs)
> +{
> +	unsigned long epc = regs->epc;
> +	unsigned long insn;
> +
> +	if (get_insn(regs, epc, &insn))
> +		return -1;
> +
> +	/* Only return 0 when in check_vector_unaligned_access_emulated */
> +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +		regs->epc = epc + INSN_LEN(insn);
> +		return 0;
> +	}
> +
> +	/* If vector instruction we don't emulate it yet */
> +	regs->epc = epc;
> +	return -1;
> +}
> +#else
> +static int handle_vector_misaligned_load(struct pt_regs *regs)
> +{
> +	return -1;
> +}
> +#endif
> +
> +static int handle_scalar_misaligned_load(struct pt_regs *regs)
>  {
>  	union reg_data val;
>  	unsigned long epc = regs->epc;
> @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
>  	return 0;
>  }
>  
> -int handle_misaligned_store(struct pt_regs *regs)
> +static int handle_scalar_misaligned_store(struct pt_regs *regs)
>  {
>  	union reg_data val;
>  	unsigned long epc = regs->epc;
> @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
>  	return 0;
>  }
>  
> +int handle_misaligned_load(struct pt_regs *regs)
> +{
> +	unsigned long epc = regs->epc;
> +	unsigned long insn;
> +
> +	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> +		if (get_insn(regs, epc, &insn))
> +			return -1;
> +
> +		if (insn_is_vector(insn))
> +			return handle_vector_misaligned_load(regs);
> +	}
> +
> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> +		return handle_scalar_misaligned_load(regs);
> +
> +	return -1;
> +}
> +
> +int handle_misaligned_store(struct pt_regs *regs)
> +{
> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> +		return handle_scalar_misaligned_store(regs);
> +
> +	return -1;
> +}
> +
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> +{
> +	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> +	unsigned long tmp_var;
> +
> +	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +	local_irq_enable();
> +	kernel_vector_begin();
> +	__asm__ __volatile__ (
> +		".balign 4\n\t"
> +		".option push\n\t"
> +		".option arch, +zve32x\n\t"
> +		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
> +		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
> +		".option pop\n\t"
> +		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
> +	kernel_vector_end();
> +}
> +
> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> +{
> +	int cpu;
> +	bool ret = true;
> +
> +	if (!has_vector()) {
> +		for_each_online_cpu(cpu)
> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +		return false;
> +	}
> +
> +	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
> +
> +	for_each_online_cpu(cpu)
> +		if (per_cpu(vector_misaligned_access, cpu)
> +		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
> +			return false;
> +
> +	return ret;
> +}
> +#else
> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> +{
> +	return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_RISCV_MISALIGNED
> +
> +static bool unaligned_ctl __read_mostly;
> +
>  static void check_unaligned_access_emulated(struct work_struct *unused)
>  {
>  	int cpu = smp_processor_id();
> @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
>  {
>  	return unaligned_ctl;
>  }
> +#else
> +bool check_unaligned_access_emulated_all_cpus(void)
> +{
> +	return false;
> +}
> +#endif
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index 70c1588fc353..c6106bd4a25a 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -19,7 +19,8 @@
>  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>  
> -DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>  
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  static cpumask_t fast_misaligned_access;
> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>  
>  	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>  		for_each_online_cpu(cpu) {
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> +#endif
> +#ifdef CONFIG_RISCV_MISALIGNED
>  			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> +#endif
>  		}
>  		return 0;
>  	}
>  
>  	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> +	check_vector_unaligned_access_emulated_all_cpus();
>  
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  	if (!all_cpus_emulated)
> diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
> index 682b3feee451..821818886fab 100644
> --- a/arch/riscv/kernel/vector.c
> +++ b/arch/riscv/kernel/vector.c
> @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
>  #endif
>  }
>  
> -static bool insn_is_vector(u32 insn_buf)
> +bool insn_is_vector(u32 insn_buf)
>  {
>  	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
>  	u32 width, csr;
> -- 
> 2.43.0
>
Charlie Jenkins June 18, 2024, 1:43 a.m. UTC | #6
On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> Run an unaligned vector access to test if the system supports
> vector unaligned access. Add the result to a new key in hwprobe.
> This is useful for usermode to know if vector misaligned accesses are
> supported and if they are faster or slower than equivalent byte accesses.
> 
> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> ---
> V1 -> V2:
>  - Add Kconfig options
>  - Add insn_is_vector
>  - Add handle_vector_misaligned_load
>  - Fix build
>  - Separate vector from scalar misaligned access
>  - This patch was almost completely rewritten
> ---
>  arch/riscv/Kconfig                         |  41 +++++++
>  arch/riscv/include/asm/cpufeature.h        |   7 +-
>  arch/riscv/include/asm/entry-common.h      |  11 --
>  arch/riscv/include/asm/hwprobe.h           |   2 +-
>  arch/riscv/include/asm/vector.h            |   1 +
>  arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
>  arch/riscv/kernel/Makefile                 |   4 +-
>  arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
>  arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
>  arch/riscv/kernel/unaligned_access_speed.c |   9 +-
>  arch/riscv/kernel/vector.c                 |   2 +-
>  11 files changed, 221 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index b94176e25be1..f12df0ca6c18 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
>  	help
>  	  Embed support for emulating misaligned loads and stores.
>  
> +config RISCV_VECTOR_MISALIGNED
> +	bool
> +	depends on RISCV_ISA_V
> +	help
> +	  Enable detecting support for vector misaligned loads and stores.
> +
>  choice
>  	prompt "Unaligned Accesses Support"
>  	default RISCV_PROBE_UNALIGNED_ACCESS
> @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
>  
>  endchoice
>  
> +choice
> +	prompt "Vector unaligned Accesses Support"
> +	depends on RISCV_ISA_V
> +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> +	help
> +	  This determines the level of support for vector unaligned accesses. This
> +	  information is used by the kernel to perform optimizations. It is also
> +	  exposed to user space via the hwprobe syscall. The hardware will be
> +	  probed at boot by default.
> +
> +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS

This is not used anywhere, what is the reason for including it?

> +	bool "Detect support for vector unaligned accesses"
> +	select RISCV_VECTOR_MISALIGNED
> +	help
> +	  During boot, the kernel will detect if the system supports vector
> +	  unaligned accesses.
> +
> +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> +	bool "Probe speed of vector unaligned accesses"
> +	select RISCV_VECTOR_MISALIGNED
> +	help
> +	  During boot, the kernel will run a series of tests to determine the
> +	  speed of vector unaligned accesses if they are supported. This probing
> +	  will dynamically determine the speed of vector unaligned accesses on
> +	  the underlying system if they are supported.
> +
> +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED

This should not be prefixed with CONFIG and does not include VECTOR in
the name. I assume you meant to put
"RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?

This was also intentionally left out on the scalar side [1]. The
implication here is that having this config will cause people to compile
kernels without unaligned access support which really shouldn't be
something we are explicitly supporting.

If somebody does want to support hardware that does not handle vector
unaligned accesses, the solution should be to add emulation support to
the kernel.

Link: https://lore.kernel.org/all/Zd4y5llkvTfKHf6b@ghost/ [1]

- Charlie
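
(For concreteness: Kconfig prepends CONFIG_ when it generates the
macro, so the entry as written would define
CONFIG_CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED, and the
IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED) checks in
sys_hwprobe.c would never see it set. If the option were kept at all,
the declaration would presumably read:

	config RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
		bool "Assume the system does not support vector unaligned memory accesses"

with the IS_ENABLED() checks renamed to match.)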

> +	bool "Assume the system does not support vector unaligned memory accesses"
> +	help
> +	  Assume that the system does not support vector unaligned memory accesses.
> +	  The kernel and userspace programs may run them successfully on systems
> +	  that do support vector unaligned memory accesses.
> +
> +endchoice
> +
>  endmenu # "Platform type"
>  
>  menu "Kernel features"
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index 347805446151..d0ea5921ab20 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
>  
>  void riscv_user_isa_enable(void);
>  
> -#if defined(CONFIG_RISCV_MISALIGNED)
>  bool check_unaligned_access_emulated_all_cpus(void);
> +#if defined(CONFIG_RISCV_MISALIGNED)
>  void unaligned_emulation_finish(void);
>  bool unaligned_ctl_available(void);
>  DECLARE_PER_CPU(long, misaligned_access_speed);
> @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
>  }
>  #endif
>  
> +bool check_vector_unaligned_access_emulated_all_cpus(void);
> +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> +DECLARE_PER_CPU(long, vector_misaligned_access);
> +#endif
> +
>  #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>  DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
>  
> diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> index 2293e535f865..7b32d2b08bb6 100644
> --- a/arch/riscv/include/asm/entry-common.h
> +++ b/arch/riscv/include/asm/entry-common.h
> @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
>  void handle_page_fault(struct pt_regs *regs);
>  void handle_break(struct pt_regs *regs);
>  
> -#ifdef CONFIG_RISCV_MISALIGNED
>  int handle_misaligned_load(struct pt_regs *regs);
>  int handle_misaligned_store(struct pt_regs *regs);
> -#else
> -static inline int handle_misaligned_load(struct pt_regs *regs)
> -{
> -	return -1;
> -}
> -static inline int handle_misaligned_store(struct pt_regs *regs)
> -{
> -	return -1;
> -}
> -#endif
>  
>  #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> index 150a9877b0af..ef01c182af2b 100644
> --- a/arch/riscv/include/asm/hwprobe.h
> +++ b/arch/riscv/include/asm/hwprobe.h
> @@ -8,7 +8,7 @@
>  
>  #include <uapi/asm/hwprobe.h>
>  
> -#define RISCV_HWPROBE_MAX_KEY 7
> +#define RISCV_HWPROBE_MAX_KEY 8
>  
>  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>  {
> diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> index be7d309cca8a..99b0f91db9ee 100644
> --- a/arch/riscv/include/asm/vector.h
> +++ b/arch/riscv/include/asm/vector.h
> @@ -21,6 +21,7 @@
>  
>  extern unsigned long riscv_v_vsize;
>  int riscv_v_setup_vsize(void);
> +bool insn_is_vector(u32 insn_buf);
>  bool riscv_v_first_use_handler(struct pt_regs *regs);
>  void kernel_vector_begin(void);
>  void kernel_vector_end(void);
> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> index 023b7771d1b7..2fee870e41bb 100644
> --- a/arch/riscv/include/uapi/asm/hwprobe.h
> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> @@ -75,6 +75,11 @@ struct riscv_hwprobe {
>  #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
>  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
>  #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
> +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0

I appreciate you leaving the key for EMULATED open!

> +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
>  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>  
>  /* Flags */
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index 5b243d46f4b1..62ac19c029f1 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -62,8 +62,8 @@ obj-y	+= probes/
>  obj-y	+= tests/
>  obj-$(CONFIG_MMU) += vdso.o vdso/
>  
> -obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
> -obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
> +obj-y	+= traps_misaligned.o
> +obj-y	+= unaligned_access_speed.o
>  obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
>  
>  obj-$(CONFIG_FPU)		+= fpu.o
> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> index e910e2971984..c40df314058b 100644
> --- a/arch/riscv/kernel/sys_hwprobe.c
> +++ b/arch/riscv/kernel/sys_hwprobe.c
> @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>  }
>  #endif
>  
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +	int cpu;
> +	u64 perf = -1ULL;
> +
> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +
> +	/* Return if supported or not even if speed wasn't probed */
> +	for_each_cpu(cpu, cpus) {
> +		int this_perf = per_cpu(vector_misaligned_access, cpu);
> +
> +		if (perf == -1ULL)
> +			perf = this_perf;
> +
> +		if (perf != this_perf) {
> +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +			break;
> +		}
> +	}
> +
> +	if (perf == -1ULL)
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +	return perf;
> +}
> +#else
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +
> +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +}
> +#endif
> +
>  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>  			     const struct cpumask *cpus)
>  {
> @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>  		pair->value = hwprobe_misaligned(cpus);
>  		break;
>  
> +	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> +		pair->value = hwprobe_vec_misaligned(cpus);
> +		break;
> +
>  	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>  		pair->value = 0;
>  		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> index 8fadbe00dd62..6f0264a8c9de 100644
> --- a/arch/riscv/kernel/traps_misaligned.c
> +++ b/arch/riscv/kernel/traps_misaligned.c
> @@ -16,6 +16,7 @@
>  #include <asm/entry-common.h>
>  #include <asm/hwprobe.h>
>  #include <asm/cpufeature.h>
> +#include <asm/vector.h>
>  
>  #define INSN_MATCH_LB			0x3
>  #define INSN_MASK_LB			0x707f
> @@ -322,12 +323,37 @@ union reg_data {
>  	u64 data_u64;
>  };
>  
> -static bool unaligned_ctl __read_mostly;
> -
>  /* sysctl hooks */
>  int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
>  
> -int handle_misaligned_load(struct pt_regs *regs)
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static int handle_vector_misaligned_load(struct pt_regs *regs)
> +{
> +	unsigned long epc = regs->epc;
> +	unsigned long insn;
> +
> +	if (get_insn(regs, epc, &insn))
> +		return -1;
> +
> +	/* Only return 0 when in check_vector_unaligned_access_emulated */
> +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +		regs->epc = epc + INSN_LEN(insn);
> +		return 0;
> +	}
> +
> +	/* If vector instruction we don't emulate it yet */
> +	regs->epc = epc;
> +	return -1;
> +}
> +#else
> +static int handle_vector_misaligned_load(struct pt_regs *regs)
> +{
> +	return -1;
> +}
> +#endif
> +
> +static int handle_scalar_misaligned_load(struct pt_regs *regs)
>  {
>  	union reg_data val;
>  	unsigned long epc = regs->epc;
> @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
>  	return 0;
>  }
>  
> -int handle_misaligned_store(struct pt_regs *regs)
> +static int handle_scalar_misaligned_store(struct pt_regs *regs)
>  {
>  	union reg_data val;
>  	unsigned long epc = regs->epc;
> @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
>  	return 0;
>  }
>  
> +int handle_misaligned_load(struct pt_regs *regs)
> +{
> +	unsigned long epc = regs->epc;
> +	unsigned long insn;
> +
> +	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> +		if (get_insn(regs, epc, &insn))
> +			return -1;
> +
> +		if (insn_is_vector(insn))
> +			return handle_vector_misaligned_load(regs);
> +	}
> +
> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> +		return handle_scalar_misaligned_load(regs);
> +
> +	return -1;
> +}
> +
> +int handle_misaligned_store(struct pt_regs *regs)
> +{
> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> +		return handle_scalar_misaligned_store(regs);
> +
> +	return -1;
> +}
> +
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> +{
> +	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> +	unsigned long tmp_var;
> +
> +	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +	local_irq_enable();
> +	kernel_vector_begin();
> +	__asm__ __volatile__ (
> +		".balign 4\n\t"
> +		".option push\n\t"
> +		".option arch, +zve32x\n\t"
> +		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
> +		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
> +		".option pop\n\t"
> +		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
> +	kernel_vector_end();
> +}
> +
> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> +{
> +	int cpu;
> +	bool ret = true;
> +
> +	if (!has_vector()) {
> +		for_each_online_cpu(cpu)
> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +		return false;
> +	}
> +
> +	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
> +
> +	for_each_online_cpu(cpu)
> +		if (per_cpu(vector_misaligned_access, cpu)
> +		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
> +			return false;
> +
> +	return ret;
> +}
> +#else
> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> +{
> +	return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_RISCV_MISALIGNED
> +
> +static bool unaligned_ctl __read_mostly;
> +
>  static void check_unaligned_access_emulated(struct work_struct *unused)
>  {
>  	int cpu = smp_processor_id();
> @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
>  {
>  	return unaligned_ctl;
>  }
> +#else
> +bool check_unaligned_access_emulated_all_cpus(void)
> +{
> +	return false;
> +}
> +#endif
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index 70c1588fc353..c6106bd4a25a 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -19,7 +19,8 @@
>  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>  
> -DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>  
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  static cpumask_t fast_misaligned_access;
> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>  
>  	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>  		for_each_online_cpu(cpu) {
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> +#endif
> +#ifdef CONFIG_RISCV_MISALIGNED
>  			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> +#endif
>  		}
>  		return 0;

Since this function returns 0 in both cases, can you wrap the rest of
the function with an else and remove this early return?

- Charlie
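
That restructuring would look something like the sketch below
(abbreviated; the probing tail of the function is unchanged):

	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
		for_each_online_cpu(cpu) {
	#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
	#endif
	#ifdef CONFIG_RISCV_MISALIGNED
			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
	#endif
		}
	} else {
		all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
		check_vector_unaligned_access_emulated_all_cpus();
		/* ... existing speed probing, now under the else ... */
	}

	return 0;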

>  	}
>  
>  	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> +	check_vector_unaligned_access_emulated_all_cpus();
>  
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  	if (!all_cpus_emulated)
> diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
> index 682b3feee451..821818886fab 100644
> --- a/arch/riscv/kernel/vector.c
> +++ b/arch/riscv/kernel/vector.c
> @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
>  #endif
>  }
>  
> -static bool insn_is_vector(u32 insn_buf)
> +bool insn_is_vector(u32 insn_buf)
>  {
>  	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
>  	u32 width, csr;
> -- 
> 2.43.0
>
Charlie Jenkins June 18, 2024, 2:09 a.m. UTC | #7
On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
> On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > Run an unaligned vector access to test if the system supports
> > vector unaligned access. Add the result to a new key in hwprobe.
> > This is useful for usermode to know if vector misaligned accesses are
> > supported and if they are faster or slower than equivalent byte accesses.
> > 
> > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > ---
> > V1 -> V2:
> >  - Add Kconfig options
> >  - Add insn_is_vector
> >  - Add handle_vector_misaligned_load
> >  - Fix build
> >  - Separate vector from scalar misaligned access
> >  - This patch was almost completely rewritten
> > ---
> >  arch/riscv/Kconfig                         |  41 +++++++
> >  arch/riscv/include/asm/cpufeature.h        |   7 +-
> >  arch/riscv/include/asm/entry-common.h      |  11 --
> >  arch/riscv/include/asm/hwprobe.h           |   2 +-
> >  arch/riscv/include/asm/vector.h            |   1 +
> >  arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> >  arch/riscv/kernel/Makefile                 |   4 +-
> >  arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> >  arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> >  arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> >  arch/riscv/kernel/vector.c                 |   2 +-
> >  11 files changed, 221 insertions(+), 21 deletions(-)
> > 
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index b94176e25be1..f12df0ca6c18 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> >  	help
> >  	  Embed support for emulating misaligned loads and stores.
> >  
> > +config RISCV_VECTOR_MISALIGNED
> > +	bool
> > +	depends on RISCV_ISA_V
> > +	help
> > +	  Enable detecting support for vector misaligned loads and stores.
> > +
> >  choice
> >  	prompt "Unaligned Accesses Support"
> >  	default RISCV_PROBE_UNALIGNED_ACCESS
> > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> >  
> >  endchoice
> >  
> > +choice
> > +	prompt "Vector unaligned Accesses Support"
> > +	depends on RISCV_ISA_V
> > +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > +	help
> > +	  This determines the level of support for vector unaligned accesses. This
> > +	  information is used by the kernel to perform optimizations. It is also
> > +	  exposed to user space via the hwprobe syscall. The hardware will be
> > +	  probed at boot by default.
> > +
> > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> 
> This is not used anywhere, what is the reason for including it?
> 
> > +	bool "Detect support for vector unaligned accesses"
> > +	select RISCV_VECTOR_MISALIGNED
> > +	help
> > +	  During boot, the kernel will detect if the system supports vector
> > +	  unaligned accesses.
> > +
> > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > +	bool "Probe speed of vector unaligned accesses"
> > +	select RISCV_VECTOR_MISALIGNED
> > +	help
> > +	  During boot, the kernel will run a series of tests to determine the
> > +	  speed of vector unaligned accesses if they are supported. This probing
> > +	  will dynamically determine the speed of vector unaligned accesses on
> > +	  the underlying system if they are supported.
> > +
> > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> 
> This should not be prefixed with CONFIG and does not include VECTOR in
> the name. I assume you meant to put
> "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
> 
> This was also intentionally left out on the scalar side [1]. The
> implication here is that having this config will cause people to compile
> kernels without unaligned access support which really shouldn't be
> something we are explicitly supporting.
> 
> If somebody does want to support hardware that does not handle vector
> unaligned accesses, the solution should be to add emulation support to
> the kernel.
> 
> Link: https://lore.kernel.org/all/Zd4y5llkvTfKHf6b@ghost/ [1]
> 
> - Charlie
> 
> > +	bool "Assume the system does not support vector unaligned memory accesses"
> > +	help
> > +	  Assume that the system does not support vector unaligned memory accesses.
> > +	  The kernel and userspace programs may run them successfully on systems
> > +	  that do support vector unaligned memory accesses.
> > +
> > +endchoice
> > +
> >  endmenu # "Platform type"
> >  
> >  menu "Kernel features"
> > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > index 347805446151..d0ea5921ab20 100644
> > --- a/arch/riscv/include/asm/cpufeature.h
> > +++ b/arch/riscv/include/asm/cpufeature.h
> > @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
> >  
> >  void riscv_user_isa_enable(void);
> >  
> > -#if defined(CONFIG_RISCV_MISALIGNED)
> >  bool check_unaligned_access_emulated_all_cpus(void);
> > +#if defined(CONFIG_RISCV_MISALIGNED)
> >  void unaligned_emulation_finish(void);
> >  bool unaligned_ctl_available(void);
> >  DECLARE_PER_CPU(long, misaligned_access_speed);
> > @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
> >  }
> >  #endif
> >  
> > +bool check_vector_unaligned_access_emulated_all_cpus(void);
> > +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> > +DECLARE_PER_CPU(long, vector_misaligned_access);
> > +#endif
> > +
> >  #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> >  DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
> >  
> > diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> > index 2293e535f865..7b32d2b08bb6 100644
> > --- a/arch/riscv/include/asm/entry-common.h
> > +++ b/arch/riscv/include/asm/entry-common.h
> > @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
> >  void handle_page_fault(struct pt_regs *regs);
> >  void handle_break(struct pt_regs *regs);
> >  
> > -#ifdef CONFIG_RISCV_MISALIGNED
> >  int handle_misaligned_load(struct pt_regs *regs);
> >  int handle_misaligned_store(struct pt_regs *regs);
> > -#else
> > -static inline int handle_misaligned_load(struct pt_regs *regs)
> > -{
> > -	return -1;
> > -}
> > -static inline int handle_misaligned_store(struct pt_regs *regs)
> > -{
> > -	return -1;
> > -}
> > -#endif
> >  
> >  #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> > index 150a9877b0af..ef01c182af2b 100644
> > --- a/arch/riscv/include/asm/hwprobe.h
> > +++ b/arch/riscv/include/asm/hwprobe.h
> > @@ -8,7 +8,7 @@
> >  
> >  #include <uapi/asm/hwprobe.h>
> >  
> > -#define RISCV_HWPROBE_MAX_KEY 7
> > +#define RISCV_HWPROBE_MAX_KEY 8
> >  
> >  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
> >  {
> > diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> > index be7d309cca8a..99b0f91db9ee 100644
> > --- a/arch/riscv/include/asm/vector.h
> > +++ b/arch/riscv/include/asm/vector.h
> > @@ -21,6 +21,7 @@
> >  
> >  extern unsigned long riscv_v_vsize;
> >  int riscv_v_setup_vsize(void);
> > +bool insn_is_vector(u32 insn_buf);
> >  bool riscv_v_first_use_handler(struct pt_regs *regs);
> >  void kernel_vector_begin(void);
> >  void kernel_vector_end(void);
> > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> > index 023b7771d1b7..2fee870e41bb 100644
> > --- a/arch/riscv/include/uapi/asm/hwprobe.h
> > +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> > @@ -75,6 +75,11 @@ struct riscv_hwprobe {
> >  #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
> >  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
> >  #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
> > +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
> > +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
> 
> I appreciate you leaving the key for EMULATED open!
> 
> > +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
> > +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
> > +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
> >  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
> >  
> >  /* Flags */
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index 5b243d46f4b1..62ac19c029f1 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -62,8 +62,8 @@ obj-y	+= probes/
> >  obj-y	+= tests/
> >  obj-$(CONFIG_MMU) += vdso.o vdso/
> >  
> > -obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
> > -obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
> > +obj-y	+= traps_misaligned.o
> > +obj-y	+= unaligned_access_speed.o
> >  obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
> >  
> >  obj-$(CONFIG_FPU)		+= fpu.o
> > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > index e910e2971984..c40df314058b 100644
> > --- a/arch/riscv/kernel/sys_hwprobe.c
> > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> >  }
> >  #endif
> >  
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > +{
> > +	int cpu;
> > +	u64 perf = -1ULL;
> > +
> > +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +
> > +	/* Return if supported or not even if speed wasn't probed */
> > +	for_each_cpu(cpu, cpus) {
> > +		int this_perf = per_cpu(vector_misaligned_access, cpu);
> > +
> > +		if (perf == -1ULL)
> > +			perf = this_perf;
> > +
> > +		if (perf != this_perf) {
> > +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (perf == -1ULL)
> > +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +
> > +	return perf;
> > +}
> > +#else
> > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > +{
> > +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +
> > +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +}
> > +#endif
> > +
> >  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >  			     const struct cpumask *cpus)
> >  {
> > @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >  		pair->value = hwprobe_misaligned(cpus);
> >  		break;
> >  
> > +	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> > +		pair->value = hwprobe_vec_misaligned(cpus);
> > +		break;
> > +
> >  	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
> >  		pair->value = 0;
> >  		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> > index 8fadbe00dd62..6f0264a8c9de 100644
> > --- a/arch/riscv/kernel/traps_misaligned.c
> > +++ b/arch/riscv/kernel/traps_misaligned.c
> > @@ -16,6 +16,7 @@
> >  #include <asm/entry-common.h>
> >  #include <asm/hwprobe.h>
> >  #include <asm/cpufeature.h>
> > +#include <asm/vector.h>
> >  
> >  #define INSN_MATCH_LB			0x3
> >  #define INSN_MASK_LB			0x707f
> > @@ -322,12 +323,37 @@ union reg_data {
> >  	u64 data_u64;
> >  };
> >  
> > -static bool unaligned_ctl __read_mostly;
> > -
> >  /* sysctl hooks */
> >  int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
> >  
> > -int handle_misaligned_load(struct pt_regs *regs)
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > +{
> > +	unsigned long epc = regs->epc;
> > +	unsigned long insn;
> > +
> > +	if (get_insn(regs, epc, &insn))
> > +		return -1;
> > +
> > +	/* Only return 0 when in check_vector_unaligned_access_emulated */
> > +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> > +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +		regs->epc = epc + INSN_LEN(insn);
> > +		return 0;
> > +	}
> > +
> > +	/* If vector instruction we don't emulate it yet */
> > +	regs->epc = epc;
> > +	return -1;
> > +}
> > +#else
> > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > +{
> > +	return -1;
> > +}
> > +#endif
> > +
> > +static int handle_scalar_misaligned_load(struct pt_regs *regs)
> >  {
> >  	union reg_data val;
> >  	unsigned long epc = regs->epc;
> > @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
> >  	return 0;
> >  }
> >  
> > -int handle_misaligned_store(struct pt_regs *regs)
> > +static int handle_scalar_misaligned_store(struct pt_regs *regs)
> >  {
> >  	union reg_data val;
> >  	unsigned long epc = regs->epc;
> > @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
> >  	return 0;
> >  }
> >  
> > +int handle_misaligned_load(struct pt_regs *regs)
> > +{
> > +	unsigned long epc = regs->epc;
> > +	unsigned long insn;
> > +
> > +	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> > +		if (get_insn(regs, epc, &insn))
> > +			return -1;
> > +
> > +		if (insn_is_vector(insn))
> > +			return handle_vector_misaligned_load(regs);
> > +	}
> > +
> > +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > +		return handle_scalar_misaligned_load(regs);
> > +
> > +	return -1;
> > +}
> > +
> > +int handle_misaligned_store(struct pt_regs *regs)
> > +{
> > +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > +		return handle_scalar_misaligned_store(regs);
> > +
> > +	return -1;
> > +}
> > +
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> > +{
> > +	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> > +	unsigned long tmp_var;
> > +
> > +	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +
> > +	local_irq_enable();
> > +	kernel_vector_begin();
> > +	__asm__ __volatile__ (
> > +		".balign 4\n\t"
> > +		".option push\n\t"
> > +		".option arch, +zve32x\n\t"
> > +		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
> > +		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
> > +		".option pop\n\t"
> > +		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
> > +	kernel_vector_end();
> > +}
> > +
> > +bool check_vector_unaligned_access_emulated_all_cpus(void)

Hopefully I catch the final things I want to say in this email ;)

> > +{
> > +	int cpu;
> > +	bool ret = true;
> > +
> > +	if (!has_vector()) {
> > +		for_each_online_cpu(cpu)
> > +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +		return false;
> > +	}
> > +
> > +	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
> > +
> > +	for_each_online_cpu(cpu)
> > +		if (per_cpu(vector_misaligned_access, cpu)
> > +		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
> > +			return false;

The default value of vector_misaligned_access is
RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED so when the hardware supports
unaligned accesses this will return false. If the hardware doesn't
support unaligned accesses, then the trap will happen and the kernel
will set this variable to UNSUPPORTED, causing this function to again
return false.

Having the default value be UNKNOWN and checking for UNKNOWN here and in
check_vector_unaligned_access() can remedy this issue.

- Charlie
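
Concretely, that remedy would be along these lines (sketch only): the
per-cpu default becomes UNKNOWN, and the loop rejects only cpus that
actually trapped into the UNSUPPORTED state:

	DEFINE_PER_CPU(long, vector_misaligned_access) =
		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;

	/* ...in check_vector_unaligned_access_emulated_all_cpus(): */
	for_each_online_cpu(cpu)
		if (per_cpu(vector_misaligned_access, cpu)
		    == RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED)
			return false;

	return true;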

> > +
> > +	return ret;
> > +}
> > +#else
> > +bool check_vector_unaligned_access_emulated_all_cpus(void)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +
> > +#ifdef CONFIG_RISCV_MISALIGNED
> > +
> > +static bool unaligned_ctl __read_mostly;
> > +
> >  static void check_unaligned_access_emulated(struct work_struct *unused)
> >  {
> >  	int cpu = smp_processor_id();
> > @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
> >  {
> >  	return unaligned_ctl;
> >  }
> > +#else
> > +bool check_unaligned_access_emulated_all_cpus(void)
> > +{
> > +	return false;
> > +}
> > +#endif
> > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> > index 70c1588fc353..c6106bd4a25a 100644
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > @@ -19,7 +19,8 @@
> >  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> >  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> >  
> > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> >  
> >  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >  static cpumask_t fast_misaligned_access;
> > @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
> >  
> >  	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
> >  		for_each_online_cpu(cpu) {
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> > +#endif
> > +#ifdef CONFIG_RISCV_MISALIGNED
> >  			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> > +#endif
> >  		}
> >  		return 0;
> 
> Since this function returns 0 in both cases, can you wrap the rest of
> the function with an else and remove this early return?
> 
> - Charlie
> 
> >  	}
> >  
> >  	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> > +	check_vector_unaligned_access_emulated_all_cpus();
> >  
> >  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >  	if (!all_cpus_emulated)
> > diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
> > index 682b3feee451..821818886fab 100644
> > --- a/arch/riscv/kernel/vector.c
> > +++ b/arch/riscv/kernel/vector.c
> > @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
> >  #endif
> >  }
> >  
> > -static bool insn_is_vector(u32 insn_buf)
> > +bool insn_is_vector(u32 insn_buf)
> >  {
> >  	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
> >  	u32 width, csr;
> > -- 
> > 2.43.0
> >
Evan Green June 20, 2024, 6:51 p.m. UTC | #8
On Thu, Jun 13, 2024 at 12:17 PM Jesse Taube <jesse@rivosinc.com> wrote:
>
> Run a unaligned vector access to test if the system supports
> vector unaligned access. Add the result to a new key in hwprobe.
> This is useful for usermode to know if vector misaligned accesses are
> supported and if they are faster or slower than equivalent byte accesses.
>
> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> ---
> V1 -> V2:
>  - Add Kconfig options
>  - Add insn_is_vector
>  - Add handle_vector_misaligned_load
>  - Fix build
>  - Seperate vector from scalar misaligned access
>  - This patch was almost completely rewritten
> ---
>  arch/riscv/Kconfig                         |  41 +++++++
>  arch/riscv/include/asm/cpufeature.h        |   7 +-
>  arch/riscv/include/asm/entry-common.h      |  11 --
>  arch/riscv/include/asm/hwprobe.h           |   2 +-
>  arch/riscv/include/asm/vector.h            |   1 +
>  arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
>  arch/riscv/kernel/Makefile                 |   4 +-
>  arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
>  arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
>  arch/riscv/kernel/unaligned_access_speed.c |   9 +-
>  arch/riscv/kernel/vector.c                 |   2 +-
>  11 files changed, 221 insertions(+), 21 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index b94176e25be1..f12df0ca6c18 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
>         help
>           Embed support for emulating misaligned loads and stores.
>
> +config RISCV_VECTOR_MISALIGNED
> +       bool
> +       depends on RISCV_ISA_V
> +       help
> +         Enable detecting support for vector misaligned loads and stores.
> +
>  choice
>         prompt "Unaligned Accesses Support"
>         default RISCV_PROBE_UNALIGNED_ACCESS
> @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
>
>  endchoice
>
> +choice
> +       prompt "Vector unaligned Accesses Support"
> +       depends on RISCV_ISA_V
> +       default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> +       help
> +         This determines the level of support for vector unaligned accesses. This
> +         information is used by the kernel to perform optimizations. It is also
> +         exposed to user space via the hwprobe syscall. The hardware will be
> +         probed at boot by default.
> +
> +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> +       bool "Detect support for vector unaligned accesses"
> +       select RISCV_VECTOR_MISALIGNED
> +       help
> +         During boot, the kernel will detect if the system supports vector
> +         unaligned accesses.
> +
> +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> +       bool "Probe speed of vector unaligned accesses"
> +       select RISCV_VECTOR_MISALIGNED
> +       help
> +         During boot, the kernel will run a series of tests to determine the
> +         speed of vector unaligned accesses if they are supported. This probing
> +         will dynamically determine the speed of vector unaligned accesses on
> +         the underlying system if they are supported.
> +
> +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> +       bool "Assume the system does not support vector unaligned memory accesses"
> +       help
> +         Assume that the system does not support vector unaligned memory accesses.
> +         The kernel and userspace programs may run them successfully on systems
> +         that do support vector unaligned memory accesses.
> +
> +endchoice
> +
>  endmenu # "Platform type"
>
>  menu "Kernel features"
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index 347805446151..d0ea5921ab20 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
>
>  void riscv_user_isa_enable(void);
>
> -#if defined(CONFIG_RISCV_MISALIGNED)
>  bool check_unaligned_access_emulated_all_cpus(void);
> +#if defined(CONFIG_RISCV_MISALIGNED)
>  void unaligned_emulation_finish(void);
>  bool unaligned_ctl_available(void);
>  DECLARE_PER_CPU(long, misaligned_access_speed);
> @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
>  }
>  #endif
>
> +bool check_vector_unaligned_access_emulated_all_cpus(void);
> +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> +DECLARE_PER_CPU(long, vector_misaligned_access);
> +#endif
> +
>  #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>  DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
>
> diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> index 2293e535f865..7b32d2b08bb6 100644
> --- a/arch/riscv/include/asm/entry-common.h
> +++ b/arch/riscv/include/asm/entry-common.h
> @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
>  void handle_page_fault(struct pt_regs *regs);
>  void handle_break(struct pt_regs *regs);
>
> -#ifdef CONFIG_RISCV_MISALIGNED
>  int handle_misaligned_load(struct pt_regs *regs);
>  int handle_misaligned_store(struct pt_regs *regs);
> -#else
> -static inline int handle_misaligned_load(struct pt_regs *regs)
> -{
> -       return -1;
> -}
> -static inline int handle_misaligned_store(struct pt_regs *regs)
> -{
> -       return -1;
> -}
> -#endif
>
>  #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> index 150a9877b0af..ef01c182af2b 100644
> --- a/arch/riscv/include/asm/hwprobe.h
> +++ b/arch/riscv/include/asm/hwprobe.h
> @@ -8,7 +8,7 @@
>
>  #include <uapi/asm/hwprobe.h>
>
> -#define RISCV_HWPROBE_MAX_KEY 7
> +#define RISCV_HWPROBE_MAX_KEY 8
>
>  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>  {
> diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> index be7d309cca8a..99b0f91db9ee 100644
> --- a/arch/riscv/include/asm/vector.h
> +++ b/arch/riscv/include/asm/vector.h
> @@ -21,6 +21,7 @@
>
>  extern unsigned long riscv_v_vsize;
>  int riscv_v_setup_vsize(void);
> +bool insn_is_vector(u32 insn_buf);
>  bool riscv_v_first_use_handler(struct pt_regs *regs);
>  void kernel_vector_begin(void);
>  void kernel_vector_end(void);
> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> index 023b7771d1b7..2fee870e41bb 100644
> --- a/arch/riscv/include/uapi/asm/hwprobe.h
> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> @@ -75,6 +75,11 @@ struct riscv_hwprobe {
>  #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
>  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
>  #define RISCV_HWPROBE_KEY_MISALIGNED_PERF      7
> +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF  8
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4
>  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>
>  /* Flags */
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index 5b243d46f4b1..62ac19c029f1 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -62,8 +62,8 @@ obj-y += probes/
>  obj-y  += tests/
>  obj-$(CONFIG_MMU) += vdso.o vdso/
>
> -obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
> -obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
> +obj-y  += traps_misaligned.o
> +obj-y  += unaligned_access_speed.o
>  obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)     += copy-unaligned.o
>
>  obj-$(CONFIG_FPU)              += fpu.o
> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> index e910e2971984..c40df314058b 100644
> --- a/arch/riscv/kernel/sys_hwprobe.c
> +++ b/arch/riscv/kernel/sys_hwprobe.c
> @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>  }
>  #endif
>
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +       int cpu;
> +       u64 perf = -1ULL;
> +
> +       if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> +               return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +
> +       /* Return if supported or not even if speed wasn't probed */
> +       for_each_cpu(cpu, cpus) {
> +               int this_perf = per_cpu(vector_misaligned_access, cpu);
> +
> +               if (perf == -1ULL)
> +                       perf = this_perf;
> +
> +               if (perf != this_perf) {
> +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +                       break;
> +               }
> +       }
> +
> +       if (perf == -1ULL)
> +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +       return perf;
> +}
> +#else
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +       if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> +               return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +
> +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +}
> +#endif
> +
>  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>                              const struct cpumask *cpus)
>  {
> @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>                 pair->value = hwprobe_misaligned(cpus);
>                 break;
>
> +       case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> +               pair->value = hwprobe_vec_misaligned(cpus);
> +               break;
> +
>         case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>                 pair->value = 0;
>                 if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> index 8fadbe00dd62..6f0264a8c9de 100644
> --- a/arch/riscv/kernel/traps_misaligned.c
> +++ b/arch/riscv/kernel/traps_misaligned.c
> @@ -16,6 +16,7 @@
>  #include <asm/entry-common.h>
>  #include <asm/hwprobe.h>
>  #include <asm/cpufeature.h>
> +#include <asm/vector.h>
>
>  #define INSN_MATCH_LB                  0x3
>  #define INSN_MASK_LB                   0x707f
> @@ -322,12 +323,37 @@ union reg_data {
>         u64 data_u64;
>  };
>
> -static bool unaligned_ctl __read_mostly;
> -
>  /* sysctl hooks */
>  int unaligned_enabled __read_mostly = 1;       /* Enabled by default */
>
> -int handle_misaligned_load(struct pt_regs *regs)
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static int handle_vector_misaligned_load(struct pt_regs *regs)
> +{
> +       unsigned long epc = regs->epc;
> +       unsigned long insn;
> +
> +       if (get_insn(regs, epc, &insn))
> +               return -1;
> +
> +       /* Only return 0 when in check_vector_unaligned_access_emulated */
> +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +               regs->epc = epc + INSN_LEN(insn);
> +               return 0;
> +       }
> +
> +       /* If vector instruction we don't emulate it yet */
> +       regs->epc = epc;
> +       return -1;
> +}
> +#else
> +static int handle_vector_misaligned_load(struct pt_regs *regs)
> +{
> +       return -1;
> +}
> +#endif
> +
> +static int handle_scalar_misaligned_load(struct pt_regs *regs)
>  {
>         union reg_data val;
>         unsigned long epc = regs->epc;
> @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
>         return 0;
>  }
>
> -int handle_misaligned_store(struct pt_regs *regs)
> +static int handle_scalar_misaligned_store(struct pt_regs *regs)
>  {
>         union reg_data val;
>         unsigned long epc = regs->epc;
> @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
>         return 0;
>  }
>
> +int handle_misaligned_load(struct pt_regs *regs)
> +{
> +       unsigned long epc = regs->epc;
> +       unsigned long insn;
> +
> +       if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> +               if (get_insn(regs, epc, &insn))
> +                       return -1;
> +
> +               if (insn_is_vector(insn))
> +                       return handle_vector_misaligned_load(regs);
> +       }
> +
> +       if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> +               return handle_scalar_misaligned_load(regs);
> +
> +       return -1;
> +}
> +
> +int handle_misaligned_store(struct pt_regs *regs)
> +{
> +       if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> +               return handle_scalar_misaligned_store(regs);
> +
> +       return -1;
> +}
> +
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> +{
> +       long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> +       unsigned long tmp_var;
> +
> +       *mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +       local_irq_enable();

Generally if a function is called with interrupts disabled there's a
reason for it, like the system will crash if an interrupt fires during
execution of this region. I haven't researched this, but to feel
comfortable I'd want to know why interrupts were disabled on entry
here, why it's safe to enable them now, and why it's safe to return
from the function with them still enabled.

I'm guessing this was added because may_use_simd() was blowing up for
you without it. If that's the case, I think we'll need to reconcile
that in a different way. From a quick glance at kernel_mode_vector.c,
I was originally thinking may_use_simd() enforces this because there's
only a single instance of current->thread.kernel_vstate. So if you
tried to start a v context from an interrupt, you may land on top of
another kernel user, and there's nowhere to save that kernel user's
old state. But there does seem to be some support for nested V context
associated with CONFIG_RISCV_ISA_V_PREEMPTIVE, so I'm a little
confused. It seems like your options are to try and get running on
each CPU in a different manner (such that you're not stuck in irq
context by the time you first run), or try and dive into the kernel V
context code to enable support for starting a V context in irq land.
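
To illustrate the distinction (rough, untested sketch; the callback
names here are placeholders, and the two APIs take different callback
types):

	/* IPI path: the callback runs with irqs disabled, which is the
	 * problematic context for starting a kernel V context. */
	on_each_cpu(probe_ipi_cb, NULL, 1);

	/* Workqueue path: the callback runs in a kworker, in process
	 * context with irqs enabled, so kernel_vector_begin() should be
	 * usable without touching the irq state. */
	schedule_on_each_cpu(probe_work_cb);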


> +       kernel_vector_begin();
> +       __asm__ __volatile__ (
> +               ".balign 4\n\t"
> +               ".option push\n\t"
> +               ".option arch, +zve32x\n\t"
> +               "       vsetivli zero, 1, e16, m1, ta, ma\n\t"  // Vectors of 16b
> +               "       vle16.v v0, (%[ptr])\n\t"               // Load bytes
> +               ".option pop\n\t"
> +               : : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
> +       kernel_vector_end();
> +}
> +
> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> +{
> +       int cpu;
> +       bool ret = true;
> +
> +       if (!has_vector()) {
> +               for_each_online_cpu(cpu)
> +                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +               return false;
> +       }
> +
> +       schedule_on_each_cpu(check_vector_unaligned_access_emulated);
> +
> +       for_each_online_cpu(cpu)
> +               if (per_cpu(vector_misaligned_access, cpu)
> +                   != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
> +                       return false;
> +
> +       return ret;
> +}
> +#else
> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> +{
> +       return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_RISCV_MISALIGNED
> +
> +static bool unaligned_ctl __read_mostly;
> +
>  static void check_unaligned_access_emulated(struct work_struct *unused)
>  {
>         int cpu = smp_processor_id();
> @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
>  {
>         return unaligned_ctl;
>  }
> +#else
> +bool check_unaligned_access_emulated_all_cpus(void)
> +{
> +       return false;
> +}
> +#endif
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index 70c1588fc353..c6106bd4a25a 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -19,7 +19,8 @@
>  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>
> -DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  static cpumask_t fast_misaligned_access;
> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>
>         if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>                 for_each_online_cpu(cpu) {
> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> +                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> +#endif
> +#ifdef CONFIG_RISCV_MISALIGNED
>                         per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> +#endif
>                 }
>                 return 0;
>         }
>
>         all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> +       check_vector_unaligned_access_emulated_all_cpus();
>
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>         if (!all_cpus_emulated)
> diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
> index 682b3feee451..821818886fab 100644
> --- a/arch/riscv/kernel/vector.c
> +++ b/arch/riscv/kernel/vector.c
> @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
>  #endif
>  }
>
> -static bool insn_is_vector(u32 insn_buf)
> +bool insn_is_vector(u32 insn_buf)
>  {
>         u32 opcode = insn_buf & __INSN_OPCODE_MASK;
>         u32 width, csr;
> --
> 2.43.0
>
Jesse Taube June 20, 2024, 9:31 p.m. UTC | #9
On 6/17/24 22:09, Charlie Jenkins wrote:
> On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
>> On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
>>> Run a unaligned vector access to test if the system supports
>>> vector unaligned access. Add the result to a new key in hwprobe.
>>> This is useful for usermode to know if vector misaligned accesses are
>>> supported and if they are faster or slower than equivalent byte accesses.
>>>
>>> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
>>> ---
>>> V1 -> V2:
>>>   - Add Kconfig options
>>>   - Add insn_is_vector
>>>   - Add handle_vector_misaligned_load
>>>   - Fix build
>>>   - Seperate vector from scalar misaligned access
>>>   - This patch was almost completely rewritten
>>> ---
>>>   arch/riscv/Kconfig                         |  41 +++++++
>>>   arch/riscv/include/asm/cpufeature.h        |   7 +-
>>>   arch/riscv/include/asm/entry-common.h      |  11 --
>>>   arch/riscv/include/asm/hwprobe.h           |   2 +-
>>>   arch/riscv/include/asm/vector.h            |   1 +
>>>   arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
>>>   arch/riscv/kernel/Makefile                 |   4 +-
>>>   arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
>>>   arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
>>>   arch/riscv/kernel/unaligned_access_speed.c |   9 +-
>>>   arch/riscv/kernel/vector.c                 |   2 +-
>>>   11 files changed, 221 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index b94176e25be1..f12df0ca6c18 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
>>>   	help
>>>   	  Embed support for emulating misaligned loads and stores.
>>>   
>>> +config RISCV_VECTOR_MISALIGNED
>>> +	bool
>>> +	depends on RISCV_ISA_V
>>> +	help
>>> +	  Enable detecting support for vector misaligned loads and stores.
>>> +
>>>   choice
>>>   	prompt "Unaligned Accesses Support"
>>>   	default RISCV_PROBE_UNALIGNED_ACCESS
>>> @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
>>>   
>>>   endchoice
>>>   
>>> +choice
>>> +	prompt "Vector unaligned Accesses Support"
>>> +	depends on RISCV_ISA_V
>>> +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>>> +	help
>>> +	  This determines the level of support for vector unaligned accesses. This
>>> +	  information is used by the kernel to perform optimizations. It is also
>>> +	  exposed to user space via the hwprobe syscall. The hardware will be
>>> +	  probed at boot by default.
>>> +
>>> +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
>>
>> This is not used anywhere, what is the reason for including it?

This is so that we can check if they are supported or not, without
probing their speed, similar to RISCV_EMULATED_UNALIGNED_ACCESS.

>>
>>> +	bool "Detect support for vector unaligned accesses"
>>> +	select RISCV_VECTOR_MISALIGNED
>>> +	help
>>> +	  During boot, the kernel will detect if the system supports vector
>>> +	  unaligned accesses.
>>> +
>>> +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>>> +	bool "Probe speed of vector unaligned accesses"
>>> +	select RISCV_VECTOR_MISALIGNED
>>> +	help
>>> +	  During boot, the kernel will run a series of tests to determine the
>>> +	  speed of vector unaligned accesses if they are supported. This probing
>>> +	  will dynamically determine the speed of vector unaligned accesses on
>>> +	  the underlying system if they are supported.
>>> +
>>> +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
>>
>> This should not be prefixed with CONFIG and does not include VECTOR in
>> the name.

Huh thought it would warn fixed though

> I assume you meant to put
>> "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?

This is to leave a faster path, like SLOW or FAST, to say that unaligned
accesses aren't supported.

>>
>> This was also intentionally left out on the scalar side [1]. The
>> implication here is that having this config will cause people to compile
>> kernels without unaligned access support which really shouldn't be
>> something we are explicitly supporting.
>>
>> If somebody does want to support hardware that does not handle vector
>> unaligned accesses, the solution should be to add emulation support to
>> the kernel.

Yes, but we don't have emulation support yet, so I do think it's a good idea.

>>
>> Link: https://lore.kernel.org/all/Zd4y5llkvTfKHf6b@ghost/ [1]
>>
>> - Charlie
>>
>>> +	bool "Assume the system does not support vector unaligned memory accesses"
>>> +	help
>>> +	  Assume that the system does not support vector unaligned memory accesses.
>>> +	  The kernel and userspace programs may run them successfully on systems
>>> +	  that do support vector unaligned memory accesses.
>>> +
>>> +endchoice
>>> +
>>>   endmenu # "Platform type"
>>>   
>>>   menu "Kernel features"
>>> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
>>> index 347805446151..d0ea5921ab20 100644
>>> --- a/arch/riscv/include/asm/cpufeature.h
>>> +++ b/arch/riscv/include/asm/cpufeature.h
>>> @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
>>>   
>>>   void riscv_user_isa_enable(void);
>>>   
>>> -#if defined(CONFIG_RISCV_MISALIGNED)
>>>   bool check_unaligned_access_emulated_all_cpus(void);
>>> +#if defined(CONFIG_RISCV_MISALIGNED)
>>>   void unaligned_emulation_finish(void);
>>>   bool unaligned_ctl_available(void);
>>>   DECLARE_PER_CPU(long, misaligned_access_speed);
>>> @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
>>>   }
>>>   #endif
>>>   
>>> +bool check_vector_unaligned_access_emulated_all_cpus(void);
>>> +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
>>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>> +#endif
>>> +
>>>   #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>>>   DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
>>>   
>>> diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
>>> index 2293e535f865..7b32d2b08bb6 100644
>>> --- a/arch/riscv/include/asm/entry-common.h
>>> +++ b/arch/riscv/include/asm/entry-common.h
>>> @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
>>>   void handle_page_fault(struct pt_regs *regs);
>>>   void handle_break(struct pt_regs *regs);
>>>   
>>> -#ifdef CONFIG_RISCV_MISALIGNED
>>>   int handle_misaligned_load(struct pt_regs *regs);
>>>   int handle_misaligned_store(struct pt_regs *regs);
>>> -#else
>>> -static inline int handle_misaligned_load(struct pt_regs *regs)
>>> -{
>>> -	return -1;
>>> -}
>>> -static inline int handle_misaligned_store(struct pt_regs *regs)
>>> -{
>>> -	return -1;
>>> -}
>>> -#endif
>>>   
>>>   #endif /* _ASM_RISCV_ENTRY_COMMON_H */
>>> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
>>> index 150a9877b0af..ef01c182af2b 100644
>>> --- a/arch/riscv/include/asm/hwprobe.h
>>> +++ b/arch/riscv/include/asm/hwprobe.h
>>> @@ -8,7 +8,7 @@
>>>   
>>>   #include <uapi/asm/hwprobe.h>
>>>   
>>> -#define RISCV_HWPROBE_MAX_KEY 7
>>> +#define RISCV_HWPROBE_MAX_KEY 8
>>>   
>>>   static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>>   {
>>> diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
>>> index be7d309cca8a..99b0f91db9ee 100644
>>> --- a/arch/riscv/include/asm/vector.h
>>> +++ b/arch/riscv/include/asm/vector.h
>>> @@ -21,6 +21,7 @@
>>>   
>>>   extern unsigned long riscv_v_vsize;
>>>   int riscv_v_setup_vsize(void);
>>> +bool insn_is_vector(u32 insn_buf);
>>>   bool riscv_v_first_use_handler(struct pt_regs *regs);
>>>   void kernel_vector_begin(void);
>>>   void kernel_vector_end(void);
>>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
>>> index 023b7771d1b7..2fee870e41bb 100644
>>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>>> @@ -75,6 +75,11 @@ struct riscv_hwprobe {
>>>   #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
>>>   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
>>>   #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
>>> +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
>>
>> I appreciate you leaving the key for EMULATED open!
>>
>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
>>>   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>>   
>>>   /* Flags */
>>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>>> index 5b243d46f4b1..62ac19c029f1 100644
>>> --- a/arch/riscv/kernel/Makefile
>>> +++ b/arch/riscv/kernel/Makefile
>>> @@ -62,8 +62,8 @@ obj-y	+= probes/
>>>   obj-y	+= tests/
>>>   obj-$(CONFIG_MMU) += vdso.o vdso/
>>>   
>>> -obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
>>> -obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
>>> +obj-y	+= traps_misaligned.o
>>> +obj-y	+= unaligned_access_speed.o

 > These files only need to be compiled if either CONFIG_RISCV_MISALIGNED
 > or CONFIG_RISCV_VECTOR_MISALIGNED is selected. Can you refactor this
 > code to replace CONFIG_RISCV_MISALIGNED with
 > CONFIG_RISCV_SCALAR_MISALIGNED and then have
 > CONFIG_RISCV_SCALAR_MISALIGNED and CONFIG_RISCV_VECTOR_MISALIGNED
 > select CONFIG_RISCV_MISALIGNED in the Kconfig?

Fixed!
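
The v3 shape will be roughly (sketch):

config RISCV_MISALIGNED
	bool

config RISCV_SCALAR_MISALIGNED
	bool
	select RISCV_MISALIGNED

config RISCV_VECTOR_MISALIGNED
	bool
	select RISCV_MISALIGNED
	depends on RISCV_ISA_V

with the two Makefile entries guarded by CONFIG_RISCV_MISALIGNED again
instead of obj-y.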

>>>   obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
>>>   
>>>   obj-$(CONFIG_FPU)		+= fpu.o
>>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>>> index e910e2971984..c40df314058b 100644
>>> --- a/arch/riscv/kernel/sys_hwprobe.c
>>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>>> @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>>>   }
>>>   #endif
>>>   
>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>>> +{
>>> +	int cpu;
>>> +	u64 perf = -1ULL;
>>> +
>>> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
>>> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>> +
>>> +	/* Return if supported or not even if speed wasn't probed */
>>> +	for_each_cpu(cpu, cpus) {
>>> +		int this_perf = per_cpu(vector_misaligned_access, cpu);
>>> +
>>> +		if (perf == -1ULL)
>>> +			perf = this_perf;
>>> +
>>> +		if (perf != this_perf) {
>>> +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> +			break;
>>> +		}
>>> +	}
>>> +
>>> +	if (perf == -1ULL)
>>> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> +
>>> +	return perf;
>>> +}
>>> +#else
>>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>>> +{
>>> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
>>> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>> +
>>> +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> +}
>>> +#endif
>>> +
>>>   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>>   			     const struct cpumask *cpus)
>>>   {
>>> @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>>   		pair->value = hwprobe_misaligned(cpus);
>>>   		break;
>>>   
>>> +	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
>>> +		pair->value = hwprobe_vec_misaligned(cpus);
>>> +		break;
>>> +
>>>   	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>>   		pair->value = 0;
>>>   		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>>> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
>>> index 8fadbe00dd62..6f0264a8c9de 100644
>>> --- a/arch/riscv/kernel/traps_misaligned.c
>>> +++ b/arch/riscv/kernel/traps_misaligned.c
>>> @@ -16,6 +16,7 @@
>>>   #include <asm/entry-common.h>
>>>   #include <asm/hwprobe.h>
>>>   #include <asm/cpufeature.h>
>>> +#include <asm/vector.h>
>>>   
>>>   #define INSN_MATCH_LB			0x3
>>>   #define INSN_MASK_LB			0x707f
>>> @@ -322,12 +323,37 @@ union reg_data {
>>>   	u64 data_u64;
>>>   };
>>>   
>>> -static bool unaligned_ctl __read_mostly;
>>> -
>>>   /* sysctl hooks */
>>>   int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
>>>   
>>> -int handle_misaligned_load(struct pt_regs *regs)
>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>> +static int handle_vector_misaligned_load(struct pt_regs *regs)
>>> +{
>>> +	unsigned long epc = regs->epc;
>>> +	unsigned long insn;
>>> +
>>> +	if (get_insn(regs, epc, &insn))
>>> +		return -1;
>>> +
>>> +	/* Only return 0 when in check_vector_unaligned_access_emulated */
>>> +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
>>> +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>> +		regs->epc = epc + INSN_LEN(insn);
>>> +		return 0;
>>> +	}
>>> +
>>> +	/* If vector instruction we don't emulate it yet */
>>> +	regs->epc = epc;
>>> +	return -1;
>>> +}
>>> +#else
>>> +static int handle_vector_misaligned_load(struct pt_regs *regs)
>>> +{
>>> +	return -1;
>>> +}
>>> +#endif
>>> +
>>> +static int handle_scalar_misaligned_load(struct pt_regs *regs)
>>>   {
>>>   	union reg_data val;
>>>   	unsigned long epc = regs->epc;
>>> @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
>>>   	return 0;
>>>   }
>>>   
>>> -int handle_misaligned_store(struct pt_regs *regs)
>>> +static int handle_scalar_misaligned_store(struct pt_regs *regs)
>>>   {
>>>   	union reg_data val;
>>>   	unsigned long epc = regs->epc;
>>> @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
>>>   	return 0;
>>>   }
>>>   
>>> +int handle_misaligned_load(struct pt_regs *regs)
>>> +{
>>> +	unsigned long epc = regs->epc;
>>> +	unsigned long insn;
>>> +
>>> +	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
>>> +		if (get_insn(regs, epc, &insn))
>>> +			return -1;
>>> +
>>> +		if (insn_is_vector(insn))
>>> +			return handle_vector_misaligned_load(regs);
>>> +	}
>>> +
>>> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
>>> +		return handle_scalar_misaligned_load(regs);
>>> +
>>> +	return -1;
>>> +}
>>> +
>>> +int handle_misaligned_store(struct pt_regs *regs)
>>> +{
>>> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
>>> +		return handle_scalar_misaligned_store(regs);
>>> +
>>> +	return -1;
>>> +}
>>> +
>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>> +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
>>> +{
>>> +	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
>>> +	unsigned long tmp_var;
>>> +
>>> +	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>> +
>>> +	local_irq_enable();
@Evan Green I forgot to remove this from when I was using
smp_call_on_cpu and encountered the problem you described.
They can be removed.

>>> +	kernel_vector_begin();
>>> +	__asm__ __volatile__ (
>>> +		".balign 4\n\t"
>>> +		".option push\n\t"
>>> +		".option arch, +zve32x\n\t"
>>> +		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
>>> +		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
>>> +		".option pop\n\t"
>>> +		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
>>> +	kernel_vector_end();
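
The plan for v3 is to add the following right after kernel_vector_end()
(the trap handler will already have set UNSUPPORTED if the access
trapped, so a value still at UNKNOWN means the hardware handled it):

	/* No trap fired: hardware handled the access; default to SLOW */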
	if (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
		*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;

>>> +}
>>> +
>>> +bool check_vector_unaligned_access_emulated_all_cpus(void)
> 
> Hopefully I catch the final things I want to say in this email ;)
> 
>>> +{
>>> +	int cpu;
>>> +	bool ret = true;
>>> +
>>> +	if (!has_vector()) {
>>> +		for_each_online_cpu(cpu)
>>> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>> +		return false;
>>> +	}
>>> +
>>> +	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
>>> +
>>> +	for_each_online_cpu(cpu)
>>> +		if (per_cpu(vector_misaligned_access, cpu)
>>> +		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
>>> +			return false;
> 
> The default value of vector_misaligned_access is
> RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED so when the hardware supports
> unaligned accesses this will return false. If the hardware doesn't
> support unaligned accesses, then the trap will happen and the kernel
> will set this variable to UNSUPPORTED, causing this function to again
> return false.
> 
> Having the default value be UNKNOWN and checking for UNKNOWN here and in
> check_vector_unaligned_access() can remedy this issue.

I meant to set it to SLOW in check_vector_unaligned_access_emulated like 
above.

> - Charlie
> 
>>> +
>>> +	return ret;
>>> +}
>>> +#else
>>> +bool check_vector_unaligned_access_emulated_all_cpus(void)
>>> +{
>>> +	return false;
>>> +}
>>> +#endif
>>> +
>>> +#ifdef CONFIG_RISCV_MISALIGNED
>>> +
>>> +static bool unaligned_ctl __read_mostly;
>>> +
>>>   static void check_unaligned_access_emulated(struct work_struct *unused)
>>>   {
>>>   	int cpu = smp_processor_id();
>>> @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
>>>   {
>>>   	return unaligned_ctl;
>>>   }
>>> +#else
>>> +bool check_unaligned_access_emulated_all_cpus(void)
>>> +{
>>> +	return false;
>>> +}
>>> +#endif
>>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>>> index 70c1588fc353..c6106bd4a25a 100644
>>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>>> @@ -19,7 +19,8 @@
>>>   #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>>>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>>   
>>> -DEFINE_PER_CPU(long, misaligned_access_speed);
>>> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
>>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>   
>>>   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>>   static cpumask_t fast_misaligned_access;
>>> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>>>   
>>>   	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>>>   		for_each_online_cpu(cpu) {
>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>>> +#endif
>>> +#ifdef CONFIG_RISCV_MISALIGNED
>>>   			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
>>> +#endif
>>>   		}
>>>   		return 0;
>>
>> Since this function returns 0 in both cases, can you wrap the rest of
>> the function with an else and remove this early return?

I think it's more readable in a guard clause style.


Thanks,
Jesse Taube
>> - Charlie
>>
>>>   	}
>>>   
>>>   	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>>> +	check_vector_unaligned_access_emulated_all_cpus();
>>>   
>>>   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>>   	if (!all_cpus_emulated)
>>> diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
>>> index 682b3feee451..821818886fab 100644
>>> --- a/arch/riscv/kernel/vector.c
>>> +++ b/arch/riscv/kernel/vector.c
>>> @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
>>>   #endif
>>>   }
>>>   
>>> -static bool insn_is_vector(u32 insn_buf)
>>> +bool insn_is_vector(u32 insn_buf)
>>>   {
>>>   	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
>>>   	u32 width, csr;
>>> -- 
>>> 2.43.0
>>>
Charlie Jenkins June 20, 2024, 10:14 p.m. UTC | #10
On Thu, Jun 20, 2024 at 05:31:28PM -0400, Jesse Taube wrote:
> 
> 
> On 6/17/24 22:09, Charlie Jenkins wrote:
> > On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
> > > On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > > > Run a unaligned vector access to test if the system supports
> > > > vector unaligned access. Add the result to a new key in hwprobe.
> > > > This is useful for usermode to know if vector misaligned accesses are
> > > > supported and if they are faster or slower than equivalent byte accesses.
> > > > 
> > > > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > > > ---
> > > > V1 -> V2:
> > > >   - Add Kconfig options
> > > >   - Add insn_is_vector
> > > >   - Add handle_vector_misaligned_load
> > > >   - Fix build
> > > >   - Seperate vector from scalar misaligned access
> > > >   - This patch was almost completely rewritten
> > > > ---
> > > >   arch/riscv/Kconfig                         |  41 +++++++
> > > >   arch/riscv/include/asm/cpufeature.h        |   7 +-
> > > >   arch/riscv/include/asm/entry-common.h      |  11 --
> > > >   arch/riscv/include/asm/hwprobe.h           |   2 +-
> > > >   arch/riscv/include/asm/vector.h            |   1 +
> > > >   arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> > > >   arch/riscv/kernel/Makefile                 |   4 +-
> > > >   arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> > > >   arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> > > >   arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> > > >   arch/riscv/kernel/vector.c                 |   2 +-
> > > >   11 files changed, 221 insertions(+), 21 deletions(-)
> > > > 
> > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > index b94176e25be1..f12df0ca6c18 100644
> > > > --- a/arch/riscv/Kconfig
> > > > +++ b/arch/riscv/Kconfig
> > > > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> > > >   	help
> > > >   	  Embed support for emulating misaligned loads and stores.
> > > > +config RISCV_VECTOR_MISALIGNED
> > > > +	bool
> > > > +	depends on RISCV_ISA_V
> > > > +	help
> > > > +	  Enable detecting support for vector misaligned loads and stores.
> > > > +
> > > >   choice
> > > >   	prompt "Unaligned Accesses Support"
> > > >   	default RISCV_PROBE_UNALIGNED_ACCESS
> > > > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> > > >   endchoice
> > > > +choice
> > > > +	prompt "Vector unaligned Accesses Support"
> > > > +	depends on RISCV_ISA_V
> > > > +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > +	help
> > > > +	  This determines the level of support for vector unaligned accesses. This
> > > > +	  information is used by the kernel to perform optimizations. It is also
> > > > +	  exposed to user space via the hwprobe syscall. The hardware will be
> > > > +	  probed at boot by default.
> > > > +
> > > > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> > > 
> > > This is not used anywhere, what is the reason for including it?
> 
> This is so that we can check if they are supported or not, without probing
> their speed, similar to RISCV_EMULATED_UNALIGNED_ACCESS.

What do you mean? It isn't used anywhere, so this "check if they are
supported or not" is not guarded by this config.

> 
> > > 
> > > > +	bool "Detect support for vector unaligned accesses"
> > > > +	select RISCV_VECTOR_MISALIGNED
> > > > +	help
> > > > +	  During boot, the kernel will detect if the system supports vector
> > > > +	  unaligned accesses.
> > > > +
> > > > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > +	bool "Probe speed of vector unaligned accesses"
> > > > +	select RISCV_VECTOR_MISALIGNED
> > > > +	help
> > > > +	  During boot, the kernel will run a series of tests to determine the
> > > > +	  speed of vector unaligned accesses if they are supported. This probing
> > > > +	  will dynamically determine the speed of vector unaligned accesses on
> > > > +	  the underlying system if they are supported.
> > > > +
> > > > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> > > 
> > > This should not be prefixed with CONFIG and does not include VECTOR in
> > > the name.
> 
> Huh thought it would warn fixed though

What do you mean by "warn fixed"?

> 
> > I assume you meant to put
> > > "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
> 
> This is to leave a faster path, like SLOW or FAST, to say that unaligned
> accesses aren't supported.

I am not sure what you are responding to. This comment seems to be
responding to my correction of
CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED->RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
so I don't see how that ties into SLOW/FAST.
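
i.e. the correction was just:

-config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
+config RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED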

> 
> > > 
> > > This was also intentionally left out on the scalar side [1]. The
> > > implication here is that having this config will cause people to compile
> > > kernels without unaligned access support which really shouldn't be
> > > something we are explicitly supporting.
> > > 
> > > If somebody does want to support hardware that does not handle vector
> > > unaligned accesses, the solution should be to add emulation support to
> > > the kernel.
> 
> Yes, but we don't have emulation support yet, so I do think it's a good idea.

I am hesitant because it is very likely that somebody will add support
for unaligned vector emulation. When there is emulation support, this
config option should not exist, to be consistent with scalar. However, if
we add this option now, we must expect a user to enable this config,
and then we will have to get rid of it later. Users are not always happy
when config options are removed.

> 
> > > 
> > > Link: https://lore.kernel.org/all/Zd4y5llkvTfKHf6b@ghost/ [1]
> > > 
> > > - Charlie
> > > 
> > > > +	bool "Assume the system does not support vector unaligned memory accesses"
> > > > +	help
> > > > +	  Assume that the system does not support vector unaligned memory accesses.
> > > > +	  The kernel and userspace programs may run them successfully on systems
> > > > +	  that do support vector unaligned memory accesses.
> > > > +
> > > > +endchoice
> > > > +
> > > >   endmenu # "Platform type"
> > > >   menu "Kernel features"
> > > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > > > index 347805446151..d0ea5921ab20 100644
> > > > --- a/arch/riscv/include/asm/cpufeature.h
> > > > +++ b/arch/riscv/include/asm/cpufeature.h
> > > > @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
> > > >   void riscv_user_isa_enable(void);
> > > > -#if defined(CONFIG_RISCV_MISALIGNED)
> > > >   bool check_unaligned_access_emulated_all_cpus(void);
> > > > +#if defined(CONFIG_RISCV_MISALIGNED)
> > > >   void unaligned_emulation_finish(void);
> > > >   bool unaligned_ctl_available(void);
> > > >   DECLARE_PER_CPU(long, misaligned_access_speed);
> > > > @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
> > > >   }
> > > >   #endif
> > > > +bool check_vector_unaligned_access_emulated_all_cpus(void);
> > > > +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> > > > +DECLARE_PER_CPU(long, vector_misaligned_access);
> > > > +#endif
> > > > +
> > > >   #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> > > >   DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
> > > > diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> > > > index 2293e535f865..7b32d2b08bb6 100644
> > > > --- a/arch/riscv/include/asm/entry-common.h
> > > > +++ b/arch/riscv/include/asm/entry-common.h
> > > > @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
> > > >   void handle_page_fault(struct pt_regs *regs);
> > > >   void handle_break(struct pt_regs *regs);
> > > > -#ifdef CONFIG_RISCV_MISALIGNED
> > > >   int handle_misaligned_load(struct pt_regs *regs);
> > > >   int handle_misaligned_store(struct pt_regs *regs);
> > > > -#else
> > > > -static inline int handle_misaligned_load(struct pt_regs *regs)
> > > > -{
> > > > -	return -1;
> > > > -}
> > > > -static inline int handle_misaligned_store(struct pt_regs *regs)
> > > > -{
> > > > -	return -1;
> > > > -}
> > > > -#endif
> > > >   #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> > > > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> > > > index 150a9877b0af..ef01c182af2b 100644
> > > > --- a/arch/riscv/include/asm/hwprobe.h
> > > > +++ b/arch/riscv/include/asm/hwprobe.h
> > > > @@ -8,7 +8,7 @@
> > > >   #include <uapi/asm/hwprobe.h>
> > > > -#define RISCV_HWPROBE_MAX_KEY 7
> > > > +#define RISCV_HWPROBE_MAX_KEY 8
> > > >   static inline bool riscv_hwprobe_key_is_valid(__s64 key)
> > > >   {
> > > > diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> > > > index be7d309cca8a..99b0f91db9ee 100644
> > > > --- a/arch/riscv/include/asm/vector.h
> > > > +++ b/arch/riscv/include/asm/vector.h
> > > > @@ -21,6 +21,7 @@
> > > >   extern unsigned long riscv_v_vsize;
> > > >   int riscv_v_setup_vsize(void);
> > > > +bool insn_is_vector(u32 insn_buf);
> > > >   bool riscv_v_first_use_handler(struct pt_regs *regs);
> > > >   void kernel_vector_begin(void);
> > > >   void kernel_vector_end(void);
> > > > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> > > > index 023b7771d1b7..2fee870e41bb 100644
> > > > --- a/arch/riscv/include/uapi/asm/hwprobe.h
> > > > +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> > > > @@ -75,6 +75,11 @@ struct riscv_hwprobe {
> > > >   #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
> > > >   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
> > > >   #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
> > > > +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
> > > > +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
> > > 
> > > I appreciate you leaving the key for EMULATED open!
> > > 
> > > > +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
> > > > +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
> > > > +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
> > > >   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
> > > >   /* Flags */
> > > > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > > > index 5b243d46f4b1..62ac19c029f1 100644
> > > > --- a/arch/riscv/kernel/Makefile
> > > > +++ b/arch/riscv/kernel/Makefile
> > > > @@ -62,8 +62,8 @@ obj-y	+= probes/
> > > >   obj-y	+= tests/
> > > >   obj-$(CONFIG_MMU) += vdso.o vdso/
> > > > -obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
> > > > -obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
> > > > +obj-y	+= traps_misaligned.o
> > > > +obj-y	+= unaligned_access_speed.o
> 
> > These files only need to be compiled if either CONFIG_RISCV_MISALIGNED
> > or CONFIG_RISCV_VECTOR_MISALIGNED is selected. Can you refactor this
> > code to replace CONFIG_RISCV_MISALIGNED with
> > CONFIG_RISCV_SCALAR_MISALIGNED and then have
> > CONFIG_RISCV_SCALAR_MISALIGNED and CONFIG_RISCV_VECTOR_MISALIGNED
> > select CONFIG_RISCV_MISALIGNED in the Kconfig?
> 
> Fixed!
> 
> > > >   obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
> > > >   obj-$(CONFIG_FPU)		+= fpu.o
> > > > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > > > index e910e2971984..c40df314058b 100644
> > > > --- a/arch/riscv/kernel/sys_hwprobe.c
> > > > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > > > @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> > > >   }
> > > >   #endif
> > > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > > > +{
> > > > +	int cpu;
> > > > +	u64 perf = -1ULL;
> > > > +
> > > > +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > > > +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > > +
> > > > +	/* Return if supported or not even if speed wasn't probed */
> > > > +	for_each_cpu(cpu, cpus) {
> > > > +		int this_perf = per_cpu(vector_misaligned_access, cpu);
> > > > +
> > > > +		if (perf == -1ULL)
> > > > +			perf = this_perf;
> > > > +
> > > > +		if (perf != this_perf) {
> > > > +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	if (perf == -1ULL)
> > > > +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > > +
> > > > +	return perf;
> > > > +}
> > > > +#else
> > > > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > > > +{
> > > > +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > > > +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > > +
> > > > +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > > +}
> > > > +#endif
> > > > +
> > > >   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> > > >   			     const struct cpumask *cpus)
> > > >   {
> > > > @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> > > >   		pair->value = hwprobe_misaligned(cpus);
> > > >   		break;
> > > > +	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> > > > +		pair->value = hwprobe_vec_misaligned(cpus);
> > > > +		break;
> > > > +
> > > >   	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
> > > >   		pair->value = 0;
> > > >   		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> > > > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> > > > index 8fadbe00dd62..6f0264a8c9de 100644
> > > > --- a/arch/riscv/kernel/traps_misaligned.c
> > > > +++ b/arch/riscv/kernel/traps_misaligned.c
> > > > @@ -16,6 +16,7 @@
> > > >   #include <asm/entry-common.h>
> > > >   #include <asm/hwprobe.h>
> > > >   #include <asm/cpufeature.h>
> > > > +#include <asm/vector.h>
> > > >   #define INSN_MATCH_LB			0x3
> > > >   #define INSN_MASK_LB			0x707f
> > > > @@ -322,12 +323,37 @@ union reg_data {
> > > >   	u64 data_u64;
> > > >   };
> > > > -static bool unaligned_ctl __read_mostly;
> > > > -
> > > >   /* sysctl hooks */
> > > >   int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
> > > > -int handle_misaligned_load(struct pt_regs *regs)
> > > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > > > +{
> > > > +	unsigned long epc = regs->epc;
> > > > +	unsigned long insn;
> > > > +
> > > > +	if (get_insn(regs, epc, &insn))
> > > > +		return -1;
> > > > +
> > > > +	/* Only return 0 when in check_vector_unaligned_access_emulated */
> > > > +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> > > > +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > > +		regs->epc = epc + INSN_LEN(insn);
> > > > +		return 0;
> > > > +	}
> > > > +
> > > > +	/* If vector instruction we don't emulate it yet */
> > > > +	regs->epc = epc;
> > > > +	return -1;
> > > > +}
> > > > +#else
> > > > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > > > +{
> > > > +	return -1;
> > > > +}
> > > > +#endif
> > > > +
> > > > +static int handle_scalar_misaligned_load(struct pt_regs *regs)
> > > >   {
> > > >   	union reg_data val;
> > > >   	unsigned long epc = regs->epc;
> > > > @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
> > > >   	return 0;
> > > >   }
> > > > -int handle_misaligned_store(struct pt_regs *regs)
> > > > +static int handle_scalar_misaligned_store(struct pt_regs *regs)
> > > >   {
> > > >   	union reg_data val;
> > > >   	unsigned long epc = regs->epc;
> > > > @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
> > > >   	return 0;
> > > >   }
> > > > +int handle_misaligned_load(struct pt_regs *regs)
> > > > +{
> > > > +	unsigned long epc = regs->epc;
> > > > +	unsigned long insn;
> > > > +
> > > > +	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> > > > +		if (get_insn(regs, epc, &insn))
> > > > +			return -1;
> > > > +
> > > > +		if (insn_is_vector(insn))
> > > > +			return handle_vector_misaligned_load(regs);
> > > > +	}
> > > > +
> > > > +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > > > +		return handle_scalar_misaligned_load(regs);
> > > > +
> > > > +	return -1;
> > > > +}
> > > > +
> > > > +int handle_misaligned_store(struct pt_regs *regs)
> > > > +{
> > > > +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > > > +		return handle_scalar_misaligned_store(regs);
> > > > +
> > > > +	return -1;
> > > > +}
> > > > +
> > > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > > +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> > > > +{
> > > > +	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> > > > +	unsigned long tmp_var;
> > > > +
> > > > +	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > > +
> > > > +	local_irq_enable();
> @Evan Green I forgot to remove this from when I was using
> smp_call_on_cpu and encountered the problem you described.
> They can be removed.
> 
> > > > +	kernel_vector_begin();
> > > > +	__asm__ __volatile__ (
> > > > +		".balign 4\n\t"
> > > > +		".option push\n\t"
> > > > +		".option arch, +zve32x\n\t"
> > > > +		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
> > > > +		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
> > > > +		".option pop\n\t"
> > > > +		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
> > > > +	kernel_vector_end();
> 	if (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
> 		*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
> 
> > > > +}
> > > > +
> > > > +bool check_vector_unaligned_access_emulated_all_cpus(void)
> > 
> > Hopefully I catch the final things I want to say in this email ;)
> > 
> > > > +{
> > > > +	int cpu;
> > > > +	bool ret = true;
> > > > +
> > > > +	if (!has_vector()) {
> > > > +		for_each_online_cpu(cpu)
> > > > +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > > +		return false;
> > > > +	}
> > > > +
> > > > +	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
> > > > +
> > > > +	for_each_online_cpu(cpu)
> > > > +		if (per_cpu(vector_misaligned_access, cpu)
> > > > +		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
> > > > +			return false;
> > 
> > The default value of vector_misaligned_access is
> > RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED so when the hardware supports
> > unaligned accesses this will return false. If the hardware doesn't
> > support unaligned accesses, then the trap will happen and the kernel
> > will set this variable to UNSUPPORTED, causing this function to again
> > return false.
> > 
> > Having the default value be UNKNOWN and checking for UNKNOWN here and in
> > check_vector_unaligned_access() can remedy this issue.
> 
> I meant to set it to SLOW in check_vector_unaligned_access_emulated like
> above.

What "it" are you referring to? UNKNOWN should be the default internally
here, not SLOW. Before probing is done, the speed is unknown, so UNKNOWN
is the logical default.

> 
> > - Charlie
> > 
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +#else
> > > > +bool check_vector_unaligned_access_emulated_all_cpus(void)
> > > > +{
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > +
> > > > +#ifdef CONFIG_RISCV_MISALIGNED
> > > > +
> > > > +static bool unaligned_ctl __read_mostly;
> > > > +
> > > >   static void check_unaligned_access_emulated(struct work_struct *unused)
> > > >   {
> > > >   	int cpu = smp_processor_id();
> > > > @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
> > > >   {
> > > >   	return unaligned_ctl;
> > > >   }
> > > > +#else
> > > > +bool check_unaligned_access_emulated_all_cpus(void)
> > > > +{
> > > > +	return false;
> > > > +}
> > > > +#endif
> > > > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> > > > index 70c1588fc353..c6106bd4a25a 100644
> > > > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > > > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > > > @@ -19,7 +19,8 @@
> > > >   #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> > > >   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> > > > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > > > +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> > > > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > >   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> > > >   static cpumask_t fast_misaligned_access;
> > > > @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
> > > >   	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
> > > >   		for_each_online_cpu(cpu) {
> > > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > > +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> > > > +#endif
> > > > +#ifdef CONFIG_RISCV_MISALIGNED
> > > >   			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> > > > +#endif
> > > >   		}
> > > >   		return 0;
> > > 
> > > Since this function returns 0 in both cases, can you wrap the rest of
> > > the function with an else and remove this early return?
> 
> I think it's more readable in a guard clause style.

By guard clause style are you referring to how it is right now? It's the
same return value of 0 in both cases and the most common way of doing
that is by having a single line for the return at the bottom of the
function instead of duplicating the return.
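
Something like this rough sketch is what I have in mind (hypothetical
and untested, also folding in the IS_ENABLED() conversion):

	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
		for_each_online_cpu(cpu) {
			if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED))
				per_cpu(vector_misaligned_access, cpu) =
					RISCV_HWPROBE_VEC_MISALIGNED_FAST;
			if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
				per_cpu(misaligned_access_speed, cpu) =
					RISCV_HWPROBE_MISALIGNED_FAST;
		}
	} else {
		all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
		check_vector_unaligned_access_emulated_all_cpus();
		/* the rest of the probing moves in here as well */
	}

	return 0;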

- Charlie

> 
> 
> Thanks,
> Jesse Taube
> > > - Charlie
> > > 
> > > >   	}
> > > >   	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> > > > +	check_vector_unaligned_access_emulated_all_cpus();
> > > >   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> > > >   	if (!all_cpus_emulated)
> > > > diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
> > > > index 682b3feee451..821818886fab 100644
> > > > --- a/arch/riscv/kernel/vector.c
> > > > +++ b/arch/riscv/kernel/vector.c
> > > > @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
> > > >   #endif
> > > >   }
> > > > -static bool insn_is_vector(u32 insn_buf)
> > > > +bool insn_is_vector(u32 insn_buf)
> > > >   {
> > > >   	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
> > > >   	u32 width, csr;
> > > > -- 
> > > > 2.43.0
> > > >
Jesse Taube June 20, 2024, 11:08 p.m. UTC | #11
On 6/20/24 18:14, Charlie Jenkins wrote:
> On Thu, Jun 20, 2024 at 05:31:28PM -0400, Jesse Taube wrote:
>>
>>
>> On 6/17/24 22:09, Charlie Jenkins wrote:
>>> On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
>>>> On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
>>>>> Run a unaligned vector access to test if the system supports
>>>>> vector unaligned access. Add the result to a new key in hwprobe.
>>>>> This is useful for usermode to know if vector misaligned accesses are
>>>>> supported and if they are faster or slower than equivalent byte accesses.
>>>>>
>>>>> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
>>>>> ---
>>>>> V1 -> V2:
>>>>>    - Add Kconfig options
>>>>>    - Add insn_is_vector
>>>>>    - Add handle_vector_misaligned_load
>>>>>    - Fix build
>>>>>    - Seperate vector from scalar misaligned access
>>>>>    - This patch was almost completely rewritten
>>>>> ---
>>>>>    arch/riscv/Kconfig                         |  41 +++++++
>>>>>    arch/riscv/include/asm/cpufeature.h        |   7 +-
>>>>>    arch/riscv/include/asm/entry-common.h      |  11 --
>>>>>    arch/riscv/include/asm/hwprobe.h           |   2 +-
>>>>>    arch/riscv/include/asm/vector.h            |   1 +
>>>>>    arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
>>>>>    arch/riscv/kernel/Makefile                 |   4 +-
>>>>>    arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
>>>>>    arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
>>>>>    arch/riscv/kernel/unaligned_access_speed.c |   9 +-
>>>>>    arch/riscv/kernel/vector.c                 |   2 +-
>>>>>    11 files changed, 221 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>>>> index b94176e25be1..f12df0ca6c18 100644
>>>>> --- a/arch/riscv/Kconfig
>>>>> +++ b/arch/riscv/Kconfig
>>>>> @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
>>>>>    	help
>>>>>    	  Embed support for emulating misaligned loads and stores.
>>>>> +config RISCV_VECTOR_MISALIGNED
>>>>> +	bool
>>>>> +	depends on RISCV_ISA_V
>>>>> +	help
>>>>> +	  Enable detecting support for vector misaligned loads and stores.
>>>>> +
>>>>>    choice
>>>>>    	prompt "Unaligned Accesses Support"
>>>>>    	default RISCV_PROBE_UNALIGNED_ACCESS
>>>>> @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
>>>>>    endchoice
>>>>> +choice
>>>>> +	prompt "Vector unaligned Accesses Support"
>>>>> +	depends on RISCV_ISA_V
>>>>> +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>>>>> +	help
>>>>> +	  This determines the level of support for vector unaligned accesses. This
>>>>> +	  information is used by the kernel to perform optimizations. It is also
>>>>> +	  exposed to user space via the hwprobe syscall. The hardware will be
>>>>> +	  probed at boot by default.
>>>>> +
>>>>> +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
>>>>
>>>> This is not used anywhere, what is the reason for including it?
>>
>> This is so that we can check if they are supported or not, but not check the
>> speed of them. Similar to RISCV_EMULATED_UNALIGNED_ACCESS.
> 
> What do you mean? It isn't used anywhere so this "check if they are
> supported or not" is not guarded by this config.

It is not used anywhere because it just needs to enable 
RISCV_VECTOR_MISALIGNED and nothing else.

> 
>>
>>>>
>>>>> +	bool "Detect support for vector unaligned accesses"
>>>>> +	select RISCV_VECTOR_MISALIGNED
>>>>> +	help
>>>>> +	  During boot, the kernel will detect if the system supports vector
>>>>> +	  unaligned accesses.
>>>>> +
>>>>> +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>>>>> +	bool "Probe speed of vector unaligned accesses"
>>>>> +	select RISCV_VECTOR_MISALIGNED
>>>>> +	help
>>>>> +	  During boot, the kernel will run a series of tests to determine the
>>>>> +	  speed of vector unaligned accesses if they are supported. This probing
>>>>> +	  will dynamically determine the speed of vector unaligned accesses on
>>>>> +	  the underlying system if they are supported.
>>>>> +
>>>>> +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
>>>>
>>>> This should not be prefixed with CONFIG and does not include VECTOR in
>>>> the name.
>>
>> Huh thought it would warn fixed though
> 
> What do you mean by "warn fixed"?

Huh, thought it would warn. Fixed though.

As in, I fixed it in V3.

> 
>>
>>> I assume you meant to put
>>>> "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
>>
>> This is to leave a faster path like SLOW or FAST to say that unaligned
>> accesses aren't supported.
> 
> I am not sure what you are responding to. This comment seems to be
> responding to my correction of
> CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED->RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
> so I don't see how that ties into SLOW/FAST.

Sorry for the confusion, it was meant to respond to the comment below...

> 
>>
>>>>
>>>> This was also intentionally left out on the scalar side [1]. The
>>>> implication here is that having this config will cause people to compile
>>>> kernels without unaligned access support which really shouldn't be
>>>> something we are explicitly supporting.
>>>>
>>>> If somebody does want to support hardware that does not handle vector
>>>> unaligned accesses, the solution should be to add emulation support to
>>>> the kernel.
>>
>> Yes, but we don't have emulation support yet, so I do think it's a good idea.
> 
> I am hesitant because it is very likely that somebody will add support
> for unaligned vector emulation. When there is emulation support, this
> config option should not exist to be consistent with scalar. However if
> we add this option in now, we must expect a user to enable this config,
> and then we will have to get rid of it later. Users are not always happy
> when config options are removed.

I will remove it then.

> 
>>
>>>>
>>>> Link: https://lore.kernel.org/all/Zd4y5llkvTfKHf6b@ghost/ [1]
>>>>
>>>> - Charlie
>>>>
>>>>> +	bool "Assume the system does not support vector unaligned memory accesses"
>>>>> +	help
>>>>> +	  Assume that the system does not support vector unaligned memory accesses.
>>>>> +	  The kernel and userspace programs may run them successfully on systems
>>>>> +	  that do support vector unaligned memory accesses.
>>>>> +
>>>>> +endchoice
>>>>> +
>>>>>    endmenu # "Platform type"
>>>>>    menu "Kernel features"
>>>>> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
>>>>> index 347805446151..d0ea5921ab20 100644
>>>>> --- a/arch/riscv/include/asm/cpufeature.h
>>>>> +++ b/arch/riscv/include/asm/cpufeature.h
>>>>> @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
>>>>>    void riscv_user_isa_enable(void);
>>>>> -#if defined(CONFIG_RISCV_MISALIGNED)
>>>>>    bool check_unaligned_access_emulated_all_cpus(void);
>>>>> +#if defined(CONFIG_RISCV_MISALIGNED)
>>>>>    void unaligned_emulation_finish(void);
>>>>>    bool unaligned_ctl_available(void);
>>>>>    DECLARE_PER_CPU(long, misaligned_access_speed);
>>>>> @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
>>>>>    }
>>>>>    #endif
>>>>> +bool check_vector_unaligned_access_emulated_all_cpus(void);
>>>>> +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
>>>>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>>>> +#endif
>>>>> +
>>>>>    #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>>>>>    DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
>>>>> diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
>>>>> index 2293e535f865..7b32d2b08bb6 100644
>>>>> --- a/arch/riscv/include/asm/entry-common.h
>>>>> +++ b/arch/riscv/include/asm/entry-common.h
>>>>> @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
>>>>>    void handle_page_fault(struct pt_regs *regs);
>>>>>    void handle_break(struct pt_regs *regs);
>>>>> -#ifdef CONFIG_RISCV_MISALIGNED
>>>>>    int handle_misaligned_load(struct pt_regs *regs);
>>>>>    int handle_misaligned_store(struct pt_regs *regs);
>>>>> -#else
>>>>> -static inline int handle_misaligned_load(struct pt_regs *regs)
>>>>> -{
>>>>> -	return -1;
>>>>> -}
>>>>> -static inline int handle_misaligned_store(struct pt_regs *regs)
>>>>> -{
>>>>> -	return -1;
>>>>> -}
>>>>> -#endif
>>>>>    #endif /* _ASM_RISCV_ENTRY_COMMON_H */
>>>>> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
>>>>> index 150a9877b0af..ef01c182af2b 100644
>>>>> --- a/arch/riscv/include/asm/hwprobe.h
>>>>> +++ b/arch/riscv/include/asm/hwprobe.h
>>>>> @@ -8,7 +8,7 @@
>>>>>    #include <uapi/asm/hwprobe.h>
>>>>> -#define RISCV_HWPROBE_MAX_KEY 7
>>>>> +#define RISCV_HWPROBE_MAX_KEY 8
>>>>>    static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>>>>    {
>>>>> diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
>>>>> index be7d309cca8a..99b0f91db9ee 100644
>>>>> --- a/arch/riscv/include/asm/vector.h
>>>>> +++ b/arch/riscv/include/asm/vector.h
>>>>> @@ -21,6 +21,7 @@
>>>>>    extern unsigned long riscv_v_vsize;
>>>>>    int riscv_v_setup_vsize(void);
>>>>> +bool insn_is_vector(u32 insn_buf);
>>>>>    bool riscv_v_first_use_handler(struct pt_regs *regs);
>>>>>    void kernel_vector_begin(void);
>>>>>    void kernel_vector_end(void);
>>>>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
>>>>> index 023b7771d1b7..2fee870e41bb 100644
>>>>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>>>>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>>>>> @@ -75,6 +75,11 @@ struct riscv_hwprobe {
>>>>>    #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
>>>>>    #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
>>>>>    #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
>>>>> +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
>>>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
>>>>
>>>> I appreciate you leaving the key for EMULATED open!
>>>>
>>>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
>>>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
>>>>> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
>>>>>    /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>>>>    /* Flags */
>>>>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>>>>> index 5b243d46f4b1..62ac19c029f1 100644
>>>>> --- a/arch/riscv/kernel/Makefile
>>>>> +++ b/arch/riscv/kernel/Makefile
>>>>> @@ -62,8 +62,8 @@ obj-y	+= probes/
>>>>>    obj-y	+= tests/
>>>>>    obj-$(CONFIG_MMU) += vdso.o vdso/
>>>>> -obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
>>>>> -obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
>>>>> +obj-y	+= traps_misaligned.o
>>>>> +obj-y	+= unaligned_access_speed.o
>>
>>> These files only need to be compiled if either CONFIG_RISCV_MISALIGNED
>>> or CONFIG_RISCV_VECTOR_MISALIGNED is selected. Can you refactor this
>>> code to replace CONFIG_RISCV_MISALIGNED with
>>> CONFIG_RISCV_SCALAR_MISALIGNED and then have
>>> CONFIG_RISCV_SCALAR_MISALIGNED and CONFIG_RISCV_VECTOR_MISALIGNED
>>> select CONFIG_RISCV_MISALIGNED in the Kconfig?
>>
>> Fixed!
>>
>>>>>    obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
>>>>>    obj-$(CONFIG_FPU)		+= fpu.o
>>>>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>>>>> index e910e2971984..c40df314058b 100644
>>>>> --- a/arch/riscv/kernel/sys_hwprobe.c
>>>>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>>>>> @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>>>>>    }
>>>>>    #endif
>>>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>>>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>>>>> +{
>>>>> +	int cpu;
>>>>> +	u64 perf = -1ULL;
>>>>> +
>>>>> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
>>>>> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>>> +
>>>>> +	/* Return if supported or not even if speed wasn't probed */
>>>>> +	for_each_cpu(cpu, cpus) {
>>>>> +		int this_perf = per_cpu(vector_misaligned_access, cpu);
>>>>> +
>>>>> +		if (perf == -1ULL)
>>>>> +			perf = this_perf;
>>>>> +
>>>>> +		if (perf != this_perf) {
>>>>> +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>>>> +			break;
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	if (perf == -1ULL)
>>>>> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>>>> +
>>>>> +	return perf;
>>>>> +}
>>>>> +#else
>>>>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>>>>> +{
>>>>> +	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
>>>>> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>>> +
>>>>> +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>>    static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>>>>    			     const struct cpumask *cpus)
>>>>>    {
>>>>> @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>>>>    		pair->value = hwprobe_misaligned(cpus);
>>>>>    		break;
>>>>> +	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
>>>>> +		pair->value = hwprobe_vec_misaligned(cpus);
>>>>> +		break;
>>>>> +
>>>>>    	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>>>>    		pair->value = 0;
>>>>>    		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>>>>> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
>>>>> index 8fadbe00dd62..6f0264a8c9de 100644
>>>>> --- a/arch/riscv/kernel/traps_misaligned.c
>>>>> +++ b/arch/riscv/kernel/traps_misaligned.c
>>>>> @@ -16,6 +16,7 @@
>>>>>    #include <asm/entry-common.h>
>>>>>    #include <asm/hwprobe.h>
>>>>>    #include <asm/cpufeature.h>
>>>>> +#include <asm/vector.h>
>>>>>    #define INSN_MATCH_LB			0x3
>>>>>    #define INSN_MASK_LB			0x707f
>>>>> @@ -322,12 +323,37 @@ union reg_data {
>>>>>    	u64 data_u64;
>>>>>    };
>>>>> -static bool unaligned_ctl __read_mostly;
>>>>> -
>>>>>    /* sysctl hooks */
>>>>>    int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
>>>>> -int handle_misaligned_load(struct pt_regs *regs)
>>>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>>>> +static int handle_vector_misaligned_load(struct pt_regs *regs)
>>>>> +{
>>>>> +	unsigned long epc = regs->epc;
>>>>> +	unsigned long insn;
>>>>> +
>>>>> +	if (get_insn(regs, epc, &insn))
>>>>> +		return -1;
>>>>> +
>>>>> +	/* Only return 0 when in check_vector_unaligned_access_emulated */
>>>>> +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
>>>>> +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>>> +		regs->epc = epc + INSN_LEN(insn);
>>>>> +		return 0;
>>>>> +	}
>>>>> +
>>>>> +	/* If vector instruction we don't emulate it yet */
>>>>> +	regs->epc = epc;
>>>>> +	return -1;
>>>>> +}
>>>>> +#else
>>>>> +static int handle_vector_misaligned_load(struct pt_regs *regs)
>>>>> +{
>>>>> +	return -1;
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>> +static int handle_scalar_misaligned_load(struct pt_regs *regs)
>>>>>    {
>>>>>    	union reg_data val;
>>>>>    	unsigned long epc = regs->epc;
>>>>> @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
>>>>>    	return 0;
>>>>>    }
>>>>> -int handle_misaligned_store(struct pt_regs *regs)
>>>>> +static int handle_scalar_misaligned_store(struct pt_regs *regs)
>>>>>    {
>>>>>    	union reg_data val;
>>>>>    	unsigned long epc = regs->epc;
>>>>> @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
>>>>>    	return 0;
>>>>>    }
>>>>> +int handle_misaligned_load(struct pt_regs *regs)
>>>>> +{
>>>>> +	unsigned long epc = regs->epc;
>>>>> +	unsigned long insn;
>>>>> +
>>>>> +	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
>>>>> +		if (get_insn(regs, epc, &insn))
>>>>> +			return -1;
>>>>> +
>>>>> +		if (insn_is_vector(insn))
>>>>> +			return handle_vector_misaligned_load(regs);
>>>>> +	}
>>>>> +
>>>>> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
>>>>> +		return handle_scalar_misaligned_load(regs);
>>>>> +
>>>>> +	return -1;
>>>>> +}
>>>>> +
>>>>> +int handle_misaligned_store(struct pt_regs *regs)
>>>>> +{
>>>>> +	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
>>>>> +		return handle_scalar_misaligned_store(regs);
>>>>> +
>>>>> +	return -1;
>>>>> +}
>>>>> +
>>>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>>>> +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
>>>>> +{
>>>>> +	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
>>>>> +	unsigned long tmp_var;
>>>>> +
>>>>> +	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>>>> +
>>>>> +	local_irq_enable();
>> @Evan Green I forgot to remove this from when I was using
>> smp_call_on_cpu and encountered the problem you described.
>> They can be removed.
>>
>>>>> +	kernel_vector_begin();
>>>>> +	__asm__ __volatile__ (
>>>>> +		".balign 4\n\t"
>>>>> +		".option push\n\t"
>>>>> +		".option arch, +zve32x\n\t"
>>>>> +		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
>>>>> +		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
>>>>> +		".option pop\n\t"
>>>>> +		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
>>>>> +	kernel_vector_end();
>> 	if (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
>> 		*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
>>
>>>>> +}
>>>>> +
>>>>> +bool check_vector_unaligned_access_emulated_all_cpus(void)
>>>
>>> Hopefully I catch the final things I want to say in this email ;)
>>>
>>>>> +{
>>>>> +	int cpu;
>>>>> +	bool ret = true;
>>>>> +
>>>>> +	if (!has_vector()) {
>>>>> +		for_each_online_cpu(cpu)
>>>>> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>>> +		return false;
>>>>> +	}
>>>>> +
>>>>> +	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
>>>>> +
>>>>> +	for_each_online_cpu(cpu)
>>>>> +		if (per_cpu(vector_misaligned_access, cpu)
>>>>> +		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
>>>>> +			return false;
>>>
>>> The default value of vector_misaligned_access is
>>> RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED so when the hardware supports
>>> unaligned accesses this will return false. If the hardware doesn't
>>> support unaligned accesses, then the trap will happen and the kernel
>>> will set this variable to UNSUPPORTED, causing this function to again
>>> return false.
>>>
>>> Having the default value be UNKNOWN and checking for UNKNOWN here and in
>>> check_vector_unaligned_access() can remedy this issue.
>>
>> I meant to set it to SLOW in check_vector_unaligned_access_emulated like
>> above.
> 
> What "it" are you referring to?

`per_cpu(vector_misaligned_access, cpu)`

> UNKNOWN should be the default internally
> here, not SLOW. Before probing is done, the speed is unknown, so UNKNOWN
> is the logical default.

The scalar hwprobe returns SLOW if misaligned accesses are not emulated.
Rather than storing UNKNOWN in the kernel, I thought it would be better
to set it to SLOW until we know it's fast, as that has the same effect
as swapping UNKNOWN for SLOW in sys_hwprobe. The reason I want to keep
the code like this is that if misaligned vector loads work and then
suddenly don't, we won't ignore them in handle_vector_misaligned_load,
because the instruction should have worked. I can change it to UNKNOWN
though and swap UNKNOWN for SLOW in hwprobe.
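
In full, a minimal sketch of what I meant (assuming V3 keeps the same
names; only the SLOW fallback at the end is new):

	static void check_vector_unaligned_access_emulated(struct work_struct *unused)
	{
		long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
		unsigned long tmp_var;

		*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;

		kernel_vector_begin();
		/* the misaligned vle16.v probe of tmp_var from the patch runs here */
		kernel_vector_end();

		/*
		 * The trap handler flips UNKNOWN to UNSUPPORTED, so if the
		 * value is still UNKNOWN the access worked; record SLOW so
		 * that later traps aren't mistaken for probe traps.
		 */
		if (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
			*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
	}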

Thanks,
Jesse Taube
> 
>>
>>> - Charlie
>>>
>>>>> +
>>>>> +	return ret;
>>>>> +}
>>>>> +#else
>>>>> +bool check_vector_unaligned_access_emulated_all_cpus(void)
>>>>> +{
>>>>> +	return false;
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>> +#ifdef CONFIG_RISCV_MISALIGNED
>>>>> +
>>>>> +static bool unaligned_ctl __read_mostly;
>>>>> +
>>>>>    static void check_unaligned_access_emulated(struct work_struct *unused)
>>>>>    {
>>>>>    	int cpu = smp_processor_id();
>>>>> @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
>>>>>    {
>>>>>    	return unaligned_ctl;
>>>>>    }
>>>>> +#else
>>>>> +bool check_unaligned_access_emulated_all_cpus(void)
>>>>> +{
>>>>> +	return false;
>>>>> +}
>>>>> +#endif
>>>>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>>>>> index 70c1588fc353..c6106bd4a25a 100644
>>>>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>>>>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>>>>> @@ -19,7 +19,8 @@
>>>>>    #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
>>>>>    #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>>>> -DEFINE_PER_CPU(long, misaligned_access_speed);
>>>>> +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
>>>>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>>>>>    #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>>>>    static cpumask_t fast_misaligned_access;
>>>>> @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
>>>>>    	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>>>>>    		for_each_online_cpu(cpu) {
>>>>> +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
>>>>> +			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>>>>> +#endif
>>>>> +#ifdef CONFIG_RISCV_MISALIGNED
>>>>>    			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
>>>>> +#endif
>>>>>    		}
>>>>>    		return 0;
>>>>
>>>> Since this function returns 0 in both cases, can you wrap the rest of
>>>> the function with an else and remove this early return?
>>
>> I think it's more readable in a guard clause style.
> 
> By guard clause style are you referring to how it is right now? 

Yes.

> It's the
> same return value of 0 in both cases and the most common way of doing
> that is by having a single line for the return at the bottom of the
> function instead of duplicating the return.
> 
> - Charlie
> 
>>
>>
>> Thanks,
>> Jesse Taube
>>>> - Charlie
>>>>
>>>>>    	}
>>>>>    	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>>>>> +	check_vector_unaligned_access_emulated_all_cpus();
>>>>>    #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>>>>    	if (!all_cpus_emulated)
>>>>> diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
>>>>> index 682b3feee451..821818886fab 100644
>>>>> --- a/arch/riscv/kernel/vector.c
>>>>> +++ b/arch/riscv/kernel/vector.c
>>>>> @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
>>>>>    #endif
>>>>>    }
>>>>> -static bool insn_is_vector(u32 insn_buf)
>>>>> +bool insn_is_vector(u32 insn_buf)
>>>>>    {
>>>>>    	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
>>>>>    	u32 width, csr;
>>>>> -- 
>>>>> 2.43.0
>>>>>
Conor Dooley June 21, 2024, 10:06 a.m. UTC | #12
On Thu, Jun 20, 2024 at 03:14:14PM -0700, Charlie Jenkins wrote:
> On Thu, Jun 20, 2024 at 05:31:28PM -0400, Jesse Taube wrote:
> > 
> > 
> > On 6/17/24 22:09, Charlie Jenkins wrote:
> > > On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
> > > > On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > > > > Run a unaligned vector access to test if the system supports
> > > > > vector unaligned access. Add the result to a new key in hwprobe.
> > > > > This is useful for usermode to know if vector misaligned accesses are
> > > > > supported and if they are faster or slower than equivalent byte accesses.
> > > > > 
> > > > > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > > > > ---
> > > > > V1 -> V2:
> > > > >   - Add Kconfig options
> > > > >   - Add insn_is_vector
> > > > >   - Add handle_vector_misaligned_load
> > > > >   - Fix build
> > > > >   - Seperate vector from scalar misaligned access
> > > > >   - This patch was almost completely rewritten
> > > > > ---
> > > > >   arch/riscv/Kconfig                         |  41 +++++++
> > > > >   arch/riscv/include/asm/cpufeature.h        |   7 +-
> > > > >   arch/riscv/include/asm/entry-common.h      |  11 --
> > > > >   arch/riscv/include/asm/hwprobe.h           |   2 +-
> > > > >   arch/riscv/include/asm/vector.h            |   1 +
> > > > >   arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> > > > >   arch/riscv/kernel/Makefile                 |   4 +-
> > > > >   arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> > > > >   arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> > > > >   arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> > > > >   arch/riscv/kernel/vector.c                 |   2 +-
> > > > >   11 files changed, 221 insertions(+), 21 deletions(-)
> > > > > 
> > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > index b94176e25be1..f12df0ca6c18 100644
> > > > > --- a/arch/riscv/Kconfig
> > > > > +++ b/arch/riscv/Kconfig
> > > > > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> > > > >   	help
> > > > >   	  Embed support for emulating misaligned loads and stores.
> > > > > +config RISCV_VECTOR_MISALIGNED
> > > > > +	bool
> > > > > +	depends on RISCV_ISA_V
> > > > > +	help
> > > > > +	  Enable detecting support for vector misaligned loads and stores.
> > > > > +
> > > > >   choice
> > > > >   	prompt "Unaligned Accesses Support"
> > > > >   	default RISCV_PROBE_UNALIGNED_ACCESS
> > > > > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> > > > >   endchoice
> > > > > +choice
> > > > > +	prompt "Vector unaligned Accesses Support"
> > > > > +	depends on RISCV_ISA_V
> > > > > +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > > +	help
> > > > > +	  This determines the level of support for vector unaligned accesses. This
> > > > > +	  information is used by the kernel to perform optimizations.

I haven't actually checked the patchset, but is it actually used by the
kernel to perform optimisations?

> > > > > It is also
> > > > > +	  exposed to user space via the hwprobe syscall. The hardware will be
> > > > > +	  probed at boot by default.
> > > > > +
> > > > > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> > > > 
> > > > This is not used anywhere, what is the reason for including it?
> > 
> > This is so that we can check if they are supported or not, but not check the
> > speed of them. Similar to RISCV_EMULATED_UNALIGNED_ACCESS.
> 
> What do you mean? It isn't used anywhere so this "check if they are
> supported or not" is not guarded by this config.
> 
> > 
> > > > 
> > > > > +	bool "Detect support for vector unaligned accesses"
> > > > > +	select RISCV_VECTOR_MISALIGNED
> > > > > +	help
> > > > > +	  During boot, the kernel will detect if the system supports vector
> > > > > +	  unaligned accesses.
> > > > > +
> > > > > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > > +	bool "Probe speed of vector unaligned accesses"
> > > > > +	select RISCV_VECTOR_MISALIGNED
> > > > > +	help
> > > > > +	  During boot, the kernel will run a series of tests to determine the
> > > > > +	  speed of vector unaligned accesses if they are supported. This probing
> > > > > +	  will dynamically determine the speed of vector unaligned accesses on
> > > > > +	  the underlying system if they are supported.
> > > > > +
> > > > > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> > > > 
> > > > This should not be prefixed with CONFIG and does not include VECTOR in
> > > > the name.
> > 
> > Huh thought it would warn fixed though
> 
> What do you mean by "warn fixed"?
> 
> > 
> > > I assume you meant to put
> > > > "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
> > 
> > This is to leave a faster path like SLOW or FAST to say that unaligned
> > accesses aren't supported.
> 
> I am not sure what you are responding to. This comment seems to be
> responding to my correction of
> CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED->RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
> so I don't see how that ties into SLOW/FAST.
> 
> > 
> > > > 
> > > > This was also intentionally left out on the scalar side [1]. The
> > > > implication here is that having this config will cause people to compile
> > > > kernels without unaligned access support which really shouldn't be
> > > > something we are explicitly supporting.
> > > > 
> > > > If somebody does want to support hardware that does not handle vector
> > > > unaligned accesses, the solution should be to add emulation support to
> > > > the kernel.
> > 
> > Yes, but we don't have emulation support yet, so I do think it's a good idea.
> 
> I am hesitant because it is very likely that somebody will add support
> for unaligned vector emulation. When there is emulation support, this
> config option should not exist to be consistent with scalar. However if
> we add this option in now, we must expect a user to enable this config,
> and then 

I dunno, I think there could be value in having the option here. For
scalar, we couldn't have an option that would break the uABI, so the
unsupported option wasn't okay. That's not a constraint that we have for
vector.

For vector, if you have a system that doesn't support misaligned access,
you probably don't want to emulate the accesses either, since that's
likely to remove any performance gains you get from using vector in the
first place, so I can see benefit in the option.
Enabling the probing is going to end up with the same outcome for userspace
as having this option on such a system, so it comes down to whether you
want to allow people to skip the probing if they know their system has
this problem.

> we will have to get rid of it later. Users are not always happy
> when config options are removed.

I dunno, I don't think that adding emulation requires that we remove
this unsupported option.

Additionally, what are we doing in the kernel if we detect that
misaligned stuff isn't supported? Are we going to mandate that kernel
code is aligned only, disable in-kernel vector, or use some other
mechanism to make sure that things like crypto code don't have/introduce
code that won't run on these systems?

Cheers,
Conor.
Charlie Jenkins June 21, 2024, 5:18 p.m. UTC | #13
On Fri, Jun 21, 2024 at 11:06:31AM +0100, Conor Dooley wrote:
> On Thu, Jun 20, 2024 at 03:14:14PM -0700, Charlie Jenkins wrote:
> > On Thu, Jun 20, 2024 at 05:31:28PM -0400, Jesse Taube wrote:
> > > 
> > > 
> > > On 6/17/24 22:09, Charlie Jenkins wrote:
> > > > On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
> > > > > On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > > > > > Run a unaligned vector access to test if the system supports
> > > > > > vector unaligned access. Add the result to a new key in hwprobe.
> > > > > > This is useful for usermode to know if vector misaligned accesses are
> > > > > > supported and if they are faster or slower than equivalent byte accesses.
> > > > > > 
> > > > > > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > > > > > ---
> > > > > > V1 -> V2:
> > > > > >   - Add Kconfig options
> > > > > >   - Add insn_is_vector
> > > > > >   - Add handle_vector_misaligned_load
> > > > > >   - Fix build
> > > > > >   - Seperate vector from scalar misaligned access
> > > > > >   - This patch was almost completely rewritten
> > > > > > ---
> > > > > >   arch/riscv/Kconfig                         |  41 +++++++
> > > > > >   arch/riscv/include/asm/cpufeature.h        |   7 +-
> > > > > >   arch/riscv/include/asm/entry-common.h      |  11 --
> > > > > >   arch/riscv/include/asm/hwprobe.h           |   2 +-
> > > > > >   arch/riscv/include/asm/vector.h            |   1 +
> > > > > >   arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> > > > > >   arch/riscv/kernel/Makefile                 |   4 +-
> > > > > >   arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> > > > > >   arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> > > > > >   arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> > > > > >   arch/riscv/kernel/vector.c                 |   2 +-
> > > > > >   11 files changed, 221 insertions(+), 21 deletions(-)
> > > > > > 
> > > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > > index b94176e25be1..f12df0ca6c18 100644
> > > > > > --- a/arch/riscv/Kconfig
> > > > > > +++ b/arch/riscv/Kconfig
> > > > > > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> > > > > >   	help
> > > > > >   	  Embed support for emulating misaligned loads and stores.
> > > > > > +config RISCV_VECTOR_MISALIGNED
> > > > > > +	bool
> > > > > > +	depends on RISCV_ISA_V
> > > > > > +	help
> > > > > > +	  Enable detecting support for vector misaligned loads and stores.
> > > > > > +
> > > > > >   choice
> > > > > >   	prompt "Unaligned Accesses Support"
> > > > > >   	default RISCV_PROBE_UNALIGNED_ACCESS
> > > > > > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> > > > > >   endchoice
> > > > > > +choice
> > > > > > +	prompt "Vector unaligned Accesses Support"
> > > > > > +	depends on RISCV_ISA_V
> > > > > > +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > > > +	help
> > > > > > +	  This determines the level of support for vector unaligned accesses. This
> > > > > > +	  information is used by the kernel to perform optimizations.
> 
> I haven't actually checked the patchset, but is it actually used by the
> kernel to perform optimisations?

No ;)

Right now this patch is just providing the information to userspace
through hwprobe and doesn't optimize anything in the kernel.
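
For reference, userspace would query the new key along these lines (a
rough sketch, assuming the uapi constants from this patch land as-is):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/hwprobe.h>
	#include <asm/unistd.h>

	int main(void)
	{
		struct riscv_hwprobe pair = {
			.key = RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF,
		};

		/* args: pairs, pair_count, cpusetsize, cpus, flags */
		if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
			return 1;

		/* value is one of UNKNOWN/SLOW/FAST/UNSUPPORTED */
		printf("vector misaligned perf: %llu\n",
		       (unsigned long long)pair.value);
		return 0;
	}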

> 
> > > > > > It is also
> > > > > > +	  exposed to user space via the hwprobe syscall. The hardware will be
> > > > > > +	  probed at boot by default.
> > > > > > +
> > > > > > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> > > > > 
> > > > > This is not used anywhere, what is the reason for including it?
> > > 
> > > This is so that we can check if they are supported or not, but not check the
> > > speed of them. Similar to RISCV_EMULATED_UNALIGNED_ACCESS.
> > 
> > What do you mean? It isn't used anywhere so this "check if they are
> > supported or not" is not guarded by this config.
> > 
> > > 
> > > > > 
> > > > > > +	bool "Detect support for vector unaligned accesses"
> > > > > > +	select RISCV_VECTOR_MISALIGNED
> > > > > > +	help
> > > > > > +	  During boot, the kernel will detect if the system supports vector
> > > > > > +	  unaligned accesses.
> > > > > > +
> > > > > > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > > > +	bool "Probe speed of vector unaligned accesses"
> > > > > > +	select RISCV_VECTOR_MISALIGNED
> > > > > > +	help
> > > > > > +	  During boot, the kernel will run a series of tests to determine the
> > > > > > +	  speed of vector unaligned accesses if they are supported. This probing
> > > > > > +	  will dynamically determine the speed of vector unaligned accesses on
> > > > > > +	  the underlying system if they are supported.
> > > > > > +
> > > > > > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> > > > > 
> > > > > This should not be prefixed with CONFIG and does not include VECTOR in
> > > > > the name.
> > > 
> > > Huh thought it would warn fixed though
> > 
> > What do you mean by "warn fixed"?
> > 
> > > 
> > > > I assume you meant to put
> > > > > "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
> > > 
> > > This is to leave a faster path like SLOW or FAST to say that unaligned
> > > accesses aren't supported.
> > 
> > I am not sure what you are responding to. This comment seems to be
> > responding to my correction of
> > CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED->RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
> > so I don't see how that ties into SLOW/FAST.
> > 
> > > 
> > > > > 
> > > > > This was also intentionally left out on the scalar side [1]. The
> > > > > implication here is that having this config will cause people to compile
> > > > > kernels without unaligned access support which really shouldn't be
> > > > > something we are explicitly supporting.
> > > > > 
> > > > > If somebody does want to support hardware that does not handle vector
> > > > > unaligned accesses, the solution should be to add emulation support to
> > > > > the kernel.
> > > 
> > > Yes, but we don't have emulation support yet, so I do think it's a good idea.
> > 
> > I am hesitant because it is very likely that somebody will add support
> > for unaligned vector emulation. When there is emulation support, this
> > config option should not exist to be consistent with scalar. However if
> > we add this option in now, we must expect a user to enable this config,
> > and then 
> 
> I dunno, I think there could be value in having the option here. For
> scalar, we couldn't have an option that would break the uABI, so the
> unsupported option wasn't okay. That's not a constraint that we have for
> vector.
> 
> For vector, if you have a system that doesn't support misaligned access,
> you probably don't want to emulate the accesses either, since that's
> likely to remove any performance gains you get from using vector in the
> first place, so I can see benefit in the option.

We have the RISCV_MISALIGNED option that disables the scalar emulation,
but doesn't break the UABI because it's not saying that misaligned
scalar is not supported, but just that the kernel doesn't emulate.
Having an UNSUPPORTED option explicitly breaks the UABI which doesn't
seem like something that the kernel should support. If we are okay with
having options that break the UABI then this is fine, but I was under
the impression that we did our best to avoid that.

There definitely is value in having an option like this for hardware
that never wants to run misaligned code, but since we decided with the
scalar accesses that we should not break the UABI I think vector should
do the same.

> Enabling the probing is going to end up with the same outcome for userspace
> as having this option on such a system, so it comes down to whether you
> want to allow people to skip the probing if they know their system has
> this problem.
> 
> > we will have to get rid of it later. Users are not always happy
> > when config options are removed.
> 
> I dunno, I don't think that adding emulation requires that we remove
> this unsupported option.

I am probably being too paranoid about this, but I am hesitant to cause
vector and scalar misaligned access implementations to diverge. It is
inconsistent to allow an unsupported option for vector but not for
scalar when both support emulation. The probing is very fast as it just
checks if a misaligned access causes a trap and then sets the access
speed to unsupported if it does trap.

> 
> Additionally, what are we doing in the kernel if we detect that
> misaligned stuff isn't supported? Are we going to mandate that kernel
> code is aligned only, disable in-kernel vector or some other mechanism
> to make sure that things like crypto code don't have/introduce code
> that'll not run on these systems?

UNSUPPORTED will still be set by the quick probe so it would be possible
for the kernel/userspace to avoid running misaligned vector when it's
unsupported. Any kernel methods would probably want to always run
aligned vector unless misaligned support was determined to be FAST
anyway, I am doubtful that code will have different optimizations for
FAST, SLOW, and UNSUPPORTED but it is possible. 

I would prefer consistency between scalar and vector misaligned support,
but this is not a deal breaker for this patch. I am not convinced it is
the best choice, but I am okay with leaving this option in the kernel.

- Charlie

> 
> Cheers,
> Conor.
Eric Biggers June 21, 2024, 5:58 p.m. UTC | #14
On Fri, Jun 21, 2024 at 10:18:23AM -0700, Charlie Jenkins wrote:
> > Additionally, what are we doing in the kernel if we detect that
> > misaligned stuff isn't supported? Are we going to mandate that kernel
> > code is aligned only, disable in-kernel vector or some other mechanism
> > to make sure that things like crypto code don't have/introduce code
> > that'll not run on these systems?
> 
> UNSUPPORTED will still be set by the quick probe so it would be possible
> for the kernel/userspace to avoid running misaligned vector when it's
> unsupported. Any kernel methods would probably want to always run
> aligned vector unless misaligned support was determined to be FAST
> anyway, I am doubtful that code will have different optimizations for
> FAST, SLOW, and UNSUPPORTED but it is possible. 
> 
> I would prefer consistency between scalar and vector misaligned support,
> but this is not a deal breaker for this patch. I am not convinced it is
> the best choice, but I am okay with leaving this option in the kernel.
> 

Note that most of the vector crypto code (in arch/riscv/crypto/) assumes that
vector misaligned accesses are supported.  Many of the RISC-V vector crypto
instructions require using SEW=32 or SEW=64, and as a result, loads and stores
of data can be misaligned unless the code changes the SEW to 8 and back again,
which would be inefficient and add extra complexity.  I don't anticipate
workarounds for CPUs that couldn't be bothered to support misaligned accesses
being added.  So what we'll probably have to do is just disable the vector
crypto algorithms if the CPU doesn't support misaligned accesses...
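
To illustrate, a hypothetical sketch in the style of the probe asm in
this patch (not actual code from arch/riscv/crypto/):

	/*
	 * Load 16 bytes of an arbitrarily-aligned buffer with SEW=8,
	 * where element accesses can never be misaligned, then switch
	 * SEW back to 32 for the crypto instructions, instead of doing
	 * a vle32.v that may trap when buf isn't 4-byte aligned.
	 */
	asm volatile (
		".option push\n\t"
		".option arch, +v\n\t"
		"vsetivli zero, 16, e8, m1, ta, ma\n\t"
		"vle8.v v0, (%[buf])\n\t"
		"vsetivli zero, 4, e32, m1, ta, ma\n\t"
		/* SEW=32 crypto instructions would operate on v0 here */
		".option pop\n\t"
		: : [buf] "r" (buf) : "v0", "memory");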

- Eric
Conor Dooley June 21, 2024, 6:02 p.m. UTC | #15
On Fri, Jun 21, 2024 at 10:58:16AM -0700, Eric Biggers wrote:
> On Fri, Jun 21, 2024 at 10:18:23AM -0700, Charlie Jenkins wrote:
> > > Additionally, what are we doing in the kernel if we detect that
> > > misaligned stuff isn't supported? Are we going to mandate that kernel
> > > code is aligned only, disable in-kernel vector or some other mechanism
> > > to make sure that things like crypto code don't have/introduce code
> > > that'll not run on these systems?
> > 
> > UNSUPPORTED will still be set by the quick probe so it would be possible
> > for the kernel/userspace to avoid running misaligned vector when it's
> > unsupported. Any kernel methods would probably want to always run
> > aligned vector unless misaligned support was determined to be FAST
> > anyway, I am doubtful that code will have different optimizations for
> > FAST, SLOW, and UNSUPPORTED but it is possible. 
> > 
> > I would prefer consistency between scalar and vector misaligned support,
> > but this is not a deal breaker for this patch. I am not convinced it is
> > the best choice, but I am okay with leaving this option in the kernel.
> > 
> 
> Note that most of the vector crypto code (in arch/riscv/crypto/) assumes that
> vector misaligned accesses are supported.  Many of the RISC-V vector crypto
> instructions require using SEW=32 or SEW=64, and as a result, loads and stores
> of data can be misaligned unless the code changes the SEW to 8 and back again,
> which would be inefficient and add extra complexity.  I don't anticipate
> workarounds for CPUs that couldn't be bothered to support misaligned accesses
> being added.

> So what we'll probably have to do is just disable the vector
> crypto algorithms if the CPU doesn't support misaligned accesses...

Right. I was thinking similarly, and that we should just disable all
in-kernel vector code if the platform doesn't support misaligned vector.
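
Concretely, the gating could be as simple as something like this
(hypothetical, not part of this series):

	/*
	 * Skip registering a vector crypto implementation when the
	 * boot-time probe marked misaligned vector accesses as
	 * unsupported on this system.
	 */
	if (per_cpu(vector_misaligned_access, smp_processor_id()) ==
	    RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED)
		return -ENODEV;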
Jesse Taube June 21, 2024, 6:07 p.m. UTC | #16
On 6/21/24 13:18, Charlie Jenkins wrote:
> On Fri, Jun 21, 2024 at 11:06:31AM +0100, Conor Dooley wrote:
>> On Thu, Jun 20, 2024 at 03:14:14PM -0700, Charlie Jenkins wrote:
>>> On Thu, Jun 20, 2024 at 05:31:28PM -0400, Jesse Taube wrote:
>>>>
>>>>
>>>> On 6/17/24 22:09, Charlie Jenkins wrote:
>>>>> On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
>>>>>> On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
>>>>>>> Run a unaligned vector access to test if the system supports
>>>>>>> vector unaligned access. Add the result to a new key in hwprobe.
>>>>>>> This is useful for usermode to know if vector misaligned accesses are
>>>>>>> supported and if they are faster or slower than equivalent byte accesses.
>>>>>>>
>>>>>>> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
>>>>>>> ---
>>>>>>> V1 -> V2:
>>>>>>>    - Add Kconfig options
>>>>>>>    - Add insn_is_vector
>>>>>>>    - Add handle_vector_misaligned_load
>>>>>>>    - Fix build
>>>>>>>    - Seperate vector from scalar misaligned access
>>>>>>>    - This patch was almost completely rewritten
>>>>>>> ---
>>>>>>>    arch/riscv/Kconfig                         |  41 +++++++
>>>>>>>    arch/riscv/include/asm/cpufeature.h        |   7 +-
>>>>>>>    arch/riscv/include/asm/entry-common.h      |  11 --
>>>>>>>    arch/riscv/include/asm/hwprobe.h           |   2 +-
>>>>>>>    arch/riscv/include/asm/vector.h            |   1 +
>>>>>>>    arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
>>>>>>>    arch/riscv/kernel/Makefile                 |   4 +-
>>>>>>>    arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
>>>>>>>    arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
>>>>>>>    arch/riscv/kernel/unaligned_access_speed.c |   9 +-
>>>>>>>    arch/riscv/kernel/vector.c                 |   2 +-
>>>>>>>    11 files changed, 221 insertions(+), 21 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>>>>>> index b94176e25be1..f12df0ca6c18 100644
>>>>>>> --- a/arch/riscv/Kconfig
>>>>>>> +++ b/arch/riscv/Kconfig
>>>>>>> @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
>>>>>>>    	help
>>>>>>>    	  Embed support for emulating misaligned loads and stores.
>>>>>>> +config RISCV_VECTOR_MISALIGNED
>>>>>>> +	bool
>>>>>>> +	depends on RISCV_ISA_V
>>>>>>> +	help
>>>>>>> +	  Enable detecting support for vector misaligned loads and stores.
>>>>>>> +
>>>>>>>    choice
>>>>>>>    	prompt "Unaligned Accesses Support"
>>>>>>>    	default RISCV_PROBE_UNALIGNED_ACCESS
>>>>>>> @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
>>>>>>>    endchoice
>>>>>>> +choice
>>>>>>> +	prompt "Vector unaligned Accesses Support"
>>>>>>> +	depends on RISCV_ISA_V
>>>>>>> +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>>>>>>> +	help
>>>>>>> +	  This determines the level of support for vector unaligned accesses. This
>>>>>>> +	  information is used by the kernel to perform optimizations.
>>
>> I haven't actually checked the patchset, but is it actually used by the
>> kernel to perform optimisations?
> 
> No ;)
> 
> Right now this patch is just providing the information to userspace
> through hwprobe and doesn't optimize anything in the kernel.
> 
>>
>>>>>>> It is also
>>>>>>> +	  exposed to user space via the hwprobe syscall. The hardware will be
>>>>>>> +	  probed at boot by default.
>>>>>>> +
>>>>>>> +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
>>>>>>
>>>>>> This is not used anywhere, what is the reason for including it?
>>>>
>>>> This is so that we can check if they are supported or not, but not check the
>>>> speed of them. Similar to RISCV_EMULATED_UNALIGNED_ACCESS.
>>>
>>> What do you mean? It isn't used anywhere so this "check if they are
>>> supported or not" is not guarded by this config.
>>>
>>>>
>>>>>>
>>>>>>> +	bool "Detect support for vector unaligned accesses"
>>>>>>> +	select RISCV_VECTOR_MISALIGNED
>>>>>>> +	help
>>>>>>> +	  During boot, the kernel will detect if the system supports vector
>>>>>>> +	  unaligned accesses.
>>>>>>> +
>>>>>>> +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>>>>>>> +	bool "Probe speed of vector unaligned accesses"
>>>>>>> +	select RISCV_VECTOR_MISALIGNED
>>>>>>> +	help
>>>>>>> +	  During boot, the kernel will run a series of tests to determine the
>>>>>>> +	  speed of vector unaligned accesses if they are supported. This probing
>>>>>>> +	  will dynamically determine the speed of vector unaligned accesses on
>>>>>>> +	  the underlying system if they are supported.
>>>>>>> +
>>>>>>> +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
>>>>>>
>>>>>> This should not be prefixed with CONFIG and does not include VECTOR in
>>>>>> the name.
>>>>
>>>> Huh thought it would warn fixed though
>>>
>>> What do you mean by "warn fixed"?
>>>
>>>>
>>>>> I assume you meant to put
>>>>>> "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
>>>>
>>>> This is to leave a faster path like SLOW or FAST to say that unaligned
>>>> accesses aren't supported.
>>>
>>> I am not sure what you are responding to. This comment seems to be
>>> responding to my correction of
>>> CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED->RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
>>> so I don't see how that ties into SLOW/FAST.
>>>
>>>>
>>>>>>
>>>>>> This was also intentionally left out on the scalar side [1]. The
>>>>>> implication here is that having this config will cause people to compile
>>>>>> kernels without unaligned access support which really shouldn't be
>>>>>> something we are explicitly supporting.
>>>>>>
>>>>>> If somebody does want to support hardware that does not handle vector
>>>>>> unaligned accesses, the solution should be to add emulation support to
>>>>>> the kernel.
>>>>
>>>> Yes, but we don't have emulation support yet, so I do think it's a good idea.
>>>
>>> I am hesitant because it is very likely that somebody will add support
>>> for unaligned vector emulation. When there is emulation support, this
>>> config option should not exist to be consistent with scalar. However if
>>> we add this option in now, we must expect a user to enable this config,
>>> and then
>>
>> I dunno, I think there could be value in having the option here. For
>> scalar, we couldn't have an option that would break the uABI, so the
>> unsupported option wasn't okay. That's not a constraint that we have for
>> vector.
>>
>> For vector, if you have a system that doesn't support misaligned access,
>> you probably don't want to emulate the accesses either, since that's
>> likely remove any performance gains you get from using vector in the
>> first place, so I can see benefit in the option.
> 
> We have the RISCV_MISALIGNED option that disables the scalar emulation,
> but doesn't break the UABI because it's not saying that misaligned
> scalar is not supported, but just that the kernel doesn't emulate.
> Having an UNSUPPORTED option explicitly breaks the UABI which doesn't
> seem like something that the kernel should support. If we are okay with
> having options that break the UABI then this is fine, but I was under
> the impression that we did our best to avoid that.
> 
> There definitely is value in having an option like this for hardware
> that never wants to run misaligned code, but since we decided with the
> scalar accesses that we should not break the UABI I think vector should
> do the same.

The rationale for scalar accesses was slightly different, as the base
ISA spec said/says: "The base ISA supports misaligned accesses, but
these might run extremely slowly depending on the implementation."

Section: 2.6 Load and Store Instructions. Available: 
https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

> 
>> Enabling the probing is going to end up with same outcome for userspace
>> as having this option on such a system, so it comes down to whether you
>> want to allow people to skip the probing if they know their system has
>> this problem.
>>
>>> we will have to get rid of it later. Users are not always happy
>>> when config options are removed.
>>
>> I dunno, I don't think that adding emulation requires that we remove
>> this unsupported option.
> 
> I am probably being too paranoid about this but I am hesistant to cause
> vector and scalar misaligned access implemenations to diverge. It is
> inconsistent to allow an unsupported option for vector but not for
> scalar when both support emulation. The probing is very fast as it just
> checks if a misaligned access causes a trap and then sets the access
> speed to unsupported if it does trap.

Charlie is right about probing for support being fast. There is
RISCV_DETECT_VECTOR_UNALIGNED_ACCESS to only detect support, not the
speed. I thought it might be a good idea to add this config option, but
I'll just drop it to shorten this thread.

> 
>>
>> Additionally, what are we doing in the kernel if we detect that
>> misaligned stuff isn't supported? Are we going to mandate that kernel
>> code is aligned only

As of now, yes.

>> disable in-kernel vector or some other mechanism
>> to make sure that things like crypto code don't have/introduce code
>> that'll not run on these systems?

I'm not too familiar with the uses of unaligned vector accesses, but if
they significantly improve performance then I think there should be a
faster unaligned-access pathway and an aligned-access pathway, so both
are supported. This also ties into your first question.

Thanks,
Jesse Taube

> UNSUPPORTED will still be set by the quick probe so it would be possible
> for the kernel/userspace to avoid running misaligned vector when it's
> unsupported. Any kernel methods would probably want to always run
> aligned vector unless misaligned support was determined to be FAST
> anyway, I am doubtful that code will have different optimizations for
> FAST, SLOW, and UNSUPPORTED but it is possible.
> 
> I would prefer consistency between scalar and vector misaligned support,
> but this is not a deal breaker for this patch. I am not convinced it is
> the best choice, but I am okay with leaving this option in the kernel.
> 
> - Charlie
> 
>>
>> Cheers,
>> Conor.
> 
>
Conor Dooley June 22, 2024, 11:42 a.m. UTC | #17
On Fri, Jun 21, 2024 at 02:07:58PM -0400, Jesse Taube wrote:
> 
> 
> On 6/21/24 13:18, Charlie Jenkins wrote:
> > On Fri, Jun 21, 2024 at 11:06:31AM +0100, Conor Dooley wrote:
> > > On Thu, Jun 20, 2024 at 03:14:14PM -0700, Charlie Jenkins wrote:
> > > > On Thu, Jun 20, 2024 at 05:31:28PM -0400, Jesse Taube wrote:
> > > > > 
> > > > > 
> > > > > On 6/17/24 22:09, Charlie Jenkins wrote:
> > > > > > On Mon, Jun 17, 2024 at 06:43:32PM -0700, Charlie Jenkins wrote:
> > > > > > > On Thu, Jun 13, 2024 at 03:16:13PM -0400, Jesse Taube wrote:
> > > > > > > > Run a unaligned vector access to test if the system supports
> > > > > > > > vector unaligned access. Add the result to a new key in hwprobe.
> > > > > > > > This is useful for usermode to know if vector misaligned accesses are
> > > > > > > > supported and if they are faster or slower than equivalent byte accesses.
> > > > > > > > 
> > > > > > > > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > > > > > > > ---
> > > > > > > > V1 -> V2:
> > > > > > > >    - Add Kconfig options
> > > > > > > >    - Add insn_is_vector
> > > > > > > >    - Add handle_vector_misaligned_load
> > > > > > > >    - Fix build
> > > > > > > >    - Seperate vector from scalar misaligned access
> > > > > > > >    - This patch was almost completely rewritten
> > > > > > > > ---
> > > > > > > >    arch/riscv/Kconfig                         |  41 +++++++
> > > > > > > >    arch/riscv/include/asm/cpufeature.h        |   7 +-
> > > > > > > >    arch/riscv/include/asm/entry-common.h      |  11 --
> > > > > > > >    arch/riscv/include/asm/hwprobe.h           |   2 +-
> > > > > > > >    arch/riscv/include/asm/vector.h            |   1 +
> > > > > > > >    arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> > > > > > > >    arch/riscv/kernel/Makefile                 |   4 +-
> > > > > > > >    arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> > > > > > > >    arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> > > > > > > >    arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> > > > > > > >    arch/riscv/kernel/vector.c                 |   2 +-
> > > > > > > >    11 files changed, 221 insertions(+), 21 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > > > > > index b94176e25be1..f12df0ca6c18 100644
> > > > > > > > --- a/arch/riscv/Kconfig
> > > > > > > > +++ b/arch/riscv/Kconfig
> > > > > > > > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> > > > > > > >    	help
> > > > > > > >    	  Embed support for emulating misaligned loads and stores.
> > > > > > > > +config RISCV_VECTOR_MISALIGNED
> > > > > > > > +	bool
> > > > > > > > +	depends on RISCV_ISA_V
> > > > > > > > +	help
> > > > > > > > +	  Enable detecting support for vector misaligned loads and stores.
> > > > > > > > +
> > > > > > > >    choice
> > > > > > > >    	prompt "Unaligned Accesses Support"
> > > > > > > >    	default RISCV_PROBE_UNALIGNED_ACCESS
> > > > > > > > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> > > > > > > >    endchoice
> > > > > > > > +choice
> > > > > > > > +	prompt "Vector unaligned Accesses Support"
> > > > > > > > +	depends on RISCV_ISA_V
> > > > > > > > +	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > > > > > +	help
> > > > > > > > +	  This determines the level of support for vector unaligned accesses. This
> > > > > > > > +	  information is used by the kernel to perform optimizations.
> > > 
> > > I haven't actually checked the patchset, but is it actually used by the
> > > kernel to perform optimisations?
> > 
> > No ;)
> > 
> > Right now this patch is just providing the information to userspace
> > through hwprobe and doesn't optimize anything in the kernel.
> > 
> > > 
> > > > > > > > It is also
> > > > > > > > +	  exposed to user space via the hwprobe syscall. The hardware will be
> > > > > > > > +	  probed at boot by default.
> > > > > > > > +
> > > > > > > > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> > > > > > > 
> > > > > > > This is not used anywhere, what is the reason for including it?
> > > > > 
> > > > > This is so that we can check if they are supported or not, but not check
> > > > > their speed. Similar to RISCV_EMULATED_UNALIGNED_ACCESS.
> > > > 
> > > > What do you mean? It isn't used anywhere so this "check if they are
> > > > supported or not" is not guarded by this config.
> > > > 
> > > > > 
> > > > > > > 
> > > > > > > > +	bool "Detect support for vector unaligned accesses"
> > > > > > > > +	select RISCV_VECTOR_MISALIGNED
> > > > > > > > +	help
> > > > > > > > +	  During boot, the kernel will detect if the system supports vector
> > > > > > > > +	  unaligned accesses.
> > > > > > > > +
> > > > > > > > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > > > > > > +	bool "Probe speed of vector unaligned accesses"
> > > > > > > > +	select RISCV_VECTOR_MISALIGNED
> > > > > > > > +	help
> > > > > > > > +	  During boot, the kernel will run a series of tests to determine the
> > > > > > > > +	  speed of vector unaligned accesses if they are supported. This probing
> > > > > > > > +	  will dynamically determine the speed of vector unaligned accesses on
> > > > > > > > +	  the underlying system if they are supported.
> > > > > > > > +
> > > > > > > > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> > > > > > > 
> > > > > > > This should not be prefixed with CONFIG and does not include VECTOR in
> > > > > > > the name.
> > > > > 
> > > > > Huh thought it would warn fixed though
> > > > 
> > > > What do you mean by "warn fixed"?
> > > > 
> > > > > 
> > > > > > I assume you meant to put
> > > > > > > "RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED" here?
> > > > > 
> > > > > This is to leave a faster path like SLOW or FAST to say that unaligned
> > > > > accesses aren't supported.
> > > > 
> > > > I am not sure what you are responding to. This comment seems to be
> > > > responding to my correction of
> > > > CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED->RISCV_VEC_UNALIGNED_ACCESS_UNSUPPORTED
> > > > so I don't see how that ties into SLOW/FAST.
> > > > 
> > > > > 
> > > > > > > 
> > > > > > > This was also intentionally left out on the scalar side [1]. The
> > > > > > > implication here is that having this config will cause people to compile
> > > > > > > kernels without unaligned access support, which really shouldn't be
> > > > > > > something we are explicitly supporting.
> > > > > > > 
> > > > > > > If somebody does want to support hardware that does not handle vector
> > > > > > > unaligned accesses, the solution should be to add emulation support to
> > > > > > > the kernel.
> > > > > 
> > > > > Yes, but we don't have emulation support yet, so I do think it's a good idea.
> > > > 
> > > > I am hesitant because it is very likely that somebody will add support
> > > > for unaligned vector emulation. When there is emulation support, this
> > > > config option should not exist, to be consistent with scalar. However, if
> > > > we add this option in now, we must expect a user to enable this config,
> > > > and then
> > > 
> > > I dunno, I think there could be value in having the option here. For
> > > scalar, we couldn't have an option that would break the uABI, so the
> > > unsupported option wasn't okay. That's not a constraint that we have for
> > > vector.
> > > 
> > > For vector, if you have a system that doesn't support misaligned access,
> > > you probably don't want to emulate the accesses either, since that's
> > > likely to remove any performance gains you get from using vector in the
> > > first place, so I can see benefit in the option.
> > 
> > We have the RISCV_MISALIGNED option that disables the scalar emulation,
> > but doesn't break the UABI because it's not saying that misaligned
> > scalar is not supported, but just that the kernel doesn't emulate.
> > Having an UNSUPPORTED option explicitly breaks the UABI, which doesn't
> > seem like something that the kernel should support. If we are okay with
> > having options that break the UABI then this is fine, but I was under
> > the impression that we did our best to avoid that.
> > 
> > There definitely is value in having an option like this for hardware
> > that never wants to run misaligned code, but since we decided with the
> > scalar accesses that we should not break the UABI I think vector should
> > do the same.
> 
> The rationale for scalar accesses was slightly different, as the base
> ISA spec said/says: "The base ISA supports misaligned accesses, but
> these might run extremely slowly depending on the implementation."
> 
> Section: 2.6 Load and Store Instructions. Available:
> https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

Right, the rationale for the scalar stuff is that the uABI requires
misaligned accesses, cos when the port was merged the ISA required them
to be supported. The same constraint doesn't exist for vector, so there
is no uABI concern.

For scalar, the options must end up with a system that supports
misaligned access - hence anything without in-kernel emulation is marked
NONPORTABLE, since we cannot be sure that all systems will have
emulation in their firmware and/or support in the hardware.

For vector, emulation might be nice to have (or not, given the extra
code and complexity for shit performance that makes using vector not
worthwhile, but that's another argument), but it is not required for
uABI reasons. None of the options in this series make any difference to
the uABI anyway; no matter what you choose, if the hardware doesn't
support misaligned accesses, you're gonna have problems.

> > > Enabling the probing is going to end up with the same outcome for userspace
> > > as having this option on such a system, so it comes down to whether you
> > > want to allow people to skip the probing if they know their system has
> > > this problem.
> > > 
> > > > we will have to get rid of it later. Users are not always happy
> > > > when config options are removed.
> > > 
> > > I dunno, I don't think that adding emulation requires that we remove
> > > this unsupported option.
> > 
> > I am probably being too paranoid about this, but I am hesitant to cause
> > vector and scalar misaligned access implementations to diverge. It is
> > inconsistent to allow an unsupported option for vector but not for
> > scalar when both support emulation.

> > The probing is very fast as it just
> > checks if a misaligned access causes a trap and then sets the access
> > speed to unsupported if it does trap.
> 
> Charlie is right about probing for support being fast. There is
> RISCV_DETECT_VECTOR_UNALIGNED_ACCESS to only detect support, not the
> speed. I thought it might be a good idea to add this config option, but
> I'll just drop it to shorten this thread.

I actually agree with Charlie that the option really isn't needed; I was
just arguing against the justifications he gave. Checking for a trap is
fairly fast and not much code, which I think is reason enough not to
have the option.

I really think that we should be adding checks for how well it performs
though. Unless it is supported by the hardware (which we cannot know
just by checking for traps, since the firmware could be emulating), it
is unlikely to perform better than using the same instructions but
aligned, or sticking to scalars.

IMO the options should be (provided we don't have emulation; a rough
Kconfig sketch follows the list):
- RISCV_VECTOR_PROBE_UNALIGNED_ACCESS: Check for a trap and then bail
  early if not supported. If supported, check the speed.
- RISCV_VECTOR_SLOW_UNALIGNED_ACCESS: Don't check for traps or the
  speed, report slow. NONPORTABLE.
- RISCV_VECTOR_EFFICIENT_UNALIGNED_ACCESS: Don't check for traps or the
  speed, report fast. NONPORTABLE.
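
Something like this, as an illustrative Kconfig sketch only (names and
help text are mine, modelled on the existing scalar choice block):

choice
	prompt "Vector unaligned Accesses Support"
	default RISCV_VECTOR_PROBE_UNALIGNED_ACCESS

config RISCV_VECTOR_PROBE_UNALIGNED_ACCESS
	bool "Probe speed of vector unaligned accesses"
	help
	  Check for a trap at boot and bail early if vector unaligned
	  accesses are unsupported; otherwise probe their speed.

config RISCV_VECTOR_SLOW_UNALIGNED_ACCESS
	bool "Assume the system supports slow vector unaligned accesses"
	depends on NONPORTABLE
	help
	  Skip the boot-time checks and report slow vector unaligned
	  accesses through hwprobe.

config RISCV_VECTOR_EFFICIENT_UNALIGNED_ACCESS
	bool "Assume the system supports fast vector unaligned accesses"
	depends on NONPORTABLE
	help
	  Skip the boot-time checks and report fast vector unaligned
	  accesses through hwprobe.

endchoice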

Having two different options for "detection" and "probing" is just
confusing, and RISCV_DETECT_VECTOR_UNALIGNED_ACCESS' two possible results
both effectively mean the same thing to me if I were writing a userspace
program that wanted to make use of vector optimisations.

> > > Additionally, what are we doing in the kernel if we detect that
> > > misaligned stuff isn't supported? Are we going to mandate that kernel
> > > code is aligned only
> 
> As of now yes.

Seems like we already have in-kernel vector code that assumes misaligned
accesses are supported :)

> > > disable in-kernel vector or some other mechanism
> > > to make sure that things like crypto code don't have/introduce code
> > > that'll not run on these systems?
> 
> I'm not too familiar with the uses of unaligned vector accesses, but if
> they significantly improve performance then I think there should be a
> faster unaligned access pathway and an aligned access pathway, so both
> are supported. This also ties into your first question.

I dunno, I think we'll have enough bits of optional code for different
extensions; I don't want to have different codepaths for these kinds of
systems. As I said to Eric, I think the vector crypto stuff should just
get disabled once we detect that misaligned accesses are unsupported.
He seems unwilling to support systems that didn't implement misaligned
accesses in the crypto code anyway.

I'm tempted to say that we should force-enable Zicclsm if we detect
support for both vector and scalar crypto, but it happens too late to
fix that up, I think.

> > UNSUPPORTED will still be set by the quick probe so it would be possible
> > for the kernel/userspace to avoid running misaligned vector when it's
> > unsupported. Any kernel methods would probably want to always run
> > aligned vector unless misaligned support was determined to be FAST
> > anyway, I am doubtful that code will have different optimizations for
> > FAST, SLOW, and UNSUPPORTED but it is possible.
> > 
> > I would prefer consistency between scalar and vector misaligned support,
> > but this is not a deal breaker for this patch. I am not convinced it is
> > the best choice, but I am okay with leaving this option in the kernel.

I would like consistency too :)

Thanks,
Conor.
Andy Chiu June 24, 2024, 5:34 a.m. UTC | #18
On Fri, Jun 21, 2024 at 2:52 AM Evan Green <evan@rivosinc.com> wrote:
>
> On Thu, Jun 13, 2024 at 12:17 PM Jesse Taube <jesse@rivosinc.com> wrote:
> >
> > Run an unaligned vector access to test if the system supports
> > vector unaligned access. Add the result to a new key in hwprobe.
> > This is useful for usermode to know if vector misaligned accesses are
> > supported and if they are faster or slower than equivalent byte accesses.
> >
> > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > ---
> > V1 -> V2:
> >  - Add Kconfig options
> >  - Add insn_is_vector
> >  - Add handle_vector_misaligned_load
> >  - Fix build
> >  - Separate vector from scalar misaligned access
> >  - This patch was almost completely rewritten
> > ---
> >  arch/riscv/Kconfig                         |  41 +++++++
> >  arch/riscv/include/asm/cpufeature.h        |   7 +-
> >  arch/riscv/include/asm/entry-common.h      |  11 --
> >  arch/riscv/include/asm/hwprobe.h           |   2 +-
> >  arch/riscv/include/asm/vector.h            |   1 +
> >  arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> >  arch/riscv/kernel/Makefile                 |   4 +-
> >  arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> >  arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> >  arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> >  arch/riscv/kernel/vector.c                 |   2 +-
> >  11 files changed, 221 insertions(+), 21 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index b94176e25be1..f12df0ca6c18 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> >         help
> >           Embed support for emulating misaligned loads and stores.
> >
> > +config RISCV_VECTOR_MISALIGNED
> > +       bool
> > +       depends on RISCV_ISA_V
> > +       help
> > +         Enable detecting support for vector misaligned loads and stores.
> > +
> >  choice
> >         prompt "Unaligned Accesses Support"
> >         default RISCV_PROBE_UNALIGNED_ACCESS
> > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> >
> >  endchoice
> >
> > +choice
> > +       prompt "Vector unaligned Accesses Support"
> > +       depends on RISCV_ISA_V
> > +       default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > +       help
> > +         This determines the level of support for vector unaligned accesses. This
> > +         information is used by the kernel to perform optimizations. It is also
> > +         exposed to user space via the hwprobe syscall. The hardware will be
> > +         probed at boot by default.
> > +
> > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> > +       bool "Detect support for vector unaligned accesses"
> > +       select RISCV_VECTOR_MISALIGNED
> > +       help
> > +         During boot, the kernel will detect if the system supports vector
> > +         unaligned accesses.
> > +
> > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > +       bool "Probe speed of vector unaligned accesses"
> > +       select RISCV_VECTOR_MISALIGNED
> > +       help
> > +         During boot, the kernel will run a series of tests to determine the
> > +         speed of vector unaligned accesses if they are supported. This probing
> > +         will dynamically determine the speed of vector unaligned accesses on
> > +         the underlying system if they are supported.
> > +
> > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> > +       bool "Assume the system does not support vector unaligned memory accesses"
> > +       help
> > +         Assume that the system does not support vector unaligned memory accesses.
> > +         The kernel and userspace programs may run them successfully on systems
> > +         that do support vector unaligned memory accesses.
> > +
> > +endchoice
> > +
> >  endmenu # "Platform type"
> >
> >  menu "Kernel features"
> > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > index 347805446151..d0ea5921ab20 100644
> > --- a/arch/riscv/include/asm/cpufeature.h
> > +++ b/arch/riscv/include/asm/cpufeature.h
> > @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
> >
> >  void riscv_user_isa_enable(void);
> >
> > -#if defined(CONFIG_RISCV_MISALIGNED)
> >  bool check_unaligned_access_emulated_all_cpus(void);
> > +#if defined(CONFIG_RISCV_MISALIGNED)
> >  void unaligned_emulation_finish(void);
> >  bool unaligned_ctl_available(void);
> >  DECLARE_PER_CPU(long, misaligned_access_speed);
> > @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
> >  }
> >  #endif
> >
> > +bool check_vector_unaligned_access_emulated_all_cpus(void);
> > +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> > +DECLARE_PER_CPU(long, vector_misaligned_access);
> > +#endif
> > +
> >  #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> >  DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
> >
> > diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> > index 2293e535f865..7b32d2b08bb6 100644
> > --- a/arch/riscv/include/asm/entry-common.h
> > +++ b/arch/riscv/include/asm/entry-common.h
> > @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
> >  void handle_page_fault(struct pt_regs *regs);
> >  void handle_break(struct pt_regs *regs);
> >
> > -#ifdef CONFIG_RISCV_MISALIGNED
> >  int handle_misaligned_load(struct pt_regs *regs);
> >  int handle_misaligned_store(struct pt_regs *regs);
> > -#else
> > -static inline int handle_misaligned_load(struct pt_regs *regs)
> > -{
> > -       return -1;
> > -}
> > -static inline int handle_misaligned_store(struct pt_regs *regs)
> > -{
> > -       return -1;
> > -}
> > -#endif
> >
> >  #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> > index 150a9877b0af..ef01c182af2b 100644
> > --- a/arch/riscv/include/asm/hwprobe.h
> > +++ b/arch/riscv/include/asm/hwprobe.h
> > @@ -8,7 +8,7 @@
> >
> >  #include <uapi/asm/hwprobe.h>
> >
> > -#define RISCV_HWPROBE_MAX_KEY 7
> > +#define RISCV_HWPROBE_MAX_KEY 8
> >
> >  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
> >  {
> > diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> > index be7d309cca8a..99b0f91db9ee 100644
> > --- a/arch/riscv/include/asm/vector.h
> > +++ b/arch/riscv/include/asm/vector.h
> > @@ -21,6 +21,7 @@
> >
> >  extern unsigned long riscv_v_vsize;
> >  int riscv_v_setup_vsize(void);
> > +bool insn_is_vector(u32 insn_buf);
> >  bool riscv_v_first_use_handler(struct pt_regs *regs);
> >  void kernel_vector_begin(void);
> >  void kernel_vector_end(void);
> > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> > index 023b7771d1b7..2fee870e41bb 100644
> > --- a/arch/riscv/include/uapi/asm/hwprobe.h
> > +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> > @@ -75,6 +75,11 @@ struct riscv_hwprobe {
> >  #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
> >  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
> >  #define RISCV_HWPROBE_KEY_MISALIGNED_PERF      7
> > +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF  8
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4
> >  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
> >
> >  /* Flags */
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index 5b243d46f4b1..62ac19c029f1 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -62,8 +62,8 @@ obj-y += probes/
> >  obj-y  += tests/
> >  obj-$(CONFIG_MMU) += vdso.o vdso/
> >
> > -obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
> > -obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
> > +obj-y  += traps_misaligned.o
> > +obj-y  += unaligned_access_speed.o
> >  obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)     += copy-unaligned.o
> >
> >  obj-$(CONFIG_FPU)              += fpu.o
> > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > index e910e2971984..c40df314058b 100644
> > --- a/arch/riscv/kernel/sys_hwprobe.c
> > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> >  }
> >  #endif
> >
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > +{
> > +       int cpu;
> > +       u64 perf = -1ULL;
> > +
> > +       if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +
> > +       /* Return if supported or not even if speed wasn't probed */
> > +       for_each_cpu(cpu, cpus) {
> > +               int this_perf = per_cpu(vector_misaligned_access, cpu);
> > +
> > +               if (perf == -1ULL)
> > +                       perf = this_perf;
> > +
> > +               if (perf != this_perf) {
> > +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +                       break;
> > +               }
> > +       }
> > +
> > +       if (perf == -1ULL)
> > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +
> > +       return perf;
> > +}
> > +#else
> > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > +{
> > +       if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +
> > +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +}
> > +#endif
> > +
> >  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >                              const struct cpumask *cpus)
> >  {
> > @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >                 pair->value = hwprobe_misaligned(cpus);
> >                 break;
> >
> > +       case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> > +               pair->value = hwprobe_vec_misaligned(cpus);
> > +               break;
> > +
> >         case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
> >                 pair->value = 0;
> >                 if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> > index 8fadbe00dd62..6f0264a8c9de 100644
> > --- a/arch/riscv/kernel/traps_misaligned.c
> > +++ b/arch/riscv/kernel/traps_misaligned.c
> > @@ -16,6 +16,7 @@
> >  #include <asm/entry-common.h>
> >  #include <asm/hwprobe.h>
> >  #include <asm/cpufeature.h>
> > +#include <asm/vector.h>
> >
> >  #define INSN_MATCH_LB                  0x3
> >  #define INSN_MASK_LB                   0x707f
> > @@ -322,12 +323,37 @@ union reg_data {
> >         u64 data_u64;
> >  };
> >
> > -static bool unaligned_ctl __read_mostly;
> > -
> >  /* sysctl hooks */
> >  int unaligned_enabled __read_mostly = 1;       /* Enabled by default */
> >
> > -int handle_misaligned_load(struct pt_regs *regs)
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > +{
> > +       unsigned long epc = regs->epc;
> > +       unsigned long insn;
> > +
> > +       if (get_insn(regs, epc, &insn))
> > +               return -1;
> > +
> > +       /* Only return 0 when in check_vector_unaligned_access_emulated */
> > +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> > +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +               regs->epc = epc + INSN_LEN(insn);
> > +               return 0;
> > +       }
> > +
> > +       /* If vector instruction we don't emulate it yet */
> > +       regs->epc = epc;
> > +       return -1;
> > +}
> > +#else
> > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > +{
> > +       return -1;
> > +}
> > +#endif
> > +
> > +static int handle_scalar_misaligned_load(struct pt_regs *regs)
> >  {
> >         union reg_data val;
> >         unsigned long epc = regs->epc;
> > @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
> >         return 0;
> >  }
> >
> > -int handle_misaligned_store(struct pt_regs *regs)
> > +static int handle_scalar_misaligned_store(struct pt_regs *regs)
> >  {
> >         union reg_data val;
> >         unsigned long epc = regs->epc;
> > @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
> >         return 0;
> >  }
> >
> > +int handle_misaligned_load(struct pt_regs *regs)
> > +{
> > +       unsigned long epc = regs->epc;
> > +       unsigned long insn;
> > +
> > +       if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> > +               if (get_insn(regs, epc, &insn))
> > +                       return -1;
> > +
> > +               if (insn_is_vector(insn))
> > +                       return handle_vector_misaligned_load(regs);
> > +       }
> > +
> > +       if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > +               return handle_scalar_misaligned_load(regs);
> > +
> > +       return -1;
> > +}
> > +
> > +int handle_misaligned_store(struct pt_regs *regs)
> > +{
> > +       if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > +               return handle_scalar_misaligned_store(regs);
> > +
> > +       return -1;
> > +}
> > +
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> > +{
> > +       long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> > +       unsigned long tmp_var;
> > +
> > +       *mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +
> > +       local_irq_enable();
>
> Generally if a function is called with interrupts disabled there's a
> reason for it, like the system will crash if an interrupt fires during
> execution of this region. I haven't researched this, but to feel
> comfortable I'd want to know why interrupts were disabled on entry
> here, why it's safe to enable them now, and why it's safe to return
> from the function with them still enabled.
>
> I'm guessing this was added because may_use_simd() was blowing up for
> you without it. If that's the case, I think we'll need to reconcile
> that in a different way. From a quick glance at kernel_mode_vector.c,
> I was originally thinking may_use_simd() enforces this because there's

Current nesting support for kernel-mode vector only allows softirq
context to nest a V context on top of a kernel thread's V context.
I did not expect the use of V in interrupt handlers, but I am open
to discussion if it is needed.

The reason why may_use_simd() checks for irqs_disabled() is that the
local_bh_enable() later in kernel_vector_end() expects to run with irqs
on. Although the success path of preemptible V does not require
!irqs_disabled(), the kernel falls back to non-preemptible V
(get_cpu_vector_context()) in some cases. The first is the nesting
softirq case, or a rare case where an exception handler uses V in a
kernel fault that also uses V. The second is when memory allocation for
kernel_vstate fails. The fallback uses non-preemptible V, so it is
tied to the bh_* pair.

We can get rid of the last case by failing the launch of a kernel
thread when the allocation fails.
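
In other words, roughly (a simplified sketch of the constraint, not the
actual kernel_mode_vector.c code):

#include <asm/vector.h>

static void vector_probe_sketch(void)
{
	/*
	 * kernel_vector_begin() may fall back to the non-preemptible
	 * path (get_cpu_vector_context()), which disables bottom halves,
	 * so the matching local_bh_enable() in kernel_vector_end() must
	 * not run with hardirqs off -- hence the irqs_disabled() check
	 * in may_use_simd().
	 */
	kernel_vector_begin();
	/* ... vector instructions ... */
	kernel_vector_end();
}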

> only a single instance of current->thread.kernel_vstate. So if you
> tried to start a v context from an interrupt, you may land on top of
> another kernel user, and there's nowhere to save that kernel user's
> old state. But there does seem to be some support for nested V context
> associated with CONFIG_RISCV_ISA_V_PREEMPTIVE, so I'm a little
> confused. It seems like your options are to try and get running on
> each CPU in a different manner (such that you're not stuck in irq
> context by the time you first run), or try and dive into the kernel V
> context code to enable support for starting a V context in irq land.
>
>
> > +       kernel_vector_begin();
> > +       __asm__ __volatile__ (
> > +               ".balign 4\n\t"
> > +               ".option push\n\t"
> > +               ".option arch, +zve32x\n\t"
> > +               "       vsetivli zero, 1, e16, m1, ta, ma\n\t"  // Vectors of 16b
> > +               "       vle16.v v0, (%[ptr])\n\t"               // Load bytes
> > +               ".option pop\n\t"
> > +               : : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
> > +       kernel_vector_end();
> > +}
> > +
> > +bool check_vector_unaligned_access_emulated_all_cpus(void)
> > +{
> > +       int cpu;
> > +       bool ret = true;
> > +
> > +       if (!has_vector()) {
> > +               for_each_online_cpu(cpu)
> > +                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > +               return false;
> > +       }
> > +
> > +       schedule_on_each_cpu(check_vector_unaligned_access_emulated);
> > +
> > +       for_each_online_cpu(cpu)
> > +               if (per_cpu(vector_misaligned_access, cpu)
> > +                   != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
> > +                       return false;
> > +
> > +       return ret;
> > +}
> > +#else
> > +bool check_vector_unaligned_access_emulated_all_cpus(void)
> > +{
> > +       return false;
> > +}
> > +#endif
> > +
> > +#ifdef CONFIG_RISCV_MISALIGNED
> > +
> > +static bool unaligned_ctl __read_mostly;
> > +
> >  static void check_unaligned_access_emulated(struct work_struct *unused)
> >  {
> >         int cpu = smp_processor_id();
> > @@ -563,3 +668,9 @@ bool unaligned_ctl_available(void)
> >  {
> >         return unaligned_ctl;
> >  }
> > +#else
> > +bool check_unaligned_access_emulated_all_cpus(void)
> > +{
> > +       return false;
> > +}
> > +#endif
> > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> > index 70c1588fc353..c6106bd4a25a 100644
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > @@ -19,7 +19,8 @@
> >  #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> >  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> >
> > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> >
> >  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >  static cpumask_t fast_misaligned_access;
> > @@ -268,12 +269,18 @@ static int check_unaligned_access_all_cpus(void)
> >
> >         if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
> >                 for_each_online_cpu(cpu) {
> > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > +                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
> > +#endif
> > +#ifdef CONFIG_RISCV_MISALIGNED
> >                         per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
> > +#endif
> >                 }
> >                 return 0;
> >         }
> >
> >         all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> > +       check_vector_unaligned_access_emulated_all_cpus();
> >
> >  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >         if (!all_cpus_emulated)
> > diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
> > index 682b3feee451..821818886fab 100644
> > --- a/arch/riscv/kernel/vector.c
> > +++ b/arch/riscv/kernel/vector.c
> > @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void)
> >  #endif
> >  }
> >
> > -static bool insn_is_vector(u32 insn_buf)
> > +bool insn_is_vector(u32 insn_buf)
> >  {
> >         u32 opcode = insn_buf & __INSN_OPCODE_MASK;
> >         u32 width, csr;
> > --
> > 2.43.0
> >

Thanks,
Andy
Evan Green June 24, 2024, 4:57 p.m. UTC | #19
On Sun, Jun 23, 2024 at 10:34 PM Andy Chiu <andy.chiu@sifive.com> wrote:
>
> On Fri, Jun 21, 2024 at 2:52 AM Evan Green <evan@rivosinc.com> wrote:
> >
> > On Thu, Jun 13, 2024 at 12:17 PM Jesse Taube <jesse@rivosinc.com> wrote:
> > >
> > > Run an unaligned vector access to test if the system supports
> > > vector unaligned access. Add the result to a new key in hwprobe.
> > > This is useful for usermode to know if vector misaligned accesses are
> > > supported and if they are faster or slower than equivalent byte accesses.
> > >
> > > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> > > ---
> > > V1 -> V2:
> > >  - Add Kconfig options
> > >  - Add insn_is_vector
> > >  - Add handle_vector_misaligned_load
> > >  - Fix build
> > >  - Separate vector from scalar misaligned access
> > >  - This patch was almost completely rewritten
> > > ---
> > >  arch/riscv/Kconfig                         |  41 +++++++
> > >  arch/riscv/include/asm/cpufeature.h        |   7 +-
> > >  arch/riscv/include/asm/entry-common.h      |  11 --
> > >  arch/riscv/include/asm/hwprobe.h           |   2 +-
> > >  arch/riscv/include/asm/vector.h            |   1 +
> > >  arch/riscv/include/uapi/asm/hwprobe.h      |   5 +
> > >  arch/riscv/kernel/Makefile                 |   4 +-
> > >  arch/riscv/kernel/sys_hwprobe.c            |  41 +++++++
> > >  arch/riscv/kernel/traps_misaligned.c       | 119 ++++++++++++++++++++-
> > >  arch/riscv/kernel/unaligned_access_speed.c |   9 +-
> > >  arch/riscv/kernel/vector.c                 |   2 +-
> > >  11 files changed, 221 insertions(+), 21 deletions(-)
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index b94176e25be1..f12df0ca6c18 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -723,6 +723,12 @@ config RISCV_MISALIGNED
> > >         help
> > >           Embed support for emulating misaligned loads and stores.
> > >
> > > +config RISCV_VECTOR_MISALIGNED
> > > +       bool
> > > +       depends on RISCV_ISA_V
> > > +       help
> > > +         Enable detecting support for vector misaligned loads and stores.
> > > +
> > >  choice
> > >         prompt "Unaligned Accesses Support"
> > >         default RISCV_PROBE_UNALIGNED_ACCESS
> > > @@ -774,6 +780,41 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
> > >
> > >  endchoice
> > >
> > > +choice
> > > +       prompt "Vector unaligned Accesses Support"
> > > +       depends on RISCV_ISA_V
> > > +       default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > +       help
> > > +         This determines the level of support for vector unaligned accesses. This
> > > +         information is used by the kernel to perform optimizations. It is also
> > > +         exposed to user space via the hwprobe syscall. The hardware will be
> > > +         probed at boot by default.
> > > +
> > > +config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
> > > +       bool "Detect support for vector unaligned accesses"
> > > +       select RISCV_VECTOR_MISALIGNED
> > > +       help
> > > +         During boot, the kernel will detect if the system supports vector
> > > +         unaligned accesses.
> > > +
> > > +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
> > > +       bool "Probe speed of vector unaligned accesses"
> > > +       select RISCV_VECTOR_MISALIGNED
> > > +       help
> > > +         During boot, the kernel will run a series of tests to determine the
> > > +         speed of vector unaligned accesses if they are supported. This probing
> > > +         will dynamically determine the speed of vector unaligned accesses on
> > > +         the underlying system if they are supported.
> > > +
> > > +config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
> > > +       bool "Assume the system does not support vector unaligned memory accesses"
> > > +       help
> > > +         Assume that the system does not support vector unaligned memory accesses.
> > > +         The kernel and userspace programs may run them successfully on systems
> > > +         that do support vector unaligned memory accesses.
> > > +
> > > +endchoice
> > > +
> > >  endmenu # "Platform type"
> > >
> > >  menu "Kernel features"
> > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > > index 347805446151..d0ea5921ab20 100644
> > > --- a/arch/riscv/include/asm/cpufeature.h
> > > +++ b/arch/riscv/include/asm/cpufeature.h
> > > @@ -33,8 +33,8 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
> > >
> > >  void riscv_user_isa_enable(void);
> > >
> > > -#if defined(CONFIG_RISCV_MISALIGNED)
> > >  bool check_unaligned_access_emulated_all_cpus(void);
> > > +#if defined(CONFIG_RISCV_MISALIGNED)
> > >  void unaligned_emulation_finish(void);
> > >  bool unaligned_ctl_available(void);
> > >  DECLARE_PER_CPU(long, misaligned_access_speed);
> > > @@ -45,6 +45,11 @@ static inline bool unaligned_ctl_available(void)
> > >  }
> > >  #endif
> > >
> > > +bool check_vector_unaligned_access_emulated_all_cpus(void);
> > > +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
> > > +DECLARE_PER_CPU(long, vector_misaligned_access);
> > > +#endif
> > > +
> > >  #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> > >  DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
> > >
> > > diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
> > > index 2293e535f865..7b32d2b08bb6 100644
> > > --- a/arch/riscv/include/asm/entry-common.h
> > > +++ b/arch/riscv/include/asm/entry-common.h
> > > @@ -25,18 +25,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
> > >  void handle_page_fault(struct pt_regs *regs);
> > >  void handle_break(struct pt_regs *regs);
> > >
> > > -#ifdef CONFIG_RISCV_MISALIGNED
> > >  int handle_misaligned_load(struct pt_regs *regs);
> > >  int handle_misaligned_store(struct pt_regs *regs);
> > > -#else
> > > -static inline int handle_misaligned_load(struct pt_regs *regs)
> > > -{
> > > -       return -1;
> > > -}
> > > -static inline int handle_misaligned_store(struct pt_regs *regs)
> > > -{
> > > -       return -1;
> > > -}
> > > -#endif
> > >
> > >  #endif /* _ASM_RISCV_ENTRY_COMMON_H */
> > > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> > > index 150a9877b0af..ef01c182af2b 100644
> > > --- a/arch/riscv/include/asm/hwprobe.h
> > > +++ b/arch/riscv/include/asm/hwprobe.h
> > > @@ -8,7 +8,7 @@
> > >
> > >  #include <uapi/asm/hwprobe.h>
> > >
> > > -#define RISCV_HWPROBE_MAX_KEY 7
> > > +#define RISCV_HWPROBE_MAX_KEY 8
> > >
> > >  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
> > >  {
> > > diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
> > > index be7d309cca8a..99b0f91db9ee 100644
> > > --- a/arch/riscv/include/asm/vector.h
> > > +++ b/arch/riscv/include/asm/vector.h
> > > @@ -21,6 +21,7 @@
> > >
> > >  extern unsigned long riscv_v_vsize;
> > >  int riscv_v_setup_vsize(void);
> > > +bool insn_is_vector(u32 insn_buf);
> > >  bool riscv_v_first_use_handler(struct pt_regs *regs);
> > >  void kernel_vector_begin(void);
> > >  void kernel_vector_end(void);
> > > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> > > index 023b7771d1b7..2fee870e41bb 100644
> > > --- a/arch/riscv/include/uapi/asm/hwprobe.h
> > > +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> > > @@ -75,6 +75,11 @@ struct riscv_hwprobe {
> > >  #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
> > >  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
> > >  #define RISCV_HWPROBE_KEY_MISALIGNED_PERF      7
> > > +#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF  8
> > > +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
> > > +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
> > > +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
> > > +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4
> > >  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
> > >
> > >  /* Flags */
> > > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > > index 5b243d46f4b1..62ac19c029f1 100644
> > > --- a/arch/riscv/kernel/Makefile
> > > +++ b/arch/riscv/kernel/Makefile
> > > @@ -62,8 +62,8 @@ obj-y += probes/
> > >  obj-y  += tests/
> > >  obj-$(CONFIG_MMU) += vdso.o vdso/
> > >
> > > -obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
> > > -obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
> > > +obj-y  += traps_misaligned.o
> > > +obj-y  += unaligned_access_speed.o
> > >  obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)     += copy-unaligned.o
> > >
> > >  obj-$(CONFIG_FPU)              += fpu.o
> > > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > > index e910e2971984..c40df314058b 100644
> > > --- a/arch/riscv/kernel/sys_hwprobe.c
> > > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > > @@ -194,6 +194,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> > >  }
> > >  #endif
> > >
> > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > > +{
> > > +       int cpu;
> > > +       u64 perf = -1ULL;
> > > +
> > > +       if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > +
> > > +       /* Return if supported or not even if speed wasn't probed */
> > > +       for_each_cpu(cpu, cpus) {
> > > +               int this_perf = per_cpu(vector_misaligned_access, cpu);
> > > +
> > > +               if (perf == -1ULL)
> > > +                       perf = this_perf;
> > > +
> > > +               if (perf != this_perf) {
> > > +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > +                       break;
> > > +               }
> > > +       }
> > > +
> > > +       if (perf == -1ULL)
> > > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > +
> > > +       return perf;
> > > +}
> > > +#else
> > > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > > +{
> > > +       if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
> > > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > +
> > > +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > +}
> > > +#endif
> > > +
> > >  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> > >                              const struct cpumask *cpus)
> > >  {
> > > @@ -222,6 +259,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> > >                 pair->value = hwprobe_misaligned(cpus);
> > >                 break;
> > >
> > > +       case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
> > > +               pair->value = hwprobe_vec_misaligned(cpus);
> > > +               break;
> > > +
> > >         case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
> > >                 pair->value = 0;
> > >                 if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> > > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> > > index 8fadbe00dd62..6f0264a8c9de 100644
> > > --- a/arch/riscv/kernel/traps_misaligned.c
> > > +++ b/arch/riscv/kernel/traps_misaligned.c
> > > @@ -16,6 +16,7 @@
> > >  #include <asm/entry-common.h>
> > >  #include <asm/hwprobe.h>
> > >  #include <asm/cpufeature.h>
> > > +#include <asm/vector.h>
> > >
> > >  #define INSN_MATCH_LB                  0x3
> > >  #define INSN_MASK_LB                   0x707f
> > > @@ -322,12 +323,37 @@ union reg_data {
> > >         u64 data_u64;
> > >  };
> > >
> > > -static bool unaligned_ctl __read_mostly;
> > > -
> > >  /* sysctl hooks */
> > >  int unaligned_enabled __read_mostly = 1;       /* Enabled by default */
> > >
> > > -int handle_misaligned_load(struct pt_regs *regs)
> > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > > +{
> > > +       unsigned long epc = regs->epc;
> > > +       unsigned long insn;
> > > +
> > > +       if (get_insn(regs, epc, &insn))
> > > +               return -1;
> > > +
> > > +       /* Only return 0 when in check_vector_unaligned_access_emulated */
> > > +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> > > +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> > > +               regs->epc = epc + INSN_LEN(insn);
> > > +               return 0;
> > > +       }
> > > +
> > > +       /* If vector instruction we don't emulate it yet */
> > > +       regs->epc = epc;
> > > +       return -1;
> > > +}
> > > +#else
> > > +static int handle_vector_misaligned_load(struct pt_regs *regs)
> > > +{
> > > +       return -1;
> > > +}
> > > +#endif
> > > +
> > > +static int handle_scalar_misaligned_load(struct pt_regs *regs)
> > >  {
> > >         union reg_data val;
> > >         unsigned long epc = regs->epc;
> > > @@ -435,7 +461,7 @@ int handle_misaligned_load(struct pt_regs *regs)
> > >         return 0;
> > >  }
> > >
> > > -int handle_misaligned_store(struct pt_regs *regs)
> > > +static int handle_scalar_misaligned_store(struct pt_regs *regs)
> > >  {
> > >         union reg_data val;
> > >         unsigned long epc = regs->epc;
> > > @@ -526,6 +552,85 @@ int handle_misaligned_store(struct pt_regs *regs)
> > >         return 0;
> > >  }
> > >
> > > +int handle_misaligned_load(struct pt_regs *regs)
> > > +{
> > > +       unsigned long epc = regs->epc;
> > > +       unsigned long insn;
> > > +
> > > +       if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
> > > +               if (get_insn(regs, epc, &insn))
> > > +                       return -1;
> > > +
> > > +               if (insn_is_vector(insn))
> > > +                       return handle_vector_misaligned_load(regs);
> > > +       }
> > > +
> > > +       if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > > +               return handle_scalar_misaligned_load(regs);
> > > +
> > > +       return -1;
> > > +}
> > > +
> > > +int handle_misaligned_store(struct pt_regs *regs)
> > > +{
> > > +       if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
> > > +               return handle_scalar_misaligned_store(regs);
> > > +
> > > +       return -1;
> > > +}
> > > +
> > > +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
> > > +static void check_vector_unaligned_access_emulated(struct work_struct *unused)
> > > +{
> > > +       long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
> > > +       unsigned long tmp_var;
> > > +
> > > +       *mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > > +
> > > +       local_irq_enable();
> >
> > Generally if a function is called with interrupts disabled there's a
> > reason for it, like the system will crash if an interrupt fires during
> > execution of this region. I haven't researched this, but to feel
> > comfortable I'd want to know why interrupts were disabled on entry
> > here, why it's safe to enable them now, and why it's safe to return
> > from the function with them still enabled.
> >
> > I'm guessing this was added because may_use_simd() was blowing up for
> > you without it. If that's the case, I think we'll need to reconcile
> > that in a different way. From a quick glance at kernel_mode_vector.c,
> > I was originally thinking may_use_simd() enforces this because there's
>
> Current nesting support for kernel-mode vector only allows softirq
> context to nest a V context on top of a kernel thread's V context.
> I did not expect the use of V in interrupt handlers, but I am open
> to discussion if it is needed.
>
> The reason why may_use_simd() checks for irqs_disabled() is that the
> local_bh_enable() later in kernel_vector_end() expects to run with irqs
> on. Although the success path of preemptible V does not require
> !irqs_disabled(), the kernel falls back to non-preemptible V
> (get_cpu_vector_context()) in some cases. The first is the nesting
> softirq case, or a rare case where an exception handler uses V in a
> kernel fault that also uses V. The second is when memory allocation for
> kernel_vstate fails. The fallback uses non-preemptible V, so it is
> tied to the bh_* pair.
>
> We can get rid of the last case by failing the launch of a kernel
> thread when the allocation fails.

Thanks for the context, Andy; that fills in a couple of "wait, why" gaps
I had in my brain while reading that code.

My understanding is Jesse skated around this issue by using
schedule_on_each_cpu(), which gets the function running on each CPU in
a nicer worker context where kernel_vector_begin() will work as-is.
This local_irq_enable() was a leftover from when she was originally
experimenting with on_each_cpu(), which invokes the callback from a
harsher IPI context. So I think we can just remove the
local_irq_enable() and not have to modify the kernel V context code.
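
For reference, the two calling contexts being contrasted, as a sketch
(probe_in_worker()/probe_in_ipi() are illustrative names, not from the
patch):

#include <linux/workqueue.h>
#include <linux/smp.h>

/* Worker context: may sleep, so kernel_vector_begin() works as-is. */
static void probe_in_worker(struct work_struct *unused)
{
	/* run the misaligned vle16.v probe here */
}

/* IPI context: hardirq with interrupts disabled -- the environment the
 * leftover local_irq_enable() was papering over. */
static void probe_in_ipi(void *unused)
{
	/* kernel_vector_begin()/local_bh_* are not safe here */
}

static void run_probe_everywhere(void)
{
	schedule_on_each_cpu(probe_in_worker);	/* waits for all CPUs */
	/* vs. on_each_cpu(probe_in_ipi, NULL, 1); */
}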
-Evan

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b94176e25be1..f12df0ca6c18 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -723,6 +723,12 @@  config RISCV_MISALIGNED
 	help
 	  Embed support for emulating misaligned loads and stores.
 
+config RISCV_VECTOR_MISALIGNED
+	bool
+	depends on RISCV_ISA_V
+	help
+	  Enable detecting support for vector misaligned loads and stores.
+
 choice
 	prompt "Unaligned Accesses Support"
 	default RISCV_PROBE_UNALIGNED_ACCESS
@@ -774,6 +780,41 @@  config RISCV_EFFICIENT_UNALIGNED_ACCESS
 
 endchoice
 
+choice
+	prompt "Vector unaligned Accesses Support"
+	depends on RISCV_ISA_V
+	default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+	help
+	  This determines the level of support for vector unaligned accesses. This
+	  information is used by the kernel to perform optimizations. It is also
+	  exposed to user space via the hwprobe syscall. The hardware will be
+	  probed at boot by default.
+
+config RISCV_DETECT_VECTOR_UNALIGNED_ACCESS
+	bool "Detect support for vector unaligned accesses"
+	select RISCV_VECTOR_MISALIGNED
+	help
+	  During boot, the kernel will detect if the system supports vector
+	  unaligned accesses.
+
+config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+	bool "Probe speed of vector unaligned accesses"
+	select RISCV_VECTOR_MISALIGNED
+	help
+	  During boot, the kernel will run a series of tests to determine the
+	  speed of vector unaligned accesses if they are supported. This probing
+	  will dynamically determine the speed of vector unaligned accesses on
+	  the underlying system if they are supported.
+
+config CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED
+	bool "Assume the system does not support vector unaligned memory accesses"
+	help
+	  Assume that the system does not support vector unaligned memory accesses.
+	  The kernel and userspace programs may run them successfully on systems
+	  that do support vector unaligned memory accesses.
+
+endchoice
+
 endmenu # "Platform type"
 
 menu "Kernel features"
diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index 347805446151..d0ea5921ab20 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -33,8 +33,8 @@  extern struct riscv_isainfo hart_isa[NR_CPUS];
 
 void riscv_user_isa_enable(void);
 
-#if defined(CONFIG_RISCV_MISALIGNED)
 bool check_unaligned_access_emulated_all_cpus(void);
+#if defined(CONFIG_RISCV_MISALIGNED)
 void unaligned_emulation_finish(void);
 bool unaligned_ctl_available(void);
 DECLARE_PER_CPU(long, misaligned_access_speed);
@@ -45,6 +45,11 @@  static inline bool unaligned_ctl_available(void)
 }
 #endif
 
+bool check_vector_unaligned_access_emulated_all_cpus(void);
+#if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
+DECLARE_PER_CPU(long, vector_misaligned_access);
+#endif
+
 #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
 DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
 
diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h
index 2293e535f865..7b32d2b08bb6 100644
--- a/arch/riscv/include/asm/entry-common.h
+++ b/arch/riscv/include/asm/entry-common.h
@@ -25,18 +25,7 @@  static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
 void handle_page_fault(struct pt_regs *regs);
 void handle_break(struct pt_regs *regs);
 
-#ifdef CONFIG_RISCV_MISALIGNED
 int handle_misaligned_load(struct pt_regs *regs);
 int handle_misaligned_store(struct pt_regs *regs);
-#else
-static inline int handle_misaligned_load(struct pt_regs *regs)
-{
-	return -1;
-}
-static inline int handle_misaligned_store(struct pt_regs *regs)
-{
-	return -1;
-}
-#endif
 
 #endif /* _ASM_RISCV_ENTRY_COMMON_H */
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 150a9877b0af..ef01c182af2b 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -8,7 +8,7 @@ 
 
 #include <uapi/asm/hwprobe.h>
 
-#define RISCV_HWPROBE_MAX_KEY 7
+#define RISCV_HWPROBE_MAX_KEY 8
 
 static inline bool riscv_hwprobe_key_is_valid(__s64 key)
 {
diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
index be7d309cca8a..99b0f91db9ee 100644
--- a/arch/riscv/include/asm/vector.h
+++ b/arch/riscv/include/asm/vector.h
@@ -21,6 +21,7 @@ 
 
 extern unsigned long riscv_v_vsize;
 int riscv_v_setup_vsize(void);
+bool insn_is_vector(u32 insn_buf);
 bool riscv_v_first_use_handler(struct pt_regs *regs);
 void kernel_vector_begin(void);
 void kernel_vector_end(void);
diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index 023b7771d1b7..2fee870e41bb 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -75,6 +75,11 @@  struct riscv_hwprobe {
 #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
 #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
 #define RISCV_HWPROBE_KEY_MISALIGNED_PERF	7
+#define RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF	8
+#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
+#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
+#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
+#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
 
 /* Flags */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 5b243d46f4b1..62ac19c029f1 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -62,8 +62,8 @@  obj-y	+= probes/
 obj-y	+= tests/
 obj-$(CONFIG_MMU) += vdso.o vdso/
 
-obj-$(CONFIG_RISCV_MISALIGNED)	+= traps_misaligned.o
-obj-$(CONFIG_RISCV_MISALIGNED)	+= unaligned_access_speed.o
+obj-y	+= traps_misaligned.o
+obj-y	+= unaligned_access_speed.o
 obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)	+= copy-unaligned.o
 
 obj-$(CONFIG_FPU)		+= fpu.o
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index e910e2971984..c40df314058b 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -194,6 +194,43 @@  static u64 hwprobe_misaligned(const struct cpumask *cpus)
 }
 #endif
 
+#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
+static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
+{
+	int cpu;
+	u64 perf = -1ULL;
+
+	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
+		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
+
+	/* Return if supported or not even if speed wasn't probed */
+	for_each_cpu(cpu, cpus) {
+		int this_perf = per_cpu(vector_misaligned_access, cpu);
+
+		if (perf == -1ULL)
+			perf = this_perf;
+
+		if (perf != this_perf) {
+			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+			break;
+		}
+	}
+
+	if (perf == -1ULL)
+		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+
+	return perf;
+}
+#else
+static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
+{
+	if (IS_ENABLED(CONFIG_RISCV_UNALIGNED_ACCESS_UNSUPPORTED))
+		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
+
+	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+}
+#endif
+
 static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 			     const struct cpumask *cpus)
 {
@@ -222,6 +259,10 @@  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 		pair->value = hwprobe_misaligned(cpus);
 		break;
 
+	case RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF:
+		pair->value = hwprobe_vec_misaligned(cpus);
+		break;
+
 	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
 		pair->value = 0;
 		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 8fadbe00dd62..6f0264a8c9de 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -16,6 +16,7 @@ 
 #include <asm/entry-common.h>
 #include <asm/hwprobe.h>
 #include <asm/cpufeature.h>
+#include <asm/vector.h>
 
 #define INSN_MATCH_LB			0x3
 #define INSN_MASK_LB			0x707f
@@ -322,12 +323,37 @@  union reg_data {
 	u64 data_u64;
 };
 
-static bool unaligned_ctl __read_mostly;
-
 /* sysctl hooks */
 int unaligned_enabled __read_mostly = 1;	/* Enabled by default */
 
-int handle_misaligned_load(struct pt_regs *regs)
+#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
+static int handle_vector_misaligned_load(struct pt_regs *regs)
+{
+	unsigned long epc = regs->epc;
+	unsigned long insn;
+
+	if (get_insn(regs, epc, &insn))
+		return -1;
+
+	/* Only return 0 while check_vector_unaligned_access_emulated() is probing */
+	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
+		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
+		regs->epc = epc + INSN_LEN(insn);
+		return 0;
+	}
+
+	/* Vector instructions are not emulated yet */
+	regs->epc = epc;
+	return -1;
+}
+#else
+static int handle_vector_misaligned_load(struct pt_regs *regs)
+{
+	return -1;
+}
+#endif
+
+static int handle_scalar_misaligned_load(struct pt_regs *regs)
 {
 	union reg_data val;
 	unsigned long epc = regs->epc;
@@ -435,7 +461,7 @@  int handle_misaligned_load(struct pt_regs *regs)
 	return 0;
 }
 
-int handle_misaligned_store(struct pt_regs *regs)
+static int handle_scalar_misaligned_store(struct pt_regs *regs)
 {
 	union reg_data val;
 	unsigned long epc = regs->epc;
@@ -526,6 +552,85 @@  int handle_misaligned_store(struct pt_regs *regs)
 	return 0;
 }
 
+int handle_misaligned_load(struct pt_regs *regs)
+{
+	unsigned long epc = regs->epc;
+	unsigned long insn;
+
+	if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) {
+		if (get_insn(regs, epc, &insn))
+			return -1;
+
+		if (insn_is_vector(insn))
+			return handle_vector_misaligned_load(regs);
+	}
+
+	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
+		return handle_scalar_misaligned_load(regs);
+
+	return -1;
+}
+
+int handle_misaligned_store(struct pt_regs *regs)
+{
+	if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
+		return handle_scalar_misaligned_store(regs);
+
+	return -1;
+}
+
+#ifdef CONFIG_RISCV_VECTOR_MISALIGNED
+static void check_vector_unaligned_access_emulated(struct work_struct *unused)
+{
+	long *mas_ptr = this_cpu_ptr(&vector_misaligned_access);
+	unsigned long tmp_var;
+
+	*mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+
+	local_irq_enable();
+	kernel_vector_begin();
+	__asm__ __volatile__ (
+		".balign 4\n\t"
+		".option push\n\t"
+		".option arch, +zve32x\n\t"
+		"       vsetivli zero, 1, e16, m1, ta, ma\n\t"	// Vectors of 16b
+		"       vle16.v v0, (%[ptr])\n\t"		// Load bytes
+		".option pop\n\t"
+		: : [ptr] "r" ((u8 *)&tmp_var + 1) : "v0");
+	kernel_vector_end();
+}
+
+bool check_vector_unaligned_access_emulated_all_cpus(void)
+{
+	int cpu;
+
+	if (!has_vector()) {
+		for_each_online_cpu(cpu)
+			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
+		return false;
+	}
+
+	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
+
+	for_each_online_cpu(cpu)
+		if (per_cpu(vector_misaligned_access, cpu)
+		    != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
+			return false;
+
+	return true;
+}
+#else
+bool check_vector_unaligned_access_emulated_all_cpus(void)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_RISCV_MISALIGNED
+
+static bool unaligned_ctl __read_mostly;
+
 static void check_unaligned_access_emulated(struct work_struct *unused)
 {
 	int cpu = smp_processor_id();
@@ -563,3 +668,9 @@  bool unaligned_ctl_available(void)
 {
 	return unaligned_ctl;
 }
+#else
+bool check_unaligned_access_emulated_all_cpus(void)
+{
+	return false;
+}
+#endif
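
Outside of that probing window the handler above returns -1, so a
misaligned vector access from userspace on hardware without support
still ends in SIGBUS. A rough userspace analogue of the kernel probe
(a sketch assuming V is enabled for the process and a toolchain that
accepts the .option arch directive and vector register clobbers, i.e.
GCC 13+ or clang):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf env;

static void on_sigbus(int sig)
{
	(void)sig;
	siglongjmp(env, 1);
}

int main(void)
{
	unsigned char buf[16] = { 0 };

	signal(SIGBUS, on_sigbus);

	if (sigsetjmp(env, 1)) {
		puts("vector misaligned access: unsupported (SIGBUS)");
		return 0;
	}

	/* One 16-bit element loaded from an odd address, mirroring the
	 * kernel probe */
	asm volatile(
		".option push\n\t"
		".option arch, +zve32x\n\t"
		"vsetivli zero, 1, e16, m1, ta, ma\n\t"
		"vle16.v v0, (%0)\n\t"
		".option pop"
		: : "r"(buf + 1) : "v0", "memory");

	puts("vector misaligned access: completed");
	return 0;
}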
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index 70c1588fc353..c6106bd4a25a 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -19,7 +19,8 @@ 
 #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
 #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
 
-DEFINE_PER_CPU(long, misaligned_access_speed);
+DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_UNKNOWN;
+DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
 
 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
 static cpumask_t fast_misaligned_access;
@@ -268,12 +269,18 @@  static int check_unaligned_access_all_cpus(void)
 
 	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
 		for_each_online_cpu(cpu) {
-			per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
+			if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED))
+				per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
+			if (IS_ENABLED(CONFIG_RISCV_MISALIGNED))
+				per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
 		}
 		return 0;
 	}
 
 	all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
+	check_vector_unaligned_access_emulated_all_cpus();
 
 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
 	if (!all_cpus_emulated)
diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
index 682b3feee451..821818886fab 100644
--- a/arch/riscv/kernel/vector.c
+++ b/arch/riscv/kernel/vector.c
@@ -66,7 +66,7 @@  void __init riscv_v_setup_ctx_cache(void)
 #endif
 }
 
-static bool insn_is_vector(u32 insn_buf)
+bool insn_is_vector(u32 insn_buf)
 {
 	u32 opcode = insn_buf & __INSN_OPCODE_MASK;
 	u32 width, csr;