Message ID | 20240604162457.3757417-1-jesse@rivosinc.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | [RFC,v0] RISCV: Report vector unaligned accesses hwprobe | expand |
On 6/4/24 12:24, Jesse Taube wrote: > Detected if a system traps into the kernel on an vector unaligned access. > Add the result to a new key in hwprobe. > > Signed-off-by: Jesse Taube <jesse@rivosinc.com> > --- > arch/riscv/include/asm/cpufeature.h | 3 ++ > arch/riscv/include/asm/hwprobe.h | 2 +- > arch/riscv/include/uapi/asm/hwprobe.h | 6 +++ > arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++ > arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++ > arch/riscv/kernel/unaligned_access_speed.c | 4 ++ > 6 files changed, 108 insertions(+), 1 deletion(-) > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h > index 347805446151..5ad69cf25b25 100644 > --- a/arch/riscv/include/asm/cpufeature.h > +++ b/arch/riscv/include/asm/cpufeature.h > @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void); > > #if defined(CONFIG_RISCV_MISALIGNED) > bool check_unaligned_access_emulated_all_cpus(void); > +bool check_vector_unaligned_access_all_cpus(void); > + > void unaligned_emulation_finish(void); > bool unaligned_ctl_available(void); > DECLARE_PER_CPU(long, misaligned_access_speed); > +DECLARE_PER_CPU(long, vector_misaligned_access); > #else > static inline bool unaligned_ctl_available(void) > { > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h > index 630507dff5ea..150a9877b0af 100644 > --- a/arch/riscv/include/asm/hwprobe.h > +++ b/arch/riscv/include/asm/hwprobe.h > @@ -8,7 +8,7 @@ > > #include <uapi/asm/hwprobe.h> > > -#define RISCV_HWPROBE_MAX_KEY 6 > +#define RISCV_HWPROBE_MAX_KEY 7 > > static inline bool riscv_hwprobe_key_is_valid(__s64 key) > { > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h > index 060212331a03..4474e98d17bd 100644 > --- a/arch/riscv/include/uapi/asm/hwprobe.h > +++ b/arch/riscv/include/uapi/asm/hwprobe.h > @@ -68,6 +68,12 @@ struct riscv_hwprobe { > #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) > #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) > #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 > +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7 There were talks about combining vecotor and scalar speed for the user facing API into RISCV_HWPROBE_KEY_CPUPERF_0, but adding another key seems easier. > +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0 > +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1 > +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2 > +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3 > +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4 > /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ > > /* Flags */ > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c > index b286b73e763e..ce641cc6e47a 100644 > --- a/arch/riscv/kernel/sys_hwprobe.c > +++ b/arch/riscv/kernel/sys_hwprobe.c > @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) > } > #endif > > +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) > +{ > + int cpu; > + u64 perf = -1ULL; > + > + for_each_cpu(cpu, cpus) { > + int this_perf = per_cpu(vector_misaligned_access, cpu); > + > + if (perf == -1ULL) > + perf = this_perf; > + > + if (perf != this_perf) { > + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > + break; > + } > + } > + > + if (perf == -1ULL) > + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > + > + return perf; > +} > +#else > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) > +{ > + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > +} > +#endif > + > static void hwprobe_one_pair(struct riscv_hwprobe *pair, > const struct cpumask *cpus) > { > @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, > pair->value = hwprobe_misaligned(cpus); > break; > > + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF: > + pair->value = hwprobe_vec_misaligned(cpus); > + break; > + > case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: > pair->value = 0; > if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c > index 2adb7c3e4dd5..0c07e990e9c5 100644 > --- a/arch/riscv/kernel/traps_misaligned.c > +++ b/arch/riscv/kernel/traps_misaligned.c > @@ -16,6 +16,7 @@ > #include <asm/entry-common.h> > #include <asm/hwprobe.h> > #include <asm/cpufeature.h> > +#include <asm/vector.h> > > #define INSN_MATCH_LB 0x3 > #define INSN_MASK_LB 0x707f > @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs) > if (get_insn(regs, epc, &insn)) > return -1; > Is this an appropriate way to check if there is vector missaligned access? What if a unaligned vector load as called by the kernel before this check? > +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) { > + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED; > + regs->epc = epc + INSN_LEN(insn); > + return 0; > + } > +#endif > + > regs->epc = 0; > > if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { > @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu) > return misaligned_emu_detected; > } > > +#ifdef CONFIG_RISCV_ISA_V > +static bool check_vector_unaligned_access(int cpu) > +{ > + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu); > + struct riscv_isainfo *isainfo = &hart_isa[cpu]; > + unsigned long tmp_var; > + bool misaligned_vec_suported; > + > + if (!riscv_isa_extension_available(isainfo->isa, v)) > + return false; > + > + /* This case will only happen if a unaligned vector load > + * was called by the kernel before this check > + */ > + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) > + return false; > + > + kernel_vector_begin(); > + __asm__ __volatile__ ( > + ".option push\n\t" > + ".option arch, +v\n\t" > + " li t1, 0x1\n" //size > + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of 16b > + " addi t0, %[ptr], 1\n\t" // Misalign address > + " vle16.v v0, (t0)\n\t" // Load bytes > + ".option pop\n\t" > + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory"); > + kernel_vector_end(); > + > + misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN); > + > + return misaligned_vec_suported; > +} > +#else > +static bool check_vector_unaligned_access(int cpu) > +{ > + return false; > +} > +#endif > + > +bool check_vector_unaligned_access_all_cpus(void) > +{ > + int cpu; > + > + for_each_online_cpu(cpu) > + if (!check_vector_unaligned_access(cpu)) > + return false; > + > + return true; > +} > + > bool check_unaligned_access_emulated_all_cpus(void) > { > int cpu; > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c > index a9a6bcb02acf..92a84239beaa 100644 > --- a/arch/riscv/kernel/unaligned_access_speed.c > +++ b/arch/riscv/kernel/unaligned_access_speed.c > @@ -20,6 +20,7 @@ > #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) > > DEFINE_PER_CPU(long, misaligned_access_speed); > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > static cpumask_t fast_misaligned_access; > @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) > { > bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); > There was talks about Zicclsm, but spike doesnt have support for Zicclsm afaik, but I was wondering if i should add Zicclsm to cpufeature and aswell. If anyone wants to run this i tested with spike --misaligned --isa=RV64IMAFDCV_zicntr_zihpm --kernel=arch/riscv/boot/Image opensbi/build/platform/generic/firmware/fw_jump.elf Thanks, Jesse Taube > + check_vector_unaligned_access_all_cpus(); > + > if (!all_cpus_emulated) > return check_unaligned_access_speed_all_cpus(); > > @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void) > static int check_unaligned_access_all_cpus(void) > { > check_unaligned_access_emulated_all_cpus(); > + check_vector_unaligned_access_all_cpus(); > > return 0; > }
On 04/06/2024 18:42, Jesse Taube wrote: > > > On 6/4/24 12:24, Jesse Taube wrote: >> Detected if a system traps into the kernel on an vector unaligned access. >> Add the result to a new key in hwprobe. >> >> Signed-off-by: Jesse Taube <jesse@rivosinc.com> >> --- >> arch/riscv/include/asm/cpufeature.h | 3 ++ >> arch/riscv/include/asm/hwprobe.h | 2 +- >> arch/riscv/include/uapi/asm/hwprobe.h | 6 +++ >> arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++ >> arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++ >> arch/riscv/kernel/unaligned_access_speed.c | 4 ++ >> 6 files changed, 108 insertions(+), 1 deletion(-) >> >> diff --git a/arch/riscv/include/asm/cpufeature.h >> b/arch/riscv/include/asm/cpufeature.h >> index 347805446151..5ad69cf25b25 100644 >> --- a/arch/riscv/include/asm/cpufeature.h >> +++ b/arch/riscv/include/asm/cpufeature.h >> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void); >> #if defined(CONFIG_RISCV_MISALIGNED) >> bool check_unaligned_access_emulated_all_cpus(void); >> +bool check_vector_unaligned_access_all_cpus(void); >> + >> void unaligned_emulation_finish(void); >> bool unaligned_ctl_available(void); >> DECLARE_PER_CPU(long, misaligned_access_speed); >> +DECLARE_PER_CPU(long, vector_misaligned_access); >> #else >> static inline bool unaligned_ctl_available(void) >> { >> diff --git a/arch/riscv/include/asm/hwprobe.h >> b/arch/riscv/include/asm/hwprobe.h >> index 630507dff5ea..150a9877b0af 100644 >> --- a/arch/riscv/include/asm/hwprobe.h >> +++ b/arch/riscv/include/asm/hwprobe.h >> @@ -8,7 +8,7 @@ >> #include <uapi/asm/hwprobe.h> >> -#define RISCV_HWPROBE_MAX_KEY 6 >> +#define RISCV_HWPROBE_MAX_KEY 7 >> static inline bool riscv_hwprobe_key_is_valid(__s64 key) >> { >> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h >> b/arch/riscv/include/uapi/asm/hwprobe.h >> index 060212331a03..4474e98d17bd 100644 >> --- a/arch/riscv/include/uapi/asm/hwprobe.h >> +++ b/arch/riscv/include/uapi/asm/hwprobe.h >> @@ -68,6 +68,12 @@ struct riscv_hwprobe { >> #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) >> #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) >> #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 >> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7 > > There were talks about combining vecotor and scalar speed for the user > facing API into RISCV_HWPROBE_KEY_CPUPERF_0, but adding another key > seems easier. > >> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4 >> /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ >> /* Flags */ >> diff --git a/arch/riscv/kernel/sys_hwprobe.c >> b/arch/riscv/kernel/sys_hwprobe.c >> index b286b73e763e..ce641cc6e47a 100644 >> --- a/arch/riscv/kernel/sys_hwprobe.c >> +++ b/arch/riscv/kernel/sys_hwprobe.c >> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct >> cpumask *cpus) >> } >> #endif >> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) >> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) >> +{ >> + int cpu; >> + u64 perf = -1ULL; >> + >> + for_each_cpu(cpu, cpus) { >> + int this_perf = per_cpu(vector_misaligned_access, cpu); >> + >> + if (perf == -1ULL) >> + perf = this_perf; >> + >> + if (perf != this_perf) { >> + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> + break; >> + } >> + } >> + >> + if (perf == -1ULL) >> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> + >> + return perf; >> +} >> +#else >> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) >> +{ >> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> +} >> +#endif >> + >> static void hwprobe_one_pair(struct riscv_hwprobe *pair, >> const struct cpumask *cpus) >> { >> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe >> *pair, >> pair->value = hwprobe_misaligned(cpus); >> break; >> + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF: >> + pair->value = hwprobe_vec_misaligned(cpus); >> + break; >> + >> case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: >> pair->value = 0; >> if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) >> diff --git a/arch/riscv/kernel/traps_misaligned.c >> b/arch/riscv/kernel/traps_misaligned.c >> index 2adb7c3e4dd5..0c07e990e9c5 100644 >> --- a/arch/riscv/kernel/traps_misaligned.c >> +++ b/arch/riscv/kernel/traps_misaligned.c >> @@ -16,6 +16,7 @@ >> #include <asm/entry-common.h> >> #include <asm/hwprobe.h> >> #include <asm/cpufeature.h> >> +#include <asm/vector.h> >> #define INSN_MATCH_LB 0x3 >> #define INSN_MASK_LB 0x707f >> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs) >> if (get_insn(regs, epc, &insn)) >> return -1; >> > > Is this an appropriate way to check if there is vector missaligned > access? What if a unaligned vector load as called by the kernel before > this check? > >> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS >> + if (*this_cpu_ptr(&vector_misaligned_access) == >> RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) { >> + *this_cpu_ptr(&vector_misaligned_access) = >> RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED; >> + regs->epc = epc + INSN_LEN(insn); >> + return 0; >> + } >> +#endif Since this code will be called for standard load/store misaligned accesses your, I guess this needs to check if the faulty instruction itself is a vector related one before setting it to supported. I don't know what the specs says about that but I guess it shoukld be checked if vector load/store and standard load/store could have different behaviors with respect to misaligned accesses. Regardless of that (except if I missed something), the emulation code does not actually support vector load/store instructions. So, support for misaligned vector load/store should be added in a first place before reporting that it is actually "supported". Thanks, Clément >> + >> regs->epc = 0; >> if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { >> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu) >> return misaligned_emu_detected; >> } >> +#ifdef CONFIG_RISCV_ISA_V >> +static bool check_vector_unaligned_access(int cpu) >> +{ >> + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu); >> + struct riscv_isainfo *isainfo = &hart_isa[cpu]; >> + unsigned long tmp_var; >> + bool misaligned_vec_suported; >> + >> + if (!riscv_isa_extension_available(isainfo->isa, v)) >> + return false; >> + >> + /* This case will only happen if a unaligned vector load >> + * was called by the kernel before this check >> + */ >> + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) >> + return false; >> + >> + kernel_vector_begin(); >> + __asm__ __volatile__ ( >> + ".option push\n\t" >> + ".option arch, +v\n\t" >> + " li t1, 0x1\n" //size >> + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of >> 16b >> + " addi t0, %[ptr], 1\n\t" // Misalign address >> + " vle16.v v0, (t0)\n\t" // Load bytes >> + ".option pop\n\t" >> + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory"); >> + kernel_vector_end(); >> + >> + misaligned_vec_suported = (*mas_ptr == >> RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN); >> + >> + return misaligned_vec_suported; >> +} >> +#else >> +static bool check_vector_unaligned_access(int cpu) >> +{ >> + return false; >> +} >> +#endif >> + >> +bool check_vector_unaligned_access_all_cpus(void) >> +{ >> + int cpu; >> + >> + for_each_online_cpu(cpu) >> + if (!check_vector_unaligned_access(cpu)) >> + return false; >> + >> + return true; >> +} >> + >> bool check_unaligned_access_emulated_all_cpus(void) >> { >> int cpu; >> diff --git a/arch/riscv/kernel/unaligned_access_speed.c >> b/arch/riscv/kernel/unaligned_access_speed.c >> index a9a6bcb02acf..92a84239beaa 100644 >> --- a/arch/riscv/kernel/unaligned_access_speed.c >> +++ b/arch/riscv/kernel/unaligned_access_speed.c >> @@ -20,6 +20,7 @@ >> #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) >> DEFINE_PER_CPU(long, misaligned_access_speed); >> +DEFINE_PER_CPU(long, vector_misaligned_access) = >> RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS >> static cpumask_t fast_misaligned_access; >> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) >> { >> bool all_cpus_emulated = >> check_unaligned_access_emulated_all_cpus(); >> > > There was talks about Zicclsm, but spike doesnt have support for Zicclsm > afaik, but I was wondering if i should add Zicclsm to cpufeature and > aswell. > > If anyone wants to run this i tested with > spike --misaligned --isa=RV64IMAFDCV_zicntr_zihpm > --kernel=arch/riscv/boot/Image > opensbi/build/platform/generic/firmware/fw_jump.elf AFAIK, --misaligned tells spike to actually handle misaligned load/store in "hardware" and not generate a misaligned trap so I guess you would need to remove that flag and use a specific openSBI version that always delegate the misaligned load/store traps if you want to be sure that the kernel actually handles the vector misaligned load/store traps. I previously made a small test utility to verify that the kernel correctly gets the misaligned traps [1]. If it fails then, your kernel probably do not get any trap :) Link: https://github.com/clementleger/unaligned_test [1] > > > Thanks, > Jesse Taube > > >> + check_vector_unaligned_access_all_cpus(); >> + >> if (!all_cpus_emulated) >> return check_unaligned_access_speed_all_cpus(); >> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void) >> static int check_unaligned_access_all_cpus(void) >> { >> check_unaligned_access_emulated_all_cpus(); >> + check_vector_unaligned_access_all_cpus(); >> return 0; >> }
On Tue, Jun 04, 2024 at 12:42:10PM -0400, Jesse Taube wrote: > On 6/4/24 12:24, Jesse Taube wrote: > > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c > > index a9a6bcb02acf..92a84239beaa 100644 > > --- a/arch/riscv/kernel/unaligned_access_speed.c > > +++ b/arch/riscv/kernel/unaligned_access_speed.c > > @@ -20,6 +20,7 @@ > > #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) > > DEFINE_PER_CPU(long, misaligned_access_speed); > > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > > static cpumask_t fast_misaligned_access; > > @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) > > { > > bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); > > There was talks about Zicclsm, but spike doesnt have support for Zicclsm > afaik, Support for Zicclsm just means that it can perform misaligned loads and stores to cache coherent memory. I guess support in Spike would involve setting that in its devicetree iff/when that's the case. > but I was wondering if i should add Zicclsm to cpufeature and aswell. Ye, please do add detection for Zicclsm. I think that should be fairly straightforward to do, nothing too special to document. Cheers, Conor.
On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote: > > Detected if a system traps into the kernel on an vector unaligned access. > Add the result to a new key in hwprobe. > > Signed-off-by: Jesse Taube <jesse@rivosinc.com> For the scalar world, we wanted to know whether misaligned accesses were faster or slower than equivalent byte accesses, so usermode could know for something like memcpy which option had better bandwidth. Is the motivation here the same, where we're going to use vector registers for memcpy and we want to know which size load to use? Or will usermode be consuming this info for a different purpose as well? I know this is a basic question, but having the motivation helps me get the right lens for reviewing it. Perhaps that should be added to the commit message as well. > --- > arch/riscv/include/asm/cpufeature.h | 3 ++ > arch/riscv/include/asm/hwprobe.h | 2 +- > arch/riscv/include/uapi/asm/hwprobe.h | 6 +++ > arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++ > arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++ > arch/riscv/kernel/unaligned_access_speed.c | 4 ++ > 6 files changed, 108 insertions(+), 1 deletion(-) > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h > index 347805446151..5ad69cf25b25 100644 > --- a/arch/riscv/include/asm/cpufeature.h > +++ b/arch/riscv/include/asm/cpufeature.h > @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void); > > #if defined(CONFIG_RISCV_MISALIGNED) > bool check_unaligned_access_emulated_all_cpus(void); > +bool check_vector_unaligned_access_all_cpus(void); > + > void unaligned_emulation_finish(void); > bool unaligned_ctl_available(void); > DECLARE_PER_CPU(long, misaligned_access_speed); > +DECLARE_PER_CPU(long, vector_misaligned_access); > #else > static inline bool unaligned_ctl_available(void) > { > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h > index 630507dff5ea..150a9877b0af 100644 > --- a/arch/riscv/include/asm/hwprobe.h > +++ b/arch/riscv/include/asm/hwprobe.h > @@ -8,7 +8,7 @@ > > #include <uapi/asm/hwprobe.h> > > -#define RISCV_HWPROBE_MAX_KEY 6 > +#define RISCV_HWPROBE_MAX_KEY 7 > > static inline bool riscv_hwprobe_key_is_valid(__s64 key) > { > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h > index 060212331a03..4474e98d17bd 100644 > --- a/arch/riscv/include/uapi/asm/hwprobe.h > +++ b/arch/riscv/include/uapi/asm/hwprobe.h > @@ -68,6 +68,12 @@ struct riscv_hwprobe { > #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) > #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) > #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 > +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7 > +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0 > +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1 > +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2 > +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3 > +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4 This needs to be added to the documentation as well. What value should be returned when V is not enabled in the kernel, or V is not supported in the hardware? Currently in the code it would be UNKNOWN, right? Is that what we want, or is it worth differentiating "no support for V" from "I don't know the speed of misaligned loads"? Maybe UNKNOWN is the right value, as there are other values to tell you V is not enabled. > /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ > > /* Flags */ > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c > index b286b73e763e..ce641cc6e47a 100644 > --- a/arch/riscv/kernel/sys_hwprobe.c > +++ b/arch/riscv/kernel/sys_hwprobe.c > @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) > } > #endif > > +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) > +{ > + int cpu; > + u64 perf = -1ULL; > + > + for_each_cpu(cpu, cpus) { > + int this_perf = per_cpu(vector_misaligned_access, cpu); > + > + if (perf == -1ULL) > + perf = this_perf; > + > + if (perf != this_perf) { > + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > + break; > + } > + } > + > + if (perf == -1ULL) > + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > + > + return perf; > +} > +#else > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) > +{ > + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > +} > +#endif > + > static void hwprobe_one_pair(struct riscv_hwprobe *pair, > const struct cpumask *cpus) > { > @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, > pair->value = hwprobe_misaligned(cpus); > break; > > + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF: > + pair->value = hwprobe_vec_misaligned(cpus); > + break; > + > case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: > pair->value = 0; > if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c > index 2adb7c3e4dd5..0c07e990e9c5 100644 > --- a/arch/riscv/kernel/traps_misaligned.c > +++ b/arch/riscv/kernel/traps_misaligned.c > @@ -16,6 +16,7 @@ > #include <asm/entry-common.h> > #include <asm/hwprobe.h> > #include <asm/cpufeature.h> > +#include <asm/vector.h> > > #define INSN_MATCH_LB 0x3 > #define INSN_MASK_LB 0x707f > @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs) > if (get_insn(regs, epc, &insn)) > return -1; > > +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) { > + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED; Shouldn't this be EMULATED, given we were just delegated this trap? I guess it depends on whether you're going to add support for actually handling the misaligned vector trap, as Clément mentioned. Scalar misaligned loads had a history to lean on since the specs were always explicit that misaligned loads/store had to be supported one way or another. So UNSUPPORTED was a future theoretical value. I haven't dug through the specs yet, do you know what the story is for V and misaligned loads? My sub-question is what the rules are for detecting the difference between EMULATED and UNSUPPORTED. > + regs->epc = epc + INSN_LEN(insn); > + return 0; > + } > +#endif > + > regs->epc = 0; > > if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { > @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu) > return misaligned_emu_detected; > } > > +#ifdef CONFIG_RISCV_ISA_V > +static bool check_vector_unaligned_access(int cpu) > +{ > + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu); > + struct riscv_isainfo *isainfo = &hart_isa[cpu]; > + unsigned long tmp_var; > + bool misaligned_vec_suported; > + > + if (!riscv_isa_extension_available(isainfo->isa, v)) > + return false; > + > + /* This case will only happen if a unaligned vector load > + * was called by the kernel before this check > + */ > + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) > + return false; > + > + kernel_vector_begin(); > + __asm__ __volatile__ ( > + ".option push\n\t" > + ".option arch, +v\n\t" > + " li t1, 0x1\n" //size > + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of 16b > + " addi t0, %[ptr], 1\n\t" // Misalign address > + " vle16.v v0, (t0)\n\t" // Load bytes > + ".option pop\n\t" > + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory"); > + kernel_vector_end(); > + > + misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN); > + > + return misaligned_vec_suported; > +} > +#else > +static bool check_vector_unaligned_access(int cpu) > +{ > + return false; > +} > +#endif > + > +bool check_vector_unaligned_access_all_cpus(void) > +{ > + int cpu; > + > + for_each_online_cpu(cpu) > + if (!check_vector_unaligned_access(cpu)) > + return false; > + > + return true; > +} These functions return a bool, but the bool is never checked. I'm guessing that's because you're going to check it in a future patch where you decide whether or not to probe for fast/slow? > + > bool check_unaligned_access_emulated_all_cpus(void) > { > int cpu; > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c > index a9a6bcb02acf..92a84239beaa 100644 > --- a/arch/riscv/kernel/unaligned_access_speed.c > +++ b/arch/riscv/kernel/unaligned_access_speed.c > @@ -20,6 +20,7 @@ > #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) > > DEFINE_PER_CPU(long, misaligned_access_speed); > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > static cpumask_t fast_misaligned_access; > @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) > { > bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); > > + check_vector_unaligned_access_all_cpus(); > + > if (!all_cpus_emulated) > return check_unaligned_access_speed_all_cpus(); > > @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void) > static int check_unaligned_access_all_cpus(void) > { > check_unaligned_access_emulated_all_cpus(); > + check_vector_unaligned_access_all_cpus(); > > return 0; > } > -- > 2.43.0 >
On 6/5/24 11:54, Evan Green wrote: > On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote: >> >> Detected if a system traps into the kernel on an vector unaligned access. >> Add the result to a new key in hwprobe. >> >> Signed-off-by: Jesse Taube <jesse@rivosinc.com> > > For the scalar world, we wanted to know whether misaligned accesses > were faster or slower than equivalent byte accesses, so usermode could > know for something like memcpy which option had better bandwidth. Yes that will be in another patch and will set it to either fast or slow. This patch will just detect if we can run it. > Is > the motivation here the same, where we're going to use vector > registers for memcpy and we want to know which size load to use? Or > will usermode be consuming this info for a different purpose as well? > I know this is a basic question, but having the motivation helps me > get the right lens for reviewing it. Perhaps that should be added to > the commit message as well. > >> --- >> arch/riscv/include/asm/cpufeature.h | 3 ++ >> arch/riscv/include/asm/hwprobe.h | 2 +- >> arch/riscv/include/uapi/asm/hwprobe.h | 6 +++ >> arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++ >> arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++ >> arch/riscv/kernel/unaligned_access_speed.c | 4 ++ >> 6 files changed, 108 insertions(+), 1 deletion(-) >> >> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h >> index 347805446151..5ad69cf25b25 100644 >> --- a/arch/riscv/include/asm/cpufeature.h >> +++ b/arch/riscv/include/asm/cpufeature.h >> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void); >> >> #if defined(CONFIG_RISCV_MISALIGNED) >> bool check_unaligned_access_emulated_all_cpus(void); >> +bool check_vector_unaligned_access_all_cpus(void); >> + >> void unaligned_emulation_finish(void); >> bool unaligned_ctl_available(void); >> DECLARE_PER_CPU(long, misaligned_access_speed); >> +DECLARE_PER_CPU(long, vector_misaligned_access); >> #else >> static inline bool unaligned_ctl_available(void) >> { >> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h >> index 630507dff5ea..150a9877b0af 100644 >> --- a/arch/riscv/include/asm/hwprobe.h >> +++ b/arch/riscv/include/asm/hwprobe.h >> @@ -8,7 +8,7 @@ >> >> #include <uapi/asm/hwprobe.h> >> >> -#define RISCV_HWPROBE_MAX_KEY 6 >> +#define RISCV_HWPROBE_MAX_KEY 7 >> >> static inline bool riscv_hwprobe_key_is_valid(__s64 key) >> { >> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h >> index 060212331a03..4474e98d17bd 100644 >> --- a/arch/riscv/include/uapi/asm/hwprobe.h >> +++ b/arch/riscv/include/uapi/asm/hwprobe.h >> @@ -68,6 +68,12 @@ struct riscv_hwprobe { >> #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) >> #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) >> #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 >> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3 >> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4 > > This needs to be added to the documentation as well. > > What value should be returned when V is not enabled in the kernel, or > V is not supported in the hardware? Currently in the code it would be > UNKNOWN, right? Yes. It would be trivial to set it to unsupported if V is not enabled. > Is that what we want, or is it worth differentiating > "no support for V" from "I don't know the speed of misaligned loads"? > Maybe UNKNOWN is the right value, as there are other values to tell > you V is not enabled. > > >> /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ >> >> /* Flags */ >> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c >> index b286b73e763e..ce641cc6e47a 100644 >> --- a/arch/riscv/kernel/sys_hwprobe.c >> +++ b/arch/riscv/kernel/sys_hwprobe.c >> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) >> } >> #endif >> >> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) >> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) >> +{ >> + int cpu; >> + u64 perf = -1ULL; >> + >> + for_each_cpu(cpu, cpus) { >> + int this_perf = per_cpu(vector_misaligned_access, cpu); >> + >> + if (perf == -1ULL) >> + perf = this_perf; >> + >> + if (perf != this_perf) { >> + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> + break; >> + } >> + } >> + >> + if (perf == -1ULL) >> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> + >> + return perf; >> +} >> +#else >> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) >> +{ >> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> +} >> +#endif >> + >> static void hwprobe_one_pair(struct riscv_hwprobe *pair, >> const struct cpumask *cpus) >> { >> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, >> pair->value = hwprobe_misaligned(cpus); >> break; >> >> + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF: >> + pair->value = hwprobe_vec_misaligned(cpus); >> + break; >> + >> case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: >> pair->value = 0; >> if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) >> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c >> index 2adb7c3e4dd5..0c07e990e9c5 100644 >> --- a/arch/riscv/kernel/traps_misaligned.c >> +++ b/arch/riscv/kernel/traps_misaligned.c >> @@ -16,6 +16,7 @@ >> #include <asm/entry-common.h> >> #include <asm/hwprobe.h> >> #include <asm/cpufeature.h> >> +#include <asm/vector.h> >> >> #define INSN_MATCH_LB 0x3 >> #define INSN_MASK_LB 0x707f >> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs) >> if (get_insn(regs, epc, &insn)) >> return -1; >> >> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS >> + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) { >> + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED; > > Shouldn't this be EMULATED, given we were just delegated this trap? I > guess it depends on whether you're going to add support for actually > handling the misaligned vector trap, as Clément mentioned. Yeah I accedentaly forgot to check if the trap was caused by a vector instruction. I was not planing on emulating them, this is just a one-time check. > > Scalar misaligned loads had a history to lean on since the specs were > always explicit that misaligned loads/store had to be supported one > way or another. So UNSUPPORTED was a future theoretical value. I > haven't dug through the specs yet, do you know what the story is for V > and misaligned loads? " 8. Vector Memory Alignment Constraints If an element accessed by a vector memory instruction is not naturally aligned to the size of the element, either the element is transferred successfully or an address misaligned exception is raised on that element. Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses. " It is if the load/store is naturally aligned to the size of the element. My sub-question is what the rules are for > detecting the difference between EMULATED and UNSUPPORTED. For vector EMULATED isnt used thought it can be added if I add emulation. Idealy that would be done in the SBI. > >> + regs->epc = epc + INSN_LEN(insn); >> + return 0; >> + } >> +#endif >> + >> regs->epc = 0; >> >> if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { >> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu) >> return misaligned_emu_detected; >> } >> >> +#ifdef CONFIG_RISCV_ISA_V >> +static bool check_vector_unaligned_access(int cpu) >> +{ >> + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu); >> + struct riscv_isainfo *isainfo = &hart_isa[cpu]; >> + unsigned long tmp_var; >> + bool misaligned_vec_suported; >> + >> + if (!riscv_isa_extension_available(isainfo->isa, v)) >> + return false; >> + >> + /* This case will only happen if a unaligned vector load >> + * was called by the kernel before this check >> + */ >> + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) >> + return false; >> + >> + kernel_vector_begin(); >> + __asm__ __volatile__ ( >> + ".option push\n\t" >> + ".option arch, +v\n\t" >> + " li t1, 0x1\n" //size >> + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of 16b >> + " addi t0, %[ptr], 1\n\t" // Misalign address >> + " vle16.v v0, (t0)\n\t" // Load bytes >> + ".option pop\n\t" >> + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory"); >> + kernel_vector_end(); >> + >> + misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN); >> + >> + return misaligned_vec_suported; >> +} >> +#else >> +static bool check_vector_unaligned_access(int cpu) >> +{ >> + return false; >> +} >> +#endif >> + >> +bool check_vector_unaligned_access_all_cpus(void) >> +{ >> + int cpu; >> + >> + for_each_online_cpu(cpu) >> + if (!check_vector_unaligned_access(cpu)) >> + return false; >> + >> + return true; >> +} > > These functions return a bool, but the bool is never checked. I'm > guessing that's because you're going to check it in a future patch > where you decide whether or not to probe for fast/slow? Yes. Relating to your first question. The values should have the same meaning as scalar. So UNKNOWN whould mean it's supported, we dont know the speed. > > >> + >> bool check_unaligned_access_emulated_all_cpus(void) >> { >> int cpu; >> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c >> index a9a6bcb02acf..92a84239beaa 100644 >> --- a/arch/riscv/kernel/unaligned_access_speed.c >> +++ b/arch/riscv/kernel/unaligned_access_speed.c >> @@ -20,6 +20,7 @@ >> #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) >> >> DEFINE_PER_CPU(long, misaligned_access_speed); >> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; >> >> #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS >> static cpumask_t fast_misaligned_access; >> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) >> { >> bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); >> >> + check_vector_unaligned_access_all_cpus(); >> + >> if (!all_cpus_emulated) >> return check_unaligned_access_speed_all_cpus(); >> >> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void) >> static int check_unaligned_access_all_cpus(void) >> { >> check_unaligned_access_emulated_all_cpus(); >> + check_vector_unaligned_access_all_cpus(); >> >> return 0; >> } >> -- >> 2.43.0 >>
Yo, Beat me to some things I wanted to saw, so I'll leave my comments on top of yours. On Wed, Jun 05, 2024 at 08:54:21AM -0700, Evan Green wrote: > On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote: > > > > Detected if a system traps into the kernel on an vector unaligned access. > > Add the result to a new key in hwprobe. > > > > Signed-off-by: Jesse Taube <jesse@rivosinc.com> > > For the scalar world, we wanted to know whether misaligned accesses > were faster or slower than equivalent byte accesses, so usermode could > know for something like memcpy which option had better bandwidth. Is > the motivation here the same, where we're going to use vector > registers for memcpy and we want to know which size load to use? Or > will usermode be consuming this info for a different purpose as well? > I know this is a basic question, but having the motivation helps me > get the right lens for reviewing it. Perhaps that should be added to > the commit message as well. Motivation should always be in the commit message ;) > > arch/riscv/include/asm/cpufeature.h | 3 ++ > > arch/riscv/include/asm/hwprobe.h | 2 +- > > arch/riscv/include/uapi/asm/hwprobe.h | 6 +++ > > arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++ > > arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++ > > arch/riscv/kernel/unaligned_access_speed.c | 4 ++ > > 6 files changed, 108 insertions(+), 1 deletion(-) > > > > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h > > index 347805446151..5ad69cf25b25 100644 > > --- a/arch/riscv/include/asm/cpufeature.h > > +++ b/arch/riscv/include/asm/cpufeature.h > > @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void); > > > > #if defined(CONFIG_RISCV_MISALIGNED) > > bool check_unaligned_access_emulated_all_cpus(void); > > +bool check_vector_unaligned_access_all_cpus(void); > > + > > void unaligned_emulation_finish(void); > > bool unaligned_ctl_available(void); > > DECLARE_PER_CPU(long, misaligned_access_speed); > > +DECLARE_PER_CPU(long, vector_misaligned_access); > > #else > > static inline bool unaligned_ctl_available(void) > > { > > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h > > index 630507dff5ea..150a9877b0af 100644 > > --- a/arch/riscv/include/asm/hwprobe.h > > +++ b/arch/riscv/include/asm/hwprobe.h > > @@ -8,7 +8,7 @@ > > > > #include <uapi/asm/hwprobe.h> > > > > -#define RISCV_HWPROBE_MAX_KEY 6 > > +#define RISCV_HWPROBE_MAX_KEY 7 > > > > static inline bool riscv_hwprobe_key_is_valid(__s64 key) > > { > > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h > > index 060212331a03..4474e98d17bd 100644 > > --- a/arch/riscv/include/uapi/asm/hwprobe.h > > +++ b/arch/riscv/include/uapi/asm/hwprobe.h > > @@ -68,6 +68,12 @@ struct riscv_hwprobe { > > #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) > > #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) > > #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 > > +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7 > > +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0 > > +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1 > > +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2 > > +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3 > > +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4 > > This needs to be added to the documentation as well. +1 > What value should be returned when V is not enabled in the kernel, or > V is not supported in the hardware? Currently in the code it would be > UNKNOWN, right? Is that what we want, or is it worth differentiating > "no support for V" from "I don't know the speed of misaligned loads"? > Maybe UNKNOWN is the right value, as there are other values to tell > you V is not enabled. I think UNKNOWN is fine for !V, assuming identical definitions as scalar. Realistically, we /should/ always know the value once we get as far as userspace (ignoring the fact that this patch as implemented doesn't know), either by probing for it or config options like those that the non-vector stuff has. What I am not a fan of in this patch is how it makes UNKNOWN a proxy for "SUPPORTED", I think we should set SLOW as a minimum. Really we should either add probing and Kconfig options, and maybe emulation if there's a strong case for it. > > /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ > > > > /* Flags */ > > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c > > index b286b73e763e..ce641cc6e47a 100644 > > --- a/arch/riscv/kernel/sys_hwprobe.c > > +++ b/arch/riscv/kernel/sys_hwprobe.c > > @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) > > } > > #endif > > > > +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) > > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) > > +{ > > + int cpu; > > + u64 perf = -1ULL; > > + > > + for_each_cpu(cpu, cpus) { > > + int this_perf = per_cpu(vector_misaligned_access, cpu); > > + > > + if (perf == -1ULL) > > + perf = this_perf; > > + > > + if (perf != this_perf) { > > + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > + break; > > + } > > + } > > + > > + if (perf == -1ULL) > > + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > + > > + return perf; > > +} > > +#else > > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) > > +{ > > + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > +} > > +#endif > > + > > static void hwprobe_one_pair(struct riscv_hwprobe *pair, > > const struct cpumask *cpus) > > { > > @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, > > pair->value = hwprobe_misaligned(cpus); > > break; > > > > + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF: > > + pair->value = hwprobe_vec_misaligned(cpus); > > + break; > > + > > case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: > > pair->value = 0; > > if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) > > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c > > index 2adb7c3e4dd5..0c07e990e9c5 100644 > > --- a/arch/riscv/kernel/traps_misaligned.c > > +++ b/arch/riscv/kernel/traps_misaligned.c > > @@ -16,6 +16,7 @@ > > #include <asm/entry-common.h> > > #include <asm/hwprobe.h> > > #include <asm/cpufeature.h> > > +#include <asm/vector.h> > > > > #define INSN_MATCH_LB 0x3 > > #define INSN_MASK_LB 0x707f > > @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs) > > if (get_insn(regs, epc, &insn)) > > return -1; > > > > +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > > + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) { > > + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED; > > Shouldn't this be EMULATED, given we were just delegated this trap? I > guess it depends on whether you're going to add support for actually > handling the misaligned vector trap, as Clément mentioned. Also, we should not actually set EMULATED if we don't actually emulate it - this patch just advances epc, right? Also, is it even correct to arbitrarily set this value on a misaligned trap - how do we know if this was a vector access at this point? Finally, with how handle_misaligned_load() works, aren't we gonna set the scalar misaligned access speed to EMULATED during this trap even if the trapping instruction is a vector one? That seems like it should be fixed in a patch of its own. > Scalar misaligned loads had a history to lean on since the specs were > always explicit that misaligned loads/store had to be supported one > way or another. So UNSUPPORTED was a future theoretical value. I > haven't dug through the specs yet, do you know what the story is for V > and misaligned loads? My sub-question is what the rules are for > detecting the difference between EMULATED and UNSUPPORTED. I mean, if we emulate it, then it's EMULATED. If we don't EMULATE it and nothing below us emulates it, then it is UNSUPPORTED. The one that has questionmarks is EMULATED v SLOW, where firmware emulation is communicated to userspace as SLOW, because we can't really tell that it is in fact EMULATED. Cheers, Conor. > > + regs->epc = epc + INSN_LEN(insn); > > + return 0; > > + } > > +#endif > > + > > regs->epc = 0; > > > > if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { > > @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu) > > return misaligned_emu_detected; > > } > > > > +#ifdef CONFIG_RISCV_ISA_V > > +static bool check_vector_unaligned_access(int cpu) > > +{ > > + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu); > > + struct riscv_isainfo *isainfo = &hart_isa[cpu]; > > + unsigned long tmp_var; > > + bool misaligned_vec_suported; > > + > > + if (!riscv_isa_extension_available(isainfo->isa, v)) > > + return false; > > + > > + /* This case will only happen if a unaligned vector load > > + * was called by the kernel before this check > > + */ > > + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) > > + return false; > > + > > + kernel_vector_begin(); > > + __asm__ __volatile__ ( > > + ".option push\n\t" > > + ".option arch, +v\n\t" > > + " li t1, 0x1\n" //size > > + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of 16b > > + " addi t0, %[ptr], 1\n\t" // Misalign address > > + " vle16.v v0, (t0)\n\t" // Load bytes > > + ".option pop\n\t" > > + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory"); > > + kernel_vector_end(); > > + > > + misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN); > > + > > + return misaligned_vec_suported; > > +} > > +#else > > +static bool check_vector_unaligned_access(int cpu) > > +{ > > + return false; > > +} > > +#endif > > + > > +bool check_vector_unaligned_access_all_cpus(void) > > +{ > > + int cpu; > > + > > + for_each_online_cpu(cpu) > > + if (!check_vector_unaligned_access(cpu)) > > + return false; > > + > > + return true; > > +} > > These functions return a bool, but the bool is never checked. I'm > guessing that's because you're going to check it in a future patch > where you decide whether or not to probe for fast/slow? > > > > + > > bool check_unaligned_access_emulated_all_cpus(void) > > { > > int cpu; > > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c > > index a9a6bcb02acf..92a84239beaa 100644 > > --- a/arch/riscv/kernel/unaligned_access_speed.c > > +++ b/arch/riscv/kernel/unaligned_access_speed.c > > @@ -20,6 +20,7 @@ > > #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) > > > > DEFINE_PER_CPU(long, misaligned_access_speed); > > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; > > > > #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS > > static cpumask_t fast_misaligned_access; > > @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) > > { > > bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); > > > > + check_vector_unaligned_access_all_cpus(); > > + > > if (!all_cpus_emulated) > > return check_unaligned_access_speed_all_cpus(); > > > > @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void) > > static int check_unaligned_access_all_cpus(void) > > { > > check_unaligned_access_emulated_all_cpus(); > > + check_vector_unaligned_access_all_cpus(); > > > > return 0; > > } > > -- > > 2.43.0 > > >
> On Wed, Jun 05, 2024 at 08:54:21AM -0700, Evan Green wrote: > > On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote: > > What value should be returned when V is not enabled in the kernel, or > > V is not supported in the hardware? Currently in the code it would be > > UNKNOWN, right? Is that what we want, or is it worth differentiating > > "no support for V" from "I don't know the speed of misaligned loads"? > > Maybe UNKNOWN is the right value, as there are other values to tell > > you V is not enabled. > > I think UNKNOWN is fine for !V, assuming identical definitions as scalar. I dunno, maybe we should set it to UNSUPPORTED in that case, but there's probably some funny behaviour around the v prctl we might need to be aware of. If the system has vector, we should probably figure out the actual speed is, but not report it while vector is disabled from the prctl?
diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 347805446151..5ad69cf25b25 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void); #if defined(CONFIG_RISCV_MISALIGNED) bool check_unaligned_access_emulated_all_cpus(void); +bool check_vector_unaligned_access_all_cpus(void); + void unaligned_emulation_finish(void); bool unaligned_ctl_available(void); DECLARE_PER_CPU(long, misaligned_access_speed); +DECLARE_PER_CPU(long, vector_misaligned_access); #else static inline bool unaligned_ctl_available(void) { diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 630507dff5ea..150a9877b0af 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include <uapi/asm/hwprobe.h> -#define RISCV_HWPROBE_MAX_KEY 6 +#define RISCV_HWPROBE_MAX_KEY 7 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 060212331a03..4474e98d17bd 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -68,6 +68,12 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7 +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0 +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1 +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2 +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3 +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index b286b73e763e..ce641cc6e47a 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) } #endif +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) +{ + int cpu; + u64 perf = -1ULL; + + for_each_cpu(cpu, cpus) { + int this_perf = per_cpu(vector_misaligned_access, cpu); + + if (perf == -1ULL) + perf = this_perf; + + if (perf != this_perf) { + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; + break; + } + } + + if (perf == -1ULL) + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; + + return perf; +} +#else +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) +{ + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; +} +#endif + static void hwprobe_one_pair(struct riscv_hwprobe *pair, const struct cpumask *cpus) { @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, pair->value = hwprobe_misaligned(cpus); break; + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF: + pair->value = hwprobe_vec_misaligned(cpus); + break; + case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: pair->value = 0; if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 2adb7c3e4dd5..0c07e990e9c5 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -16,6 +16,7 @@ #include <asm/entry-common.h> #include <asm/hwprobe.h> #include <asm/cpufeature.h> +#include <asm/vector.h> #define INSN_MATCH_LB 0x3 #define INSN_MASK_LB 0x707f @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs) if (get_insn(regs, epc, &insn)) return -1; +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) { + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED; + regs->epc = epc + INSN_LEN(insn); + return 0; + } +#endif + regs->epc = 0; if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu) return misaligned_emu_detected; } +#ifdef CONFIG_RISCV_ISA_V +static bool check_vector_unaligned_access(int cpu) +{ + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu); + struct riscv_isainfo *isainfo = &hart_isa[cpu]; + unsigned long tmp_var; + bool misaligned_vec_suported; + + if (!riscv_isa_extension_available(isainfo->isa, v)) + return false; + + /* This case will only happen if a unaligned vector load + * was called by the kernel before this check + */ + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) + return false; + + kernel_vector_begin(); + __asm__ __volatile__ ( + ".option push\n\t" + ".option arch, +v\n\t" + " li t1, 0x1\n" //size + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of 16b + " addi t0, %[ptr], 1\n\t" // Misalign address + " vle16.v v0, (t0)\n\t" // Load bytes + ".option pop\n\t" + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory"); + kernel_vector_end(); + + misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN); + + return misaligned_vec_suported; +} +#else +static bool check_vector_unaligned_access(int cpu) +{ + return false; +} +#endif + +bool check_vector_unaligned_access_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + if (!check_vector_unaligned_access(cpu)) + return false; + + return true; +} + bool check_unaligned_access_emulated_all_cpus(void) { int cpu; diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index a9a6bcb02acf..92a84239beaa 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -20,6 +20,7 @@ #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) DEFINE_PER_CPU(long, misaligned_access_speed); +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN; #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS static cpumask_t fast_misaligned_access; @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void) { bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); + check_vector_unaligned_access_all_cpus(); + if (!all_cpus_emulated) return check_unaligned_access_speed_all_cpus(); @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void) static int check_unaligned_access_all_cpus(void) { check_unaligned_access_emulated_all_cpus(); + check_vector_unaligned_access_all_cpus(); return 0; }
Detected if a system traps into the kernel on an vector unaligned access. Add the result to a new key in hwprobe. Signed-off-by: Jesse Taube <jesse@rivosinc.com> --- arch/riscv/include/asm/cpufeature.h | 3 ++ arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 6 +++ arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++ arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++ arch/riscv/kernel/unaligned_access_speed.c | 4 ++ 6 files changed, 108 insertions(+), 1 deletion(-)