Message ID | 20240610204821.230388-5-torvalds@linux-foundation.org (mailing list archive)
---|---
State | New, archived
Series | arm64 / x86-64: low-level code generation issues
Hi Linus,

On Mon, Jun 10, 2024 at 01:48:18PM -0700, Linus Torvalds wrote:
> This implements the runtime constant infrastructure for arm64, allowing
> the dcache d_hash() function to be generated using as a constant for
> hash table address followed by shift by a constant of the hash index.
>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> ---
>  arch/arm64/include/asm/runtime-const.h | 75 ++++++++++++++++++++++++++
>  arch/arm64/kernel/vmlinux.lds.S        |  3 ++
>  2 files changed, 78 insertions(+)
>  create mode 100644 arch/arm64/include/asm/runtime-const.h

From a quick scan I spotted a couple of issues (explained with fixes
below). I haven't had the chance to test/review the whole series yet.

Do we expect to use this more widely? If this only really matters for
d_hash() it might be better to handle this via the alternatives
framework with callbacks and avoid the need for new infrastructure.

> diff --git a/arch/arm64/include/asm/runtime-const.h b/arch/arm64/include/asm/runtime-const.h
> new file mode 100644
> index 000000000000..02462b2cb6f9
> --- /dev/null
> +++ b/arch/arm64/include/asm/runtime-const.h
> @@ -0,0 +1,75 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_RUNTIME_CONST_H
> +#define _ASM_RUNTIME_CONST_H
> +
> +#define runtime_const_ptr(sym) ({					\
> +	typeof(sym) __ret;						\
> +	asm_inline("1:\t"						\
> +		"movz %0, #0xcdef\n\t"					\
> +		"movk %0, #0x89ab, lsl #16\n\t"				\
> +		"movk %0, #0x4567, lsl #32\n\t"				\
> +		"movk %0, #0x0123, lsl #48\n\t"				\
> +		".pushsection runtime_ptr_" #sym ",\"a\"\n\t"		\
> +		".long 1b - .\n\t"					\
> +		".popsection"						\
> +		:"=r" (__ret));						\
> +	__ret; })
> +
> +#define runtime_const_shift_right_32(val, sym) ({			\
> +	unsigned long __ret;						\
> +	asm_inline("1:\t"						\
> +		"lsr %w0,%w1,#12\n\t"					\
> +		".pushsection runtime_shift_" #sym ",\"a\"\n\t"		\
> +		".long 1b - .\n\t"					\
> +		".popsection"						\
> +		:"=r" (__ret)						\
> +		:"r" (0u+(val)));					\
> +	__ret; })
> +
> +#define runtime_const_init(type, sym) do {				\
> +	extern s32 __start_runtime_##type##_##sym[];			\
> +	extern s32 __stop_runtime_##type##_##sym[];			\
> +	runtime_const_fixup(__runtime_fixup_##type,			\
> +			    (unsigned long)(sym),			\
> +			    __start_runtime_##type##_##sym,		\
> +			    __stop_runtime_##type##_##sym);		\
> +} while (0)
> +
> +// 16-bit immediate for wide move (movz and movk) in bits 5..20
> +static inline void __runtime_fixup_16(unsigned int *p, unsigned int val)
> +{
> +	unsigned int insn = *p;
> +	insn &= 0xffe0001f;
> +	insn |= (val & 0xffff) << 5;
> +	*p = insn;
> +}

As-is this will break BE kernels as instructions are always encoded
little-endian regardless of data endianness. We usually handle that by
using __le32 instruction pointers and using le32_to_cpu()/cpu_to_le32()
when reading/writing, e.g.

	#include <asm/byteorder.h>

	static inline void __runtime_fixup_16(__le32 *p, unsigned int val)
	{
		u32 insn = le32_to_cpu(*p);
		insn &= 0xffe0001f;
		insn |= (val & 0xffff) << 5;
		*p = cpu_to_le32(insn);
	}

We have some helpers for instruction manipulation, and we can use
aarch64_insn_encode_immediate() here, e.g.

	#include <asm/insn.h>

	static inline void __runtime_fixup_16(__le32 *p, unsigned int val)
	{
		u32 insn = le32_to_cpu(*p);
		insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, val);
		*p = cpu_to_le32(insn);
	}

> +static inline void __runtime_fixup_ptr(void *where, unsigned long val)
> +{
> +	unsigned int *p = lm_alias(where);
> +	__runtime_fixup_16(p, val);
> +	__runtime_fixup_16(p+1, val >> 16);
> +	__runtime_fixup_16(p+2, val >> 32);
> +	__runtime_fixup_16(p+3, val >> 48);
> +}

This is missing the necessary cache maintenance and context
synchronization event.

After the new values are written, we need cache maintenance (a D$ clean
to PoU, then an I$ invalidate to PoU) followed by a context
synchronization event (e.g. an ISB) before CPUs are guaranteed to use
the new instructions rather than the old ones.

Depending on how your system has been integrated, you might get away
without that. If you see:

	Data cache clean to the PoU not required for I/D coherence

... in your dmesg, that means you only need the I$ invalidate and
context synchronization event, and you might happen to get those by
virtue of alternative patching running between dcache_init_early() and
the use of the patched instructions. However, in general, we do need
all of that.

As long as this runs before secondaries are brought up, we can handle
that with caches_clean_inval_pou(). Assuming the __le32 changes above,
I'd expect this to be:

	static inline void __runtime_fixup_ptr(void *where, unsigned long val)
	{
		__le32 *p = lm_alias(where);
		__runtime_fixup_16(p, val);
		__runtime_fixup_16(p + 1, val >> 16);
		__runtime_fixup_16(p + 2, val >> 32);
		__runtime_fixup_16(p + 3, val >> 48);

		caches_clean_inval_pou((unsigned long)p, (unsigned long)(p + 4));
	}

> +// Immediate value is 5 bits starting at bit #16
> +static inline void __runtime_fixup_shift(void *where, unsigned long val)
> +{
> +	unsigned int *p = lm_alias(where);
> +	unsigned int insn = *p;
> +	insn &= 0xffc0ffff;
> +	insn |= (val & 63) << 16;
> +	*p = insn;
> +}

As with the other bits above, I'd expect this to be:

	static inline void __runtime_fixup_shift(void *where, unsigned long val)
	{
		__le32 *p = lm_alias(where);
		u32 insn = le32_to_cpu(*p);
		insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, val);
		*p = cpu_to_le32(insn);

		caches_clean_inval_pou((unsigned long)p, (unsigned long)(p + 1));
	}

Mark.

> +
> +static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
> +	unsigned long val, s32 *start, s32 *end)
> +{
> +	while (start < end) {
> +		fn(*start + (void *)start, val);
> +		start++;
> +	}
> +}
> +
> +#endif
> diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
> index 755a22d4f840..55a8e310ea12 100644
> --- a/arch/arm64/kernel/vmlinux.lds.S
> +++ b/arch/arm64/kernel/vmlinux.lds.S
> @@ -264,6 +264,9 @@ SECTIONS
>  		EXIT_DATA
>  	}
>
> +	RUNTIME_CONST(shift, d_hash_shift)
> +	RUNTIME_CONST(ptr, dentry_hashtable)
> +
>  	PERCPU_SECTION(L1_CACHE_BYTES)
>  	HYPERVISOR_PERCPU_SECTION
>
> --
> 2.45.1.209.gc6f12300df
>
>
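A side note for readers following the mechanism under discussion: runtime_const_fixup()
locates the instructions through self-relative offsets. Each ".long 1b - ." entry records
"instruction address minus entry address", and the fixup loop adds the entry's own address
back before calling the patch helper, so each reference costs only four bytes and needs no
relocation processing. A stand-alone sketch of that scheme in plain user-space C (made-up
names, purely illustrative, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* Stand-ins for two instruction words that would be patched. */
	static uint32_t insn_a = 0xdeadbeef;
	static uint32_t insn_b = 0xdeadbeef;

	/* Stand-in for a runtime_ptr_<sym> section: one s32 entry per site. */
	static int32_t reftable[2];

	/* The kernel code rewrites movz/movk/lsr immediate fields; here we
	 * just store the value so the effect is visible. */
	static void patch_word(void *where, unsigned long val)
	{
		*(uint32_t *)where = (uint32_t)val;
	}

	/* Same shape as runtime_const_fixup(): offset plus the entry's own
	 * address recovers the absolute address of the instruction. */
	static void fixup(void (*fn)(void *, unsigned long),
			  unsigned long val, int32_t *start, int32_t *end)
	{
		while (start < end) {
			fn((char *)start + *start, val);
			start++;
		}
	}

	int main(void)
	{
		/* What ".long 1b - ." does at assembly time. */
		reftable[0] = (int32_t)((char *)&insn_a - (char *)&reftable[0]);
		reftable[1] = (int32_t)((char *)&insn_b - (char *)&reftable[1]);

		fixup(patch_word, 0x1234, reftable, reftable + 2);

		printf("%#x %#x\n", (unsigned int)insn_a, (unsigned int)insn_b);
		return 0;
	}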
On Tue, 11 Jun 2024 at 07:29, Mark Rutland <mark.rutland@arm.com> wrote:
>
> Do we expect to use this more widely? If this only really matters for
> d_hash() it might be better to handle this via the alternatives
> framework with callbacks and avoid the need for new infrastructure.

Hmm. The notion of a callback for alternatives is intriguing and would
be very generic, but we don't have anything like that right now.

Is anybody willing to implement something like that? Because while I
like the idea, it sounds like a much bigger change.

> As-is this will break BE kernels [...]

I had forgotten about that horror. BE in this day and age is insane,
but it's easy enough to fix as per your comments. Will do.

> We have some helpers for instruction manipulation, and we can use
> aarch64_insn_encode_immediate() here, e.g.
>
> #include <asm/insn.h>
>
> static inline void __runtime_fixup_16(__le32 *p, unsigned int val)
> {
> 	u32 insn = le32_to_cpu(*p);
> 	insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, val);
> 	*p = cpu_to_le32(insn);
> }

Ugh. I did that, and then noticed that it makes the generated code
about ten times bigger.

That interface looks positively broken.

There is absolutely nobody who actually wants a dynamic argument, so
it would have made both the callers and the implementation *much*
simpler had the "AARCH64_INSN_IMM_16" been encoded in the function
name the way I did it for my instruction rewriting.

It would have made the use of it simpler, it would have avoided all
the "switch (type)" garbage, and it would have made it all generate
much better code.

So I did that change you suggested, and then undid it again.

Because that whole aarch64_insn_encode_immediate() thing is an
abomination, and should be burned at the stake. It's misdesigned in
the *worst* possible way.

And no, this code isn't performance-critical, but I have some taste,
and the code I write will not be using that garbage.

> This is missing the necessary cache maintenance and context
> synchronization event.

Will do.

              Linus
On Tue, Jun 11, 2024 at 09:56:17AM -0700, Linus Torvalds wrote:
> On Tue, 11 Jun 2024 at 07:29, Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > Do we expect to use this more widely? If this only really matters for
> > d_hash() it might be better to handle this via the alternatives
> > framework with callbacks and avoid the need for new infrastructure.
>
> Hmm. The notion of a callback for alternatives is intriguing and would
> be very generic, but we don't have anything like that right now.
>
> Is anybody willing to implement something like that? Because while I
> like the idea, it sounds like a much bigger change.

Fair enough if that's a pain on x86, but we already have them on arm64,
and hence using them is a smaller change there. We already have a couple
of cases which use a MOVZ;MOVK;MOVK;MOVK sequence, e.g.

	// in __invalidate_icache_max_range()
	asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
				    "movk %0, #0, lsl #16\n"
				    "movk %0, #0, lsl #32\n"
				    "movk %0, #0, lsl #48\n",
				    ARM64_ALWAYS_SYSTEM,
				    kvm_compute_final_ctr_el0)
		     : "=r" (ctr));

... which is patched via the callback:

	void kvm_compute_final_ctr_el0(struct alt_instr *alt,
				       __le32 *origptr, __le32 *updptr, int nr_inst)
	{
		generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0),
			       origptr, updptr, nr_inst);
	}

... where the generate_mov_q() helper does the actual instruction
generation.

So if we only care about a few specific constants, we could give them
their own callbacks, like kvm_compute_final_ctr_el0() above.

[...]

> > We have some helpers for instruction manipulation, and we can use
> > aarch64_insn_encode_immediate() here, e.g.
> >
> > #include <asm/insn.h>
> >
> > static inline void __runtime_fixup_16(__le32 *p, unsigned int val)
> > {
> > 	u32 insn = le32_to_cpu(*p);
> > 	insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, val);
> > 	*p = cpu_to_le32(insn);
> > }
>
> Ugh. I did that, and then noticed that it makes the generated code
> about ten times bigger.
>
> That interface looks positively broken.
>
> There is absolutely nobody who actually wants a dynamic argument, so
> it would have made both the callers and the implementation *much*
> simpler had the "AARCH64_INSN_IMM_16" been encoded in the function
> name the way I did it for my instruction rewriting.
>
> It would have made the use of it simpler, it would have avoided all
> the "switch (type)" garbage, and it would have made it all generate
> much better code.

Oh, completely agreed.

FWIW, I have better versions sat in my arm64/insn/rework branch, but I
haven't had the time to get all the rest of the insn framework cleanup
sorted:

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/commit/?h=arm64/insn/rework&id=9cf0ec088c9d5324c60933bf3924176fea0a4d0b

I can go prioritise getting that bit out if it'd help, or I can clean
this up later.

Those allow the compiler to do much better, including compile-time (or
runtime) checks that immediates fit. For example:

	void encode_imm16(__le32 *p, u16 imm)
	{
		u32 insn = le32_to_cpu(*p);

		// Would warn if 'imm' were u32.
		// As u16 always fits, no warning
		BUILD_BUG_ON(!aarch64_insn_try_encode_unsigned_imm16(&insn, imm));

		*p = cpu_to_le32(insn);
	}

... compiles to:

	<encode_imm16>:
		ldr	w2, [x0]
		bfi	w2, w1, #5, #16
		str	w2, [x0]
		ret

... which I think is what you want?

> So I did that change you suggested, and then undid it again.
>
> Because that whole aarch64_insn_encode_immediate() thing is an
> abomination, and should be burned at the stake. It's misdesigned in
> the *worst* possible way.
>
> And no, this code isn't performance-critical, but I have some taste,
> and the code I write will not be using that garbage.

Fair enough.

Mark.
On Tue, 11 Jun 2024 at 10:48, Mark Rutland <mark.rutland@arm.com> wrote:
>
> Fair enough if that's a pain on x86, but we already have them on arm64, and
> hence using them is a smaller change there. We already have a couple of cases
> which uses MOVZ;MOVK;MOVK;MOVK sequence, e.g.
>
>	// in __invalidate_icache_max_range()
>	asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
>				    "movk %0, #0, lsl #16\n"
>				    "movk %0, #0, lsl #32\n"
>				    "movk %0, #0, lsl #48\n",
>				    ARM64_ALWAYS_SYSTEM,
>				    kvm_compute_final_ctr_el0)
>		     : "=r" (ctr));
>
> ... which is patched via the callback:
>
>	void kvm_compute_final_ctr_el0(struct alt_instr *alt,
>				       __le32 *origptr, __le32 *updptr, int nr_inst)
>	{
>		generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0),
>			       origptr, updptr, nr_inst);
>	}
>
> ... where the generate_mov_q() helper does the actual instruction generation.
>
> So if we only care about a few specific constants, we could give them their own
> callbacks, like kvm_compute_final_ctr_el0() above.

I'll probably only have another day until my mailbox starts getting
more pull requests (Mon-Tue outside the merge window is typically my
quiet time when I have time to go through old emails and have time for
private projects).

So I'll look at doing this for x86 and see how it works.

I do suspect that even then it's possibly more code with a
site-specific callback for each case, but maybe it would be worth it
just for the flexibility.

              Linus
On Tue, 11 Jun 2024 at 10:59, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So I'll look at doing this for x86 and see how it works.

Oh - and when I started looking at it, I immediately remembered why I
didn't want to use alternatives originally.

The alternatives are finalized much too early for this. By the time
the dcache code works, the alternatives have already been applied.

I guess all the arm64 alternative callbacks are basically finalized
very early, basically when the CPU models etc have been setup.

We could do a "late alternatives", I guess, but now it's even more
infrastructure just for the constants.

              Linus
On Tue, Jun 11, 2024 at 11:59:21AM -0700, Linus Torvalds wrote:
> On Tue, 11 Jun 2024 at 10:59, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > So I'll look at doing this for x86 and see how it works.
>
> Oh - and when I started looking at it, I immediately remembered why I
> didn't want to use alternatives originally.
>
> The alternatives are finalized much too early for this. By the time
> the dcache code works, the alternatives have already been applied.
>
> I guess all the arm64 alternative callbacks are basically finalized
> very early, basically when the CPU models etc have been setup.

On arm64 we have early ("boot") and late ("system-wide") alternatives.
We apply the system-wide alternatives in apply_alternatives_all(), a few
callees deep under smp_cpus_done(), after secondary CPUs are brought up,
since that has to handle mismatched features in big.LITTLE systems.

I had assumed that we could use late/system-wide alternatives here,
since those get applied after vfs_caches_init_early(), but maybe that's
too late?

> We could do a "late alternatives", I guess, but now it's even more
> infrastructure just for the constants.

Fair enough; thanks for taking a look.

Mark.
On Tue, 11 Jun 2024 at 13:22, Mark Rutland <mark.rutland@arm.com> wrote:
>
> On arm64 we have early ("boot") and late ("system-wide") alternatives.
> We apply the system-wide alternatives in apply_alternatives_all(), a few
> callees deep under smp_cpus_done(), after secondary CPUs are brought up,
> since that has to handle mismatched features in big.LITTLE systems.

Annoyingly, we don't have any generic model for this. Maybe that would
be a good thing regardless, but your point that you have big.LITTLE
issues does kind of reinforce the point that different architectures
have different requirements for the alternatives patching.

On arm64, the late alternatives seem to be in

    kernel_init() ->
      kernel_init_freeable() ->
        smp_init() ->
          smp_cpus_done() ->
            setup_system_features() ->
              setup_system_capabilities() ->
                apply_alternatives_all()

which is nice and late - that's when the system is fully initialized,
and kernel_init() is already running as the first real thread.

On x86, the alternatives are finalized much earlier in

    start_kernel() ->
      arch_cpu_finalize_init() ->
        alternative_instructions()

which is quite early, much closer to the early arm64 case.

Now, even that early x86 timing is good enough for
vfs_caches_init_early(), which is also done from start_kernel() fairly
early on - and before the arch_cpu_finalize_init() code is run.

But ...

> I had assumed that we could use late/system-wide alternatives here, since
> those get applied after vfs_caches_init_early(), but maybe that's too
> late?

So vfs_caches_init_early() is *one* case for the dcache init, but for
the NUMA case, we delay the dcache init until after the MM setup has
been completed, and do it relatively later in the init sequence at
vfs_caches_init().

See that horribly named 'hashdist' variable ('dist' is not 'distance',
it's 'distribute'). It's not dcache-specific, btw. There's a couple of
other hashes that do that whole "NUMA distribution or not" thing..

Annoying, yes. I'm not sure that the dual init makes any actual sense -
I think it's entirely a historical oddity.

That "done conditionally in two different places" thing may be ugly,
but even if we fixed it, we'd fix it by doing it just once, and it
would be that later "NUMA has been initialized" vfs_caches_init() case.
Which is too late for the x86 alternatives.

The arm64 late case would seem to work fine. It's late enough to be
after all "core kernel init", but still early enough to be before the
"generic" initcalls that will start initializing filesystems etc (that
then need the vfs code to have been initialized).

So that "smp_init()" placement that arm64 has is actually a very good
place for at least the dcache case. It's just not what x86 does.

Note that my "just replace the constants" model avoids all the ordering
issues because it just does the constant initialization synchronously
when the constant is initialized. So it doesn't depend on any other
ordering at all, and there is no worry about subtle differences in when
alternatives are applied, or when the uses happen.

(It obviously does have the same ordering requirement that the variable
initialization itself has: the dcache init itself has to happen before
any dcache use, but that's neither surprising nor a new ordering imposed
by the runtime constant case).

There's an advantage to just being self-sufficient and not tying into
random other subsystems that have random other constraints.

              Linus
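To make that "patch synchronously when the value is known" model concrete, the call
site ends up looking roughly like the sketch below. This is based on the fs/dcache.c
changes elsewhere in the series, reproduced approximately; the exact arguments and
surrounding code in the real patch may differ:

	static void __init dcache_init(void)
	{
		/*
		 * Whether we get here via the early path or the NUMA/'hashdist'
		 * path, the table address and shift are only known once the hash
		 * has been allocated, so the instructions are fixed up right
		 * here -- no dependence on when the alternatives framework runs.
		 */
		dentry_hashtable =
			alloc_large_system_hash("Dentry cache",
						sizeof(struct hlist_bl_head),
						dhash_entries, 13, HASH_ZERO,
						&d_hash_shift, NULL, 0, 0);
		d_hash_shift = 32 - d_hash_shift;

		runtime_const_init(shift, d_hash_shift);
		runtime_const_init(ptr, dentry_hashtable);
	}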
diff --git a/arch/arm64/include/asm/runtime-const.h b/arch/arm64/include/asm/runtime-const.h
new file mode 100644
index 000000000000..02462b2cb6f9
--- /dev/null
+++ b/arch/arm64/include/asm/runtime-const.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RUNTIME_CONST_H
+#define _ASM_RUNTIME_CONST_H
+
+#define runtime_const_ptr(sym) ({					\
+	typeof(sym) __ret;						\
+	asm_inline("1:\t"						\
+		"movz %0, #0xcdef\n\t"					\
+		"movk %0, #0x89ab, lsl #16\n\t"				\
+		"movk %0, #0x4567, lsl #32\n\t"				\
+		"movk %0, #0x0123, lsl #48\n\t"				\
+		".pushsection runtime_ptr_" #sym ",\"a\"\n\t"		\
+		".long 1b - .\n\t"					\
+		".popsection"						\
+		:"=r" (__ret));						\
+	__ret; })
+
+#define runtime_const_shift_right_32(val, sym) ({			\
+	unsigned long __ret;						\
+	asm_inline("1:\t"						\
+		"lsr %w0,%w1,#12\n\t"					\
+		".pushsection runtime_shift_" #sym ",\"a\"\n\t"		\
+		".long 1b - .\n\t"					\
+		".popsection"						\
+		:"=r" (__ret)						\
+		:"r" (0u+(val)));					\
+	__ret; })
+
+#define runtime_const_init(type, sym) do {				\
+	extern s32 __start_runtime_##type##_##sym[];			\
+	extern s32 __stop_runtime_##type##_##sym[];			\
+	runtime_const_fixup(__runtime_fixup_##type,			\
+			    (unsigned long)(sym),			\
+			    __start_runtime_##type##_##sym,		\
+			    __stop_runtime_##type##_##sym);		\
+} while (0)
+
+// 16-bit immediate for wide move (movz and movk) in bits 5..20
+static inline void __runtime_fixup_16(unsigned int *p, unsigned int val)
+{
+	unsigned int insn = *p;
+	insn &= 0xffe0001f;
+	insn |= (val & 0xffff) << 5;
+	*p = insn;
+}
+
+static inline void __runtime_fixup_ptr(void *where, unsigned long val)
+{
+	unsigned int *p = lm_alias(where);
+	__runtime_fixup_16(p, val);
+	__runtime_fixup_16(p+1, val >> 16);
+	__runtime_fixup_16(p+2, val >> 32);
+	__runtime_fixup_16(p+3, val >> 48);
+}
+
+// Immediate value is 6 bits starting at bit #16
+static inline void __runtime_fixup_shift(void *where, unsigned long val)
+{
+	unsigned int *p = lm_alias(where);
+	unsigned int insn = *p;
+	insn &= 0xffc0ffff;
+	insn |= (val & 63) << 16;
+	*p = insn;
+}
+
+static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
+				       unsigned long val, s32 *start, s32 *end)
+{
+	while (start < end) {
+		fn(*start + (void *)start, val);
+		start++;
+	}
+}
+
+#endif
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 755a22d4f840..55a8e310ea12 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -264,6 +264,9 @@ SECTIONS
 		EXIT_DATA
 	}
 
+	RUNTIME_CONST(shift, d_hash_shift)
+	RUNTIME_CONST(ptr, dentry_hashtable)
+
 	PERCPU_SECTION(L1_CACHE_BYTES)
 	HYPERVISOR_PERCPU_SECTION
This implements the runtime constant infrastructure for arm64, allowing
the dcache d_hash() function to be generated using the hash table
address as a runtime constant, followed by a shift of the hash index by
a constant amount.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/runtime-const.h | 75 ++++++++++++++++++++++++++
 arch/arm64/kernel/vmlinux.lds.S        |  3 ++
 2 files changed, 78 insertions(+)
 create mode 100644 arch/arm64/include/asm/runtime-const.h
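For reference, the consumer these sections and fixups exist for is the d_hash()
helper itself; per the fs/dcache.c patch earlier in the series it is (approximately)
just:

	static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
	{
		/* The movz/movk sequence is patched with the table address,
		 * and the lsr's shift amount is patched with d_hash_shift. */
		return runtime_const_ptr(dentry_hashtable) +
			runtime_const_shift_right_32(hashlen, d_hash_shift);
	}

That is, one patched four-instruction address materialization plus one patched
shift, with no memory loads for the hash table base or the shift count.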