Message ID | 20241016122424.1655560-7-rppt@kernel.org (mailing list archive) |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | x86/module: use large ROX pages for text allocations | expand |
On Wed, 16 Oct 2024 15:24:22 +0300 Mike Rapoport <rppt@kernel.org> wrote: > diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c > index 8da0e66ca22d..b498897b213c 100644 > --- a/arch/x86/kernel/ftrace.c > +++ b/arch/x86/kernel/ftrace.c > @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, > return ret; > > /* replace the text with the new text */ > - if (ftrace_poke_late) > + if (ftrace_poke_late) { > text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); > - else > - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); > + } else { > + mutex_lock(&text_mutex); > + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); > + mutex_unlock(&text_mutex); > + } > return 0; > } So this slows down the boot by over 30ms. That may not sound like much, but we care very much about boot times. This code is serialized with boot and runs whenever ftrace is configured in the kernel. The way I measured this, was that I added: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4dd0ad6c94d6..b72bb9943140 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -104,6 +104,8 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) return 0; } +u64 sdr_total; + /* * Marked __ref because it calls text_poke_early() which is .init.text. That is * ok because that call will happen early, during boot, when .init sections are @@ -114,6 +116,8 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, const char *new_code) { int ret = ftrace_verify_code(ip, old_code); + u64 start, stop; + if (ret) return ret; @@ -121,9 +125,12 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); } else { + start = trace_clock_local(); mutex_lock(&text_mutex); text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); mutex_unlock(&text_mutex); + stop = trace_clock_local(); + sdr_total += stop - start; } return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c01375adc471..93284557144d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10738,6 +10738,11 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); + { + extern u64 sdr_total; + printk("SDR TOTAL: %lld\n", sdr_total); + } + test_can_verify(); return 0; And did the same before this patch. I ran it three times and have the following numbers (all in nanoseconds): before: 11356637 11863526 11507750 after: 43978750 41293162 42741067 Before this patch, the total updates took 11ms. After the patch it takes around 42ms. This is because we are patching 59 thousand sites with this. # dmesg |grep ftrace [ 1.620569] ftrace: allocating 59475 entries in 233 pages [ 1.667178] ftrace: allocated 233 pages with 5 groups If this is only needed for module load, can we at least still use the text_poke_early() at boot up? if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); } else if (system_state == SYSTEM_BOOTING) { text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); } else { mutex_lock(&text_mutex); text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); mutex_unlock(&text_mutex); } ? The above if statement looks to slow things down just slightly, but only by 2ms, which is more reasonable. -- Steve
On Wed, Oct 16, 2024 at 05:01:28PM -0400, Steven Rostedt wrote: > On Wed, 16 Oct 2024 15:24:22 +0300 > Mike Rapoport <rppt@kernel.org> wrote: > > > diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c > > index 8da0e66ca22d..b498897b213c 100644 > > --- a/arch/x86/kernel/ftrace.c > > +++ b/arch/x86/kernel/ftrace.c > > @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, > > return ret; > > > > /* replace the text with the new text */ > > - if (ftrace_poke_late) > > + if (ftrace_poke_late) { > > text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); > > - else > > - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); > > + } else { > > + mutex_lock(&text_mutex); > > + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); > > + mutex_unlock(&text_mutex); > > + } > > return 0; > > } > > So this slows down the boot by over 30ms. That may not sound like much, but > we care very much about boot times. This code is serialized with boot and > runs whenever ftrace is configured in the kernel. The way I measured this, > was that I added: > > If this is only needed for module load, can we at least still use the > text_poke_early() at boot up? Right, so I don't understand why this is needed at all. ftrace_module_init() runs before complete_formation() which normally switches to ROX, as such ftrace should be able to continue to do direct modifications here. Which reminds me, at some point I did patches adding a MODULE_STATE_UNFORMED callback in order for static_call / jump_label to be able to avoid the expensive patching on module load as well (arguably ftrace should be using that too, instead of a custom callback).
On Thu, Oct 17, 2024 at 11:35:15AM +0200, Peter Zijlstra wrote: > On Wed, Oct 16, 2024 at 05:01:28PM -0400, Steven Rostedt wrote: > > On Wed, 16 Oct 2024 15:24:22 +0300 > > Mike Rapoport <rppt@kernel.org> wrote: > > > > > diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c > > > index 8da0e66ca22d..b498897b213c 100644 > > > --- a/arch/x86/kernel/ftrace.c > > > +++ b/arch/x86/kernel/ftrace.c > > > @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, > > > return ret; > > > > > > /* replace the text with the new text */ > > > - if (ftrace_poke_late) > > > + if (ftrace_poke_late) { > > > text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); > > > - else > > > - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); > > > + } else { > > > + mutex_lock(&text_mutex); > > > + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); > > > + mutex_unlock(&text_mutex); > > > + } > > > return 0; > > > } > > > > So this slows down the boot by over 30ms. That may not sound like much, but > > we care very much about boot times. This code is serialized with boot and > > runs whenever ftrace is configured in the kernel. The way I measured this, > > was that I added: > > > > > If this is only needed for module load, can we at least still use the > > text_poke_early() at boot up? > > Right, so I don't understand why this is needed at all. > ftrace_module_init() runs before complete_formation() which normally > switches to ROX, as such ftrace should be able to continue to do direct > modifications here. With this series the module text is allocated as ROX at the first place, so the modifications ftrace does to module text have to either use text poking even before complete_formation() or deal with a writable copy like I did for relocations and alternatives. I've been carrying the ftrace changes from a very old prototype and didn't pay enough attention to them them until Steve's complaint. I'll look into it. > Which reminds me, at some point I did patches adding a > MODULE_STATE_UNFORMED callback in order for static_call / jump_label to > be able to avoid the expensive patching on module load as well (arguably > ftrace should be using that too, instead of a custom callback). >
On Wed, 16 Oct 2024 17:01:28 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > If this is only needed for module load, can we at least still use the > text_poke_early() at boot up? > > if (ftrace_poke_late) { > text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); > } else if (system_state == SYSTEM_BOOTING) { > text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); > } else { > mutex_lock(&text_mutex); > text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); > mutex_unlock(&text_mutex); > } > > ? > > The above if statement looks to slow things down just slightly, but only by > 2ms, which is more reasonable. I changed the above to this (yes it's a little hacky) and got my 2ms back! -- Steve DEFINE_STATIC_KEY_TRUE(ftrace_modify_boot); static int __init ftrace_boot_init_done(void) { static_branch_disable(&ftrace_modify_boot); return 0; } /* Ftrace updates happen before core init */ core_initcall(ftrace_boot_init_done); /* * Marked __ref because it calls text_poke_early() which is .init.text. That is * ok because that call will happen early, during boot, when .init sections are * still present. */ static int __ref ftrace_modify_code_direct(unsigned long ip, const char *old_code, const char *new_code) { int ret = ftrace_verify_code(ip, old_code); if (ret) return ret; /* replace the text with the new text */ if (static_branch_unlikely(&ftrace_modify_boot)) { text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); } else if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); } else { mutex_lock(&text_mutex); text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); mutex_unlock(&text_mutex); } return 0; }
On Thu, 17 Oct 2024 14:25:05 +0300 Mike Rapoport <rppt@kernel.org> wrote: > With this series the module text is allocated as ROX at the first place, so > the modifications ftrace does to module text have to either use text poking > even before complete_formation() or deal with a writable copy like I did > for relocations and alternatives. > > I've been carrying the ftrace changes from a very old prototype and > didn't pay enough attention to them them until Steve's complaint. > > I'll look into it. I just posted a patch where you can see the effects of these changes with respect to ftrace patching times. https://lore.kernel.org/all/20241017113105.1edfa943@gandalf.local.home/ I'll be adding this to the next merge window. -- Steve
On Thu, Oct 17, 2024 at 10:17:12AM -0400, Steven Rostedt wrote: > On Wed, 16 Oct 2024 17:01:28 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > If this is only needed for module load, can we at least still use the > > text_poke_early() at boot up? > > > > if (ftrace_poke_late) { > > text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); > > } else if (system_state == SYSTEM_BOOTING) { > > text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); > > } else { > > mutex_lock(&text_mutex); > > text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); > > mutex_unlock(&text_mutex); > > } > > > > ? > > > > The above if statement looks to slow things down just slightly, but only by > > 2ms, which is more reasonable. > > I changed the above to this (yes it's a little hacky) and got my 2ms back! > > -- Steve > > DEFINE_STATIC_KEY_TRUE(ftrace_modify_boot); > > static int __init ftrace_boot_init_done(void) > { > static_branch_disable(&ftrace_modify_boot); > return 0; > } > /* Ftrace updates happen before core init */ > core_initcall(ftrace_boot_init_done); We can also pass mod to ftrace_modify_code_direct() and use that to distinguish early boot and ftrace_module_init. With this I get very similar numbers like with the static branch diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8da0e66ca22d..859902dd06fc 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -111,17 +111,22 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) */ static int __ref ftrace_modify_code_direct(unsigned long ip, const char *old_code, - const char *new_code) + const char *new_code, struct module *mod) { int ret = ftrace_verify_code(ip, old_code); if (ret) return ret; /* replace the text with the new text */ - if (ftrace_poke_late) + if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - else + } else if (!mod) { text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + } else { + mutex_lock(&text_mutex); + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); + } return 0; } @@ -142,7 +147,7 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long ad * just modify the code directly. */ if (addr == MCOUNT_ADDR) - return ftrace_modify_code_direct(ip, old, new); + return ftrace_modify_code_direct(ip, old, new, mod); /* * x86 overrides ftrace_replace_code -- this function will never be used @@ -161,7 +166,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) new = ftrace_call_replace(ip, addr); /* Should only be called when module is loaded */ - return ftrace_modify_code_direct(rec->ip, old, new); + return ftrace_modify_code_direct(rec->ip, old, new, NULL); } /*
Hi Mike, On Wed, Oct 16, 2024 at 03:24:22PM +0300, Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org> > > When module text memory will be allocated with ROX permissions, the > memory at the actual address where the module will live will contain > invalid instructions and there will be a writable copy that contains the > actual module code. > > Update relocations and alternatives patching to deal with it. > > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Sorry that you have to hear from me again :) It seems that module loading is still broken with this version of the patch, which is something that I missed in my earlier testing since I only test a monolithic kernel with my regular virtual machine testing. If I build and install the kernel and modules in the VM via a distribution package, I get the following splat at boot: Starting systemd-udevd version 256.7-1-arch [ 0.882312] SMP alternatives: Something went horribly wrong trying to rewrite the CFI implementation. [ 0.883526] CFI failure at do_one_initcall+0x128/0x380 (target: init_module+0x0/0xff0 [crc32c_intel]; expected type: 0x0c7a3a22) [ 0.884802] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 0.885434] CPU: 3 UID: 0 PID: 157 Comm: modprobe Tainted: G W 6.12.0-rc3-debug-next-20241021-06324-g63b3ff03d91a #1 291f0fd70f293827edec681d3c5304f5807a3c7b [ 0.887084] Tainted: [W]=WARN [ 0.887409] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022 [ 0.888241] RIP: 0010:do_one_initcall+0x128/0x380 [ 0.888720] Code: f3 0f 1e fa 41 be ff ff ff ff e9 0f 01 00 00 0f 1f 44 00 00 41 81 e7 ff ff ff 7f 49 89 db 41 ba de c5 85 f3 45 03 53 f1 74 02 <0f> 0b 41 ff d3 0f 1f 00 41 89 c6 0f 1f 44 00 00 c6 04 24 00 65 8b [ 0.890598] RSP: 0018:ff3f93e5c052f970 EFLAGS: 00010217 [ 0.891129] RAX: ffffffffb4c105b8 RBX: ffffffffc0602010 RCX: 0000000000000000 [ 0.891850] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffc0602010 [ 0.892588] RBP: ff3f93e5c052fc88 R08: 0000000000000020 R09: 0000000000000000 [ 0.893305] R10: 000000002a378b84 R11: ffffffffc0602010 R12: 00000000000069c6 [ 0.894003] R13: ff1f0090c5596900 R14: ff1f0090c15a55c0 R15: 0000000000000000 [ 0.894693] FS: 00007ffb712c0740(0000) GS:ff1f00942fb80000(0000) knlGS:0000000000000000 [ 0.895453] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.896020] CR2: 00007ffffc4424c8 CR3: 0000000100af4002 CR4: 0000000000771ef0 [ 0.896698] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 0.897391] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 0.898077] PKRU: 55555554 [ 0.898337] Call Trace: [ 0.898577] <TASK> [ 0.898784] ? __die_body+0x6a/0xb0 [ 0.899132] ? die+0xa4/0xd0 [ 0.899413] ? do_trap+0xa6/0x180 [ 0.899740] ? do_one_initcall+0x128/0x380 [ 0.900130] ? do_one_initcall+0x128/0x380 [ 0.900523] ? handle_invalid_op+0x6a/0x90 [ 0.900917] ? do_one_initcall+0x128/0x380 [ 0.901311] ? exc_invalid_op+0x38/0x60 [ 0.901679] ? asm_exc_invalid_op+0x1a/0x20 [ 0.902081] ? __cfi_init_module+0x10/0x10 [crc32c_intel 5331566c5540f82df397056699bc4ddac8be1306] [ 0.902933] ? __cfi_init_module+0x10/0x10 [crc32c_intel 5331566c5540f82df397056699bc4ddac8be1306] [ 0.903781] ? __cfi_init_module+0x10/0x10 [crc32c_intel 5331566c5540f82df397056699bc4ddac8be1306] [ 0.904634] ? do_one_initcall+0x128/0x380 [ 0.905028] ? idr_alloc_cyclic+0x139/0x1d0 [ 0.905437] ? security_kernfs_init_security+0x54/0x190 [ 0.905958] ? __kernfs_new_node+0x1ba/0x240 [ 0.906377] ? sysfs_create_dir_ns+0x8f/0x140 [ 0.906795] ? kernfs_link_sibling+0xf2/0x110 [ 0.907211] ? kernfs_activate+0x2c/0x110 [ 0.907599] ? kernfs_add_one+0x108/0x150 [ 0.907981] ? __kernfs_create_file+0x75/0xa0 [ 0.908407] ? sysfs_create_bin_file+0xc6/0x120 [ 0.908853] ? __vunmap_range_noflush+0x347/0x420 [ 0.909313] ? _raw_spin_unlock+0xe/0x30 [ 0.909692] ? free_unref_page+0x22c/0x4c0 [ 0.910097] ? __kmalloc_cache_noprof+0x1a8/0x360 [ 0.910546] do_init_module+0x60/0x250 [ 0.910910] __se_sys_finit_module+0x316/0x420 [ 0.911351] do_syscall_64+0x88/0x170 [ 0.911699] ? __x64_sys_lseek+0x68/0xb0 [ 0.912077] ? syscall_exit_to_user_mode+0x97/0xc0 [ 0.912538] ? do_syscall_64+0x94/0x170 [ 0.912902] ? syscall_exit_to_user_mode+0x97/0xc0 [ 0.913353] ? do_syscall_64+0x94/0x170 [ 0.913709] ? clear_bhb_loop+0x45/0xa0 [ 0.914071] ? clear_bhb_loop+0x45/0xa0 [ 0.914428] ? clear_bhb_loop+0x45/0xa0 [ 0.914767] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 0.915089] RIP: 0033:0x7ffb713dc1fd [ 0.915316] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e3 fa 0c 00 f7 d8 64 89 01 48 [ 0.916491] RSP: 002b:00007ffffc4454a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 0.916964] RAX: ffffffffffffffda RBX: 000055f28c6a5420 RCX: 00007ffb713dc1fd [ 0.917413] RDX: 0000000000000000 RSI: 000055f26c40cc03 RDI: 0000000000000003 [ 0.917858] RBP: 00007ffffc445560 R08: 0000000000000001 R09: 00007ffffc4454f0 [ 0.918302] R10: 0000000000000040 R11: 0000000000000246 R12: 000055f26c40cc03 [ 0.918748] R13: 0000000000060000 R14: 000055f28c6a4b50 R15: 000055f28c6ac5b0 [ 0.919211] </TASK> [ 0.919356] Modules linked in: crc32c_intel(+) [ 0.919661] ---[ end trace 0000000000000000 ]--- I also see some other WARNs interleaved along the lines of [ 0.982759] no CFI hash found at: 0xffffffffc0608000 ffffffffc0608000 cc cc cc cc cc [ 0.982767] WARNING: CPU: 5 PID: 170 at arch/x86/kernel/alternative.c:1204 __apply_fineibt+0xa6d/0xab0 The console appears to be a bit of a mess after that initial message. If there is any more information I can provide or patches I can test, I am more than happy to do so. Cheers, Nathan # bad: [f2493655d2d3d5c6958ed996b043c821c23ae8d3] Add linux-next specific files for 20241018 # good: [6efbea77b390604a7be7364583e19cd2d6a1291b] Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux git bisect start 'f2493655d2d3d5c6958ed996b043c821c23ae8d3' '6efbea77b390604a7be7364583e19cd2d6a1291b' # bad: [7ed02555e105b27b9a680fe6a7c7bcec77ad8e82] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git git bisect bad 7ed02555e105b27b9a680fe6a7c7bcec77ad8e82 # bad: [fbf07148fc8b9810d1cd5d3c5bdf187b6cbc39fd] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git git bisect bad fbf07148fc8b9810d1cd5d3c5bdf187b6cbc39fd # bad: [b725ac161a1c9cd9fe33d1bd4e390342afff8b01] Merge branch 'for-next/core' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux git bisect bad b725ac161a1c9cd9fe33d1bd4e390342afff8b01 # good: [e38329e4c0ed720219784fe16862e0916424e381] Merge branch 'pwrseq/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git git bisect good e38329e4c0ed720219784fe16862e0916424e381 # bad: [f3752abeb12e52516d84935581f8fc30faf171cb] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git git bisect bad f3752abeb12e52516d84935581f8fc30faf171cb # good: [70d0db56c123833f540fe8efa0b6eb1ae847aacb] mm: renovate page_address_in_vma() git bisect good 70d0db56c123833f540fe8efa0b6eb1ae847aacb # good: [43b0021d7e0cdad81c83a9e6f2d0b3ebddca9cc1] mm: vmalloc: don't account for number of nodes for HUGE_VMAP allocations git bisect good 43b0021d7e0cdad81c83a9e6f2d0b3ebddca9cc1 # good: [7d0120380249b87b339b9160c2af6bcaa936e007] tools: fix -Wunused-result in linux.c git bisect good 7d0120380249b87b339b9160c2af6bcaa936e007 # bad: [31ad3c5c341be24425db3eb5779caca447ba0a83] mm: optimization on page allocation when CMA enabled git bisect bad 31ad3c5c341be24425db3eb5779caca447ba0a83 # bad: [bbec4231f196b70a4c29c106b7f065d751fba394] x86/module: prepare module loading for ROX allocations of text git bisect bad bbec4231f196b70a4c29c106b7f065d751fba394 # good: [d0ce166108ced86f2114c34ddf1794f2188b80ab] module: prepare to handle ROX allocations for text git bisect good d0ce166108ced86f2114c34ddf1794f2188b80ab # good: [dbfc5522bcf6d64bce8872c9b6d46c34569f655e] arch: introduce set_direct_map_valid_noflush() git bisect good dbfc5522bcf6d64bce8872c9b6d46c34569f655e # first bad commit: [bbec4231f196b70a4c29c106b7f065d751fba394] x86/module: prepare module loading for ROX allocations of text
Hi Nathan, On Mon, Oct 21, 2024 at 03:15:19PM -0700, Nathan Chancellor wrote: > Hi Mike, > > On Wed, Oct 16, 2024 at 03:24:22PM +0300, Mike Rapoport wrote: > > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org> > > > > When module text memory will be allocated with ROX permissions, the > > memory at the actual address where the module will live will contain > > invalid instructions and there will be a writable copy that contains the > > actual module code. > > > > Update relocations and alternatives patching to deal with it. > > > > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org> > > Sorry that you have to hear from me again :) It seems that module > loading is still broken with this version of the patch, which is > something that I missed in my earlier testing since I only test a > monolithic kernel with my regular virtual machine testing. If I build > and install the kernel and modules in the VM via a distribution package, > I get the following splat at boot: > > Starting systemd-udevd version 256.7-1-arch > [ 0.882312] SMP alternatives: Something went horribly wrong trying to rewrite the CFI implementation. > [ 0.883526] CFI failure at do_one_initcall+0x128/0x380 (target: init_module+0x0/0xff0 [crc32c_intel]; expected type: 0x0c7a3a22) > [ 0.884802] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI > [ 0.885434] CPU: 3 UID: 0 PID: 157 Comm: modprobe Tainted: G W 6.12.0-rc3-debug-next-20241021-06324-g63b3ff03d91a #1 291f0fd70f293827edec681d3c5304f5807a3c7b > [ 0.887084] Tainted: [W]=WARN > [ 0.887409] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022 > [ 0.888241] RIP: 0010:do_one_initcall+0x128/0x380 > [ 0.888720] Code: f3 0f 1e fa 41 be ff ff ff ff e9 0f 01 00 00 0f 1f 44 00 00 41 81 e7 ff ff ff 7f 49 89 db 41 ba de c5 85 f3 45 03 53 f1 74 02 <0f> 0b 41 ff d3 0f 1f 00 41 89 c6 0f 1f 44 00 00 c6 04 24 00 65 8b > [ 0.890598] RSP: 0018:ff3f93e5c052f970 EFLAGS: 00010217 > [ 0.891129] RAX: ffffffffb4c105b8 RBX: ffffffffc0602010 RCX: 0000000000000000 > [ 0.891850] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffc0602010 > [ 0.892588] RBP: ff3f93e5c052fc88 R08: 0000000000000020 R09: 0000000000000000 > [ 0.893305] R10: 000000002a378b84 R11: ffffffffc0602010 R12: 00000000000069c6 > [ 0.894003] R13: ff1f0090c5596900 R14: ff1f0090c15a55c0 R15: 0000000000000000 > [ 0.894693] FS: 00007ffb712c0740(0000) GS:ff1f00942fb80000(0000) knlGS:0000000000000000 > [ 0.895453] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 0.896020] CR2: 00007ffffc4424c8 CR3: 0000000100af4002 CR4: 0000000000771ef0 > [ 0.896698] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 0.897391] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > [ 0.898077] PKRU: 55555554 > [ 0.898337] Call Trace: > [ 0.898577] <TASK> > [ 0.898784] ? __die_body+0x6a/0xb0 > [ 0.899132] ? die+0xa4/0xd0 > [ 0.899413] ? do_trap+0xa6/0x180 > [ 0.899740] ? do_one_initcall+0x128/0x380 > [ 0.900130] ? do_one_initcall+0x128/0x380 > [ 0.900523] ? handle_invalid_op+0x6a/0x90 > [ 0.900917] ? do_one_initcall+0x128/0x380 > [ 0.901311] ? exc_invalid_op+0x38/0x60 > [ 0.901679] ? asm_exc_invalid_op+0x1a/0x20 > [ 0.902081] ? __cfi_init_module+0x10/0x10 [crc32c_intel 5331566c5540f82df397056699bc4ddac8be1306] > [ 0.902933] ? __cfi_init_module+0x10/0x10 [crc32c_intel 5331566c5540f82df397056699bc4ddac8be1306] > [ 0.903781] ? __cfi_init_module+0x10/0x10 [crc32c_intel 5331566c5540f82df397056699bc4ddac8be1306] > [ 0.904634] ? do_one_initcall+0x128/0x380 > [ 0.905028] ? idr_alloc_cyclic+0x139/0x1d0 > [ 0.905437] ? security_kernfs_init_security+0x54/0x190 > [ 0.905958] ? __kernfs_new_node+0x1ba/0x240 > [ 0.906377] ? sysfs_create_dir_ns+0x8f/0x140 > [ 0.906795] ? kernfs_link_sibling+0xf2/0x110 > [ 0.907211] ? kernfs_activate+0x2c/0x110 > [ 0.907599] ? kernfs_add_one+0x108/0x150 > [ 0.907981] ? __kernfs_create_file+0x75/0xa0 > [ 0.908407] ? sysfs_create_bin_file+0xc6/0x120 > [ 0.908853] ? __vunmap_range_noflush+0x347/0x420 > [ 0.909313] ? _raw_spin_unlock+0xe/0x30 > [ 0.909692] ? free_unref_page+0x22c/0x4c0 > [ 0.910097] ? __kmalloc_cache_noprof+0x1a8/0x360 > [ 0.910546] do_init_module+0x60/0x250 > [ 0.910910] __se_sys_finit_module+0x316/0x420 > [ 0.911351] do_syscall_64+0x88/0x170 > [ 0.911699] ? __x64_sys_lseek+0x68/0xb0 > [ 0.912077] ? syscall_exit_to_user_mode+0x97/0xc0 > [ 0.912538] ? do_syscall_64+0x94/0x170 > [ 0.912902] ? syscall_exit_to_user_mode+0x97/0xc0 > [ 0.913353] ? do_syscall_64+0x94/0x170 > [ 0.913709] ? clear_bhb_loop+0x45/0xa0 > [ 0.914071] ? clear_bhb_loop+0x45/0xa0 > [ 0.914428] ? clear_bhb_loop+0x45/0xa0 > [ 0.914767] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 0.915089] RIP: 0033:0x7ffb713dc1fd > [ 0.915316] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e3 fa 0c 00 f7 d8 64 89 01 48 > [ 0.916491] RSP: 002b:00007ffffc4454a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 > [ 0.916964] RAX: ffffffffffffffda RBX: 000055f28c6a5420 RCX: 00007ffb713dc1fd > [ 0.917413] RDX: 0000000000000000 RSI: 000055f26c40cc03 RDI: 0000000000000003 > [ 0.917858] RBP: 00007ffffc445560 R08: 0000000000000001 R09: 00007ffffc4454f0 > [ 0.918302] R10: 0000000000000040 R11: 0000000000000246 R12: 000055f26c40cc03 > [ 0.918748] R13: 0000000000060000 R14: 000055f28c6a4b50 R15: 000055f28c6ac5b0 > [ 0.919211] </TASK> > [ 0.919356] Modules linked in: crc32c_intel(+) > [ 0.919661] ---[ end trace 0000000000000000 ]--- > > I also see some other WARNs interleaved along the lines of > > [ 0.982759] no CFI hash found at: 0xffffffffc0608000 ffffffffc0608000 cc cc cc cc cc > [ 0.982767] WARNING: CPU: 5 PID: 170 at arch/x86/kernel/alternative.c:1204 __apply_fineibt+0xa6d/0xab0 > > The console appears to be a bit of a mess after that initial message. > > If there is any more information I can provide or patches I can test, I > am more than happy to do so. I've got similar report from kbuild bot a few days ago: https://lore.kernel.org/all/202410202257.b7edc376-lkp@intel.com I fixed fineibt handling in v7: https://lore.kernel.org/linux-mm/20241023162711.2579610-1-rppt@kernel.org > Cheers, > Nathan >
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index f8de31a0c5d1..e8e8b54b3037 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -435,24 +435,25 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end) +void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } -void apply_retpolines(s32 *start, s32 *end) +void apply_retpolines(s32 *start, s32 *end, struct module *mod) { } -void apply_returns(s32 *start, s32 *end) +void apply_returns(s32 *start, s32 *end, struct module *mod) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + struct module *mod) { } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8fed8b8b9cc..ed21151923c3 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -54,7 +54,8 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len)); + image->alt_len), + NULL); return 0; } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ca9ae606aab9..dc03a647776d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -96,16 +96,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; +struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); -extern void apply_retpolines(s32 *start, s32 *end); -extern void apply_returns(s32 *start, s32 *end); -extern void apply_seal_endbr(s32 *start, s32 *end); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + struct module *mod); +extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); +extern void apply_returns(s32 *start, s32 *end, struct module *mod); +extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi); - -struct module; + s32 *start_cfi, s32 *end_cfi, struct module *mod); struct callthunk_sites { s32 *call_start, *call_end; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d17518ca19b8..cf782f431110 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,8 +392,10 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, + struct module *mod) { + u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -403,14 +405,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) } if (a->instrlen != 6 || - instr[0] != CALL_RIP_REL_OPCODE || - instr[1] != CALL_RIP_REL_MODRM) { + wr_instr[0] != CALL_RIP_REL_OPCODE || + wr_instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(instr + 2); + disp = *(s32 *)(wr_instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -448,7 +450,8 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end) + struct alt_instr *end, + struct module *mod) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -477,6 +480,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -490,7 +494,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); + wr_instr = module_writable_address(mod, instr); + replacement = (u8 *)&a->repl_offset + a->repl_offset; + wr_replacement = module_writable_address(mod, replacement); + BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -501,9 +509,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, instr, a->instrlen); + memcpy(insn_buff, wr_instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(instr, insn_buff, a->instrlen); + text_poke_early(wr_instr, insn_buff, a->instrlen); continue; } @@ -513,11 +521,12 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, replacement, a->replacementlen); + memcpy(insn_buff, wr_replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a); + insn_buff_sz = alt_replace_call(instr, insn_buff, a, + mod); if (insn_buff_sz < 0) continue; } @@ -527,11 +536,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insn_buff, insn_buff_sz); + text_poke_early(wr_instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -722,18 +731,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -761,9 +772,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } @@ -799,7 +810,8 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { s32 *s; @@ -808,12 +820,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -833,32 +846,35 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } #else -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr); +static void poison_cfi(void *addr, void *wr_addr); -static void __init_or_module poison_endbr(void *addr, bool warn) +static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) return; if (!is_endbr(endbr)) { @@ -873,7 +889,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + text_poke_early(wr_addr, &poison, 4); } /* @@ -882,22 +898,23 @@ static void __init_or_module poison_endbr(void *addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, true); + poison_endbr(addr, wr_addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16); + poison_cfi(addr - 16, wr_addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1119,7 +1136,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end) +static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1131,20 +1148,23 @@ static int cfi_disable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); + if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, jmp, 2); + text_poke_early(wr_addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end) +static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. @@ -1154,26 +1174,29 @@ static int cfi_enable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, mov, 2); + text_poke_early(wr_addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end) +static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; hash = decode_preamble_hash(addr); @@ -1182,18 +1205,19 @@ static int cfi_rand_preamble(s32 *start, s32 *end) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(addr + 1, &hash, 4); + text_poke_early(wr_addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end) +static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; hash = decode_preamble_hash(addr); @@ -1201,59 +1225,64 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) addr, addr, 5, addr)) return -EINVAL; - text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end) +static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr+16, false); + poison_endbr(addr+16, wr_addr, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end) +static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(addr + 2, &hash, 4); + text_poke_early(wr_addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end) +static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { - text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(addr + fineibt_caller_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1262,8 +1291,9 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { + bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1281,7 +1311,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); + ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1292,11 +1322,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi); + ret = cfi_rand_preamble(start_cfi, end_cfi, mod); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline); + ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; } @@ -1308,7 +1338,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline); + ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1318,17 +1348,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi); + ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi); + cfi_rewrite_endbr(start_cfi, end_cfi, mod); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1347,7 +1377,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr) +static void poison_cfi(void *addr, void *wr_addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1359,8 +1389,8 @@ static void poison_cfi(void *addr) * ud2 * 1: nop */ - poison_endbr(addr, false); - poison_hash(addr + fineibt_preamble_hash); + poison_endbr(addr, wr_addr, false); + poison_hash(wr_addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1369,7 +1399,7 @@ static void poison_cfi(void *addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(addr + 1); + poison_hash(wr_addr + 1); break; default: @@ -1380,22 +1410,21 @@ static void poison_cfi(void *addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr) { } +static void poison_cfi(void *addr, void *wr_addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, - /* .builtin = */ false); + start_cfi, end_cfi, mod); } #ifdef CONFIG_SMP @@ -1692,16 +1721,16 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, true); + __cfi_sites, __cfi_sites_end, NULL); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end); - apply_returns(__return_sites, __return_sites_end); + apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); + apply_returns(__return_sites, __return_sites_end, NULL); - apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); /* * Now all calls are established. Apply the call thunks if @@ -1712,7 +1741,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8da0e66ca22d..b498897b213c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) + if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - else - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + } else { + mutex_lock(&text_mutex); + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); + } return 0; } @@ -318,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - int ret; + void *ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -349,15 +352,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); - if (WARN_ON(ret < 0)) + ret = text_poke_copy(trampoline, (void *)start_offset, size); + if (WARN_ON(!ret)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - memcpy(ip, retq, sizeof(retq)); + text_poke_copy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -365,8 +368,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); - if (ret < 0) + if (!text_poke_copy(ip, x86_nops[2], 2)) goto fail; } @@ -379,7 +381,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - *ptr = (unsigned long)ops; + text_poke_copy(ptr, &ops, sizeof(unsigned long)); op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -395,7 +397,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -405,9 +407,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE); + text_poke_copy_locked(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE, false); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 837450b6e882..8984abd91c00 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -146,18 +146,21 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - if (memcmp(loc, &zero, size)) { + void *wr_loc = module_writable_address(me, loc); + + if (memcmp(wr_loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(loc, &val, size); + write(wr_loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } + /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -224,7 +227,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -233,8 +236,6 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -265,20 +266,20 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size); + apply_retpolines(rseg, rseg + retpolines->sh_size, me); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size); + apply_returns(rseg, rseg + returns->sh_size, me); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); + apply_alternatives(aseg, aseg + alt->sh_size, me); } if (calls || alt) { struct callthunk_sites cs = {}; @@ -297,8 +298,28 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); } + + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + + return 0; +} + +int module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *locks = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + } + if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -308,10 +329,6 @@ int module_finalize(const Elf_Ehdr *hdr, text, text_end); } - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - return 0; } diff --git a/include/linux/module.h b/include/linux/module.h index 7039f609c6ef..2c1a24ba99a3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -376,15 +376,6 @@ struct module_memory { #endif }; -#ifdef CONFIG_MODULES -void *module_writable_address(struct module *mod, void *loc); -#else -static inline void *module_writable_address(struct module *mod, void *loc) -{ - return loc; -} -#endif - #ifdef CONFIG_MODULES_TREE_LOOKUP /* Only touch one cacheline for common rbtree-for-core-layout case. */ #define __module_memory_align ____cacheline_aligned @@ -778,6 +769,8 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); +void *module_writable_address(struct module *mod, void *loc); + #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -885,6 +878,11 @@ static inline bool module_is_coming(struct module *mod) { return false; } + +static inline void *module_writable_address(struct module *mod, void *loc) +{ + return loc; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS