Message ID: 20220316100132.244849-4-bobo.shaobowang@huawei.com (mailing list archive)
State: New, archived
Series: arm64/ftrace: support dynamic trampoline
On Wed, Mar 16, 2022 at 06:01:31PM +0800, Wang ShaoBo wrote: > From: Cheng Jian <cj.chengjian@huawei.com> > > When tracing multiple functions customly, a list function is called > in ftrace_(regs)_caller, which makes all the other traced functions > recheck the hash of the ftrace_ops when tracing happend, apparently > it is inefficient. ... and when does that actually matter? Who does this and why? > We notified X86_64 has delivered a dynamically allocated trampoline > which calls the callback directly to solve this problem. This patch > introduces similar trampolines for ARM64, but we also should also > handle long jump different. > > Similar to X86_64, save the local ftrace_ops at the end. > the ftrace trampoline layout: > > ftrace_(regs_)caller_tramp low > `--> +---------------------+ ^ > | ftrace_regs_entry: | | > | ... | | > +---------------------+ | > | ftrace_common: | | > | ... | | > | ldr x2, <ftrace_ops>| | > ftrace callsite | ... | | > `--> +---------------------+ | > | nop >> bl <callback> > ftrace graph callsite | PLT entrys (TODO) | | > `--> +---------------------+ | > | nop >> bl <graph callback> > | PLT entrys (TODO) | | > +---------------------+ | > | ftrace_regs_restore:| | > | ... | | > +---------------------+ | > | ftrace_ops | | > +---------------------+ | > high Chengming Zhou has patches to unify the two calls in the trampoline: https://lore.kernel.org/linux-arm-kernel/20220420160006.17880-1-zhouchengming@bytedance.com/ ... and I'd much prefer that we do that *before* we copy-paste that code for dynamic trampolines. > Adopting the new dynamical trampoline is benificial when only one > callback is registered to a function, when tracing happened, ftrace_ops > can get its own callback directly from this trampoline allocated. > > To compare the efficiency of calling schedule() when using traditional > dynamic ftrace(using ftrace_ops_list) and ftrace using dynamic trampoline, > We wrote a module to register multiple ftrace_ops, but only one(in our > testcase is 'schedule') is used to measure the performance on the call > path used by Unixbench. > > This is the partial code: > ``` > static struct ftrace_ops *ops_array[100]; > static struct ftrace_ops ops = { > .func = my_callback_func, > .flags = FTRACE_OPS_FL_PID, > }; > static int register_ftrace_test_init(void) > { > int i, ret; > > if (cnt > 100 || cnt < 0) > return -1; > > for (i = 0; i < cnt; i++) { > ops_array[i] = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); > if (!ops_array[i]) > return -1; > > *ops_array[i] = ops; > > ret = ftrace_set_filter(ops_array[i], "cpuset_write_u64", > strlen("cpuset_write_u64"), 1); > if (ret) > return ret; > > ret = register_ftrace_function(ops_array[i]); > if (ret) > return ret; > } > > ret = ftrace_set_filter(&ops, "schedule", strlen("schedule"), 1); > if (ret) > return ret; > return register_ftrace_function(&ops); > } > ``` > > So in our test, ftrace_ops_list can be illustrated: > > HEAD(ftrace_ops_list) > `-> +--------+ +--------+ +--------+ +------------+ +--------+ > + ops0 + -> + ops1 + -> + ops2 + ... 
+ ops(cnt-1)+ -> + ops + > +--------+ +--------+ +--------+ +------------+ +--------+ \_END > `-------------cpuset_write_u64----------------` `schedule` > > This is the result testing on kunpeng920 with CONFIG_RANDOMIZE_BASE=n > (we also add first NA row for comparison with not tracing any function): > > command: taskset -c 1 ./Run -c 1 context1 > +------------------UnixBench context1--------------------+ > +---------+--------------+-----------------+-------------+ > + + Score + improvement + > + +--------------+-----------------+-------------+ > + + traditional + dynamic tramp + +/- + > +---------+--------------+-----------------+-------------+ > + NA + 554.6 + 553.6 + - + > +---------+--------------+-----------------+-------------+ > + cnt=0 + 542.4 + 549.7 + +1.2% + > +---------+--------------+-----------------+-------------+ > + cnt=3 + 528.7 + 549.7 + +3.9% + > +---------+--------------+-----------------+-------------+ > + cnt=6 + 523.9 + 549.8 + +4.9% + > +---------+--------------+-----------------+-------------+ > + cnt=15 + 504.2 + 549.0 + +8.9% + > +---------+--------------+-----------------+-------------+ > + cnt=20 + 492.6 + 551.1 + +11.9% + > +---------+--------------+-----------------+-------------+ > > References: > - https://lore.kernel.org/linux-arm-kernel/20200109142736.1122-1-cj.chengjian@huawei.com/ > > Signed-off-by: Cheng Jian <cj.chengjian@huawei.com> > Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com> > --- > arch/arm64/kernel/ftrace.c | 286 +++++++++++++++++++++++++++++++++++++ > 1 file changed, 286 insertions(+) > > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c > index 4506c4a90ac1..d35a042baf75 100644 > --- a/arch/arm64/kernel/ftrace.c > +++ b/arch/arm64/kernel/ftrace.c > @@ -10,12 +10,14 @@ > #include <linux/module.h> > #include <linux/swab.h> > #include <linux/uaccess.h> > +#include <linux/vmalloc.h> > > #include <asm/cacheflush.h> > #include <asm/debug-monitors.h> > #include <asm/ftrace.h> > #include <asm/insn.h> > #include <asm/patching.h> > +#include <asm/set_memory.h> > > #ifdef CONFIG_DYNAMIC_FTRACE > /* > @@ -48,6 +50,290 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, > return 0; > } > > +/* ftrace dynamic trampolines */ > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS > +#ifdef CONFIG_MODULES > +#include <linux/moduleloader.h> > + > +static inline void *alloc_tramp(unsigned long size) > +{ > + return module_alloc(size); > +} > + > +static inline void tramp_free(void *tramp) > +{ > + module_memfree(tramp); > +} > +#else > +static inline void *alloc_tramp(unsigned long size) > +{ > + return NULL; > +} > + > +static inline void tramp_free(void *tramp) {} > +#endif > + > +/* ftrace_caller trampoline template */ > +extern void ftrace_caller_tramp(void); > +extern void ftrace_caller_tramp_end(void); > +extern void ftrace_caller_tramp_call(void); > +extern void ftrace_caller_tramp_graph_call(void); > +extern void ftrace_caller_tramp_ops(void); > + > +/* ftrace_regs_caller trampoline template */ > +extern void ftrace_regs_caller_tramp(void); > +extern void ftrace_regs_caller_tramp_end(void); > +extern void ftrace_regs_caller_tramp_call(void); > +extern void ftrace_regs_caller_tramp_graph_call(void); > +extern void ftrace_regs_caller_tramp_ops(void); > + > + > +/* > + * The ARM64 ftrace trampoline layout : > + * > + * > + * ftrace_(regs_)caller_tramp low > + * `--> +---------------------+ ^ > + * | ftrace_regs_entry: | | > + * | ... | | > + * +---------------------+ | > + * | ftrace_common: | | > + * | ... 
| | > + * | ldr x2, <ftrace_ops>| | > + * ftrace callsite | ... | | > + * `--> +---------------------+ | > + * | nop >> bl <callback> > + * ftrace graph callsite | PLT entrys (TODO) | | > + * `--> +---------------------+ | > + * | nop >> bl <graph callback> > + * | PLT entrys (TODO) | | > + * +---------------------+ | > + * | ftrace_regs_restore:| | > + * | ... | | > + * +---------------------+ | > + * | ftrace_ops | | > + * +---------------------+ | > + * high > + */ As above, I'd prefer we unify the callsite and graph callsite first. > + > +static unsigned long > +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) > +{ > + unsigned long start_offset, end_offset, ops_offset; > + unsigned long caller_size, size, offset; > + unsigned long pc, npages; > + unsigned long *ptr; > + void *trampoline; > + u32 load; > + int ret; > + > + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { > + start_offset = (unsigned long)ftrace_regs_caller_tramp; > + end_offset = (unsigned long)ftrace_regs_caller_tramp_end; > + ops_offset = (unsigned long)ftrace_regs_caller_tramp_ops; > + } else { > + start_offset = (unsigned long)ftrace_caller_tramp; > + end_offset = (unsigned long)ftrace_caller_tramp_end; > + ops_offset = (unsigned long)ftrace_caller_tramp_ops; > + } > + > + caller_size = end_offset - start_offset + AARCH64_INSN_SIZE; > + size = caller_size + sizeof(void *); > + > + trampoline = alloc_tramp(size); > + if (!trampoline) > + return 0; > + > + *tramp_size = size; > + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); > + > + /* > + * copy ftrace_caller/ftrace_regs_caller trampoline template > + * onto the trampoline memory > + */ > + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, caller_size); > + if (WARN_ON(ret < 0)) > + goto free; Why is this using copy_from_kernel_nofault() ? There's no reason whatsoever that this should legitimately fault, so it should be fine to use memcpy(). > + > + /* > + * Stored ftrace_ops at the end of the trampoline. > + * This will be used to load the third parameter for the callback. > + * Basically, that location at the end of the trampoline takes the > + * place of the global function_trace_op variable. > + */ > + ptr = (unsigned long *)(trampoline + caller_size); > + *ptr = (unsigned long)ops; > + > + /* > + * Update the trampoline ops REF > + * ldr x2, <ftrace_ops> > + */ > + ops_offset -= start_offset; > + pc = (unsigned long)trampoline + ops_offset; > + offset = (unsigned long)ptr - pc; > + if (WARN_ON(offset % AARCH64_INSN_SIZE != 0)) > + goto free; > + > + load = aarch64_insn_gen_load_literal(AARCH64_INSN_REG_2, > + AARCH64_INSN_VARIANT_64BIT, offset); > + if (ftrace_modify_code(pc, 0, load, false)) > + goto free; > + > + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; > + > + set_vm_flush_reset_perms(trampoline); > + > + /* > + * Module allocation needs to be completed by making the page > + * executable. The page is still writable, which is a security hazard, > + * but anyhow ftrace breaks W^X completely. > + */ > + set_memory_ro((unsigned long)trampoline, npages); > + set_memory_x((unsigned long)trampoline, npages); > + Where is the cache maintenance performed? 
> + return (unsigned long)trampoline; > + > +free: > + tramp_free(trampoline); > + return 0; > +} > + > +static unsigned long calc_trampoline_call_offset(bool save_regs, bool is_graph) > +{ > + unsigned start_offset, call_offset; > + > + if (save_regs) { > + start_offset = (unsigned long)ftrace_regs_caller_tramp; > + if (is_graph) > + call_offset = (unsigned long)ftrace_regs_caller_tramp_graph_call; > + else > + call_offset = (unsigned long)ftrace_regs_caller_tramp_call; > + } else { > + start_offset = (unsigned long)ftrace_caller_tramp; > + if (is_graph) > + call_offset = (unsigned long)ftrace_caller_tramp_graph_call; > + else > + call_offset = (unsigned long)ftrace_caller_tramp_call; > + } This should be simplified by unifying the call sites. Since we need the offsets in a few places, we should factor that out. For example, have a structure with those offsets, and a helper which returns that filled with either the regular offsets or the regs offsts. More ideally, fill those in at compile-time. > + > + return call_offset - start_offset; > +} > + > +static inline ftrace_func_t > +ftrace_trampoline_get_func(struct ftrace_ops *ops, bool is_graph) > +{ > + ftrace_func_t func; > + > + if (!is_graph) > + func = ftrace_ops_get_func(ops); > +#ifdef CONFIG_FUNCTION_GRAPH_TRACER > + else > + func = (ftrace_func_t)ftrace_graph_caller; > +#endif > + > + return func; > +} > + > +static int > +ftrace_trampoline_modify_call(struct ftrace_ops *ops, bool is_graph, bool active) > +{ > + unsigned long offset; > + unsigned long pc; > + ftrace_func_t func; > + u32 nop, branch; > + > + offset = calc_trampoline_call_offset(ops->flags & > + FTRACE_OPS_FL_SAVE_REGS, is_graph); > + pc = ops->trampoline + offset; > + > + func = ftrace_trampoline_get_func(ops, is_graph); > + nop = aarch64_insn_gen_nop(); > + branch = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, > + AARCH64_INSN_BRANCH_LINK); > + > + if (active) > + return ftrace_modify_code(pc, 0, branch, false); > + else > + return ftrace_modify_code(pc, 0, nop, false); > +} > + > +extern int ftrace_graph_active; > +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) > +{ > + unsigned int size; > + > + /* > + * If kaslr is enabled, the address of tramp and ftrace call > + * may be far away, Therefore, long jump support is required. > + */ > + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) > + return; IIUC this can be required in some unusual configurations *today* even without KASLR, so we probably want to add the long jump support first. Thanks, Mark. 
> + > + if (!ops->trampoline) { > + ops->trampoline = create_trampoline(ops, &size); > + if (!ops->trampoline) > + return; > + ops->trampoline_size = size; > + } > + > + /* These is no static trampoline on ARM64 */ > + WARN_ON(!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)); > + > + /* atomic modify trampoline: <call func> */ > + WARN_ON(ftrace_trampoline_modify_call(ops, false, true)); > +#ifdef CONFIG_FUNCTION_GRAPH_TRACER > + /* atomic modify trampoline: <call graph func> */ > + WARN_ON(ftrace_trampoline_modify_call(ops, true, ftrace_graph_active)); > +#endif > +} > + > +static void *addr_from_call(void *ptr) > +{ > + u32 insn; > + unsigned long offset; > + > + if (aarch64_insn_read(ptr, &insn)) > + return NULL; > + > + /* Make sure this is a call */ > + if (!aarch64_insn_is_bl(insn)) { > + pr_warn("Expected bl instruction, but not\n"); > + return NULL; > + } > + > + offset = aarch64_get_branch_offset(insn); > + > + return (void *)ptr + AARCH64_INSN_SIZE + offset; > +} > + > +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, > + unsigned long frame_pointer); > + > +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) > +{ > + unsigned long offset; > + > + /* If we didn't allocate this trampoline, consider it static */ > + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) > + return NULL; > + > + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS, > + ftrace_graph_active); > + > + return addr_from_call((void *)ops->trampoline + offset); > +} > + > +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) > +{ > + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) > + return; > + > + tramp_free((void *)ops->trampoline); > + ops->trampoline = 0; > + ops->trampoline_size = 0; > +} > +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ > + > /* > * Replace tracer function in ftrace_caller() > */ > -- > 2.25.1 >
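On the cache-maintenance question raised in the review above, here is a minimal, hedged sketch of one place such maintenance could sit in create_trampoline(), assuming flush_icache_range() is an appropriate primitive for newly written trampoline text; the helper name tramp_sync_icache() is invented for illustration and is not part of the posted patch.

```
#include <asm/cacheflush.h>

/*
 * Illustrative sketch only: synchronise the caches for the newly written
 * trampoline before anything can branch to it.  Whether this (or one of the
 * insn-patching helpers) is the right primitive, and where it belongs
 * relative to set_memory_ro()/set_memory_x(), is exactly the open question
 * raised in the review above.
 */
static void tramp_sync_icache(void *trampoline, unsigned long size)
{
	flush_icache_range((unsigned long)trampoline,
			   (unsigned long)trampoline + size);
}
```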
On Thu, 21 Apr 2022 14:10:04 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > On Wed, Mar 16, 2022 at 06:01:31PM +0800, Wang ShaoBo wrote: > > From: Cheng Jian <cj.chengjian@huawei.com> > > > > When tracing multiple functions customly, a list function is called > > in ftrace_(regs)_caller, which makes all the other traced functions > > recheck the hash of the ftrace_ops when tracing happend, apparently > > it is inefficient. > > ... and when does that actually matter? Who does this and why? I don't think it was explained properly. What dynamically allocated trampolines give you is this. Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can be quite common). But each of these ftrace_ops traces a function (or functions) that are not being traced by the other ftrace_ops. That is, each ftrace_ops has its own unique function(s) that they are tracing. One could be tracing schedule, the other could be tracing ksoftirqd_should_run (whatever). Without this change, because the arch does not support dynamically allocated trampolines, it means that all these ftrace_ops will be registered to the same trampoline. That means, for every function that is traced, it will loop through all 10 of theses ftrace_ops and check their hashes to see if their callback should be called or not. With dynamically allocated trampolines, each ftrace_ops will have their own trampoline, and that trampoline will be called directly if the function is only being traced by the one ftrace_ops. This is much more efficient. If a function is traced by more than one ftrace_ops, then it falls back to the loop. -- Steve
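A minimal sketch of the scenario described above: two ftrace_ops, each filtered to a different function, so neither callback should ever see the other's function. The callback bodies and names are invented for illustration and error handling is omitted.

```
#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/string.h>

static void trace_sched(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct ftrace_regs *fregs) { }
static void trace_softirqd(unsigned long ip, unsigned long parent_ip,
			   struct ftrace_ops *op, struct ftrace_regs *fregs) { }

static struct ftrace_ops ops_sched    = { .func = trace_sched };
static struct ftrace_ops ops_softirqd = { .func = trace_softirqd };

static int __init example_register(void)
{
	ftrace_set_filter(&ops_sched, "schedule", strlen("schedule"), 1);
	ftrace_set_filter(&ops_softirqd, "ksoftirqd_should_run",
			  strlen("ksoftirqd_should_run"), 1);

	/*
	 * Without per-ops trampolines, both callbacks sit behind the same
	 * shared trampoline and every traced call walks the ops list; with
	 * them, each callsite can branch straight to its single callback.
	 */
	register_ftrace_function(&ops_sched);
	register_ftrace_function(&ops_softirqd);
	return 0;
}
```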
On Thu, 21 Apr 2022 10:06:39 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > Without this change, because the arch does not support dynamically > allocated trampolines, it means that all these ftrace_ops will be > registered to the same trampoline. That means, for every function that is > traced, it will loop through all 10 of theses ftrace_ops and check their > hashes to see if their callback should be called or not. Oh, I forgot to mention. If you then enable function tracer that traces *every function*, then *every function* will be going through this loop of the 10 ftrace_ops! With dynamically allocated trampolines, then most functions will just call the trampoline that handles the function tracer directly. -- Steve
On Thu, Apr 21, 2022 at 10:06:39AM -0400, Steven Rostedt wrote: > On Thu, 21 Apr 2022 14:10:04 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > On Wed, Mar 16, 2022 at 06:01:31PM +0800, Wang ShaoBo wrote: > > > From: Cheng Jian <cj.chengjian@huawei.com> > > > > > > When tracing multiple functions customly, a list function is called > > > in ftrace_(regs)_caller, which makes all the other traced functions > > > recheck the hash of the ftrace_ops when tracing happend, apparently > > > it is inefficient. > > > > ... and when does that actually matter? Who does this and why? > > I don't think it was explained properly. What dynamically allocated > trampolines give you is this. Thanks for the, explanation, btw! > Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can > be quite common). But each of these ftrace_ops traces a function (or > functions) that are not being traced by the other ftrace_ops. That is, each > ftrace_ops has its own unique function(s) that they are tracing. One could > be tracing schedule, the other could be tracing ksoftirqd_should_run > (whatever). Ok, so that's when messing around with bpf or kprobes, and not generally when using plain old ftrace functionality under /sys/kernel/tracing/ (unless that's concurrent with one of the former, as per your other reply) ? > Without this change, because the arch does not support dynamically > allocated trampolines, it means that all these ftrace_ops will be > registered to the same trampoline. That means, for every function that is > traced, it will loop through all 10 of theses ftrace_ops and check their > hashes to see if their callback should be called or not. Sure; I can see how that can be quite expensive. What I'm trying to figure out is who this matters to and when, since the implementation is going to come with a bunch of subtle/fractal complexities, and likely a substantial overhead too when enabling or disabling tracing of a patch-site. I'd like to understand the trade-offs better. > With dynamically allocated trampolines, each ftrace_ops will have their own > trampoline, and that trampoline will be called directly if the function > is only being traced by the one ftrace_ops. This is much more efficient. > > If a function is traced by more than one ftrace_ops, then it falls back to > the loop. I see -- so the dynamic trampoline is just to get the ops? Or is that doing additional things? There might be a middle-ground here where we patch the ftrace_ops pointer into a literal pool at the patch-site, which would allow us to handle this atomically, and would avoid the issues with out-of-range trampolines. Thanks, Mark.
On Thu, 21 Apr 2022 16:14:13 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > > Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can > > be quite common). But each of these ftrace_ops traces a function (or > > functions) that are not being traced by the other ftrace_ops. That is, each > > ftrace_ops has its own unique function(s) that they are tracing. One could > > be tracing schedule, the other could be tracing ksoftirqd_should_run > > (whatever). > > Ok, so that's when messing around with bpf or kprobes, and not generally > when using plain old ftrace functionality under /sys/kernel/tracing/ > (unless that's concurrent with one of the former, as per your other > reply) ? It's any user of the ftrace infrastructure, which includes kprobes, bpf, perf, function tracing, function graph tracing, and also affects instances. > > > Without this change, because the arch does not support dynamically > > allocated trampolines, it means that all these ftrace_ops will be > > registered to the same trampoline. That means, for every function that is > > traced, it will loop through all 10 of theses ftrace_ops and check their > > hashes to see if their callback should be called or not. > > Sure; I can see how that can be quite expensive. > > What I'm trying to figure out is who this matters to and when, since the > implementation is going to come with a bunch of subtle/fractal > complexities, and likely a substantial overhead too when enabling or > disabling tracing of a patch-site. I'd like to understand the trade-offs > better. > > > With dynamically allocated trampolines, each ftrace_ops will have their own > > trampoline, and that trampoline will be called directly if the function > > is only being traced by the one ftrace_ops. This is much more efficient. > > > > If a function is traced by more than one ftrace_ops, then it falls back to > > the loop. > > I see -- so the dynamic trampoline is just to get the ops? Or is that > doing additional things? It's to get both the ftrace_ops (as that's one of the parameters) as well as to call the callback directly. Not sure if arm is affected by spectre, but the "loop" function is filled with indirect function calls, where as the dynamic trampolines call the callback directly. Instead of: bl ftrace_caller ftrace_caller: [..] bl ftrace_ops_list_func [..] void ftrace_ops_list_func(...) { __do_for_each_ftrace_ops(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip)) // test the hash to see if it // should trace this // function. op->func(...); } } It does: bl dyanmic_tramp dynamic_tramp: [..] bl func // call the op->func directly! Much more efficient! > > There might be a middle-ground here where we patch the ftrace_ops > pointer into a literal pool at the patch-site, which would allow us to > handle this atomically, and would avoid the issues with out-of-range > trampolines. Have an example of what you are suggesting? -- Steve
On Thu, Apr 21, 2022 at 11:42:01AM -0400, Steven Rostedt wrote: > On Thu, 21 Apr 2022 16:14:13 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > > Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can > > > be quite common). But each of these ftrace_ops traces a function (or > > > functions) that are not being traced by the other ftrace_ops. That is, each > > > ftrace_ops has its own unique function(s) that they are tracing. One could > > > be tracing schedule, the other could be tracing ksoftirqd_should_run > > > (whatever). > > > > Ok, so that's when messing around with bpf or kprobes, and not generally > > when using plain old ftrace functionality under /sys/kernel/tracing/ > > (unless that's concurrent with one of the former, as per your other > > reply) ? > > It's any user of the ftrace infrastructure, which includes kprobes, bpf, > perf, function tracing, function graph tracing, and also affects instances. > > > > > > Without this change, because the arch does not support dynamically > > > allocated trampolines, it means that all these ftrace_ops will be > > > registered to the same trampoline. That means, for every function that is > > > traced, it will loop through all 10 of theses ftrace_ops and check their > > > hashes to see if their callback should be called or not. > > > > Sure; I can see how that can be quite expensive. > > > > What I'm trying to figure out is who this matters to and when, since the > > implementation is going to come with a bunch of subtle/fractal > > complexities, and likely a substantial overhead too when enabling or > > disabling tracing of a patch-site. I'd like to understand the trade-offs > > better. > > > > > With dynamically allocated trampolines, each ftrace_ops will have their own > > > trampoline, and that trampoline will be called directly if the function > > > is only being traced by the one ftrace_ops. This is much more efficient. > > > > > > If a function is traced by more than one ftrace_ops, then it falls back to > > > the loop. > > > > I see -- so the dynamic trampoline is just to get the ops? Or is that > > doing additional things? > > It's to get both the ftrace_ops (as that's one of the parameters) as well > as to call the callback directly. Not sure if arm is affected by spectre, > but the "loop" function is filled with indirect function calls, where as > the dynamic trampolines call the callback directly. > > Instead of: > > bl ftrace_caller > > ftrace_caller: > [..] > bl ftrace_ops_list_func > [..] > > > void ftrace_ops_list_func(...) > { > __do_for_each_ftrace_ops(op, ftrace_ops_list) { > if (ftrace_ops_test(op, ip)) // test the hash to see if it > // should trace this > // function. > op->func(...); > } > } > > It does: > > bl dyanmic_tramp > > dynamic_tramp: > [..] > bl func // call the op->func directly! > > > Much more efficient! > > > > > > There might be a middle-ground here where we patch the ftrace_ops > > pointer into a literal pool at the patch-site, which would allow us to > > handle this atomically, and would avoid the issues with out-of-range > > trampolines. > > Have an example of what you are suggesting? We can make the compiler to place 2 NOPs before the function entry point, and 2 NOPs after it using `-fpatchable-function-entry=4,2` (the arguments are <total>,<before>). On arm64 all instructions are 4 bytes, and we'll use the first two NOPs as an 8-byte literal pool. 
Ignoring BTI for now, the compiler generates (with some magic labels added here for demonstration): __before_func: NOP NOP func: NOP NOP __remainder_of_func: ... At ftrace_init_nop() time we patch that to: __before_func: // treat the 2 NOPs as an 8-byte literal-pool .quad <default ops pointer> // see below func: MOV X9, X30 NOP __remainder_of_func: ... When enabling tracing we do __before_func: // patch this with the relevant ops pointer .quad <ops pointer> func: MOV X9, X30 BL <trampoline> // common trampoline __remainder_of_func: .. The `BL <trampoline>` clobbers X30 with __remainder_of_func, so within the trampoline we can find the ops pointer at an offset from X30. On arm64 we can load that directly with something like: LDR <tmp>, [X30, # -(__remainder_of_func - __before_func)] ... then load the ops->func from that and invoke it (or pass it to a helper which does): // Ignoring the function arguments for this demonstration LDR <tmp2>, [<tmp>, #OPS_FUNC_OFFSET] BLR <tmp2> That avoids iterating over the list *without* requiring separate trampolines, and allows us to patch the sequence without requiring stop-the-world logic (since arm64 has strong requirements for patching most instructions other than branches and nops). We can initialize the ops pointer to a default ops that does the whole __do_for_each_ftrace_ops() dance. To handle BTI we can have two trampolines, or we can always reserve 3 NOPs before the function so that we can have a consistent offset regardless. Thanks, Mark.
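In C, the trampoline-side lookup sketched above might look roughly like the following. The 16-byte offset is an assumption tied to exactly the layout shown (an 8-byte literal formed from the two pre-function NOPs, followed by the two patched instructions at the function entry), and callsite_ops()/call_callsite_ops() are invented names, not existing kernel interfaces.

```
#include <linux/compiler.h>
#include <linux/ftrace.h>

/*
 * Hedged sketch: after the callsite's BL, the link register points at
 * __remainder_of_func, which in the layout above sits 16 bytes past the
 * 8-byte literal holding the ftrace_ops pointer.  READ_ONCE() reflects
 * that the literal is patched concurrently with execution.
 */
static inline struct ftrace_ops *callsite_ops(unsigned long lr)
{
	return READ_ONCE(*(struct ftrace_ops **)(lr - 16));
}

/* Schematic helper the common trampoline could hand control to. */
static void call_callsite_ops(unsigned long ip, unsigned long parent_ip,
			      unsigned long lr, struct ftrace_regs *fregs)
{
	struct ftrace_ops *op = callsite_ops(lr);

	op->func(ip, parent_ip, op, fregs);
}
```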
On Thu, 21 Apr 2022 17:27:40 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > We can initialize the ops pointer to a default ops that does the whole > __do_for_each_ftrace_ops() dance. OK, I think I understand now. What you are doing is instead of creating a trampoline that has all the information in the trampoline, you add nops to all the functions where you can place the information in the nops (before the function), and then have the trampoline just read that information to find the ops pointer as well as the function to call. I guess you could have two trampolines as well. One that always calls the list loop, and one that calls the data stored in front of the function that was just called the trampoline. As it is always safe to call the loop function, you could have the call call that trampoline first, set up the specific data before the function, then call the trampoline that will read it. And same thing for tear down. -- Steve
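A hedged sketch of that enable ordering, assuming the "specific data before the function" is the literal-pool ops pointer from Mark's layout and that the branch is rewritten with the existing arm64 insn-patching helper. The function and parameter names are invented, and the barrier choice is an assumption rather than a claim about what would actually be required.

```
#include <linux/compiler.h>
#include <linux/ftrace.h>
#include <linux/types.h>
#include <asm/barrier.h>
#include <asm/patching.h>

static int callsite_switch_to_direct(struct ftrace_ops **literal,
				     void *bl_site, struct ftrace_ops *ops,
				     u32 bl_to_reading_tramp)
{
	/*
	 * 1) Publish the per-callsite ops pointer.  The branch currently
	 *    installed still targets the always-safe list-loop trampoline,
	 *    which never reads this slot.
	 */
	WRITE_ONCE(*literal, ops);
	smp_wmb();	/* make the pointer visible before it can be read */

	/* 2) Retarget the callsite at the trampoline that reads the slot. */
	return aarch64_insn_patch_text_nosync(bl_site, bl_to_reading_tramp);
}
```

Teardown would reverse the steps: repoint the branch back at the list-loop trampoline first, then clear or repurpose the literal.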
On Thu, Apr 21, 2022 at 01:06:48PM -0400, Steven Rostedt wrote: > On Thu, 21 Apr 2022 17:27:40 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > We can initialize the ops pointer to a default ops that does the whole > > __do_for_each_ftrace_ops() dance. > > OK, I think I understand now. What you are doing is instead of creating a > trampoline that has all the information in the trampoline, you add nops to > all the functions where you can place the information in the nops (before > the function), and then have the trampoline just read that information to > find the ops pointer as well as the function to call. Yup! > I guess you could have two trampolines as well. One that always calls the > list loop, and one that calls the data stored in front of the function that > was just called the trampoline. As it is always safe to call the loop > function, you could have the call call that trampoline first, set up the > specific data before the function, then call the trampoline that will read > it. And same thing for tear down. Having separate trampolines is possible, but there are some complications, and we might end up with an explosion of trampolines (and associated module PLTs) due to needing BTI/!BTI and REGs/!REGS variants, so if it's possible to have a default ops that handled the list case, that'd be my preference to keep that simple and manageable. As an aside, I'd also love to remove the REGS/!REGs distinction, and always save a minimum amount of state (like ARGS, but never saving a full pt_regs), since on arm64 the extra state stored for the REGS case isn't useful (and we can't reliably capture all of the pt_regs state anyway, so bits of it are made up or not filled in). Thanks, Mark.
On Fri, 22 Apr 2022 11:12:39 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > As an aside, I'd also love to remove the REGS/!REGs distinction, and always > save a minimum amount of state (like ARGS, but never saving a full pt_regs), > since on arm64 the extra state stored for the REGS case isn't useful (and we > can't reliably capture all of the pt_regs state anyway, so bits of it are made > up or not filled in). Note, the reason for the addition of REGS was a requirement of kprobes. Because before ftrace, kprobes would be triggered at the start of a function by a breakpoint that would load in all the regs. And for backward compatibility, Masami wanted to make sure that kprobes coming from ftrace had all the regs just like it had when coming from a breakpoint. IIUC, kprobes is the only reason we have the "regs" variant (all other use cases could get by with the ARGS version). -- Steve
On Fri, Apr 22, 2022 at 11:45:41AM -0400, Steven Rostedt wrote: > On Fri, 22 Apr 2022 11:12:39 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > As an aside, I'd also love to remove the REGS/!REGs distinction, and always > > save a minimum amount of state (like ARGS, but never saving a full pt_regs), > > since on arm64 the extra state stored for the REGS case isn't useful (and we > > can't reliably capture all of the pt_regs state anyway, so bits of it are made > > up or not filled in). > > Note, the reason for the addition of REGS was a requirement of kprobes. > Because before ftrace, kprobes would be triggered at the start of a > function by a breakpoint that would load in all the regs. And for backward > compatibility, Masami wanted to make sure that kprobes coming from ftrace > had all the regs just like it had when coming from a breakpoint. > > IIUC, kprobes is the only reason we have the "regs" variant (all other use > cases could get by with the ARGS version). I see. FWIW, we don't have KPROBES_ON_FTRACE on arm64. Also, the same problems apply to KRETPROBES: the synthetic `pstate` value is bogus and we don't fill in other bits of the regs (e.g. the PMR value), so it's not a "real" pt_regs, and things like interrupts_enabled(regs) won't work correctly. In addition, as KRETPROBES only hooks function entry/exit and x9-x17 + x19-x28 are meaningless at those times, no-one's going to care what they contain anyway. The state we can correctly snapshot (and that would be useful) is the same as ARGS. It'd be nice if KRETPROBES could just use ARGS, but a standard KPROBE that traps could provide regs (since it actually gets "real" regs, and within a function the other GPRs could be important). Thanks, Mark.
Hi Mark, On Fri, 22 Apr 2022 18:27:53 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > On Fri, Apr 22, 2022 at 11:45:41AM -0400, Steven Rostedt wrote: > > On Fri, 22 Apr 2022 11:12:39 +0100 > > Mark Rutland <mark.rutland@arm.com> wrote: > > > > > As an aside, I'd also love to remove the REGS/!REGs distinction, and always > > > save a minimum amount of state (like ARGS, but never saving a full pt_regs), > > > since on arm64 the extra state stored for the REGS case isn't useful (and we > > > can't reliably capture all of the pt_regs state anyway, so bits of it are made > > > up or not filled in). > > > > Note, the reason for the addition of REGS was a requirement of kprobes. > > Because before ftrace, kprobes would be triggered at the start of a > > function by a breakpoint that would load in all the regs. And for backward > > compatibility, Masami wanted to make sure that kprobes coming from ftrace > > had all the regs just like it had when coming from a breakpoint. Yes. Since this kprobes->ftrace conversion is done by kprobes transparently, user doesn't know their kprobe handler is called from sw break or ftrace. > > > > IIUC, kprobes is the only reason we have the "regs" variant (all other use > > cases could get by with the ARGS version). > > I see. FWIW, we don't have KPROBES_ON_FTRACE on arm64. Right. Since x86 fentry puts the entry on function address, I need such compatibility. But on arm64, ftrace leads some preparation instructions, kprobes can put the sw break on the function address there. And may not need to put the kprobes on it. So it depends on arch. I would like to keep the kprobes available at the function address so that it can trace any registers. (like debugger usage) > Also, the same problems apply to KRETPROBES: the synthetic `pstate` > value is bogus and we don't fill in other bits of the regs (e.g. the PMR > value), so it's not a "real" pt_regs, and things like > interrupts_enabled(regs) won't work correctly. Would you mean the process which kprobes_save/restore_local_irqflag() does? Is the regs->pstate saved correctly in sw break or ftrace? (sorry, I missed the context) > In addition, as > KRETPROBES only hooks function entry/exit and x9-x17 + x19-x28 are > meaningless at those times, no-one's going to care what they contain > anyway. It depends on what bug they are trying to trace. C source level bug will not need such information, but assembly level bug (or compiler level bug) may need such registers. Anyway, this also depends on user. I just won't like limit the usage. > The state we can correctly snapshot (and that would be useful) > is the same as ARGS. > > It'd be nice if KRETPROBES could just use ARGS, but a standard KPROBE > that traps could provide regs (since it actually gets "real" regs, and > within a function the other GPRs could be important). Here, the KRETPROBES means the exit handler, or including entry handler? Since kretprobes uses a standard kprobe to trap the function entry. If you talk about fprobes (ftrace probe interface), it will only use the ftrace. Thus your idea is acceptable for it (because fprobe is different from kprobes *). * Of course we have to talk with BPF people so that they will only access ARGS from BPF program on fprobes. Thank you,
On 2022/4/21 22:06, Steven Rostedt wrote: > On Thu, 21 Apr 2022 14:10:04 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > >> On Wed, Mar 16, 2022 at 06:01:31PM +0800, Wang ShaoBo wrote: >>> From: Cheng Jian <cj.chengjian@huawei.com> >>> >>> When tracing multiple functions customly, a list function is called >>> in ftrace_(regs)_caller, which makes all the other traced functions >>> recheck the hash of the ftrace_ops when tracing happend, apparently >>> it is inefficient. >> ... and when does that actually matter? Who does this and why? > I don't think it was explained properly. What dynamically allocated > trampolines give you is this. > > Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can > be quite common). But each of these ftrace_ops traces a function (or > functions) that are not being traced by the other ftrace_ops. That is, each > ftrace_ops has its own unique function(s) that they are tracing. One could > be tracing schedule, the other could be tracing ksoftirqd_should_run > (whatever). > > Without this change, because the arch does not support dynamically > allocated trampolines, it means that all these ftrace_ops will be > registered to the same trampoline. That means, for every function that is > traced, it will loop through all 10 of theses ftrace_ops and check their > hashes to see if their callback should be called or not. > > With dynamically allocated trampolines, each ftrace_ops will have their own > trampoline, and that trampoline will be called directly if the function > is only being traced by the one ftrace_ops. This is much more efficient. > > If a function is traced by more than one ftrace_ops, then it falls back to > the loop. > > -- Steve > . Yes, this explanation is easier to understand; I will update the commit description accordingly. -- Wang ShaoBo
On Tue, Apr 26, 2022 at 05:47:49PM +0900, Masami Hiramatsu wrote: > Hi Mark, > > On Fri, 22 Apr 2022 18:27:53 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > On Fri, Apr 22, 2022 at 11:45:41AM -0400, Steven Rostedt wrote: > > > On Fri, 22 Apr 2022 11:12:39 +0100 > > > Mark Rutland <mark.rutland@arm.com> wrote: > > > > > > > As an aside, I'd also love to remove the REGS/!REGs distinction, and always > > > > save a minimum amount of state (like ARGS, but never saving a full pt_regs), > > > > since on arm64 the extra state stored for the REGS case isn't useful (and we > > > > can't reliably capture all of the pt_regs state anyway, so bits of it are made > > > > up or not filled in). > > > > > > Note, the reason for the addition of REGS was a requirement of kprobes. > > > Because before ftrace, kprobes would be triggered at the start of a > > > function by a breakpoint that would load in all the regs. And for backward > > > compatibility, Masami wanted to make sure that kprobes coming from ftrace > > > had all the regs just like it had when coming from a breakpoint. > > Yes. Since this kprobes->ftrace conversion is done by kprobes transparently, > user doesn't know their kprobe handler is called from sw break or ftrace. The big problem is that on arm64 kprobes and ftrace are *very* different, and we can't do that transparently (unless both had a mode which just provided the ARGS). > > > IIUC, kprobes is the only reason we have the "regs" variant (all other use > > > cases could get by with the ARGS version). > > > > I see. FWIW, we don't have KPROBES_ON_FTRACE on arm64. > > Right. Since x86 fentry puts the entry on function address, I need such > compatibility. > > But on arm64, ftrace leads some preparation instructions, kprobes can put > the sw break on the function address there. And may not need to put the > kprobes on it. So it depends on arch. I would like to keep the kprobes > available at the function address so that it can trace any registers. > (like debugger usage) > > > Also, the same problems apply to KRETPROBES: the synthetic `pstate` > > value is bogus and we don't fill in other bits of the regs (e.g. the PMR > > value), so it's not a "real" pt_regs, and things like > > interrupts_enabled(regs) won't work correctly. > > Would you mean the process which kprobes_save/restore_local_irqflag() does? > Is the regs->pstate saved correctly in sw break or ftrace? (sorry, I missed > the context) For `BRK` (SW break) instructions we take an exception, PSTATE (and all of the struct pt_regs) is saved correctly. For ftrace, PSTATE (and other bits of pt_regs) are not saved correctly. Practically speaking it's not feasible to do so reliably without taking an exception, which is why I'd like to reduce ftrace down to just the ARGs. > > In addition, as > > KRETPROBES only hooks function entry/exit and x9-x17 + x19-x28 are > > meaningless at those times, no-one's going to care what they contain > > anyway. > > It depends on what bug they are trying to trace. C source level bug > will not need such information, but assembly level bug (or compiler > level bug) may need such registers. Anyway, this also depends on user. > I just won't like limit the usage. If that's how kretprobes are intended to be used, then I think they must *always* use a BRK as that's the only way to reliably get a complete struct pt_regs. I've implemented that: https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/commit/?h=arm64/kprobes/kretprobe-brk-trampoline&id=80be4ccbf47b0294a02b05b797cbff36364bc435 ... 
and can send that out as a patch shortly. > > The state we can correctly snapshot (and that would be useful) > > is the same as ARGS. > > > > It'd be nice if KRETPROBES could just use ARGS, but a standard KPROBE > > that traps could provide regs (since it actually gets "real" regs, and > > within a function the other GPRs could be important). > > Here, the KRETPROBES means the exit handler, or including entry handler? > Since kretprobes uses a standard kprobe to trap the function entry. I'm suggesting that (if there are cases where kretprobes are only used to acquire arguments and return vaalues, and not other state), we change things so that kretprobes can use a different entry handler from a regular kprobe, and that new handler only has to acquire the arguments and return values, matching the ftrace ARGS. That way we can have: a) A regular kprobe, which uses BRK as today, and gets a full pt_regs b) A regular kretprobe, which uses BRK for both entry/exit, and gets a full pt_regs in both cases. c) An args-only kretprobe, where the entry/exit handlers only present the ARGS to the registered handlers. If kretprobes always needs the full pt_regs, then (c) isn't necessary, and we don't need to change anything more than the kretprobes trampoline, as above. What I really want to get away from it kretprobes and ftrace having an incomplete pt_regs, and the two ways of doing that are: 1) Save/restore the full pt_regs by taking an exception. 2) Save/restore a minimal ARGS structure, and ensure the interfaces don't have to pass a struct pt_regs pointer. For kretprobes I can implement either, but for ftrace (2) is the only real option. Thanks, Mark. > If you talk about fprobes (ftrace probe interface), it will only use the > ftrace. Thus your idea is acceptable for it (because fprobe is different > from kprobes *). > > * Of course we have to talk with BPF people so that they will only access > ARGS from BPF program on fprobes. > > Thank you, > > -- > Masami Hiramatsu <mhiramat@kernel.org>
On Thu, Apr 21, 2022 at 01:06:48PM -0400, Steven Rostedt wrote: > On Thu, 21 Apr 2022 17:27:40 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > We can initialize the ops pointer to a default ops that does the whole > > __do_for_each_ftrace_ops() dance. > > OK, I think I understand now. What you are doing is instead of creating a > trampoline that has all the information in the trampoline, you add nops to > all the functions where you can place the information in the nops (before > the function), and then have the trampoline just read that information to > find the ops pointer as well as the function to call. FWIW, I had a go at mocking that up: https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/ftrace/per-callsite-ops Aside from some bodges required to ensure the patch site is suitably aligned (which I think can be cleaned up somewhat), I don't think it looks that bad. I wasn't sure how exactly to wire that up in the core code, so all the patch sites are initialized with a default ops that calls arch_ftrace_ops_list_func(), but it looks like it should be possible to wire that up in the core with some refactoring. > I guess you could have two trampolines as well. One that always calls the > list loop, and one that calls the data stored in front of the function that > was just called the trampoline. As it is always safe to call the loop > function, you could have the call call that trampoline first, set up the > specific data before the function, then call the trampoline that will read > it. I was thinking we could just patch the ops with a default ops that called the list loop, as my patches default them to. > And same thing for tear down. I wasn't sure how teardown was meant to work in general. When we want to remove an ops structure, or a trampoline, how do we ensure those are no longer in use before we remove them? I can see how we can synchronize the updates to the kernel text, but I couldn't spot how we handle a thread being in the middle of a trampoline. Thanks, Mark.
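On the teardown question, here is a hedged sketch of one possible ordering, based on my reading that the core relies on Tasks RCU (as used for x86's dynamic trampolines) to guarantee no task is still preempted inside a trampoline before it is freed; treat the helper below as an assumption about how arm64 might mirror that, not a description of existing code, and note that the callsites must already have been repointed away from the trampoline beforehand.

```
#include <linux/ftrace.h>
#include <linux/moduleloader.h>
#include <linux/rcupdate.h>

static void example_release_trampoline(struct ftrace_ops *ops)
{
	/*
	 * Callsites no longer branch into ops->trampoline at this point.
	 * Tasks never sleep inside a trampoline, so once every task has
	 * passed a voluntary scheduling point none of them can still be
	 * executing it.
	 */
	synchronize_rcu_tasks();

	module_memfree((void *)ops->trampoline);
	ops->trampoline = 0;
	ops->trampoline_size = 0;
}
```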
在 2022/4/22 0:27, Mark Rutland 写道: > On Thu, Apr 21, 2022 at 11:42:01AM -0400, Steven Rostedt wrote: >> On Thu, 21 Apr 2022 16:14:13 +0100 >> Mark Rutland <mark.rutland@arm.com> wrote: >> >>>> Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can >>>> be quite common). But each of these ftrace_ops traces a function (or >>>> functions) that are not being traced by the other ftrace_ops. That is, each >>>> ftrace_ops has its own unique function(s) that they are tracing. One could >>>> be tracing schedule, the other could be tracing ksoftirqd_should_run >>>> (whatever). >>> Ok, so that's when messing around with bpf or kprobes, and not generally >>> when using plain old ftrace functionality under /sys/kernel/tracing/ >>> (unless that's concurrent with one of the former, as per your other >>> reply) ? >> It's any user of the ftrace infrastructure, which includes kprobes, bpf, >> perf, function tracing, function graph tracing, and also affects instances. >> >>>> Without this change, because the arch does not support dynamically >>>> allocated trampolines, it means that all these ftrace_ops will be >>>> registered to the same trampoline. That means, for every function that is >>>> traced, it will loop through all 10 of theses ftrace_ops and check their >>>> hashes to see if their callback should be called or not. >>> Sure; I can see how that can be quite expensive. >>> >>> What I'm trying to figure out is who this matters to and when, since the >>> implementation is going to come with a bunch of subtle/fractal >>> complexities, and likely a substantial overhead too when enabling or >>> disabling tracing of a patch-site. I'd like to understand the trade-offs >>> better. >>> >>>> With dynamically allocated trampolines, each ftrace_ops will have their own >>>> trampoline, and that trampoline will be called directly if the function >>>> is only being traced by the one ftrace_ops. This is much more efficient. >>>> >>>> If a function is traced by more than one ftrace_ops, then it falls back to >>>> the loop. >>> I see -- so the dynamic trampoline is just to get the ops? Or is that >>> doing additional things? >> It's to get both the ftrace_ops (as that's one of the parameters) as well >> as to call the callback directly. Not sure if arm is affected by spectre, >> but the "loop" function is filled with indirect function calls, where as >> the dynamic trampolines call the callback directly. >> >> Instead of: >> >> bl ftrace_caller >> >> ftrace_caller: >> [..] >> bl ftrace_ops_list_func >> [..] >> >> >> void ftrace_ops_list_func(...) >> { >> __do_for_each_ftrace_ops(op, ftrace_ops_list) { >> if (ftrace_ops_test(op, ip)) // test the hash to see if it >> // should trace this >> // function. >> op->func(...); >> } >> } >> >> It does: >> >> bl dyanmic_tramp >> >> dynamic_tramp: >> [..] >> bl func // call the op->func directly! >> >> >> Much more efficient! >> >> >>> There might be a middle-ground here where we patch the ftrace_ops >>> pointer into a literal pool at the patch-site, which would allow us to >>> handle this atomically, and would avoid the issues with out-of-range >>> trampolines. >> Have an example of what you are suggesting? > We can make the compiler to place 2 NOPs before the function entry point, and 2 > NOPs after it using `-fpatchable-function-entry=4,2` (the arguments are > <total>,<before>). On arm64 all instructions are 4 bytes, and we'll use the > first two NOPs as an 8-byte literal pool. 
> > Ignoring BTI for now, the compiler generates (with some magic labels added here > for demonstration): > > __before_func: > NOP > NOP > func: > NOP > NOP > __remainder_of_func: > ... > > At ftrace_init_nop() time we patch that to: > > __before_func: > // treat the 2 NOPs as an 8-byte literal-pool > .quad <default ops pointer> // see below > func: > MOV X9, X30 > NOP > __remainder_of_func: > ... > > When enabling tracing we do > > __before_func: > // patch this with the relevant ops pointer > .quad <ops pointer> > func: > MOV X9, X30 > BL <trampoline> // common trampoline I have a question: is this common trampoline allocated by module_alloc()? If yes, how do we handle the long jump from the traced function to the common trampoline if we only add two NOPs in front of the function? -- Wang ShaoBo > __remainder_of_func: > .. > > The `BL <trampoline>` clobbers X30 with __remainder_of_func, so within > the trampoline we can find the ops pointer at an offset from X30. On > arm64 we can load that directly with something like: > > LDR <tmp>, [X30, # -(__remainder_of_func - __before_func)] > > ... then load the ops->func from that and invoke it (or pass it to a > helper which does): > > // Ignoring the function arguments for this demonstration > LDR <tmp2>, [<tmp>, #OPS_FUNC_OFFSET] > BLR <tmp2> > > That avoids iterating over the list *without* requiring separate > trampolines, and allows us to patch the sequence without requiring > stop-the-world logic (since arm64 has strong requirements for patching > most instructions other than branches and nops). > > We can initialize the ops pointer to a default ops that does the whole > __do_for_each_ftrace_ops() dance. > > To handle BTI we can have two trampolines, or we can always reserve 3 NOPs > before the function so that we can have a consistent offset regardless. > > Thanks, > Mark. > .
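For context on the long-jump question above, a small hedged sketch of the constraint involved: an arm64 BL/B encodes a 26-bit signed word offset, i.e. roughly a ±128MB reach, so a direct branch from a traced function to a module_alloc()'d trampoline only works when the two happen to land within that window. The helper below is illustrative only.

```
#include <linux/sizes.h>
#include <linux/types.h>

/* Can `pc` reach `target` with a single BL/B (26-bit signed word offset)? */
static bool in_bl_range(unsigned long pc, unsigned long target)
{
	long offset = (long)target - (long)pc;

	return offset >= -(long)SZ_128M && offset < (long)SZ_128M;
}
```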
Hi Mark, On Wed, 4 May 2022 11:24:43 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > On Tue, Apr 26, 2022 at 05:47:49PM +0900, Masami Hiramatsu wrote: > > Hi Mark, > > > > On Fri, 22 Apr 2022 18:27:53 +0100 > > Mark Rutland <mark.rutland@arm.com> wrote: > > > > > On Fri, Apr 22, 2022 at 11:45:41AM -0400, Steven Rostedt wrote: > > > > On Fri, 22 Apr 2022 11:12:39 +0100 > > > > Mark Rutland <mark.rutland@arm.com> wrote: > > > > > > > > > As an aside, I'd also love to remove the REGS/!REGs distinction, and always > > > > > save a minimum amount of state (like ARGS, but never saving a full pt_regs), > > > > > since on arm64 the extra state stored for the REGS case isn't useful (and we > > > > > can't reliably capture all of the pt_regs state anyway, so bits of it are made > > > > > up or not filled in). > > > > > > > > Note, the reason for the addition of REGS was a requirement of kprobes. > > > > Because before ftrace, kprobes would be triggered at the start of a > > > > function by a breakpoint that would load in all the regs. And for backward > > > > compatibility, Masami wanted to make sure that kprobes coming from ftrace > > > > had all the regs just like it had when coming from a breakpoint. > > > > Yes. Since this kprobes->ftrace conversion is done by kprobes transparently, > > user doesn't know their kprobe handler is called from sw break or ftrace. > > The big problem is that on arm64 kprobes and ftrace are *very* different, and > we can't do that transparently (unless both had a mode which just provided the > ARGS). OK, in that case, we can just drop the KPROBE_ON_FTRACE support on arm64. A problem I thought in this case, is that the fprobe which I introduced recently for eBPF is expecting to access pt_regs. I would like to know that what kind of limitation exists if the ftrace on arm64 makes pt_regs. Only PSTATE is a problem? > > > > > IIUC, kprobes is the only reason we have the "regs" variant (all other use > > > > cases could get by with the ARGS version). > > > > > > I see. FWIW, we don't have KPROBES_ON_FTRACE on arm64. > > > > Right. Since x86 fentry puts the entry on function address, I need such > > compatibility. > > > > But on arm64, ftrace leads some preparation instructions, kprobes can put > > the sw break on the function address there. And may not need to put the > > kprobes on it. So it depends on arch. I would like to keep the kprobes > > available at the function address so that it can trace any registers. > > (like debugger usage) > > > > > Also, the same problems apply to KRETPROBES: the synthetic `pstate` > > > value is bogus and we don't fill in other bits of the regs (e.g. the PMR > > > value), so it's not a "real" pt_regs, and things like > > > interrupts_enabled(regs) won't work correctly. > > > > Would you mean the process which kprobes_save/restore_local_irqflag() does? > > Is the regs->pstate saved correctly in sw break or ftrace? (sorry, I missed > > the context) > > For `BRK` (SW break) instructions we take an exception, PSTATE (and all of the > struct pt_regs) is saved correctly. Great, thanks for the confirmation! > > For ftrace, PSTATE (and other bits of pt_regs) are not saved correctly. > Practically speaking it's not feasible to do so reliably without taking an > exception, which is why I'd like to reduce ftrace down to just the ARGs. OK. But my interest is that the ftrace on arm64 can provide a limited access to registers via pt_regs or not. 
I don't mind the contained values so much because in the most case, 'users' will (most likely) access to the ARGs via BPF or tracefs (and we can just warn users if they try to access the registers which is not saved.) But if the arm64 ftrace only provides a special data structure, arch-independent code must have 2 different access code. That is inefficient. That is my concern. IOW, I'm interested in interface abstraction. > > > In addition, as > > > KRETPROBES only hooks function entry/exit and x9-x17 + x19-x28 are > > > meaningless at those times, no-one's going to care what they contain > > > anyway. > > > > It depends on what bug they are trying to trace. C source level bug > > will not need such information, but assembly level bug (or compiler > > level bug) may need such registers. Anyway, this also depends on user. > > I just won't like limit the usage. > > If that's how kretprobes are intended to be used, then I think they must > *always* use a BRK as that's the only way to reliably get a complete struct > pt_regs. > > I've implemented that: > > https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/commit/?h=arm64/kprobes/kretprobe-brk-trampoline&id=80be4ccbf47b0294a02b05b797cbff36364bc435 > > ... and can send that out as a patch shortly. Hm, this is reasonable, and not hard to port it for rethook on arm64. Just FYI, the rethook will be used for kprobe and fprobe (ftrace based probe, which was introduced for batch BPF probe [1]). kprobe will be used for assembly level debug, but fprobe may not. Thus, eventually those should have 2 different trampolines specified by rethook 'flags'. But for the first step, I think we can use BRK trampoline by default. [1] https://lore.kernel.org/all/20220316122419.933957-1-jolsa@kernel.org/T/#u > > > The state we can correctly snapshot (and that would be useful) > > > is the same as ARGS. > > > > > > It'd be nice if KRETPROBES could just use ARGS, but a standard KPROBE > > > that traps could provide regs (since it actually gets "real" regs, and > > > within a function the other GPRs could be important). > > > > Here, the KRETPROBES means the exit handler, or including entry handler? > > Since kretprobes uses a standard kprobe to trap the function entry. > > I'm suggesting that (if there are cases where kretprobes are only used to > acquire arguments and return vaalues, and not other state), we change things so > that kretprobes can use a different entry handler from a regular kprobe, and > that new handler only has to acquire the arguments and return values, matching > the ftrace ARGS. I would like to move this feature on fprobe. Since the kretprobe is used widely, it is hard to change the interface. Now fprobe is only used for BPF, so I think there is a chance to change it. So there are 2 options I think. - Keep fprobe and kprobe current interface (use pt_regs) but only the registers for argument and return values are accessible (hopefully, the interrupt state etc. too) from fprobes. With this, the BPF can use same program code for kprobes and fprobe, but BPF compiler will be able to reject user program which access the non-saved registers. - Introduce a new data structure which saves argument and return value for fprobe. BPF runtime program and BPF compiler needs to be changed for this. > > That way we can have: > > a) A regular kprobe, which uses BRK as today, and gets a full pt_regs > > b) A regular kretprobe, which uses BRK for both entry/exit, and gets a full > pt_regs in both cases. 
> > c) An args-only kretprobe, where the entry/exit handlers only present the ARGS > to the registered handlers. > > If kretprobes always needs the full pt_regs, then (c) isn't necessary, and we > don't need to change anything more than the kretprobes trampoline, as above. I'm OK for the args-only, but that is not kretprobe, we need another one and should remove current kretprobe in this case for avoiding confusion. I think fprobe is a good candidate because it is not widely used yet. Anyway, we need to talk with BPF people about this idea. As a compromise plan, keep using pt_regs but filling a part of them on arm64. > What I really want to get away from it kretprobes and ftrace having an > incomplete pt_regs, and the two ways of doing that are: > > 1) Save/restore the full pt_regs by taking an exception. > > 2) Save/restore a minimal ARGS structure, and ensure the interfaces don't have > to pass a struct pt_regs pointer. > > For kretprobes I can implement either, but for ftrace (2) is the only real > option. Hmm, I'm still not sure why you can't pass an incomplete pt_regs? (to optimize stack usage??) Anyway, I think your kretprobe patch is perfect. I think it is good for kretprobe. Thank you, > > Thanks, > Mark. > > > If you talk about fprobes (ftrace probe interface), it will only use the > > ftrace. Thus your idea is acceptable for it (because fprobe is different > > from kprobes *). > > > > * Of course we have to talk with BPF people so that they will only access > > ARGS from BPF program on fprobes. > > > > Thank you, > > > > -- > > Masami Hiramatsu <mhiramat@kernel.org>
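To make the second option discussed above concrete, here is a minimal sketch of a dedicated argument/return-value container for fprobe. Every name in it is invented for illustration; nothing here is an existing kernel interface, and the actual shape would be decided with the BPF folks.

```c
/*
 * Hypothetical sketch of "option 2" above: a structure carrying only
 * what a function entry/exit probe can rely on. All names are made up
 * for illustration; this is not an existing kernel API.
 */
#define FPROBE_MAX_ARGS	8	/* assumption: x0-x7 argument registers on arm64 */

struct fprobe_args_regs {
	unsigned long args[FPROBE_MAX_ARGS];	/* argument registers at entry */
	unsigned long retval;			/* return value (exit handler only) */
	unsigned long ip;			/* probed function address */
	unsigned long sp;			/* stack pointer at entry */
};
```

A BPF front end would then only see these fields, instead of a pt_regs where some members happen to be stale.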
On Thu, 5 May 2022 12:15:38 +0900 Masami Hiramatsu <mhiramat@kernel.org> wrote: > OK. But my interest is that the ftrace on arm64 can provide a limited > access to registers via pt_regs or not. I don't mind the contained values > so much because in the most case, 'users' will (most likely) access to the > ARGs via BPF or tracefs (and we can just warn users if they try to access > the registers which is not saved.) But if the arm64 ftrace only provides > a special data structure, arch-independent code must have 2 different access > code. That is inefficient. That is my concern. > IOW, I'm interested in interface abstraction. Note, ftrace now has a ftrace_regs structure that is passed to the callbacks for the function tracer. It then has an arch-dependent helper function, ftrace_get_regs(fregs), that returns a pt_regs from the fregs only if the fregs has a full pt_regs to return. If not, it returns NULL. This was suggested by both Peter Zijlstra and Thomas Gleixner when I introduced FTRACE_WITH_ARGS, where all functions can now get the arguments from fregs, but not the full pt_regs. If a ftrace_ops has the REGS flag set (using ftrace_regs_caller), ftrace_get_regs(fregs) will return the pt_regs, or it will return NULL if ftrace_regs_caller was not used. This way the same parameter can provide full pt_regs or a subset, and have a generic interface to tell the difference. -- Steve
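As a concrete illustration of the interface Steve describes, a minimal callback sketch follows. It relies only on the ftrace_func_t callback signature and the ftrace_get_regs() helper named above.

```c
#include <linux/ftrace.h>

/*
 * Minimal sketch: the same callback works whether or not the
 * registering ftrace_ops asked for FTRACE_OPS_FL_SAVE_REGS.
 */
static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (!regs) {
		/* registered without SAVE_REGS: only the ARGS subset is valid */
		return;
	}

	/* full pt_regs are available here (ftrace_regs_caller was used) */
}
```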
Hi Steve, On Mon, 9 May 2022 14:22:03 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Thu, 5 May 2022 12:15:38 +0900 > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > OK. But my interest is that the ftrace on arm64 can provide a limited > > access to registers via pt_regs or not. I don't mind the contained values > > so much because in the most case, 'users' will (most likely) access to the > > ARGs via BPF or tracefs (and we can just warn users if they try to access > > the registers which is not saved.) But if the arm64 ftrace only provides > > a special data structure, arch-independent code must have 2 different access > > code. That is inefficient. That is my concern. > > IOW, I'm interested in interface abstraction. > > Note, ftrace now has a ftrace_regs structure that is passed to the > callbacks for the function tracer. > > It then has an arch dependent helper function ftrace_get_regs(fregs), that > returns a pt_regs from the fregs only if the fregs has a full pt_regs to > return. If not, it returns NULL. > > This was suggested by both Peter Zijlstra and Thomas Gleixner when I > introduced FTRACE_WITH_ARGS, where all functions can now get the arguments > from fregs, but not the full pt_regs. Hmm, I thought ftrace_get_regs() was an all-or-nothing interface; is there any way to get the arguments from fregs? > If a ftrace_ops has the REGS flag set > (using ftrace_regs_caller), the ftrace_get_regs(fregs) will return the > pt_regs, or it will return NULL if ftrace_regs_caller was not used. > > This way the same parameter can provide full pt_regs or a subset, and have > an generic interface to tell the difference. If it can provide a partial (subset of) pt_regs, that could be good for me too, since at least kprobe-events on ftrace can check whether the traced register is in the subset or not and reject it if it isn't saved. Thank you,
On Tue, 10 May 2022 18:10:12 +0900 Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > This was suggested by both Peter Zijlstra and Thomas Gleixner when I > > introduced FTRACE_WITH_ARGS, where all functions can now get the arguments > > from fregs, but not the full pt_regs. > > Hmm, I thought the ftrace_get_regs() is the all-or-nothing interface, or > is there any way to get the arguments from fregs? Not yet generically. But that can easily be added. If you look at x86 live patching, since it is arch specific, it just took the regs parameter directly, knowing that the args were already set up. That is, ftrace_regs is just a wrapper around pt_regs with just the regs for the arguments and stack initialized. If you get regs from ftrace_get_regs(fregs) it will return NULL if it wasn't full set of regs. But we can add generic functions to get the parameters. That is, we can create a ftrace_get_kernel_argument() function that takes fregs instead of pt_regs, and produce the same thing as regs_get_kernel_argument(). x86 live kernel patching has this: arch/x86/include/asm/ftrace.h: #define ftrace_instruction_pointer_set(fregs, _ip) \ do { (fregs)->regs.ip = (_ip); } while (0) arch/x86/include/asm/livepatch.h: static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { ftrace_instruction_pointer_set(fregs, ip); } Where fregs is not a full set of regs. > > > If a ftrace_ops has the REGS flag set > > (using ftrace_regs_caller), the ftrace_get_regs(fregs) will return the > > pt_regs, or it will return NULL if ftrace_regs_caller was not used. > > > > This way the same parameter can provide full pt_regs or a subset, and have > > an generic interface to tell the difference. > > If it can provide a partial (subset of) pt_regs, that could be good for me > too, since at least kprobe-events on ftrace can check the traced register > is in the subset or not and reject it if it doesn't saved. That's exactly the use case for ftrace_regs. -- Steve
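A sketch of the generic helper Steve proposes; it does not exist today, and the body below simply assumes the x86 layout he quotes, where struct ftrace_regs wraps a pt_regs ('regs') whose argument and stack registers are initialized.

```c
/*
 * Sketch only: a generic argument accessor taking ftrace_regs, as
 * proposed above. Written for the x86 layout where ftrace_regs embeds
 * a pt_regs with the argument/stack registers filled in; other
 * architectures would supply their own definition.
 */
static inline unsigned long
ftrace_regs_get_kernel_argument(struct ftrace_regs *fregs, unsigned int n)
{
	return regs_get_kernel_argument(&fregs->regs, n);
}
```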
On Tue, 10 May 2022 10:44:46 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Tue, 10 May 2022 18:10:12 +0900 > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > > This was suggested by both Peter Zijlstra and Thomas Gleixner when I > > > introduced FTRACE_WITH_ARGS, where all functions can now get the arguments > > > from fregs, but not the full pt_regs. > > > > Hmm, I thought the ftrace_get_regs() is the all-or-nothing interface, or > > is there any way to get the arguments from fregs? > > Not yet generically. But that can easily be added. If you look at x86 live > patching, since it is arch specific, it just took the regs parameter > directly, knowing that the args were already set up. That is, ftrace_regs is > just a wrapper around pt_regs with just the regs for the arguments and stack > initialized. If you get regs from ftrace_get_regs(fregs) it will return > NULL if it wasn't full set of regs. But we can add generic functions to get > the parameters. > > That is, we can create a ftrace_get_kernel_argument() function that takes > fregs instead of pt_regs, and produce the same thing as > regs_get_kernel_argument(). > > x86 live kernel patching has this: > > arch/x86/include/asm/ftrace.h: > > #define ftrace_instruction_pointer_set(fregs, _ip) \ > do { (fregs)->regs.ip = (_ip); } while (0) > > > arch/x86/include/asm/livepatch.h: > > static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) > { > ftrace_instruction_pointer_set(fregs, ip); > } > > Where fregs is not a full set of regs. OK, so fregs::regs will have a subset of pt_regs, and accessibility of the registers depends on the architecture. If we can have a checker like ftrace_regs_exist(fregs, reg_offset) kprobe on ftrace or fprobe user (BPF) can filter user's requests. I think I can introduce a flag for kprobes so that user can make a kprobe handler only using a subset of registers. Maybe similar filter code is also needed for BPF 'user space' library because this check must be done when compiling BPF. Thank you, > > > > > If a ftrace_ops has the REGS flag set > > > (using ftrace_regs_caller), the ftrace_get_regs(fregs) will return the > > > pt_regs, or it will return NULL if ftrace_regs_caller was not used. > > > > > > This way the same parameter can provide full pt_regs or a subset, and have > > > an generic interface to tell the difference. > > > > If it can provide a partial (subset of) pt_regs, that could be good for me > > too, since at least kprobe-events on ftrace can check the traced register > > is in the subset or not and reject it if it doesn't saved. > > That's exactly the use case for ftrace_regs. > > -- Steve
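A rough sketch of what the ftrace_regs_exist() checker mentioned above could look like; both the helper and the per-arch mask are hypothetical, shown only to illustrate how kprobe-events or a BPF front end might validate a requested register offset.

```c
/*
 * Hypothetical sketch: each architecture would statically describe
 * which pt_regs slots its ftrace trampoline saves, so probe users can
 * be rejected when they ask for anything else. Neither the mask nor
 * the helper exists in the kernel today.
 */
static inline bool ftrace_regs_exist(const struct ftrace_regs *fregs,
				     unsigned int reg_offset)
{
	/* ARCH_FTRACE_SAVED_REGS_MASK: assumed per-arch bitmap of saved slots */
	return ARCH_FTRACE_SAVED_REGS_MASK &
	       BIT_ULL(reg_offset / sizeof(unsigned long));
}
```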
On Wed, 11 May 2022 23:34:50 +0900 Masami Hiramatsu <mhiramat@kernel.org> wrote: > OK, so fregs::regs will have a subset of pt_regs, and accessibility of > the registers depends on the architecture. If we can have a checker like > > ftrace_regs_exist(fregs, reg_offset) Or something. I'd have to see the use case. > > kprobe on ftrace or fprobe user (BPF) can filter user's requests. > I think I can introduce a flag for kprobes so that user can make a > kprobe handler only using a subset of registers. > Maybe similar filter code is also needed for BPF 'user space' library > because this check must be done when compiling BPF. Is there any other case without full regs that the user would want anything other than the args, stack pointer and instruction pointer? That is, have a flag that says "only_args" or something, that says they will only get the registers for arguments, a stack pointer, and the instruction pointer (note, the fregs may not have the instruction pointer as that is passed to the caller via the "ip" parameter. If the fregs needs that, we can add a "ftrace_regs_set_ip()" before calling the callback registered to the fprobe). -- Steve
On Wed, 11 May 2022 11:12:07 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Wed, 11 May 2022 23:34:50 +0900 > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > OK, so fregs::regs will have a subset of pt_regs, and accessibility of > > the registers depends on the architecture. If we can have a checker like > > > > ftrace_regs_exist(fregs, reg_offset) > > Or something. I'd have to see the use case. > > > > > kprobe on ftrace or fprobe user (BPF) can filter user's requests. > > I think I can introduce a flag for kprobes so that user can make a > > kprobe handler only using a subset of registers. > > Maybe similar filter code is also needed for BPF 'user space' library > > because this check must be done when compiling BPF. > > Is there any other case without full regs that the user would want anything > other than the args, stack pointer and instruction pointer? For the kprobes APIs/events, yes, it needs to access to the registers which is used for local variables when probing inside the function body. However at the function entry, I think almost no use case. (BTW, pstate is a bit special, that may show the actual processor-level status (context), so for the debugging, user might want to read it.) Thus the BPF use case via fprobes, I think there is no usecase. My concern is that the BPF may allow user program to access any field of pt_regs. Thus if the user miss-programmed, they may see a wrong value (I guess the fregs is not zero-filled) for unsaved registers. > That is, have a flag that says "only_args" or something, that says they > will only get the registers for arguments, a stack pointer, and the > instruction pointer (note, the fregs may not have the instruction pointer > as that is passed to the the caller via the "ip" parameter. If the fregs > needs that, we can add a "ftrace_regs_set_ip()" before calling the > callback registered to the fprobe). Yes, that is what I'm thinking. If "only_args" flag is set, BPF runtime must check the user program. And if it finds the program access the unsaved registers, it should stop executing. BTW, "what register is saved" can be determined statically, thus I think we just need the offset for checking (for fprobe usecase, since it will set the ftrace_ops flag by itself.) Thank you,
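To show the "only_args" idea in fprobe terms, a heavily hedged sketch follows. The flag is invented here; the handler signature follows the fprobe interface as discussed in this thread (taking a pt_regs), and the point is only the contract being proposed.

```c
/*
 * Hypothetical sketch of an "only_args" registration. The flag below
 * does not exist; it illustrates the contract under discussion: the
 * handler promises to read nothing beyond the argument registers, SP
 * and IP, so no full pt_regs ever needs to be built.
 */
#define FPROBE_FL_ONLY_ARGS	BIT(0)	/* assumed, not an upstream flag */

static void my_entry_handler(struct fprobe *fp, unsigned long entry_ip,
			     struct pt_regs *regs)
{
	/* only argument registers, SP and IP may be trusted here */
}

static struct fprobe my_fprobe = {
	.entry_handler	= my_entry_handler,
	.flags		= FPROBE_FL_ONLY_ARGS,	/* assumed */
};
```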
On Thu, 12 May 2022 21:02:31 +0900 Masami Hiramatsu <mhiramat@kernel.org> wrote: > BTW, "what register is saved" can be determined statically, thus I think > we just need the offset for checking (for fprobe usecase, since it will > set the ftrace_ops flag by itself.) I'm not against that API, but I guess it would be useful to see how it would be used. -- Steve
On Thu, May 12, 2022 at 09:02:31PM +0900, Masami Hiramatsu wrote: > On Wed, 11 May 2022 11:12:07 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > On Wed, 11 May 2022 23:34:50 +0900 > > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > OK, so fregs::regs will have a subset of pt_regs, and accessibility of > > > the registers depends on the architecture. If we can have a checker like > > > > > > ftrace_regs_exist(fregs, reg_offset) > > > > Or something. I'd have to see the use case. > > > > > > > > kprobe on ftrace or fprobe user (BPF) can filter user's requests. > > > I think I can introduce a flag for kprobes so that user can make a > > > kprobe handler only using a subset of registers. > > > Maybe similar filter code is also needed for BPF 'user space' library > > > because this check must be done when compiling BPF. > > > > Is there any other case without full regs that the user would want anything > > other than the args, stack pointer and instruction pointer? > > For the kprobes APIs/events, yes, it needs to access to the registers > which is used for local variables when probing inside the function body. > However at the function entry, I think almost no use case. (BTW, pstate > is a bit special, that may show the actual processor-level status > (context), so for the debugging, user might want to read it.) As before, if we really need PSTATE we *must* take an exception to get a reliable snapshot (or to alter the value). So I'd really like to split this into two cases: * Where users *really* need PSTATE (or arbitrary GPRs), they use kprobes. That always takes an exception and they can have a complete, real struct pt_regs. * Where users just need to capture a function call boundary, they use ftrace. That uses a trampoline without taking an exception, and they get the minimal set of registers relevant to the function call boundary (which does not include PSTATE or most GPRs). > Thus the BPF use case via fprobes, I think there is no usecase. > My concern is that the BPF may allow user program to access any > field of pt_regs. Thus if the user miss-programmed, they may see > a wrong value (I guess the fregs is not zero-filled) for unsaved > registers. > > > That is, have a flag that says "only_args" or something, that says they > > will only get the registers for arguments, a stack pointer, and the > > instruction pointer (note, the fregs may not have the instruction pointer > > as that is passed to the the caller via the "ip" parameter. If the fregs > > needs that, we can add a "ftrace_regs_set_ip()" before calling the > > callback registered to the fprobe). > > Yes, that is what I'm thinking. If "only_args" flag is set, BPF runtime > must check the user program. And if it finds the program access the > unsaved registers, it should stop executing. > > BTW, "what register is saved" can be determined statically, thus I think > we just need the offset for checking (for fprobe usecase, since it will > set the ftrace_ops flag by itself.) For arm64 I'd like to make this static, and have ftrace *always* capture a minimal set of ftrace_regs, which would be: X0 to X8 inclusive SP PC LR FP Since X0 to X8 + SP is all that we need for arguments and return values (per the calling convention we use), and PC+LR+FP gives us everything we need for unwinding and live patching. I *might* want to add x18 to that when SCS is enabled, but I'm not immediately sure. Thanks, Mark.
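For reference, the minimal state Mark lists would translate into roughly the following arm64 ftrace_regs layout; the field names and ordering here are illustrative only, not taken from any posted patch.

```c
/*
 * Illustrative only: one possible arm64 ftrace_regs holding just the
 * state named above. Nothing beyond x0-x8, SP, PC, LR and FP is ever
 * saved by the trampoline in this scheme.
 */
struct ftrace_regs {
	/* x0-x8: arguments, return values and the indirect result register */
	unsigned long regs[9];
	unsigned long fp;
	unsigned long lr;
	unsigned long sp;
	unsigned long pc;
};
```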
On Thu, May 05, 2022 at 10:57:35AM +0800, Wangshaobo (bobo) wrote: > > On 2022/4/22 0:27, Mark Rutland wrote: > On Thu, Apr 21, 2022 at 11:42:01AM -0400, Steven Rostedt wrote: > > > On Thu, 21 Apr 2022 16:14:13 +0100 > > > Mark Rutland <mark.rutland@arm.com> wrote: > > > > > > > > Let's say you have 10 ftrace_ops registered (with bpf and kprobes this can > > > > > be quite common). But each of these ftrace_ops traces a function (or > > > > > functions) that are not being traced by the other ftrace_ops. That is, each > > > > > ftrace_ops has its own unique function(s) that they are tracing. One could > > > > > be tracing schedule, the other could be tracing ksoftirqd_should_run > > > > > (whatever). > > > > Ok, so that's when messing around with bpf or kprobes, and not generally > > > > when using plain old ftrace functionality under /sys/kernel/tracing/ > > > > (unless that's concurrent with one of the former, as per your other > > > > reply) ? > > > It's any user of the ftrace infrastructure, which includes kprobes, bpf, > > > perf, function tracing, function graph tracing, and also affects instances. > > > > > > > > Without this change, because the arch does not support dynamically > > > > > allocated trampolines, it means that all these ftrace_ops will be > > > > > registered to the same trampoline. That means, for every function that is > > > > > traced, it will loop through all 10 of theses ftrace_ops and check their > > > > > hashes to see if their callback should be called or not. > > > > Sure; I can see how that can be quite expensive. > > > > > > > > What I'm trying to figure out is who this matters to and when, since the > > > > implementation is going to come with a bunch of subtle/fractal > > > > complexities, and likely a substantial overhead too when enabling or > > > > disabling tracing of a patch-site. I'd like to understand the trade-offs > > > > better. > > > > > > > > > With dynamically allocated trampolines, each ftrace_ops will have their own > > > > > trampoline, and that trampoline will be called directly if the function > > > > > is only being traced by the one ftrace_ops. This is much more efficient. > > > > > > > > > > If a function is traced by more than one ftrace_ops, then it falls back to > > > > > the loop. > > > > I see -- so the dynamic trampoline is just to get the ops? Or is that > > > > doing additional things? > > > It's to get both the ftrace_ops (as that's one of the parameters) as well > > > as to call the callback directly. Not sure if arm is affected by spectre, > > > but the "loop" function is filled with indirect function calls, where as > > > the dynamic trampolines call the callback directly. > > > > > > Instead of: > > > > > > bl ftrace_caller > > > > > > ftrace_caller: > > > [..] > > > bl ftrace_ops_list_func > > > [..] > > > > > > > > > void ftrace_ops_list_func(...) > > > { > > > __do_for_each_ftrace_ops(op, ftrace_ops_list) { > > > if (ftrace_ops_test(op, ip)) // test the hash to see if it > > > // should trace this > > > // function. > > > op->func(...); > > > } > > > } > > > > > > It does: > > > > > > bl dyanmic_tramp > > > > > > dynamic_tramp: > > > [..] > > > bl func // call the op->func directly! > > > > > > > > > Much more efficient! > > > > > > > > > > There might be a middle-ground here where we patch the ftrace_ops > > > > pointer into a literal pool at the patch-site, which would allow us to > > > > handle this atomically, and would avoid the issues with out-of-range > > > > trampolines.
> > > Have an example of what you are suggesting? > > We can make the compiler to place 2 NOPs before the function entry point, and 2 > > NOPs after it using `-fpatchable-function-entry=4,2` (the arguments are > > <total>,<before>). On arm64 all instructions are 4 bytes, and we'll use the > > first two NOPs as an 8-byte literal pool. > > > > Ignoring BTI for now, the compiler generates (with some magic labels added here > > for demonstration): > > > > __before_func: > > NOP > > NOP > > func: > > NOP > > NOP > > __remainder_of_func: > > ... > > > > At ftrace_init_nop() time we patch that to: > > > > __before_func: > > // treat the 2 NOPs as an 8-byte literal-pool > > .quad <default ops pointer> // see below > > func: > > MOV X9, X30 > > NOP > > __remainder_of_func: > > ... > > > > When enabling tracing we do > > > > __before_func: > > // patch this with the relevant ops pointer > > .quad <ops pointer> > > func: > > MOV X9, X30 > > BL <trampoline> // common trampoline > > I have a question that does this common trampoline allocated by > module_alloc()? if yes, how to handle the long jump from traced func to > common trampoline if only adding two NOPs in front of func. No; as today there'd be *one* trampoline in the main kernel image, and where a module is out-of-range it will use a PLT the module loader created at load time (and any patch-site in that module would use the same PLT and trampoline, regardless of what the ops pointer was). There might be a PLT between the call and the trampoline, but that wouldn't have any functional effect; we'd still get all the arguments, the original LR (in x9), and the location of the call (in the LR), as we get today. For how we do that today, see commits: * e71a4e1bebaf7fd9 ("arm64: ftrace: add support for far branches to dynamic ftrace") https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e71a4e1bebaf7fd990efbdc04b38e5526914f0f1 * f1a54ae9af0da4d7 ("arm64: module/ftrace: intialize PLT at load time") https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f1a54ae9af0da4d76239256ed640a93ab3aadac0 * 3b23e4991fb66f6d ("arm64: implement ftrace with regs") https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3b23e4991fb66f6d152f9055ede271a726ef9f21 Thanks, Mark.
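Assuming exactly the patch-site layout Mark sketches (an 8-byte literal immediately before the function, then "MOV X9, X30" at func+0 and the patched BL at func+4), the per-callsite ops pointer can be recovered from the address of the BL alone. The helper below is a sketch under that assumption, not code from any tree.

```c
/*
 * Sketch only: recover the per-callsite ftrace_ops from the literal
 * placed before the function entry, given the address of the patched
 * BL. Assumed layout: <8-byte literal> <MOV X9, X30> <BL trampoline>,
 * so the literal starts 12 bytes (3 instructions) before the BL.
 */
static struct ftrace_ops *callsite_get_ops(unsigned long bl_addr)
{
	return *(struct ftrace_ops **)(bl_addr - 3 * AARCH64_INSN_SIZE);
}
```

A common trampoline can do the equivalent load in assembly, which is what makes the ops lookup independent of how far away the trampoline itself lives.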
On Wed, 25 May 2022 13:17:30 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > For arm64 I'd like to make this static, and have ftrace *always* capture a > minimal set of ftrace_regs, which would be: > > X0 to X8 inclusive > SP > PC > LR > FP > > Since X0 to X8 + SP is all that we need for arguments and return values (per > the calling convention we use), and PC+LR+FP gives us everything we need for > unwinding and live patching. > > I *might* want to add x18 to that when SCS is enabled, but I'm not immediately > sure. Does arm64 have HAVE_DYNAMIC_FTRACE_WITH_ARGS enabled? If so, then having the normal ftrace callback save the above so that all functions have it available would be useful. -- Steve
On Wed, May 25, 2022 at 09:43:07AM -0400, Steven Rostedt wrote: > On Wed, 25 May 2022 13:17:30 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > For arm64 I'd like to make this static, and have ftrace *always* capture a > > minimal set of ftrace_regs, which would be: > > > > X0 to X8 inclusive > > SP > > PC > > LR > > FP > > > > Since X0 to X8 + SP is all that we need for arguments and return values (per > > the calling convention we use), and PC+LR+FP gives us everything we need for > > unwinding and live patching. > > > > I *might* want to add x18 to that when SCS is enabled, but I'm not immediately > > sure. > > Does arm64 have HAVE_DYNAMIC_FTRACE_WITH_ARGS enabled? Not yet. I'd like to implement it, but always only saving the values above and never saving a full pt_regs (since as mentioned elsewhere we can't do that correctly anyway). > If so, then having the normal ftrace call back save the above so that all > functions have it available would be useful. I think that's what I'm saying: I'd want to have one trampoline which always saved the above, so all functions would get that. Thanks, Mark.
(Cc: BPF ML) On Wed, 25 May 2022 13:17:30 +0100 Mark Rutland <mark.rutland@arm.com> wrote: > On Thu, May 12, 2022 at 09:02:31PM +0900, Masami Hiramatsu wrote: > > On Wed, 11 May 2022 11:12:07 -0400 > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > On Wed, 11 May 2022 23:34:50 +0900 > > > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > > OK, so fregs::regs will have a subset of pt_regs, and accessibility of > > > > the registers depends on the architecture. If we can have a checker like > > > > > > > > ftrace_regs_exist(fregs, reg_offset) > > > > > > Or something. I'd have to see the use case. > > > > > > > > > > > kprobe on ftrace or fprobe user (BPF) can filter user's requests. > > > > I think I can introduce a flag for kprobes so that user can make a > > > > kprobe handler only using a subset of registers. > > > > Maybe similar filter code is also needed for BPF 'user space' library > > > > because this check must be done when compiling BPF. > > > > > > Is there any other case without full regs that the user would want anything > > > other than the args, stack pointer and instruction pointer? > > > > For the kprobes APIs/events, yes, it needs to access to the registers > > which is used for local variables when probing inside the function body. > > However at the function entry, I think almost no use case. (BTW, pstate > > is a bit special, that may show the actual processor-level status > > (context), so for the debugging, user might want to read it.) > > As before, if we really need PSTATE we *must* take an exception to get a > reliable snapshot (or to alter the value). So I'd really like to split this > into two cases: > > * Where users *really* need PSTATE (or arbitrary GPRs), they use kprobes. That > always takes an exception and they can have a complete, real struct pt_regs. > > * Where users just need to capture a function call boundary, they use ftrace. > That uses a trampoline without taking an exception, and they get the minimal > set of registers relevant to the function call boundary (which does not > include PSTATE or most GPRs). I totally agree with this idea. The x86 is a special case, since the -fentry option puts a call on the first instruction of the function entry, I had to reuse the ftrace instead of swbp for kprobes. But on arm64 (and other RISCs), we can use them properly. My concern is that the eBPF depends on kprobe (pt_regs) interface, thus I need to ask them that it is OK to not accessable to some part of pt_regs (especially, PSTATE) if they puts probes on function entry with ftrace (fprobe in this case.) (Jiri and BPF developers) Currently fprobe is only enabled on x86 for "multiple kprobes" BPF interface, but in the future, it will be enabled on arm64. And at that point, it will be only accessible to the regs for function arguments. Is that OK for your use case? And will the BPF compiler be able to restrict the user program to access only those registers when using the "multiple kprobes"? > > Thus the BPF use case via fprobes, I think there is no usecase. > > My concern is that the BPF may allow user program to access any > > field of pt_regs. Thus if the user miss-programmed, they may see > > a wrong value (I guess the fregs is not zero-filled) for unsaved > > registers. 
> > > > > That is, have a flag that says "only_args" or something, that says they > > > will only get the registers for arguments, a stack pointer, and the > > > instruction pointer (note, the fregs may not have the instruction pointer > > > as that is passed to the the caller via the "ip" parameter. If the fregs > > > needs that, we can add a "ftrace_regs_set_ip()" before calling the > > > callback registered to the fprobe). > > > > Yes, that is what I'm thinking. If "only_args" flag is set, BPF runtime > > must check the user program. And if it finds the program access the > > unsaved registers, it should stop executing. > > > > BTW, "what register is saved" can be determined statically, thus I think > > we just need the offset for checking (for fprobe usecase, since it will > > set the ftrace_ops flag by itself.) > > For arm64 I'd like to make this static, and have ftrace *always* capture a > minimal set of ftrace_regs, which would be: > > X0 to X8 inclusive > SP > PC > LR > FP > > Since X0 to X8 + SP is all that we need for arguments and return values (per > the calling convention we use), and PC+LR+FP gives us everything we need for > unwinding and live patching. It would be good for me. So is it enabled with CONFIG_DYNAMIC_FTRACE_WITH_ARGS, instead of CONFIG_DYNAMIC_FTRACE_WITH_REGS? Thank you, > > I *might* want to add x18 to that when SCS is enabled, but I'm not immediately > sure. > > Thanks, > Mark.
On Mon, May 30, 2022 at 10:03:10AM +0900, Masami Hiramatsu wrote: > (Cc: BPF ML) > > On Wed, 25 May 2022 13:17:30 +0100 > Mark Rutland <mark.rutland@arm.com> wrote: > > > On Thu, May 12, 2022 at 09:02:31PM +0900, Masami Hiramatsu wrote: > > > On Wed, 11 May 2022 11:12:07 -0400 > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > On Wed, 11 May 2022 23:34:50 +0900 > > > > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > > > > OK, so fregs::regs will have a subset of pt_regs, and accessibility of > > > > > the registers depends on the architecture. If we can have a checker like > > > > > > > > > > ftrace_regs_exist(fregs, reg_offset) > > > > > > > > Or something. I'd have to see the use case. > > > > > > > > > > > > > > kprobe on ftrace or fprobe user (BPF) can filter user's requests. > > > > > I think I can introduce a flag for kprobes so that user can make a > > > > > kprobe handler only using a subset of registers. > > > > > Maybe similar filter code is also needed for BPF 'user space' library > > > > > because this check must be done when compiling BPF. > > > > > > > > Is there any other case without full regs that the user would want anything > > > > other than the args, stack pointer and instruction pointer? > > > > > > For the kprobes APIs/events, yes, it needs to access to the registers > > > which is used for local variables when probing inside the function body. > > > However at the function entry, I think almost no use case. (BTW, pstate > > > is a bit special, that may show the actual processor-level status > > > (context), so for the debugging, user might want to read it.) > > > > As before, if we really need PSTATE we *must* take an exception to get a > > reliable snapshot (or to alter the value). So I'd really like to split this > > into two cases: > > > > * Where users *really* need PSTATE (or arbitrary GPRs), they use kprobes. That > > always takes an exception and they can have a complete, real struct pt_regs. > > > > * Where users just need to capture a function call boundary, they use ftrace. > > That uses a trampoline without taking an exception, and they get the minimal > > set of registers relevant to the function call boundary (which does not > > include PSTATE or most GPRs). > > I totally agree with this idea. The x86 is a special case, since the > -fentry option puts a call on the first instruction of the function entry, > I had to reuse the ftrace instead of swbp for kprobes. > But on arm64 (and other RISCs), we can use them properly. > > My concern is that the eBPF depends on kprobe (pt_regs) interface, thus > I need to ask them that it is OK to not accessable to some part of > pt_regs (especially, PSTATE) if they puts probes on function entry > with ftrace (fprobe in this case.) > > (Jiri and BPF developers) > Currently fprobe is only enabled on x86 for "multiple kprobes" BPF > interface, but in the future, it will be enabled on arm64. And at > that point, it will be only accessible to the regs for function > arguments. Is that OK for your use case? And will the BPF compiler I guess from practical POV registers for arguments and ip should be enough, but whole pt_regs was already exposed to programs, so people can already use any of them.. not sure it's good idea to restrict it > be able to restrict the user program to access only those registers > when using the "multiple kprobes"? pt-regs pointer is provided to kprobe programs, I guess we could provide copy of that with just available values jirka
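Jiri's "copy with just available values" could look roughly like the sketch below: a zero-filled pt_regs copy populated only with the state the trampoline saved. The ftrace_regs accessors used here are assumed names for illustration, not existing interfaces.

```c
/*
 * Minimal sketch of handing BPF a sanitized pt_regs copy. The
 * ftrace_regs_get_*() helpers are assumed names, not current APIs;
 * unsaved registers (including pstate) are deliberately left at zero.
 */
static void fprobe_fill_pt_regs(struct pt_regs *copy, struct ftrace_regs *fregs)
{
	int i;

	memset(copy, 0, sizeof(*copy));

	for (i = 0; i < 9; i++)			/* x0-x8 */
		copy->regs[i] = ftrace_regs_get_argument_reg(fregs, i);
	copy->sp = ftrace_regs_get_stack_pointer(fregs);
	copy->pc = ftrace_regs_get_instruction_pointer(fregs);
}
```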
On Mon, 30 May 2022 14:38:31 +0200 Jiri Olsa <olsajiri@gmail.com> wrote: > On Mon, May 30, 2022 at 10:03:10AM +0900, Masami Hiramatsu wrote: > > (Cc: BPF ML) > > > > On Wed, 25 May 2022 13:17:30 +0100 > > Mark Rutland <mark.rutland@arm.com> wrote: > > > > > On Thu, May 12, 2022 at 09:02:31PM +0900, Masami Hiramatsu wrote: > > > > On Wed, 11 May 2022 11:12:07 -0400 > > > > Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > > > > On Wed, 11 May 2022 23:34:50 +0900 > > > > > Masami Hiramatsu <mhiramat@kernel.org> wrote: > > > > > > > > > > > OK, so fregs::regs will have a subset of pt_regs, and accessibility of > > > > > > the registers depends on the architecture. If we can have a checker like > > > > > > > > > > > > ftrace_regs_exist(fregs, reg_offset) > > > > > > > > > > Or something. I'd have to see the use case. > > > > > > > > > > > > > > > > > kprobe on ftrace or fprobe user (BPF) can filter user's requests. > > > > > > I think I can introduce a flag for kprobes so that user can make a > > > > > > kprobe handler only using a subset of registers. > > > > > > Maybe similar filter code is also needed for BPF 'user space' library > > > > > > because this check must be done when compiling BPF. > > > > > > > > > > Is there any other case without full regs that the user would want anything > > > > > other than the args, stack pointer and instruction pointer? > > > > > > > > For the kprobes APIs/events, yes, it needs to access to the registers > > > > which is used for local variables when probing inside the function body. > > > > However at the function entry, I think almost no use case. (BTW, pstate > > > > is a bit special, that may show the actual processor-level status > > > > (context), so for the debugging, user might want to read it.) > > > > > > As before, if we really need PSTATE we *must* take an exception to get a > > > reliable snapshot (or to alter the value). So I'd really like to split this > > > into two cases: > > > > > > * Where users *really* need PSTATE (or arbitrary GPRs), they use kprobes. That > > > always takes an exception and they can have a complete, real struct pt_regs. > > > > > > * Where users just need to capture a function call boundary, they use ftrace. > > > That uses a trampoline without taking an exception, and they get the minimal > > > set of registers relevant to the function call boundary (which does not > > > include PSTATE or most GPRs). > > > > I totally agree with this idea. The x86 is a special case, since the > > -fentry option puts a call on the first instruction of the function entry, > > I had to reuse the ftrace instead of swbp for kprobes. > > But on arm64 (and other RISCs), we can use them properly. > > > > My concern is that the eBPF depends on kprobe (pt_regs) interface, thus > > I need to ask them that it is OK to not accessable to some part of > > pt_regs (especially, PSTATE) if they puts probes on function entry > > with ftrace (fprobe in this case.) > > > > (Jiri and BPF developers) > > Currently fprobe is only enabled on x86 for "multiple kprobes" BPF > > interface, but in the future, it will be enabled on arm64. And at > > that point, it will be only accessible to the regs for function > > arguments. Is that OK for your use case? And will the BPF compiler > > I guess from practical POV registers for arguments and ip should be enough, > but whole pt_regs was already exposed to programs, so people can already use > any of them.. 
not sure it's good idea to restrict it > > > be able to restrict the user program to access only those registers > > when using the "multiple kprobes"? > > pt-regs pointer is provided to kprobe programs, I guess we could provide copy > of that with just available values Yes, ftrace_regs already provides a partially filled pt_regs (which registers are valid is arch-dependent). Thus, my idea is to change fprobe's handler interface to expose ftrace_regs instead of pt_regs, and the BPF handler will extract the internal pt_regs. If the BPF compiler can list which registers will be accessed from the user program, the kernel side can filter it. I think a similar feature can be done in kprobe-events (a new fprobe event?). Thank you,
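The interface change proposed above might look like the following handler sketch; the signature is hypothetical (today's fprobe entry handler takes a pt_regs, as discussed earlier in the thread) and is shown only to indicate the direction of the idea.

```c
/*
 * Hypothetical sketch: an fprobe entry handler that receives
 * ftrace_regs rather than pt_regs. Not the current fprobe API.
 */
static void my_fprobe_entry(struct fprobe *fp, unsigned long entry_ip,
			    struct ftrace_regs *fregs)
{
	/* NULL unless the arch actually saved a full pt_regs */
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (!regs)
		return;
}
```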
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 4506c4a90ac1..d35a042baf75 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -10,12 +10,14 @@ #include <linux/module.h> #include <linux/swab.h> #include <linux/uaccess.h> +#include <linux/vmalloc.h> #include <asm/cacheflush.h> #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> #include <asm/patching.h> +#include <asm/set_memory.h> #ifdef CONFIG_DYNAMIC_FTRACE /* @@ -48,6 +50,290 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, return 0; } +/* ftrace dynamic trampolines */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#ifdef CONFIG_MODULES +#include <linux/moduleloader.h> + +static inline void *alloc_tramp(unsigned long size) +{ + return module_alloc(size); +} + +static inline void tramp_free(void *tramp) +{ + module_memfree(tramp); +} +#else +static inline void *alloc_tramp(unsigned long size) +{ + return NULL; +} + +static inline void tramp_free(void *tramp) {} +#endif + +/* ftrace_caller trampoline template */ +extern void ftrace_caller_tramp(void); +extern void ftrace_caller_tramp_end(void); +extern void ftrace_caller_tramp_call(void); +extern void ftrace_caller_tramp_graph_call(void); +extern void ftrace_caller_tramp_ops(void); + +/* ftrace_regs_caller trampoline template */ +extern void ftrace_regs_caller_tramp(void); +extern void ftrace_regs_caller_tramp_end(void); +extern void ftrace_regs_caller_tramp_call(void); +extern void ftrace_regs_caller_tramp_graph_call(void); +extern void ftrace_regs_caller_tramp_ops(void); + + +/* + * The ARM64 ftrace trampoline layout : + * + * + * ftrace_(regs_)caller_tramp low + * `--> +---------------------+ ^ + * | ftrace_regs_entry: | | + * | ... | | + * +---------------------+ | + * | ftrace_common: | | + * | ... | | + * | ldr x2, <ftrace_ops>| | + * ftrace callsite | ... | | + * `--> +---------------------+ | + * | nop >> bl <callback> + * ftrace graph callsite | PLT entrys (TODO) | | + * `--> +---------------------+ | + * | nop >> bl <graph callback> + * | PLT entrys (TODO) | | + * +---------------------+ | + * | ftrace_regs_restore:| | + * | ... | | + * +---------------------+ | + * | ftrace_ops | | + * +---------------------+ | + * high + */ + +static unsigned long +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) +{ + unsigned long start_offset, end_offset, ops_offset; + unsigned long caller_size, size, offset; + unsigned long pc, npages; + unsigned long *ptr; + void *trampoline; + u32 load; + int ret; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller_tramp; + end_offset = (unsigned long)ftrace_regs_caller_tramp_end; + ops_offset = (unsigned long)ftrace_regs_caller_tramp_ops; + } else { + start_offset = (unsigned long)ftrace_caller_tramp; + end_offset = (unsigned long)ftrace_caller_tramp_end; + ops_offset = (unsigned long)ftrace_caller_tramp_ops; + } + + caller_size = end_offset - start_offset + AARCH64_INSN_SIZE; + size = caller_size + sizeof(void *); + + trampoline = alloc_tramp(size); + if (!trampoline) + return 0; + + *tramp_size = size; + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); + + /* + * copy ftrace_caller/ftrace_regs_caller trampoline template + * onto the trampoline memory + */ + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, caller_size); + if (WARN_ON(ret < 0)) + goto free; + + /* + * Stored ftrace_ops at the end of the trampoline. + * This will be used to load the third parameter for the callback. 
+ * Basically, that location at the end of the trampoline takes the + * place of the global function_trace_op variable. + */ + ptr = (unsigned long *)(trampoline + caller_size); + *ptr = (unsigned long)ops; + + /* + * Update the trampoline ops REF + * ldr x2, <ftrace_ops> + */ + ops_offset -= start_offset; + pc = (unsigned long)trampoline + ops_offset; + offset = (unsigned long)ptr - pc; + if (WARN_ON(offset % AARCH64_INSN_SIZE != 0)) + goto free; + + load = aarch64_insn_gen_load_literal(AARCH64_INSN_REG_2, + AARCH64_INSN_VARIANT_64BIT, offset); + if (ftrace_modify_code(pc, 0, load, false)) + goto free; + + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + + set_vm_flush_reset_perms(trampoline); + + /* + * Module allocation needs to be completed by making the page + * executable. The page is still writable, which is a security hazard, + * but anyhow ftrace breaks W^X completely. + */ + set_memory_ro((unsigned long)trampoline, npages); + set_memory_x((unsigned long)trampoline, npages); + + return (unsigned long)trampoline; + +free: + tramp_free(trampoline); + return 0; +} + +static unsigned long calc_trampoline_call_offset(bool save_regs, bool is_graph) +{ + unsigned start_offset, call_offset; + + if (save_regs) { + start_offset = (unsigned long)ftrace_regs_caller_tramp; + if (is_graph) + call_offset = (unsigned long)ftrace_regs_caller_tramp_graph_call; + else + call_offset = (unsigned long)ftrace_regs_caller_tramp_call; + } else { + start_offset = (unsigned long)ftrace_caller_tramp; + if (is_graph) + call_offset = (unsigned long)ftrace_caller_tramp_graph_call; + else + call_offset = (unsigned long)ftrace_caller_tramp_call; + } + + return call_offset - start_offset; +} + +static inline ftrace_func_t +ftrace_trampoline_get_func(struct ftrace_ops *ops, bool is_graph) +{ + ftrace_func_t func; + + if (!is_graph) + func = ftrace_ops_get_func(ops); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + else + func = (ftrace_func_t)ftrace_graph_caller; +#endif + + return func; +} + +static int +ftrace_trampoline_modify_call(struct ftrace_ops *ops, bool is_graph, bool active) +{ + unsigned long offset; + unsigned long pc; + ftrace_func_t func; + u32 nop, branch; + + offset = calc_trampoline_call_offset(ops->flags & + FTRACE_OPS_FL_SAVE_REGS, is_graph); + pc = ops->trampoline + offset; + + func = ftrace_trampoline_get_func(ops, is_graph); + nop = aarch64_insn_gen_nop(); + branch = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, + AARCH64_INSN_BRANCH_LINK); + + if (active) + return ftrace_modify_code(pc, 0, branch, false); + else + return ftrace_modify_code(pc, 0, nop, false); +} + +extern int ftrace_graph_active; +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ + unsigned int size; + + /* + * If kaslr is enabled, the address of tramp and ftrace call + * may be far away, Therefore, long jump support is required. 
+ */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return; + + if (!ops->trampoline) { + ops->trampoline = create_trampoline(ops, &size); + if (!ops->trampoline) + return; + ops->trampoline_size = size; + } + + /* These is no static trampoline on ARM64 */ + WARN_ON(!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)); + + /* atomic modify trampoline: <call func> */ + WARN_ON(ftrace_trampoline_modify_call(ops, false, true)); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* atomic modify trampoline: <call graph func> */ + WARN_ON(ftrace_trampoline_modify_call(ops, true, ftrace_graph_active)); +#endif +} + +static void *addr_from_call(void *ptr) +{ + u32 insn; + unsigned long offset; + + if (aarch64_insn_read(ptr, &insn)) + return NULL; + + /* Make sure this is a call */ + if (!aarch64_insn_is_bl(insn)) { + pr_warn("Expected bl instruction, but not\n"); + return NULL; + } + + offset = aarch64_get_branch_offset(insn); + + return (void *)ptr + AARCH64_INSN_SIZE + offset; +} + +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer); + +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + + /* If we didn't allocate this trampoline, consider it static */ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return NULL; + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS, + ftrace_graph_active); + + return addr_from_call((void *)ops->trampoline + offset); +} + +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + tramp_free((void *)ops->trampoline); + ops->trampoline = 0; + ops->trampoline_size = 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + /* * Replace tracer function in ftrace_caller() */
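For context on what the trailing ftrace_ops literal in the trampoline buys, here is a minimal usage sketch (not part of the patch): a callback whose 'op' argument arrives directly from the per-ops trampoline, so no ftrace_ops_list walk or hash recheck happens on the hot path.

```c
/*
 * Minimal usage sketch, not part of the patch above. When this ops is
 * the only tracer of its filtered functions, it gets a private
 * trampoline via arch_ftrace_update_trampoline(), and 'op' below is
 * loaded from the ftrace_ops literal stored at the trampoline's end.
 */
static void my_func_tracer(unsigned long ip, unsigned long parent_ip,
			   struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	/* 'op' is this ftrace_ops itself; no list walk was needed */
}

static struct ftrace_ops my_ops = {
	.func = my_func_tracer,
};
```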