Message ID | 20220329231854.3188647-1-song@kernel.org (mailing list archive)
---|---
State | Changes Requested
Delegated to | BPF
Series | [bpf] tools/runqslower: fix handle__sched_switch for updated tp sched_switch
On Tue, Mar 29, 2022 at 4:19 PM Song Liu <song@kernel.org> wrote:
>
> TP_PROTO of sched_switch is updated with a new arg prev_state, which
> causes runqslower load failure:
>
> [...]
>
> Update runqslower to fix this issue. Also, while we are at it, use BPF_PROG
> in runqslower for cleaner code.
>
> Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event")
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  tools/bpf/runqslower/runqslower.bpf.c | 19 +++++--------------
>  1 file changed, 5 insertions(+), 14 deletions(-)
>

It would be much less disruptive if that prev_state was added after
"next", but oh well...

But anyway, let's handle this in a way that can handle both old
kernels and new ones, and make the same change in libbpf-tools'
runqslower ([0]). Can you please follow up there as well?

We can use BPF CO-RE to detect which order of arguments the running
kernel has by checking for the existence of the prev_state field in
struct trace_event_raw_sched_switch. Can you please try that? Use
bpf_core_field_exists() for that.

  [0] https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqslower.bpf.c

> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
> [...]
> On Mar 29, 2022, at 5:00 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Mar 29, 2022 at 4:19 PM Song Liu <song@kernel.org> wrote:
>>
>> TP_PROTO of sched_switch is updated with a new arg prev_state, which
>> causes runqslower load failure:
>>
>> [...]
>>
>> Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event")
>> Signed-off-by: Song Liu <song@kernel.org>
>
> It would be much less disruptive if that prev_state was added after
> "next", but oh well...

Maybe we should change that.

+Valentin and Steven, how about we change the order with the attached
diff (not the original patch in this thread, but the one at the end of
this email)?

> But anyway, let's handle this in a way that can handle both old
> kernels and new ones, and make the same change in libbpf-tools'
> runqslower ([0]). Can you please follow up there as well?

Yeah, I will also fix that one.

> We can use BPF CO-RE to detect which order of arguments the running
> kernel has by checking for the existence of the prev_state field in
> struct trace_event_raw_sched_switch. Can you please try that? Use
> bpf_core_field_exists() for that.

Do you mean something like

	if (bpf_core_field_exists(ctx->prev_state))
		/* use ctx[2] and ctx[3] */
	else
		/* use ctx[1] and ctx[2] */

? I think we will need BTF for the arguments, which doesn't exist yet.
Did I miss something?

I was thinking about adding something like struct tp_sched_switch_args
for all the raw tracepoints, but haven't had time to look into how.

Thanks,
Song

> [0] https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqslower.bpf.c
>
>> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
>> [...]

The attached diff (moving prev_state to the end of the sched_switch
tracepoint arguments):

diff --git i/include/trace/events/sched.h w/include/trace/events/sched.h
index 65e786756321..fbb99a61f714 100644
--- i/include/trace/events/sched.h
+++ w/include/trace/events/sched.h
@@ -222,11 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt,
 TRACE_EVENT(sched_switch,
 
 	TP_PROTO(bool preempt,
-		 unsigned int prev_state,
 		 struct task_struct *prev,
-		 struct task_struct *next),
+		 struct task_struct *next,
+		 unsigned int prev_state),
 
-	TP_ARGS(preempt, prev_state, prev, next),
+	TP_ARGS(preempt, prev, next, prev_state),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
diff --git i/kernel/sched/core.c w/kernel/sched/core.c
index d575b4914925..25784f3efd81 100644
--- i/kernel/sched/core.c
+++ w/kernel/sched/core.c
@@ -6376,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
diff --git i/kernel/trace/fgraph.c w/kernel/trace/fgraph.c
index 19028e072cdb..d7a81d277f66 100644
--- i/kernel/trace/fgraph.c
+++ w/kernel/trace/fgraph.c
@@ -415,9 +415,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-				unsigned int prev_state,
 				struct task_struct *prev,
-				struct task_struct *next)
+				struct task_struct *next,
+				unsigned int prev_state)
+
 {
 	unsigned long long timestamp;
 	int index;
diff --git i/kernel/trace/ftrace.c w/kernel/trace/ftrace.c
index 4f1d2f5e7263..957354488242 100644
--- i/kernel/trace/ftrace.c
+++ w/kernel/trace/ftrace.c
@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-				     unsigned int prev_state,
 				     struct task_struct *prev,
-				     struct task_struct *next)
+				     struct task_struct *next,
+				     unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
diff --git i/kernel/trace/trace_events.c w/kernel/trace/trace_events.c
index e11e167b7809..e7b6a2155e96 100644
--- i/kernel/trace/trace_events.c
+++ w/kernel/trace/trace_events.c
@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-					unsigned int prev_state,
 					struct task_struct *prev,
-					struct task_struct *next)
+					struct task_struct *next,
+					unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-					 unsigned int prev_state,
 					 struct task_struct *prev,
-					 struct task_struct *next)
+					 struct task_struct *next,
+					 unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
diff --git i/kernel/trace/trace_sched_switch.c w/kernel/trace/trace_sched_switch.c
index 45796d8bd4b2..c9ffdcfe622e 100644
--- i/kernel/trace/trace_sched_switch.c
+++ w/kernel/trace/trace_sched_switch.c
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
-		   unsigned int prev_state,
-		   struct task_struct *prev, struct task_struct *next)
+		   struct task_struct *prev, struct task_struct *next,
+		   unsigned int prev_state)
 {
 	int flags;
diff --git i/kernel/trace/trace_sched_wakeup.c w/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- i/kernel/trace/trace_sched_wakeup.c
+++ w/kernel/trace/trace_sched_wakeup.c
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
-			  unsigned int prev_state,
-			  struct task_struct *prev, struct task_struct *next)
+			  struct task_struct *prev, struct task_struct *next,
+			  unsigned int prev_state)
 {
 	struct trace_array_cpu *data;
 	u64 T0, T1, delta;
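If the reorder above lands, the BPF side needs no runtime detection at
all: prev and next keep their old ctx slots and prev_state becomes a
trailing argument that handlers can simply ignore. A minimal sketch,
assuming the reordered TP_PROTO (illustrative only, not part of either
patch in this thread):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	SEC("tp_btf/sched_switch")
	int BPF_PROG(handle__sched_switch, bool preempt, struct task_struct *prev,
		     struct task_struct *next, unsigned int prev_state)
	{
		/* prev/next are still ctx[1]/ctx[2]; prev_state is appended at
		 * the end, so existing tp_btf programs that do not read it keep
		 * working unchanged.
		 */
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";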
On Tue, Mar 29, 2022 at 5:39 PM Song Liu <songliubraving@fb.com> wrote:
>
> > On Mar 29, 2022, at 5:00 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >
> > [...]
> >
> > It would be much less disruptive if that prev_state was added after
> > "next", but oh well...
>
> Maybe we should change that.
>
> +Valentin and Steven, how about we change the order with the attached
> diff (not the original patch in this thread, but the one at the end of
> this email)?
>
> > But anyway, let's handle this in a way that can handle both old
> > kernels and new ones, and make the same change in libbpf-tools'
> > runqslower ([0]). Can you please follow up there as well?
>
> Yeah, I will also fix that one.

Thanks!

> > We can use BPF CO-RE to detect which order of arguments the running
> > kernel has by checking for the existence of the prev_state field in
> > struct trace_event_raw_sched_switch. Can you please try that? Use
> > bpf_core_field_exists() for that.
>
> Do you mean something like
>
>	if (bpf_core_field_exists(ctx->prev_state))
>		/* use ctx[2] and ctx[3] */
>	else
>		/* use ctx[1] and ctx[2] */

yep, that's what I meant, except you don't have ctx->prev_state, you
have to do:

	if (bpf_core_field_exists(((struct trace_event_raw_sched_switch *)0)->prev_state))

>
> ? I think we will need BTF for the arguments, which doesn't exist yet.
> Did I miss something?

Probably :) struct trace_event_raw_sched_switch is in vmlinux.h already
for the non-raw sched:sched_switch tracepoint. We just use that type
information for feature probing. From the BPF verifier's perspective,
ctx[1] or ctx[2] will have proper BTF information (task_struct) during
program validation.

> I was thinking about adding something like struct tp_sched_switch_args
> for all the raw tracepoints, but haven't had time to look into how.
>
> Thanks,
> Song
>
> [...]
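To make the suggestion above concrete, here is a rough sketch of the
load-time branch being described (illustrative only, not the patch under
review; handle_switch() is a made-up helper standing in for the rest of
the original handler body). As the follow-ups below point out, prev_state
exists in struct trace_event_raw_sched_switch on both old and new
kernels, so this particular field does not actually discriminate between
the two argument orders:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_core_read.h>

	SEC("tp_btf/sched_switch")
	int handle__sched_switch(u64 *ctx)
	{
		struct task_struct *prev, *next;

		if (bpf_core_field_exists(((struct trace_event_raw_sched_switch *)0)->prev_state)) {
			/* assumed new TP_PROTO: preempt, prev_state, prev, next */
			prev = (struct task_struct *)ctx[2];
			next = (struct task_struct *)ctx[3];
		} else {
			/* old TP_PROTO: preempt, prev, next */
			prev = (struct task_struct *)ctx[1];
			next = (struct task_struct *)ctx[2];
		}

		/* The check is resolved to a constant by CO-RE at load time,
		 * so only one branch survives dead code elimination.
		 */
		return handle_switch(prev, next);	/* hypothetical helper */
	}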
> On Mar 29, 2022, at 7:47 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>
> [...]
>
>> ? I think we will need BTF for the arguments, which doesn't exist yet.
>> Did I miss something?
>
> Probably :) struct trace_event_raw_sched_switch is in vmlinux.h already
> for the non-raw sched:sched_switch tracepoint. We just use that type
> information for feature probing. From the BPF verifier's perspective,
> ctx[1] or ctx[2] will have proper BTF information (task_struct) during
> program validation.

Sigh. I ran pahole and saw trace_event_raw_sched_switch. Somehow I
thought it was not the one. Will send v2 tomorrow.

Thanks,
Song
> On Mar 29, 2022, at 11:43 PM, Song Liu <songliubraving@fb.com> wrote:
>
>> On Mar 29, 2022, at 7:47 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>>
>> [...]
>>
>>> Do you mean something like
>>>
>>>	if (bpf_core_field_exists(ctx->prev_state))
>>>		/* use ctx[2] and ctx[3] */
>>>	else
>>>		/* use ctx[1] and ctx[2] */
>>
>> yep, that's what I meant, except you don't have ctx->prev_state, you
>> have to do:
>>
>>	if (bpf_core_field_exists(((struct trace_event_raw_sched_switch *)0)->prev_state))

Actually, struct trace_event_raw_sched_switch is not the argument layout
we get for tp_btf. For both older and newer kernels, it is the same:

	struct trace_event_raw_sched_switch {
		struct trace_entry ent;
		char prev_comm[16];
		pid_t prev_pid;
		int prev_prio;
		long int prev_state;
		char next_comm[16];
		pid_t next_pid;
		int next_prio;
		char __data[0];
	};

So I guess this check won't work?

Thanks,
Song
On Wed, Mar 30, 2022 at 2:11 PM Song Liu <songliubraving@fb.com> wrote:
>
> [...]
>
> >>> Do you mean something like
> >>>
> >>>	if (bpf_core_field_exists(ctx->prev_state))
> >>>		/* use ctx[2] and ctx[3] */
> >>>	else
> >>>		/* use ctx[1] and ctx[2] */
> >>
> >> yep, that's what I meant, except you don't have ctx->prev_state, you
> >> have to do:
> >>
> >>	if (bpf_core_field_exists(((struct trace_event_raw_sched_switch *)0)->prev_state))
>
> Actually, struct trace_event_raw_sched_switch is not the argument layout
> we get for tp_btf. For both older and newer kernels, it is the same:
>
>	struct trace_event_raw_sched_switch {
>		struct trace_entry ent;
>		char prev_comm[16];
>		pid_t prev_pid;
>		int prev_prio;
>		long int prev_state;
>		char next_comm[16];
>		pid_t next_pid;
>		int next_prio;
>		char __data[0];
>	};
>
> So I guess this check won't work?

Ah, you are right, we had prev_state in this struct before. There seems
to be a func proto for each raw tracepoint:

	typedef void (*btf_trace_sched_switch)(void *, bool, unsigned int,
					       struct task_struct *,
					       struct task_struct *);

But I'm not sure if we can use that. I'll need to play with it a bit
tomorrow to see if any of the bpf_core_xxx() checks can be used.

> Thanks,
> Song
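One possible direction for such a check, sketched purely as an
illustration (this is not what the thread settled on, and it probes BTF
from userspace rather than using in-BPF bpf_core_xxx() checks): inspect
the btf_trace_sched_switch func proto in the running kernel's vmlinux
BTF, count its parameters, and feed the result to the BPF program, e.g.
through a .rodata variable, before loading the skeleton. The libbpf
helpers used here exist, but whether this is the right approach is left
open above:

	#include <stdbool.h>
	#include <bpf/btf.h>

	/* Report whether the running kernel's sched_switch tracepoint
	 * prototype already carries the extra prev_state argument
	 * (5 params: void *, preempt, prev_state, prev, next) or still
	 * uses the old 4-parameter layout.
	 */
	static bool sched_switch_has_prev_state_arg(void)
	{
		const struct btf_type *t;
		struct btf *vmlinux_btf;
		bool ret = false;
		int id;

		vmlinux_btf = btf__load_vmlinux_btf();
		if (!vmlinux_btf)
			return false;

		id = btf__find_by_name_kind(vmlinux_btf, "btf_trace_sched_switch",
					    BTF_KIND_TYPEDEF);
		if (id < 0)
			goto out;

		/* typedef -> pointer -> function prototype */
		t = btf__type_by_id(vmlinux_btf, id);
		t = btf__type_by_id(vmlinux_btf, t->type);	/* BTF_KIND_PTR */
		t = btf__type_by_id(vmlinux_btf, t->type);	/* BTF_KIND_FUNC_PROTO */

		ret = btf_vlen(t) == 5;
	out:
		btf__free(vmlinux_btf);
		return ret;
	}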
diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
index 9a5c1f008fe6..30e491d8308f 100644
--- a/tools/bpf/runqslower/runqslower.bpf.c
+++ b/tools/bpf/runqslower/runqslower.bpf.c
@@ -2,6 +2,7 @@
 // Copyright (c) 2019 Facebook
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
 #include "runqslower.h"
 
 #define TASK_RUNNING 0
@@ -43,31 +44,21 @@ static int trace_enqueue(struct task_struct *t)
 }
 
 SEC("tp_btf/sched_wakeup")
-int handle__sched_wakeup(u64 *ctx)
+int BPF_PROG(handle__sched_wakeup, struct task_struct *p)
 {
-	/* TP_PROTO(struct task_struct *p) */
-	struct task_struct *p = (void *)ctx[0];
-
 	return trace_enqueue(p);
 }
 
 SEC("tp_btf/sched_wakeup_new")
-int handle__sched_wakeup_new(u64 *ctx)
+int BPF_PROG(handle__sched_wakeup_new, struct task_struct *p)
 {
-	/* TP_PROTO(struct task_struct *p) */
-	struct task_struct *p = (void *)ctx[0];
-
 	return trace_enqueue(p);
 }
 
 SEC("tp_btf/sched_switch")
-int handle__sched_switch(u64 *ctx)
+int BPF_PROG(handle__sched_switch, bool preempt, unsigned long prev_state,
+	     struct task_struct *prev, struct task_struct *next)
 {
-	/* TP_PROTO(bool preempt, struct task_struct *prev,
-	 * struct task_struct *next)
-	 */
-	struct task_struct *prev = (struct task_struct *)ctx[1];
-	struct task_struct *next = (struct task_struct *)ctx[2];
 	struct runq_event event = {};
 	u64 *tsp, delta_us;
 	long state;
TP_PROTO of sched_switch is updated with a new arg prev_state, which
causes runqslower load failure:

libbpf: prog 'handle__sched_switch': BPF program load failed: Permission denied
libbpf: prog 'handle__sched_switch': -- BEGIN PROG LOAD LOG --
R1 type=ctx expected=fp
0: R1=ctx(off=0,imm=0) R10=fp0
; int handle__sched_switch(u64 *ctx)
0: (bf) r7 = r1                       ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0)
; struct task_struct *next = (struct task_struct *)ctx[2];
1: (79) r6 = *(u64 *)(r7 +16)
func 'sched_switch' arg2 has btf_id 186 type STRUCT 'task_struct'
2: R6_w=ptr_task_struct(off=0,imm=0) R7_w=ctx(off=0,imm=0)
; struct task_struct *prev = (struct task_struct *)ctx[1];
2: (79) r2 = *(u64 *)(r7 +8)          ; R2_w=scalar() R7_w=ctx(off=0,imm=0)
3: (b7) r1 = 0                        ; R1_w=0
; struct runq_event event = {};
4: (7b) *(u64 *)(r10 -8) = r1         ; R1_w=P0 R10=fp0 fp-8_w=00000000
5: (7b) *(u64 *)(r10 -16) = r1        ; R1_w=P0 R10=fp0 fp-16_w=00000000
6: (7b) *(u64 *)(r10 -24) = r1        ; R1_w=P0 R10=fp0 fp-24_w=00000000
7: (7b) *(u64 *)(r10 -32) = r1        ; R1_w=P0 R10=fp0 fp-32_w=00000000
; if (prev->__state == TASK_RUNNING)
8: (61) r1 = *(u32 *)(r2 +24)
R2 invalid mem access 'scalar'
processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: failed to load program 'handle__sched_switch'
libbpf: failed to load object 'runqslower_bpf'
libbpf: failed to load BPF skeleton 'runqslower_bpf': -13
failed to load BPF object: -13

Update runqslower to fix this issue. Also, while we are at it, use BPF_PROG
in runqslower for cleaner code.

Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event")
Signed-off-by: Song Liu <song@kernel.org>
---
 tools/bpf/runqslower/runqslower.bpf.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)