Message ID | 20230925105552.817513-4-zhouchuyi@bytedance.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Add Open-coded task, css_task and css iters | expand |
On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: > > This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow > creation and manipulation of struct bpf_iter_task in open-coded iterator > style. BPF programs can use these kfuncs or through bpf_for_each macro to > iterate all processes in the system. > > The API design keep consistent with SEC("iter/task"). bpf_iter_task_new() > accepts a specific task and iterating type which allows: > 1. iterating all process in the system > > 2. iterating all threads in the system > > 3. iterating all threads of a specific task > Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID > to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC. > > The newly-added struct bpf_iter_task has a name collision with a selftest > for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is > renamed in order to avoid the collision. > > Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com> > --- > include/linux/bpf.h | 8 +- > kernel/bpf/helpers.c | 3 + > kernel/bpf/task_iter.c | 96 ++++++++++++++++--- > .../testing/selftests/bpf/bpf_experimental.h | 5 + > .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++-- > .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0 > 6 files changed, 106 insertions(+), 24 deletions(-) > rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%) > [...] > @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b > static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) > { > seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); > - if (aux->task.type == BPF_TASK_ITER_TID) > + if (aux->task.type == BPF_TASK_ITER_THREAD) > seq_printf(seq, "tid:\t%u\n", aux->task.pid); > - else if (aux->task.type == BPF_TASK_ITER_TGID) > + else if (aux->task.type == BPF_TASK_ITER_PROC) > seq_printf(seq, "pid:\t%u\n", aux->task.pid); > } > > @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) > bpf_mem_free(&bpf_global_ma, kit->css_it); > } > > +struct bpf_iter_task { > + __u64 __opaque[2]; > + __u32 __opaque_int[1]; this should be __u64 __opaque[3], because struct takes full 24 bytes > +} __attribute__((aligned(8))); > + > +struct bpf_iter_task_kern { > + struct task_struct *task; > + struct task_struct *pos; > + unsigned int type; > +} __attribute__((aligned(8))); > + > +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) nit: type -> flags, so we can add a bit more stuff, if necessary > +{ > + struct bpf_iter_task_kern *kit = (void *)it; empty line after variable declarations > + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task)); > + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != > + __alignof__(struct bpf_iter_task)); and I'd add empty line here to keep BUILD_BUG_ON block separate > + kit->task = kit->pos = NULL; > + switch (type) { > + case BPF_TASK_ITER_ALL: > + case BPF_TASK_ITER_PROC: > + case BPF_TASK_ITER_THREAD: > + break; > + default: > + return -EINVAL; > + } > + > + if (type == BPF_TASK_ITER_THREAD) > + kit->task = task; > + else > + kit->task = &init_task; > + kit->pos = kit->task; > + kit->type = type; > + return 0; > +} > + > +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) > +{ > + struct bpf_iter_task_kern *kit = (void *)it; > + struct task_struct *pos; > + unsigned int type; > + > + type = kit->type; > + pos = kit->pos; > + > + if (!pos) > + goto out; > + > + if (type == BPF_TASK_ITER_PROC) > + goto get_next_task; > + > + kit->pos = next_thread(kit->pos); > + if (kit->pos == kit->task) { > + if (type == BPF_TASK_ITER_THREAD) { > + kit->pos = NULL; > + goto out; > + } > + } else > + goto out; > + > +get_next_task: > + kit->pos = next_task(kit->pos); > + kit->task = kit->pos; > + if (kit->pos == &init_task) > + kit->pos = NULL; I can't say I completely follow the logic (e.g., for BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)? Can you elabore the expected behavior for various combinations of types and starting task argument? > + > +out: > + return pos; > +} > + > +__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it) > +{ > +} > + > DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); > > static void do_mmap_read_unlock(struct irq_work *entry) > diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h > index d3ea90f0e142..d989775dbdb5 100644 > --- a/tools/testing/selftests/bpf/bpf_experimental.h > +++ b/tools/testing/selftests/bpf/bpf_experimental.h > @@ -169,4 +169,9 @@ extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it, > extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym; > extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym; > > +struct bpf_iter_task; > +extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) __weak __ksym; > +extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym; > +extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym; > + > #endif > diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c please split out selftests changes from kernel-side changes. We only combine them if kernel changes break selftests, preventing bisection. [...]
Hello, 在 2023/9/28 07:20, Andrii Nakryiko 写道: > On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: >> >> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow >> creation and manipulation of struct bpf_iter_task in open-coded iterator >> style. BPF programs can use these kfuncs or through bpf_for_each macro to >> iterate all processes in the system. >> >> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new() >> accepts a specific task and iterating type which allows: >> 1. iterating all process in the system >> >> 2. iterating all threads in the system >> >> 3. iterating all threads of a specific task >> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID >> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC. >> >> The newly-added struct bpf_iter_task has a name collision with a selftest >> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is >> renamed in order to avoid the collision. >> >> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com> >> --- >> include/linux/bpf.h | 8 +- >> kernel/bpf/helpers.c | 3 + >> kernel/bpf/task_iter.c | 96 ++++++++++++++++--- >> .../testing/selftests/bpf/bpf_experimental.h | 5 + >> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++-- >> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0 >> 6 files changed, 106 insertions(+), 24 deletions(-) >> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%) >> > > [...] > >> @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b >> static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) >> { >> seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); >> - if (aux->task.type == BPF_TASK_ITER_TID) >> + if (aux->task.type == BPF_TASK_ITER_THREAD) >> seq_printf(seq, "tid:\t%u\n", aux->task.pid); >> - else if (aux->task.type == BPF_TASK_ITER_TGID) >> + else if (aux->task.type == BPF_TASK_ITER_PROC) >> seq_printf(seq, "pid:\t%u\n", aux->task.pid); >> } >> >> @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) >> bpf_mem_free(&bpf_global_ma, kit->css_it); >> } >> >> +struct bpf_iter_task { >> + __u64 __opaque[2]; >> + __u32 __opaque_int[1]; > > this should be __u64 __opaque[3], because struct takes full 24 bytes > >> +} __attribute__((aligned(8))); >> + >> +struct bpf_iter_task_kern { >> + struct task_struct *task; >> + struct task_struct *pos; >> + unsigned int type; >> +} __attribute__((aligned(8))); >> + >> +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) > > nit: type -> flags, so we can add a bit more stuff, if necessary > >> +{ >> + struct bpf_iter_task_kern *kit = (void *)it; > > empty line after variable declarations > >> + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task)); >> + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != >> + __alignof__(struct bpf_iter_task)); > > and I'd add empty line here to keep BUILD_BUG_ON block separate > >> + kit->task = kit->pos = NULL; >> + switch (type) { >> + case BPF_TASK_ITER_ALL: >> + case BPF_TASK_ITER_PROC: >> + case BPF_TASK_ITER_THREAD: >> + break; >> + default: >> + return -EINVAL; >> + } >> + >> + if (type == BPF_TASK_ITER_THREAD) >> + kit->task = task; >> + else >> + kit->task = &init_task; >> + kit->pos = kit->task; >> + kit->type = type; >> + return 0; >> +} >> + >> +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) >> +{ >> + struct bpf_iter_task_kern *kit = (void *)it; >> + struct task_struct *pos; >> + unsigned int type; >> + >> + type = kit->type; >> + pos = kit->pos; >> + >> + if (!pos) >> + goto out; >> + >> + if (type == BPF_TASK_ITER_PROC) >> + goto get_next_task; >> + >> + kit->pos = next_thread(kit->pos); >> + if (kit->pos == kit->task) { >> + if (type == BPF_TASK_ITER_THREAD) { >> + kit->pos = NULL; >> + goto out; >> + } >> + } else >> + goto out; >> + >> +get_next_task: >> + kit->pos = next_task(kit->pos); >> + kit->task = kit->pos; >> + if (kit->pos == &init_task) >> + kit->pos = NULL; > > I can't say I completely follow the logic (e.g., for > BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)? > Can you elabore the expected behavior for various combinations of > types and starting task argument? > Thanks for the review. The expected behavior of current implementation is: BPF_TASK_ITER_PROC: init_task->first_process->second_process->...->last_process->init_task We would exit before visiting init_task again. BPF_TASK_ITER_THREAD: group_task->first_thread->second_thread->...->last_thread->group_task We would exit before visiting group_task again. BPF_TASK_ITER_ALL: init_task -> first_process -> second_process -> ... | | -> first_thread.. | -> first_thread Actually, every next() call, we would return the "pos" which was prepared by previous next() call, and use next_task()/next_thread() to update kit->pos. Once we meet the exit condition (next_task() return init_task or next_thread() return group_task), we would update kit->pos to NULL. In this way, when next() is called again, we will terminate the iteration. Here "kit->pos = NULL;" means we would return the last valid "pos" and will return NULL in next call to exit from the iteration. Am I miss something important? Thanks.
On Wed, Sep 27, 2023 at 8:29 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: > > Hello, > > 在 2023/9/28 07:20, Andrii Nakryiko 写道: > > On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: > >> > >> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow > >> creation and manipulation of struct bpf_iter_task in open-coded iterator > >> style. BPF programs can use these kfuncs or through bpf_for_each macro to > >> iterate all processes in the system. > >> > >> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new() > >> accepts a specific task and iterating type which allows: > >> 1. iterating all process in the system > >> > >> 2. iterating all threads in the system > >> > >> 3. iterating all threads of a specific task > >> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID > >> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC. > >> > >> The newly-added struct bpf_iter_task has a name collision with a selftest > >> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is > >> renamed in order to avoid the collision. > >> > >> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com> > >> --- > >> include/linux/bpf.h | 8 +- > >> kernel/bpf/helpers.c | 3 + > >> kernel/bpf/task_iter.c | 96 ++++++++++++++++--- > >> .../testing/selftests/bpf/bpf_experimental.h | 5 + > >> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++-- > >> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0 > >> 6 files changed, 106 insertions(+), 24 deletions(-) > >> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%) > >> > > > > [...] > > > >> @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b > >> static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) > >> { > >> seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); > >> - if (aux->task.type == BPF_TASK_ITER_TID) > >> + if (aux->task.type == BPF_TASK_ITER_THREAD) > >> seq_printf(seq, "tid:\t%u\n", aux->task.pid); > >> - else if (aux->task.type == BPF_TASK_ITER_TGID) > >> + else if (aux->task.type == BPF_TASK_ITER_PROC) > >> seq_printf(seq, "pid:\t%u\n", aux->task.pid); > >> } > >> > >> @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) > >> bpf_mem_free(&bpf_global_ma, kit->css_it); > >> } > >> > >> +struct bpf_iter_task { > >> + __u64 __opaque[2]; > >> + __u32 __opaque_int[1]; > > > > this should be __u64 __opaque[3], because struct takes full 24 bytes > > > >> +} __attribute__((aligned(8))); > >> + > >> +struct bpf_iter_task_kern { > >> + struct task_struct *task; > >> + struct task_struct *pos; > >> + unsigned int type; > >> +} __attribute__((aligned(8))); > >> + > >> +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) > > > > nit: type -> flags, so we can add a bit more stuff, if necessary > > > >> +{ > >> + struct bpf_iter_task_kern *kit = (void *)it; > > > > empty line after variable declarations > > > >> + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task)); > >> + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != > >> + __alignof__(struct bpf_iter_task)); > > > > and I'd add empty line here to keep BUILD_BUG_ON block separate > > > >> + kit->task = kit->pos = NULL; > >> + switch (type) { > >> + case BPF_TASK_ITER_ALL: > >> + case BPF_TASK_ITER_PROC: > >> + case BPF_TASK_ITER_THREAD: > >> + break; > >> + default: > >> + return -EINVAL; > >> + } > >> + > >> + if (type == BPF_TASK_ITER_THREAD) > >> + kit->task = task; > >> + else > >> + kit->task = &init_task; > >> + kit->pos = kit->task; > >> + kit->type = type; > >> + return 0; > >> +} > >> + > >> +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) > >> +{ > >> + struct bpf_iter_task_kern *kit = (void *)it; > >> + struct task_struct *pos; > >> + unsigned int type; > >> + > >> + type = kit->type; > >> + pos = kit->pos; > >> + > >> + if (!pos) > >> + goto out; > >> + > >> + if (type == BPF_TASK_ITER_PROC) > >> + goto get_next_task; > >> + > >> + kit->pos = next_thread(kit->pos); > >> + if (kit->pos == kit->task) { > >> + if (type == BPF_TASK_ITER_THREAD) { > >> + kit->pos = NULL; > >> + goto out; > >> + } > >> + } else > >> + goto out; > >> + > >> +get_next_task: > >> + kit->pos = next_task(kit->pos); > >> + kit->task = kit->pos; > >> + if (kit->pos == &init_task) > >> + kit->pos = NULL; > > > > I can't say I completely follow the logic (e.g., for > > BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)? > > Can you elabore the expected behavior for various combinations of > > types and starting task argument? > > > > Thanks for the review. > > The expected behavior of current implementation is: > > BPF_TASK_ITER_PROC: > > init_task->first_process->second_process->...->last_process->init_task > > We would exit before visiting init_task again. ah, ok, so in this case it's more like BPF_TASK_ITER_ALL_PROCS, i.e., we iterate all processes in the system. Input `task` that we provide is ignored/meaningless, right? Maybe we should express it as ALL_PROCS? > > BPF_TASK_ITER_THREAD: > > group_task->first_thread->second_thread->...->last_thread->group_task > > We would exit before visiting group_task again. > And this one is iterating threads of a process specified by given `task`, right? This is where my confusion comes from. ITER_PROC and ITER_THREAD, by their name, seems to be very similar, but in reality ITER_PROC is more like ITER_ALL (except process vs thread iteration), while ITER_THREAD is parameterized by input `task`. I'm not sure what's the least confusing way to name and organize everything, but I think it's quite confusing right now, unfortunately. I wonder if you or someone else have a better suggestion on making this more straightforward? > BPF_TASK_ITER_ALL: > > init_task -> first_process -> second_process -> ... > | | > -> first_thread.. | > -> first_thread > Ok, and this one is "all threads in the system". So BPF_TASK_ITER_ALL_THREADS would describe it more precisely, right? > Actually, every next() call, we would return the "pos" which was > prepared by previous next() call, and use next_task()/next_thread() to > update kit->pos. Once we meet the exit condition (next_task() return > init_task or next_thread() return group_task), we would update kit->pos > to NULL. In this way, when next() is called again, we will terminate the > iteration. > > Here "kit->pos = NULL;" means we would return the last valid "pos" and > will return NULL in next call to exit from the iteration. > > Am I miss something important? No, it's my bad. I did check, but somehow concluded that you are returning kit->pos, but you are returning locally cached previous value of kit->pos. All good here, I think. > > Thanks. > > >
Hello, Andrii 在 2023/9/30 05:27, Andrii Nakryiko 写道: > On Wed, Sep 27, 2023 at 8:29 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: >> >> Hello, >> >> 在 2023/9/28 07:20, Andrii Nakryiko 写道: >>> On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: >>>> >>>> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow >>>> creation and manipulation of struct bpf_iter_task in open-coded iterator >>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to >>>> iterate all processes in the system. >>>> >>>> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new() >>>> accepts a specific task and iterating type which allows: >>>> 1. iterating all process in the system >>>> >>>> 2. iterating all threads in the system >>>> >>>> 3. iterating all threads of a specific task >>>> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID >>>> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC. >>>> >>>> The newly-added struct bpf_iter_task has a name collision with a selftest >>>> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is >>>> renamed in order to avoid the collision. >>>> >>>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com> >>>> --- >>>> include/linux/bpf.h | 8 +- >>>> kernel/bpf/helpers.c | 3 + >>>> kernel/bpf/task_iter.c | 96 ++++++++++++++++--- >>>> .../testing/selftests/bpf/bpf_experimental.h | 5 + >>>> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++-- >>>> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0 >>>> 6 files changed, 106 insertions(+), 24 deletions(-) >>>> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%) >>>> >>> [...] >>>> +get_next_task: >>>> + kit->pos = next_task(kit->pos); >>>> + kit->task = kit->pos; >>>> + if (kit->pos == &init_task) >>>> + kit->pos = NULL; >>> >>> I can't say I completely follow the logic (e.g., for >>> BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)? >>> Can you elabore the expected behavior for various combinations of >>> types and starting task argument? >>> >> >> Thanks for the review. >> >> The expected behavior of current implementation is: >> >> BPF_TASK_ITER_PROC: >> >> init_task->first_process->second_process->...->last_process->init_task >> >> We would exit before visiting init_task again. > > ah, ok, so in this case it's more like BPF_TASK_ITER_ALL_PROCS, i.e., > we iterate all processes in the system. Input `task` that we provide > is ignored/meaningless, right? Maybe we should express it as > ALL_PROCS? > >> >> BPF_TASK_ITER_THREAD: >> >> group_task->first_thread->second_thread->...->last_thread->group_task >> >> We would exit before visiting group_task again. >> > > And this one is iterating threads of a process specified by given > `task`, right? This is where my confusion comes from. ITER_PROC and > ITER_THREAD, by their name, seems to be very similar, but in reality > ITER_PROC is more like ITER_ALL (except process vs thread iteration), > while ITER_THREAD is parameterized by input `task`. > > I'm not sure what's the least confusing way to name and organize > everything, but I think it's quite confusing right now, unfortunately. > I wonder if you or someone else have a better suggestion on making > this more straightforward? > Maybe here we can introduce new enums and not reuse or rename BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID? { BPF_TASK_ITER_ALL_PROC, BPF_TASK_ITER_ALL_THREAD, BPF_TASK_ITER_THREAD } BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID are inner flags. Looking at the example usage of SEC("iter/task"), unlike BPF_CGROUP_ITER_DESCENDANTS_PRE/BPF_CGROUP_ITER_DESCENDANTS_POST, we actually don't use BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID directly. When using SEC("iter/task"), we just set pid/tid for struct bpf_iter_link_info. Exposing new enums to users for open coded task_iters will not confuse users. Thanks.
On Sun, Oct 1, 2023 at 1:21 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: > > Hello, Andrii > > 在 2023/9/30 05:27, Andrii Nakryiko 写道: > > On Wed, Sep 27, 2023 at 8:29 PM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: > >> > >> Hello, > >> > >> 在 2023/9/28 07:20, Andrii Nakryiko 写道: > >>> On Mon, Sep 25, 2023 at 3:56 AM Chuyi Zhou <zhouchuyi@bytedance.com> wrote: > >>>> > >>>> This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow > >>>> creation and manipulation of struct bpf_iter_task in open-coded iterator > >>>> style. BPF programs can use these kfuncs or through bpf_for_each macro to > >>>> iterate all processes in the system. > >>>> > >>>> The API design keep consistent with SEC("iter/task"). bpf_iter_task_new() > >>>> accepts a specific task and iterating type which allows: > >>>> 1. iterating all process in the system > >>>> > >>>> 2. iterating all threads in the system > >>>> > >>>> 3. iterating all threads of a specific task > >>>> Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID > >>>> to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC. > >>>> > >>>> The newly-added struct bpf_iter_task has a name collision with a selftest > >>>> for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is > >>>> renamed in order to avoid the collision. > >>>> > >>>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com> > >>>> --- > >>>> include/linux/bpf.h | 8 +- > >>>> kernel/bpf/helpers.c | 3 + > >>>> kernel/bpf/task_iter.c | 96 ++++++++++++++++--- > >>>> .../testing/selftests/bpf/bpf_experimental.h | 5 + > >>>> .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++-- > >>>> .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0 > >>>> 6 files changed, 106 insertions(+), 24 deletions(-) > >>>> rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%) > >>>> > >>> > > > [...] > > >>>> +get_next_task: > >>>> + kit->pos = next_task(kit->pos); > >>>> + kit->task = kit->pos; > >>>> + if (kit->pos == &init_task) > >>>> + kit->pos = NULL; > >>> > >>> I can't say I completely follow the logic (e.g., for > >>> BPF_TASK_ITER_PROC, why do we do next_task() on first next() call)? > >>> Can you elabore the expected behavior for various combinations of > >>> types and starting task argument? > >>> > >> > >> Thanks for the review. > >> > >> The expected behavior of current implementation is: > >> > >> BPF_TASK_ITER_PROC: > >> > >> init_task->first_process->second_process->...->last_process->init_task > >> > >> We would exit before visiting init_task again. > > > > ah, ok, so in this case it's more like BPF_TASK_ITER_ALL_PROCS, i.e., > > we iterate all processes in the system. Input `task` that we provide > > is ignored/meaningless, right? Maybe we should express it as > > ALL_PROCS? > > > >> > >> BPF_TASK_ITER_THREAD: > >> > >> group_task->first_thread->second_thread->...->last_thread->group_task > >> > >> We would exit before visiting group_task again. > >> > > > > And this one is iterating threads of a process specified by given > > `task`, right? This is where my confusion comes from. ITER_PROC and > > ITER_THREAD, by their name, seems to be very similar, but in reality > > ITER_PROC is more like ITER_ALL (except process vs thread iteration), > > while ITER_THREAD is parameterized by input `task`. > > > > I'm not sure what's the least confusing way to name and organize > > everything, but I think it's quite confusing right now, unfortunately. > > I wonder if you or someone else have a better suggestion on making > > this more straightforward? > > > > Maybe here we can introduce new enums and not reuse or rename > BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID? Yep, probably it's cleaner > > { > BPF_TASK_ITER_ALL_PROC, BPF_TASK_ITER_ALL_PROCS > BPF_TASK_ITER_ALL_THREAD, BPF_TASK_ITER_ALL_THREADS > BPF_TASK_ITER_THREAD BPF_TASK_ITER_PROC_THREADS ? > } > > BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID are inner flags. Looking at the > example usage of SEC("iter/task"), unlike > BPF_CGROUP_ITER_DESCENDANTS_PRE/BPF_CGROUP_ITER_DESCENDANTS_POST, we > actually don't use BPF_TASK_ITER_TID/BPF_TASK_ITER_TGID directly. When > using SEC("iter/task"), we just set pid/tid for struct > bpf_iter_link_info. Exposing new enums to users for open coded > task_iters will not confuse users. > > Thanks. >
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 87eeb3a46a1d..0ef5b7a59d62 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2194,16 +2194,16 @@ int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); * BPF_TASK_ITER_ALL (default) * Iterate over resources of every task. * - * BPF_TASK_ITER_TID + * BPF_TASK_ITER_THREAD * Iterate over resources of a task/tid. * - * BPF_TASK_ITER_TGID + * BPF_TASK_ITER_PROC * Iterate over resources of every task of a process / task group. */ enum bpf_iter_task_type { BPF_TASK_ITER_ALL = 0, - BPF_TASK_ITER_TID, - BPF_TASK_ITER_TGID, + BPF_TASK_ITER_THREAD, + BPF_TASK_ITER_PROC, }; struct bpf_iter_aux_info { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 189d158c9b7f..556262c27a75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2507,6 +2507,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 2cfcb4dd8a37..9bcd3f9922b1 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -94,7 +94,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co struct task_struct *task = NULL; struct pid *pid; - if (common->type == BPF_TASK_ITER_TID) { + if (common->type == BPF_TASK_ITER_THREAD) { if (*tid && *tid != common->pid) return NULL; rcu_read_lock(); @@ -108,7 +108,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co return task; } - if (common->type == BPF_TASK_ITER_TGID) { + if (common->type == BPF_TASK_ITER_PROC) { rcu_read_lock(); task = task_group_seq_get_next(common, tid, skip_if_dup_files); rcu_read_unlock(); @@ -217,15 +217,15 @@ static int bpf_iter_attach_task(struct bpf_prog *prog, aux->task.type = BPF_TASK_ITER_ALL; if (linfo->task.tid != 0) { - aux->task.type = BPF_TASK_ITER_TID; + aux->task.type = BPF_TASK_ITER_THREAD; aux->task.pid = linfo->task.tid; } if (linfo->task.pid != 0) { - aux->task.type = BPF_TASK_ITER_TGID; + aux->task.type = BPF_TASK_ITER_PROC; aux->task.pid = linfo->task.pid; } if (linfo->task.pid_fd != 0) { - aux->task.type = BPF_TASK_ITER_TGID; + aux->task.type = BPF_TASK_ITER_PROC; pid = pidfd_get_pid(linfo->task.pid_fd, &flags); if (IS_ERR(pid)) @@ -305,7 +305,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) rcu_read_unlock(); put_task_struct(curr_task); - if (info->common.type == BPF_TASK_ITER_TID) { + if (info->common.type == BPF_TASK_ITER_THREAD) { info->task = NULL; return NULL; } @@ -566,7 +566,7 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) return curr_vma; next_task: - if (info->common.type == BPF_TASK_ITER_TID) + if (info->common.type == BPF_TASK_ITER_THREAD) goto finish; put_task_struct(curr_task); @@ -677,10 +677,10 @@ static const struct bpf_iter_seq_info task_seq_info = { static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info) { switch (aux->task.type) { - case BPF_TASK_ITER_TID: + case BPF_TASK_ITER_THREAD: info->iter.task.tid = aux->task.pid; break; - case BPF_TASK_ITER_TGID: + case BPF_TASK_ITER_PROC: info->iter.task.pid = aux->task.pid; break; default: @@ -692,9 +692,9 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) { seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); - if (aux->task.type == BPF_TASK_ITER_TID) + if (aux->task.type == BPF_TASK_ITER_THREAD) seq_printf(seq, "tid:\t%u\n", aux->task.pid); - else if (aux->task.type == BPF_TASK_ITER_TGID) + else if (aux->task.type == BPF_TASK_ITER_PROC) seq_printf(seq, "pid:\t%u\n", aux->task.pid); } @@ -856,6 +856,80 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) bpf_mem_free(&bpf_global_ma, kit->css_it); } +struct bpf_iter_task { + __u64 __opaque[2]; + __u32 __opaque_int[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_task_kern { + struct task_struct *task; + struct task_struct *pos; + unsigned int type; +} __attribute__((aligned(8))); + +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) +{ + struct bpf_iter_task_kern *kit = (void *)it; + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) != sizeof(struct bpf_iter_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != + __alignof__(struct bpf_iter_task)); + kit->task = kit->pos = NULL; + switch (type) { + case BPF_TASK_ITER_ALL: + case BPF_TASK_ITER_PROC: + case BPF_TASK_ITER_THREAD: + break; + default: + return -EINVAL; + } + + if (type == BPF_TASK_ITER_THREAD) + kit->task = task; + else + kit->task = &init_task; + kit->pos = kit->task; + kit->type = type; + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) +{ + struct bpf_iter_task_kern *kit = (void *)it; + struct task_struct *pos; + unsigned int type; + + type = kit->type; + pos = kit->pos; + + if (!pos) + goto out; + + if (type == BPF_TASK_ITER_PROC) + goto get_next_task; + + kit->pos = next_thread(kit->pos); + if (kit->pos == kit->task) { + if (type == BPF_TASK_ITER_THREAD) { + kit->pos = NULL; + goto out; + } + } else + goto out; + +get_next_task: + kit->pos = next_task(kit->pos); + kit->task = kit->pos; + if (kit->pos == &init_task) + kit->pos = NULL; + +out: + return pos; +} + +__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it) +{ +} + DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); static void do_mmap_read_unlock(struct irq_work *entry) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index d3ea90f0e142..d989775dbdb5 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -169,4 +169,9 @@ extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it, extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym; extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym; +struct bpf_iter_task; +extern int bpf_iter_task_new(struct bpf_iter_task *it, struct task_struct *task, unsigned int type) __weak __ksym; +extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym; +extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym; + #endif diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 1f02168103dd..dc60e8e125cd 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -7,7 +7,7 @@ #include "bpf_iter_ipv6_route.skel.h" #include "bpf_iter_netlink.skel.h" #include "bpf_iter_bpf_map.skel.h" -#include "bpf_iter_task.skel.h" +#include "bpf_iter_tasks.skel.h" #include "bpf_iter_task_stack.skel.h" #include "bpf_iter_task_file.skel.h" #include "bpf_iter_task_vma.skel.h" @@ -215,12 +215,12 @@ static void *do_nothing_wait(void *arg) static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts, int *num_unknown, int *num_known) { - struct bpf_iter_task *skel; + struct bpf_iter_tasks *skel; pthread_t thread_id; void *ret; - skel = bpf_iter_task__open_and_load(); - if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load")) + skel = bpf_iter_tasks__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_tasks__open_and_load")) return; ASSERT_OK(pthread_mutex_lock(&do_nothing_mutex), "pthread_mutex_lock"); @@ -239,7 +239,7 @@ static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts, ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL, "pthread_join"); - bpf_iter_task__destroy(skel); + bpf_iter_tasks__destroy(skel); } static void test_task_common(struct bpf_iter_attach_opts *opts, int num_unknown, int num_known) @@ -307,10 +307,10 @@ static void test_task_pidfd(void) static void test_task_sleepable(void) { - struct bpf_iter_task *skel; + struct bpf_iter_tasks *skel; - skel = bpf_iter_task__open_and_load(); - if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load")) + skel = bpf_iter_tasks__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_tasks__open_and_load")) return; do_dummy_read(skel->progs.dump_task_sleepable); @@ -320,7 +320,7 @@ static void test_task_sleepable(void) ASSERT_GT(skel->bss->num_success_copy_from_user_task, 0, "num_success_copy_from_user_task"); - bpf_iter_task__destroy(skel); + bpf_iter_tasks__destroy(skel); } static void test_task_stack(void) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_tasks.c similarity index 100% rename from tools/testing/selftests/bpf/progs/bpf_iter_task.c rename to tools/testing/selftests/bpf/progs/bpf_iter_tasks.c
This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow creation and manipulation of struct bpf_iter_task in open-coded iterator style. BPF programs can use these kfuncs or through bpf_for_each macro to iterate all processes in the system. The API design keep consistent with SEC("iter/task"). bpf_iter_task_new() accepts a specific task and iterating type which allows: 1. iterating all process in the system 2. iterating all threads in the system 3. iterating all threads of a specific task Here we also resuse enum bpf_iter_task_type and rename BPF_TASK_ITER_TID to BPF_TASK_ITER_THREAD, rename BPF_TASK_ITER_TGID to BPF_TASK_ITER_PROC. The newly-added struct bpf_iter_task has a name collision with a selftest for the seq_file task iter's bpf skel, so the selftests/bpf/progs file is renamed in order to avoid the collision. Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com> --- include/linux/bpf.h | 8 +- kernel/bpf/helpers.c | 3 + kernel/bpf/task_iter.c | 96 ++++++++++++++++--- .../testing/selftests/bpf/bpf_experimental.h | 5 + .../selftests/bpf/prog_tests/bpf_iter.c | 18 ++-- .../{bpf_iter_task.c => bpf_iter_tasks.c} | 0 6 files changed, 106 insertions(+), 24 deletions(-) rename tools/testing/selftests/bpf/progs/{bpf_iter_task.c => bpf_iter_tasks.c} (100%)