diff mbox series

[bpf-next,v2,1/4] bpf/crib: Introduce task_file open-coded iterator kfuncs

Message ID AM6PR03MB584846B635B10C59EFAF596099542@AM6PR03MB5848.eurprd03.prod.outlook.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf/crib: Add open-coded style process file iterator and file related CRIB kfuncs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 5 this patch: 7
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 14 of 14 maintainers
netdev/build_clang fail Errors and warnings before: 4 this patch: 6
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 8 this patch: 14
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 85 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 0 this patch: 5
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Juntong Deng Oct. 30, 2024, 12:14 a.m. UTC
This patch adds the open-coded iterator style process file iterator
kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
files opened by the specified process.

In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
the file descriptor corresponding to the file in the current iteration.

The reference to struct file acquired by the previous
bpf_iter_task_file_next() is released in the next
bpf_iter_task_file_next(), and the last reference is released in the
last bpf_iter_task_file_next() that returns NULL.

In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
the end, then the last struct file reference is released at this time.

Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
---
 kernel/bpf/Makefile      |   1 +
 kernel/bpf/crib/Makefile |   3 ++
 kernel/bpf/crib/crib.c   |  29 +++++++++++
 kernel/bpf/crib/files.c  | 105 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+)
 create mode 100644 kernel/bpf/crib/Makefile
 create mode 100644 kernel/bpf/crib/crib.c
 create mode 100644 kernel/bpf/crib/files.c

Comments

Andrii Nakryiko Nov. 1, 2024, 7:06 p.m. UTC | #1
On Tue, Oct 29, 2024 at 5:15 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> This patch adds the open-coded iterator style process file iterator
> kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
> files opened by the specified process.
>
> In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
> the file descriptor corresponding to the file in the current iteration.
>
> The reference to struct file acquired by the previous
> bpf_iter_task_file_next() is released in the next
> bpf_iter_task_file_next(), and the last reference is released in the
> last bpf_iter_task_file_next() that returns NULL.
>
> In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
> the end, then the last struct file reference is released at this time.
>
> Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
> ---
>  kernel/bpf/Makefile      |   1 +
>  kernel/bpf/crib/Makefile |   3 ++
>  kernel/bpf/crib/crib.c   |  29 +++++++++++
>  kernel/bpf/crib/files.c  | 105 +++++++++++++++++++++++++++++++++++++++
>  4 files changed, 138 insertions(+)
>  create mode 100644 kernel/bpf/crib/Makefile
>  create mode 100644 kernel/bpf/crib/crib.c
>  create mode 100644 kernel/bpf/crib/files.c
>
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 105328f0b9c0..933d36264e5e 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -53,3 +53,4 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
>  obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += crib/
> diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile
> new file mode 100644
> index 000000000000..4e1bae1972dd
> --- /dev/null
> +++ b/kernel/bpf/crib/Makefile
> @@ -0,0 +1,3 @@
> +# SPDX-License-Identifier: GPL-2.0
> +
> +obj-$(CONFIG_BPF_SYSCALL) += crib.o files.o
> diff --git a/kernel/bpf/crib/crib.c b/kernel/bpf/crib/crib.c
> new file mode 100644
> index 000000000000..e6536ee9a845
> --- /dev/null
> +++ b/kernel/bpf/crib/crib.c
> @@ -0,0 +1,29 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Checkpoint/Restore In eBPF (CRIB)
> + */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <linux/btf_ids.h>
> +
> +BTF_KFUNCS_START(bpf_crib_kfuncs)
> +
> +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)

This is in no way CRIB-specific, right? So I'd drop the CRIB reference
and move code next to task_file BPF iterator program type
implementation, this is a generic functionality.

Even more so, given Namhyung's recent work on adding kmem_cache
iterator (both program type and open-coded iterator), it seems like it
should be possible to cut down on code duplication by using open-coded
iterator logic inside the BPF iterator program. Now that you are
adding task_file open-coded iterator, can you please check if it can
be reused. See kernel/bpf/task_iter.c (and I think that's where your
code should live as well, btw).

pw-bot: cr

> +
> +BTF_KFUNCS_END(bpf_crib_kfuncs)
> +
> +static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
> +       .owner = THIS_MODULE,
> +       .set   = &bpf_crib_kfuncs,
> +};
> +
> +static int __init bpf_crib_init(void)
> +{
> +       return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_crib_kfunc_set);
> +}
> +
> +late_initcall(bpf_crib_init);
> diff --git a/kernel/bpf/crib/files.c b/kernel/bpf/crib/files.c
> new file mode 100644
> index 000000000000..ececf150303f
> --- /dev/null
> +++ b/kernel/bpf/crib/files.c
> @@ -0,0 +1,105 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/btf.h>
> +#include <linux/file.h>
> +#include <linux/fdtable.h>
> +#include <linux/net.h>
> +
> +struct bpf_iter_task_file {
> +       __u64 __opaque[3];
> +} __aligned(8);
> +
> +struct bpf_iter_task_file_kern {
> +       struct task_struct *task;
> +       struct file *file;
> +       int fd;
> +} __aligned(8);
> +
> +__bpf_kfunc_start_defs();
> +
> +/**
> + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
> + * used to iterate over all files opened by a specified task
> + *
> + * @it: the new bpf_iter_task_file to be created
> + * @task: a pointer pointing to a task to be iterated over
> + */
> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> +               struct task_struct *task)
> +{
> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> +                    __alignof__(struct bpf_iter_task_file));
> +
> +       kit->task = task;
> +       kit->fd = -1;
> +       kit->file = NULL;
> +
> +       return 0;
> +}
> +
> +/**
> + * bpf_iter_task_file_next() - Get the next file in bpf_iter_task_file
> + *
> + * bpf_iter_task_file_next acquires a reference to the returned struct file.
> + *
> + * The reference to struct file acquired by the previous
> + * bpf_iter_task_file_next() is released in the next bpf_iter_task_file_next(),
> + * and the last reference is released in the last bpf_iter_task_file_next()
> + * that returns NULL.
> + *
> + * @it: the bpf_iter_task_file to be checked
> + *
> + * @returns a pointer to the struct file of the next file if further files
> + * are available, otherwise returns NULL
> + */
> +__bpf_kfunc struct file *bpf_iter_task_file_next(struct bpf_iter_task_file *it)
> +{
> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> +
> +       if (kit->file)
> +               fput(kit->file);
> +
> +       kit->fd++;
> +
> +       rcu_read_lock();
> +       kit->file = task_lookup_next_fdget_rcu(kit->task, &kit->fd);
> +       rcu_read_unlock();
> +
> +       return kit->file;
> +}
> +
> +/**
> + * bpf_iter_task_file_get_fd() - Get the file descriptor corresponding to
> + * the file in the current iteration
> + *
> + * @it: the bpf_iter_task_file to be checked
> + *
> + * @returns the file descriptor
> + */
> +__bpf_kfunc int bpf_iter_task_file_get_fd(struct bpf_iter_task_file *it__iter)
> +{
> +       struct bpf_iter_task_file_kern *kit = (void *)it__iter;
> +
> +       return kit->fd;
> +}
> +

I don't think we need this. It's probably better to return a pointer
to a small struct representing "item" returned from this iterator.
Something like

struct bpf_iter_task_file_item {
    struct task_struct *task;
    struct file *file;
    int fd;
};

You can then embed this struct into struct bpf_iter_task_file and
return a pointer to it on each next() call (avoiding memory
allocation)


(naming just for illustrative purposes, I spent 0 seconds thinking about it)

> +/**
> + * bpf_iter_task_file_destroy() - Destroy a bpf_iter_task_file
> + *
> + * If the iterator does not iterate to the end, then the last
> + * struct file reference is released at this time.
> + *
> + * @it: the bpf_iter_task_file to be destroyed
> + */
> +__bpf_kfunc void bpf_iter_task_file_destroy(struct bpf_iter_task_file *it)
> +{
> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> +
> +       if (kit->file)
> +               fput(kit->file);
> +}
> +
> +__bpf_kfunc_end_defs();
> --
> 2.39.5
>
Juntong Deng Nov. 1, 2024, 8:22 p.m. UTC | #2
On 2024/11/1 19:06, Andrii Nakryiko wrote:
> On Tue, Oct 29, 2024 at 5:15 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>>
>> This patch adds the open-coded iterator style process file iterator
>> kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
>> files opened by the specified process.
>>
>> In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
>> the file descriptor corresponding to the file in the current iteration.
>>
>> The reference to struct file acquired by the previous
>> bpf_iter_task_file_next() is released in the next
>> bpf_iter_task_file_next(), and the last reference is released in the
>> last bpf_iter_task_file_next() that returns NULL.
>>
>> In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
>> the end, then the last struct file reference is released at this time.
>>
>> Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
>> ---
>>   kernel/bpf/Makefile      |   1 +
>>   kernel/bpf/crib/Makefile |   3 ++
>>   kernel/bpf/crib/crib.c   |  29 +++++++++++
>>   kernel/bpf/crib/files.c  | 105 +++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 138 insertions(+)
>>   create mode 100644 kernel/bpf/crib/Makefile
>>   create mode 100644 kernel/bpf/crib/crib.c
>>   create mode 100644 kernel/bpf/crib/files.c
>>
>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>> index 105328f0b9c0..933d36264e5e 100644
>> --- a/kernel/bpf/Makefile
>> +++ b/kernel/bpf/Makefile
>> @@ -53,3 +53,4 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
>>   obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
>>   obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
>>   obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
>> +obj-$(CONFIG_BPF_SYSCALL) += crib/
>> diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile
>> new file mode 100644
>> index 000000000000..4e1bae1972dd
>> --- /dev/null
>> +++ b/kernel/bpf/crib/Makefile
>> @@ -0,0 +1,3 @@
>> +# SPDX-License-Identifier: GPL-2.0
>> +
>> +obj-$(CONFIG_BPF_SYSCALL) += crib.o files.o
>> diff --git a/kernel/bpf/crib/crib.c b/kernel/bpf/crib/crib.c
>> new file mode 100644
>> index 000000000000..e6536ee9a845
>> --- /dev/null
>> +++ b/kernel/bpf/crib/crib.c
>> @@ -0,0 +1,29 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Checkpoint/Restore In eBPF (CRIB)
>> + */
>> +
>> +#include <linux/bpf.h>
>> +#include <linux/btf.h>
>> +#include <linux/btf_ids.h>
>> +
>> +BTF_KFUNCS_START(bpf_crib_kfuncs)
>> +
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
> 
> This is in no way CRIB-specific, right? So I'd drop the CRIB reference
> and move code next to task_file BPF iterator program type
> implementation, this is a generic functionality.
> 
> Even more so, given Namhyung's recent work on adding kmem_cache
> iterator (both program type and open-coded iterator), it seems like it
> should be possible to cut down on code duplication by using open-coded
> iterator logic inside the BPF iterator program. Now that you are
> adding task_file open-coded iterator, can you please check if it can
> be reused. See kernel/bpf/task_iter.c (and I think that's where your
> code should live as well, btw).
> 
> pw-bot: cr
> 

Thanks for your reply!

Yes, I agree that it would be better to put the task_file open-coded
iterator together with the traditional task_file iterator (in the
same file).

I will move it in the next patch series.

>> +
>> +BTF_KFUNCS_END(bpf_crib_kfuncs)
>> +
>> +static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
>> +       .owner = THIS_MODULE,
>> +       .set   = &bpf_crib_kfuncs,
>> +};
>> +
>> +static int __init bpf_crib_init(void)
>> +{
>> +       return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_crib_kfunc_set);
>> +}
>> +
>> +late_initcall(bpf_crib_init);
>> diff --git a/kernel/bpf/crib/files.c b/kernel/bpf/crib/files.c
>> new file mode 100644
>> index 000000000000..ececf150303f
>> --- /dev/null
>> +++ b/kernel/bpf/crib/files.c
>> @@ -0,0 +1,105 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include <linux/btf.h>
>> +#include <linux/file.h>
>> +#include <linux/fdtable.h>
>> +#include <linux/net.h>
>> +
>> +struct bpf_iter_task_file {
>> +       __u64 __opaque[3];
>> +} __aligned(8);
>> +
>> +struct bpf_iter_task_file_kern {
>> +       struct task_struct *task;
>> +       struct file *file;
>> +       int fd;
>> +} __aligned(8);
>> +
>> +__bpf_kfunc_start_defs();
>> +
>> +/**
>> + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
>> + * used to iterate over all files opened by a specified task
>> + *
>> + * @it: the new bpf_iter_task_file to be created
>> + * @task: a pointer pointing to a task to be iterated over
>> + */
>> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
>> +               struct task_struct *task)
>> +{
>> +       struct bpf_iter_task_file_kern *kit = (void *)it;
>> +
>> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
>> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
>> +                    __alignof__(struct bpf_iter_task_file));
>> +
>> +       kit->task = task;
>> +       kit->fd = -1;
>> +       kit->file = NULL;
>> +
>> +       return 0;
>> +}
>> +
>> +/**
>> + * bpf_iter_task_file_next() - Get the next file in bpf_iter_task_file
>> + *
>> + * bpf_iter_task_file_next acquires a reference to the returned struct file.
>> + *
>> + * The reference to struct file acquired by the previous
>> + * bpf_iter_task_file_next() is released in the next bpf_iter_task_file_next(),
>> + * and the last reference is released in the last bpf_iter_task_file_next()
>> + * that returns NULL.
>> + *
>> + * @it: the bpf_iter_task_file to be checked
>> + *
>> + * @returns a pointer to the struct file of the next file if further files
>> + * are available, otherwise returns NULL
>> + */
>> +__bpf_kfunc struct file *bpf_iter_task_file_next(struct bpf_iter_task_file *it)
>> +{
>> +       struct bpf_iter_task_file_kern *kit = (void *)it;
>> +
>> +       if (kit->file)
>> +               fput(kit->file);
>> +
>> +       kit->fd++;
>> +
>> +       rcu_read_lock();
>> +       kit->file = task_lookup_next_fdget_rcu(kit->task, &kit->fd);
>> +       rcu_read_unlock();
>> +
>> +       return kit->file;
>> +}
>> +
>> +/**
>> + * bpf_iter_task_file_get_fd() - Get the file descriptor corresponding to
>> + * the file in the current iteration
>> + *
>> + * @it: the bpf_iter_task_file to be checked
>> + *
>> + * @returns the file descriptor
>> + */
>> +__bpf_kfunc int bpf_iter_task_file_get_fd(struct bpf_iter_task_file *it__iter)
>> +{
>> +       struct bpf_iter_task_file_kern *kit = (void *)it__iter;
>> +
>> +       return kit->fd;
>> +}
>> +
> 
> I don't think we need this. It's probably better to return a pointer
> to a small struct representing "item" returned from this iterator.
> Something like
> 
> struct bpf_iter_task_file_item {
>      struct task_struct *task;
>      struct file *file;
>      int fd;
> };
> 
> You can then embed this struct into struct bpf_iter_task_file and
> return a pointer to it on each next() call (avoiding memory
> allocation)
> 
> 
> (naming just for illustrative purposes, I spent 0 seconds thinking about it)
> 

Yes, I agree that it is feasible.

But there is a question here, should we expose the internal state
structure of the iterator (If we want to embed) ?

I guess that we need two versions of data structures struct bpf_iter_xxx
and struct bpf_iter_xxx_kern is for the purpose of encapsulation?

With two versions of the data structure, users can only manipulate
the iterator using the iterator kfuncs, avoiding users from directly
accessing the internal state.

After we decide to return struct bpf_iter_task_file_item, these members
will not be able to change and users can directly access/change the
internal state of the iterator.

>> +/**
>> + * bpf_iter_task_file_destroy() - Destroy a bpf_iter_task_file
>> + *
>> + * If the iterator does not iterate to the end, then the last
>> + * struct file reference is released at this time.
>> + *
>> + * @it: the bpf_iter_task_file to be destroyed
>> + */
>> +__bpf_kfunc void bpf_iter_task_file_destroy(struct bpf_iter_task_file *it)
>> +{
>> +       struct bpf_iter_task_file_kern *kit = (void *)it;
>> +
>> +       if (kit->file)
>> +               fput(kit->file);
>> +}
>> +
>> +__bpf_kfunc_end_defs();
>> --
>> 2.39.5
>>
Andrii Nakryiko Nov. 6, 2024, 8:02 p.m. UTC | #3
On Fri, Nov 1, 2024 at 1:22 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> On 2024/11/1 19:06, Andrii Nakryiko wrote:
> > On Tue, Oct 29, 2024 at 5:15 PM Juntong Deng <juntong.deng@outlook.com> wrote:
> >>
> >> This patch adds the open-coded iterator style process file iterator
> >> kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
> >> files opened by the specified process.
> >>
> >> In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
> >> the file descriptor corresponding to the file in the current iteration.
> >>
> >> The reference to struct file acquired by the previous
> >> bpf_iter_task_file_next() is released in the next
> >> bpf_iter_task_file_next(), and the last reference is released in the
> >> last bpf_iter_task_file_next() that returns NULL.
> >>
> >> In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
> >> the end, then the last struct file reference is released at this time.
> >>
> >> Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
> >> ---
> >>   kernel/bpf/Makefile      |   1 +
> >>   kernel/bpf/crib/Makefile |   3 ++
> >>   kernel/bpf/crib/crib.c   |  29 +++++++++++
> >>   kernel/bpf/crib/files.c  | 105 +++++++++++++++++++++++++++++++++++++++
> >>   4 files changed, 138 insertions(+)
> >>   create mode 100644 kernel/bpf/crib/Makefile
> >>   create mode 100644 kernel/bpf/crib/crib.c
> >>   create mode 100644 kernel/bpf/crib/files.c
> >>
> >> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> >> index 105328f0b9c0..933d36264e5e 100644
> >> --- a/kernel/bpf/Makefile
> >> +++ b/kernel/bpf/Makefile
> >> @@ -53,3 +53,4 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
> >>   obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
> >> +obj-$(CONFIG_BPF_SYSCALL) += crib/
> >> diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile
> >> new file mode 100644
> >> index 000000000000..4e1bae1972dd
> >> --- /dev/null
> >> +++ b/kernel/bpf/crib/Makefile
> >> @@ -0,0 +1,3 @@
> >> +# SPDX-License-Identifier: GPL-2.0
> >> +
> >> +obj-$(CONFIG_BPF_SYSCALL) += crib.o files.o
> >> diff --git a/kernel/bpf/crib/crib.c b/kernel/bpf/crib/crib.c
> >> new file mode 100644
> >> index 000000000000..e6536ee9a845
> >> --- /dev/null
> >> +++ b/kernel/bpf/crib/crib.c
> >> @@ -0,0 +1,29 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/*
> >> + * Checkpoint/Restore In eBPF (CRIB)
> >> + */
> >> +
> >> +#include <linux/bpf.h>
> >> +#include <linux/btf.h>
> >> +#include <linux/btf_ids.h>
> >> +
> >> +BTF_KFUNCS_START(bpf_crib_kfuncs)
> >> +
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
> >
> > This is in no way CRIB-specific, right? So I'd drop the CRIB reference
> > and move code next to task_file BPF iterator program type
> > implementation, this is a generic functionality.
> >
> > Even more so, given Namhyung's recent work on adding kmem_cache
> > iterator (both program type and open-coded iterator), it seems like it
> > should be possible to cut down on code duplication by using open-coded
> > iterator logic inside the BPF iterator program. Now that you are
> > adding task_file open-coded iterator, can you please check if it can
> > be reused. See kernel/bpf/task_iter.c (and I think that's where your
> > code should live as well, btw).
> >
> > pw-bot: cr
> >
>
> Thanks for your reply!
>
> Yes, I agree that it would be better to put the task_file open-coded
> iterator together with the traditional task_file iterator (in the
> same file).
>
> I will move it in the next patch series.
>
> >> +
> >> +BTF_KFUNCS_END(bpf_crib_kfuncs)
> >> +
> >> +static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
> >> +       .owner = THIS_MODULE,
> >> +       .set   = &bpf_crib_kfuncs,
> >> +};
> >> +
> >> +static int __init bpf_crib_init(void)
> >> +{
> >> +       return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_crib_kfunc_set);
> >> +}
> >> +
> >> +late_initcall(bpf_crib_init);
> >> diff --git a/kernel/bpf/crib/files.c b/kernel/bpf/crib/files.c
> >> new file mode 100644
> >> index 000000000000..ececf150303f
> >> --- /dev/null
> >> +++ b/kernel/bpf/crib/files.c
> >> @@ -0,0 +1,105 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +
> >> +#include <linux/btf.h>
> >> +#include <linux/file.h>
> >> +#include <linux/fdtable.h>
> >> +#include <linux/net.h>
> >> +
> >> +struct bpf_iter_task_file {
> >> +       __u64 __opaque[3];
> >> +} __aligned(8);
> >> +
> >> +struct bpf_iter_task_file_kern {
> >> +       struct task_struct *task;
> >> +       struct file *file;
> >> +       int fd;
> >> +} __aligned(8);
> >> +
> >> +__bpf_kfunc_start_defs();
> >> +
> >> +/**
> >> + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
> >> + * used to iterate over all files opened by a specified task
> >> + *
> >> + * @it: the new bpf_iter_task_file to be created
> >> + * @task: a pointer pointing to a task to be iterated over
> >> + */
> >> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> >> +               struct task_struct *task)
> >> +{
> >> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> >> +
> >> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> >> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> >> +                    __alignof__(struct bpf_iter_task_file));
> >> +
> >> +       kit->task = task;
> >> +       kit->fd = -1;
> >> +       kit->file = NULL;
> >> +
> >> +       return 0;
> >> +}
> >> +
> >> +/**
> >> + * bpf_iter_task_file_next() - Get the next file in bpf_iter_task_file
> >> + *
> >> + * bpf_iter_task_file_next acquires a reference to the returned struct file.
> >> + *
> >> + * The reference to struct file acquired by the previous
> >> + * bpf_iter_task_file_next() is released in the next bpf_iter_task_file_next(),
> >> + * and the last reference is released in the last bpf_iter_task_file_next()
> >> + * that returns NULL.
> >> + *
> >> + * @it: the bpf_iter_task_file to be checked
> >> + *
> >> + * @returns a pointer to the struct file of the next file if further files
> >> + * are available, otherwise returns NULL
> >> + */
> >> +__bpf_kfunc struct file *bpf_iter_task_file_next(struct bpf_iter_task_file *it)
> >> +{
> >> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> >> +
> >> +       if (kit->file)
> >> +               fput(kit->file);
> >> +
> >> +       kit->fd++;
> >> +
> >> +       rcu_read_lock();
> >> +       kit->file = task_lookup_next_fdget_rcu(kit->task, &kit->fd);
> >> +       rcu_read_unlock();
> >> +
> >> +       return kit->file;
> >> +}
> >> +
> >> +/**
> >> + * bpf_iter_task_file_get_fd() - Get the file descriptor corresponding to
> >> + * the file in the current iteration
> >> + *
> >> + * @it: the bpf_iter_task_file to be checked
> >> + *
> >> + * @returns the file descriptor
> >> + */
> >> +__bpf_kfunc int bpf_iter_task_file_get_fd(struct bpf_iter_task_file *it__iter)
> >> +{
> >> +       struct bpf_iter_task_file_kern *kit = (void *)it__iter;
> >> +
> >> +       return kit->fd;
> >> +}
> >> +
> >
> > I don't think we need this. It's probably better to return a pointer
> > to a small struct representing "item" returned from this iterator.
> > Something like
> >
> > struct bpf_iter_task_file_item {
> >      struct task_struct *task;
> >      struct file *file;
> >      int fd;
> > };
> >
> > You can then embed this struct into struct bpf_iter_task_file and
> > return a pointer to it on each next() call (avoiding memory
> > allocation)
> >
> >
> > (naming just for illustrative purposes, I spent 0 seconds thinking about it)
> >
>
> Yes, I agree that it is feasible.
>
> But there is a question here, should we expose the internal state
> structure of the iterator (If we want to embed) ?
>
> I guess that we need two versions of data structures struct bpf_iter_xxx
> and struct bpf_iter_xxx_kern is for the purpose of encapsulation?

Yes, that's what we do for iterator state structure, and you should do
that as well. bpf_iter_xxx one will be opaque (see other examples, we
literally add `u64 __opaque[N];` there).

But this bpf_iter_task_file_item will be sort of internal API that is
returned from bpf_iter_task_file_next(). So you'll have something like

struct bpf_iter_task_file {
    .... additional state ...
    struct bpf_iter_task_file_item item;
};

then you have

struct bpf_iter_task_file_item bpf_iter_task_file_next(struct
bpf_iter_task_file *it)
{
    struct bpf_iter_task_file_kern *kit = (void *)it;

    ...
    kit->item.task = <sometask>;
    kit->item.file = <file>; /* and so on */

    return &kit->item;
}

>
> With two versions of the data structure, users can only manipulate
> the iterator using the iterator kfuncs, avoiding users from directly
> accessing the internal state.
>
> After we decide to return struct bpf_iter_task_file_item, these members
> will not be able to change and users can directly access/change the
> internal state of the iterator.

Yes, you have to carefully set up bpf_iter_task_file_item, but you
could expand it in the future without breaking any old users, because
you only return it by pointer and with BPF CO-RE all the field shifts
will be correctly handled.

>
> >> +/**
> >> + * bpf_iter_task_file_destroy() - Destroy a bpf_iter_task_file
> >> + *
> >> + * If the iterator does not iterate to the end, then the last
> >> + * struct file reference is released at this time.
> >> + *
> >> + * @it: the bpf_iter_task_file to be destroyed
> >> + */
> >> +__bpf_kfunc void bpf_iter_task_file_destroy(struct bpf_iter_task_file *it)
> >> +{
> >> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> >> +
> >> +       if (kit->file)
> >> +               fput(kit->file);
> >> +}
> >> +
> >> +__bpf_kfunc_end_defs();
> >> --
> >> 2.39.5
> >>
>
Juntong Deng Nov. 6, 2024, 9:05 p.m. UTC | #4
On 2024/11/6 20:02, Andrii Nakryiko wrote:

> On Fri, Nov 1, 2024 at 1:22 PM Juntong Deng <juntong.deng@outlook.com> wrote:

>> Yes, users can access file->f_op, but there seems to be no way for
>> users to get references to struct file_operations for the various file
>> types? For example, how does a user get a reference to socket_file_ops?
> 
> See [0]. Libbpf will find it for the BPF program from kallsyms.
> 
>    [0] https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/progs/test_ksyms.c#L13-L18
> 

Thanks for telling me this method.

Yes, then we don't need bpf_get_file_ops_type().

>>
>> Yes, I agree that it is feasible.
>>
>> But there is a question here, should we expose the internal state
>> structure of the iterator (If we want to embed) ?
>>
>> I guess that we need two versions of data structures struct bpf_iter_xxx
>> and struct bpf_iter_xxx_kern is for the purpose of encapsulation?
> 
> Yes, that's what we do for iterator state structure, and you should do
> that as well. bpf_iter_xxx one will be opaque (see other examples, we
> literally add `u64 __opaque[N];` there).
> 
> But this bpf_iter_task_file_item will be sort of internal API that is
> returned from bpf_iter_task_file_next(). So you'll have something like
> 
> struct bpf_iter_task_file {
>      .... additional state ...
>      struct bpf_iter_task_file_item item;
> };
> 
> then you have
> 
> struct bpf_iter_task_file_item bpf_iter_task_file_next(struct
> bpf_iter_task_file *it)
> {
>      struct bpf_iter_task_file_kern *kit = (void *)it;
> 
>      ...
>      kit->item.task = <sometask>;
>      kit->item.file = <file>; /* and so on */
> 
>      return &kit->item;
> }
> 
>>
>> With two versions of the data structure, users can only manipulate
>> the iterator using the iterator kfuncs, avoiding users from directly
>> accessing the internal state.
>>
>> After we decide to return struct bpf_iter_task_file_item, these members
>> will not be able to change and users can directly access/change the
>> internal state of the iterator.
> 
> Yes, you have to carefully set up bpf_iter_task_file_item, but you
> could expand it in the future without breaking any old users, because
> you only return it by pointer and with BPF CO-RE all the field shifts
> will be correctly handled.
> 

You are right, in the next patch series version I will use
struct bpf_iter_task_file_item.

Could you please also give some feedback on
"2. Where should we put CRIB related kfuncs?"
in the discussion section of the v3 patch series
cover letter [0]?

[0]: 
https://lore.kernel.org/bpf/AM6PR03MB58488FD29EB0D0B89D52AABB99532@AM6PR03MB5848.eurprd03.prod.outlook.com/T/#t

Then I can fix all the things in the v4 patch series.

Many thanks.
diff mbox series

Patch

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 105328f0b9c0..933d36264e5e 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -53,3 +53,4 @@  obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
 obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
 obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += crib/
diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile
new file mode 100644
index 000000000000..4e1bae1972dd
--- /dev/null
+++ b/kernel/bpf/crib/Makefile
@@ -0,0 +1,3 @@ 
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_BPF_SYSCALL) += crib.o files.o
diff --git a/kernel/bpf/crib/crib.c b/kernel/bpf/crib/crib.c
new file mode 100644
index 000000000000..e6536ee9a845
--- /dev/null
+++ b/kernel/bpf/crib/crib.c
@@ -0,0 +1,29 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB)
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
+BTF_KFUNCS_START(bpf_crib_kfuncs)
+
+BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
+BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
+
+BTF_KFUNCS_END(bpf_crib_kfuncs)
+
+static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_crib_kfuncs,
+};
+
+static int __init bpf_crib_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_crib_kfunc_set);
+}
+
+late_initcall(bpf_crib_init);
diff --git a/kernel/bpf/crib/files.c b/kernel/bpf/crib/files.c
new file mode 100644
index 000000000000..ececf150303f
--- /dev/null
+++ b/kernel/bpf/crib/files.c
@@ -0,0 +1,105 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/btf.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/net.h>
+
+struct bpf_iter_task_file {
+	__u64 __opaque[3];
+} __aligned(8);
+
+struct bpf_iter_task_file_kern {
+	struct task_struct *task;
+	struct file *file;
+	int fd;
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
+ * used to iterate over all files opened by a specified task
+ *
+ * @it: the new bpf_iter_task_file to be created
+ * @task: a pointer pointing to a task to be iterated over
+ */
+__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
+		struct task_struct *task)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
+		     __alignof__(struct bpf_iter_task_file));
+
+	kit->task = task;
+	kit->fd = -1;
+	kit->file = NULL;
+
+	return 0;
+}
+
+/**
+ * bpf_iter_task_file_next() - Get the next file in bpf_iter_task_file
+ *
+ * bpf_iter_task_file_next acquires a reference to the returned struct file.
+ *
+ * The reference to struct file acquired by the previous
+ * bpf_iter_task_file_next() is released in the next bpf_iter_task_file_next(),
+ * and the last reference is released in the last bpf_iter_task_file_next()
+ * that returns NULL.
+ *
+ * @it: the bpf_iter_task_file to be checked
+ *
+ * @returns a pointer to the struct file of the next file if further files
+ * are available, otherwise returns NULL
+ */
+__bpf_kfunc struct file *bpf_iter_task_file_next(struct bpf_iter_task_file *it)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it;
+
+	if (kit->file)
+		fput(kit->file);
+
+	kit->fd++;
+
+	rcu_read_lock();
+	kit->file = task_lookup_next_fdget_rcu(kit->task, &kit->fd);
+	rcu_read_unlock();
+
+	return kit->file;
+}
+
+/**
+ * bpf_iter_task_file_get_fd() - Get the file descriptor corresponding to
+ * the file in the current iteration
+ *
+ * @it: the bpf_iter_task_file to be checked
+ *
+ * @returns the file descriptor
+ */
+__bpf_kfunc int bpf_iter_task_file_get_fd(struct bpf_iter_task_file *it__iter)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it__iter;
+
+	return kit->fd;
+}
+
+/**
+ * bpf_iter_task_file_destroy() - Destroy a bpf_iter_task_file
+ *
+ * If the iterator does not iterate to the end, then the last
+ * struct file reference is released at this time.
+ *
+ * @it: the bpf_iter_task_file to be destroyed
+ */
+__bpf_kfunc void bpf_iter_task_file_destroy(struct bpf_iter_task_file *it)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it;
+
+	if (kit->file)
+		fput(kit->file);
+}
+
+__bpf_kfunc_end_defs();