diff mbox series

[bpf-next,v3,1/4] bpf/crib: Introduce task_file open-coded iterator kfuncs

Message ID AM6PR03MB5848C66D53C0204C4EE2655F99532@AM6PR03MB5848.eurprd03.prod.outlook.com (mailing list archive)
State New
Headers show
Series bpf/crib: Add open-coded style process file iterator and file related CRIB kfuncs | expand

Commit Message

Juntong Deng Nov. 6, 2024, 7:38 p.m. UTC
This patch adds the open-coded iterator style process file iterator
kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
files opened by the specified process.

In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
the file descriptor corresponding to the file in the current iteration.

The reference to struct file acquired by the previous
bpf_iter_task_file_next() is released in the next
bpf_iter_task_file_next(), and the last reference is released in the
last bpf_iter_task_file_next() that returns NULL.

In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
the end, then the last struct file reference is released at this time.

Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
---
 kernel/bpf/helpers.c   |  4 ++
 kernel/bpf/task_iter.c | 96 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

Comments

Alexei Starovoitov Nov. 6, 2024, 9:31 p.m. UTC | #1
On Wed, Nov 6, 2024 at 11:39 AM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> This patch adds the open-coded iterator style process file iterator
> kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
> files opened by the specified process.

This is ok.

> In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
> the file descriptor corresponding to the file in the current iteration.

Unnecessary. Use CORE to read iter internal fields.

> The reference to struct file acquired by the previous
> bpf_iter_task_file_next() is released in the next
> bpf_iter_task_file_next(), and the last reference is released in the
> last bpf_iter_task_file_next() that returns NULL.
>
> In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
> the end, then the last struct file reference is released at this time.
>
> Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
> ---
>  kernel/bpf/helpers.c   |  4 ++
>  kernel/bpf/task_iter.c | 96 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 100 insertions(+)
>
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 395221e53832..1f0f7ca1c47a 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -3096,6 +3096,10 @@ BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
>  BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
> +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 5af9e130e500..32e15403a5a6 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -1031,6 +1031,102 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
>  {
>  }
>
> +struct bpf_iter_task_file {
> +       __u64 __opaque[3];
> +} __aligned(8);
> +
> +struct bpf_iter_task_file_kern {
> +       struct task_struct *task;
> +       struct file *file;
> +       int fd;
> +} __aligned(8);
> +
> +/**
> + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
> + * used to iterate over all files opened by a specified task
> + *
> + * @it: the new bpf_iter_task_file to be created
> + * @task: a pointer pointing to a task to be iterated over
> + */
> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> +               struct task_struct *task)
> +{
> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> +                    __alignof__(struct bpf_iter_task_file));
> +
> +       kit->task = task;

This is broken, since task refcnt can drop while iter is running.

Before doing any of that I'd like to see a long term path for crib.
All these small additions are ok if they're generic and useful elsewhere.
I'm afraid there is no path forward for crib itself though.

pw-bot: cr
Andrii Nakryiko Nov. 6, 2024, 10:10 p.m. UTC | #2
On Wed, Nov 6, 2024 at 1:32 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Nov 6, 2024 at 11:39 AM Juntong Deng <juntong.deng@outlook.com> wrote:
> >
> > This patch adds the open-coded iterator style process file iterator
> > kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
> > files opened by the specified process.
>
> This is ok.
>
> > In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
> > the file descriptor corresponding to the file in the current iteration.
>
> Unnecessary. Use CORE to read iter internal fields.

+1, I suggested using the __ksym approach and comparing to f_op

>
> > The reference to struct file acquired by the previous
> > bpf_iter_task_file_next() is released in the next
> > bpf_iter_task_file_next(), and the last reference is released in the
> > last bpf_iter_task_file_next() that returns NULL.
> >
> > In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
> > the end, then the last struct file reference is released at this time.
> >
> > Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
> > ---
> >  kernel/bpf/helpers.c   |  4 ++
> >  kernel/bpf/task_iter.c | 96 ++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 100 insertions(+)
> >
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 395221e53832..1f0f7ca1c47a 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -3096,6 +3096,10 @@ BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
> >  BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
> >  BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
> > +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
> > +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
> > +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
> > +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
> >  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 5af9e130e500..32e15403a5a6 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -1031,6 +1031,102 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
> >  {
> >  }
> >
> > +struct bpf_iter_task_file {
> > +       __u64 __opaque[3];
> > +} __aligned(8);
> > +
> > +struct bpf_iter_task_file_kern {
> > +       struct task_struct *task;
> > +       struct file *file;
> > +       int fd;
> > +} __aligned(8);
> > +
> > +/**
> > + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
> > + * used to iterate over all files opened by a specified task
> > + *
> > + * @it: the new bpf_iter_task_file to be created
> > + * @task: a pointer pointing to a task to be iterated over
> > + */
> > +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> > +               struct task_struct *task)
> > +{
> > +       struct bpf_iter_task_file_kern *kit = (void *)it;
> > +
> > +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> > +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> > +                    __alignof__(struct bpf_iter_task_file));
> > +
> > +       kit->task = task;
>
> This is broken, since task refcnt can drop while iter is running.

I noticed this as well, but I thought that given KF_TRUSTED_ARGS we
should have a guarantee that the task survives the iteration? Am I
mistaken?

>
> Before doing any of that I'd like to see a long term path for crib.
> All these small additions are ok if they're generic and useful elsewhere.
> I'm afraid there is no path forward for crib itself though.
>
> pw-bot: cr
Alexei Starovoitov Nov. 6, 2024, 10:13 p.m. UTC | #3
On Wed, Nov 6, 2024 at 2:10 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
> > > +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> > > +               struct task_struct *task)
> > > +{
> > > +       struct bpf_iter_task_file_kern *kit = (void *)it;
> > > +
> > > +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> > > +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> > > +                    __alignof__(struct bpf_iter_task_file));
> > > +
> > > +       kit->task = task;
> >
> > This is broken, since task refcnt can drop while iter is running.
>
> I noticed this as well, but I thought that given KF_TRUSTED_ARGS we
> should have a guarantee that the task survives the iteration? Am I
> mistaken?

KF_TRUSTED_ARGS will only guarantee that the task is valid when it's
passed into this kfunc. Right after the prog can call
bpf_task_release() to release the ref and kit->task will become
dangling.
If this object was RCU protected we could have marked this iter
as KF_RCU_PROTECTED, then the verifier would make sure that
RCU unlock doesn't happen between iter_new and iter_destroy.
Andrii Nakryiko Nov. 6, 2024, 10:18 p.m. UTC | #4
On Wed, Nov 6, 2024 at 2:13 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Nov 6, 2024 at 2:10 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> > > > +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> > > > +               struct task_struct *task)
> > > > +{
> > > > +       struct bpf_iter_task_file_kern *kit = (void *)it;
> > > > +
> > > > +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> > > > +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> > > > +                    __alignof__(struct bpf_iter_task_file));
> > > > +
> > > > +       kit->task = task;
> > >
> > > This is broken, since task refcnt can drop while iter is running.
> >
> > I noticed this as well, but I thought that given KF_TRUSTED_ARGS we
> > should have a guarantee that the task survives the iteration? Am I
> > mistaken?
>
> KF_TRUSTED_ARGS will only guarantee that the task is valid when it's
> passed into this kfunc. Right after the prog can call
> bpf_task_release() to release the ref and kit->task will become
> dangling.
> If this object was RCU protected we could have marked this iter
> as KF_RCU_PROTECTED, then the verifier would make sure that
> RCU unlock doesn't happen between iter_new and iter_destroy.

I see, it makes sense. I guess we'll need tryget_task_struct() here
and just return an error if we failed to get it.
Juntong Deng Nov. 6, 2024, 10:31 p.m. UTC | #5
On 2024/11/6 21:31, Alexei Starovoitov wrote:
> On Wed, Nov 6, 2024 at 11:39 AM Juntong Deng <juntong.deng@outlook.com> wrote:
>>
>> This patch adds the open-coded iterator style process file iterator
>> kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
>> files opened by the specified process.
> 
> This is ok.
> 
>> In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
>> the file descriptor corresponding to the file in the current iteration.
> 
> Unnecessary. Use CORE to read iter internal fields.
> 
>> The reference to struct file acquired by the previous
>> bpf_iter_task_file_next() is released in the next
>> bpf_iter_task_file_next(), and the last reference is released in the
>> last bpf_iter_task_file_next() that returns NULL.
>>
>> In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
>> the end, then the last struct file reference is released at this time.
>>
>> Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
>> ---
>>   kernel/bpf/helpers.c   |  4 ++
>>   kernel/bpf/task_iter.c | 96 ++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 100 insertions(+)
>>
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index 395221e53832..1f0f7ca1c47a 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -3096,6 +3096,10 @@ BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
>>   BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
>>   BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
>> +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
>>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>> index 5af9e130e500..32e15403a5a6 100644
>> --- a/kernel/bpf/task_iter.c
>> +++ b/kernel/bpf/task_iter.c
>> @@ -1031,6 +1031,102 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
>>   {
>>   }
>>
>> +struct bpf_iter_task_file {
>> +       __u64 __opaque[3];
>> +} __aligned(8);
>> +
>> +struct bpf_iter_task_file_kern {
>> +       struct task_struct *task;
>> +       struct file *file;
>> +       int fd;
>> +} __aligned(8);
>> +
>> +/**
>> + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
>> + * used to iterate over all files opened by a specified task
>> + *
>> + * @it: the new bpf_iter_task_file to be created
>> + * @task: a pointer pointing to a task to be iterated over
>> + */
>> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
>> +               struct task_struct *task)
>> +{
>> +       struct bpf_iter_task_file_kern *kit = (void *)it;
>> +
>> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
>> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
>> +                    __alignof__(struct bpf_iter_task_file));
>> +
>> +       kit->task = task;
> 
> This is broken, since task refcnt can drop while iter is running.
> 
> Before doing any of that I'd like to see a long term path for crib.
> All these small additions are ok if they're generic and useful elsewhere.
> I'm afraid there is no path forward for crib itself though.
> 
> pw-bot: cr

Thanks for your reply.

The long-term path of CRIB is consistent with the initial goal, adding
kfuncs to help the bpf program obtain process-related information.

I think most of the CRIB kfuncs are generic, such as process file
iterator, skb iterator, bpf_fget_task() that gets struct file based on
file descriptor, etc.

This is because obtaining process-related information is not a
requirement specific to checkpoint/restore scenarios, but is
required in other scenarios as well.

Here I would like to quote your vision on LPC 2022 [0] [1].

"Starovoitov concluded his presentation by sharing his vision for the
future of BPF: replacing kernel modules as the de-facto means of
extending the kernel."

"BPF programs are safe and portable kernel modules"

[0]: https://lwn.net/Articles/909095/
[1]: 
https://lpc.events/event/16/contributions/1346/attachments/1021/1966/bpf_LPC_2022.pdf

If the future of BPF is to become a better kernel module, and BPF kfuncs
are the equivalent of a better EXPORT_SYMBOL_GPL,

then all CRIB kfuncs are useful. CRIB essentially gives bpf programs the
ability to access process information.

Giving bpf the ability to access process information is part of making
bpf a generic and better "kernel module".

Therefore I believe that CRIB is consistent with the long-term vision of
BPF, or in other words CRIB is part of the long-term vision of BPF.

Many thanks.
Juntong Deng Nov. 6, 2024, 10:46 p.m. UTC | #6
On 2024/11/6 22:13, Alexei Starovoitov wrote:
> On Wed, Nov 6, 2024 at 2:10 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
>>>> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
>>>> +               struct task_struct *task)
>>>> +{
>>>> +       struct bpf_iter_task_file_kern *kit = (void *)it;
>>>> +
>>>> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
>>>> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
>>>> +                    __alignof__(struct bpf_iter_task_file));
>>>> +
>>>> +       kit->task = task;
>>>
>>> This is broken, since task refcnt can drop while iter is running.
>>
>> I noticed this as well, but I thought that given KF_TRUSTED_ARGS we
>> should have a guarantee that the task survives the iteration? Am I
>> mistaken?
> 
> KF_TRUSTED_ARGS will only guarantee that the task is valid when it's
> passed into this kfunc. Right after the prog can call
> bpf_task_release() to release the ref and kit->task will become
> dangling.
> If this object was RCU protected we could have marked this iter
> as KF_RCU_PROTECTED, then the verifier would make sure that
> RCU unlock doesn't happen between iter_new and iter_destroy.

Thanks for pointing this out.

I will fix it in the next version of the patch series.
Alexei Starovoitov Nov. 7, 2024, 1:09 a.m. UTC | #7
On Wed, Nov 6, 2024 at 2:31 PM Juntong Deng <juntong.deng@outlook.com> wrote:
>
> On 2024/11/6 21:31, Alexei Starovoitov wrote:
> > On Wed, Nov 6, 2024 at 11:39 AM Juntong Deng <juntong.deng@outlook.com> wrote:
> >>
> >> This patch adds the open-coded iterator style process file iterator
> >> kfuncs bpf_iter_task_file_{new,next,destroy} that iterates over all
> >> files opened by the specified process.
> >
> > This is ok.
> >
> >> In addition, this patch adds bpf_iter_task_file_get_fd() getter to get
> >> the file descriptor corresponding to the file in the current iteration.
> >
> > Unnecessary. Use CORE to read iter internal fields.
> >
> >> The reference to struct file acquired by the previous
> >> bpf_iter_task_file_next() is released in the next
> >> bpf_iter_task_file_next(), and the last reference is released in the
> >> last bpf_iter_task_file_next() that returns NULL.
> >>
> >> In the bpf_iter_task_file_destroy(), if the iterator does not iterate to
> >> the end, then the last struct file reference is released at this time.
> >>
> >> Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
> >> ---
> >>   kernel/bpf/helpers.c   |  4 ++
> >>   kernel/bpf/task_iter.c | 96 ++++++++++++++++++++++++++++++++++++++++++
> >>   2 files changed, 100 insertions(+)
> >>
> >> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> >> index 395221e53832..1f0f7ca1c47a 100644
> >> --- a/kernel/bpf/helpers.c
> >> +++ b/kernel/bpf/helpers.c
> >> @@ -3096,6 +3096,10 @@ BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
> >>   BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
> >>   BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
> >>   BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
> >> +BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_adjust)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_is_null)
> >>   BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> >> index 5af9e130e500..32e15403a5a6 100644
> >> --- a/kernel/bpf/task_iter.c
> >> +++ b/kernel/bpf/task_iter.c
> >> @@ -1031,6 +1031,102 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
> >>   {
> >>   }
> >>
> >> +struct bpf_iter_task_file {
> >> +       __u64 __opaque[3];
> >> +} __aligned(8);
> >> +
> >> +struct bpf_iter_task_file_kern {
> >> +       struct task_struct *task;
> >> +       struct file *file;
> >> +       int fd;
> >> +} __aligned(8);
> >> +
> >> +/**
> >> + * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
> >> + * used to iterate over all files opened by a specified task
> >> + *
> >> + * @it: the new bpf_iter_task_file to be created
> >> + * @task: a pointer pointing to a task to be iterated over
> >> + */
> >> +__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
> >> +               struct task_struct *task)
> >> +{
> >> +       struct bpf_iter_task_file_kern *kit = (void *)it;
> >> +
> >> +       BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
> >> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
> >> +                    __alignof__(struct bpf_iter_task_file));
> >> +
> >> +       kit->task = task;
> >
> > This is broken, since task refcnt can drop while iter is running.
> >
> > Before doing any of that I'd like to see a long term path for crib.
> > All these small additions are ok if they're generic and useful elsewhere.
> > I'm afraid there is no path forward for crib itself though.
> >
> > pw-bot: cr
>
> Thanks for your reply.
>
> The long-term path of CRIB is consistent with the initial goal, adding
> kfuncs to help the bpf program obtain process-related information.
>
> I think most of the CRIB kfuncs are generic, such as process file
> iterator, skb iterator, bpf_fget_task() that gets struct file based on
> file descriptor, etc.
>
> This is because obtaining process-related information is not a
> requirement specific to checkpoint/restore scenarios, but is
> required in other scenarios as well.
>
> Here I would like to quote your vision on LPC 2022 [0] [1].

:)

The reading part via iterators and access to kernel internals is fine,
but to complete CRIB idea the restore side is necessary and
for that bit I haven't heard a complete story that would be
acceptable upstream. At LPC the proposal was to add kfuncs
that will write into kernel data structures.
That part won't fly, since I don't see how one can make such
writing kfuncs safe. Restoring a socket, tcp connection, etc
is not a trivial process.
Just building a set of generic abstractions for reading is ok-ish,
since they're generic and reusable, but the end to end story
is necessary before we proceed.
diff mbox series

Patch

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 395221e53832..1f0f7ca1c47a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3096,6 +3096,10 @@  BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_file_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_file_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_file_get_fd)
+BTF_ID_FLAGS(func, bpf_iter_task_file_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 5af9e130e500..32e15403a5a6 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -1031,6 +1031,102 @@  __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
 {
 }
 
+struct bpf_iter_task_file {
+	__u64 __opaque[3];
+} __aligned(8);
+
+struct bpf_iter_task_file_kern {
+	struct task_struct *task;
+	struct file *file;
+	int fd;
+} __aligned(8);
+
+/**
+ * bpf_iter_task_file_new() - Initialize a new task file iterator for a task,
+ * used to iterate over all files opened by a specified task
+ *
+ * @it: the new bpf_iter_task_file to be created
+ * @task: a pointer pointing to a task to be iterated over
+ */
+__bpf_kfunc int bpf_iter_task_file_new(struct bpf_iter_task_file *it,
+		struct task_struct *task)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_task_file_kern) > sizeof(struct bpf_iter_task_file));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_file_kern) !=
+		     __alignof__(struct bpf_iter_task_file));
+
+	kit->task = task;
+	kit->fd = -1;
+	kit->file = NULL;
+
+	return 0;
+}
+
+/**
+ * bpf_iter_task_file_next() - Get the next file in bpf_iter_task_file
+ *
+ * bpf_iter_task_file_next acquires a reference to the returned struct file.
+ *
+ * The reference to struct file acquired by the previous
+ * bpf_iter_task_file_next() is released in the next bpf_iter_task_file_next(),
+ * and the last reference is released in the last bpf_iter_task_file_next()
+ * that returns NULL.
+ *
+ * @it: the bpf_iter_task_file to be checked
+ *
+ * @returns a pointer to the struct file of the next file if further files
+ * are available, otherwise returns NULL
+ */
+__bpf_kfunc struct file *bpf_iter_task_file_next(struct bpf_iter_task_file *it)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it;
+
+	if (kit->file)
+		fput(kit->file);
+
+	kit->fd++;
+
+	rcu_read_lock();
+	kit->file = task_lookup_next_fdget_rcu(kit->task, &kit->fd);
+	rcu_read_unlock();
+
+	return kit->file;
+}
+
+/**
+ * bpf_iter_task_file_get_fd() - Get the file descriptor corresponding to
+ * the file in the current iteration
+ *
+ * @it__iter: the bpf_iter_task_file to be checked
+ *
+ * @returns the file descriptor. If -1 is returned, it means the iteration
+ * has not started yet.
+ */
+__bpf_kfunc int bpf_iter_task_file_get_fd(struct bpf_iter_task_file *it__iter)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it__iter;
+
+	return kit->fd;
+}
+
+/**
+ * bpf_iter_task_file_destroy() - Destroy a bpf_iter_task_file
+ *
+ * If the iterator does not iterate to the end, then the last
+ * struct file reference is released at this time.
+ *
+ * @it: the bpf_iter_task_file to be destroyed
+ */
+__bpf_kfunc void bpf_iter_task_file_destroy(struct bpf_iter_task_file *it)
+{
+	struct bpf_iter_task_file_kern *kit = (void *)it;
+
+	if (kit->file)
+		fput(kit->file);
+}
+
 __bpf_kfunc_end_defs();
 
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);