[bpf-next,v1,4/9] bpf: Introduce sleepable tracepoints

Message ID 20220225234339.2386398-5-haoluo@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series Extend cgroup interface with bpf

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 17821 this patch: 17821
netdev/cc_maintainers warning 4 maintainers not CCed: rostedt@goodmis.org mingo@redhat.com john.fastabend@gmail.com netdev@vger.kernel.org
netdev/build_clang success Errors and warnings before: 3927 this patch: 3927
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 17436 this patch: 17436
netdev/checkpatch fail ERROR: Macros with complex values should be enclosed in parentheses WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next fail VM_Test

Commit Message

Hao Luo Feb. 25, 2022, 11:43 p.m. UTC
Add a new type of bpf tracepoint: sleepable tracepoints, which allow
the handler to make calls that may sleep. With sleepable tracepoints, a
set of syscall helpers (which may sleep) can also be called from the
tracepoint handlers.

In the following patches, we will whitelist some tracepoints to be
sleepable.
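
As a sketch of the intended usage (the tracepoint name below is
hypothetical; the concrete whitelisting, and the matching
DECLARE_TRACE_SLEEPABLE plumbing on the tracepoint.h side, come in the
following patches), a trace header would mark a tracepoint sleepable
with the new macro:

/* hypothetical example; the real whitelisting happens in later patches */
DECLARE_TRACE_SLEEPABLE(foo_update,
	TP_PROTO(struct inode *dir, const char *name),
	TP_ARGS(dir, name));

This expands through __DEFINE_EVENT(..., size = 0, sleep = 1) into a
bpf_raw_event_map with .sleepable = 1, which __bpf_probe_register()
checks before letting a sleepable bpf prog attach.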

Signed-off-by: Hao Luo <haoluo@google.com>
---
 include/linux/bpf.h             | 10 +++++++-
 include/linux/tracepoint-defs.h |  1 +
 include/trace/bpf_probe.h       | 22 ++++++++++++++----
 kernel/bpf/syscall.c            | 41 +++++++++++++++++++++++----------
 kernel/trace/bpf_trace.c        |  5 ++++
 5 files changed, 61 insertions(+), 18 deletions(-)

Comments

Alexei Starovoitov March 2, 2022, 7:41 p.m. UTC | #1
On Fri, Feb 25, 2022 at 03:43:34PM -0800, Hao Luo wrote:
> diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> index e7c2276be33e..c73c7ab3680e 100644
> --- a/include/linux/tracepoint-defs.h
> +++ b/include/linux/tracepoint-defs.h
> @@ -51,6 +51,7 @@ struct bpf_raw_event_map {
>  	void			*bpf_func;
>  	u32			num_args;
>  	u32			writable_size;
> +	u32			sleepable;

It increases the size for all tracepoints.
See BPF_RAW_TP in include/asm-generic/vmlinux.lds.h
Please switch writable_size and sleepable to u16.
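
That is, something like this (a sketch; the surrounding fields as in
the current header):

struct bpf_raw_event_map {
	struct tracepoint	*tp;
	void			*bpf_func;
	u32			num_args;
	u16			writable_size;	/* was u32 */
	u16			sleepable;	/* new field, packed into the same word */
} __aligned(32);
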
>  
> -static const struct bpf_func_proto *
> -syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +/* Syscall helpers that are also allowed in sleepable tracing prog. */
> +const struct bpf_func_proto *
> +tracing_prog_syscall_func_proto(enum bpf_func_id func_id,
> +				const struct bpf_prog *prog)
>  {
>  	switch (func_id) {
>  	case BPF_FUNC_sys_bpf:
>  		return &bpf_sys_bpf_proto;
> -	case BPF_FUNC_btf_find_by_name_kind:
> -		return &bpf_btf_find_by_name_kind_proto;
>  	case BPF_FUNC_sys_close:
>  		return &bpf_sys_close_proto;
> -	case BPF_FUNC_kallsyms_lookup_name:
> -		return &bpf_kallsyms_lookup_name_proto;
>  	case BPF_FUNC_mkdir:
>  		return &bpf_mkdir_proto;
>  	case BPF_FUNC_rmdir:
>  		return &bpf_rmdir_proto;
>  	case BPF_FUNC_unlink:
>  		return &bpf_unlink_proto;
> +	default:
> +		return NULL;
> +	}
> +}

If I read this correctly the goal is to disallow find_by_name_kind
and lookup_name from sleepable tps. Why? What's the harm?
Yonghong Song March 2, 2022, 9:23 p.m. UTC | #2
On 2/25/22 3:43 PM, Hao Luo wrote:
> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> the handler to make calls that may sleep. With sleepable tracepoints, a
> set of syscall helpers (which may sleep) may also be called from
> sleepable tracepoints.

There are some old discussions on sleepable tracepoints, maybe
worthwhile to take a look.

https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/

> 
> In the following patches, we will whitelist some tracepoints to be
> sleepable.
> 
> Signed-off-by: Hao Luo <haoluo@google.com>
> ---
>   include/linux/bpf.h             | 10 +++++++-
>   include/linux/tracepoint-defs.h |  1 +
>   include/trace/bpf_probe.h       | 22 ++++++++++++++----
>   kernel/bpf/syscall.c            | 41 +++++++++++++++++++++++----------
>   kernel/trace/bpf_trace.c        |  5 ++++
>   5 files changed, 61 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c36eeced3838..759ade7b24b3 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1810,6 +1810,9 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
>   struct bpf_link *bpf_link_by_id(u32 id);
>   
>   const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
> +const struct bpf_func_proto *
> +tracing_prog_syscall_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
> +
>   void bpf_task_storage_free(struct task_struct *task);
>   bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
>   const struct btf_func_model *
> @@ -1822,7 +1825,6 @@ struct bpf_core_ctx {
>   
>   int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
>   		   int relo_idx, void *insn);
> -
>   #else /* !CONFIG_BPF_SYSCALL */
>   static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>   {
> @@ -2011,6 +2013,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
>   	return NULL;
>   }
>   
> +static inline struct bpf_func_proto *
> +tracing_prog_syscall_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +	return NULL;
> +}
> +
>   static inline void bpf_task_storage_free(struct task_struct *task)
>   {
>   }
> diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> index e7c2276be33e..c73c7ab3680e 100644
> --- a/include/linux/tracepoint-defs.h
> +++ b/include/linux/tracepoint-defs.h
> @@ -51,6 +51,7 @@ struct bpf_raw_event_map {
>   	void			*bpf_func;
>   	u32			num_args;
>   	u32			writable_size;
> +	u32			sleepable;
>   } __aligned(32);
>   
>   /*
[...]
Alexei Starovoitov March 2, 2022, 9:30 p.m. UTC | #3
On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 2/25/22 3:43 PM, Hao Luo wrote:
> > Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> > the handler to make calls that may sleep. With sleepable tracepoints, a
> > set of syscall helpers (which may sleep) may also be called from
> > sleepable tracepoints.
>
> There are some old discussions on sleepable tracepoints, maybe
> worthwhile to take a look.
>
> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/

Right. It's very much related, but obsolete too.
We don't need any of that for sleepable _raw_ tps.
I prefer to stay with the "sleepable" name as well to
match the rest of the bpf sleepable code.
In all cases it's faultable.
Yonghong Song March 3, 2022, 1:08 a.m. UTC | #4
On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 2/25/22 3:43 PM, Hao Luo wrote:
>>> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
>>> the handler to make calls that may sleep. With sleepable tracepoints, a
>>> set of syscall helpers (which may sleep) may also be called from
>>> sleepable tracepoints.
>>
>> There are some old discussions on sleepable tracepoints, maybe
>> worthwhile to take a look.
>>
>> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/
> 
> Right. It's very much related, but obsolete too.
> We don't need any of that for sleepable _raw_ tps.
> I prefer to stay with "sleepable" name as well to
> match the rest of the bpf sleepable code.
> In all cases it's faultable.

sounds good to me. Agree that for the bpf use case, Hao's 
implementation should be enough.
Alexei Starovoitov March 3, 2022, 2:29 a.m. UTC | #5
On Wed, Mar 2, 2022 at 5:09 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> > On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >>
> >>
> >> On 2/25/22 3:43 PM, Hao Luo wrote:
> >>> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> >>> the handler to make calls that may sleep. With sleepable tracepoints, a
> >>> set of syscall helpers (which may sleep) may also be called from
> >>> sleepable tracepoints.
> >>
> >> There are some old discussions on sleepable tracepoints, maybe
> >> worthwhile to take a look.
> >>
> >> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/
> >
> > Right. It's very much related, but obsolete too.
> > We don't need any of that for sleepable _raw_ tps.
> > I prefer to stay with "sleepable" name as well to
> > match the rest of the bpf sleepable code.
> > In all cases it's faultable.
>
> sounds good to me. Agree that for the bpf user case, Hao's
> implementation should be enough.

Just remembered that we can also do a trivial noinline __weak
nop function and mark it sleepable on the verifier side.
That's what we were planning to do to trace map update/delete ops
in Joe Burton's series.
Then we don't need to extend the tp infra.
I'm fine whichever way. I see pros and cons in both options.
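
A minimal sketch of that alternative (the hook name is made up, not
part of this patch):

/* empty hook; exists only as an attach point for (sleepable) fentry progs */
noinline __weak void bpf_trace_cgroup_mkdir(struct cgroup *cgrp, const char *path)
{
}

The traced code calls it unconditionally, and the verifier would
whitelist it as a sleepable attach target.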
Hao Luo March 3, 2022, 7:37 p.m. UTC | #6
On Wed, Mar 2, 2022 at 11:41 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Feb 25, 2022 at 03:43:34PM -0800, Hao Luo wrote:
> > diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> > index e7c2276be33e..c73c7ab3680e 100644
> > --- a/include/linux/tracepoint-defs.h
> > +++ b/include/linux/tracepoint-defs.h
> > @@ -51,6 +51,7 @@ struct bpf_raw_event_map {
> >       void                    *bpf_func;
> >       u32                     num_args;
> >       u32                     writable_size;
> > +     u32                     sleepable;
>
> It increases the size for all tracepoints.
> See BPF_RAW_TP in include/asm-generic/vmlinux.lds.h
> Please switch writable_size and sleepable to u16.

No problem.

> >
> > -static const struct bpf_func_proto *
> > -syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> > +/* Syscall helpers that are also allowed in sleepable tracing prog. */
> > +const struct bpf_func_proto *
> > +tracing_prog_syscall_func_proto(enum bpf_func_id func_id,
> > +                             const struct bpf_prog *prog)
> >  {
> >       switch (func_id) {
> >       case BPF_FUNC_sys_bpf:
> >               return &bpf_sys_bpf_proto;
> > -     case BPF_FUNC_btf_find_by_name_kind:
> > -             return &bpf_btf_find_by_name_kind_proto;
> >       case BPF_FUNC_sys_close:
> >               return &bpf_sys_close_proto;
> > -     case BPF_FUNC_kallsyms_lookup_name:
> > -             return &bpf_kallsyms_lookup_name_proto;
> >       case BPF_FUNC_mkdir:
> >               return &bpf_mkdir_proto;
> >       case BPF_FUNC_rmdir:
> >               return &bpf_rmdir_proto;
> >       case BPF_FUNC_unlink:
> >               return &bpf_unlink_proto;
> > +     default:
> > +             return NULL;
> > +     }
> > +}
>
> If I read this correctly the goal is to disallow find_by_name_kind
> and lookup_name from sleepable tps. Why? What's the harm?

A couple of thoughts, please correct me if they don't make sense. I
may be overthinking this.

1. The very first reason is that I don't know of a use case for them
in tracing. So I think I can leave them out for now and add them later
if the maintainers want them.

2. A related question: do we actually want all syscall helpers to be
available in sleepable tracing? Some helpers may cause re-entry into
the tracepoints. For a hypothetical example: what if we call mkdir
while tracing some tracepoint in vfs_mkdir? Do we have protection
against this? Another potential problem is about lookup_name in
particular: a sleepable tracepoint could be triggered by any user, so
could lookup_name leak kernel addresses to users in some way? The
filesystem helpers have some basic permission checks, so I would think
they are relatively safer.
Hao Luo March 3, 2022, 7:43 p.m. UTC | #7
On Wed, Mar 2, 2022 at 6:29 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Mar 2, 2022 at 5:09 PM Yonghong Song <yhs@fb.com> wrote:
> >
> >
> >
> > On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> > > On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
> > >>
> > >>
> > >>
> > >> On 2/25/22 3:43 PM, Hao Luo wrote:
> > >>> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> > >>> the handler to make calls that may sleep. With sleepable tracepoints, a
> > >>> set of syscall helpers (which may sleep) may also be called from
> > >>> sleepable tracepoints.
> > >>
> > >> There are some old discussions on sleepable tracepoints, maybe
> > >> worthwhile to take a look.
> > >>
> > >> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/
> > >
> > > Right. It's very much related, but obsolete too.
> > > We don't need any of that for sleepable _raw_ tps.
> > > I prefer to stay with "sleepable" name as well to
> > > match the rest of the bpf sleepable code.
> > > In all cases it's faultable.
> >
> > sounds good to me. Agree that for the bpf user case, Hao's
> > implementation should be enough.
>
> Just remembered that we can also do trivial noinline __weak
> nop function and mark it sleepable on the verifier side.
> That's what we were planning to do to trace map update/delete ops
> in Joe Burton's series.
> Then we don't need to extend tp infra.
> I'm fine whichever way. I see pros and cons in both options.

Joe is also cc'ed on this patchset; I will sync up with him on the
status of the map tracing work.

Alexei, might we have other variants of tp in the future? We could
make the current u16 sleepable a flags field, so we can reuse it later
when we have another type of tracepoint.
Alexei Starovoitov March 3, 2022, 7:59 p.m. UTC | #8
On Thu, Mar 3, 2022 at 11:37 AM Hao Luo <haoluo@google.com> wrote:
>
> On Wed, Mar 2, 2022 at 11:41 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Feb 25, 2022 at 03:43:34PM -0800, Hao Luo wrote:
> > > diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> > > index e7c2276be33e..c73c7ab3680e 100644
> > > --- a/include/linux/tracepoint-defs.h
> > > +++ b/include/linux/tracepoint-defs.h
> > > @@ -51,6 +51,7 @@ struct bpf_raw_event_map {
> > >       void                    *bpf_func;
> > >       u32                     num_args;
> > >       u32                     writable_size;
> > > +     u32                     sleepable;
> >
> > It increases the size for all tracepoints.
> > See BPF_RAW_TP in include/asm-generic/vmlinux.lds.h
> > Please switch writable_size and sleepable to u16.
>
> No problem.
>
> > >
> > > -static const struct bpf_func_proto *
> > > -syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> > > +/* Syscall helpers that are also allowed in sleepable tracing prog. */
> > > +const struct bpf_func_proto *
> > > +tracing_prog_syscall_func_proto(enum bpf_func_id func_id,
> > > +                             const struct bpf_prog *prog)
> > >  {
> > >       switch (func_id) {
> > >       case BPF_FUNC_sys_bpf:
> > >               return &bpf_sys_bpf_proto;
> > > -     case BPF_FUNC_btf_find_by_name_kind:
> > > -             return &bpf_btf_find_by_name_kind_proto;
> > >       case BPF_FUNC_sys_close:
> > >               return &bpf_sys_close_proto;
> > > -     case BPF_FUNC_kallsyms_lookup_name:
> > > -             return &bpf_kallsyms_lookup_name_proto;
> > >       case BPF_FUNC_mkdir:
> > >               return &bpf_mkdir_proto;
> > >       case BPF_FUNC_rmdir:
> > >               return &bpf_rmdir_proto;
> > >       case BPF_FUNC_unlink:
> > >               return &bpf_unlink_proto;
> > > +     default:
> > > +             return NULL;
> > > +     }
> > > +}
> >
> > If I read this correctly the goal is to disallow find_by_name_kind
> > and lookup_name from sleepable tps. Why? What's the harm?
>
> A couple of thoughts, please correct me if they don't make sense. I
> may think too much.
>
> 1. The very first reason is, I don't know the use case of them in
> tracing. So I think I can leave them right now and add them later if
> the maintainers want them.
>
> 2. A related question is, do we actually want all syscall helpers to
> be in sleepable tracing? Some helpers may cause re-entering the
> tracepoints. For a hypothetical example, if we call mkdir while
> tracing some tracepoints in vfs_mkdir. Do we have protection for this?

If we go with the noinline __weak nop function approach, then we will
get recursion protection for free. All trampoline-powered progs have
it, both sleepable and not.
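
Roughly what the trampoline entry already does for sleepable progs (an
abridged sketch of __bpf_prog_enter_sleepable() in
kernel/bpf/trampoline.c):

u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
{
	rcu_read_lock_trace();
	migrate_disable();
	might_fault();
	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
		/* prog is already running on this cpu: skip it */
		inc_misses_counter(prog);
		return 0;
	}
	return bpf_prog_start_time();
}

A prog that re-enters its own attach point is simply not run again.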

> Another potential problem is about lookup_name in particular,
> sleepable_tracing could be triggered by any user, will lookup_name
> leak kernel addresses to users in some way? The filesystem helpers
> have some basic perm checks, I would think it's relatively safer.

The tracepoint may be triggerable by any user, but the sleepable
tp bpf prog will be loaded with cap_perfmon permissions, so it has
the right to read anything.
So I don't see any concerns with enabling lookup_name for both
syscall bpf progs and tp progs.
Alexei Starovoitov March 3, 2022, 8:02 p.m. UTC | #9
On Thu, Mar 3, 2022 at 11:43 AM Hao Luo <haoluo@google.com> wrote:
>
> On Wed, Mar 2, 2022 at 6:29 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Mar 2, 2022 at 5:09 PM Yonghong Song <yhs@fb.com> wrote:
> > >
> > >
> > >
> > > On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> > > > On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
> > > >>
> > > >>
> > > >>
> > > >> On 2/25/22 3:43 PM, Hao Luo wrote:
> > > >>> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> > > >>> the handler to make calls that may sleep. With sleepable tracepoints, a
> > > >>> set of syscall helpers (which may sleep) may also be called from
> > > >>> sleepable tracepoints.
> > > >>
> > > >> There are some old discussions on sleepable tracepoints, maybe
> > > >> worthwhile to take a look.
> > > >>
> > > >> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/
> > > >
> > > > Right. It's very much related, but obsolete too.
> > > > We don't need any of that for sleepable _raw_ tps.
> > > > I prefer to stay with "sleepable" name as well to
> > > > match the rest of the bpf sleepable code.
> > > > In all cases it's faultable.
> > >
> > > sounds good to me. Agree that for the bpf user case, Hao's
> > > implementation should be enough.
> >
> > Just remembered that we can also do trivial noinline __weak
> > nop function and mark it sleepable on the verifier side.
> > That's what we were planning to do to trace map update/delete ops
> > in Joe Burton's series.
> > Then we don't need to extend tp infra.
> > I'm fine whichever way. I see pros and cons in both options.
>
> Joe is also cc'ed in this patchset, I will sync up with him on the
> status of trace map work.
>
> Alexei, do we have potentially other variants of tp? We can make the
> current u16 sleepable a flag, so we can reuse this flag later when we
> have another type of tracepoints.

When we added the ability to attach to kernel functions and mark them
as allow_error_inject, the usefulness of tracepoints and even
writable tracepoints diminished.
If we do sleepable tracepoints, I suspect it may be the last extension
in that area.
I guess I'm convincing myself that the noinline __weak nop func
is better here. Just like it's better for Joe's map tracing.
Alexei Starovoitov March 3, 2022, 8:04 p.m. UTC | #10
On Thu, Mar 3, 2022 at 12:02 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 3, 2022 at 11:43 AM Hao Luo <haoluo@google.com> wrote:
> >
> > On Wed, Mar 2, 2022 at 6:29 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Mar 2, 2022 at 5:09 PM Yonghong Song <yhs@fb.com> wrote:
> > > >
> > > >
> > > >
> > > > On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> > > > > On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
> > > > >>
> > > > >>
> > > > >>
> > > > >> On 2/25/22 3:43 PM, Hao Luo wrote:
> > > > >>> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> > > > >>> the handler to make calls that may sleep. With sleepable tracepoints, a
> > > > >>> set of syscall helpers (which may sleep) may also be called from
> > > > >>> sleepable tracepoints.
> > > > >>
> > > > >> There are some old discussions on sleepable tracepoints, maybe
> > > > >> worthwhile to take a look.
> > > > >>
> > > > >> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/
> > > > >
> > > > > Right. It's very much related, but obsolete too.
> > > > > We don't need any of that for sleepable _raw_ tps.
> > > > > I prefer to stay with "sleepable" name as well to
> > > > > match the rest of the bpf sleepable code.
> > > > > In all cases it's faultable.
> > > >
> > > > sounds good to me. Agree that for the bpf user case, Hao's
> > > > implementation should be enough.
> > >
> > > Just remembered that we can also do trivial noinline __weak
> > > nop function and mark it sleepable on the verifier side.
> > > That's what we were planning to do to trace map update/delete ops
> > > in Joe Burton's series.
> > > Then we don't need to extend tp infra.
> > > I'm fine whichever way. I see pros and cons in both options.
> >
> > Joe is also cc'ed in this patchset, I will sync up with him on the
> > status of trace map work.
> >
> > Alexei, do we have potentially other variants of tp? We can make the
> > current u16 sleepable a flag, so we can reuse this flag later when we
> > have another type of tracepoints.
>
> When we added the ability to attach to kernel functions and mark them
> as allow_error_inject the usefulness of tracepoints and even
> writable tracepoints diminished.
> If we do sleepable tracepoint, I suspect, it may be the last extension
> in that area.
> I guess I'm convincing myself that noinline weak nop func
> is better here. Just like it's better for Joe's map tracing.

To add to the above... The only downside of the sleepable nop func
compared to a tp is the lack of a static_branch.
So this nop call will always be there.
For map tracing and for cgroup mkdir/rmdir, the few-nanosecond
overhead of calling an empty function isn't even measurable.
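
Concretely, a sketch of what the two call sites boil down to (reusing
the made-up hook name from the sketch above):

	/* tracepoint: guarded by a static key, patched out while unused */
	if (static_key_false(&__tracepoint_cgroup_mkdir.key))
		/* iterate and call the registered probes */;

	/* weak nop hook: an unconditional call, whether or not a prog is attached */
	bpf_trace_cgroup_mkdir(cgrp, path);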
Hao Luo March 3, 2022, 10:06 p.m. UTC | #11
On Thu, Mar 3, 2022 at 12:04 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 3, 2022 at 12:02 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Mar 3, 2022 at 11:43 AM Hao Luo <haoluo@google.com> wrote:
> > >
> > > On Wed, Mar 2, 2022 at 6:29 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Wed, Mar 2, 2022 at 5:09 PM Yonghong Song <yhs@fb.com> wrote:
> > > > >
> > > > >
> > > > >
> > > > > On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> > > > > > On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <yhs@fb.com> wrote:
> > > > > >>
> > > > > >>
> > > > > >>
> > > > > >> On 2/25/22 3:43 PM, Hao Luo wrote:
> > > > > >>> Add a new type of bpf tracepoints: sleepable tracepoints, which allows
> > > > > >>> the handler to make calls that may sleep. With sleepable tracepoints, a
> > > > > >>> set of syscall helpers (which may sleep) may also be called from
> > > > > >>> sleepable tracepoints.
> > > > > >>
> > > > > >> There are some old discussions on sleepable tracepoints, maybe
> > > > > >> worthwhile to take a look.
> > > > > >>
> > > > > >> https://lore.kernel.org/bpf/20210218222125.46565-5-mjeanson@efficios.com/T/
> > > > > >
> > > > > > Right. It's very much related, but obsolete too.
> > > > > > We don't need any of that for sleepable _raw_ tps.
> > > > > > I prefer to stay with "sleepable" name as well to
> > > > > > match the rest of the bpf sleepable code.
> > > > > > In all cases it's faultable.
> > > > >
> > > > > sounds good to me. Agree that for the bpf user case, Hao's
> > > > > implementation should be enough.
> > > >
> > > > Just remembered that we can also do trivial noinline __weak
> > > > nop function and mark it sleepable on the verifier side.
> > > > That's what we were planning to do to trace map update/delete ops
> > > > in Joe Burton's series.
> > > > Then we don't need to extend tp infra.
> > > > I'm fine whichever way. I see pros and cons in both options.
> > >
> > > Joe is also cc'ed in this patchset, I will sync up with him on the
> > > status of trace map work.
> > >
> > > Alexei, do we have potentially other variants of tp? We can make the
> > > current u16 sleepable a flag, so we can reuse this flag later when we
> > > have another type of tracepoints.
> >
> > When we added the ability to attach to kernel functions and mark them
> > as allow_error_inject the usefulness of tracepoints and even
> > writable tracepoints diminished.
> > If we do sleepable tracepoint, I suspect, it may be the last extension
> > in that area.
> > I guess I'm convincing myself that noinline weak nop func
> > is better here. Just like it's better for Joe's map tracing.
>
> To add to the above... The only downside of sleepable nop func
> comparing to tp is the lack of static_branch.
> So this nop call will always be there.
> For map tracing and for cgroup mkdir/rmdir the few nanosecond
> overhead of calling an empty function isn't even measurable.

The overhead should be fine, I think. mkdir/rmdir won't be frequent
operations. Thanks for the explanation. Let me give it a try and
report back how it works.
Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c36eeced3838..759ade7b24b3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1810,6 +1810,9 @@  struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+const struct bpf_func_proto *
+tracing_prog_syscall_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
+
 void bpf_task_storage_free(struct task_struct *task);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
@@ -1822,7 +1825,6 @@  struct bpf_core_ctx {
 
 int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
 		   int relo_idx, void *insn);
-
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -2011,6 +2013,12 @@  bpf_base_func_proto(enum bpf_func_id func_id)
 	return NULL;
 }
 
+static inline struct bpf_func_proto *
+tracing_prog_syscall_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return NULL;
+}
+
 static inline void bpf_task_storage_free(struct task_struct *task)
 {
 }
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index e7c2276be33e..c73c7ab3680e 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -51,6 +51,7 @@  struct bpf_raw_event_map {
 	void			*bpf_func;
 	u32			num_args;
 	u32			writable_size;
+	u32			sleepable;
 } __aligned(32);
 
 /*
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 7660a7846586..4edfc6df2f52 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -88,7 +88,7 @@  __bpf_trace_##call(void *__data, proto)					\
  * to make sure that if the tracepoint handling changes, the
  * bpf probe will fail to compile unless it too is updated.
  */
-#define __DEFINE_EVENT(template, call, proto, args, size)		\
+#define __DEFINE_EVENT(template, call, proto, args, size, sleep)	\
 static inline void bpf_test_probe_##call(void)				\
 {									\
 	check_trace_callback_type_##call(__bpf_trace_##template);	\
@@ -104,6 +104,7 @@  __section("__bpf_raw_tp_map") = {					\
 		.bpf_func	= __bpf_trace_##template,		\
 		.num_args	= COUNT_ARGS(args),			\
 		.writable_size	= size,					\
+		.sleepable	= sleep,				\
 	},								\
 };
 
@@ -123,11 +124,15 @@  static inline void bpf_test_buffer_##call(void)				\
 #undef DEFINE_EVENT_WRITABLE
 #define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \
 	__CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \
-	__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size)
+	__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size, 0)
+
+#undef DEFINE_EVENT_SLEEPABLE
+#define DEFINE_EVENT_SLEEPABLE(template, call, proto, args)	\
+	__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0, 1)
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
-	__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0)
+	__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0, 0)
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
@@ -136,19 +141,26 @@  static inline void bpf_test_buffer_##call(void)				\
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(call, proto, args)				\
 	__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args))		\
-	__DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0)
+	__DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0, 0)
 
 #undef DECLARE_TRACE_WRITABLE
 #define DECLARE_TRACE_WRITABLE(call, proto, args, size) \
 	__CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \
 	__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \
-	__DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), size)
+	__DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), size, 0)
+
+#undef DECLARE_TRACE_SLEEPABLE
+#define DECLARE_TRACE_SLEEPABLE(call, proto, args)			\
+	__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args))		\
+	__DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0, 1)
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #undef DECLARE_TRACE_WRITABLE
 #undef DEFINE_EVENT_WRITABLE
 #undef __CHECK_WRITABLE_BUF_SIZE
+#undef DECLARE_TRACE_SLEEPABLE
+#undef DEFINE_EVENT_SLEEPABLE
 #undef __DEFINE_EVENT
 #undef FIRST
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9e6d8d0c8af5..0a12f52fe8a9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4827,12 +4827,6 @@  static const struct bpf_func_proto bpf_sys_bpf_proto = {
 	.arg3_type	= ARG_CONST_SIZE,
 };
 
-const struct bpf_func_proto * __weak
-tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
-	return bpf_base_func_proto(func_id);
-}
-
 BPF_CALL_1(bpf_sys_close, u32, fd)
 {
 	/* When bpf program calls this helper there should not be
@@ -5045,24 +5039,47 @@  const struct bpf_func_proto bpf_unlink_proto = {
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
-static const struct bpf_func_proto *
-syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+/* Syscall helpers that are also allowed in sleepable tracing prog. */
+const struct bpf_func_proto *
+tracing_prog_syscall_func_proto(enum bpf_func_id func_id,
+				const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
 		return &bpf_sys_bpf_proto;
-	case BPF_FUNC_btf_find_by_name_kind:
-		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:
 		return &bpf_sys_close_proto;
-	case BPF_FUNC_kallsyms_lookup_name:
-		return &bpf_kallsyms_lookup_name_proto;
 	case BPF_FUNC_mkdir:
 		return &bpf_mkdir_proto;
 	case BPF_FUNC_rmdir:
 		return &bpf_rmdir_proto;
 	case BPF_FUNC_unlink:
 		return &bpf_unlink_proto;
+	default:
+		return NULL;
+	}
+}
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	const struct bpf_func_proto *fn;
+
+	fn = tracing_prog_syscall_func_proto(func_id, prog);
+	if (fn)
+		return fn;
+
+	return bpf_base_func_proto(func_id);
+}
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_btf_find_by_name_kind:
+		return &bpf_btf_find_by_name_kind_proto;
+	case BPF_FUNC_kallsyms_lookup_name:
+		return &bpf_kallsyms_lookup_name_proto;
 	default:
 		return tracing_prog_func_proto(func_id, prog);
 	}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a2024ba32a20..c816e0e0d4a0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1691,6 +1691,8 @@  tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		fn = raw_tp_prog_func_proto(func_id, prog);
 		if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
 			fn = bpf_iter_get_func_proto(func_id, prog);
+		if (!fn && prog->aux->sleepable)
+			fn = tracing_prog_syscall_func_proto(func_id, prog);
 		return fn;
 	}
 }
@@ -2053,6 +2055,9 @@  static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
 	if (prog->aux->max_tp_access > btp->writable_size)
 		return -EINVAL;
 
+	if (prog->aux->sleepable && !btp->sleepable)
+		return -EPERM;
+
 	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
 						   prog);
 }