Message ID | 20210821025837.1614098-3-davemarchevsky@fb.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | bpf: implement variadic printk helper | expand |
On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > This helper is meant to be "bpf_trace_printk, but with proper vararg We have bpf_snprintf() and bpf_seq_printf() names for other BPF helpers using the same approach. How about we call this one simply `bpf_printf`? It will be in line with other naming, it is logical BPF equivalent of user-space printf (which outputs to stderr, which in BPF land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical to have a nice and short BPF_PRINTF() convenience macro provided by libbpf. > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > array. Write to dmesg using the same mechanism as bpf_trace_printk. Are you sure about the dmesg part?... bpf_trace_printk is outputting into /sys/kernel/debug/tracing/trace_pipe. > > Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com> > --- > include/linux/bpf.h | 1 + > include/uapi/linux/bpf.h | 23 +++++++++++++++ > kernel/bpf/core.c | 5 ++++ > kernel/bpf/helpers.c | 2 ++ > kernel/trace/bpf_trace.c | 52 +++++++++++++++++++++++++++++++++- > tools/include/uapi/linux/bpf.h | 23 +++++++++++++++ > 6 files changed, 105 insertions(+), 1 deletion(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index be8d57e6e78a..b6c45a6cbbba 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1088,6 +1088,7 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f > int bpf_prog_calc_tag(struct bpf_prog *fp); > > const struct bpf_func_proto *bpf_get_trace_printk_proto(void); > +const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); > > typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, > unsigned long off, unsigned long len); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index c4f7892edb2b..899a2649d986 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -4871,6 +4871,28 @@ union bpf_attr { > * Return > * Value specified by user at BPF link creation/attachment time > * or 0, if it was not specified. > + * > + * u64 bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) > + * Description > + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 > + * to format. Supports up to 12 arguments to print in this way. we didn't specify 12 in the description of bpf_snprintf() or bpf_seq_printf(), so why start doing that here? For data/args format, let's just refer to bpf_snprintf() or bpf_seq_printf(), whichever does a better job explaining this :) > + * The *fmt* and *fmt_size* are for the format string itself. The *data* and > + * *data_len* are format string arguments. > + * > + * Each format specifier in **fmt** corresponds to one u64 element > + * in the **data** array. For strings and pointers where pointees > + * are accessed, only the pointer values are stored in the *data* > + * array. The *data_len* is the size of *data* in bytes. > + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. > + * Reading kernel memory may fail due to either invalid address or > + * valid address but requiring a major memory fault. If reading kernel memory > + * fails, the string for **%s** will be an empty string, and the ip > + * address for **%p{i,I}{4,6}** will be 0. Not returning error to > + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. This is just a copy/paste from other helpers. Let's avoid duplication and just point people to a description in other helpers. > + * > + * Return > + * The number of bytes written to the buffer, or a negative error > + * in case of failure. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -5048,6 +5070,7 @@ union bpf_attr { > FN(timer_cancel), \ > FN(get_func_ip), \ > FN(get_attach_cookie), \ > + FN(trace_vprintk), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 91f24c7b38a1..a137c550046c 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -2357,6 +2357,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) > return NULL; > } > > +const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) > +{ > + return NULL; > +} > + > u64 __weak > bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, > void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index 5ce19b376ef7..863e5ee68558 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -1419,6 +1419,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) > return &bpf_snprintf_btf_proto; > case BPF_FUNC_snprintf: > return &bpf_snprintf_proto; > + case BPF_FUNC_trace_vprintk: > + return bpf_get_trace_vprintk_proto(); > default: > return NULL; > } > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index 2cf4bfa1ab7b..8b3f1ec9e082 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -398,7 +398,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { > .arg2_type = ARG_CONST_SIZE, > }; > > -const struct bpf_func_proto *bpf_get_trace_printk_proto(void) > +static __always_inline void __set_printk_clr_event(void) Please drop __always_inline, we only use __always_inline for absolutely performance critical routines. Let the compiler decide. > { > /* > * This program might be calling bpf_trace_printk, > @@ -410,10 +410,58 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) > */ > if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) > pr_warn_ratelimited("could not enable bpf_trace_printk events"); > +} > > +const struct bpf_func_proto *bpf_get_trace_printk_proto(void) > +{ > + __set_printk_clr_event(); > return &bpf_trace_printk_proto; > } > > +BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data, > + u32, data_len) > +{ > + static char buf[BPF_TRACE_PRINTK_SIZE]; > + unsigned long flags; > + int ret, num_args; > + u32 *bin_args; > + > + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || > + (data_len && !data)) > + return -EINVAL; > + num_args = data_len / 8; > + > + ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); > + if (ret < 0) > + return ret; > + > + raw_spin_lock_irqsave(&trace_printk_lock, flags); > + ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); > + > + trace_bpf_trace_printk(buf); > + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); > + > + bpf_bprintf_cleanup(); > + > + return ret; > +} > + > +static const struct bpf_func_proto bpf_trace_vprintk_proto = { > + .func = bpf_trace_vprintk, > + .gpl_only = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_MEM, > + .arg2_type = ARG_CONST_SIZE, > + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, > + .arg4_type = ARG_CONST_SIZE_OR_ZERO, > +}; > + > +const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) > +{ > + __set_printk_clr_event(); > + return &bpf_trace_vprintk_proto; > +} > + > BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, > const void *, data, u32, data_len) > { > @@ -1113,6 +1161,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_snprintf_proto; > case BPF_FUNC_get_func_ip: > return &bpf_get_func_ip_proto_tracing; > + case BPF_FUNC_trace_vprintk: > + return bpf_get_trace_vprintk_proto(); > default: > return bpf_base_func_proto(func_id); > } > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index c4f7892edb2b..899a2649d986 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -4871,6 +4871,28 @@ union bpf_attr { > * Return > * Value specified by user at BPF link creation/attachment time > * or 0, if it was not specified. > + * > + * u64 bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) > + * Description > + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 > + * to format. Supports up to 12 arguments to print in this way. > + * The *fmt* and *fmt_size* are for the format string itself. The *data* and > + * *data_len* are format string arguments. > + * > + * Each format specifier in **fmt** corresponds to one u64 element > + * in the **data** array. For strings and pointers where pointees > + * are accessed, only the pointer values are stored in the *data* > + * array. The *data_len* is the size of *data* in bytes. > + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. > + * Reading kernel memory may fail due to either invalid address or > + * valid address but requiring a major memory fault. If reading kernel memory > + * fails, the string for **%s** will be an empty string, and the ip > + * address for **%p{i,I}{4,6}** will be 0. Not returning error to > + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. > + * > + * Return > + * The number of bytes written to the buffer, or a negative error > + * in case of failure. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -5048,6 +5070,7 @@ union bpf_attr { > FN(timer_cancel), \ > FN(get_func_ip), \ > FN(get_attach_cookie), \ > + FN(trace_vprintk), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > -- > 2.30.2 >
On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > helpers using the same approach. How about we call this one simply > `bpf_printf`? It will be in line with other naming, it is logical BPF > equivalent of user-space printf (which outputs to stderr, which in BPF > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > to have a nice and short BPF_PRINTF() convenience macro provided by > libbpf. > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > into /sys/kernel/debug/tracing/trace_pipe. Actually I like bpf_trace_vprintk() name, since it makes it obvious that it's a flavor of bpf_trace_printk() and its quirks that users learned to deal with. I would reserve bpf_printf() for the future. We might have standalone bpf programs in the future (without user space component) and a better equivalent of stdin/stdout. clang -target bpf hello_world.c -o a.out; ./a.out should print to a terminal. Such future hello world in bpf would be using bpf_printf() or bpf_dprintf().
On Tue, Aug 24, 2021 at 10:57 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > > helpers using the same approach. How about we call this one simply > > `bpf_printf`? It will be in line with other naming, it is logical BPF > > equivalent of user-space printf (which outputs to stderr, which in BPF > > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > > to have a nice and short BPF_PRINTF() convenience macro provided by > > libbpf. > > > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > > into /sys/kernel/debug/tracing/trace_pipe. > > Actually I like bpf_trace_vprintk() name, since it makes it obvious that It's the inconsistency with bpf_snprintf() and bpf_seq_printf() that's mildly annoying (it's f at the end, and no v- prefix). Maybe bpf_trace_printf() then? Or is it too close to bpf_trace_printk()? But either way you would be using BPF_PRINTF() macro for this. And we can make that macro use bpf_trace_printk() transparently for <3 args, so that new macro works on old kernels. > it's a flavor of bpf_trace_printk() and its quirks that users learned > to deal with. > I would reserve bpf_printf() for the future. We might have standalone > bpf programs in the future (without user space component) and a better > equivalent > of stdin/stdout. clang -target bpf hello_world.c -o a.out; ./a.out > should print to a terminal. Such future hello world in bpf would be > using bpf_printf() > or bpf_dprintf().
On Tue, Aug 24, 2021 at 11:02 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Tue, Aug 24, 2021 at 10:57 AM Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: > > > > On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko > > <andrii.nakryiko@gmail.com> wrote: > > > > > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > > > > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > > > helpers using the same approach. How about we call this one simply > > > `bpf_printf`? It will be in line with other naming, it is logical BPF > > > equivalent of user-space printf (which outputs to stderr, which in BPF > > > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > > > to have a nice and short BPF_PRINTF() convenience macro provided by > > > libbpf. > > > > > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > > > > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > > > into /sys/kernel/debug/tracing/trace_pipe. > > > > Actually I like bpf_trace_vprintk() name, since it makes it obvious that > > It's the inconsistency with bpf_snprintf() and bpf_seq_printf() that's > mildly annoying (it's f at the end, and no v- prefix). Maybe > bpf_trace_printf() then? Or is it too close to bpf_trace_printk()? bpf_trace_printf could be ok, but see below. > But > either way you would be using BPF_PRINTF() macro for this. And we can > make that macro use bpf_trace_printk() transparently for <3 args, so > that new macro works on old kernels. Cannot we change the existing bpf_printk() macro to work on old and new kernels? So bpf_printk() would use bpf_trace_printf() on new and bpf_trace_printk() on old? I think bpf_trace_vprintk() looks cleaner in this context if we reuse bpf_printk() macro.
On Tue, Aug 24, 2021 at 11:17 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > On Tue, Aug 24, 2021 at 11:02 AM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Tue, Aug 24, 2021 at 10:57 AM Alexei Starovoitov > > <alexei.starovoitov@gmail.com> wrote: > > > > > > On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko > > > <andrii.nakryiko@gmail.com> wrote: > > > > > > > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > > > > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > > > > > > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > > > > helpers using the same approach. How about we call this one simply > > > > `bpf_printf`? It will be in line with other naming, it is logical BPF > > > > equivalent of user-space printf (which outputs to stderr, which in BPF > > > > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > > > > to have a nice and short BPF_PRINTF() convenience macro provided by > > > > libbpf. > > > > > > > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > > > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > > > > > > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > > > > into /sys/kernel/debug/tracing/trace_pipe. > > > > > > Actually I like bpf_trace_vprintk() name, since it makes it obvious that > > > > It's the inconsistency with bpf_snprintf() and bpf_seq_printf() that's > > mildly annoying (it's f at the end, and no v- prefix). Maybe > > bpf_trace_printf() then? Or is it too close to bpf_trace_printk()? > > bpf_trace_printf could be ok, but see below. > > > But > > either way you would be using BPF_PRINTF() macro for this. And we can > > make that macro use bpf_trace_printk() transparently for <3 args, so > > that new macro works on old kernels. > > Cannot we change the existing bpf_printk() macro to work on old and new kernels? Only if we break backwards compatibility. And I only know how to detect the presence of new helper with CO-RE, which automatically makes any BPF program using this macro CO-RE-dependent, which might not be what users want (vmlinux BTF is still not universally available). If I could do something like that without breaking change and without CO-RE, I'd update bpf_printk() to use `const char *fmt` for format string a long time ago. But adding CO-RE dependency for bpf_printk() seems like a no-go. > So bpf_printk() would use bpf_trace_printf() on new and > bpf_trace_printk() on old? > I think bpf_trace_vprintk() looks cleaner in this context if we reuse > bpf_printk() macro.
On Tue, Aug 24, 2021 at 11:24 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Tue, Aug 24, 2021 at 11:17 AM Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: > > > > On Tue, Aug 24, 2021 at 11:02 AM Andrii Nakryiko > > <andrii.nakryiko@gmail.com> wrote: > > > > > > On Tue, Aug 24, 2021 at 10:57 AM Alexei Starovoitov > > > <alexei.starovoitov@gmail.com> wrote: > > > > > > > > On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko > > > > <andrii.nakryiko@gmail.com> wrote: > > > > > > > > > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > > > > > > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > > > > > > > > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > > > > > helpers using the same approach. How about we call this one simply > > > > > `bpf_printf`? It will be in line with other naming, it is logical BPF > > > > > equivalent of user-space printf (which outputs to stderr, which in BPF > > > > > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > > > > > to have a nice and short BPF_PRINTF() convenience macro provided by > > > > > libbpf. > > > > > > > > > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > > > > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > > > > > > > > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > > > > > into /sys/kernel/debug/tracing/trace_pipe. > > > > > > > > Actually I like bpf_trace_vprintk() name, since it makes it obvious that > > > > > > It's the inconsistency with bpf_snprintf() and bpf_seq_printf() that's > > > mildly annoying (it's f at the end, and no v- prefix). Maybe > > > bpf_trace_printf() then? Or is it too close to bpf_trace_printk()? > > > > bpf_trace_printf could be ok, but see below. > > > > > But > > > either way you would be using BPF_PRINTF() macro for this. And we can > > > make that macro use bpf_trace_printk() transparently for <3 args, so > > > that new macro works on old kernels. > > > > Cannot we change the existing bpf_printk() macro to work on old and new kernels? > > Only if we break backwards compatibility. And I only know how to > detect the presence of new helper with CO-RE, which automatically > makes any BPF program using this macro CO-RE-dependent, which might > not be what users want (vmlinux BTF is still not universally > available). If I could do something like that without breaking change > and without CO-RE, I'd update bpf_printk() to use `const char *fmt` > for format string a long time ago. But adding CO-RE dependency for > bpf_printk() seems like a no-go. I see. Naming is the hardest. I think Dave's current choice of lower case bpf_vprintk() macro and bpf_trace_vprintk() helper fits the existing bpf_printk/bpf_trace_printk the best. Yes, it's inconsistent with BPF_SEQ_PRINTF/BPF_SNPRINTF, but consistent with trace_printk. Whichever way we go it will be inconsistent. Stylistically I like the lower case macro, since it doesn't scream at me.
On Tue, Aug 24, 2021 at 2:00 PM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > On Tue, Aug 24, 2021 at 11:24 AM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Tue, Aug 24, 2021 at 11:17 AM Alexei Starovoitov > > <alexei.starovoitov@gmail.com> wrote: > > > > > > On Tue, Aug 24, 2021 at 11:02 AM Andrii Nakryiko > > > <andrii.nakryiko@gmail.com> wrote: > > > > > > > > On Tue, Aug 24, 2021 at 10:57 AM Alexei Starovoitov > > > > <alexei.starovoitov@gmail.com> wrote: > > > > > > > > > > On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko > > > > > <andrii.nakryiko@gmail.com> wrote: > > > > > > > > > > > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > > > > > > > > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > > > > > > > > > > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > > > > > > helpers using the same approach. How about we call this one simply > > > > > > `bpf_printf`? It will be in line with other naming, it is logical BPF > > > > > > equivalent of user-space printf (which outputs to stderr, which in BPF > > > > > > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > > > > > > to have a nice and short BPF_PRINTF() convenience macro provided by > > > > > > libbpf. > > > > > > > > > > > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > > > > > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > > > > > > > > > > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > > > > > > into /sys/kernel/debug/tracing/trace_pipe. > > > > > > > > > > Actually I like bpf_trace_vprintk() name, since it makes it obvious that > > > > > > > > It's the inconsistency with bpf_snprintf() and bpf_seq_printf() that's > > > > mildly annoying (it's f at the end, and no v- prefix). Maybe > > > > bpf_trace_printf() then? Or is it too close to bpf_trace_printk()? > > > > > > bpf_trace_printf could be ok, but see below. > > > > > > > But > > > > either way you would be using BPF_PRINTF() macro for this. And we can > > > > make that macro use bpf_trace_printk() transparently for <3 args, so > > > > that new macro works on old kernels. > > > > > > Cannot we change the existing bpf_printk() macro to work on old and new kernels? > > > > Only if we break backwards compatibility. And I only know how to > > detect the presence of new helper with CO-RE, which automatically > > makes any BPF program using this macro CO-RE-dependent, which might > > not be what users want (vmlinux BTF is still not universally > > available). If I could do something like that without breaking change > > and without CO-RE, I'd update bpf_printk() to use `const char *fmt` > > for format string a long time ago. But adding CO-RE dependency for > > bpf_printk() seems like a no-go. > > I see. Naming is the hardest. > I think Dave's current choice of lower case bpf_vprintk() macro and > bpf_trace_vprintk() > helper fits the existing bpf_printk/bpf_trace_printk the best. > Yes, it's inconsistent with BPF_SEQ_PRINTF/BPF_SNPRINTF, > but consistent with trace_printk. Whichever way we go it will be inconsistent. > Stylistically I like the lower case macro, since it doesn't scream at me. Ok, it's fine. Even more so because we don't need a new macro, we can just extend the existing bpf_printk() macro to automatically pick bpf_trace_printk() if more than 3 arguments is provided. Dave, you'll have to solve a bit of a puzzle macro-wise, but it's possible to use either bpf_trace_printk() or bpf_trace_vprintk() transparently for the user. The only downside is that for <3 args, for backwards compatibility, we'd have to stick to char ___fmt[] = fmt; vs more efficient static const char ___fmt[] = fmt; But I'm thinking it might be time to finally make this improvement. We can also allow users to fallback to less efficient ways for really old kernels with some extra flag, like so #ifdef BPF_NO_GLOBAL_DATA char ___fmt[] = fmt; #else static const char ___fmt[] = fmt; #end Thoughts?
On Tue, Aug 24, 2021 at 2:24 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Tue, Aug 24, 2021 at 2:00 PM Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: > > > > On Tue, Aug 24, 2021 at 11:24 AM Andrii Nakryiko > > <andrii.nakryiko@gmail.com> wrote: > > > > > > On Tue, Aug 24, 2021 at 11:17 AM Alexei Starovoitov > > > <alexei.starovoitov@gmail.com> wrote: > > > > > > > > On Tue, Aug 24, 2021 at 11:02 AM Andrii Nakryiko > > > > <andrii.nakryiko@gmail.com> wrote: > > > > > > > > > > On Tue, Aug 24, 2021 at 10:57 AM Alexei Starovoitov > > > > > <alexei.starovoitov@gmail.com> wrote: > > > > > > > > > > > > On Mon, Aug 23, 2021 at 9:50 PM Andrii Nakryiko > > > > > > <andrii.nakryiko@gmail.com> wrote: > > > > > > > > > > > > > > On Fri, Aug 20, 2021 at 7:59 PM Dave Marchevsky <davemarchevsky@fb.com> wrote: > > > > > > > > > > > > > > > > This helper is meant to be "bpf_trace_printk, but with proper vararg > > > > > > > > > > > > > > We have bpf_snprintf() and bpf_seq_printf() names for other BPF > > > > > > > helpers using the same approach. How about we call this one simply > > > > > > > `bpf_printf`? It will be in line with other naming, it is logical BPF > > > > > > > equivalent of user-space printf (which outputs to stderr, which in BPF > > > > > > > land is /sys/kernel/debug/tracing/trace_pipe). And it will be logical > > > > > > > to have a nice and short BPF_PRINTF() convenience macro provided by > > > > > > > libbpf. > > > > > > > > > > > > > > > support". Follow bpf_snprintf's example and take a u64 pseudo-vararg > > > > > > > > array. Write to dmesg using the same mechanism as bpf_trace_printk. > > > > > > > > > > > > > > Are you sure about the dmesg part?... bpf_trace_printk is outputting > > > > > > > into /sys/kernel/debug/tracing/trace_pipe. > > > > > > > > > > > > Actually I like bpf_trace_vprintk() name, since it makes it obvious that > > > > > > > > > > It's the inconsistency with bpf_snprintf() and bpf_seq_printf() that's > > > > > mildly annoying (it's f at the end, and no v- prefix). Maybe > > > > > bpf_trace_printf() then? Or is it too close to bpf_trace_printk()? > > > > > > > > bpf_trace_printf could be ok, but see below. > > > > > > > > > But > > > > > either way you would be using BPF_PRINTF() macro for this. And we can > > > > > make that macro use bpf_trace_printk() transparently for <3 args, so > > > > > that new macro works on old kernels. > > > > > > > > Cannot we change the existing bpf_printk() macro to work on old and new kernels? > > > > > > Only if we break backwards compatibility. And I only know how to > > > detect the presence of new helper with CO-RE, which automatically > > > makes any BPF program using this macro CO-RE-dependent, which might > > > not be what users want (vmlinux BTF is still not universally > > > available). If I could do something like that without breaking change > > > and without CO-RE, I'd update bpf_printk() to use `const char *fmt` > > > for format string a long time ago. But adding CO-RE dependency for > > > bpf_printk() seems like a no-go. > > > > I see. Naming is the hardest. > > I think Dave's current choice of lower case bpf_vprintk() macro and > > bpf_trace_vprintk() > > helper fits the existing bpf_printk/bpf_trace_printk the best. > > Yes, it's inconsistent with BPF_SEQ_PRINTF/BPF_SNPRINTF, > > but consistent with trace_printk. Whichever way we go it will be inconsistent. > > Stylistically I like the lower case macro, since it doesn't scream at me. > > Ok, it's fine. Even more so because we don't need a new macro, we can > just extend the existing bpf_printk() macro to automatically pick > bpf_trace_printk() if more than 3 arguments is provided. > > Dave, you'll have to solve a bit of a puzzle macro-wise, but it's > possible to use either bpf_trace_printk() or bpf_trace_vprintk() > transparently for the user. > > The only downside is that for <3 args, for backwards compatibility, > we'd have to stick to > > char ___fmt[] = fmt; > > vs more efficient > > static const char ___fmt[] = fmt; > > But I'm thinking it might be time to finally make this improvement. We > can also allow users to fallback to less efficient ways for really old > kernels with some extra flag, like so > > #ifdef BPF_NO_GLOBAL_DATA > char ___fmt[] = fmt; > #else > static const char ___fmt[] = fmt; > #end > > Thoughts? +1 from me for the latter assuming macro magic is possible.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be8d57e6e78a..b6c45a6cbbba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1088,6 +1088,7 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); +const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4f7892edb2b..899a2649d986 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4871,6 +4871,28 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * u64 bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format. Supports up to 12 arguments to print in this way. + * The *fmt* and *fmt_size* are for the format string itself. The *data* and + * *data_len* are format string arguments. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5070,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(trace_vprintk), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 91f24c7b38a1..a137c550046c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2357,6 +2357,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) return NULL; } +const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) +{ + return NULL; +} + u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5ce19b376ef7..863e5ee68558 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1419,6 +1419,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_trace_vprintk: + return bpf_get_trace_vprintk_proto(); default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2cf4bfa1ab7b..8b3f1ec9e082 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -398,7 +398,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -const struct bpf_func_proto *bpf_get_trace_printk_proto(void) +static __always_inline void __set_printk_clr_event(void) { /* * This program might be calling bpf_trace_printk, @@ -410,10 +410,58 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) */ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); +} +const struct bpf_func_proto *bpf_get_trace_printk_proto(void) +{ + __set_printk_clr_event(); return &bpf_trace_printk_proto; } +BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data, + u32, data_len) +{ + static char buf[BPF_TRACE_PRINTK_SIZE]; + unsigned long flags; + int ret, num_args; + u32 *bin_args; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); + if (ret < 0) + return ret; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); + ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); + + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + + bpf_bprintf_cleanup(); + + return ret; +} + +static const struct bpf_func_proto bpf_trace_vprintk_proto = { + .func = bpf_trace_vprintk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) +{ + __set_printk_clr_event(); + return &bpf_trace_vprintk_proto; +} + BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { @@ -1113,6 +1161,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; + case BPF_FUNC_trace_vprintk: + return bpf_get_trace_vprintk_proto(); default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c4f7892edb2b..899a2649d986 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4871,6 +4871,28 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * u64 bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format. Supports up to 12 arguments to print in this way. + * The *fmt* and *fmt_size* are for the format string itself. The *data* and + * *data_len* are format string arguments. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5070,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(trace_vprintk), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper
This helper is meant to be "bpf_trace_printk, but with proper vararg support". Follow bpf_snprintf's example and take a u64 pseudo-vararg array. Write to dmesg using the same mechanism as bpf_trace_printk. Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com> --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 23 +++++++++++++++ kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 2 ++ kernel/trace/bpf_trace.c | 52 +++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 23 +++++++++++++++ 6 files changed, 105 insertions(+), 1 deletion(-)