diff mbox series

[bpf-next,v2] bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing

Message ID 20240910214037.3663272-1-yonghong.song@linux.dev (mailing list archive)
State Accepted
Commit 376bd59e2a0404b09767cc991cf5aed394cf0cf2
Delegated to: BPF
Headers show
Series [bpf-next,v2] bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
netdev/tree_selection success Clearly marked for bpf-next
netdev/apply success Patch already applied to bpf-next-0

Commit Message

Yonghong Song Sept. 10, 2024, 9:40 p.m. UTC
Salvatore Benedetto reported an issue where, when doing syscall tracepoint
tracing, the kernel stack is empty. For example, using the following
command line
  bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }'
  bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }'
the output for both commands is
===
  Kernel Stack
===

Further analysis shows that pt_regs used for bpf syscall tracepoint
tracing is from the one constructed during user->kernel transition.
The call stack looks like
  perf_syscall_enter+0x88/0x7c0
  trace_sys_enter+0x41/0x80
  syscall_trace_enter+0x100/0x160
  do_syscall_64+0x38/0xf0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e

The ip address stored in pt_regs is from user space hence no kernel
stack is printed.

To fix the issue, kernel address from pt_regs is required.
In kernel repo, there are already a few cases like this. For example,
in kernel/trace/bpf_trace.c, several perf_fetch_caller_regs(fake_regs_ptr)
instances are used to supply ip address or use ip address to construct
call stack.

Instead of allocating fake_regs on the stack, which may consume
a lot of bytes, the function perf_trace_buf_alloc() in
perf_syscall_{enter, exit}() is leveraged to create fake_regs,
which will be passed to perf_call_bpf_{enter,exit}().

For the above bpftrace script, I got the following output with this patch:
for tracepoint:syscalls:sys_enter_read
===
  Kernel Stack

        syscall_trace_enter+407
        syscall_trace_enter+407
        do_syscall_64+74
        entry_SYSCALL_64_after_hwframe+75
===
and for tracepoint:syscalls:sys_exit_read
===
Kernel Stack

        syscall_exit_work+185
        syscall_exit_work+185
        syscall_exit_to_user_mode+305
        do_syscall_64+118
        entry_SYSCALL_64_after_hwframe+75
===

Reported-by: Salvatore Benedetto <salvabenedetto@meta.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 kernel/trace/trace_syscalls.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

Comments

Andrii Nakryiko Sept. 10, 2024, 10:27 p.m. UTC | #1
On Tue, Sep 10, 2024 at 2:40 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Salvatore Benedetto reported an issue that when doing syscall tracepoint
> tracing the kernel stack is empty. For example, using the following
> command line
>   bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }'
>   bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }'
> the output for both commands is
> ===
>   Kernel Stack
> ===
>
> Further analysis shows that pt_regs used for bpf syscall tracepoint
> tracing is from the one constructed during user->kernel transition.
> The call stack looks like
>   perf_syscall_enter+0x88/0x7c0
>   trace_sys_enter+0x41/0x80
>   syscall_trace_enter+0x100/0x160
>   do_syscall_64+0x38/0xf0
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> The ip address stored in pt_regs is from user space hence no kernel
> stack is printed.
>
> To fix the issue, kernel address from pt_regs is required.
> In kernel repo, there are already a few cases like this. For example,
> in kernel/trace/bpf_trace.c, several perf_fetch_caller_regs(fake_regs_ptr)
> instances are used to supply ip address or use ip address to construct
> call stack.
>
> Instead of allocate fake_regs in the stack which may consume
> a lot of bytes, the function perf_trace_buf_alloc() in
> perf_syscall_{enter, exit}() is leveraged to create fake_regs,
> which will be passed to perf_call_bpf_{enter,exit}().
>
> For the above bpftrace script, I got the following output with this patch:
> for tracepoint:syscalls:sys_enter_read
> ===
>   Kernel Stack
>
>         syscall_trace_enter+407
>         syscall_trace_enter+407
>         do_syscall_64+74
>         entry_SYSCALL_64_after_hwframe+75
> ===
> and for tracepoint:syscalls:sys_exit_read
> ===
> Kernel Stack
>
>         syscall_exit_work+185
>         syscall_exit_work+185

this duplication is unfortunate, but we have the same with non-syscall
tracepoint, so it's not really a new "issue"

>         syscall_exit_to_user_mode+305
>         do_syscall_64+118
>         entry_SYSCALL_64_after_hwframe+75
> ===
>
> Reported-by: Salvatore Benedetto <salvabenedetto@meta.com>
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  kernel/trace/trace_syscalls.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
>

Looks great, thank you!

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 9c581d6da843..785733245ead 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
>         BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
>
>         /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
> +       perf_fetch_caller_regs(regs);
>         *(struct pt_regs **)&param = regs;
>         param.syscall_nr = rec->nr;
>         for (i = 0; i < sys_data->nb_args; i++)
> @@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  {
>         struct syscall_metadata *sys_data;
>         struct syscall_trace_enter *rec;
> +       struct pt_regs *fake_regs;
>         struct hlist_head *head;
>         unsigned long args[6];
>         bool valid_prog_array;
> @@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>         size = ALIGN(size + sizeof(u32), sizeof(u64));
>         size -= sizeof(u32);
>
> -       rec = perf_trace_buf_alloc(size, NULL, &rctx);
> +       rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
>         if (!rec)
>                 return;
>
> @@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>         memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
>
>         if ((valid_prog_array &&
> -            !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
> +            !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
>             hlist_empty(head)) {
>                 perf_swevent_put_recursion_context(rctx);
>                 return;
> @@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
>         } __aligned(8) param;
>
>         /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
> +       perf_fetch_caller_regs(regs);
>         *(struct pt_regs **)&param = regs;
>         param.syscall_nr = rec->nr;
>         param.ret = rec->ret;
> @@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>  {
>         struct syscall_metadata *sys_data;
>         struct syscall_trace_exit *rec;
> +       struct pt_regs *fake_regs;
>         struct hlist_head *head;
>         bool valid_prog_array;
>         int syscall_nr;
> @@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>         size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
>         size -= sizeof(u32);
>
> -       rec = perf_trace_buf_alloc(size, NULL, &rctx);
> +       rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
>         if (!rec)
>                 return;
>
> @@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>         rec->ret = syscall_get_return_value(current, regs);
>
>         if ((valid_prog_array &&
> -            !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
> +            !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
>             hlist_empty(head)) {
>                 perf_swevent_put_recursion_context(rctx);
>                 return;
> --
> 2.43.5
>
patchwork-bot+netdevbpf@kernel.org Sept. 11, 2024, 8:30 p.m. UTC | #2
Hello:

This patch was applied to bpf/bpf-next.git (master)
by Andrii Nakryiko <andrii@kernel.org>:

On Tue, 10 Sep 2024 14:40:37 -0700 you wrote:
> Salvatore Benedetto reported an issue that when doing syscall tracepoint
> tracing the kernel stack is empty. For example, using the following
> command line
>   bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }'
>   bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }'
> the output for both commands is
> ===
>   Kernel Stack
> ===
> 
> [...]

Here is the summary with links:
  - [bpf-next,v2] bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing
    https://git.kernel.org/bpf/bpf-next/c/376bd59e2a04

You are awesome, thank you!
diff mbox series

Patch

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c581d6da843..785733245ead 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -564,6 +564,7 @@  static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
 	BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
 
 	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+	perf_fetch_caller_regs(regs);
 	*(struct pt_regs **)&param = regs;
 	param.syscall_nr = rec->nr;
 	for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +576,7 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
+	struct pt_regs *fake_regs;
 	struct hlist_head *head;
 	unsigned long args[6];
 	bool valid_prog_array;
@@ -602,7 +604,7 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = perf_trace_buf_alloc(size, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
 	if (!rec)
 		return;
 
@@ -611,7 +613,7 @@  static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
 	if ((valid_prog_array &&
-	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
@@ -666,6 +668,7 @@  static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
 	} __aligned(8) param;
 
 	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+	perf_fetch_caller_regs(regs);
 	*(struct pt_regs **)&param = regs;
 	param.syscall_nr = rec->nr;
 	param.ret = rec->ret;
@@ -676,6 +679,7 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
+	struct pt_regs *fake_regs;
 	struct hlist_head *head;
 	bool valid_prog_array;
 	int syscall_nr;
@@ -701,7 +705,7 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = perf_trace_buf_alloc(size, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
 	if (!rec)
 		return;
 
@@ -709,7 +713,7 @@  static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->ret = syscall_get_return_value(current, regs);
 
 	if ((valid_prog_array &&
-	     !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;