
[bpf-next,v2] bpf: Add kernel symbol for struct_ops trampoline

Message ID 20241101111948.1570547-1-xukuohai@huaweicloud.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next,v2] bpf: Add kernel symbol for struct_ops trampoline

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 204 this patch: 204
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 6 maintainers not CCed: sdf@fomichev.me haoluo@google.com kpsingh@kernel.org song@kernel.org john.fastabend@gmail.com jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 253 this patch: 253
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 6962 this patch: 6962
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 175 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-10 success Logs for x86_64-gcc / build-release

Commit Message

Xu Kuohai Nov. 1, 2024, 11:19 a.m. UTC
From: Xu Kuohai <xukuohai@huawei.com>

Without kernel symbols for struct_ops trampolines, the unwinder may
produce unexpected stacktraces.

For example, the x86 ORC and FP unwinders check if an IP is in kernel
text by verifying the presence of the IP's kernel symbol. When a
struct_ops trampoline address is encountered, the unwinder stops due
to the absence of a symbol, resulting in an incomplete stacktrace that
consists only of direct and indirect child functions called from the
trampoline.
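
A condensed sketch of the check involved (simplified from the kernel's
kernel_text_address(); the ftrace/kprobe trampoline checks are omitted):

/* Without a registered bpf_ksym covering the trampoline address,
 * is_bpf_text_address() returns false and the unwinder gives up.
 */
bool kernel_text_address(unsigned long addr)
{
        if (core_kernel_text(addr))
                return true;
        if (is_module_text_address(addr))
                return true;
        /* searches the RCU-protected tree of registered bpf_ksyms */
        return is_bpf_text_address(addr);
}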

The arm64 unwinder is another example. While the arm64 unwinder can
proceed across a struct_ops trampoline address, the corresponding
symbol name is displayed as "unknown", which is confusing.

Thus, add a kernel symbol for each struct_ops trampoline. The name is
bpf_trampoline_<PROG_NAME>, where PROG_NAME is the name of the
struct_ops prog linked to the trampoline.

Below is a comparison of stacktraces captured on x86 by perf record,
before and after this patch.

Before:

... FP chain: nr:4
.....  0: ffffffffffffff80 # PERF_CONTEXT_KERNEL mark
.....  1: ffffffff8116545d
.....  2: ffffffff81167fcc
.....  3: ffffffff813088f4
 ... thread: iperf:595
 ...... dso: /proc/kcore
iperf     595 [002]  9015.616291:     824245  cycles:
        ffffffff8116545d __lock_acquire+0xad ([kernel.kallsyms])
        ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
        ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])

After:

... FP chain: nr:44
.....  0: ffffffffffffff80 # PERF_CONTEXT_KERNEL mark
.....  1: ffffffff81165930
.....  2: ffffffff81167fcc
.....  3: ffffffff813088f4
.....  4: ffffffffc000da5e
.....  5: ffffffff81f243df
.....  6: ffffffff81f27326
.....  7: ffffffff81f3a3c3
.....  8: ffffffff81f3c99b
.....  9: ffffffff81ef9870
..... 10: ffffffff81ef9b13
..... 11: ffffffff81ef9c69
..... 12: ffffffff81ef9f47
..... 13: ffffffff81efa15d
..... 14: ffffffff81efa9c0
..... 15: ffffffff81d979eb
..... 16: ffffffff81d987e8
..... 17: ffffffff81ddce16
..... 18: ffffffff81bc7b90
..... 19: ffffffff81bcf677
..... 20: ffffffff81bd1b4f
..... 21: ffffffff81d99693
..... 22: ffffffff81d99a52
..... 23: ffffffff810c9eb2
..... 24: ffffffff810ca631
..... 25: ffffffff822367db
..... 26: ffffffff824015ef
..... 27: ffffffff811678e6
..... 28: ffffffff814f7d85
..... 29: ffffffff814f8119
..... 30: ffffffff81492fb9
..... 31: ffffffff81355c53
..... 32: ffffffff813d79d7
..... 33: ffffffff813d88fc
..... 34: ffffffff8139a52e
..... 35: ffffffff8139a661
..... 36: ffffffff8152c193
..... 37: ffffffff8152cbc5
..... 38: ffffffff814a5908
..... 39: ffffffff814a72d3
..... 40: ffffffff814a758b
..... 41: ffffffff81008869
..... 42: ffffffff822323e8
..... 43: ffffffff8240012f
 ... thread: sleep:493
 ...... dso: /proc/kcore
sleep     493 [000]    55.483168:     410428  cycles:
        ffffffff81165930 __lock_acquire+0x580 ([kernel.kallsyms])
        ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
        ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])
        ffffffffc000da5e bpf_trampoline_bpf_prog_075f577900bac1d2_bpf_cubic_acked+0x3a ([kernel.kallsyms])
        ffffffff81f243df tcp_ack+0xd4f ([kernel.kallsyms])
        ffffffff81f27326 tcp_rcv_established+0x3b6 ([kernel.kallsyms])
        ffffffff81f3a3c3 tcp_v4_do_rcv+0x193 ([kernel.kallsyms])
        ffffffff81f3c99b tcp_v4_rcv+0x11fb ([kernel.kallsyms])
        ffffffff81ef9870 ip_protocol_deliver_rcu+0x50 ([kernel.kallsyms])
        ffffffff81ef9b13 ip_local_deliver_finish+0xb3 ([kernel.kallsyms])
        ffffffff81ef9c69 ip_local_deliver+0x79 ([kernel.kallsyms])
        ffffffff81ef9f47 ip_sublist_rcv_finish+0xb7 ([kernel.kallsyms])
        ffffffff81efa15d ip_sublist_rcv+0x18d ([kernel.kallsyms])
        ffffffff81efa9c0 ip_list_rcv+0x110 ([kernel.kallsyms])
        ffffffff81d979eb __netif_receive_skb_list_core+0x21b ([kernel.kallsyms])
        ffffffff81d987e8 netif_receive_skb_list_internal+0x208 ([kernel.kallsyms])
        ffffffff81ddce16 napi_gro_receive+0xf6 ([kernel.kallsyms])
        ffffffff81bc7b90 virtnet_receive_done+0x340 ([kernel.kallsyms])
        ffffffff81bcf677 receive_buf+0xd7 ([kernel.kallsyms])
        ffffffff81bd1b4f virtnet_poll+0xcbf ([kernel.kallsyms])
        ffffffff81d99693 __napi_poll.constprop.0+0x33 ([kernel.kallsyms])
        ffffffff81d99a52 net_rx_action+0x1c2 ([kernel.kallsyms])
        ffffffff810c9eb2 handle_softirqs+0xe2 ([kernel.kallsyms])
        ffffffff810ca631 irq_exit_rcu+0x91 ([kernel.kallsyms])
        ffffffff822367db sysvec_apic_timer_interrupt+0x9b ([kernel.kallsyms])
        ffffffff824015ef asm_sysvec_apic_timer_interrupt+0x1f ([kernel.kallsyms])
        ffffffff811678e6 lock_release+0x186 ([kernel.kallsyms])
        ffffffff814f7d85 prepend_path+0x395 ([kernel.kallsyms])
        ffffffff814f8119 d_path+0x159 ([kernel.kallsyms])
        ffffffff81492fb9 file_path+0x19 ([kernel.kallsyms])
        ffffffff81355c53 perf_event_mmap+0x1e3 ([kernel.kallsyms])
        ffffffff813d79d7 mmap_region+0x2e7 ([kernel.kallsyms])
        ffffffff813d88fc do_mmap+0x4ec ([kernel.kallsyms])
        ffffffff8139a52e vm_mmap_pgoff+0xde ([kernel.kallsyms])
        ffffffff8139a661 vm_mmap+0x31 ([kernel.kallsyms])
        ffffffff8152c193 elf_load+0xa3 ([kernel.kallsyms])
        ffffffff8152cbc5 load_elf_binary+0x655 ([kernel.kallsyms])
        ffffffff814a5908 bprm_execve+0x2a8 ([kernel.kallsyms])
        ffffffff814a72d3 do_execveat_common.isra.0+0x193 ([kernel.kallsyms])
        ffffffff814a758b __x64_sys_execve+0x3b ([kernel.kallsyms])
        ffffffff81008869 x64_sys_call+0x1399 ([kernel.kallsyms])
        ffffffff822323e8 do_syscall_64+0x68 ([kernel.kallsyms])
        ffffffff8240012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])

Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
---
v2:
Refine the commit message for clarity and fix a test bot warning

v1:
https://lore.kernel.org/bpf/20241030111533.907289-1-xukuohai@huaweicloud.com/
---
 include/linux/bpf.h         |  3 +-
 kernel/bpf/bpf_struct_ops.c | 67 +++++++++++++++++++++++++++++++++++++
 kernel/bpf/dispatcher.c     |  3 +-
 kernel/bpf/trampoline.c     |  9 +++--
 4 files changed, 78 insertions(+), 4 deletions(-)

Comments

Alexei Starovoitov Nov. 1, 2024, 6:19 p.m. UTC | #1
On Fri, Nov 1, 2024 at 4:08 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>
> From: Xu Kuohai <xukuohai@huawei.com>
>
> Without kernel symbols for struct_ops trampolines, the unwinder may
> produce unexpected stacktraces.
>
> For example, the x86 ORC and FP unwinders check if an IP is in kernel
> text by verifying the presence of the IP's kernel symbol. When a
> struct_ops trampoline address is encountered, the unwinder stops due
> to the absence of a symbol, resulting in an incomplete stacktrace that
> consists only of direct and indirect child functions called from the
> trampoline.
>
> The arm64 unwinder is another example. While the arm64 unwinder can
> proceed across a struct_ops trampoline address, the corresponding
> symbol name is displayed as "unknown", which is confusing.
>
> Thus, add a kernel symbol for each struct_ops trampoline. The name is
> bpf_trampoline_<PROG_NAME>, where PROG_NAME is the name of the
> struct_ops prog linked to the trampoline.
>
> Below is a comparison of stacktraces captured on x86 by perf record,
> before and after this patch.
>
> Before:
>
> ... FP chain: nr:4
> .....  0: ffffffffffffff80 # PERF_CONTEXT_KERNEL mark
> .....  1: ffffffff8116545d
> .....  2: ffffffff81167fcc
> .....  3: ffffffff813088f4
>  ... thread: iperf:595
>  ...... dso: /proc/kcore
> iperf     595 [002]  9015.616291:     824245  cycles:
>         ffffffff8116545d __lock_acquire+0xad ([kernel.kallsyms])
>         ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
>         ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])
>
> After:
>
> ... FP chain: nr:44
> .....  0: ffffffffffffff80 # PERF_CONTEXT_KERNEL mark
> .....  1: ffffffff81165930
> .....  2: ffffffff81167fcc
> .....  3: ffffffff813088f4
> .....  4: ffffffffc000da5e
> .....  5: ffffffff81f243df
> .....  6: ffffffff81f27326
> .....  7: ffffffff81f3a3c3
> .....  8: ffffffff81f3c99b
> .....  9: ffffffff81ef9870
> ..... 10: ffffffff81ef9b13
> ..... 11: ffffffff81ef9c69
> ..... 12: ffffffff81ef9f47
> ..... 13: ffffffff81efa15d
> ..... 14: ffffffff81efa9c0
> ..... 15: ffffffff81d979eb
> ..... 16: ffffffff81d987e8
> ..... 17: ffffffff81ddce16
> ..... 18: ffffffff81bc7b90
> ..... 19: ffffffff81bcf677
> ..... 20: ffffffff81bd1b4f
> ..... 21: ffffffff81d99693
> ..... 22: ffffffff81d99a52
> ..... 23: ffffffff810c9eb2
> ..... 24: ffffffff810ca631
> ..... 25: ffffffff822367db
> ..... 26: ffffffff824015ef
> ..... 27: ffffffff811678e6
> ..... 28: ffffffff814f7d85
> ..... 29: ffffffff814f8119
> ..... 30: ffffffff81492fb9
> ..... 31: ffffffff81355c53
> ..... 32: ffffffff813d79d7
> ..... 33: ffffffff813d88fc
> ..... 34: ffffffff8139a52e
> ..... 35: ffffffff8139a661
> ..... 36: ffffffff8152c193
> ..... 37: ffffffff8152cbc5
> ..... 38: ffffffff814a5908
> ..... 39: ffffffff814a72d3
> ..... 40: ffffffff814a758b
> ..... 41: ffffffff81008869
> ..... 42: ffffffff822323e8
> ..... 43: ffffffff8240012f

The above is visual noise.
Pls remove such addr dumps from the commit log.
The below part is enough.

>  ... thread: sleep:493
>  ...... dso: /proc/kcore
> sleep     493 [000]    55.483168:     410428  cycles:
>         ffffffff81165930 __lock_acquire+0x580 ([kernel.kallsyms])
>         ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
>         ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])
>         ffffffffc000da5e bpf_trampoline_bpf_prog_075f577900bac1d2_bpf_cubic_acked+0x3a ([kernel.kallsyms])
>         ffffffff81f243df tcp_ack+0xd4f ([kernel.kallsyms])
>         ffffffff81f27326 tcp_rcv_established+0x3b6 ([kernel.kallsyms])
>         ffffffff81f3a3c3 tcp_v4_do_rcv+0x193 ([kernel.kallsyms])
>         ffffffff81f3c99b tcp_v4_rcv+0x11fb ([kernel.kallsyms])
>         ffffffff81ef9870 ip_protocol_deliver_rcu+0x50 ([kernel.kallsyms])
>         ffffffff81ef9b13 ip_local_deliver_finish+0xb3 ([kernel.kallsyms])
>         ffffffff81ef9c69 ip_local_deliver+0x79 ([kernel.kallsyms])
>         ffffffff81ef9f47 ip_sublist_rcv_finish+0xb7 ([kernel.kallsyms])
>         ffffffff81efa15d ip_sublist_rcv+0x18d ([kernel.kallsyms])
>         ffffffff81efa9c0 ip_list_rcv+0x110 ([kernel.kallsyms])
>         ffffffff81d979eb __netif_receive_skb_list_core+0x21b ([kernel.kallsyms])
>         ffffffff81d987e8 netif_receive_skb_list_internal+0x208 ([kernel.kallsyms])
>         ffffffff81ddce16 napi_gro_receive+0xf6 ([kernel.kallsyms])
>         ffffffff81bc7b90 virtnet_receive_done+0x340 ([kernel.kallsyms])
>         ffffffff81bcf677 receive_buf+0xd7 ([kernel.kallsyms])
>         ffffffff81bd1b4f virtnet_poll+0xcbf ([kernel.kallsyms])
>         ffffffff81d99693 __napi_poll.constprop.0+0x33 ([kernel.kallsyms])
>         ffffffff81d99a52 net_rx_action+0x1c2 ([kernel.kallsyms])
>         ffffffff810c9eb2 handle_softirqs+0xe2 ([kernel.kallsyms])
>         ffffffff810ca631 irq_exit_rcu+0x91 ([kernel.kallsyms])
>         ffffffff822367db sysvec_apic_timer_interrupt+0x9b ([kernel.kallsyms])
>         ffffffff824015ef asm_sysvec_apic_timer_interrupt+0x1f ([kernel.kallsyms])
>         ffffffff811678e6 lock_release+0x186 ([kernel.kallsyms])
>         ffffffff814f7d85 prepend_path+0x395 ([kernel.kallsyms])
>         ffffffff814f8119 d_path+0x159 ([kernel.kallsyms])
>         ffffffff81492fb9 file_path+0x19 ([kernel.kallsyms])
>         ffffffff81355c53 perf_event_mmap+0x1e3 ([kernel.kallsyms])
>         ffffffff813d79d7 mmap_region+0x2e7 ([kernel.kallsyms])
>         ffffffff813d88fc do_mmap+0x4ec ([kernel.kallsyms])
>         ffffffff8139a52e vm_mmap_pgoff+0xde ([kernel.kallsyms])
>         ffffffff8139a661 vm_mmap+0x31 ([kernel.kallsyms])
>         ffffffff8152c193 elf_load+0xa3 ([kernel.kallsyms])
>         ffffffff8152cbc5 load_elf_binary+0x655 ([kernel.kallsyms])
>         ffffffff814a5908 bprm_execve+0x2a8 ([kernel.kallsyms])
>         ffffffff814a72d3 do_execveat_common.isra.0+0x193 ([kernel.kallsyms])
>         ffffffff814a758b __x64_sys_execve+0x3b ([kernel.kallsyms])
>         ffffffff81008869 x64_sys_call+0x1399 ([kernel.kallsyms])
>         ffffffff822323e8 do_syscall_64+0x68 ([kernel.kallsyms])
>         ffffffff8240012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])
>
> Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> Acked-by: Yonghong Song <yonghong.song@linux.dev>
> ---
> v2:
> Refine the commit message for clarity and fix a test bot warning
>
> v1:
> https://lore.kernel.org/bpf/20241030111533.907289-1-xukuohai@huaweicloud.com/
> ---
>  include/linux/bpf.h         |  3 +-
>  kernel/bpf/bpf_struct_ops.c | 67 +++++++++++++++++++++++++++++++++++++
>  kernel/bpf/dispatcher.c     |  3 +-
>  kernel/bpf/trampoline.c     |  9 +++--
>  4 files changed, 78 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c3ba4d475174..46f8d6c1a55c 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1402,7 +1402,8 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
>  void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
>                                 struct bpf_prog *to);
>  /* Called only from JIT-enabled code, so there's no need for stubs. */
> -void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
> +void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym);
> +void bpf_image_ksym_add(struct bpf_ksym *ksym);
>  void bpf_image_ksym_del(struct bpf_ksym *ksym);
>  void bpf_ksym_add(struct bpf_ksym *ksym);
>  void bpf_ksym_del(struct bpf_ksym *ksym);
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index fda3dd2ee984..172a081ed1c3 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -38,6 +38,9 @@ struct bpf_struct_ops_map {
>          * that stores the func args before calling the bpf_prog.
>          */
>         void *image_pages[MAX_TRAMP_IMAGE_PAGES];
> +       u32 ksyms_cnt;
> +       /* ksyms for bpf trampolines */
> +       struct bpf_ksym *ksyms;
>         /* The owner moduler's btf. */
>         struct btf *btf;
>         /* uvalue->data stores the kernel struct
> @@ -586,6 +589,35 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
>         return 0;
>  }
>
> +static void bpf_struct_ops_ksym_init(struct bpf_prog *prog, void *image,
> +                                    unsigned int size, struct bpf_ksym *ksym)
> +{
> +       int n;
> +
> +       n = strscpy(ksym->name, "bpf_trampoline_", KSYM_NAME_LEN);
> +       strncat(ksym->name + n, prog->aux->ksym.name, KSYM_NAME_LEN - 1 - n);
> +       INIT_LIST_HEAD_RCU(&ksym->lnode);
> +       bpf_image_ksym_init(image, size, ksym);
> +}
> +
> +static void bpf_struct_ops_map_ksyms_add(struct bpf_struct_ops_map *st_map)
> +{
> +       struct bpf_ksym *ksym = st_map->ksyms;
> +       struct bpf_ksym *end = ksym + st_map->ksyms_cnt;
> +
> +       while (ksym != end && ksym->start)
> +               bpf_image_ksym_add(ksym++);
> +}
> +
> +static void bpf_struct_ops_map_ksyms_del(struct bpf_struct_ops_map *st_map)
> +{
> +       struct bpf_ksym *ksym = st_map->ksyms;
> +       struct bpf_ksym *end = ksym + st_map->ksyms_cnt;
> +
> +       while (ksym != end && ksym->start)
> +               bpf_image_ksym_del(ksym++);
> +}
> +
>  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>                                            void *value, u64 flags)
>  {
> @@ -601,6 +633,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>         int prog_fd, err;
>         u32 i, trampoline_start, image_off = 0;
>         void *cur_image = NULL, *image = NULL;
> +       struct bpf_ksym *ksym;
>
>         if (flags)
>                 return -EINVAL;
> @@ -640,6 +673,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>         kdata = &kvalue->data;
>
>         module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
> +       ksym = st_map->ksyms;
>         for_each_member(i, t, member) {
>                 const struct btf_type *mtype, *ptype;
>                 struct bpf_prog *prog;
> @@ -735,6 +769,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>
>                 /* put prog_id to udata */
>                 *(unsigned long *)(udata + moff) = prog->aux->id;
> +
> +               /* init ksym for this trampoline */
> +               bpf_struct_ops_ksym_init(prog, image + trampoline_start,
> +                                        image_off - trampoline_start,
> +                                        ksym++);

Thanks for the patch.
I think it's overkill to add ksym for each callsite within a single
trampoline.
1. The prog name will be next in the stack. No need to duplicate it.
2. ksym-ing callsites this way is quite unusual.
3. consider irq on other insns within a trampoline.
   The unwinder won't find anything in such a case.

So I suggest to add only one ksym that covers the whole trampoline.
The name could be "bpf_trampoline_structopsname"
that is probably st_ops_desc->type.

>         }
>
>         if (st_ops->validate) {
> @@ -790,6 +829,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>  unlock:
>         kfree(tlinks);
>         mutex_unlock(&st_map->lock);
> +       if (!err)
> +               bpf_struct_ops_map_ksyms_add(st_map);
>         return err;
>  }
>
> @@ -883,6 +924,10 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>          */
>         synchronize_rcu_mult(call_rcu, call_rcu_tasks);
>
> +       /* no trampoline in the map is running anymore, delete symbols */
> +       bpf_struct_ops_map_ksyms_del(st_map);
> +       synchronize_rcu();
> +

This is substantial overhead and why ?
synchronize_rcu_mult() is right above.

pw-bot: cr
Xu Kuohai Nov. 4, 2024, 11:55 a.m. UTC | #2
On 11/2/2024 2:19 AM, Alexei Starovoitov wrote:
> On Fri, Nov 1, 2024 at 4:08 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>>
>> From: Xu Kuohai <xukuohai@huawei.com>
>>
>> Without kernel symbols for struct_ops trampolines, the unwinder may
>> produce unexpected stacktraces.
>>
>> For example, the x86 ORC and FP unwinders check if an IP is in kernel
>> text by verifying the presence of the IP's kernel symbol. When a
>> struct_ops trampoline address is encountered, the unwinder stops due
>> to the absence of a symbol, resulting in an incomplete stacktrace that
>> consists only of direct and indirect child functions called from the
>> trampoline.
>>
>> The arm64 unwinder is another example. While the arm64 unwinder can
>> proceed across a struct_ops trampoline address, the corresponding
>> symbol name is displayed as "unknown", which is confusing.
>>
>> Thus, add a kernel symbol for each struct_ops trampoline. The name is
>> bpf_trampoline_<PROG_NAME>, where PROG_NAME is the name of the
>> struct_ops prog linked to the trampoline.
>>
>> Below is a comparison of stacktraces captured on x86 by perf record,
>> before and after this patch.
>>
>> Before:
>>
>> ... FP chain: nr:4
>> .....  0: ffffffffffffff80 # PERF_CONTEXT_KERNEL mark
>> .....  1: ffffffff8116545d
>> .....  2: ffffffff81167fcc
>> .....  3: ffffffff813088f4
>>   ... thread: iperf:595
>>   ...... dso: /proc/kcore
>> iperf     595 [002]  9015.616291:     824245  cycles:
>>          ffffffff8116545d __lock_acquire+0xad ([kernel.kallsyms])
>>          ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
>>          ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])
>>
>> After:
>>
>> ... FP chain: nr:44
>> .....  0: ffffffffffffff80 # PERF_CONTEXT_KERNEL mark
>> .....  1: ffffffff81165930
>> .....  2: ffffffff81167fcc
>> .....  3: ffffffff813088f4
>> .....  4: ffffffffc000da5e
>> .....  5: ffffffff81f243df
>> .....  6: ffffffff81f27326
>> .....  7: ffffffff81f3a3c3
>> .....  8: ffffffff81f3c99b
>> .....  9: ffffffff81ef9870
>> ..... 10: ffffffff81ef9b13
>> ..... 11: ffffffff81ef9c69
>> ..... 12: ffffffff81ef9f47
>> ..... 13: ffffffff81efa15d
>> ..... 14: ffffffff81efa9c0
>> ..... 15: ffffffff81d979eb
>> ..... 16: ffffffff81d987e8
>> ..... 17: ffffffff81ddce16
>> ..... 18: ffffffff81bc7b90
>> ..... 19: ffffffff81bcf677
>> ..... 20: ffffffff81bd1b4f
>> ..... 21: ffffffff81d99693
>> ..... 22: ffffffff81d99a52
>> ..... 23: ffffffff810c9eb2
>> ..... 24: ffffffff810ca631
>> ..... 25: ffffffff822367db
>> ..... 26: ffffffff824015ef
>> ..... 27: ffffffff811678e6
>> ..... 28: ffffffff814f7d85
>> ..... 29: ffffffff814f8119
>> ..... 30: ffffffff81492fb9
>> ..... 31: ffffffff81355c53
>> ..... 32: ffffffff813d79d7
>> ..... 33: ffffffff813d88fc
>> ..... 34: ffffffff8139a52e
>> ..... 35: ffffffff8139a661
>> ..... 36: ffffffff8152c193
>> ..... 37: ffffffff8152cbc5
>> ..... 38: ffffffff814a5908
>> ..... 39: ffffffff814a72d3
>> ..... 40: ffffffff814a758b
>> ..... 41: ffffffff81008869
>> ..... 42: ffffffff822323e8
>> ..... 43: ffffffff8240012f
> 
> The above is visual noise.
> Pls remove such addr dumps from the commit log.
> The below part is enough.
>

OK, will do.

>>   ... thread: sleep:493
>>   ...... dso: /proc/kcore
>> sleep     493 [000]    55.483168:     410428  cycles:
>>          ffffffff81165930 __lock_acquire+0x580 ([kernel.kallsyms])
>>          ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms])
>>          ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms])
>>          ffffffffc000da5e bpf_trampoline_bpf_prog_075f577900bac1d2_bpf_cubic_acked+0x3a ([kernel.kallsyms])
>>          ffffffff81f243df tcp_ack+0xd4f ([kernel.kallsyms])
>>          ffffffff81f27326 tcp_rcv_established+0x3b6 ([kernel.kallsyms])
>>          ffffffff81f3a3c3 tcp_v4_do_rcv+0x193 ([kernel.kallsyms])
>>          ffffffff81f3c99b tcp_v4_rcv+0x11fb ([kernel.kallsyms])
>>          ffffffff81ef9870 ip_protocol_deliver_rcu+0x50 ([kernel.kallsyms])
>>          ffffffff81ef9b13 ip_local_deliver_finish+0xb3 ([kernel.kallsyms])
>>          ffffffff81ef9c69 ip_local_deliver+0x79 ([kernel.kallsyms])
>>          ffffffff81ef9f47 ip_sublist_rcv_finish+0xb7 ([kernel.kallsyms])
>>          ffffffff81efa15d ip_sublist_rcv+0x18d ([kernel.kallsyms])
>>          ffffffff81efa9c0 ip_list_rcv+0x110 ([kernel.kallsyms])
>>          ffffffff81d979eb __netif_receive_skb_list_core+0x21b ([kernel.kallsyms])
>>          ffffffff81d987e8 netif_receive_skb_list_internal+0x208 ([kernel.kallsyms])
>>          ffffffff81ddce16 napi_gro_receive+0xf6 ([kernel.kallsyms])
>>          ffffffff81bc7b90 virtnet_receive_done+0x340 ([kernel.kallsyms])
>>          ffffffff81bcf677 receive_buf+0xd7 ([kernel.kallsyms])
>>          ffffffff81bd1b4f virtnet_poll+0xcbf ([kernel.kallsyms])
>>          ffffffff81d99693 __napi_poll.constprop.0+0x33 ([kernel.kallsyms])
>>          ffffffff81d99a52 net_rx_action+0x1c2 ([kernel.kallsyms])
>>          ffffffff810c9eb2 handle_softirqs+0xe2 ([kernel.kallsyms])
>>          ffffffff810ca631 irq_exit_rcu+0x91 ([kernel.kallsyms])
>>          ffffffff822367db sysvec_apic_timer_interrupt+0x9b ([kernel.kallsyms])
>>          ffffffff824015ef asm_sysvec_apic_timer_interrupt+0x1f ([kernel.kallsyms])
>>          ffffffff811678e6 lock_release+0x186 ([kernel.kallsyms])
>>          ffffffff814f7d85 prepend_path+0x395 ([kernel.kallsyms])
>>          ffffffff814f8119 d_path+0x159 ([kernel.kallsyms])
>>          ffffffff81492fb9 file_path+0x19 ([kernel.kallsyms])
>>          ffffffff81355c53 perf_event_mmap+0x1e3 ([kernel.kallsyms])
>>          ffffffff813d79d7 mmap_region+0x2e7 ([kernel.kallsyms])
>>          ffffffff813d88fc do_mmap+0x4ec ([kernel.kallsyms])
>>          ffffffff8139a52e vm_mmap_pgoff+0xde ([kernel.kallsyms])
>>          ffffffff8139a661 vm_mmap+0x31 ([kernel.kallsyms])
>>          ffffffff8152c193 elf_load+0xa3 ([kernel.kallsyms])
>>          ffffffff8152cbc5 load_elf_binary+0x655 ([kernel.kallsyms])
>>          ffffffff814a5908 bprm_execve+0x2a8 ([kernel.kallsyms])
>>          ffffffff814a72d3 do_execveat_common.isra.0+0x193 ([kernel.kallsyms])
>>          ffffffff814a758b __x64_sys_execve+0x3b ([kernel.kallsyms])
>>          ffffffff81008869 x64_sys_call+0x1399 ([kernel.kallsyms])
>>          ffffffff822323e8 do_syscall_64+0x68 ([kernel.kallsyms])
>>          ffffffff8240012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms])
>>
>> Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS")
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> Acked-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>> v2:
>> Refine the commit message for clarity and fix a test bot warning
>>
>> v1:
>> https://lore.kernel.org/bpf/20241030111533.907289-1-xukuohai@huaweicloud.com/
>> ---
>>   include/linux/bpf.h         |  3 +-
>>   kernel/bpf/bpf_struct_ops.c | 67 +++++++++++++++++++++++++++++++++++++
>>   kernel/bpf/dispatcher.c     |  3 +-
>>   kernel/bpf/trampoline.c     |  9 +++--
>>   4 files changed, 78 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index c3ba4d475174..46f8d6c1a55c 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1402,7 +1402,8 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
>>   void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
>>                                  struct bpf_prog *to);
>>   /* Called only from JIT-enabled code, so there's no need for stubs. */
>> -void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
>> +void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym);
>> +void bpf_image_ksym_add(struct bpf_ksym *ksym);
>>   void bpf_image_ksym_del(struct bpf_ksym *ksym);
>>   void bpf_ksym_add(struct bpf_ksym *ksym);
>>   void bpf_ksym_del(struct bpf_ksym *ksym);
>> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
>> index fda3dd2ee984..172a081ed1c3 100644
>> --- a/kernel/bpf/bpf_struct_ops.c
>> +++ b/kernel/bpf/bpf_struct_ops.c
>> @@ -38,6 +38,9 @@ struct bpf_struct_ops_map {
>>           * that stores the func args before calling the bpf_prog.
>>           */
>>          void *image_pages[MAX_TRAMP_IMAGE_PAGES];
>> +       u32 ksyms_cnt;
>> +       /* ksyms for bpf trampolines */
>> +       struct bpf_ksym *ksyms;
>>          /* The owner moduler's btf. */
>>          struct btf *btf;
>>          /* uvalue->data stores the kernel struct
>> @@ -586,6 +589,35 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
>>          return 0;
>>   }
>>
>> +static void bpf_struct_ops_ksym_init(struct bpf_prog *prog, void *image,
>> +                                    unsigned int size, struct bpf_ksym *ksym)
>> +{
>> +       int n;
>> +
>> +       n = strscpy(ksym->name, "bpf_trampoline_", KSYM_NAME_LEN);
>> +       strncat(ksym->name + n, prog->aux->ksym.name, KSYM_NAME_LEN - 1 - n);
>> +       INIT_LIST_HEAD_RCU(&ksym->lnode);
>> +       bpf_image_ksym_init(image, size, ksym);
>> +}
>> +
>> +static void bpf_struct_ops_map_ksyms_add(struct bpf_struct_ops_map *st_map)
>> +{
>> +       struct bpf_ksym *ksym = st_map->ksyms;
>> +       struct bpf_ksym *end = ksym + st_map->ksyms_cnt;
>> +
>> +       while (ksym != end && ksym->start)
>> +               bpf_image_ksym_add(ksym++);
>> +}
>> +
>> +static void bpf_struct_ops_map_ksyms_del(struct bpf_struct_ops_map *st_map)
>> +{
>> +       struct bpf_ksym *ksym = st_map->ksyms;
>> +       struct bpf_ksym *end = ksym + st_map->ksyms_cnt;
>> +
>> +       while (ksym != end && ksym->start)
>> +               bpf_image_ksym_del(ksym++);
>> +}
>> +
>>   static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>                                             void *value, u64 flags)
>>   {
>> @@ -601,6 +633,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>          int prog_fd, err;
>>          u32 i, trampoline_start, image_off = 0;
>>          void *cur_image = NULL, *image = NULL;
>> +       struct bpf_ksym *ksym;
>>
>>          if (flags)
>>                  return -EINVAL;
>> @@ -640,6 +673,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>          kdata = &kvalue->data;
>>
>>          module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
>> +       ksym = st_map->ksyms;
>>          for_each_member(i, t, member) {
>>                  const struct btf_type *mtype, *ptype;
>>                  struct bpf_prog *prog;
>> @@ -735,6 +769,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>
>>                  /* put prog_id to udata */
>>                  *(unsigned long *)(udata + moff) = prog->aux->id;
>> +
>> +               /* init ksym for this trampoline */
>> +               bpf_struct_ops_ksym_init(prog, image + trampoline_start,
>> +                                        image_off - trampoline_start,
>> +                                        ksym++);
> 
> Thanks for the patch.
> I think it's overkill to add ksym for each callsite within a single
> trampoline.
> 1. The prog name will be next in the stack. No need to duplicate it.
> 2. ksym-ing callsites this way is quite unusual.
> 3. consider irq on other insns within a trampoline.
>     The unwinder won't find anything in such a case.
> 
> So I suggest to add only one ksym that covers the whole trampoline.
> The name could be "bpf_trampoline_structopsname"
> that is probably st_ops_desc->type.
>

IIUC, the "whole trampoline" for a struct_ops is actually the page
array st_map->image_pages[MAX_TRAMP_IMAGE_PAGES], where each page is
allocated by arch_alloc_bpf_trampoline(PAGE_SIZE).

Since the virtual addresses of these pages are *NOT* guaranteed to
be contiguous, I don't think we can create a single ksym for them.

And if we add a ksym for each individual page, it seems we will end
up with an odd name for each ksym.

Given that each page consists of one or more bpf trampolines, which
are not different from bpf trampolines for other prog types, such as
bpf trampolines for fentry, and since each bpf trampoline for other
prog types already has a ksym, I think it is not unusual to add ksym
for each single bpf trampoline in the page.

And, there are no instructions between adjacent bpf trampolines within
a page, so nothing between two trampolines can be interrupted.

For the name, bpf_trampoline_<struct_ops_name>_<member_name>, like
bpf_trampoline_tcp_congestion_ops_pkts_acked, seems appropriate.

>>          }
>>
>>          if (st_ops->validate) {
>> @@ -790,6 +829,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>   unlock:
>>          kfree(tlinks);
>>          mutex_unlock(&st_map->lock);
>> +       if (!err)
>> +               bpf_struct_ops_map_ksyms_add(st_map);
>>          return err;
>>   }
>>
>> @@ -883,6 +924,10 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>>           */
>>          synchronize_rcu_mult(call_rcu, call_rcu_tasks);
>>
>> +       /* no trampoline in the map is running anymore, delete symbols */
>> +       bpf_struct_ops_map_ksyms_del(st_map);
>> +       synchronize_rcu();
>> +
> 
> This is substantial overhead and why ?
> synchronize_rcu_mult() is right above.
>

I think we should ensure no trampoline is running or could run before
its ksym is deleted from the symbol table. If this order is not ensured,
a trampoline can be interrupted by a perf irq after its symbol is deleted,
resulting in a broken stacktrace since the trampoline symbol could not be
found by the perf irq handler.

This patch deletes ksyms after synchronize_rcu_mult() to ensure this order.
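
To make the window concrete, an illustrative timeline (the function names
are from this patch; the interleaving is hypothetical):

/*
 * CPU 0: bpf_struct_ops_map_free()       CPU 1
 * --------------------------------       -----
 *                                        enter trampoline
 * bpf_struct_ops_map_ksyms_del()
 * (too early)                            perf NMI: the unwinder cannot
 *                                        resolve the trampoline IP, so
 *                                        the stacktrace is truncated
 *
 * Deleting the ksyms only after synchronize_rcu_mult(call_rcu,
 * call_rcu_tasks) guarantees no trampoline is still executing when its
 * symbol disappears.
 */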

> pw-bot: cr
Alexei Starovoitov Nov. 4, 2024, 5:53 p.m. UTC | #3
On Mon, Nov 4, 2024 at 3:55 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>
> >>                  *(unsigned long *)(udata + moff) = prog->aux->id;
> >> +
> >> +               /* init ksym for this trampoline */
> >> +               bpf_struct_ops_ksym_init(prog, image + trampoline_start,
> >> +                                        image_off - trampoline_start,
> >> +                                        ksym++);
> >
> > Thanks for the patch.
> > I think it's overkill to add ksym for each callsite within a single
> > trampoline.
> > 1. The prog name will be next in the stack. No need to duplicate it.
> > 2. ksym-ing callsites this way is quite unusual.
> > 3. consider irq on other insns within a trampoline.
> >     The unwinder won't find anything in such a case.
> >
> > So I suggest to add only one ksym that covers the whole trampoline.
> > The name could be "bpf_trampoline_structopsname"
> > that is probably st_ops_desc->type.
> >
>
> IIUC, the "whole trampoline" for a struct_ops is actually the page
> array st_map->image_pages[MAX_TRAMP_IMAGE_PAGES], where each page is
> allocated by arch_alloc_bpf_trampoline(PAGE_SIZE).
>
> Since the virtual addresses of these pages are *NOT* guaranteed to
> be contiguous, I don't think we can create a single ksym for them.
>
> And if we add a ksym for each individual page, it seems we will end
> up with an odd name for each ksym.

I see. Good point. Ok. Let's add ksym for each callback.

> Given that each page consists of one or more bpf trampolines, which
> are not different from bpf trampolines for other prog types, such as
> bpf trampolines for fentry, and since each bpf trampoline for other
> prog types already has a ksym, I think it is not unusual to add ksym
> for each single bpf trampoline in the page.
>
> And, there are no instructions between adjacent bpf trampolines within
> a page, so nothing between two trampolines can be interrupted.
>
> For the name, bpf_trampoline_<struct_ops_name>_<member_name>, like
> bpf_trampoline_tcp_congestion_ops_pkts_acked, seems appropriate.

Agree. This naming convention makes sense.
I'd only shorten the prefix to 'bpf_tramp_' or even 'bpf__'
(with double underscore).
It's kinda obvious that it's a trampoline and it's an implementation
detail that doesn't need to be present in the name.
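
For the tcp_congestion_ops example above, the registered symbols would then
read roughly like this (an illustrative list, not taken from a real system):

/* bpf__tcp_congestion_ops_ssthresh
 * bpf__tcp_congestion_ops_cong_avoid
 * bpf__tcp_congestion_ops_pkts_acked
 */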

>
> >>          }
> >>
> >>          if (st_ops->validate) {
> >> @@ -790,6 +829,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
> >>   unlock:
> >>          kfree(tlinks);
> >>          mutex_unlock(&st_map->lock);
> >> +       if (!err)
> >> +               bpf_struct_ops_map_ksyms_add(st_map);
> >>          return err;
> >>   }
> >>
> >> @@ -883,6 +924,10 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
> >>           */
> >>          synchronize_rcu_mult(call_rcu, call_rcu_tasks);
> >>
> >> +       /* no trampoline in the map is running anymore, delete symbols */
> >> +       bpf_struct_ops_map_ksyms_del(st_map);
> >> +       synchronize_rcu();
> >> +
> >
> > This is substantial overhead and why ?
> > synchronize_rcu_mult() is right above.
> >
>
> I think we should ensure no trampoline is running or could run before
> its ksym is deleted from the symbol table. If this order is not ensured,
> a trampoline can be interrupted by a perf irq after its symbol is deleted,
> resulting in a broken stacktrace since the trampoline symbol could not be
> found by the perf irq handler.
>
> This patch deletes ksyms after synchronize_rcu_mult() to ensure this order.

But the overhead is prohibitive. We had broken stacks with st_ops
for a long time, so it may still hit 0.001% where st_ops are being switched
as the comment in bpf_struct_ops_map_free() explains.

As a separate clean up I would switch the freeing to call_rcu_tasks.
Synchronous waiting is expensive.

Martin,

any suggestions?
Martin KaFai Lau Nov. 4, 2024, 10:13 p.m. UTC | #4
On 11/4/24 9:53 AM, Alexei Starovoitov wrote:

> As a separate clean up I would switch the freeing to call_rcu_tasks.
> Synchronous waiting is expensive.
> 
> Martin,
> 
> any suggestions?

There is a map->rcu now. Maybe add a "bool free_after_rcu_tasks_gp" to "struct
bpf_map" and do the call_rcu_tasks() in bpf_map_put(). The 
bpf_struct_ops_map_alloc() can set the map->free_after_rcu_tasks_gp.

Take this chance to remove the "st_map->rcu" from "struct bpf_struct_ops_map" 
also. It is a leftover after cleaning up the kvalue->refcnt in the
commit b671c2067a04 ("bpf: Retire the struct_ops map kvalue->refcnt.").

Xu, it will be great if you can follow up with this cleanup. Otherwise, I will 
put it at the top of my todo list. Let me know what you prefer.
Alexei Starovoitov Nov. 4, 2024, 10:32 p.m. UTC | #5
On Mon, Nov 4, 2024 at 2:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/4/24 9:53 AM, Alexei Starovoitov wrote:
>
> > As a separate clean up I would switch the freeing to call_rcu_tasks.
> > Synchronous waiting is expensive.
> >
> > Martin,
> >
> > any suggestions?
>
> There is a map->rcu now. Maybe add a "bool free_after_rcu_tasks_gp" to "struct
> bpf_map" and do the call_rcu_tasks() in bpf_map_put(). The
> bpf_struct_ops_map_alloc() can set the map->free_after_rcu_tasks_gp.

Ohh. Great point.
struct_ops map can just set the existing free_after_mult_rcu_gp flag
and get rid of this sync call.
Another flag is overkill imo.
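
A minimal sketch of that direction (the bpf_map_put() branch is paraphrased
from kernel/bpf/syscall.c; the struct_ops line itself is hypothetical):

/* hypothetical: in bpf_struct_ops_map_alloc(), once the map is set up */
st_map->map.free_after_mult_rcu_gp = true;

/* bpf_map_put() then defers the free through the multi-GP path instead
 * of blocking in ->map_free():
 */
if (READ_ONCE(map->free_after_mult_rcu_gp))
        call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
else
        bpf_map_free_in_work(map);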

> Take this chance to remove the "st_map->rcu" from "struct bpf_struct_ops_map"
> also. It is a left over after cleaning up the kvalue->refcnt in the
> commit b671c2067a04 ("bpf: Retire the struct_ops map kvalue->refcnt.").

+1

> Xu, it will be great if you can follow up with this cleanup. Otherwise, I will
> put it under the top of my todo list. Let me know what you prefer.
Martin KaFai Lau Nov. 5, 2024, 12:10 a.m. UTC | #6
On 11/1/24 4:19 AM, Xu Kuohai wrote:
>   static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   					   void *value, u64 flags)
>   {
> @@ -601,6 +633,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	int prog_fd, err;
>   	u32 i, trampoline_start, image_off = 0;
>   	void *cur_image = NULL, *image = NULL;
> +	struct bpf_ksym *ksym;
>   
>   	if (flags)
>   		return -EINVAL;
> @@ -640,6 +673,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   	kdata = &kvalue->data;
>   
>   	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
> +	ksym = st_map->ksyms;
>   	for_each_member(i, t, member) {
>   		const struct btf_type *mtype, *ptype;
>   		struct bpf_prog *prog;
> @@ -735,6 +769,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   
>   		/* put prog_id to udata */
>   		*(unsigned long *)(udata + moff) = prog->aux->id;
> +
> +		/* init ksym for this trampoline */
> +		bpf_struct_ops_ksym_init(prog, image + trampoline_start,
> +					 image_off - trampoline_start,
> +					 ksym++);
>   	}
>   
>   	if (st_ops->validate) {
> @@ -790,6 +829,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>   unlock:
>   	kfree(tlinks);
>   	mutex_unlock(&st_map->lock);
> +	if (!err)
> +		bpf_struct_ops_map_ksyms_add(st_map);
>   	return err;
>   }
>   

>   static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   {
>   	const struct bpf_struct_ops_desc *st_ops_desc;
> @@ -905,6 +963,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   	struct bpf_map *map;
>   	struct btf *btf;
>   	int ret;
> +	size_t ksyms_offset;
> +	u32 ksyms_cnt;
>   
>   	if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
>   		/* The map holds btf for its whole life time. */
> @@ -951,6 +1011,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   		 */
>   		(vt->size - sizeof(struct bpf_struct_ops_value));
>   
> +	st_map_size = round_up(st_map_size, sizeof(struct bpf_ksym));
> +	ksyms_offset = st_map_size;
> +	ksyms_cnt = count_func_ptrs(btf, t);
> +	st_map_size += ksyms_cnt * sizeof(struct bpf_ksym);
> +
>   	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
>   	if (!st_map) {
>   		ret = -ENOMEM;
> @@ -958,6 +1023,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>   	}
>   
>   	st_map->st_ops_desc = st_ops_desc;
> +	st_map->ksyms = (void *)st_map + ksyms_offset;

nit. The st_map->ksyms is very similar to the existing st_map->links. Can we do 
the allocation similar to the st_map->links and use another bpf_map_area_alloc() 
instead of doing the round_up() and then figuring out the ksyms_offset.
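
For example, a hypothetical hunk mirroring the links allocation:

        st_map->ksyms = bpf_map_area_alloc(ksyms_cnt * sizeof(struct bpf_ksym),
                                           NUMA_NO_NODE);
        if (!st_map->ksyms) {
                ret = -ENOMEM;
                goto errout_free; /* error label as in the existing paths */
        }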

> +	st_map->ksyms_cnt = ksyms_cnt;

The same goes for ksyms_cnt. ksyms_cnt is almost the same as the 
st_map->links_cnt. st_map->links_cnt unnecessarily includes the non-func-ptr 
members (i.e. a waste). The st_map->links[i] must be NULL if the i-th member of a struct 
is not a func ptr.

If this patch adds the count_func_ptrs(), I think at least just have one 
variable to mean funcs_cnt instead of adding another new ksyms_cnt. Both the 
existing st_map->links and the new st_map->ksyms can use the same funcs_cnt. An 
adjustment is needed for link in update_elem (probably use link++ similar to 
your ksym++ idea). bpf_struct_ops_map_put_progs() should work as is.
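
A possible shape for that helper (a sketch; for_each_member() and
btf_type_resolve_func_ptr() are the BTF helpers bpf_struct_ops.c already uses):

static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
{
        u32 i, cnt = 0;
        const struct btf_member *member;

        /* count only members whose type resolves to a pointer to func proto */
        for_each_member(i, t, member)
                if (btf_type_resolve_func_ptr(btf, member->type, NULL))
                        cnt++;
        return cnt;
}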

Also, the actual bpf_link is currently allocated during update_elem() only when 
there is a bpf prog for an ops. The new st_map->ksyms pre-allocated everything 
during map_alloc() regardless if there will be a bpf prog (e.g. 
tcp_congestion_ops has 5 optional ops). I don't have a strong opinion on 
pre-allocate everything in map_alloc() or allocate on-demand in update_elem(). 
However, considering bpf_ksym has a "char name[KSYM_NAME_LEN]", the on-demand 
allocation on bpf_link becomes not very useful. If the next respin stays with 
the pre-allocate everything way, it is useful to followup later to stay with one 
way and do the same for bpf_link.
Xu Kuohai Nov. 5, 2024, 9:29 a.m. UTC | #7
On 11/5/2024 1:53 AM, Alexei Starovoitov wrote:
> On Mon, Nov 4, 2024 at 3:55 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>>
>>>>                   *(unsigned long *)(udata + moff) = prog->aux->id;
>>>> +
>>>> +               /* init ksym for this trampoline */
>>>> +               bpf_struct_ops_ksym_init(prog, image + trampoline_start,
>>>> +                                        image_off - trampoline_start,
>>>> +                                        ksym++);
>>>
>>> Thanks for the patch.
>>> I think it's overkill to add ksym for each callsite within a single
>>> trampoline.
>>> 1. The prog name will be next in the stack. No need to duplicate it.
>>> 2. ksym-ing callsites this way is quite unusual.
>>> 3. consider irq on other insns within a trampoline.
>>>      The unwinder won't find anything in such a case.
>>>
>>> So I suggest to add only one ksym that covers the whole trampoline.
>>> The name could be "bpf_trampoline_structopsname"
>>> that is probably st_ops_desc->type.
>>>
>>
>> IIUC, the "whole trampoline" for a struct_ops is actually the page
>> array st_map->image_pages[MAX_TRAMP_IMAGE_PAGES], where each page is
>> allocated by arch_alloc_bpf_trampoline(PAGE_SIZE).
>>
>> Since the virtual addresses of these pages are *NOT* guaranteed to
>> be contiguous, I don't think we can create a single ksym for them.
>>
>> And if we add a ksym for each individual page, it seems we will end
>> up with an odd name for each ksym.
> 
> I see. Good point. Ok. Let's add ksym for each callback.
> 
>> Given that each page consists of one or more bpf trampolines, which
>> are not different from bpf trampolines for other prog types, such as
>> bpf trampolines for fentry, and since each bpf trampoline for other
>> prog types already has a ksym, I think it is not unusual to add ksym
>> for each single bpf trampoline in the page.
>>
>> And, there are no instructions between adjacent bpf trampolines within
>> a page, so nothing between two trampolines can be interrupted.
>>
>> For the name, bpf_trampoline_<struct_ops_name>_<member_name>, like
>> bpf_trampoline_tcp_congestion_ops_pkts_acked, seems appropriate.
> 
> Agree. This naming convention makes sense.
> I'd only shorten the prefix to 'bpf_tramp_' or even 'bpf__'
> (with double underscore).
> It's kinda obvious that it's a trampoline and it's an implementation
> detail that doesn't need to be present in the name.
>

OK, 'bpf__' looks great.

>>
>>>>           }
>>>>
>>>>           if (st_ops->validate) {
>>>> @@ -790,6 +829,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>>>    unlock:
>>>>           kfree(tlinks);
>>>>           mutex_unlock(&st_map->lock);
>>>> +       if (!err)
>>>> +               bpf_struct_ops_map_ksyms_add(st_map);
>>>>           return err;
>>>>    }
>>>>
>>>> @@ -883,6 +924,10 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>>>>            */
>>>>           synchronize_rcu_mult(call_rcu, call_rcu_tasks);
>>>>
>>>> +       /* no trampoline in the map is running anymore, delete symbols */
>>>> +       bpf_struct_ops_map_ksyms_del(st_map);
>>>> +       synchronize_rcu();
>>>> +
>>>
>>> This is substantial overhead and why ?
>>> synchronize_rcu_mult() is right above.
>>>
>>
>> I think we should ensure no trampoline is running or could run before
>> its ksym is deleted from the symbol table. If this order is not ensured,
>> a trampoline can be interrupted by a perf irq after its symbol is deleted,
>> resulting in a broken stacktrace since the trampoline symbol could not be
>> found by the perf irq handler.
>>
>> This patch deletes ksyms after synchronize_rcu_mult() to ensure this order.
> 
> But the overhead is prohibitive. We had broken stacks with st_ops
> for a long time, so it may still hit 0.001% where st_ops are being switched
> as the comment in bpf_struct_ops_map_free() explains.
>

Got it

> As a separate clean up I would switch the freeing to call_rcu_tasks.
> Synchronous waiting is expensive.
> 
> Martin,
> 
> any suggestions?
Xu Kuohai Nov. 5, 2024, 9:30 a.m. UTC | #8
On 11/5/2024 6:13 AM, Martin KaFai Lau wrote:
> On 11/4/24 9:53 AM, Alexei Starovoitov wrote:
> 
>> As a separate clean up I would switch the freeing to call_rcu_tasks.
>> Synchronous waiting is expensive.
>>
>> Martin,
>>
>> any suggestions?
> 
> There is a map->rcu now. Maybe add a "bool free_after_rcu_tasks_gp" to "struct bpf_map" and do the call_rcu_tasks() in bpf_map_put(). The bpf_struct_ops_map_alloc() can set the map->free_after_rcu_tasks_gp.
> 
> Take this chance to remove the "st_map->rcu" from "struct bpf_struct_ops_map" also. It is a leftover after cleaning up the kvalue->refcnt in the
> commit b671c2067a04 ("bpf: Retire the struct_ops map kvalue->refcnt.").
> 
> Xu, it will be great if you can follow up with this cleanup. Otherwise, I will put it at the top of my todo list. Let me know what you prefer.

Sure, I'll take this cleanup, thanks.
Xu Kuohai Nov. 5, 2024, 9:30 a.m. UTC | #9
On 11/5/2024 8:10 AM, Martin KaFai Lau wrote:
> On 11/1/24 4:19 AM, Xu Kuohai wrote:
>>   static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>                          void *value, u64 flags)
>>   {
>> @@ -601,6 +633,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>       int prog_fd, err;
>>       u32 i, trampoline_start, image_off = 0;
>>       void *cur_image = NULL, *image = NULL;
>> +    struct bpf_ksym *ksym;
>>       if (flags)
>>           return -EINVAL;
>> @@ -640,6 +673,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>       kdata = &kvalue->data;
>>       module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
>> +    ksym = st_map->ksyms;
>>       for_each_member(i, t, member) {
>>           const struct btf_type *mtype, *ptype;
>>           struct bpf_prog *prog;
>> @@ -735,6 +769,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>           /* put prog_id to udata */
>>           *(unsigned long *)(udata + moff) = prog->aux->id;
>> +
>> +        /* init ksym for this trampoline */
>> +        bpf_struct_ops_ksym_init(prog, image + trampoline_start,
>> +                     image_off - trampoline_start,
>> +                     ksym++);
>>       }
>>       if (st_ops->validate) {
>> @@ -790,6 +829,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
>>   unlock:
>>       kfree(tlinks);
>>       mutex_unlock(&st_map->lock);
>> +    if (!err)
>> +        bpf_struct_ops_map_ksyms_add(st_map);
>>       return err;
>>   }
> 
>>   static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>   {
>>       const struct bpf_struct_ops_desc *st_ops_desc;
>> @@ -905,6 +963,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>       struct bpf_map *map;
>>       struct btf *btf;
>>       int ret;
>> +    size_t ksyms_offset;
>> +    u32 ksyms_cnt;
>>       if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
>>           /* The map holds btf for its whole life time. */
>> @@ -951,6 +1011,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>            */
>>           (vt->size - sizeof(struct bpf_struct_ops_value));
>> +    st_map_size = round_up(st_map_size, sizeof(struct bpf_ksym));
>> +    ksyms_offset = st_map_size;
>> +    ksyms_cnt = count_func_ptrs(btf, t);
>> +    st_map_size += ksyms_cnt * sizeof(struct bpf_ksym);
>> +
>>       st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
>>       if (!st_map) {
>>           ret = -ENOMEM;
>> @@ -958,6 +1023,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>>       }
>>       st_map->st_ops_desc = st_ops_desc;
>> +    st_map->ksyms = (void *)st_map + ksyms_offset;
> 
> nit. The st_map->ksyms is very similar to the existing st_map->links. Can we do the allocation similar to the st_map->links and use another bpf_map_area_alloc() instead of doing the round_up() and then figuring out the ksyms_offset.
> 
>> +    st_map->ksyms_cnt = ksyms_cnt;
> 
> The same goes for ksyms_cnt. ksyms_cnt is almost the same as the st_map->links_cnt. st_map->links_cnt unnecessarily includes the non-func-ptr members (i.e. a waste). The st_map->links[i] must be NULL if the i-th member of a struct is not a func ptr.
> 
> If this patch adds the count_func_ptrs(), I think at least just have one variable to mean funcs_cnt instead of adding another new ksyms_cnt. Both the existing st_map->links and the new st_map->ksyms can use the same funcs_cnt. An adjustment is needed for 
> link in update_elem (probably use link++ similar to your ksym++ idea). bpf_struct_ops_map_put_progs() should work as is.
>

Great, agree.

> Also, the actual bpf_link is currently allocated during update_elem() only when there is a bpf prog for an ops. The new st_map->ksyms pre-allocates everything during map_alloc() regardless of whether there will be a bpf prog (e.g. tcp_congestion_ops has 5 optional ops). I don't have a strong opinion on pre-allocating everything in map_alloc() vs. allocating on-demand in update_elem(). However, considering bpf_ksym has a "char name[KSYM_NAME_LEN]", the on-demand allocation of bpf_link becomes not very useful. If the next respin stays with the pre-allocate-everything way, it would be useful to follow up later to settle on one approach and do the same for bpf_link.

OK, let’s go with the bpf_link way to avoid another cleanup patch.
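For reference, a rough sketch of that on-demand ("bpf_link way") direction, under the assumption that st_map->ksyms becomes an array of pointers filled lazily in update_elem() like st_map->links; kzalloc()/GFP_USER mirror the existing link allocation, and cur_ksym is an invented cursor, not code from this patch:

	/* Illustrative sketch: allocate the ksym only when the member
	 * actually has a bpf prog, next to the existing link kzalloc().
	 */
	ksym = kzalloc(sizeof(*ksym), GFP_USER);
	if (!ksym) {
		err = -ENOMEM;
		goto reset_unlock;
	}
	bpf_struct_ops_ksym_init(prog, image + trampoline_start,
				 image_off - trampoline_start, ksym);
	st_map->ksyms[cur_ksym++] = ksym;	/* cur_ksym: assumed cursor */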
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3ba4d475174..46f8d6c1a55c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1402,7 +1402,8 @@  int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym);
+void bpf_image_ksym_add(struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index fda3dd2ee984..172a081ed1c3 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -38,6 +38,9 @@  struct bpf_struct_ops_map {
 	 * that stores the func args before calling the bpf_prog.
 	 */
 	void *image_pages[MAX_TRAMP_IMAGE_PAGES];
+	u32 ksyms_cnt;
+	/* ksyms for bpf trampolines */
+	struct bpf_ksym *ksyms;
 	/* The owner moduler's btf. */
 	struct btf *btf;
 	/* uvalue->data stores the kernel struct
@@ -586,6 +589,35 @@  int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	return 0;
 }
 
+static void bpf_struct_ops_ksym_init(struct bpf_prog *prog, void *image,
+				     unsigned int size, struct bpf_ksym *ksym)
+{
+	int n;
+
+	n = strscpy(ksym->name, "bpf_trampoline_", KSYM_NAME_LEN);
+	strncat(ksym->name + n, prog->aux->ksym.name, KSYM_NAME_LEN - 1 - n);
+	INIT_LIST_HEAD_RCU(&ksym->lnode);
+	bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_ksyms_add(struct bpf_struct_ops_map *st_map)
+{
+	struct bpf_ksym *ksym = st_map->ksyms;
+	struct bpf_ksym *end = ksym + st_map->ksyms_cnt;
+
+	while (ksym != end && ksym->start)
+		bpf_image_ksym_add(ksym++);
+}
+
+static void bpf_struct_ops_map_ksyms_del(struct bpf_struct_ops_map *st_map)
+{
+	struct bpf_ksym *ksym = st_map->ksyms;
+	struct bpf_ksym *end = ksym + st_map->ksyms_cnt;
+
+	while (ksym != end && ksym->start)
+		bpf_image_ksym_del(ksym++);
+}
+
 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 					   void *value, u64 flags)
 {
@@ -601,6 +633,7 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	int prog_fd, err;
 	u32 i, trampoline_start, image_off = 0;
 	void *cur_image = NULL, *image = NULL;
+	struct bpf_ksym *ksym;
 
 	if (flags)
 		return -EINVAL;
@@ -640,6 +673,7 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	kdata = &kvalue->data;
 
 	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
+	ksym = st_map->ksyms;
 	for_each_member(i, t, member) {
 		const struct btf_type *mtype, *ptype;
 		struct bpf_prog *prog;
@@ -735,6 +769,11 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 		/* put prog_id to udata */
 		*(unsigned long *)(udata + moff) = prog->aux->id;
+
+		/* init ksym for this trampoline */
+		bpf_struct_ops_ksym_init(prog, image + trampoline_start,
+					 image_off - trampoline_start,
+					 ksym++);
 	}
 
 	if (st_ops->validate) {
@@ -790,6 +829,8 @@  static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 unlock:
 	kfree(tlinks);
 	mutex_unlock(&st_map->lock);
+	if (!err)
+		bpf_struct_ops_map_ksyms_add(st_map);
 	return err;
 }
 
@@ -883,6 +924,10 @@  static void bpf_struct_ops_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
 
+	/* no trampoline in the map is running anymore, delete symbols */
+	bpf_struct_ops_map_ksyms_del(st_map);
+	synchronize_rcu();
+
 	__bpf_struct_ops_map_free(map);
 }
 
@@ -895,6 +940,19 @@  static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
+static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
+{
+	int i;
+	u32 count;
+	const struct btf_member *member;
+
+	count = 0;
+	for_each_member(i, t, member)
+		if (btf_type_resolve_func_ptr(btf, member->type, NULL))
+			count++;
+	return count;
+}
+
 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 {
 	const struct bpf_struct_ops_desc *st_ops_desc;
@@ -905,6 +963,8 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	struct bpf_map *map;
 	struct btf *btf;
 	int ret;
+	size_t ksyms_offset;
+	u32 ksyms_cnt;
 
 	if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
 		/* The map holds btf for its whole life time. */
@@ -951,6 +1011,11 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		 */
 		(vt->size - sizeof(struct bpf_struct_ops_value));
 
+	st_map_size = round_up(st_map_size, sizeof(struct bpf_ksym));
+	ksyms_offset = st_map_size;
+	ksyms_cnt = count_func_ptrs(btf, t);
+	st_map_size += ksyms_cnt * sizeof(struct bpf_ksym);
+
 	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
 	if (!st_map) {
 		ret = -ENOMEM;
@@ -958,6 +1023,8 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	}
 
 	st_map->st_ops_desc = st_ops_desc;
+	st_map->ksyms = (void *)st_map + ksyms_offset;
+	st_map->ksyms_cnt = ksyms_cnt;
 	map = &st_map->map;
 
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 70fb82bf1637..b77db7413f8c 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,8 @@  void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 			d->image = NULL;
 			goto out;
 		}
-		bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
+		bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+		bpf_image_ksym_add(&d->ksym);
 	}
 
 	prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9f36c049f4c2..ecdd2660561f 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,14 @@  bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }
 
-void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
 {
 	ksym->start = (unsigned long) data;
 	ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
 	bpf_ksym_add(ksym);
 	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
 			   PAGE_SIZE, false, ksym->name);
@@ -377,7 +381,8 @@  static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
 	ksym = &im->ksym;
 	INIT_LIST_HEAD_RCU(&ksym->lnode);
 	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
-	bpf_image_ksym_add(image, size, ksym);
+	bpf_image_ksym_init(image, size, ksym);
+	bpf_image_ksym_add(ksym);
 	return im;
 
 out_free_image:
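One way to read the bpf_image_ksym_init()/bpf_image_ksym_add() split above: init only records the symbol's address range, while add publishes it (adds it to the kallsyms list and emits PERF_RECORD_KSYMBOL), so a caller can prepare symbols early and defer publication until the image is known good, which is what lets struct_ops run bpf_struct_ops_map_ksyms_add() only after update_elem() succeeds. A hypothetical consumer of the split API, with demo_* names invented for illustration:

	static struct bpf_ksym demo_ksym;

	static void demo_publish_image(void *image, unsigned int size)
	{
		INIT_LIST_HEAD_RCU(&demo_ksym.lnode);
		snprintf(demo_ksym.name, KSYM_NAME_LEN, "bpf_demo_image");
		bpf_image_ksym_init(image, size, &demo_ksym); /* record start/end only */
		bpf_image_ksym_add(&demo_ksym);	/* now visible in kallsyms */
	}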