diff mbox series

[bpf-next,1/2] bpf: Fix release of struct_ops map

Message ID 20241108082633.2338543-2-xukuohai@huaweicloud.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Fix release of struct_ops map | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5 this patch: 5
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers fail 1 blamed authors not CCed: kuifeng@meta.com; 7 maintainers not CCed: john.fastabend@gmail.com haoluo@google.com kpsingh@kernel.org song@kernel.org jolsa@kernel.org kuifeng@meta.com sdf@fomichev.me
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 5 this patch: 5
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 62 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Xu Kuohai Nov. 8, 2024, 8:26 a.m. UTC
From: Xu Kuohai <xukuohai@huawei.com>

The test in the follow-up patch triggers the following kernel panic:

 Oops: int3: 0000 [#1] PREEMPT SMP PTI
 CPU: 0 UID: 0 PID: 465 Comm: test_progs Tainted: G           OE      6.12.0-rc4-gd1d187515
 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-pr4
 RIP: 0010:0xffffffffc0015041
 Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ccc
 RSP: 0018:ffffc9000187fd20 EFLAGS: 00000246
 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
 RDX: 0000000000000000 RSI: ffffffff82c54639 RDI: 0000000000000000
 RBP: ffffc9000187fd48 R08: 0000000000000001 R09: 0000000000000000
 R10: 0000000000000001 R11: 000000004cba6383 R12: 0000000000000000
 R13: 0000000000000002 R14: ffff88810438b7a0 R15: ffffffff8369d7a0
 FS:  00007f05178006c0(0000) GS:ffff888236e00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f0508c94000 CR3: 0000000100d16003 CR4: 0000000000170ef0
 Call Trace:
  <TASK>
  ? show_regs+0x68/0x80
  ? die+0x3b/0x90
  ? exc_int3+0xca/0xe0
  ? asm_exc_int3+0x3e/0x50
  run_struct_ops+0x58/0xb0 [bpf_testmod]
  param_attr_store+0xa2/0x100
  module_attr_store+0x25/0x40
  sysfs_kf_write+0x50/0x70
  kernfs_fop_write_iter+0x146/0x1f0
  vfs_write+0x27e/0x530
  ksys_write+0x75/0x100
  __x64_sys_write+0x1d/0x30
  x64_sys_call+0x1d30/0x1f30
  do_syscall_64+0x68/0x140
  entry_SYSCALL_64_after_hwframe+0x76/0x7e
 RIP: 0033:0x7f051831727f
 Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 39 d5 f8 ff 48 8b 54 24 18 48 8b 74 24 108
 RSP: 002b:00007f05177ffdf0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
 RAX: ffffffffffffffda RBX: 00007f05178006c0 RCX: 00007f051831727f
 RDX: 0000000000000002 RSI: 00007f05177ffe30 RDI: 0000000000000004
 RBP: 00007f05177ffe90 R08: 0000000000000000 R09: 000000000000000b
 R10: 0000000000000000 R11: 0000000000000293 R12: ffffffffffffff30
 R13: 0000000000000002 R14: 00007ffd926bfd50 R15: 00007f0517000000
  </TASK>

It's because the sleepable prog is still running when the struct_ops
map is released.

To fix it, follow the approach used in bpf_tramp_image_put. First,
before release struct_ops map, wait a rcu_tasks_trace gp for sleepable
progs to finish. Then, wait a rcu_tasks gp for normal progs and the
rest trampoline insns to finish.

Additionally, switch to call_rcu to remove the synchronous waiting,
as suggested by Alexei and Martin.

Fixes: b671c2067a04 ("bpf: Retire the struct_ops map kvalue->refcnt.")
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
---
 kernel/bpf/bpf_struct_ops.c | 37 +++++++++++++++++++------------------
 kernel/bpf/syscall.c        |  7 ++++++-
 2 files changed, 25 insertions(+), 19 deletions(-)

Comments

Alexei Starovoitov Nov. 8, 2024, 5 p.m. UTC | #1
On Fri, Nov 8, 2024 at 12:15 AM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>
> From: Xu Kuohai <xukuohai@huawei.com>
>
> The test in the follow-up patch triggers the following kernel panic:
>
>  Oops: int3: 0000 [#1] PREEMPT SMP PTI
>  CPU: 0 UID: 0 PID: 465 Comm: test_progs Tainted: G           OE      6.12.0-rc4-gd1d187515
>  Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-pr4
>  RIP: 0010:0xffffffffc0015041
>  Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ccc
>  RSP: 0018:ffffc9000187fd20 EFLAGS: 00000246
>  RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
>  RDX: 0000000000000000 RSI: ffffffff82c54639 RDI: 0000000000000000
>  RBP: ffffc9000187fd48 R08: 0000000000000001 R09: 0000000000000000
>  R10: 0000000000000001 R11: 000000004cba6383 R12: 0000000000000000
>  R13: 0000000000000002 R14: ffff88810438b7a0 R15: ffffffff8369d7a0
>  FS:  00007f05178006c0(0000) GS:ffff888236e00000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 00007f0508c94000 CR3: 0000000100d16003 CR4: 0000000000170ef0
>  Call Trace:
>   <TASK>
>   ? show_regs+0x68/0x80
>   ? die+0x3b/0x90
>   ? exc_int3+0xca/0xe0
>   ? asm_exc_int3+0x3e/0x50
>   run_struct_ops+0x58/0xb0 [bpf_testmod]
>   param_attr_store+0xa2/0x100
>   module_attr_store+0x25/0x40
>   sysfs_kf_write+0x50/0x70
>   kernfs_fop_write_iter+0x146/0x1f0
>   vfs_write+0x27e/0x530
>   ksys_write+0x75/0x100
>   __x64_sys_write+0x1d/0x30
>   x64_sys_call+0x1d30/0x1f30
>   do_syscall_64+0x68/0x140
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>  RIP: 0033:0x7f051831727f
>  Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 39 d5 f8 ff 48 8b 54 24 18 48 8b 74 24 108
>  RSP: 002b:00007f05177ffdf0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
>  RAX: ffffffffffffffda RBX: 00007f05178006c0 RCX: 00007f051831727f
>  RDX: 0000000000000002 RSI: 00007f05177ffe30 RDI: 0000000000000004
>  RBP: 00007f05177ffe90 R08: 0000000000000000 R09: 000000000000000b
>  R10: 0000000000000000 R11: 0000000000000293 R12: ffffffffffffff30
>  R13: 0000000000000002 R14: 00007ffd926bfd50 R15: 00007f0517000000
>   </TASK>
>
> It's because the sleepable prog is still running when the struct_ops
> map is released.
>
> To fix it, follow the approach used in bpf_tramp_image_put. First,
> before release struct_ops map, wait a rcu_tasks_trace gp for sleepable
> progs to finish. Then, wait a rcu_tasks gp for normal progs and the
> rest trampoline insns to finish.
>
> Additionally, switch to call_rcu to remove the synchronous waiting,
> as suggested by Alexei and Martin.
>
> Fixes: b671c2067a04 ("bpf: Retire the struct_ops map kvalue->refcnt.")
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> ---
>  kernel/bpf/bpf_struct_ops.c | 37 +++++++++++++++++++------------------
>  kernel/bpf/syscall.c        |  7 ++++++-
>  2 files changed, 25 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index fda3dd2ee984..dd5f9bf12791 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -865,24 +865,6 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>          */
>         if (btf_is_module(st_map->btf))
>                 module_put(st_map->st_ops_desc->st_ops->owner);
> -
> -       /* The struct_ops's function may switch to another struct_ops.
> -        *
> -        * For example, bpf_tcp_cc_x->init() may switch to
> -        * another tcp_cc_y by calling
> -        * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> -        * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
> -        * and its refcount may reach 0 which then free its
> -        * trampoline image while tcp_cc_x is still running.
> -        *
> -        * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
> -        * to finish. bpf-tcp-cc prog is non sleepable.
> -        * A rcu_tasks gp is to wait for the last few insn
> -        * in the tramopline image to finish before releasing
> -        * the trampoline image.
> -        */
> -       synchronize_rcu_mult(call_rcu, call_rcu_tasks);
> -
>         __bpf_struct_ops_map_free(map);
>  }
>
> @@ -974,6 +956,25 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
>         mutex_init(&st_map->lock);
>         bpf_map_init_from_attr(map, attr);
>
> +       /* The struct_ops's function may switch to another struct_ops.
> +        *
> +        * For example, bpf_tcp_cc_x->init() may switch to
> +        * another tcp_cc_y by calling
> +        * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> +        * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
> +        * and its refcount may reach 0 which then free its
> +        * trampoline image while tcp_cc_x is still running.
> +        *
> +        * Since struct_ops prog can be sleepable, a rcu_tasks_trace gp
> +        * is to wait for sleepable progs in the map to finish. Then a
> +        * rcu_tasks gp is to wait for the normal progs and the last few
> +        * insns in the tramopline image to finish before releasing the
> +        * trampoline image.
> +        *
> +        * Also see the comment in function bpf_tramp_image_put.
> +        */
> +       WRITE_ONCE(map->free_after_mult_rcu_gp, true);
> +
>         return map;
>
>  errout_free:
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8254b2973157..ae927f087f04 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -886,7 +886,12 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
>
>  static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
>  {
> -       if (rcu_trace_implies_rcu_gp())
> +       struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);
> +
> +       if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
> +               /* See comment in the end of bpf_struct_ops_map_alloc */

The fix makes sense, but pls copy paste a sentence here instead
of the above comment. Like:
"
rcu_tasks gp is necessary to wait for struct_ops bpf trampoline to finish.
Unlike all other bpf trampolines struct_ops trampoline is not
protected by percpu_ref.
"
> +               call_rcu_tasks(rcu, bpf_map_free_rcu_gp);
> +       else if (rcu_trace_implies_rcu_gp())

pw-bot: cr
diff mbox series

Patch

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index fda3dd2ee984..dd5f9bf12791 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -865,24 +865,6 @@  static void bpf_struct_ops_map_free(struct bpf_map *map)
 	 */
 	if (btf_is_module(st_map->btf))
 		module_put(st_map->st_ops_desc->st_ops->owner);
-
-	/* The struct_ops's function may switch to another struct_ops.
-	 *
-	 * For example, bpf_tcp_cc_x->init() may switch to
-	 * another tcp_cc_y by calling
-	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
-	 * and its refcount may reach 0 which then free its
-	 * trampoline image while tcp_cc_x is still running.
-	 *
-	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
-	 * to finish. bpf-tcp-cc prog is non sleepable.
-	 * A rcu_tasks gp is to wait for the last few insn
-	 * in the tramopline image to finish before releasing
-	 * the trampoline image.
-	 */
-	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
-
 	__bpf_struct_ops_map_free(map);
 }
 
@@ -974,6 +956,25 @@  static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	mutex_init(&st_map->lock);
 	bpf_map_init_from_attr(map, attr);
 
+	/* The struct_ops's function may switch to another struct_ops.
+	 *
+	 * For example, bpf_tcp_cc_x->init() may switch to
+	 * another tcp_cc_y by calling
+	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
+	 * and its refcount may reach 0 which then free its
+	 * trampoline image while tcp_cc_x is still running.
+	 *
+	 * Since struct_ops prog can be sleepable, a rcu_tasks_trace gp
+	 * is to wait for sleepable progs in the map to finish. Then a
+	 * rcu_tasks gp is to wait for the normal progs and the last few
+	 * insns in the tramopline image to finish before releasing the
+	 * trampoline image.
+	 *
+	 * Also see the comment in function bpf_tramp_image_put.
+	 */
+	WRITE_ONCE(map->free_after_mult_rcu_gp, true);
+
 	return map;
 
 errout_free:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8254b2973157..ae927f087f04 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -886,7 +886,12 @@  static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
 
 static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
 {
-	if (rcu_trace_implies_rcu_gp())
+	struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);
+
+	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+		/* See comment in the end of bpf_struct_ops_map_alloc */
+		call_rcu_tasks(rcu, bpf_map_free_rcu_gp);
+	else if (rcu_trace_implies_rcu_gp())
 		bpf_map_free_rcu_gp(rcu);
 	else
 		call_rcu(rcu, bpf_map_free_rcu_gp);