diff mbox series

[bpf-next,v5] bpf: Fix a race condition between btf_put() and map_free()

Message ID 20231208041621.2968241-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series [bpf-next,v5] bpf: Fix a race condition between btf_put() and map_free() | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7874 this patch: 7874
netdev/cc_maintainers fail 1 blamed authors not CCed: memxor@gmail.com; 8 maintainers not CCed: kpsingh@kernel.org sdf@google.com john.fastabend@gmail.com martin.lau@linux.dev song@kernel.org haoluo@google.com jolsa@kernel.org memxor@gmail.com
netdev/build_clang success Errors and warnings before: 2577 this patch: 2577
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 8409 this patch: 8409
netdev/checkpatch fail ERROR: Avoid using diff content in the commit message - patch(1) might not work WARNING: Possible repeated word: 'that' WARNING: line length of 100 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Yonghong Song Dec. 8, 2023, 4:16 a.m. UTC
When running `./test_progs -j` in my local vm with latest kernel,
I once hit a kasan error like below:

  [ 1887.184724] BUG: KASAN: slab-use-after-free in bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.185599] Read of size 4 at addr ffff888106806910 by task kworker/u12:2/2830
  [ 1887.186498]
  [ 1887.186712] CPU: 3 PID: 2830 Comm: kworker/u12:2 Tainted: G           OEL     6.7.0-rc3-00699-g90679706d486-dirty #494
  [ 1887.188034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  [ 1887.189618] Workqueue: events_unbound bpf_map_free_deferred
  [ 1887.190341] Call Trace:
  [ 1887.190666]  <TASK>
  [ 1887.190949]  dump_stack_lvl+0xac/0xe0
  [ 1887.191423]  ? nf_tcp_handle_invalid+0x1b0/0x1b0
  [ 1887.192019]  ? panic+0x3c0/0x3c0
  [ 1887.192449]  print_report+0x14f/0x720
  [ 1887.192930]  ? preempt_count_sub+0x1c/0xd0
  [ 1887.193459]  ? __virt_addr_valid+0xac/0x120
  [ 1887.194004]  ? bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.194572]  kasan_report+0xc3/0x100
  [ 1887.195085]  ? bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.195668]  bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.196183]  ? __bpf_obj_drop_impl+0xb0/0xb0
  [ 1887.196736]  ? preempt_count_sub+0x1c/0xd0
  [ 1887.197270]  ? preempt_count_sub+0x1c/0xd0
  [ 1887.197802]  ? _raw_spin_unlock+0x1f/0x40
  [ 1887.198319]  bpf_obj_free_fields+0x1d4/0x260
  [ 1887.198883]  array_map_free+0x1a3/0x260
  [ 1887.199380]  bpf_map_free_deferred+0x7b/0xe0
  [ 1887.199943]  process_scheduled_works+0x3a2/0x6c0
  [ 1887.200549]  worker_thread+0x633/0x890
  [ 1887.201047]  ? __kthread_parkme+0xd7/0xf0
  [ 1887.201574]  ? kthread+0x102/0x1d0
  [ 1887.202020]  kthread+0x1ab/0x1d0
  [ 1887.202447]  ? pr_cont_work+0x270/0x270
  [ 1887.202954]  ? kthread_blkcg+0x50/0x50
  [ 1887.203444]  ret_from_fork+0x34/0x50
  [ 1887.203914]  ? kthread_blkcg+0x50/0x50
  [ 1887.204397]  ret_from_fork_asm+0x11/0x20
  [ 1887.204913]  </TASK>
  [ 1887.204913]  </TASK>
  [ 1887.205209]
  [ 1887.205416] Allocated by task 2197:
  [ 1887.205881]  kasan_set_track+0x3f/0x60
  [ 1887.206366]  __kasan_kmalloc+0x6e/0x80
  [ 1887.206856]  __kmalloc+0xac/0x1a0
  [ 1887.207293]  btf_parse_fields+0xa15/0x1480
  [ 1887.207836]  btf_parse_struct_metas+0x566/0x670
  [ 1887.208387]  btf_new_fd+0x294/0x4d0
  [ 1887.208851]  __sys_bpf+0x4ba/0x600
  [ 1887.209292]  __x64_sys_bpf+0x41/0x50
  [ 1887.209762]  do_syscall_64+0x4c/0xf0
  [ 1887.210222]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
  [ 1887.210868]
  [ 1887.211074] Freed by task 36:
  [ 1887.211460]  kasan_set_track+0x3f/0x60
  [ 1887.211951]  kasan_save_free_info+0x28/0x40
  [ 1887.212485]  ____kasan_slab_free+0x101/0x180
  [ 1887.213027]  __kmem_cache_free+0xe4/0x210
  [ 1887.213514]  btf_free+0x5b/0x130
  [ 1887.213918]  rcu_core+0x638/0xcc0
  [ 1887.214347]  __do_softirq+0x114/0x37e

The error happens at bpf_rb_root_free+0x1f8/0x2b0:

  00000000000034c0 <bpf_rb_root_free>:
  ; {
    34c0: f3 0f 1e fa                   endbr64
    34c4: e8 00 00 00 00                callq   0x34c9 <bpf_rb_root_free+0x9>
    34c9: 55                            pushq   %rbp
    34ca: 48 89 e5                      movq    %rsp, %rbp
  ...
  ;       if (rec && rec->refcount_off >= 0 &&
    36aa: 4d 85 ed                      testq   %r13, %r13
    36ad: 74 a9                         je      0x3658 <bpf_rb_root_free+0x198>
    36af: 49 8d 7d 10                   leaq    0x10(%r13), %rdi
    36b3: e8 00 00 00 00                callq   0x36b8 <bpf_rb_root_free+0x1f8>
                                        <==== kasan function
    36b8: 45 8b 7d 10                   movl    0x10(%r13), %r15d
                                        <==== use-after-free load
    36bc: 45 85 ff                      testl   %r15d, %r15d
    36bf: 78 8c                         js      0x364d <bpf_rb_root_free+0x18d>

So the problem is at rec->refcount_off in the above.

I did some source code analysis and found the reason.
                                  CPU A                        CPU B
  bpf_map_put:
    ...
    btf_put with rcu callback
    ...
    bpf_map_free_deferred
      with system_unbound_wq
    ...                          ...                           ...
    ...                          btf_free_rcu:                 ...
    ...                          ...                           bpf_map_free_deferred:
    ...                          ...
    ...         --------->       btf_struct_metas_free()
    ...         | race condition ...
    ...         --------->                                     map->ops->map_free()
    ...
    ...                          btf->struct_meta_tab = NULL

In the above, map_free() corresponds to array_map_free() and eventually
calling bpf_rb_root_free() which calls:
  ...
  __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
  ...

Here, 'value_rec' is assigned in btf_check_and_fixup_fields() with following code:

  meta = btf_find_struct_meta(btf, btf_id);
  if (!meta)
    return -EFAULT;
  rec->fields[i].graph_root.value_rec = meta->record;

So basically, 'value_rec' is a pointer to the record in struct_metas_tab.
And it is possible that the particular record has been freed by
btf_struct_metas_free() and hence we have a kasan error here.

Actually it is very hard to reproduce the failure with current bpf/bpf-next
code, I only got the above error once. To increase reproducibility, I added
a delay in bpf_map_free_deferred() to delay map->ops->map_free(), which
significantly increased reproducibility.

  diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
  index 5e43ddd1b83f..aae5b5213e93 100644
  --- a/kernel/bpf/syscall.c
  +++ b/kernel/bpf/syscall.c
  @@ -695,6 +695,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
        struct bpf_map *map = container_of(work, struct bpf_map, work);
        struct btf_record *rec = map->record;

  +     mdelay(100);
        security_bpf_map_free(map);
        bpf_map_release_memcg(map);
        /* implementation dependent freeing */

To fix the problem, we need to have a reference on btf in order to
safeguard accessing field->graph_root.value_rec in map->ops->map_free().
The function btf_parse_graph_root() is the place to get a btf reference.
The following are rough call stacks reaching btf_parse_graph_root():

   btf_parse
     ...
       btf_parse_fields
         ...
           btf_parse_graph_root

   map_check_btf
     btf_parse_fields
       ...
         btf_parse_graph_root

Looking at the above call stack, the btf_parse_graph_root() is indirectly
called from btf_parse() or map_check_btf().

We cannot take a reference in btf_parse() case since at that moment,
btf is still in the middle of self-validation and the initial reference
(refcount_set(&btf->refcnt, 1)) has not been triggered yet.
But even if refcount_set() is moved earlier, taking reference
will have other issues at btf_put() stage.
  btf_put <== to start destruction process unless refcount == 0
    ...
    btf_record_free <== in which if graph_root, a btf reference will be held
                    <== reference is taken during btf_parse()
In order to free the reference taken during btf_parse(), we need to
have refcount 0 in btf_put(), but this is impossible since we need to
free the reference taken during btf_parse() in order to get refcount 0
for btf_put(). So the end result is btf_put() will not be able to
free btf-related memories and we have memory leak here.
Luckily, there is no need to take a btf reference during btf_parse()
since eventually graph_root.value_rec will refer to the rec in btf
struct_meta_tab.

But we do need to take a reference for map_check_btf() case as
the value_rec refers to a record in struct_meta_tab and we want
to make sure btf is not freed until value_rec usage at map_free
is done.

So the fix is to get a btf reference for map_check_btf() case
whenever a graph_root is found. A boolean field 'has_btf_ref' is added to
struct btf_field_graph_root so later the btf reference can be properly
released.

Rerun './test_progs -j' with the above mdelay() hack for a couple
of times and didn't observe the error.
Running Hou's test ([1]) is also successful.

  [1] https://lore.kernel.org/bpf/20231206110625.3188975-1-houtao@huaweicloud.com/

Cc: Hou Tao <houtao@huaweicloud.com>
Fixes: 958cf2e273f0 ("bpf: Introduce bpf_obj_new")
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h  |  1 +
 include/linux/btf.h  |  2 +-
 kernel/bpf/btf.c     | 27 ++++++++++++++++++---------
 kernel/bpf/syscall.c | 12 +++++++++---
 4 files changed, 29 insertions(+), 13 deletions(-)

Comments

Yonghong Song Dec. 8, 2023, 4:18 a.m. UTC | #1
On 12/7/23 8:16 PM, Yonghong Song wrote:
> When running `./test_progs -j` in my local vm with latest kernel,
> I once hit a kasan error like below:
>
>    [ 1887.184724] BUG: KASAN: slab-use-after-free in bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.185599] Read of size 4 at addr ffff888106806910 by task kworker/u12:2/2830
>    [ 1887.186498]
>    [ 1887.186712] CPU: 3 PID: 2830 Comm: kworker/u12:2 Tainted: G           OEL     6.7.0-rc3-00699-g90679706d486-dirty #494
>    [ 1887.188034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
>    [ 1887.189618] Workqueue: events_unbound bpf_map_free_deferred
>    [ 1887.190341] Call Trace:
>    [ 1887.190666]  <TASK>
>    [ 1887.190949]  dump_stack_lvl+0xac/0xe0
>    [ 1887.191423]  ? nf_tcp_handle_invalid+0x1b0/0x1b0
>    [ 1887.192019]  ? panic+0x3c0/0x3c0
>    [ 1887.192449]  print_report+0x14f/0x720
>    [ 1887.192930]  ? preempt_count_sub+0x1c/0xd0
>    [ 1887.193459]  ? __virt_addr_valid+0xac/0x120
>    [ 1887.194004]  ? bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.194572]  kasan_report+0xc3/0x100
>    [ 1887.195085]  ? bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.195668]  bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.196183]  ? __bpf_obj_drop_impl+0xb0/0xb0
>    [ 1887.196736]  ? preempt_count_sub+0x1c/0xd0
>    [ 1887.197270]  ? preempt_count_sub+0x1c/0xd0
>    [ 1887.197802]  ? _raw_spin_unlock+0x1f/0x40
>    [ 1887.198319]  bpf_obj_free_fields+0x1d4/0x260
>    [ 1887.198883]  array_map_free+0x1a3/0x260
>    [ 1887.199380]  bpf_map_free_deferred+0x7b/0xe0
>    [ 1887.199943]  process_scheduled_works+0x3a2/0x6c0
>    [ 1887.200549]  worker_thread+0x633/0x890
>    [ 1887.201047]  ? __kthread_parkme+0xd7/0xf0
>    [ 1887.201574]  ? kthread+0x102/0x1d0
>    [ 1887.202020]  kthread+0x1ab/0x1d0
>    [ 1887.202447]  ? pr_cont_work+0x270/0x270
>    [ 1887.202954]  ? kthread_blkcg+0x50/0x50
>    [ 1887.203444]  ret_from_fork+0x34/0x50
>    [ 1887.203914]  ? kthread_blkcg+0x50/0x50
>    [ 1887.204397]  ret_from_fork_asm+0x11/0x20
>    [ 1887.204913]  </TASK>
>    [ 1887.204913]  </TASK>
>    [ 1887.205209]
>    [ 1887.205416] Allocated by task 2197:
>    [ 1887.205881]  kasan_set_track+0x3f/0x60
>    [ 1887.206366]  __kasan_kmalloc+0x6e/0x80
>    [ 1887.206856]  __kmalloc+0xac/0x1a0
>    [ 1887.207293]  btf_parse_fields+0xa15/0x1480
>    [ 1887.207836]  btf_parse_struct_metas+0x566/0x670
>    [ 1887.208387]  btf_new_fd+0x294/0x4d0
>    [ 1887.208851]  __sys_bpf+0x4ba/0x600
>    [ 1887.209292]  __x64_sys_bpf+0x41/0x50
>    [ 1887.209762]  do_syscall_64+0x4c/0xf0
>    [ 1887.210222]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
>    [ 1887.210868]
>    [ 1887.211074] Freed by task 36:
>    [ 1887.211460]  kasan_set_track+0x3f/0x60
>    [ 1887.211951]  kasan_save_free_info+0x28/0x40
>    [ 1887.212485]  ____kasan_slab_free+0x101/0x180
>    [ 1887.213027]  __kmem_cache_free+0xe4/0x210
>    [ 1887.213514]  btf_free+0x5b/0x130
>    [ 1887.213918]  rcu_core+0x638/0xcc0
>    [ 1887.214347]  __do_softirq+0x114/0x37e
>
> The error happens at bpf_rb_root_free+0x1f8/0x2b0:
>
>    00000000000034c0 <bpf_rb_root_free>:
>    ; {
>      34c0: f3 0f 1e fa                   endbr64
>      34c4: e8 00 00 00 00                callq   0x34c9 <bpf_rb_root_free+0x9>
>      34c9: 55                            pushq   %rbp
>      34ca: 48 89 e5                      movq    %rsp, %rbp
>    ...
>    ;       if (rec && rec->refcount_off >= 0 &&
>      36aa: 4d 85 ed                      testq   %r13, %r13
>      36ad: 74 a9                         je      0x3658 <bpf_rb_root_free+0x198>
>      36af: 49 8d 7d 10                   leaq    0x10(%r13), %rdi
>      36b3: e8 00 00 00 00                callq   0x36b8 <bpf_rb_root_free+0x1f8>
>                                          <==== kasan function
>      36b8: 45 8b 7d 10                   movl    0x10(%r13), %r15d
>                                          <==== use-after-free load
>      36bc: 45 85 ff                      testl   %r15d, %r15d
>      36bf: 78 8c                         js      0x364d <bpf_rb_root_free+0x18d>
>
> So the problem is at rec->refcount_off in the above.
>
> I did some source code analysis and find the reason.
>                                    CPU A                        CPU B
>    bpf_map_put:
>      ...
>      btf_put with rcu callback
>      ...
>      bpf_map_free_deferred
>        with system_unbound_wq
>      ...                          ...                           ...
>      ...                          btf_free_rcu:                 ...
>      ...                          ...                           bpf_map_free_deferred:
>      ...                          ...
>      ...         --------->       btf_struct_metas_free()
>      ...         | race condition ...
>      ...         --------->                                     map->ops->map_free()
>      ...
>      ...                          btf->struct_meta_tab = NULL
>
> In the above, map_free() corresponds to array_map_free() and eventually
> calling bpf_rb_root_free() which calls:
>    ...
>    __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
>    ...
>
> Here, 'value_rec' is assigned in btf_check_and_fixup_fields() with following code:
>
>    meta = btf_find_struct_meta(btf, btf_id);
>    if (!meta)
>      return -EFAULT;
>    rec->fields[i].graph_root.value_rec = meta->record;
>
> So basically, 'value_rec' is a pointer to the record in struct_metas_tab.
> And it is possible that the particular record has been freed by
> btf_struct_metas_free() and hence we have a kasan error here.
>
> Actually it is very hard to reproduce the failure with current bpf/bpf-next
> code, I only got the above error once. To increase reproducibility, I added
> a delay in bpf_map_free_deferred() to delay map->ops->map_free(), which
> significantly increased reproducibility.
>
>    diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>    index 5e43ddd1b83f..aae5b5213e93 100644
>    --- a/kernel/bpf/syscall.c
>    +++ b/kernel/bpf/syscall.c
>    @@ -695,6 +695,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
>          struct bpf_map *map = container_of(work, struct bpf_map, work);
>          struct btf_record *rec = map->record;
>
>    +     mdelay(100);
>          security_bpf_map_free(map);
>          bpf_map_release_memcg(map);
>          /* implementation dependent freeing */
>
> To fix the problem, we need to have a reference on btf in order to
> safeguard accessing field->graph_root.value_rec in map->ops->map_free().
> The function btf_parse_graph_root() is the place to get a btf reference.
> The following are rough call stacks reaching btf_parse_graph_root():
>
>     btf_parse
>       ...
>         btf_parse_fields
>           ...
>             btf_parse_graph_root
>
>     map_check_btf
>       btf_parse_fields
>         ...
>           btf_parse_graph_root
>
> Looking at the above call stack, the btf_parse_graph_root() is indirectly
> called from btf_parse() or map_check_btf().
>
> We cannot take a reference in btf_parse() case since at that moment,
> btf is still in the middle of self-validation and the initial reference
> (refcount_set(&btf->refcnt, 1)) has not been triggered yet.
> But even if refcount_set() is moved earlier, taking reference
> will have other issues at btf_put() stage.
>    btf_put <== to start destruction process unless refcount == 0
>      ...
>      btf_record_free <== in which if graph_root, a btf reference will be held
>                      <== reference is taken during btf_parse()
> In order to free the reference taken during btf_parse(), we need to
> have refcount 0 in btf_put(), but this is impossible since we need to
> free the reference taken during btf_parse() in order to get refcount 0
> for btf_put(). So the end result is btf_put() will not be able to
> free btf-related memories and we have memory leak here.
> Luckily, there is no need to take a btf reference during btf_parse()
> since eventually graph_root.value_rec will refer to the rec in btf
> struct_meta_tab.
>
> But we do need to take a reference for map_check_btf() case as
> the value_rec refers to a record in struct_meta_tab and we want
> to make sure btf is not freed until value_rec usage at map_free
> is done.
>
> So the fix is to get a btf reference for map_check_btf() case
> whenever a graph_root is found. A boolean field 'has_btf_ref' is added to
> struct btf_field_graph_root so later the btf reference can be properly
> released.
>
> Rerun './test_progs -j' with the above mdelay() hack for a couple
> of times and didn't observe the error.
> Running Hou's test ([1]) is also successful.
>
>    [1] https://lore.kernel.org/bpf/20231206110625.3188975-1-houtao@huaweicloud.com/
>
> Cc: Hou Tao <houtao@huaweicloud.com>
> Fixes: 958cf2e273f0 ("bpf: Introduce bpf_obj_new")
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>

Forgot to add Hou's ack from v4. Since there is no code change, here the 
ack is:

Acked-by: Hou Tao <houtao1@huawei.com>

> ---
>   include/linux/bpf.h  |  1 +
>   include/linux/btf.h  |  2 +-
>   kernel/bpf/btf.c     | 27 ++++++++++++++++++---------
>   kernel/bpf/syscall.c | 12 +++++++++---
>   4 files changed, 29 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c1a06263a4f3..cdeb0c95ec74 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -216,6 +216,7 @@ struct btf_field_graph_root {
>   	u32 value_btf_id;
>   	u32 node_offset;
>   	struct btf_record *value_rec;
> +	bool has_btf_ref;
>   };
>   
>   struct btf_field {
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index 59d404e22814..f7d4f6594308 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -217,7 +217,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
>   			   const struct btf_member *m,
>   			   u32 expected_offset, u32 expected_size);
>   struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
> -				    u32 field_mask, u32 value_size);
> +				    u32 field_mask, u32 value_size, bool from_map_check);
>   int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec);
>   bool btf_type_is_void(const struct btf_type *t);
>   s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 63cf4128fc05..f23d6a7b8b93 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3665,7 +3665,8 @@ static int btf_parse_graph_root(const struct btf *btf,
>   				struct btf_field *field,
>   				struct btf_field_info *info,
>   				const char *node_type_name,
> -				size_t node_type_align)
> +				size_t node_type_align,
> +				bool from_map_check)
>   {
>   	const struct btf_type *t, *n = NULL;
>   	const struct btf_member *member;
> @@ -3696,6 +3697,9 @@ static int btf_parse_graph_root(const struct btf *btf,
>   		if (offset % node_type_align)
>   			return -EINVAL;
>   
> +		if (from_map_check)
> +			btf_get((struct btf *)btf);
> +		field->graph_root.has_btf_ref = from_map_check;
>   		field->graph_root.btf = (struct btf *)btf;
>   		field->graph_root.value_btf_id = info->graph_root.value_btf_id;
>   		field->graph_root.node_offset = offset;
> @@ -3706,17 +3710,19 @@ static int btf_parse_graph_root(const struct btf *btf,
>   }
>   
>   static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
> -			       struct btf_field_info *info)
> +			       struct btf_field_info *info, bool from_map_check)
>   {
>   	return btf_parse_graph_root(btf, field, info, "bpf_list_node",
> -					    __alignof__(struct bpf_list_node));
> +				    __alignof__(struct bpf_list_node),
> +				    from_map_check);
>   }
>   
>   static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
> -			     struct btf_field_info *info)
> +			     struct btf_field_info *info, bool from_map_check)
>   {
>   	return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
> -					    __alignof__(struct bpf_rb_node));
> +				    __alignof__(struct bpf_rb_node),
> +				    from_map_check);
>   }
>   
>   static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
> @@ -3732,7 +3738,7 @@ static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
>   }
>   
>   struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
> -				    u32 field_mask, u32 value_size)
> +				    u32 field_mask, u32 value_size, bool from_map_check)
>   {
>   	struct btf_field_info info_arr[BTF_FIELDS_MAX];
>   	u32 next_off = 0, field_type_size;
> @@ -3798,12 +3804,14 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
>   				goto end;
>   			break;
>   		case BPF_LIST_HEAD:
> -			ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
> +			ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i],
> +						  from_map_check);
>   			if (ret < 0)
>   				goto end;
>   			break;
>   		case BPF_RB_ROOT:
> -			ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
> +			ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i],
> +						from_map_check);
>   			if (ret < 0)
>   				goto end;
>   			break;
> @@ -5390,7 +5398,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
>   		type = &tab->types[tab->cnt];
>   		type->btf_id = i;
>   		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
> -						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
> +						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size,
> +					  false);
>   		/* The record cannot be unset, treat it as an error if so */
>   		if (IS_ERR_OR_NULL(record)) {
>   			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index aff045eed375..bdc81221baf7 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -521,8 +521,11 @@ void btf_record_free(struct btf_record *rec)
>   			btf_put(rec->fields[i].kptr.btf);
>   			break;
>   		case BPF_LIST_HEAD:
> -		case BPF_LIST_NODE:
>   		case BPF_RB_ROOT:
> +			if (rec->fields[i].graph_root.has_btf_ref)
> +				btf_put(rec->fields[i].graph_root.btf);
> +			break;
> +		case BPF_LIST_NODE:
>   		case BPF_RB_NODE:
>   		case BPF_SPIN_LOCK:
>   		case BPF_TIMER:
> @@ -570,8 +573,11 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
>   			}
>   			break;
>   		case BPF_LIST_HEAD:
> -		case BPF_LIST_NODE:
>   		case BPF_RB_ROOT:
> +			if (fields[i].graph_root.has_btf_ref)
> +				btf_get(fields[i].graph_root.btf);
> +			break;
> +		case BPF_LIST_NODE:
>   		case BPF_RB_NODE:
>   		case BPF_SPIN_LOCK:
>   		case BPF_TIMER:
> @@ -1034,7 +1040,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
>   	map->record = btf_parse_fields(btf, value_type,
>   				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
>   				       BPF_RB_ROOT | BPF_REFCOUNT,
> -				       map->value_size);
> +				       map->value_size, true);
>   	if (!IS_ERR_OR_NULL(map->record)) {
>   		int i;
>
Martin KaFai Lau Dec. 8, 2023, 6:16 p.m. UTC | #2
On 12/7/23 8:16 PM, Yonghong Song wrote:
> When running `./test_progs -j` in my local vm with latest kernel,
> I once hit a kasan error like below:
> 
>    [ 1887.184724] BUG: KASAN: slab-use-after-free in bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.185599] Read of size 4 at addr ffff888106806910 by task kworker/u12:2/2830
>    [ 1887.186498]
>    [ 1887.186712] CPU: 3 PID: 2830 Comm: kworker/u12:2 Tainted: G           OEL     6.7.0-rc3-00699-g90679706d486-dirty #494
>    [ 1887.188034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
>    [ 1887.189618] Workqueue: events_unbound bpf_map_free_deferred
>    [ 1887.190341] Call Trace:
>    [ 1887.190666]  <TASK>
>    [ 1887.190949]  dump_stack_lvl+0xac/0xe0
>    [ 1887.191423]  ? nf_tcp_handle_invalid+0x1b0/0x1b0
>    [ 1887.192019]  ? panic+0x3c0/0x3c0
>    [ 1887.192449]  print_report+0x14f/0x720
>    [ 1887.192930]  ? preempt_count_sub+0x1c/0xd0
>    [ 1887.193459]  ? __virt_addr_valid+0xac/0x120
>    [ 1887.194004]  ? bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.194572]  kasan_report+0xc3/0x100
>    [ 1887.195085]  ? bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.195668]  bpf_rb_root_free+0x1f8/0x2b0
>    [ 1887.196183]  ? __bpf_obj_drop_impl+0xb0/0xb0
>    [ 1887.196736]  ? preempt_count_sub+0x1c/0xd0
>    [ 1887.197270]  ? preempt_count_sub+0x1c/0xd0
>    [ 1887.197802]  ? _raw_spin_unlock+0x1f/0x40
>    [ 1887.198319]  bpf_obj_free_fields+0x1d4/0x260
>    [ 1887.198883]  array_map_free+0x1a3/0x260
>    [ 1887.199380]  bpf_map_free_deferred+0x7b/0xe0
>    [ 1887.199943]  process_scheduled_works+0x3a2/0x6c0
>    [ 1887.200549]  worker_thread+0x633/0x890
>    [ 1887.201047]  ? __kthread_parkme+0xd7/0xf0
>    [ 1887.201574]  ? kthread+0x102/0x1d0
>    [ 1887.202020]  kthread+0x1ab/0x1d0
>    [ 1887.202447]  ? pr_cont_work+0x270/0x270
>    [ 1887.202954]  ? kthread_blkcg+0x50/0x50
>    [ 1887.203444]  ret_from_fork+0x34/0x50
>    [ 1887.203914]  ? kthread_blkcg+0x50/0x50
>    [ 1887.204397]  ret_from_fork_asm+0x11/0x20
>    [ 1887.204913]  </TASK>
>    [ 1887.204913]  </TASK>
>    [ 1887.205209]
>    [ 1887.205416] Allocated by task 2197:
>    [ 1887.205881]  kasan_set_track+0x3f/0x60
>    [ 1887.206366]  __kasan_kmalloc+0x6e/0x80
>    [ 1887.206856]  __kmalloc+0xac/0x1a0
>    [ 1887.207293]  btf_parse_fields+0xa15/0x1480
>    [ 1887.207836]  btf_parse_struct_metas+0x566/0x670
>    [ 1887.208387]  btf_new_fd+0x294/0x4d0
>    [ 1887.208851]  __sys_bpf+0x4ba/0x600
>    [ 1887.209292]  __x64_sys_bpf+0x41/0x50
>    [ 1887.209762]  do_syscall_64+0x4c/0xf0
>    [ 1887.210222]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
>    [ 1887.210868]
>    [ 1887.211074] Freed by task 36:
>    [ 1887.211460]  kasan_set_track+0x3f/0x60
>    [ 1887.211951]  kasan_save_free_info+0x28/0x40
>    [ 1887.212485]  ____kasan_slab_free+0x101/0x180
>    [ 1887.213027]  __kmem_cache_free+0xe4/0x210
>    [ 1887.213514]  btf_free+0x5b/0x130
>    [ 1887.213918]  rcu_core+0x638/0xcc0
>    [ 1887.214347]  __do_softirq+0x114/0x37e
> 
> The error happens at bpf_rb_root_free+0x1f8/0x2b0:
> 
>    00000000000034c0 <bpf_rb_root_free>:
>    ; {
>      34c0: f3 0f 1e fa                   endbr64
>      34c4: e8 00 00 00 00                callq   0x34c9 <bpf_rb_root_free+0x9>
>      34c9: 55                            pushq   %rbp
>      34ca: 48 89 e5                      movq    %rsp, %rbp
>    ...
>    ;       if (rec && rec->refcount_off >= 0 &&
>      36aa: 4d 85 ed                      testq   %r13, %r13
>      36ad: 74 a9                         je      0x3658 <bpf_rb_root_free+0x198>
>      36af: 49 8d 7d 10                   leaq    0x10(%r13), %rdi
>      36b3: e8 00 00 00 00                callq   0x36b8 <bpf_rb_root_free+0x1f8>
>                                          <==== kasan function
>      36b8: 45 8b 7d 10                   movl    0x10(%r13), %r15d
>                                          <==== use-after-free load
>      36bc: 45 85 ff                      testl   %r15d, %r15d
>      36bf: 78 8c                         js      0x364d <bpf_rb_root_free+0x18d>
> 
> So the problem is at rec->refcount_off in the above.
> 
> I did some source code analysis and find the reason.
>                                    CPU A                        CPU B
>    bpf_map_put:
>      ...
>      btf_put with rcu callback
>      ...
>      bpf_map_free_deferred
>        with system_unbound_wq
>      ...                          ...                           ...
>      ...                          btf_free_rcu:                 ...
>      ...                          ...                           bpf_map_free_deferred:
>      ...                          ...
>      ...         --------->       btf_struct_metas_free()
>      ...         | race condition ...
>      ...         --------->                                     map->ops->map_free()
>      ...
>      ...                          btf->struct_meta_tab = NULL
> 
> In the above, map_free() corresponds to array_map_free() and eventually
> calling bpf_rb_root_free() which calls:
>    ...
>    __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
>    ...
> 
> Here, 'value_rec' is assigned in btf_check_and_fixup_fields() with following code:
> 
>    meta = btf_find_struct_meta(btf, btf_id);
>    if (!meta)
>      return -EFAULT;
>    rec->fields[i].graph_root.value_rec = meta->record;
> 
> So basically, 'value_rec' is a pointer to the record in struct_metas_tab.
> And it is possible that this particular record has been freed by
> btf_struct_metas_free() and hence we have a kasan error here.
> 
> Actually it is very hard to reproduce the failure with current bpf/bpf-next
> code, I only got the above error once. To increase reproducibility, I added
> a delay in bpf_map_free_deferred() to delay map->ops->map_free(), which
> significantly increased reproducibility.
> 
>    diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>    index 5e43ddd1b83f..aae5b5213e93 100644
>    --- a/kernel/bpf/syscall.c
>    +++ b/kernel/bpf/syscall.c
>    @@ -695,6 +695,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
>          struct bpf_map *map = container_of(work, struct bpf_map, work);
>          struct btf_record *rec = map->record;
> 
>    +     mdelay(100);
>          security_bpf_map_free(map);
>          bpf_map_release_memcg(map);
>          /* implementation dependent freeing */
> 
> To fix the problem, we need to have a reference on btf in order to
> safeguard accessing field->graph_root.value_rec in map->ops->map_free().
> The function btf_parse_graph_root() is the place to get a btf reference.
> The following are rough call stacks reaching btf_parse_graph_root():
> 
>     btf_parse
>       ...
>         btf_parse_fields
>           ...
>             btf_parse_graph_root
> 
>     map_check_btf
>       btf_parse_fields
>         ...
>           btf_parse_graph_root
> 
> Looking at the above call stack, the btf_parse_graph_root() is indirectly
> called from btf_parse() or map_check_btf().
> 
> We cannot take a reference in the btf_parse() case since at that moment,
> btf is still in the middle of self-validation and the initial reference
> (refcount_set(&btf->refcnt, 1)) has not been triggered yet.
> But even if refcount_set() is moved earlier, taking a reference
> will have other issues at the btf_put() stage.
>    btf_put <== to start destruction process unless refcount == 0
>      ...
>      btf_record_free <== in which if graph_root, a btf reference will be held
>                      <== reference is taken during btf_parse()
> In order to free the reference taken during btf_parse(), we need to
> have refcount 0 in btf_put(), but this is impossible since we need to
> free the reference taken during btf_parse() in order to get refcount 0
> for btf_put(). So the end result is btf_put() will not be able to
> free btf-related memories and we have memory leak here.
> Luckily, there is no need to take a btf reference during btf_parse()
> since eventually graph_root.value_rec will refer to the rec in btf
> struct_meta_tab.
> 
> But we do need to take a reference for map_check_btf() case as
> the value_rec refers to a record in struct_meta_tab and we want
> to make sure btf is not freed until value_rec usage at map_free
> is done.
> 
> So the fix is to get a btf reference for map_check_btf() case
> whenever a graph_root is found. A boolean field 'has_btf_ref' is added to
> struct btf_field_graph_root so later the btf reference can be properly
> released.
> 
> Reran './test_progs -j' with the above mdelay() hack a couple
> of times and didn't observe the error.
> Running Hou's test ([1]) is also successful.
> 
>    [1] https://lore.kernel.org/bpf/20231206110625.3188975-1-houtao@huaweicloud.com/

Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c1a06263a4f3..cdeb0c95ec74 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,6 +216,7 @@  struct btf_field_graph_root {
 	u32 value_btf_id;
 	u32 node_offset;
 	struct btf_record *value_rec;
+	bool has_btf_ref;
 };
 
 struct btf_field {
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 59d404e22814..f7d4f6594308 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -217,7 +217,7 @@  bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
-				    u32 field_mask, u32 value_size);
+				    u32 field_mask, u32 value_size, bool from_map_check);
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63cf4128fc05..f23d6a7b8b93 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3665,7 +3665,8 @@  static int btf_parse_graph_root(const struct btf *btf,
 				struct btf_field *field,
 				struct btf_field_info *info,
 				const char *node_type_name,
-				size_t node_type_align)
+				size_t node_type_align,
+				bool from_map_check)
 {
 	const struct btf_type *t, *n = NULL;
 	const struct btf_member *member;
@@ -3696,6 +3697,9 @@  static int btf_parse_graph_root(const struct btf *btf,
 		if (offset % node_type_align)
 			return -EINVAL;
 
+		if (from_map_check)
+			btf_get((struct btf *)btf);
+		field->graph_root.has_btf_ref = from_map_check;
 		field->graph_root.btf = (struct btf *)btf;
 		field->graph_root.value_btf_id = info->graph_root.value_btf_id;
 		field->graph_root.node_offset = offset;
@@ -3706,17 +3710,19 @@  static int btf_parse_graph_root(const struct btf *btf,
 }
 
 static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
-			       struct btf_field_info *info)
+			       struct btf_field_info *info, bool from_map_check)
 {
 	return btf_parse_graph_root(btf, field, info, "bpf_list_node",
-					    __alignof__(struct bpf_list_node));
+				    __alignof__(struct bpf_list_node),
+				    from_map_check);
 }
 
 static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
-			     struct btf_field_info *info)
+			     struct btf_field_info *info, bool from_map_check)
 {
 	return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
-					    __alignof__(struct bpf_rb_node));
+				    __alignof__(struct bpf_rb_node),
+				    from_map_check);
 }
 
 static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
@@ -3732,7 +3738,7 @@  static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
 }
 
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
-				    u32 field_mask, u32 value_size)
+				    u32 field_mask, u32 value_size, bool from_map_check)
 {
 	struct btf_field_info info_arr[BTF_FIELDS_MAX];
 	u32 next_off = 0, field_type_size;
@@ -3798,12 +3804,14 @@  struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 				goto end;
 			break;
 		case BPF_LIST_HEAD:
-			ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+			ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i],
+						  from_map_check);
 			if (ret < 0)
 				goto end;
 			break;
 		case BPF_RB_ROOT:
-			ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+			ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i],
+						from_map_check);
 			if (ret < 0)
 				goto end;
 			break;
@@ -5390,7 +5398,8 @@  btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 		type = &tab->types[tab->cnt];
 		type->btf_id = i;
 		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
-						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size,
+					  false);
 		/* The record cannot be unset, treat it as an error if so */
 		if (IS_ERR_OR_NULL(record)) {
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aff045eed375..bdc81221baf7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -521,8 +521,11 @@  void btf_record_free(struct btf_record *rec)
 			btf_put(rec->fields[i].kptr.btf);
 			break;
 		case BPF_LIST_HEAD:
-		case BPF_LIST_NODE:
 		case BPF_RB_ROOT:
+			if (rec->fields[i].graph_root.has_btf_ref)
+				btf_put(rec->fields[i].graph_root.btf);
+			break;
+		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
@@ -570,8 +573,11 @@  struct btf_record *btf_record_dup(const struct btf_record *rec)
 			}
 			break;
 		case BPF_LIST_HEAD:
-		case BPF_LIST_NODE:
 		case BPF_RB_ROOT:
+			if (fields[i].graph_root.has_btf_ref)
+				btf_get(fields[i].graph_root.btf);
+			break;
+		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
@@ -1034,7 +1040,7 @@  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
 				       BPF_RB_ROOT | BPF_REFCOUNT,
-				       map->value_size);
+				       map->value_size, true);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;