diff mbox series

bpf: Convert lpm_trie::lock to 'raw_spinlock_t'

Message ID 20241108063214.578120-1-kunwu.chan@linux.dev (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Convert lpm_trie::lock to 'raw_spinlock_t' | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 18 of 18 maintainers
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 48 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-7 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / build-release

Commit Message

Kunwu Chan Nov. 8, 2024, 6:32 a.m. UTC
From: Kunwu Chan <chentao@kylinos.cn>

When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible,
and the bpf program may already hold a raw_spinlock in an interrupt
handler, which results in an invalid lock acquire context.

[ BUG: Invalid wait context ]
6.12.0-rc5-next-20241031-syzkaller #0 Not tainted
-----------------------------
swapper/0/0 is trying to lock:
ffff8880261e7a00 (&trie->lock){....}-{3:3},
at: trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
other info that might help us debug this:
context-{3:3}
5 locks held by swapper/0/0:
 #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
at: vp_vring_interrupt drivers/virtio/virtio_pci_common.c:80 [inline]
 #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
at: vp_interrupt+0x142/0x200 drivers/virtio/virtio_pci_common.c:113
 #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
at: spin_lock include/linux/spinlock.h:351 [inline]
 #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
at: stats_request+0x6f/0x230 drivers/virtio/virtio_balloon.c:438
 #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
 #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
 #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: __queue_work+0x199/0xf50 kernel/workqueue.c:2259
 #3: ffff8880b863dd18 (&pool->lock){-.-.}-{2:2},
at: __queue_work+0x759/0xf50
 #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
 #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
 #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: __bpf_trace_run kernel/trace/bpf_trace.c:2339 [inline]
 #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
at: bpf_trace_run1+0x1d6/0x520 kernel/trace/bpf_trace.c:2380
stack backtrace:
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted
6.12.0-rc5-next-20241031-syzkaller #0
Hardware name: Google Compute Engine/Google Compute Engine,
BIOS Google 09/13/2024
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:94 [inline]
 dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline]
 check_wait_context kernel/locking/lockdep.c:4898 [inline]
 __lock_acquire+0x15a8/0x2100 kernel/locking/lockdep.c:5176
 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
 _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
 trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
 bpf_prog_2c29ac5cdc6b1842+0x43/0x47
 bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
 __bpf_prog_run include/linux/filter.h:701 [inline]
 bpf_prog_run include/linux/filter.h:708 [inline]
 __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
 bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
 trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
 __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338
 queue_work_on+0x1c2/0x380 kernel/workqueue.c:2390
 queue_work include/linux/workqueue.h:662 [inline]
 stats_request+0x1a3/0x230 drivers/virtio/virtio_balloon.c:441
 vring_interrupt+0x21d/0x380 drivers/virtio/virtio_ring.c:2595
 vp_vring_interrupt drivers/virtio/virtio_pci_common.c:82 [inline]
 vp_interrupt+0x192/0x200 drivers/virtio/virtio_pci_common.c:113
 __handle_irq_event_percpu+0x29a/0xa80 kernel/irq/handle.c:158
 handle_irq_event_percpu kernel/irq/handle.c:193 [inline]
 handle_irq_event+0x89/0x1f0 kernel/irq/handle.c:210
 handle_fasteoi_irq+0x48a/0xae0 kernel/irq/chip.c:720
 generic_handle_irq_desc include/linux/irqdesc.h:173 [inline]
 handle_irq arch/x86/kernel/irq.c:247 [inline]
 call_irq_handler arch/x86/kernel/irq.c:259 [inline]
 __common_interrupt+0x136/0x230 arch/x86/kernel/irq.c:285
 common_interrupt+0xb4/0xd0 arch/x86/kernel/irq.c:278
 </IRQ>

Reported-by: syzbot+b506de56cbbb63148c33@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/bpf/6723db4a.050a0220.35b515.0168.GAE@google.com/
Fixes: 66150d0dde03 ("bpf, lpm: Make locking RT friendly")
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
---
 kernel/bpf/lpm_trie.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

Comments

Alexei Starovoitov Nov. 8, 2024, 8:22 p.m. UTC | #1
On Thu, Nov 7, 2024 at 10:32 PM Kunwu Chan <kunwu.chan@linux.dev> wrote:
>
> From: Kunwu Chan <chentao@kylinos.cn>
>
> When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible
> and bpf program has owned a raw_spinlock under a interrupt handler,
> which results in invalid lock acquire context.
>
> [ BUG: Invalid wait context ]
> 6.12.0-rc5-next-20241031-syzkaller #0 Not tainted
> -----------------------------
> swapper/0/0 is trying to lock:
> ffff8880261e7a00 (&trie->lock){....}-{3:3},
> at: trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
> other info that might help us debug this:
> context-{3:3}
> 5 locks held by swapper/0/0:
>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
> at: vp_vring_interrupt drivers/virtio/virtio_pci_common.c:80 [inline]
>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
> at: vp_interrupt+0x142/0x200 drivers/virtio/virtio_pci_common.c:113
>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
> at: spin_lock include/linux/spinlock.h:351 [inline]
>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
> at: stats_request+0x6f/0x230 drivers/virtio/virtio_balloon.c:438
>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: __queue_work+0x199/0xf50 kernel/workqueue.c:2259
>  #3: ffff8880b863dd18 (&pool->lock){-.-.}-{2:2},
> at: __queue_work+0x759/0xf50
>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: __bpf_trace_run kernel/trace/bpf_trace.c:2339 [inline]
>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> at: bpf_trace_run1+0x1d6/0x520 kernel/trace/bpf_trace.c:2380
> stack backtrace:
> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted
> 6.12.0-rc5-next-20241031-syzkaller #0
> Hardware name: Google Compute Engine/Google Compute Engine,
> BIOS Google 09/13/2024
> Call Trace:
>  <IRQ>
>  __dump_stack lib/dump_stack.c:94 [inline]
>  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
>  print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline]
>  check_wait_context kernel/locking/lockdep.c:4898 [inline]
>  __lock_acquire+0x15a8/0x2100 kernel/locking/lockdep.c:5176
>  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>  _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
>  trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462

This trace is from non-RT kernel where spin_lock == raw_spin_lock.

I don't think Hou's explanation earlier is correct.
https://lore.kernel.org/bpf/e14d8882-4760-7c9c-0cfc-db04eda494ee@huaweicloud.com/

>  bpf_prog_2c29ac5cdc6b1842+0x43/0x47
>  bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
>  __bpf_prog_run include/linux/filter.h:701 [inline]
>  bpf_prog_run include/linux/filter.h:708 [inline]
>  __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
>  bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
>  trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
>  __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338
>  queue_work_on+0x1c2/0x380 kernel/workqueue.c:2390

here irqs are disabled, but raw_spin_lock in lpm should be fine.

>  queue_work include/linux/workqueue.h:662 [inline]
>  stats_request+0x1a3/0x230 drivers/virtio/virtio_balloon.c:441
>  vring_interrupt+0x21d/0x380 drivers/virtio/virtio_ring.c:2595
>  vp_vring_interrupt drivers/virtio/virtio_pci_common.c:82 [inline]
>  vp_interrupt+0x192/0x200 drivers/virtio/virtio_pci_common.c:113
>  __handle_irq_event_percpu+0x29a/0xa80 kernel/irq/handle.c:158
>  handle_irq_event_percpu kernel/irq/handle.c:193 [inline]
>  handle_irq_event+0x89/0x1f0 kernel/irq/handle.c:210
>  handle_fasteoi_irq+0x48a/0xae0 kernel/irq/chip.c:720
>  generic_handle_irq_desc include/linux/irqdesc.h:173 [inline]
>  handle_irq arch/x86/kernel/irq.c:247 [inline]
>  call_irq_handler arch/x86/kernel/irq.c:259 [inline]
>  __common_interrupt+0x136/0x230 arch/x86/kernel/irq.c:285
>  common_interrupt+0xb4/0xd0 arch/x86/kernel/irq.c:278
>  </IRQ>
>
> Reported-by: syzbot+b506de56cbbb63148c33@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/bpf/6723db4a.050a0220.35b515.0168.GAE@google.com/
> Fixes: 66150d0dde03 ("bpf, lpm: Make locking RT friendly")
> Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
> ---
>  kernel/bpf/lpm_trie.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> index 9b60eda0f727..373cdcfa0505 100644
> --- a/kernel/bpf/lpm_trie.c
> +++ b/kernel/bpf/lpm_trie.c
> @@ -35,7 +35,7 @@ struct lpm_trie {
>         size_t                          n_entries;
>         size_t                          max_prefixlen;
>         size_t                          data_size;
> -       spinlock_t                      lock;
> +       raw_spinlock_t                  lock;
>  };

We're certainly not going back.

pw-bot: cr
Hou Tao Nov. 9, 2024, 2:46 a.m. UTC | #2
Hi Alexei,

On 11/9/2024 4:22 AM, Alexei Starovoitov wrote:
> On Thu, Nov 7, 2024 at 10:32 PM Kunwu Chan <kunwu.chan@linux.dev> wrote:
>> From: Kunwu Chan <chentao@kylinos.cn>
>>
>> When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible
>> and bpf program has owned a raw_spinlock under a interrupt handler,
>> which results in invalid lock acquire context.
>>
>> [ BUG: Invalid wait context ]
>> 6.12.0-rc5-next-20241031-syzkaller #0 Not tainted
>> -----------------------------
>> swapper/0/0 is trying to lock:
>> ffff8880261e7a00 (&trie->lock){....}-{3:3},
>> at: trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
>> other info that might help us debug this:
>> context-{3:3}
>> 5 locks held by swapper/0/0:
>>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
>> at: vp_vring_interrupt drivers/virtio/virtio_pci_common.c:80 [inline]
>>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
>> at: vp_interrupt+0x142/0x200 drivers/virtio/virtio_pci_common.c:113
>>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
>> at: spin_lock include/linux/spinlock.h:351 [inline]
>>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
>> at: stats_request+0x6f/0x230 drivers/virtio/virtio_balloon.c:438
>>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
>>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
>>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: __queue_work+0x199/0xf50 kernel/workqueue.c:2259
>>  #3: ffff8880b863dd18 (&pool->lock){-.-.}-{2:2},
>> at: __queue_work+0x759/0xf50
>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: __bpf_trace_run kernel/trace/bpf_trace.c:2339 [inline]
>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>> at: bpf_trace_run1+0x1d6/0x520 kernel/trace/bpf_trace.c:2380
>> stack backtrace:
>> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted
>> 6.12.0-rc5-next-20241031-syzkaller #0
>> Hardware name: Google Compute Engine/Google Compute Engine,
>> BIOS Google 09/13/2024
>> Call Trace:
>>  <IRQ>
>>  __dump_stack lib/dump_stack.c:94 [inline]
>>  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
>>  print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline]
>>  check_wait_context kernel/locking/lockdep.c:4898 [inline]
>>  __lock_acquire+0x15a8/0x2100 kernel/locking/lockdep.c:5176
>>  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>>  _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
>>  trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
> This trace is from non-RT kernel where spin_lock == raw_spin_lock.

Yes. However, I think the reason for the warning is that lockdep
considers the case is possible under PREEMPT_RT and it violates the rule
of lock [1].

[1]:
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=560af5dc839eef08a273908f390cfefefb82aa04
>
> I don't think Hou's explanation earlier is correct.
> https://lore.kernel.org/bpf/e14d8882-4760-7c9c-0cfc-db04eda494ee@huaweicloud.com/

OK. Is the bpf mem allocator part OK for you ?
>
>>  bpf_prog_2c29ac5cdc6b1842+0x43/0x47
>>  bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
>>  __bpf_prog_run include/linux/filter.h:701 [inline]
>>  bpf_prog_run include/linux/filter.h:708 [inline]
>>  __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
>>  bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
>>  trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
>>  __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338
>>  queue_work_on+0x1c2/0x380 kernel/workqueue.c:2390
> here irqs are disabled, but raw_spin_lock in lpm should be fine.
>
>>  queue_work include/linux/workqueue.h:662 [inline]
>>  stats_request+0x1a3/0x230 drivers/virtio/virtio_balloon.c:441
>>  vring_interrupt+0x21d/0x380 drivers/virtio/virtio_ring.c:2595
>>  vp_vring_interrupt drivers/virtio/virtio_pci_common.c:82 [inline]
>>  vp_interrupt+0x192/0x200 drivers/virtio/virtio_pci_common.c:113
>>  __handle_irq_event_percpu+0x29a/0xa80 kernel/irq/handle.c:158
>>  handle_irq_event_percpu kernel/irq/handle.c:193 [inline]
>>  handle_irq_event+0x89/0x1f0 kernel/irq/handle.c:210
>>  handle_fasteoi_irq+0x48a/0xae0 kernel/irq/chip.c:720
>>  generic_handle_irq_desc include/linux/irqdesc.h:173 [inline]
>>  handle_irq arch/x86/kernel/irq.c:247 [inline]
>>  call_irq_handler arch/x86/kernel/irq.c:259 [inline]
>>  __common_interrupt+0x136/0x230 arch/x86/kernel/irq.c:285
>>  common_interrupt+0xb4/0xd0 arch/x86/kernel/irq.c:278
>>  </IRQ>
>>
>> Reported-by: syzbot+b506de56cbbb63148c33@syzkaller.appspotmail.com
>> Closes: https://lore.kernel.org/bpf/6723db4a.050a0220.35b515.0168.GAE@google.com/
>> Fixes: 66150d0dde03 ("bpf, lpm: Make locking RT friendly")
>> Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
>> ---
>>  kernel/bpf/lpm_trie.c | 12 ++++++------
>>  1 file changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 9b60eda0f727..373cdcfa0505 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -35,7 +35,7 @@ struct lpm_trie {
>>         size_t                          n_entries;
>>         size_t                          max_prefixlen;
>>         size_t                          data_size;
>> -       spinlock_t                      lock;
>> +       raw_spinlock_t                  lock;
>>  };
> We're certainly not going back.

Only switching from spinlock_t to raw_spinlock_t is not enough, running
it under PREEMPT_RT after the change will still trigger the similar
lockdep warning. That is because kmalloc() may acquire a spinlock_t as
well. However, after changing the kmalloc and its variants to bpf memory
allocator, I think the switch to raw_spinlock_t will be safe. I have
already written a draft patch set. Will post it after polishing and
testing it. WDYT ?
>
> pw-bot: cr
Alexei Starovoitov Nov. 10, 2024, 12:04 a.m. UTC | #3
On Fri, Nov 8, 2024 at 6:46 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi Alexei,
>
> On 11/9/2024 4:22 AM, Alexei Starovoitov wrote:
> > On Thu, Nov 7, 2024 at 10:32 PM Kunwu Chan <kunwu.chan@linux.dev> wrote:
> >> From: Kunwu Chan <chentao@kylinos.cn>
> >>
> >> When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible
> >> and bpf program has owned a raw_spinlock under a interrupt handler,
> >> which results in invalid lock acquire context.
> >>
> >> [ BUG: Invalid wait context ]
> >> 6.12.0-rc5-next-20241031-syzkaller #0 Not tainted
> >> -----------------------------
> >> swapper/0/0 is trying to lock:
> >> ffff8880261e7a00 (&trie->lock){....}-{3:3},
> >> at: trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
> >> other info that might help us debug this:
> >> context-{3:3}
> >> 5 locks held by swapper/0/0:
> >>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
> >> at: vp_vring_interrupt drivers/virtio/virtio_pci_common.c:80 [inline]
> >>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
> >> at: vp_interrupt+0x142/0x200 drivers/virtio/virtio_pci_common.c:113
> >>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
> >> at: spin_lock include/linux/spinlock.h:351 [inline]
> >>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
> >> at: stats_request+0x6f/0x230 drivers/virtio/virtio_balloon.c:438
> >>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
> >>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
> >>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: __queue_work+0x199/0xf50 kernel/workqueue.c:2259
> >>  #3: ffff8880b863dd18 (&pool->lock){-.-.}-{2:2},
> >> at: __queue_work+0x759/0xf50
> >>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
> >>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
> >>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: __bpf_trace_run kernel/trace/bpf_trace.c:2339 [inline]
> >>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
> >> at: bpf_trace_run1+0x1d6/0x520 kernel/trace/bpf_trace.c:2380
> >> stack backtrace:
> >> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted
> >> 6.12.0-rc5-next-20241031-syzkaller #0
> >> Hardware name: Google Compute Engine/Google Compute Engine,
> >> BIOS Google 09/13/2024
> >> Call Trace:
> >>  <IRQ>
> >>  __dump_stack lib/dump_stack.c:94 [inline]
> >>  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
> >>  print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline]
> >>  check_wait_context kernel/locking/lockdep.c:4898 [inline]
> >>  __lock_acquire+0x15a8/0x2100 kernel/locking/lockdep.c:5176
> >>  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
> >>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> >>  _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
> >>  trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
> > This trace is from non-RT kernel where spin_lock == raw_spin_lock.
>
> Yes. However, I think the reason for the warning is that lockdep
> considers the case is possible under PREEMPT_RT and it violates the rule
> of lock [1].
>
> [1]:
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=560af5dc839eef08a273908f390cfefefb82aa04
> >
> > I don't think Hou's explanation earlier is correct.
> > https://lore.kernel.org/bpf/e14d8882-4760-7c9c-0cfc-db04eda494ee@huaweicloud.com/
>
> OK. Is the bpf mem allocator part OK for you ?
> >
> >>  bpf_prog_2c29ac5cdc6b1842+0x43/0x47
> >>  bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
> >>  __bpf_prog_run include/linux/filter.h:701 [inline]
> >>  bpf_prog_run include/linux/filter.h:708 [inline]
> >>  __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
> >>  bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
> >>  trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
> >>  __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338
> >>  queue_work_on+0x1c2/0x380 kernel/workqueue.c:2390
> > here irqs are disabled, but raw_spin_lock in lpm should be fine.
> >
> >>  queue_work include/linux/workqueue.h:662 [inline]
> >>  stats_request+0x1a3/0x230 drivers/virtio/virtio_balloon.c:441
> >>  vring_interrupt+0x21d/0x380 drivers/virtio/virtio_ring.c:2595
> >>  vp_vring_interrupt drivers/virtio/virtio_pci_common.c:82 [inline]
> >>  vp_interrupt+0x192/0x200 drivers/virtio/virtio_pci_common.c:113
> >>  __handle_irq_event_percpu+0x29a/0xa80 kernel/irq/handle.c:158
> >>  handle_irq_event_percpu kernel/irq/handle.c:193 [inline]
> >>  handle_irq_event+0x89/0x1f0 kernel/irq/handle.c:210
> >>  handle_fasteoi_irq+0x48a/0xae0 kernel/irq/chip.c:720
> >>  generic_handle_irq_desc include/linux/irqdesc.h:173 [inline]
> >>  handle_irq arch/x86/kernel/irq.c:247 [inline]
> >>  call_irq_handler arch/x86/kernel/irq.c:259 [inline]
> >>  __common_interrupt+0x136/0x230 arch/x86/kernel/irq.c:285
> >>  common_interrupt+0xb4/0xd0 arch/x86/kernel/irq.c:278
> >>  </IRQ>
> >>
> >> Reported-by: syzbot+b506de56cbbb63148c33@syzkaller.appspotmail.com
> >> Closes: https://lore.kernel.org/bpf/6723db4a.050a0220.35b515.0168.GAE@google.com/
> >> Fixes: 66150d0dde03 ("bpf, lpm: Make locking RT friendly")
> >> Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
> >> ---
> >>  kernel/bpf/lpm_trie.c | 12 ++++++------
> >>  1 file changed, 6 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> >> index 9b60eda0f727..373cdcfa0505 100644
> >> --- a/kernel/bpf/lpm_trie.c
> >> +++ b/kernel/bpf/lpm_trie.c
> >> @@ -35,7 +35,7 @@ struct lpm_trie {
> >>         size_t                          n_entries;
> >>         size_t                          max_prefixlen;
> >>         size_t                          data_size;
> >> -       spinlock_t                      lock;
> >> +       raw_spinlock_t                  lock;
> >>  };
> > We're certainly not going back.
>
> Only switching from spinlock_t to raw_spinlock_t is not enough, running
> it under PREEMPT_RT after the change will still trigger the similar
> lockdep warning. That is because kmalloc() may acquire a spinlock_t as
> well. However, after changing the kmalloc and its variants to bpf memory
> allocator, I think the switch to raw_spinlock_t will be safe. I have
> already written a draft patch set. Will post after after polishing and
> testing it. WDYT ?

Switching lpm to bpf_mem_alloc would address the issue.
Why do you want a switch to raw_spin_lock as well?
kfree_rcu() is already done outside of the lock.
Hou Tao Nov. 10, 2024, 2:08 a.m. UTC | #4
Hi,

On 11/10/2024 8:04 AM, Alexei Starovoitov wrote:
> On Fri, Nov 8, 2024 at 6:46 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi Alexei,
>>
>> On 11/9/2024 4:22 AM, Alexei Starovoitov wrote:
>>> On Thu, Nov 7, 2024 at 10:32 PM Kunwu Chan <kunwu.chan@linux.dev> wrote:
>>>> From: Kunwu Chan <chentao@kylinos.cn>
>>>>
>>>> When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible
>>>> and bpf program has owned a raw_spinlock under a interrupt handler,
>>>> which results in invalid lock acquire context.
>>>>
>>>> [ BUG: Invalid wait context ]
>>>> 6.12.0-rc5-next-20241031-syzkaller #0 Not tainted
>>>> -----------------------------
>>>> swapper/0/0 is trying to lock:
>>>> ffff8880261e7a00 (&trie->lock){....}-{3:3},
>>>> at: trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
>>>> other info that might help us debug this:
>>>> context-{3:3}
>>>> 5 locks held by swapper/0/0:
>>>>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
>>>> at: vp_vring_interrupt drivers/virtio/virtio_pci_common.c:80 [inline]
>>>>  #0: ffff888020bb75c8 (&vp_dev->lock){-...}-{3:3},
>>>> at: vp_interrupt+0x142/0x200 drivers/virtio/virtio_pci_common.c:113
>>>>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
>>>> at: spin_lock include/linux/spinlock.h:351 [inline]
>>>>  #1: ffff88814174a120 (&vb->stop_update_lock){-...}-{3:3},
>>>> at: stats_request+0x6f/0x230 drivers/virtio/virtio_balloon.c:438
>>>>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
>>>>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
>>>>  #2: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: __queue_work+0x199/0xf50 kernel/workqueue.c:2259
>>>>  #3: ffff8880b863dd18 (&pool->lock){-.-.}-{2:2},
>>>> at: __queue_work+0x759/0xf50
>>>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline]
>>>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: rcu_read_lock include/linux/rcupdate.h:849 [inline]
>>>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: __bpf_trace_run kernel/trace/bpf_trace.c:2339 [inline]
>>>>  #4: ffffffff8e939f20 (rcu_read_lock){....}-{1:3},
>>>> at: bpf_trace_run1+0x1d6/0x520 kernel/trace/bpf_trace.c:2380
>>>> stack backtrace:
>>>> CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted
>>>> 6.12.0-rc5-next-20241031-syzkaller #0
>>>> Hardware name: Google Compute Engine/Google Compute Engine,
>>>> BIOS Google 09/13/2024
>>>> Call Trace:
>>>>  <IRQ>
>>>>  __dump_stack lib/dump_stack.c:94 [inline]
>>>>  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
>>>>  print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline]
>>>>  check_wait_context kernel/locking/lockdep.c:4898 [inline]
>>>>  __lock_acquire+0x15a8/0x2100 kernel/locking/lockdep.c:5176
>>>>  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>>>>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>>>>  _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
>>>>  trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
>>> This trace is from non-RT kernel where spin_lock == raw_spin_lock.
>> Yes. However, I think the reason for the warning is that lockdep
>> considers the case is possible under PREEMPT_RT and it violates the rule
>> of lock [1].
>>
>> [1]:
>> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=560af5dc839eef08a273908f390cfefefb82aa04
>>> I don't think Hou's explanation earlier is correct.
>>> https://lore.kernel.org/bpf/e14d8882-4760-7c9c-0cfc-db04eda494ee@huaweicloud.com/
>> OK. Is the bpf mem allocator part OK for you ?
>>>>  bpf_prog_2c29ac5cdc6b1842+0x43/0x47
>>>>  bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
>>>>  __bpf_prog_run include/linux/filter.h:701 [inline]
>>>>  bpf_prog_run include/linux/filter.h:708 [inline]
>>>>  __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
>>>>  bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
>>>>  trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
>>>>  __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338
>>>>  queue_work_on+0x1c2/0x380 kernel/workqueue.c:2390
>>> here irqs are disabled, but raw_spin_lock in lpm should be fine.
>>>
>>>>  queue_work include/linux/workqueue.h:662 [inline]
>>>>  stats_request+0x1a3/0x230 drivers/virtio/virtio_balloon.c:441
>>>>  vring_interrupt+0x21d/0x380 drivers/virtio/virtio_ring.c:2595
>>>>  vp_vring_interrupt drivers/virtio/virtio_pci_common.c:82 [inline]
>>>>  vp_interrupt+0x192/0x200 drivers/virtio/virtio_pci_common.c:113
>>>>  __handle_irq_event_percpu+0x29a/0xa80 kernel/irq/handle.c:158
>>>>  handle_irq_event_percpu kernel/irq/handle.c:193 [inline]
>>>>  handle_irq_event+0x89/0x1f0 kernel/irq/handle.c:210
>>>>  handle_fasteoi_irq+0x48a/0xae0 kernel/irq/chip.c:720
>>>>  generic_handle_irq_desc include/linux/irqdesc.h:173 [inline]
>>>>  handle_irq arch/x86/kernel/irq.c:247 [inline]
>>>>  call_irq_handler arch/x86/kernel/irq.c:259 [inline]
>>>>  __common_interrupt+0x136/0x230 arch/x86/kernel/irq.c:285
>>>>  common_interrupt+0xb4/0xd0 arch/x86/kernel/irq.c:278
>>>>  </IRQ>
>>>>
>>>> Reported-by: syzbot+b506de56cbbb63148c33@syzkaller.appspotmail.com
>>>> Closes: https://lore.kernel.org/bpf/6723db4a.050a0220.35b515.0168.GAE@google.com/
>>>> Fixes: 66150d0dde03 ("bpf, lpm: Make locking RT friendly")
>>>> Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
>>>> ---
>>>>  kernel/bpf/lpm_trie.c | 12 ++++++------
>>>>  1 file changed, 6 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>>>> index 9b60eda0f727..373cdcfa0505 100644
>>>> --- a/kernel/bpf/lpm_trie.c
>>>> +++ b/kernel/bpf/lpm_trie.c
>>>> @@ -35,7 +35,7 @@ struct lpm_trie {
>>>>         size_t                          n_entries;
>>>>         size_t                          max_prefixlen;
>>>>         size_t                          data_size;
>>>> -       spinlock_t                      lock;
>>>> +       raw_spinlock_t                  lock;
>>>>  };
>>> We're certainly not going back.
>> Only switching from spinlock_t to raw_spinlock_t is not enough, running
>> it under PREEMPT_RT after the change will still trigger a similar
>> lockdep warning. That is because kmalloc() may acquire a spinlock_t as
>> well. However, after changing the kmalloc and its variants to the bpf
>> memory allocator, I think the switch to raw_spinlock_t will be safe. I
>> have already written a draft patch set. Will post it after polishing
>> and testing it. WDYT?
> Switching lpm to bpf_mem_alloc would address the issue.
> Why do you want a switch to raw_spin_lock as well?
> kfree_rcu() is already done outside of the lock.

After switching to raw_spinlock_t, the lpm trie could be used under
interrupt context even under PREEMPT_RT.
> .
Thomas Gleixner Nov. 12, 2024, 3:08 p.m. UTC | #5
On Fri, Nov 08 2024 at 14:32, Kunwu Chan wrote:
> When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible
> and the bpf program has acquired a raw_spinlock under an interrupt handler,
> which results in an invalid lock acquire context.

This explanation is just wrong.

The problem has nothing to do with an interrupt handler. Interrupt
handlers on RT kernels are force threaded.

>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>  _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
>  trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
>  bpf_prog_2c29ac5cdc6b1842+0x43/0x47
>  bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
>  __bpf_prog_run include/linux/filter.h:701 [inline]
>  bpf_prog_run include/linux/filter.h:708 [inline]
>  __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
>  bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
>  trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
>  __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338

The problematic lock nesting is the work queue pool lock, which is a raw
spinlock.

> @@ -330,7 +330,7 @@ static long trie_update_elem(struct bpf_map *map,
>  	if (key->prefixlen > trie->max_prefixlen)
>  		return -EINVAL;
>  
> -	spin_lock_irqsave(&trie->lock, irq_flags);
> +	raw_spin_lock_irqsave(&trie->lock, irq_flags);
>  
>  	/* Allocate and fill a new node */

Making this a raw spinlock moves the problem from the BPF trie code into
the memory allocator. On RT the memory allocator cannot be invoked under
a raw spinlock.

Thanks,

        tglx
Kunwu Chan Nov. 14, 2024, 2:43 a.m. UTC | #6
Thanks all for the reply.

On 2024/11/12 23:08, Thomas Gleixner wrote:
> On Fri, Nov 08 2024 at 14:32, Kunwu Chan wrote:
>> When PREEMPT_RT is enabled, 'spinlock_t' becomes preemptible
>> and the bpf program has acquired a raw_spinlock under an interrupt handler,
>> which results in an invalid lock acquire context.
> This explanation is just wrong.
>
> The problem has nothing to do with an interrupt handler. Interrupt
> handlers on RT kernels are force threaded.
>
>>   __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>>   _raw_spin_lock_irqsave+0xd5/0x120 kernel/locking/spinlock.c:162
>>   trie_delete_elem+0x96/0x6a0 kernel/bpf/lpm_trie.c:462
>>   bpf_prog_2c29ac5cdc6b1842+0x43/0x47
>>   bpf_dispatcher_nop_func include/linux/bpf.h:1290 [inline]
>>   __bpf_prog_run include/linux/filter.h:701 [inline]
>>   bpf_prog_run include/linux/filter.h:708 [inline]
>>   __bpf_trace_run kernel/trace/bpf_trace.c:2340 [inline]
>>   bpf_trace_run1+0x2ca/0x520 kernel/trace/bpf_trace.c:2380
>>   trace_workqueue_activate_work+0x186/0x1f0 include/trace/events/workqueue.h:59
>>   __queue_work+0xc7b/0xf50 kernel/workqueue.c:2338
> The problematic lock nesting is the work queue pool lock, which is a raw
> spinlock.
>
>> @@ -330,7 +330,7 @@ static long trie_update_elem(struct bpf_map *map,
>>   	if (key->prefixlen > trie->max_prefixlen)
>>   		return -EINVAL;
>>   
>> -	spin_lock_irqsave(&trie->lock, irq_flags);
>> +	raw_spin_lock_irqsave(&trie->lock, irq_flags);
>>   
>>   	/* Allocate and fill a new node */
> Making this a raw spinlock moves the problem from the BPF trie code into
> the memory allocator. On RT the memory allocator cannot be invoked under
> a raw spinlock.
I'm a newbie in this field. But actually, when I changed it to a raw 
spinlock, the problem syzbot reported disappeared.
If we shouldn't change it like this, what should we do to deal with the 
problem? If you have any good ideas, please tell me.
> Thanks,
>
>          tglx
>
Sebastian Andrzej Siewior Nov. 14, 2024, 10:09 a.m. UTC | #7
On 2024-11-10 10:08:00 [+0800], Hou Tao wrote:
> >> well. However, after changing the kmalloc and its variants to bpf memory
> >> allocator, I think the switch to raw_spinlock_t will be safe. I have
> >> already written a draft patch set. Will post after after polishing and
> >> testing it. WDYT ?
> > Switching lpm to bpf_mem_alloc would address the issue.
> > Why do you want a switch to raw_spin_lock as well?
> > kfree_rcu() is already done outside of the lock.
> 
> After switching to raw_spinlock_t, the lpm trie could be used under
> interrupt context even under PREEMPT_RT.

I would have to dig why the lock has been moved away from raw_spinlock_t
and why we need it back and what changed since. I have some vague memory
that there was a test case which added plenty of items and cleaning it
up created latency spikes.
Note that interrupts are threaded on PREEMPT_RT. Using it in "interrupt
context" would mean you need this in the primary handler/ hardirq.

Sebastian
Thomas Gleixner Nov. 15, 2024, 10:29 p.m. UTC | #8
On Thu, Nov 14 2024 at 10:43, Kunwu Chan wrote:
> On 2024/11/12 23:08, Thomas Gleixner wrote:
>>> @@ -330,7 +330,7 @@ static long trie_update_elem(struct bpf_map *map,
>>>   	if (key->prefixlen > trie->max_prefixlen)
>>>   		return -EINVAL;
>>>   
>>> -	spin_lock_irqsave(&trie->lock, irq_flags);
>>> +	raw_spin_lock_irqsave(&trie->lock, irq_flags);
>>>   
>>>   	/* Allocate and fill a new node */
>> Making this a raw spinlock moves the problem from the BPF trie code into
>> the memory allocator. On RT the memory allocator cannot be invoked under
>> a raw spinlock.
> I'am newbiee in this field. But actually when i change it to a raw 
> spinlock, the problem syzbot reported dispeared.

Yes, because the actual code path which is going to trigger this is not
reached. But it will be reached at some point.

IIRC, BPF has its own allocator which can be used everywhere.

Thanks,

        tglx
Sebastian Andrzej Siewior Nov. 16, 2024, 9:21 a.m. UTC | #9
On 2024-11-15 23:29:31 [+0100], Thomas Gleixner wrote:
> IIRC, BPF has its own allocator which can be used everywhere.

Thomas Weißschuh made something. It appears to work. Need to take a
closer look.

> Thanks,
> 
>         tglx

Sebastian
Alexei Starovoitov Nov. 16, 2024, 4:01 p.m. UTC | #10
On Sat, Nov 16, 2024 at 1:21 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2024-11-15 23:29:31 [+0100], Thomas Gleixner wrote:
> > > IIRC, BPF has its own allocator which can be used everywhere.
>
> Thomas Weißschuh made something. It appears to work. Need to take a
> closer look.

Any more details?
bpf_mem_alloc is a stop gap.
As Vlastimil Babka suggested long ago:
https://lwn.net/Articles/974138/
"...next on the target list is the special allocator used by the BPF
subsystem. This allocator is intended to succeed in any calling
context, including in non-maskable interrupts (NMIs). BPF maintainer
Alexei Starovoitov is evidently in favor of this removal if SLUB is
able to handle the same use cases..."

Here is the first step:
https://lore.kernel.org/bpf/20241116014854.55141-1-alexei.starovoitov@gmail.com/
Thomas Weißschuh Nov. 16, 2024, 4:15 p.m. UTC | #11
On 2024-11-16 08:01:49-0800, Alexei Starovoitov wrote:
> On Sat, Nov 16, 2024 at 1:21 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> > On 2024-11-15 23:29:31 [+0100], Thomas Gleixner wrote:
> > > IIRC, BPF has its own allocator which can be used everywhere.
> >
> > Thomas Weißschuh made something. It appears to work. Need to take a
> > closer look.
> 
> Any more details?
> bpf_mem_alloc is a stop gap.

It is indeed using bpf_mem_alloc.
It is a fairly straightforward conversion, using one cache for
intermediate and one for non-intermediate nodes.

I'll try to send it early next week.

> As Vlastimil Babka suggested long ago:
> https://lwn.net/Articles/974138/
> "...next on the target list is the special allocator used by the BPF
> subsystem. This allocator is intended to succeed in any calling
> context, including in non-maskable interrupts (NMIs). BPF maintainer
> Alexei Starovoitov is evidently in favor of this removal if SLUB is
> able to handle the same use cases..."
> 
> Here is the first step:
> https://lore.kernel.org/bpf/20241116014854.55141-1-alexei.starovoitov@gmail.com/
Alexei Starovoitov Nov. 16, 2024, 4:42 p.m. UTC | #12
On Sat, Nov 16, 2024 at 8:15 AM Thomas Weißschuh <linux@weissschuh.net> wrote:
>
> On 2024-11-16 08:01:49-0800, Alexei Starovoitov wrote:
> > On Sat, Nov 16, 2024 at 1:21 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> > >
> > > On 2024-11-15 23:29:31 [+0100], Thomas Gleixner wrote:
> > > > IIRC, BPF has its own allocator which can be used everywhere.
> > >
> > > Thomas Weißschuh made something. It appears to work. Need to take a
> > > closer look.
> >
> > Any more details?
> > bpf_mem_alloc is a stop gap.
>
> It is indeed using bpf_mem_alloc.
> It is a fairly straightforward conversion, using one cache for
> intermediate and one for non-intermediate nodes.

Sounds like you're proposing to allocate two lpm specific bpf_ma-s ?
Just use bpf_global_ma.
More ma-s means more memory pinned in bpf specific freelists.
That's the main reason to teach slub and page_alloc about bpf requirements.
All memory should be shared by all subsystems.
Custom memory pools / freelists, whether it's bpf, networking
or whatever else, is a pain point for somebody.
The kernel needs to be optimal for all use cases.

> I'll try to send it early next week.

Looking forward.

> > As Vlastimil Babka suggested long ago:
> > https://lwn.net/Articles/974138/
> > "...next on the target list is the special allocator used by the BPF
> > subsystem. This allocator is intended to succeed in any calling
> > context, including in non-maskable interrupts (NMIs). BPF maintainer
> > Alexei Starovoitov is evidently in favor of this removal if SLUB is
> > able to handle the same use cases..."
> >
> > Here is the first step:
> > https://lore.kernel.org/bpf/20241116014854.55141-1-alexei.starovoitov@gmail.com/
Hou Tao Nov. 17, 2024, 1:24 a.m. UTC | #13
Hi,

On 11/17/2024 12:42 AM, Alexei Starovoitov wrote:
> On Sat, Nov 16, 2024 at 8:15 AM Thomas Weißschuh <linux@weissschuh.net> wrote:
>> On 2024-11-16 08:01:49-0800, Alexei Starovoitov wrote:
>>> On Sat, Nov 16, 2024 at 1:21 AM Sebastian Andrzej Siewior
>>> <bigeasy@linutronix.de> wrote:
>>>> On 2024-11-15 23:29:31 [+0100], Thomas Gleixner wrote:
>>>>> IIRC, BPF has its own allocator which can be used everywhere.
>>>> Thomas Weißschuh made something. It appears to work. Need to take a
>>>> closer look.
>>> Any more details?
>>> bpf_mem_alloc is a stop gap.
>> It is indeed using bpf_mem_alloc.
>> It is a fairly straightforward conversion, using one cache for
>> intermediate and one for non-intermediate nodes.
> Sounds like you're proposing to allocate two lpm specific bpf_ma-s ?
> Just use bpf_global_ma.
> More ma-s means more memory pinned in bpf specific freelists.
> That's the main reason to teach slub and page_alloc about bpf requirements.
> All memory should be shared by all subsystems.
> Custom memory pools / freelists, whether it's bpf, networking
> or whatever else, is a pain point for somebody.
> The kernel needs to be optimal for all use cases.

I have been working on it since last week [1] and have already written a
patch (a patch in a patch set) for it. In my patch, these two allocators
will be merged if they are mergeable, and currently the merge is decided
by the return value of kmalloc_size_roundup(). I also considered using
bpf_global_ma instead, but the biggest problem with bpf_global_ma is the
memory accounting: the allocated memory will be accounted under the root
memory cgroup instead of the memory cgroup of its users.

[1]:
https://lore.kernel.org/bpf/e14d8882-4760-7c9c-0cfc-db04eda494ee@huaweicloud.com/
>
>> I'll try to send it early next week.
> Looking forward.
>
>>> As Vlastimil Babka suggested long ago:
>>> https://lwn.net/Articles/974138/
>>> "...next on the target list is the special allocator used by the BPF
>>> subsystem. This allocator is intended to succeed in any calling
>>> context, including in non-maskable interrupts (NMIs). BPF maintainer
>>> Alexei Starovoitov is evidently in favor of this removal if SLUB is
>>> able to handle the same use cases..."
>>>
>>> Here is the first step:
>>> https://lore.kernel.org/bpf/20241116014854.55141-1-alexei.starovoitov@gmail.com/
>
> .
diff mbox series

Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9b60eda0f727..373cdcfa0505 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -35,7 +35,7 @@  struct lpm_trie {
 	size_t				n_entries;
 	size_t				max_prefixlen;
 	size_t				data_size;
-	spinlock_t			lock;
+	raw_spinlock_t			lock;
 };
 
 /* This trie implements a longest prefix match algorithm that can be used to
@@ -330,7 +330,7 @@  static long trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Allocate and fill a new node */
 
@@ -437,7 +437,7 @@  static long trie_update_elem(struct bpf_map *map,
 		kfree(im_node);
 	}
 
-	spin_unlock_irqrestore(&trie->lock, irq_flags);
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 	kfree_rcu(free_node, rcu);
 
 	return ret;
@@ -459,7 +459,7 @@  static long trie_delete_elem(struct bpf_map *map, void *_key)
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
 	 * track of the path we traverse.  We will need to know the node
@@ -535,7 +535,7 @@  static long trie_delete_elem(struct bpf_map *map, void *_key)
 	free_node = node;
 
 out:
-	spin_unlock_irqrestore(&trie->lock, irq_flags);
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 	kfree_rcu(free_parent, rcu);
 	kfree_rcu(free_node, rcu);
 
@@ -581,7 +581,7 @@  static struct bpf_map *trie_alloc(union bpf_attr *attr)
 			  offsetof(struct bpf_lpm_trie_key_u8, data);
 	trie->max_prefixlen = trie->data_size * 8;
 
-	spin_lock_init(&trie->lock);
+	raw_spin_lock_init(&trie->lock);
 
 	return &trie->map;
 }