diff mbox series

bpf, sockmap: defer sk_psock_free_link() using RCU

Message ID 838e7959-a360-4ac1-b36a-a3469236129b@I-love.SAKURA.ne.jp (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf, sockmap: defer sk_psock_free_link() using RCU | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 967 this patch: 967
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 936 this patch: 936
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 978 this patch: 978
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 45 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-05-12--09-00 (tests: 1016)
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18

Commit Message

Tetsuo Handa May 12, 2024, 7:21 a.m. UTC
If a BPF program is attached to kfree() event, calling kfree()
with psock->link_lock held triggers lockdep warning.

Defer kfree() using RCU so that the attached BPF program runs
without holding psock->link_lock.

Reported-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=ec941d6e24f633a59172
Tested-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com
Reported-by: syzbot+a4ed4041b9bea8177ac3@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=a4ed4041b9bea8177ac3
Tested-by: syzbot+a4ed4041b9bea8177ac3@syzkaller.appspotmail.com
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/skmsg.h | 7 +++++--
 net/core/skmsg.c      | 2 ++
 net/core/sock_map.c   | 2 ++
 3 files changed, 9 insertions(+), 2 deletions(-)

Comments

Alexei Starovoitov May 21, 2024, 3:38 p.m. UTC | #1
On Sun, May 12, 2024 at 12:22 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> If a BPF program is attached to kfree() event, calling kfree()
> with psock->link_lock held triggers lockdep warning.
>
> Defer kfree() using RCU so that the attached BPF program runs
> without holding psock->link_lock.
>
> Reported-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=ec941d6e24f633a59172
> Tested-by: syzbot+ec941d6e24f633a59172@syzkaller.appspotmail.com
> Reported-by: syzbot+a4ed4041b9bea8177ac3@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=a4ed4041b9bea8177ac3
> Tested-by: syzbot+a4ed4041b9bea8177ac3@syzkaller.appspotmail.com
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  include/linux/skmsg.h | 7 +++++--
>  net/core/skmsg.c      | 2 ++
>  net/core/sock_map.c   | 2 ++
>  3 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index a509caf823d6..66590f20b777 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -66,7 +66,10 @@ enum sk_psock_state_bits {
>  };
>
>  struct sk_psock_link {
> -       struct list_head                list;
> +       union {
> +               struct list_head        list;
> +               struct rcu_head         rcu;
> +       };
>         struct bpf_map                  *map;
>         void                            *link_raw;
>  };
> @@ -418,7 +421,7 @@ static inline struct sk_psock_link *sk_psock_init_link(void)
>
>  static inline void sk_psock_free_link(struct sk_psock_link *link)
>  {
> -       kfree(link);
> +       kfree_rcu(link, rcu);
>  }
>
>  struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index fd20aae30be2..9cebfeecd3c9 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -791,10 +791,12 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
>  {
>         struct sk_psock_link *link, *tmp;
>
> +       rcu_read_lock();
>         list_for_each_entry_safe(link, tmp, &psock->link, list) {
>                 list_del(&link->list);
>                 sk_psock_free_link(link);
>         }
> +       rcu_read_unlock();
>  }
>
>  void sk_psock_stop(struct sk_psock *psock)
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 8598466a3805..8bec4b7a8ec7 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>         bool strp_stop = false, verdict_stop = false;
>         struct sk_psock_link *link, *tmp;
>
> +       rcu_read_lock();
>         spin_lock_bh(&psock->link_lock);

I think this is incorrect.
spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.

pw-bot: cr

>         list_for_each_entry_safe(link, tmp, &psock->link, list) {
>                 if (link->link_raw == link_raw) {
> @@ -159,6 +160,7 @@ static void sock_map_del_link(struct sock *sk,
>                 }
>         }
>         spin_unlock_bh(&psock->link_lock);
> +       rcu_read_unlock();
>         if (strp_stop || verdict_stop) {
>                 write_lock_bh(&sk->sk_callback_lock);
>                 if (strp_stop)
> --
> 2.34.1
>
Hillf Danton May 21, 2024, 10:59 p.m. UTC | #2
On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> > --- a/net/core/sock_map.c
> > +++ b/net/core/sock_map.c
> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
> >         bool strp_stop =3D false, verdict_stop =3D false;
> >         struct sk_psock_link *link, *tmp;
> >
> > +       rcu_read_lock();
> >         spin_lock_bh(&psock->link_lock);
> 
> I think this is incorrect.
> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.

Could you specify why it won't be safe in rcu cs if you are right?
What does rcu look like in RT if not nothing?
> 
> pw-bot: cr
Jakub Sitnicki May 22, 2024, 9:50 a.m. UTC | #3
On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
> On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
>> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>> > --- a/net/core/sock_map.c
>> > +++ b/net/core/sock_map.c
>> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>> >         bool strp_stop =3D false, verdict_stop =3D false;
>> >         struct sk_psock_link *link, *tmp;
>> >
>> > +       rcu_read_lock();
>> >         spin_lock_bh(&psock->link_lock);
>> 
>> I think this is incorrect.
>> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
>
> Could you specify why it won't be safe in rcu cs if you are right?
> What does rcu look like in RT if not nothing?

RCU readers can't block, while spinlock RT doesn't disable preemption.

https://docs.kernel.org/RCU/rcu.html
https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt

I've finally gotten around to testing proposed fix that just disallows
map_delete_elem on sockmap/sockhash from BPF tracing progs
completely. This should put an end to this saga of syzkaller reports.

https://lore.kernel.org/all/87jzjnxaqf.fsf@cloudflare.com/
Tetsuo Handa May 22, 2024, 10:30 a.m. UTC | #4
On 2024/05/22 18:50, Jakub Sitnicki wrote:
> On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
>> On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
>>> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>>>> --- a/net/core/sock_map.c
>>>> +++ b/net/core/sock_map.c
>>>> @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>>>>         bool strp_stop =3D false, verdict_stop =3D false;
>>>>         struct sk_psock_link *link, *tmp;
>>>>
>>>> +       rcu_read_lock();
>>>>         spin_lock_bh(&psock->link_lock);
>>>
>>> I think this is incorrect.
>>> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
>>
>> Could you specify why it won't be safe in rcu cs if you are right?
>> What does rcu look like in RT if not nothing?
> 
> RCU readers can't block, while spinlock RT doesn't disable preemption.
> 
> https://docs.kernel.org/RCU/rcu.html
> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
> 

I didn't catch what you mean.

https://elixir.bootlin.com/linux/latest/source/include/linux/spinlock_rt.h#L43 defines spin_lock() for RT as

static __always_inline void spin_lock(spinlock_t *lock)
{
	rt_spin_lock(lock);
}

and https://elixir.bootlin.com/linux/v6.9/source/include/linux/spinlock_rt.h#L85 defines spin_lock_bh() for RT as

static __always_inline void spin_lock_bh(spinlock_t *lock)
{
	/* Investigate: Drop bh when blocking ? */
	local_bh_disable();
	rt_spin_lock(lock);
}

and https://elixir.bootlin.com/linux/latest/source/kernel/locking/spinlock_rt.c#L54 defines rt_spin_lock() for RT as

void __sched rt_spin_lock(spinlock_t *lock)
{
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	__rt_spin_lock(lock);
}

and https://elixir.bootlin.com/linux/v6.9/source/kernel/locking/spinlock_rt.c#L46 defines __rt_spin_lock() for RT as

static __always_inline void __rt_spin_lock(spinlock_t *lock)
{
	rtlock_might_resched();
	rtlock_lock(&lock->lock);
	rcu_read_lock();
	migrate_disable();
}

. You can see that calling spin_lock() or spin_lock_bh() automatically starts RCU critical section, can't you?

If spin_lock_bh() for RT might sleep and calling spin_lock_bh() under RCU critical section is not safe,
how can

  spin_lock(&lock1);
  spin_lock(&lock2);
  // do something
  spin_unlock(&lock2);
  spin_unlock(&lock1);

or

  spin_lock_bh(&lock1);
  spin_lock(&lock2);
  // do something
  spin_unlock(&lock2);
  spin_unlock_bh(&lock1);

be possible?

Unless rcu_read_lock() is implemented in a way that is safe to do

  rcu_read_lock();
  spin_lock(&lock2);
  // do something
  spin_unlock(&lock2);
  rcu_read_unlock();

and

  rcu_read_lock();
  spin_lock_bh(&lock2);
  // do something
  spin_unlock_bh(&lock2);
  rcu_read_unlock();

, I think RT kernels can't run safely.

Locking primitive ordering is too much complicated/distributed.
We need documentation using safe/unsafe ordering examples.
Jakub Sitnicki May 22, 2024, 11:08 a.m. UTC | #5
On Wed, May 22, 2024 at 07:30 PM +09, Tetsuo Handa wrote:
> On 2024/05/22 18:50, Jakub Sitnicki wrote:
>> On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
>>> On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
>>>> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>>>>> --- a/net/core/sock_map.c
>>>>> +++ b/net/core/sock_map.c
>>>>> @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>>>>>         bool strp_stop =3D false, verdict_stop =3D false;
>>>>>         struct sk_psock_link *link, *tmp;
>>>>>
>>>>> +       rcu_read_lock();
>>>>>         spin_lock_bh(&psock->link_lock);
>>>>
>>>> I think this is incorrect.
>>>> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
>>>
>>> Could you specify why it won't be safe in rcu cs if you are right?
>>> What does rcu look like in RT if not nothing?
>> 
>> RCU readers can't block, while spinlock RT doesn't disable preemption.
>> 
>> https://docs.kernel.org/RCU/rcu.html
>> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
>> 
>
> I didn't catch what you mean.
>
> https://elixir.bootlin.com/linux/latest/source/include/linux/spinlock_rt.h#L43 defines spin_lock() for RT as
>
> static __always_inline void spin_lock(spinlock_t *lock)
> {
> 	rt_spin_lock(lock);
> }
>
> and https://elixir.bootlin.com/linux/v6.9/source/include/linux/spinlock_rt.h#L85 defines spin_lock_bh() for RT as
>
> static __always_inline void spin_lock_bh(spinlock_t *lock)
> {
> 	/* Investigate: Drop bh when blocking ? */
> 	local_bh_disable();
> 	rt_spin_lock(lock);
> }
>
> and https://elixir.bootlin.com/linux/latest/source/kernel/locking/spinlock_rt.c#L54 defines rt_spin_lock() for RT as
>
> void __sched rt_spin_lock(spinlock_t *lock)
> {
> 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> 	__rt_spin_lock(lock);
> }
>
> and https://elixir.bootlin.com/linux/v6.9/source/kernel/locking/spinlock_rt.c#L46 defines __rt_spin_lock() for RT as
>
> static __always_inline void __rt_spin_lock(spinlock_t *lock)
> {
> 	rtlock_might_resched();
> 	rtlock_lock(&lock->lock);
> 	rcu_read_lock();
> 	migrate_disable();
> }
>
> . You can see that calling spin_lock() or spin_lock_bh() automatically starts RCU critical section, can't you?
>
> If spin_lock_bh() for RT might sleep and calling spin_lock_bh() under RCU critical section is not safe,
> how can
>
>   spin_lock(&lock1);
>   spin_lock(&lock2);
>   // do something
>   spin_unlock(&lock2);
>   spin_unlock(&lock1);
>
> or
>
>   spin_lock_bh(&lock1);
>   spin_lock(&lock2);
>   // do something
>   spin_unlock(&lock2);
>   spin_unlock_bh(&lock1);
>
> be possible?
>
> Unless rcu_read_lock() is implemented in a way that is safe to do
>
>   rcu_read_lock();
>   spin_lock(&lock2);
>   // do something
>   spin_unlock(&lock2);
>   rcu_read_unlock();
>
> and
>
>   rcu_read_lock();
>   spin_lock_bh(&lock2);
>   // do something
>   spin_unlock_bh(&lock2);
>   rcu_read_unlock();
>
> , I think RT kernels can't run safely.
>
> Locking primitive ordering is too much complicated/distributed.
> We need documentation using safe/unsafe ordering examples.

You're right. My answer was too hasty. Docs say that RT kernels can
preempt RCU read-side critical sections:

https://docs.kernel.org/RCU/whatisRCU.html?highlight=rcu_read_lock#rcu-read-lock
Hillf Danton May 22, 2024, 11:33 a.m. UTC | #6
On Wed, 22 May 2024 11:50:49 +0200 Jakub Sitnicki <jakub@cloudflare.com>
On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
> > On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
> >> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> >> > --- a/net/core/sock_map.c
> >> > +++ b/net/core/sock_map.c
> >> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
> >> >         bool strp_stop =3D false, verdict_stop =3D false;
> >> >         struct sk_psock_link *link, *tmp;
> >> >
> >> > +       rcu_read_lock();
> >> >         spin_lock_bh(&psock->link_lock);
> >> 
> >> I think this is incorrect.
> >> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
> >
> > Could you specify why it won't be safe in rcu cs if you are right?
> > What does rcu look like in RT if not nothing?
> 
> RCU readers can't block, while spinlock RT doesn't disable preemption.
> 
> https://docs.kernel.org/RCU/rcu.html
> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
> 
> I've finally gotten around to testing proposed fix that just disallows
> map_delete_elem on sockmap/sockhash from BPF tracing progs
> completely. This should put an end to this saga of syzkaller reports.
> 
> https://lore.kernel.org/all/87jzjnxaqf.fsf@cloudflare.com/
> 
The locking info syzbot reported [2] suggests a known issue that like Alexei
you hit the send button earlier than expected.

4 locks held by syz-executor361/5090:
 #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
 #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
 #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: map_delete_elem+0x388/0x5e0 kernel/bpf/syscall.c:1695
 #1: ffff88807b2af8f8 (&htab->buckets[i].lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline]
 #1: ffff88807b2af8f8 (&htab->buckets[i].lock){+...}-{2:2}, at: sock_hash_delete_elem+0x17c/0x400 net/core/sock_map.c:945
 #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline]
 #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: sock_map_del_link net/core/sock_map.c:145 [inline]
 #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: sock_map_unref+0xcc/0x5e0 net/core/sock_map.c:180
 #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
 #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
 #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2380 [inline]
 #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run2+0x114/0x420 kernel/trace/bpf_trace.c:2420

[2] https://lore.kernel.org/all/000000000000d0b87206170dd88f@google.com/


If CONFIG_PREEMPT_RCU=y rcu_read_lock() does not disable
preemption. This is even true for !RT kernels with CONFIG_PREEMPT=y

[3] Subject: Re: [patch 30/63] locking/spinlock: Provide RT variant
https://lore.kernel.org/all/874kc6rizr.ffs@tglx/
Jakub Sitnicki May 22, 2024, 12:12 p.m. UTC | #7
On Wed, May 22, 2024 at 07:33 PM +08, Hillf Danton wrote:
> On Wed, 22 May 2024 11:50:49 +0200 Jakub Sitnicki <jakub@cloudflare.com>
> On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
>> > On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
>> >> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>> >> > --- a/net/core/sock_map.c
>> >> > +++ b/net/core/sock_map.c
>> >> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>> >> >         bool strp_stop =3D false, verdict_stop =3D false;
>> >> >         struct sk_psock_link *link, *tmp;
>> >> >
>> >> > +       rcu_read_lock();
>> >> >         spin_lock_bh(&psock->link_lock);
>> >> 
>> >> I think this is incorrect.
>> >> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
>> >
>> > Could you specify why it won't be safe in rcu cs if you are right?
>> > What does rcu look like in RT if not nothing?
>> 
>> RCU readers can't block, while spinlock RT doesn't disable preemption.
>> 
>> https://docs.kernel.org/RCU/rcu.html
>> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
>> 
>> I've finally gotten around to testing proposed fix that just disallows
>> map_delete_elem on sockmap/sockhash from BPF tracing progs
>> completely. This should put an end to this saga of syzkaller reports.
>> 
>> https://lore.kernel.org/all/87jzjnxaqf.fsf@cloudflare.com/
>> 
> The locking info syzbot reported [2] suggests a known issue that like Alexei
> you hit the send button earlier than expected.
>
> 4 locks held by syz-executor361/5090:
>  #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
>  #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
>  #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: map_delete_elem+0x388/0x5e0 kernel/bpf/syscall.c:1695
>  #1: ffff88807b2af8f8 (&htab->buckets[i].lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline]
>  #1: ffff88807b2af8f8 (&htab->buckets[i].lock){+...}-{2:2}, at: sock_hash_delete_elem+0x17c/0x400 net/core/sock_map.c:945
>  #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline]
>  #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: sock_map_del_link net/core/sock_map.c:145 [inline]
>  #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: sock_map_unref+0xcc/0x5e0 net/core/sock_map.c:180
>  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
>  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
>  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2380 [inline]
>  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run2+0x114/0x420 kernel/trace/bpf_trace.c:2420
>
> [2] https://lore.kernel.org/all/000000000000d0b87206170dd88f@google.com/
>
>
> If CONFIG_PREEMPT_RCU=y rcu_read_lock() does not disable
> preemption. This is even true for !RT kernels with CONFIG_PREEMPT=y
>
> [3] Subject: Re: [patch 30/63] locking/spinlock: Provide RT variant
> https://lore.kernel.org/all/874kc6rizr.ffs@tglx/

That locking issue is related to my earlier, as it turned out -
incomplete, fix:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ff91059932401894e6c86341915615c5eb0eca48

We don't expect map_delete_elem to be called from map_update_elem for
sockmap/sockhash, but that is what syzkaller started doing by attaching
BPF tracing progs which call map_delete_elem.
Alexei Starovoitov May 22, 2024, 2:57 p.m. UTC | #8
On Wed, May 22, 2024 at 5:12 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Wed, May 22, 2024 at 07:33 PM +08, Hillf Danton wrote:
> > On Wed, 22 May 2024 11:50:49 +0200 Jakub Sitnicki <jakub@cloudflare.com>
> > On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
> >> > On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
> >> >> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
> >> >> > --- a/net/core/sock_map.c
> >> >> > +++ b/net/core/sock_map.c
> >> >> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
> >> >> >         bool strp_stop =3D false, verdict_stop =3D false;
> >> >> >         struct sk_psock_link *link, *tmp;
> >> >> >
> >> >> > +       rcu_read_lock();
> >> >> >         spin_lock_bh(&psock->link_lock);
> >> >>
> >> >> I think this is incorrect.
> >> >> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
> >> >
> >> > Could you specify why it won't be safe in rcu cs if you are right?
> >> > What does rcu look like in RT if not nothing?
> >>
> >> RCU readers can't block, while spinlock RT doesn't disable preemption.
> >>
> >> https://docs.kernel.org/RCU/rcu.html
> >> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
> >>
> >> I've finally gotten around to testing proposed fix that just disallows
> >> map_delete_elem on sockmap/sockhash from BPF tracing progs
> >> completely. This should put an end to this saga of syzkaller reports.
> >>
> >> https://lore.kernel.org/all/87jzjnxaqf.fsf@cloudflare.com/

Agree. Let's do that. According to John the delete path is not something
that is used in production. It's only a source of trouble with syzbot.


> >>
> > The locking info syzbot reported [2] suggests a known issue that like Alexei
> > you hit the send button earlier than expected.
> >
> > 4 locks held by syz-executor361/5090:
> >  #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
> >  #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
> >  #0: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: map_delete_elem+0x388/0x5e0 kernel/bpf/syscall.c:1695
> >  #1: ffff88807b2af8f8 (&htab->buckets[i].lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline]
> >  #1: ffff88807b2af8f8 (&htab->buckets[i].lock){+...}-{2:2}, at: sock_hash_delete_elem+0x17c/0x400 net/core/sock_map.c:945
> >  #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: spin_lock_bh include/linux/spinlock.h:356 [inline]
> >  #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: sock_map_del_link net/core/sock_map.c:145 [inline]
> >  #2: ffff88801c2a4290 (&psock->link_lock){+...}-{2:2}, at: sock_map_unref+0xcc/0x5e0 net/core/sock_map.c:180
> >  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:329 [inline]
> >  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:781 [inline]
> >  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2380 [inline]
> >  #3: ffffffff8e334d20 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run2+0x114/0x420 kernel/trace/bpf_trace.c:2420
> >
> > [2] https://lore.kernel.org/all/000000000000d0b87206170dd88f@google.com/
> >
> >
> > If CONFIG_PREEMPT_RCU=y rcu_read_lock() does not disable
> > preemption. This is even true for !RT kernels with CONFIG_PREEMPT=y
> >
> > [3] Subject: Re: [patch 30/63] locking/spinlock: Provide RT variant
> > https://lore.kernel.org/all/874kc6rizr.ffs@tglx/
>
> That locking issue is related to my earlier, as it turned out -
> incomplete, fix:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ff91059932401894e6c86341915615c5eb0eca48
>
> We don't expect map_delete_elem to be called from map_update_elem for
> sockmap/sockhash, but that is what syzkaller started doing by attaching
> BPF tracing progs which call map_delete_elem.
Jakub Sitnicki May 24, 2024, 1:06 p.m. UTC | #9
On Wed, May 22, 2024 at 07:57 AM -07, Alexei Starovoitov wrote:
> On Wed, May 22, 2024 at 5:12 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>>
>> On Wed, May 22, 2024 at 07:33 PM +08, Hillf Danton wrote:
>> > On Wed, 22 May 2024 11:50:49 +0200 Jakub Sitnicki <jakub@cloudflare.com>
>> > On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
>> >> > On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
>> >> >> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>> >> >> > --- a/net/core/sock_map.c
>> >> >> > +++ b/net/core/sock_map.c
>> >> >> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>> >> >> >         bool strp_stop =3D false, verdict_stop =3D false;
>> >> >> >         struct sk_psock_link *link, *tmp;
>> >> >> >
>> >> >> > +       rcu_read_lock();
>> >> >> >         spin_lock_bh(&psock->link_lock);
>> >> >>
>> >> >> I think this is incorrect.
>> >> >> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
>> >> >
>> >> > Could you specify why it won't be safe in rcu cs if you are right?
>> >> > What does rcu look like in RT if not nothing?
>> >>
>> >> RCU readers can't block, while spinlock RT doesn't disable preemption.
>> >>
>> >> https://docs.kernel.org/RCU/rcu.html
>> >> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
>> >>
>> >> I've finally gotten around to testing proposed fix that just disallows
>> >> map_delete_elem on sockmap/sockhash from BPF tracing progs
>> >> completely. This should put an end to this saga of syzkaller reports.
>> >>
>> >> https://lore.kernel.org/all/87jzjnxaqf.fsf@cloudflare.com/
>
> Agree. Let's do that. According to John the delete path is not something
> that is used in production. It's only a source of trouble with syzbot.

Cool. The proposed API rule would be that if a BPF program type is
allowed to update a sockmap/sockhash, then it is also allowed to delete
from it.

So I need to tweak my patch to allow deletes from sock_ops progs.
We have a dedicated bpf_sock_map_update() helper there.

[...]
Jakub Sitnicki May 27, 2024, 11:22 a.m. UTC | #10
On Fri, May 24, 2024 at 03:06 PM +02, Jakub Sitnicki wrote:
> On Wed, May 22, 2024 at 07:57 AM -07, Alexei Starovoitov wrote:
>> On Wed, May 22, 2024 at 5:12 AM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>>>
>>> On Wed, May 22, 2024 at 07:33 PM +08, Hillf Danton wrote:
>>> > On Wed, 22 May 2024 11:50:49 +0200 Jakub Sitnicki <jakub@cloudflare.com>
>>> > On Wed, May 22, 2024 at 06:59 AM +08, Hillf Danton wrote:
>>> >> > On Tue, 21 May 2024 08:38:52 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com>
>>> >> >> On Sun, May 12, 2024 at 12:22=E2=80=AFAM Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> wrote:
>>> >> >> > --- a/net/core/sock_map.c
>>> >> >> > +++ b/net/core/sock_map.c
>>> >> >> > @@ -142,6 +142,7 @@ static void sock_map_del_link(struct sock *sk,
>>> >> >> >         bool strp_stop =3D false, verdict_stop =3D false;
>>> >> >> >         struct sk_psock_link *link, *tmp;
>>> >> >> >
>>> >> >> > +       rcu_read_lock();
>>> >> >> >         spin_lock_bh(&psock->link_lock);
>>> >> >>
>>> >> >> I think this is incorrect.
>>> >> >> spin_lock_bh may sleep in RT and it won't be safe to do in rcu cs.
>>> >> >
>>> >> > Could you specify why it won't be safe in rcu cs if you are right?
>>> >> > What does rcu look like in RT if not nothing?
>>> >>
>>> >> RCU readers can't block, while spinlock RT doesn't disable preemption.
>>> >>
>>> >> https://docs.kernel.org/RCU/rcu.html
>>> >> https://docs.kernel.org/locking/locktypes.html#spinlock-t-and-preempt-rt
>>> >>
>>> >> I've finally gotten around to testing proposed fix that just disallows
>>> >> map_delete_elem on sockmap/sockhash from BPF tracing progs
>>> >> completely. This should put an end to this saga of syzkaller reports.
>>> >>
>>> >> https://lore.kernel.org/all/87jzjnxaqf.fsf@cloudflare.com/
>>
>> Agree. Let's do that. According to John the delete path is not something
>> that is used in production. It's only a source of trouble with syzbot.
>
> Cool. The proposed API rule would be that if a BPF program type is
> allowed to update a sockmap/sockhash, then it is also allowed to delete
> from it.
>
> So I need to tweak my patch to allow deletes from sock_ops progs.
> We have a dedicated bpf_sock_map_update() helper there.
>
> [...]

Posted:

https://lore.kernel.org/r/20240527-sockmap-verify-deletes-v1-0-944b372f2101@cloudflare.com
diff mbox series

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index a509caf823d6..66590f20b777 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -66,7 +66,10 @@  enum sk_psock_state_bits {
 };
 
 struct sk_psock_link {
-	struct list_head		list;
+	union {
+		struct list_head	list;
+		struct rcu_head		rcu;
+	};
 	struct bpf_map			*map;
 	void				*link_raw;
 };
@@ -418,7 +421,7 @@  static inline struct sk_psock_link *sk_psock_init_link(void)
 
 static inline void sk_psock_free_link(struct sk_psock_link *link)
 {
-	kfree(link);
+	kfree_rcu(link, rcu);
 }
 
 struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index fd20aae30be2..9cebfeecd3c9 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -791,10 +791,12 @@  static void sk_psock_link_destroy(struct sk_psock *psock)
 {
 	struct sk_psock_link *link, *tmp;
 
+	rcu_read_lock();
 	list_for_each_entry_safe(link, tmp, &psock->link, list) {
 		list_del(&link->list);
 		sk_psock_free_link(link);
 	}
+	rcu_read_unlock();
 }
 
 void sk_psock_stop(struct sk_psock *psock)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 8598466a3805..8bec4b7a8ec7 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -142,6 +142,7 @@  static void sock_map_del_link(struct sock *sk,
 	bool strp_stop = false, verdict_stop = false;
 	struct sk_psock_link *link, *tmp;
 
+	rcu_read_lock();
 	spin_lock_bh(&psock->link_lock);
 	list_for_each_entry_safe(link, tmp, &psock->link, list) {
 		if (link->link_raw == link_raw) {
@@ -159,6 +160,7 @@  static void sock_map_del_link(struct sock *sk,
 		}
 	}
 	spin_unlock_bh(&psock->link_lock);
+	rcu_read_unlock();
 	if (strp_stop || verdict_stop) {
 		write_lock_bh(&sk->sk_callback_lock);
 		if (strp_stop)