diff mbox series

[bpf,1/2] bpf: sockmap, fix introduced strparser recursive lock

Message ID 20240625201632.49024-2-john.fastabend@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Fix reported sockmap splat | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf
netdev/apply fail Patch does not apply to bpf-0

Commit Message

John Fastabend June 25, 2024, 8:16 p.m. UTC
Originally there was a race where removing a psock from the sock map while
it was also receiving an skb and calling sk_psock_data_ready(). It was
possible the removal code would NULL/set the data_ready callback while
concurrently calling the hook from receive path. The fix was to wrap the
access in sk_callback_lock to ensure the saved_data_ready pointer didn't
change under us. There was some discussion around doing a larger change
to ensure we could use READ_ONCE/WRITE_ONCE over the callback, but that
was for *next kernels not stable fixes.

But, we unfortunately introduced a regression with the fix because there
is another path into this code (that didn't have a test case) through
the stream parser. The stream parser runs with the lower lock which means
we get the following splat and lock up.


 ============================================
 WARNING: possible recursive locking detected
 6.10.0-rc2 #59 Not tainted
 --------------------------------------------
 test_sockmap/342 is trying to acquire lock:
 ffff888007a87228 (clock-AF_INET){++--}-{2:2}, at:
 sk_psock_skb_ingress_enqueue (./include/linux/skmsg.h:467
 net/core/skmsg.c:555)

 but task is already holding lock:
 ffff888007a87228 (clock-AF_INET){++--}-{2:2}, at:
 sk_psock_strp_data_ready (net/core/skmsg.c:1120)

To fix ensure we do not grap lock when we reach this code through the
strparser.

Fixes: 6648e613226e1 ("bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue")
Reported-by: Vincent Whitchurch <vincent.whitchurch@datadoghq.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/skmsg.h | 9 +++++++--
 net/core/skmsg.c      | 5 ++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

Comments

Jakub Sitnicki June 29, 2024, 3:34 p.m. UTC | #1
On Tue, Jun 25, 2024 at 01:16 PM -07, John Fastabend wrote:
> Originally there was a race where removing a psock from the sock map while
> it was also receiving an skb and calling sk_psock_data_ready(). It was
> possible the removal code would NULL/set the data_ready callback while
> concurrently calling the hook from receive path. The fix was to wrap the
> access in sk_callback_lock to ensure the saved_data_ready pointer didn't
> change under us. There was some discussion around doing a larger change
> to ensure we could use READ_ONCE/WRITE_ONCE over the callback, but that
> was for *next kernels not stable fixes.
>
> But, we unfortunately introduced a regression with the fix because there
> is another path into this code (that didn't have a test case) through
> the stream parser. The stream parser runs with the lower lock which means
> we get the following splat and lock up.
>
>
>  ============================================
>  WARNING: possible recursive locking detected
>  6.10.0-rc2 #59 Not tainted
>  --------------------------------------------
>  test_sockmap/342 is trying to acquire lock:
>  ffff888007a87228 (clock-AF_INET){++--}-{2:2}, at:
>  sk_psock_skb_ingress_enqueue (./include/linux/skmsg.h:467
>  net/core/skmsg.c:555)
>
>  but task is already holding lock:
>  ffff888007a87228 (clock-AF_INET){++--}-{2:2}, at:
>  sk_psock_strp_data_ready (net/core/skmsg.c:1120)
>
> To fix ensure we do not grap lock when we reach this code through the
> strparser.
>
> Fixes: 6648e613226e1 ("bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue")
> Reported-by: Vincent Whitchurch <vincent.whitchurch@datadoghq.com>
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/linux/skmsg.h | 9 +++++++--
>  net/core/skmsg.c      | 5 ++++-
>  2 files changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index c9efda9df285..3659e9b514d0 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -461,13 +461,18 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
>  		sk_psock_drop(sk, psock);
>  }
>  
> -static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
> +static inline void __sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
>  {
> -	read_lock_bh(&sk->sk_callback_lock);
>  	if (psock->saved_data_ready)
>  		psock->saved_data_ready(sk);
>  	else
>  		sk->sk_data_ready(sk);
> +}
> +
> +static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
> +{
> +	read_lock_bh(&sk->sk_callback_lock);
> +	__sk_psock_data_ready(sk, psock);
>  	read_unlock_bh(&sk->sk_callback_lock);
>  }
>  
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index fd20aae30be2..8429daecbbb6 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -552,7 +552,10 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
>  	msg->skb = skb;
>  
>  	sk_psock_queue_msg(psock, msg);
> -	sk_psock_data_ready(sk, psock);
> +	if (skb_bpf_strparser(skb))
> +		__sk_psock_data_ready(sk, psock);
> +	else
> +		sk_psock_data_ready(sk, psock);
>  	return copied;
>  }

If I follow, this is the call chain that leads to the recursive lock:

sock::sk_data_ready → sk_psock_strp_data_ready
    write_lock_bh(&sk->sk_callback_lock)
    strp_data_ready
      strp_read_sock
        proto_ops::read_sock → tcp_read_sock
          strp_recv
            __strp_recv
              strp_callbacks::rcv_msg → sk_psock_strp_read
                  sk_psock_verdict_apply(verdict=__SK_PASS)
                    sk_psock_skb_ingress_self
                      sk_psock_skb_ingress_enqueue
                        sk_psock_data_ready
                          read_lock_bh(&sk->sk_callback_lock) !!!

What I don't get, though, is why strp_data_ready has to be called with a
_writer_ lock? Maybe that should just be a reader lock, and then it can
be recursive.
John Fastabend July 3, 2024, 1:12 a.m. UTC | #2
Jakub Sitnicki wrote:
> On Tue, Jun 25, 2024 at 01:16 PM -07, John Fastabend wrote:
> > Originally there was a race where removing a psock from the sock map while
> > it was also receiving an skb and calling sk_psock_data_ready(). It was
> > possible the removal code would NULL/set the data_ready callback while
> > concurrently calling the hook from receive path. The fix was to wrap the
> > access in sk_callback_lock to ensure the saved_data_ready pointer didn't
> > change under us. There was some discussion around doing a larger change
> > to ensure we could use READ_ONCE/WRITE_ONCE over the callback, but that
> > was for *next kernels not stable fixes.
> >
> > But, we unfortunately introduced a regression with the fix because there
> > is another path into this code (that didn't have a test case) through
> > the stream parser. The stream parser runs with the lower lock which means
> > we get the following splat and lock up.
> >
> >
> >  ============================================
> >  WARNING: possible recursive locking detected
> >  6.10.0-rc2 #59 Not tainted
> >  --------------------------------------------
> >  test_sockmap/342 is trying to acquire lock:
> >  ffff888007a87228 (clock-AF_INET){++--}-{2:2}, at:
> >  sk_psock_skb_ingress_enqueue (./include/linux/skmsg.h:467
> >  net/core/skmsg.c:555)
> >
> >  but task is already holding lock:
> >  ffff888007a87228 (clock-AF_INET){++--}-{2:2}, at:
> >  sk_psock_strp_data_ready (net/core/skmsg.c:1120)
> >
> > To fix ensure we do not grap lock when we reach this code through the
> > strparser.
> >
> > Fixes: 6648e613226e1 ("bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue")
> > Reported-by: Vincent Whitchurch <vincent.whitchurch@datadoghq.com>
> > Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> > ---
> >  include/linux/skmsg.h | 9 +++++++--
> >  net/core/skmsg.c      | 5 ++++-
> >  2 files changed, 11 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > index c9efda9df285..3659e9b514d0 100644
> > --- a/include/linux/skmsg.h
> > +++ b/include/linux/skmsg.h
> > @@ -461,13 +461,18 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
> >  		sk_psock_drop(sk, psock);
> >  }
> >  
> > -static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
> > +static inline void __sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
> >  {
> > -	read_lock_bh(&sk->sk_callback_lock);
> >  	if (psock->saved_data_ready)
> >  		psock->saved_data_ready(sk);
> >  	else
> >  		sk->sk_data_ready(sk);
> > +}
> > +
> > +static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
> > +{
> > +	read_lock_bh(&sk->sk_callback_lock);
> > +	__sk_psock_data_ready(sk, psock);
> >  	read_unlock_bh(&sk->sk_callback_lock);
> >  }
> >  
> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> > index fd20aae30be2..8429daecbbb6 100644
> > --- a/net/core/skmsg.c
> > +++ b/net/core/skmsg.c
> > @@ -552,7 +552,10 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
> >  	msg->skb = skb;
> >  
> >  	sk_psock_queue_msg(psock, msg);
> > -	sk_psock_data_ready(sk, psock);
> > +	if (skb_bpf_strparser(skb))
> > +		__sk_psock_data_ready(sk, psock);
> > +	else
> > +		sk_psock_data_ready(sk, psock);
> >  	return copied;
> >  }
> 
> If I follow, this is the call chain that leads to the recursive lock:
> 
> sock::sk_data_ready → sk_psock_strp_data_ready
>     write_lock_bh(&sk->sk_callback_lock)
>     strp_data_ready
>       strp_read_sock
>         proto_ops::read_sock → tcp_read_sock
>           strp_recv
>             __strp_recv
>               strp_callbacks::rcv_msg → sk_psock_strp_read
>                   sk_psock_verdict_apply(verdict=__SK_PASS)
>                     sk_psock_skb_ingress_self
>                       sk_psock_skb_ingress_enqueue
>                         sk_psock_data_ready
>                           read_lock_bh(&sk->sk_callback_lock) !!!
> 
> What I don't get, though, is why strp_data_ready has to be called with a
> _writer_ lock? Maybe that should just be a reader lock, and then it can
> be recursive.

Agree read lock should be fine we just want to ensure the strp
is not changing during the callchain there. Let me do that
fix instead.
diff mbox series

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c9efda9df285..3659e9b514d0 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -461,13 +461,18 @@  static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
 		sk_psock_drop(sk, psock);
 }
 
-static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
+static inline void __sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
 {
-	read_lock_bh(&sk->sk_callback_lock);
 	if (psock->saved_data_ready)
 		psock->saved_data_ready(sk);
 	else
 		sk->sk_data_ready(sk);
+}
+
+static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
+{
+	read_lock_bh(&sk->sk_callback_lock);
+	__sk_psock_data_ready(sk, psock);
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index fd20aae30be2..8429daecbbb6 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -552,7 +552,10 @@  static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 	msg->skb = skb;
 
 	sk_psock_queue_msg(psock, msg);
-	sk_psock_data_ready(sk, psock);
+	if (skb_bpf_strparser(skb))
+		__sk_psock_data_ready(sk, psock);
+	else
+		sk_psock_data_ready(sk, psock);
 	return copied;
 }