[v7,bpf-next,07/11] tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK.

Message ID: 20210521182104.18273-8-kuniyu@amazon.co.jp (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: Socket migration for SO_REUSEPORT.

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers warning 6 maintainers not CCed: dsahern@kernel.org yhs@fb.com kpsingh@kernel.org yoshfuji@linux-ipv6.org john.fastabend@gmail.com songliubraving@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/header_inline success

Commit Message

Iwashima, Kuniyuki May 21, 2021, 6:21 p.m. UTC
As with the other migration paths, this patch changes the code to call
reuseport_migrate_sock() and inet_reqsk_clone(); unlike those cases, however,
we do not call inet_reqsk_clone() right after reuseport_migrate_sock().

Currently, in the receive path for a TCP_NEW_SYN_RECV socket, its listener
has three kinds of refcnt:

  (A) for the listener itself
  (B) carried by the request_sock
  (C) sock_hold() in tcp_v[46]_rcv()

While processing the req, (A) may disappear if the listener is close()d.
Also, (B) can disappear via accept(listener) once we put the req into the
accept queue. So, we have to hold another refcnt (C) on the listener to
prevent use-after-free.
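
For reference, the three refcnts roughly map to the following points in the
code (a simplified sketch, not an exact excerpt):

	/* (A) the listener's own reference, taken when the listening socket
	 *     is created and dropped by close(listener).
	 */

	/* (B) taken when the request_sock is allocated: the allocation path
	 *     takes a reference on the listener and stores it in
	 *     req->rsk_listener.  It is dropped when the req is freed, which
	 *     can happen as soon as another thread accept()s the req off the
	 *     accept queue.
	 */

	/* (C) taken explicitly in tcp_v[46]_rcv() while the TCP_NEW_SYN_RECV
	 *     req is being processed, precisely because (A) and (B) can both
	 *     go away in the meantime:
	 */
	sock_hold(sk);	/* sk == req->rsk_listener here */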

For socket migration, we call reuseport_migrate_sock() in tcp_v[46]_rcv() to
select a listener that still has (A) and to take a reference on that new
listener. This refcnt corresponds to (C) and is cleaned up later in
tcp_v[46]_rcv(). Thus, we have to take another refcnt (B) for the newly
cloned request_sock.
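
Put differently, the refcnt accounting on the migration path looks roughly
like this (a simplified sketch of the changes in this patch;
inet_csk_complete_hashdance() is described just below):

	/* tcp_v[46]_rcv(), TCP_NEW_SYN_RECV path (simplified) */
	nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
	if (nsk)
		sk = nsk;	/* returned with one refcnt held: the new listener's (C) */

	/* inet_csk_complete_hashdance(), migration branch (simplified) */
	sock_hold(sk);				/* extra hold that becomes (B) of the cloned req */
	nreq = inet_reqsk_clone(req, sk);	/* nreq->rsk_listener = sk */
	refcount_set(&nreq->rsk_refcnt, 1);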

In inet_csk_complete_hashdance(), we hold the refcnt (B), clone the req, and
try to put the new req into the accept queue. By migrating the req only after
winning the "own_req" race, we can avoid a worst-case scenario like the
following:

  CPU 1 looks up req1
  CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
  CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
  ...

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv4/inet_connection_sock.c | 34 ++++++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c             | 20 +++++++++++++------
 net/ipv4/tcp_minisocks.c        |  4 ++--
 net/ipv6/tcp_ipv6.c             | 14 +++++++++++---
 4 files changed, 58 insertions(+), 14 deletions(-)

Comments

Eric Dumazet June 10, 2021, 8:36 p.m. UTC | #1
On 5/21/21 8:21 PM, Kuniyuki Iwashima wrote:
> [...]
> 
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index c1f068464363..b795198f919a 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -1113,12 +1113,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
>  					 struct request_sock *req, bool own_req)
>  {
>  	if (own_req) {
> -		inet_csk_reqsk_queue_drop(sk, req);
> -		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
> -		if (inet_csk_reqsk_queue_add(sk, req, child))
> +		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
> +		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
> +
> +		if (sk != req->rsk_listener) {
> +			/* another listening sk has been selected,
> +			 * migrate the req to it.
> +			 */
> +			struct request_sock *nreq;
> +
> +			/* hold a refcnt for the nreq->rsk_listener
> +			 * which is assigned in inet_reqsk_clone()
> +			 */
> +			sock_hold(sk);
> +			nreq = inet_reqsk_clone(req, sk);
> +			if (!nreq) {
> +				inet_child_forget(sk, req, child);

Don't you need a sock_put(sk) here ?

> +				goto child_put;
> +			}
> +
> +			refcount_set(&nreq->rsk_refcnt, 1);
> +			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
> +				reqsk_migrate_reset(req);
> +				reqsk_put(req);
> +				return child;
> +			}
> +
> +			reqsk_migrate_reset(nreq);
> +			__reqsk_free(nreq);
> +		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
>  			return child;
> +		}
>
Iwashima, Kuniyuki June 10, 2021, 10:56 p.m. UTC | #2
From:   Eric Dumazet <eric.dumazet@gmail.com>
Date:   Thu, 10 Jun 2021 22:36:27 +0200
> On 5/21/21 8:21 PM, Kuniyuki Iwashima wrote:
> > [...]
> > 
> > +		if (sk != req->rsk_listener) {
> > +			/* another listening sk has been selected,
> > +			 * migrate the req to it.
> > +			 */
> > +			struct request_sock *nreq;
> > +
> > +			/* hold a refcnt for the nreq->rsk_listener
> > +			 * which is assigned in inet_reqsk_clone()
> > +			 */
> > +			sock_hold(sk);
> > +			nreq = inet_reqsk_clone(req, sk);
> > +			if (!nreq) {
> > +				inet_child_forget(sk, req, child);
> 
> Don't you need a sock_put(sk) here ?

If nreq == NULL, inet_reqsk_clone() has already called sock_put() for the
listener internally, so no additional sock_put(sk) is needed there.
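
For reference, the failure path of inet_reqsk_clone() (added earlier in this
series) looks roughly like this:

	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
	if (!nreq) {
		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
		sock_put(sk);
		return NULL;
	}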


Patch

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c1f068464363..b795198f919a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1113,12 +1113,40 @@  struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req, bool own_req)
 {
 	if (own_req) {
-		inet_csk_reqsk_queue_drop(sk, req);
-		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		if (inet_csk_reqsk_queue_add(sk, req, child))
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+		if (sk != req->rsk_listener) {
+			/* another listening sk has been selected,
+			 * migrate the req to it.
+			 */
+			struct request_sock *nreq;
+
+			/* hold a refcnt for the nreq->rsk_listener
+			 * which is assigned in inet_reqsk_clone()
+			 */
+			sock_hold(sk);
+			nreq = inet_reqsk_clone(req, sk);
+			if (!nreq) {
+				inet_child_forget(sk, req, child);
+				goto child_put;
+			}
+
+			refcount_set(&nreq->rsk_refcnt, 1);
+			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+				reqsk_migrate_reset(req);
+				reqsk_put(req);
+				return child;
+			}
+
+			reqsk_migrate_reset(nreq);
+			__reqsk_free(nreq);
+		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
 			return child;
+		}
 	}
 	/* Too bad, another child took ownership of the request, undo. */
+child_put:
 	bh_unlock_sock(child);
 	sock_put(child);
 	return NULL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f5b68a90be9..6cb8e269f1ab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2002,13 +2002,21 @@  int tcp_v4_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			/* We own a reference on the listener, increase it again
+			 * as we might lose it too soon.
+			 */
+			sock_hold(sk);
 		}
-		/* We own a reference on the listener, increase it again
-		 * as we might lose it too soon.
-		 */
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7513ba45553d..f258a4c0da71 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -775,8 +775,8 @@  struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		goto listen_overflow;
 
 	if (own_req && rsk_drop_req(req)) {
-		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		inet_csk_reqsk_queue_drop_and_put(sk, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
 		return child;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4435fa342e7a..4d71464094b3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1664,10 +1664,18 @@  INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			sock_hold(sk);
 		}
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {