Message ID | 20210521182104.18273-8-kuniyu@amazon.co.jp (mailing list archive)
---|---
State | Changes Requested |
Delegated to: | BPF |
Series | Socket migration for SO_REUSEPORT.
Context | Check | Description
---|---|---
netdev/cover_letter | success | Link |
netdev/fixes_present | success | Link |
netdev/patch_count | success | Link |
netdev/tree_selection | success | Clearly marked for bpf-next |
netdev/subject_prefix | success | Link |
netdev/cc_maintainers | warning | 6 maintainers not CCed: dsahern@kernel.org yhs@fb.com kpsingh@kernel.org yoshfuji@linux-ipv6.org john.fastabend@gmail.com songliubraving@fb.com |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Link |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 20 this patch: 20 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success | Link |
netdev/checkpatch | warning | WARNING: line length of 82 exceeds 80 columns; WARNING: line length of 90 exceeds 80 columns
netdev/build_allmodconfig_warn | success | Errors and warnings before: 20 this patch: 20 |
netdev/header_inline | success | Link |
On 5/21/21 8:21 PM, Kuniyuki Iwashima wrote:
> This patch also changes the code to call reuseport_migrate_sock() and
> inet_reqsk_clone(), but unlike the other cases, we do not call
> inet_reqsk_clone() right after reuseport_migrate_sock().
>
> Currently, in the receive path for TCP_NEW_SYN_RECV sockets, the listener
> has three kinds of refcnt:
>
>   (A) for the listener itself
>   (B) carried by the request_sock
>   (C) sock_hold() in tcp_v[46]_rcv()
>
> While processing the req, (A) may disappear by close(listener). Also, (B)
> can disappear by accept(listener) once we put the req into the accept
> queue. So, we have to hold another refcnt (C) for the listener to prevent
> use-after-free.
>
> For socket migration, we call reuseport_migrate_sock() to select a listener
> with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv().
> This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv().
> Thus we have to take another refcnt (B) for the newly cloned request_sock.
>
> In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and
> try to put the new req into the accept queue. By migrating the req only
> after winning the "own_req" race, we avoid the worst case:
>
>   CPU 1 looks up req1
>   CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
>   CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
>   ...
>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
> Acked-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  net/ipv4/inet_connection_sock.c | 34 ++++++++++++++++++++++++++++++---
>  net/ipv4/tcp_ipv4.c             | 20 +++++++++++++------
>  net/ipv4/tcp_minisocks.c        |  4 ++--
>  net/ipv6/tcp_ipv6.c             | 14 +++++++++++---
>  4 files changed, 58 insertions(+), 14 deletions(-)
>
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index c1f068464363..b795198f919a 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -1113,12 +1113,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
>  					 struct request_sock *req, bool own_req)
>  {
>  	if (own_req) {
> -		inet_csk_reqsk_queue_drop(sk, req);
> -		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
> -		if (inet_csk_reqsk_queue_add(sk, req, child))
> +		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
> +		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
> +
> +		if (sk != req->rsk_listener) {
> +			/* another listening sk has been selected,
> +			 * migrate the req to it.
> +			 */
> +			struct request_sock *nreq;
> +
> +			/* hold a refcnt for the nreq->rsk_listener
> +			 * which is assigned in inet_reqsk_clone()
> +			 */
> +			sock_hold(sk);
> +			nreq = inet_reqsk_clone(req, sk);
> +			if (!nreq) {
> +				inet_child_forget(sk, req, child);

Don't you need a sock_put(sk) here ?

> +				goto child_put;
> +			}
> +
> +			refcount_set(&nreq->rsk_refcnt, 1);
> +			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
> +				reqsk_migrate_reset(req);
> +				reqsk_put(req);
> +				return child;
> +			}
> +
> +			reqsk_migrate_reset(nreq);
> +			__reqsk_free(nreq);
> +		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
>  			return child;
> +		}
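The question above turns on where the reference taken by sock_hold(sk) goes when the clone fails. Below is a minimal sketch of the ownership convention at play, assuming (as the reply that follows confirms) that inet_reqsk_clone() consumes the listener reference on failure; the body is illustrative only, not the series' exact code:

```c
/* Illustrative only: the real inet_reqsk_clone() introduced earlier in
 * this series differs in detail. The key point is the convention: on
 * allocation failure the helper itself drops the listener refcnt, so
 * the caller must not call sock_put(sk) again on the !nreq path.
 */
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
					     struct sock *sk)
{
	struct request_sock *nreq;

	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
	if (!nreq) {
		sock_put(sk);	/* release the caller's sock_hold(sk) */
		return NULL;
	}

	/* ... copy *req into *nreq and set nreq->rsk_listener = sk ... */
	return nreq;
}
```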
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 10 Jun 2021 22:36:27 +0200
> On 5/21/21 8:21 PM, Kuniyuki Iwashima wrote:
> > This patch also changes the code to call reuseport_migrate_sock() and
> > inet_reqsk_clone(), but unlike the other cases, we do not call
> > inet_reqsk_clone() right after reuseport_migrate_sock().
> >
> > Currently, in the receive path for TCP_NEW_SYN_RECV sockets, the listener
> > has three kinds of refcnt:
> >
> >   (A) for the listener itself
> >   (B) carried by the request_sock
> >   (C) sock_hold() in tcp_v[46]_rcv()
> >
> > While processing the req, (A) may disappear by close(listener). Also, (B)
> > can disappear by accept(listener) once we put the req into the accept
> > queue. So, we have to hold another refcnt (C) for the listener to prevent
> > use-after-free.
> >
> > For socket migration, we call reuseport_migrate_sock() to select a listener
> > with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv().
> > This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv().
> > Thus we have to take another refcnt (B) for the newly cloned request_sock.
> >
> > In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and
> > try to put the new req into the accept queue. By migrating the req only
> > after winning the "own_req" race, we avoid the worst case:
> >
> >   CPU 1 looks up req1
> >   CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
> >   CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
> >   ...
> >
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
> > Acked-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  net/ipv4/inet_connection_sock.c | 34 ++++++++++++++++++++++++++++++---
> >  net/ipv4/tcp_ipv4.c             | 20 +++++++++++++------
> >  net/ipv4/tcp_minisocks.c        |  4 ++--
> >  net/ipv6/tcp_ipv6.c             | 14 +++++++++++---
> >  4 files changed, 58 insertions(+), 14 deletions(-)
> >
> > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> > index c1f068464363..b795198f919a 100644
> > --- a/net/ipv4/inet_connection_sock.c
> > +++ b/net/ipv4/inet_connection_sock.c
> > @@ -1113,12 +1113,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
> >  					 struct request_sock *req, bool own_req)
> >  {
> >  	if (own_req) {
> > -		inet_csk_reqsk_queue_drop(sk, req);
> > -		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
> > -		if (inet_csk_reqsk_queue_add(sk, req, child))
> > +		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
> > +		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
> > +
> > +		if (sk != req->rsk_listener) {
> > +			/* another listening sk has been selected,
> > +			 * migrate the req to it.
> > +			 */
> > +			struct request_sock *nreq;
> > +
> > +			/* hold a refcnt for the nreq->rsk_listener
> > +			 * which is assigned in inet_reqsk_clone()
> > +			 */
> > +			sock_hold(sk);
> > +			nreq = inet_reqsk_clone(req, sk);
> > +			if (!nreq) {
> > +				inet_child_forget(sk, req, child);
>
> Don't you need a sock_put(sk) here ?

Yes. If nreq == NULL, inet_reqsk_clone() calls sock_put() for the passed sk,
so no extra sock_put() is needed on this path.

> > +				goto child_put;
> > +			}
> > +
> > +			refcount_set(&nreq->rsk_refcnt, 1);
> > +			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
> > +				reqsk_migrate_reset(req);
> > +				reqsk_put(req);
> > +				return child;
> > +			}
> > +
> > +			reqsk_migrate_reset(nreq);
> > +			__reqsk_free(nreq);
> > +		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
> >  			return child;
> > +		}
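Stepping back from the refcounting: the migration discussed here only happens between members of one SO_REUSEPORT group, and the series gates it behind a runtime knob (net.ipv4.tcp_migrate_req, added earlier in the series). A self-contained userspace sketch of such a group follows; the wildcard address, port 8080, and backlog are arbitrary choices for illustration. With migration enabled, closing listener 'a' lets its unaccepted requests be moved to 'b' instead of being reset:

```c
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create one member of an SO_REUSEPORT group listening on *:8080. */
static int reuseport_listener(void)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;

	/* SO_REUSEPORT must be set before bind() on every member. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)))
		goto err;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)))
		goto err;
	if (listen(fd, 128))
		goto err;

	return fd;
err:
	close(fd);
	return -1;
}

int main(void)
{
	int a = reuseport_listener();
	int b = reuseport_listener();	/* joins the same group as 'a' */

	if (a < 0 || b < 0)
		return 1;

	/* Without this series, requests sitting in a's SYN queue or
	 * accept queue are reset here; with it, they can be migrated
	 * to 'b'.
	 */
	close(a);

	pause();	/* keep 'b' serving */
	return 0;
}
```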
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c1f068464363..b795198f919a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1113,12 +1113,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req, bool own_req)
 {
 	if (own_req) {
-		inet_csk_reqsk_queue_drop(sk, req);
-		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		if (inet_csk_reqsk_queue_add(sk, req, child))
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+		if (sk != req->rsk_listener) {
+			/* another listening sk has been selected,
+			 * migrate the req to it.
+			 */
+			struct request_sock *nreq;
+
+			/* hold a refcnt for the nreq->rsk_listener
+			 * which is assigned in inet_reqsk_clone()
+			 */
+			sock_hold(sk);
+			nreq = inet_reqsk_clone(req, sk);
+			if (!nreq) {
+				inet_child_forget(sk, req, child);
+				goto child_put;
+			}
+
+			refcount_set(&nreq->rsk_refcnt, 1);
+			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+				reqsk_migrate_reset(req);
+				reqsk_put(req);
+				return child;
+			}
+
+			reqsk_migrate_reset(nreq);
+			__reqsk_free(nreq);
+		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
 			return child;
+		}
 	}
 
 	/* Too bad, another child took ownership of the request, undo. */
+child_put:
 	bh_unlock_sock(child);
 	sock_put(child);
 	return NULL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f5b68a90be9..6cb8e269f1ab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2002,13 +2002,21 @@ int tcp_v4_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			/* We own a reference on the listener, increase it again
+			 * as we might lose it too soon.
+			 */
+			sock_hold(sk);
 		}
-		/* We own a reference on the listener, increase it again
-		 * as we might lose it too soon.
-		 */
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7513ba45553d..f258a4c0da71 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		goto listen_overflow;
 
 	if (own_req && rsk_drop_req(req)) {
-		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-		inet_csk_reqsk_queue_drop_and_put(sk, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
 		return child;
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4435fa342e7a..4d71464094b3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1664,10 +1664,18 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 			goto csum_error;
 		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			sock_hold(sk);
 		}
-		sock_hold(sk);
 		refcounted = true;
 		nsk = NULL;
 		if (!tcp_filter(sk, skb)) {
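Beyond the kernel's own target selection, the series (hence the BPF delegation above) lets a BPF sk_reuseport program pick, or veto, the migration target. Below is a hedged sketch of such a program, assuming the interface the series proposes: the BPF_SK_REUSEPORT_SELECT_OR_MIGRATE attach type and the sk_reuseport_md->migrating_sk field. Section name and details may differ in the final revision.

```c
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} migrate_map SEC(".maps");

/* Loaded with expected_attach_type BPF_SK_REUSEPORT_SELECT_OR_MIGRATE. */
SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *md)
{
	__u32 zero = 0;

	/* migrating_sk is only set when the program is invoked to migrate
	 * a request away from a closed or closing listener.
	 */
	if (!md->migrating_sk)
		return SK_PASS;	/* plain SYN: keep the kernel's selection */

	/* Redirect the migrating request to the listener at index 0. */
	if (bpf_sk_select_reuseport(md, &migrate_map, &zero, 0))
		return SK_DROP;	/* no usable target: drop the request */

	return SK_PASS;
}

char _license[] SEC("license") = "GPL";
```

If the program returns SK_PASS without selecting a socket, the kernel is expected to fall back to its own selection among the remaining listeners in the group.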