@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
struct sock_reuseport {
struct rcu_head rcu;
- u16 max_socks; /* length of socks */
- u16 num_socks; /* elements in socks */
+ u16 max_socks; /* length of socks */
+ u16 num_socks; /* elements in socks */
+ u16 num_closed_socks; /* closed elements in socks */
/* The last synq overflow event timestamp of this
* reuse->socks[] group.
*/
@@ -23,6 +24,7 @@ struct sock_reuseport {
unsigned int reuseport_id;
unsigned int bind_inany:1;
unsigned int has_conns:1;
+ unsigned int migrate_req:1;
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
struct sock *socks[]; /* array of sock pointers */
};
@@ -36,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
+ struct net *net = sock_net(sk);
int id, ret = 0;
/* bh lock used since this function call may precede hlist lock in
@@ -75,6 +76,8 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse->socks[0] = sk;
reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
+ reuse->migrate_req = sk->sk_protocol == IPPROTO_TCP ?
+ net->ipv4.sysctl_tcp_migrate_req : 0;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
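
For context, reuseport_alloc() runs when the first SO_REUSEPORT socket on a port is hashed (at listen() time for TCP), so migrate_req is a per-group snapshot of the tcp_migrate_req sysctl taken at that moment and stays 0 for non-TCP groups. A standard userspace sequence that ends up allocating such a group might look like this (error handling trimmed; plain socket API, nothing specific to this patch):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Bind and listen with SO_REUSEPORT; the first such listener on the port
 * is the one whose listen() allocates the struct sock_reuseport group.
 */
static int reuseport_listener(unsigned short port)
{
	struct sockaddr_in addr;
	int one = 1, fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;

	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0)
		goto err;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0)
		goto err;

	return fd;
err:
	close(fd);
	return -1;
}

int main(void)
{
	int a = reuseport_listener(8080);	/* allocates the group */
	int b = reuseport_listener(8080);	/* joins the same group */

	/* both sockets now accept() connections for port 8080 */
	if (b >= 0)
		close(b);
	if (a >= 0)
		close(a);
	return 0;
}
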
@@ -98,16 +101,22 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
return NULL;
more_reuse->num_socks = reuse->num_socks;
+ more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
more_reuse->has_conns = reuse->has_conns;
+ more_reuse->migrate_req = reuse->migrate_req;
+ more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
- more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
+ memcpy(more_reuse->socks +
+ (more_reuse->max_socks - more_reuse->num_closed_socks),
+ reuse->socks + reuse->num_socks,
+ reuse->num_closed_socks * sizeof(struct sock *));
- for (i = 0; i < reuse->num_socks; ++i)
+ for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
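
reuseport_grow() now has to carry both regions into the doubled array: listening entries keep their indices, while closed entries are re-anchored to the new end of socks[], so their absolute positions change with max_socks. Because the group only grows when socks[] is completely full, reuse->socks + reuse->num_socks and reuse->socks + (reuse->max_socks - reuse->num_closed_socks) name the same source, and walking all max_socks slots in the re-pointing loop is safe for the same reason. A toy sketch of the two copies (plain ints instead of struct sock pointers):

#include <assert.h>
#include <string.h>

int main(void)
{
	int old_socks[4] = { 10, 11, 21, 22 };	/* full: 2 listening + 2 closed */
	int old_max = 4, num_socks = 2, num_closed_socks = 2;
	int new_socks[8] = { 0 };
	int new_max = 8;

	/* listening region: same indices in the larger array */
	memcpy(new_socks, old_socks, num_socks * sizeof(old_socks[0]));

	/* closed region: re-anchored to the new end of the array;
	 * old_socks + num_socks is the same source here because the
	 * old array is full, which is how the patch writes it
	 */
	memcpy(new_socks + (new_max - num_closed_socks),
	       old_socks + (old_max - num_closed_socks),
	       num_closed_socks * sizeof(old_socks[0]));

	assert(new_socks[0] == 10 && new_socks[1] == 11);	/* front unchanged */
	assert(new_socks[6] == 21 && new_socks[7] == 22);	/* tail moved from [2,4) to [6,8) */
	return 0;
}
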
@@ -129,6 +138,25 @@ static void reuseport_free_rcu(struct rcu_head *head)
kfree(reuse);
}
+static int reuseport_sock_index(struct sock_reuseport *reuse, struct sock *sk,
+ bool closed)
+{
+ int left, right;
+
+ if (!closed) {
+ left = 0;
+ right = reuse->num_socks;
+ } else {
+ left = reuse->max_socks - reuse->num_closed_socks;
+ right = reuse->max_socks;
+ }
+
+ for (; left < right; left++)
+ if (reuse->socks[left] == sk)
+ return left;
+ return -1;
+}
+
/**
* reuseport_add_sock - Add a socket to the reuseport group of another.
* @sk: New socket to add to the group.
@@ -153,12 +181,23 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
- if (old_reuse && old_reuse->num_socks != 1) {
+
+ if (old_reuse == reuse) {
+ int i = reuseport_sock_index(reuse, sk, true);
+
+ if (i == -1) {
+ spin_unlock_bh(&reuseport_lock);
+ return -EBUSY;
+ }
+
+ reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ reuse->num_closed_socks--;
+ } else if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
- if (reuse->num_socks == reuse->max_socks) {
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
@@ -174,8 +213,9 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
spin_unlock_bh(&reuseport_lock);
- if (old_reuse)
+ if (old_reuse && old_reuse != reuse)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+
return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);
@@ -199,17 +239,34 @@ void reuseport_detach_sock(struct sock *sk)
*/
bpf_sk_reuseport_detach(sk);
- rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+ if (!reuse->migrate_req || sk->sk_state == TCP_LISTEN) {
+ i = reuseport_sock_index(reuse, sk, false);
+ if (i == -1)
+ goto out;
+
+ reuse->num_socks--;
+ reuse->socks[i] = reuse->socks[reuse->num_socks];
- for (i = 0; i < reuse->num_socks; i++) {
- if (reuse->socks[i] == sk) {
- reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
- reuse->num_socks--;
- if (reuse->num_socks == 0)
- call_rcu(&reuse->rcu, reuseport_free_rcu);
- break;
+ if (reuse->migrate_req) {
+ reuse->num_closed_socks++;
+ reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
+ } else {
+ rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
}
+ } else {
+ i = reuseport_sock_index(reuse, sk, true);
+ if (i == -1)
+ goto out;
+
+ reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ reuse->num_closed_socks--;
+
+ rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
}
+
+ if (reuse->num_socks + reuse->num_closed_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+out:
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
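
Taken together, the add and detach changes give a closed socket a full lifecycle inside the group: with migrate_req set, reuseport_detach_sock() parks a dying listener in the tail region and leaves sk_reuseport_cb pointing at the group, and reuseport_add_sock() can later pull it back out through the old_reuse == reuse path, for example when a listener that was shut down calls listen() again. The group is only freed once both regions are empty. A self-contained userspace model of just the array bookkeeping (toy integers and illustrative names; the kernel stores struct sock pointers and does all of this under reuseport_lock):

#include <assert.h>
#include <stdio.h>

#define MAX_SOCKS 8

/* Toy model of reuse->socks[]: listening entries packed at the front,
 * closed entries packed at the tail, unused slots in the middle.
 */
static int socks[MAX_SOCKS] = { 10, 11, 12 };
static int num_socks = 3, num_closed_socks;

/* Same idea as reuseport_sock_index(): scan only the relevant region. */
static int sock_index(int sk, int closed)
{
	int left = closed ? MAX_SOCKS - num_closed_socks : 0;
	int right = closed ? MAX_SOCKS : num_socks;

	for (; left < right; left++)
		if (socks[left] == sk)
			return left;
	return -1;
}

/* migrate_req branch of reuseport_detach_sock(): remove the entry from
 * the front region, then park it at the tail instead of forgetting it.
 */
static void close_listener(int sk)
{
	int i = sock_index(sk, 0);

	assert(i != -1);
	num_socks--;
	socks[i] = socks[num_socks];

	num_closed_socks++;
	socks[MAX_SOCKS - num_closed_socks] = sk;
}

/* old_reuse == reuse branch of reuseport_add_sock(): pull the entry back
 * out of the tail; the unchanged rest of that function re-appends it.
 */
static void relisten(int sk)
{
	int i = sock_index(sk, 1);

	assert(i != -1);
	socks[i] = socks[MAX_SOCKS - num_closed_socks];
	num_closed_socks--;

	socks[num_socks++] = sk;
}

int main(void)
{
	close_listener(11);	/* front: {10, 12}, tail: {11} */
	printf("closed 11   -> num_socks=%d num_closed_socks=%d\n",
	       num_socks, num_closed_socks);

	relisten(11);		/* front: {10, 12, 11}, tail: {} */
	printf("relisten 11 -> num_socks=%d num_closed_socks=%d\n",
	       num_socks, num_closed_socks);
	return 0;
}
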
@@ -138,6 +138,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
bool reuse = sk->sk_reuse;
bool reuseport = !!sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);
+ struct sock_reuseport *reuseport_cb = rcu_access_pointer(sk->sk_reuseport_cb);
/*
* Unlike other sk lookup places we do not check
@@ -156,14 +157,16 @@ static int inet_csk_bind_conflict(const struct sock *sk,
if ((!relax ||
(!reuseport_ok &&
reuseport && sk2->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ (!reuseport_cb ||
+ reuseport_cb == rcu_access_pointer(sk2->sk_reuseport_cb)) &&
(sk2->sk_state == TCP_TIME_WAIT ||
uid_eq(uid, sock_i_uid(sk2))))) &&
inet_rcv_saddr_equal(sk, sk2, true))
break;
} else if (!reuseport_ok ||
!reuseport || !sk2->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
+ (reuseport_cb &&
+ reuseport_cb != rcu_access_pointer(sk2->sk_reuseport_cb)) ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
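
The inet_csk_bind_conflict() change covers the socket that re-enters the bind path while still holding its group: a shut-down listener keeps sk_reuseport_cb in the closed region, and a later listen() re-runs the conflict check through get_port() when other sockets share the bucket, so the old requirement that sk carry no reuseport_cb at all would now make it conflict with its own group. The relaxed rule treats "no group yet" and "same group as sk2" alike. A compact model of the reuseport arm of the check, ignoring the relax/reuseport_ok two-pass detail (illustrative names, plain values for the socket fields):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Model of the reuseport arm of inet_csk_bind_conflict() after the change:
 * sk2 does not conflict with sk when both use SO_REUSEPORT, sk either has
 * no reuseport group yet or shares sk2's group, and sk2 is in TIME_WAIT or
 * owned by the same uid.
 */
struct bind_peer {
	bool sk_reuseport;
	const void *reuseport_cb;	/* NULL: not in a group yet */
	bool timewait;
	unsigned int uid;
};

static bool reuseport_no_conflict(const struct bind_peer *sk,
				  const struct bind_peer *sk2)
{
	return sk->sk_reuseport && sk2->sk_reuseport &&
	       (!sk->reuseport_cb || sk->reuseport_cb == sk2->reuseport_cb) &&
	       (sk2->timewait || sk->uid == sk2->uid);
}

int main(void)
{
	int group;
	struct bind_peer live = { true, &group, false, 1000 };		/* existing listener */
	struct bind_peer fresh = { true, NULL, false, 1000 };		/* new socket, no group */
	struct bind_peer revived = { true, &group, false, 1000 };	/* shut-down listener, same group */

	assert(reuseport_no_conflict(&fresh, &live));	/* allowed before and after */
	assert(reuseport_no_conflict(&revived, &live));	/* newly allowed by this change */
	return 0;
}
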