@@ -187,6 +187,42 @@ static int inet_autobind(struct sock *sk)
return 0;
}
+static int inet_autobind_reuse(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct inet_sock *inet = inet_sk(sk);
+ int err = -EAGAIN;
+
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ lock_sock(sk);
+ if (inet->inet_num)
+ goto ok;
+
+ if (sk->sk_reuse && !sk->sk_reuseport && prot->bind_add &&
+ inet->inet_rcv_saddr && inet->inet_saddr) {
+ if (prot->bind_add(sk, uaddr, addr_len))
+ goto fail;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = usin->sin_addr.s_addr;
+ inet->inet_dport = usin->sin_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ } else {
+ if (prot->get_port(sk, 0))
+ goto fail;
+ inet->inet_sport = htons(inet->inet_num);
+ }
+ok:
+ err = 0;
+fail:
+ release_sock(sk);
+ return err;
+}
+
/*
* Move a socket into listening state.
*/
@@ -571,8 +607,9 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
return err;
}
- if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
+ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind_reuse(sk, uaddr, addr_len))
return -EAGAIN;
+
return sk->sk_prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
@@ -68,8 +68,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
- inet->inet_daddr = fl4->daddr;
- inet->inet_dport = usin->sin_port;
+ if (!inet->inet_daddr)
+ inet->inet_daddr = fl4->daddr;
+ if (!inet->inet_dport)
+ inet->inet_dport = usin->sin_port;
reuseport_has_conns(sk, true);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
@@ -78,6 +80,11 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
sk_dst_set(sk, &rt->dst);
err = 0;
out:
+ if (err) {
+ /* connect failed: clear the destination association (note: this also clears any pre-existing association, not just one set up by auto-bind) */
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ }
return err;
}
EXPORT_SYMBOL(__ip4_datagram_connect);
@@ -163,6 +163,28 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
return 0;
}
+static void udp_v4_lport_reuse_inuse(const struct net *net,
+ const struct udp_hslot *hslot,
+ unsigned long *bitmap,
+ struct sock *sk, unsigned int log,
+ __be32 daddr, __be16 dport)
+{
+ struct sock *sk2;
+
+ sk_for_each(sk2, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ ((!sk2->sk_reuse &&
+ inet_rcv_saddr_equal(sk, sk2, true)) ||
+ (sk2->sk_reuse &&
+ inet_rcv_saddr_equal(sk, sk2, false) &&
+ inet_sk(sk2)->inet_daddr == daddr &&
+ inet_sk(sk2)->inet_dport == dport)))
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ }
+}
+
/*
* Note: we still hold spinlock of primary hash chain, so no other writer
* can insert/delete a socket with local_port == num
@@ -356,6 +378,72 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
+static int udp_v4_bind_add(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct udp_table *udptable = prot->h.udp_table;
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+ struct udp_hslot *hslot, *hslot2;
+ struct net *net = sock_net(sk);
+ int low, high, remaining;
+ u16 first, last, snum;
+ u32 rand;
+
+ inet_sk_get_local_port_range(sk, &low, &high);
+ remaining = (high - low) + 1;
+
+ rand = prandom_u32();
+ first = reciprocal_scale(rand, remaining) + low;
+ last = first + udptable->mask + 1;
+ /* force rand to be an odd multiple of UDP_HTABLE_SIZE */
+ rand = (rand | 1) * (udptable->mask + 1);
+
+ do {
+ bitmap_zero(bitmap, PORTS_PER_CHAIN);
+
+ hslot = udp_hashslot(udptable, net, first);
+ spin_lock(&hslot->lock);
+
+ udp_v4_lport_reuse_inuse(net, hslot, bitmap, sk, udptable->log,
+ usin->sin_addr.s_addr, usin->sin_port);
+
+ snum = first;
+ do {
+ if (low <= snum && snum <= high &&
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_local_reserved_port(net, snum))
+ goto found;
+ snum += rand;
+ } while (snum != first);
+
+ spin_unlock(&hslot->lock);
+ cond_resched();
+ } while (++first != last);
+
+ return 1;
+found:
+ inet_sk(sk)->inet_num = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash =
+ ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, snum);
+
+ sk_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
+ sock_prot_inuse_add(net, prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
+
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ spin_unlock(&hslot->lock);
+
+ return 0;
+}
+
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
@@ -2939,6 +3027,7 @@ struct proto udp_prot = {
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
+ .bind_add = udp_v4_bind_add,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
This is an RFC patch accompanying an LPC 2022 talk [1]. Users of connected UDP sockets who want to delegate the free source port search to the kernel, by leaving the port unspecified at bind() time, face a limitation today. If the delayed auto-bind flag, IP_BIND_ADDRESS_NO_PORT, is set on the socket, the source (IP, port) actually will not be shared between two connected UDP sockets: # if there is just one ephemeral port system("sysctl -w net.ipv4.ip_local_port_range='60000 60000'") s1 = socket(AF_INET, SOCK_DGRAM) s1.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1) s1.bind(("192.0.2.1", 0)) s1.connect(("1.1.1.1", 53)) s2 = socket(AF_INET, SOCK_DGRAM) s2.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1) s2.bind(("192.0.2.1", 0)) s2.connect(("1.0.0.1", 53)) # -> EAGAIN This leaves users in a situation where the number of connected UDP sockets on given IP is limited to the number of ephemeral ports. If the user would like to share the source port when the 4-tuple is unique, they have to resort to user-space free port search implementation with 4-tuple conflict detection, which is non-trivial [2]. To address this limitation, implement a new protocol operation for finding a free port but avoiding the 4-tuple conflicts. The new operation is similar to ->get_port but applies stricter criteria for determining if a port is busy. Destination IP and port of existing sockets is checked against the address the user passed to connect(), in addition to what ->get_port checks today (netns, src addr, device). There already happens to exist a proto operation that has a signature matching our needs here, that is takes a socket reference and a destination address as arguments - named ->bind_add(). It is currently used only by SCTP code, so we can re-purpose it. To remain backward compatible, we call into ->bind_add at connect() time to find a free port only if the user: 1. has specified the local source IP but left port unspecified, and 2. enabled IP_BIND_ADDRESS_NO_PORT, and 3. 
enabled port sharing with SO_REUSEADDR. If the above condition is met, we will try to find a local port that can be shared with other existing sockets as long as the 4-tuple is unique, or fail with EAGAIN if we have run out of local ports. The rationale here is that today, when source address sharing with REUSEADDR is enabled for a UDP socket, setting BIND_ADDRESS_NO_PORT has no effect on port selection and conflict detection. It merely delays the auto-bind from bind() to connect()/sendmsg() time. At the same time, users are unlikely to run into EAGAIN errors from connect() calling into ->bind_add(), if for some reason they are already setting both REUSEADDR and BIND_ADDRESS_NO_PORT on their connected UDP sockets. For that to happen, we would have to encounter a 4-tuple conflict with another existing connected UDP socket and completely run out of ephemeral ports. This is an RFC submission, so there are still a few things left to do: - get rid of duplicated code between ->get_port and ->bind_add - add UDP-Lite support - split code into patches - add support for IPv6 sockets (UDPv6) - add selftests/net - add man page docs [1] https://lpc.events/event/16/contributions/1349/ [2] https://github.com/cloudflare/cloudflare-blog/blob/232b432c1d57/2022-02-connectx/connectx.py#L116 Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com> --- net/ipv4/af_inet.c | 39 +++++++++++++++++++- net/ipv4/datagram.c | 11 +++++- net/ipv4/udp.c | 89 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 3 deletions(-)