@@ -1090,6 +1090,26 @@ udp_rmem_min - INTEGER
udp_wmem_min - INTEGER
UDP does not have tx memory accounting and this tunable has no effect.
+udp_hash_entries - INTEGER
+ Read-only number of hash buckets for UDP sockets in the current
+ networking namespace.
+
+ A negative value means the networking namespace does not own its
+ hash buckets and shares those of the initial networking namespace.
+
+udp_child_hash_entries - INTEGER
+ Control the number of hash buckets for UDP sockets in a child
+ networking namespace. It must be set before clone() or unshare().
+
+ A non-zero value is rounded up to the next power of two (2^n). 0 is
+ a special value, meaning the child networking namespace will share
+ the initial networking namespace's hash buckets.
+
+ Note that the child falls back to the global hash buckets if the
+ kernel fails to allocate enough memory for its own.
+
+ Default: 0
+
RAW variables
=============
@@ -200,6 +200,8 @@ struct netns_ipv4 {
atomic_t dev_addr_genid;
+ unsigned int sysctl_udp_child_hash_entries;
+
#ifdef CONFIG_SYSCTL
unsigned long *sysctl_local_reserved_ports;
int sysctl_ip_prot_sock;
@@ -424,6 +424,47 @@ static int proc_tcp_child_ehash_entries(struct ctl_table *table, int write,
return 0;
}
+/* Read-only sysctl handler reporting the UDP hash table size of a netns.
+ *
+ * @table->data is only used to locate the owning struct net: it must point
+ * at that netns' ipv4.sysctl_udp_child_hash_entries.  The reported value is
+ * udp_table->mask + 1, negated when a non-initial netns uses the global
+ * udp_table instead of a table of its own.
+ *
+ * Returns the result of proc_dointvec() on the computed value.
+ */
+static int proc_udp_hash_entries(struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv4.sysctl_udp_child_hash_entries);
+	int udp_hash_entries;
+	/* Designated initialiser zeroes every member we do not set, so
+	 * proc_dointvec() never sees uninitialised stack contents.
+	 */
+	struct ctl_table tbl = {
+		.data = &udp_hash_entries,
+		.maxlen = sizeof(udp_hash_entries),
+	};
+
+	udp_hash_entries = net->ipv4.udp_table->mask + 1;
+
+	/* A negative number indicates that the child netns
+	 * shares the global udp_table.
+	 */
+	if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
+		udp_hash_entries *= -1;
+
+	return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
+/* Sysctl handler for the bucket count given to child netns' UDP tables.
+ *
+ * Reads show the stored value.  A write is parsed into a local copy,
+ * range-checked against @table->extra1/@table->extra2, rounded up to a
+ * power of two unless it is 0, and only then stored to @table->data.
+ *
+ * Returns 0 on success or the error from proc_douintvec_minmax().
+ */
+static int proc_udp_child_hash_entries(struct ctl_table *table, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int udp_child_hash_entries;
+	/* Work on a private copy so that a concurrent reader of the live
+	 * sysctl never observes a value that has not been rounded yet.
+	 */
+	struct ctl_table tbl = {
+		.data = &udp_child_hash_entries,
+		.maxlen = sizeof(udp_child_hash_entries),
+		.extra1 = table->extra1,
+		.extra2 = table->extra2,
+	};
+	int ret;
+
+	udp_child_hash_entries = READ_ONCE(*(unsigned int *)table->data);
+
+	/* proc_douintvec() does not look at extra1/extra2; the _minmax
+	 * variant is needed for the configured bounds to take effect and
+	 * to keep roundup_pow_of_two() below within unsigned int range.
+	 */
+	ret = proc_douintvec_minmax(&tbl, write, buffer, lenp, ppos);
+	if (!write || ret)
+		return ret;
+
+	if (udp_child_hash_entries)
+		udp_child_hash_entries = roundup_pow_of_two(udp_child_hash_entries);
+
+	WRITE_ONCE(*(unsigned int *)table->data, udp_child_hash_entries);
+
+	return 0;
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
void *buffer, size_t *lenp,
@@ -1378,6 +1419,21 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
+ {
+ .procname = "udp_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .mode = 0444,
+ .proc_handler = proc_udp_hash_entries,
+ },
+ {
+ .procname = "udp_child_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_udp_child_hash_entries,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
+ },
{
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
@@ -3309,8 +3309,77 @@ static int __net_init udp_sysctl_init(struct net *net)
return 0;
}
+/* Allocate and initialise a UDP hash table with @hash_entries buckets.
+ *
+ * @hash_entries: bucket count for each of hash and hash2.  It is expected
+ *		  to be a non-zero power of two, since mask and log are
+ *		  derived from it as hash_entries - 1 and ilog2(hash_entries).
+ *
+ * hash and hash2 live in one allocation of 2 * @hash_entries slots, with
+ * hash2 starting right after hash.
+ *
+ * Returns the new table, or NULL if either allocation fails.
+ */
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{
+	struct udp_table *udptable;
+	unsigned int i;
+
+	udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+	if (!udptable)
+		goto out;
+
+	/* Fold the factor of two into the element size: the product is then
+	 * computed in size_t and overflow-checked by kvmalloc_array(),
+	 * instead of wrapping in unsigned int before the call.
+	 */
+	udptable->hash = kvmalloc_array(hash_entries,
+					2 * sizeof(struct udp_hslot),
+					GFP_KERNEL);
+	if (!udptable->hash)
+		goto free_table;
+
+	udptable->hash2 = udptable->hash + hash_entries;
+	udptable->mask = hash_entries - 1;
+	udptable->log = ilog2(hash_entries);
+
+	for (i = 0; i < hash_entries; i++) {
+		INIT_HLIST_HEAD(&udptable->hash[i].head);
+		udptable->hash[i].count = 0;
+		spin_lock_init(&udptable->hash[i].lock);
+
+		INIT_HLIST_HEAD(&udptable->hash2[i].head);
+		udptable->hash2[i].count = 0;
+		spin_lock_init(&udptable->hash2[i].lock);
+	}
+
+	return udptable;
+
+free_table:
+	kfree(udptable);
+out:
+	return NULL;
+}
+
+/* Optionally give @net a UDP hash table of its own.
+ *
+ * The bucket count is taken from @old_net's udp_child_hash_entries sysctl.
+ * When it is 0, or when the allocation fails, net->ipv4.udp_table is left
+ * untouched; the failure case is only logged.
+ *
+ * Always returns 0.
+ */
+static int __net_init udp_pernet_table_init(struct net *net, struct net *old_net)
+{
+	unsigned int nr_buckets;
+	struct udp_table *new_table;
+
+	nr_buckets = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+	if (nr_buckets == 0)
+		return 0;
+
+	new_table = udp_pernet_table_alloc(nr_buckets);
+	if (!new_table) {
+		pr_warn("Failed to allocate UDP hash table (entries: %u) "
+			"for a netns, fallback to use the global one\n",
+			nr_buckets);
+		return 0;
+	}
+
+	net->ipv4.udp_table = new_table;
+
+	return 0;
+}
+
+/* Release the UDP hash table used by @net unless it is the global udp_table. */
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+	struct udp_table *table = net->ipv4.udp_table;
+
+	/* Only a table other than the global one is freed here. */
+	if (table != &udp_table) {
+		kvfree(table->hash);
+		kfree(table);
+	}
+}
+
static struct pernet_operations __net_initdata udp_sysctl_ops = { /* per-netns hooks for UDP sysctls and the optional per-netns hash table */
.init = udp_sysctl_init,
+ .init2 = udp_pernet_table_init, /* NOTE(review): .init2 is not defined in this chunk; presumably added by an earlier patch and called with (net, old_net) - confirm */
+ .exit = udp_pernet_table_free, /* frees the netns' table unless it is the global udp_table */
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
We introduce an optional per-netns hash table for UDP.

With a smaller hash table, we can look up sockets faster and isolate noisy neighbours. Also, we can reduce lock contention.

We can control the hash table size by a new sysctl knob. However, depending on workloads, it will require very sensitive tuning, so we disable the feature by default (net.ipv4.udp_child_hash_entries == 0). Moreover, we can fall back to using the global hash table in case we fail to allocate enough memory for a new hash table.

We can check the current hash table size by another read-only sysctl knob, net.ipv4.udp_hash_entries. A negative value means the netns shares the global hash table (per-netns hash table is disabled or failed to allocate memory).

We could optimise the hash table lookup/iteration further by removing netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 Documentation/networking/ip-sysctl.rst | 20 ++++++++
 include/net/netns/ipv4.h | 2 +
 net/ipv4/sysctl_net_ipv4.c | 56 +++++++++++++++++++++
 net/ipv4/udp.c | 69 ++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)