diff mbox series

[v1,net-next,13/13] udp: Introduce optional per-netns hash table.

Message ID 20220826000445.46552-14-kuniyu@amazon.com (mailing list archive)
State New, archived
Headers show
Series tcp/udp: Introduce optional per-netns hash table. | expand

Commit Message

Kuniyuki Iwashima Aug. 26, 2022, 12:04 a.m. UTC
We introduce an optional per-netns hash table for UDP.

With a smaller hash table, we can look up sockets faster and isolate
noisy neighbours.  Also, we can reduce lock contention.

We can control the hash table size by a new sysctl knob.  However,
depending on workloads, it will require very sensitive tuning, so we
disable the feature by default (net.ipv4.udp_child_ehash_entries == 0).
Moreover, we can fall back to using the global hash table in case we
fail to allocate enough memory for a new hash table.

We can check the current hash table size by another read-only sysctl
knob, net.ipv4.udp_hash_entries.  A negative value means the netns
shares the global hash table (per-netns hash table is disabled or
failed to allocate memory).

We could optimise the hash table lookup/iteration further by removing
netns comparison for the per-netns one in the future.  Also, we could
optimise the sparse udp_hslot layout by putting it in udp_table.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 Documentation/networking/ip-sysctl.rst | 20 ++++++++
 include/net/netns/ipv4.h               |  2 +
 net/ipv4/sysctl_net_ipv4.c             | 56 +++++++++++++++++++++
 net/ipv4/udp.c                         | 69 ++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)
diff mbox series

Patch

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 97a0952b11e3..6dc4e2853e39 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1090,6 +1090,26 @@  udp_rmem_min - INTEGER
 udp_wmem_min - INTEGER
 	UDP does not have tx memory accounting and this tunable has no effect.
 
+udp_hash_entries - INTEGER
+	Read-only number of hash buckets for UDP sockets in the current
+	networking namespace.
+
+	A negative value means the networking namespace does not own its
+	hash buckets and shares the initial networking namespace's one.
+
+udp_child_ehash_entries - INTEGER
+	Control the number of hash buckets for UDP sockets in the child
+	networking namespace, which must be set before clone() or unshare().
+
+	The written value except for 0 is rounded up to 2^n.  0 is a special
+	value, meaning the child networking namespace will share the initial
+	networking namespace's hash buckets.
+
+	Note that the child will use the global one in case the kernel
+	fails to allocate enough memory.
+
+	Default: 0
+
 RAW variables
 =============
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index c367da5d61e2..a1be7ebb7338 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -200,6 +200,8 @@  struct netns_ipv4 {
 
 	atomic_t dev_addr_genid;
 
+	unsigned int sysctl_udp_child_hash_entries;
+
 #ifdef CONFIG_SYSCTL
 	unsigned long *sysctl_local_reserved_ports;
 	int sysctl_ip_prot_sock;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 03a3187c4705..b3cea3f36463 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -424,6 +424,47 @@  static int proc_tcp_child_ehash_entries(struct ctl_table *table, int write,
 	return 0;
 }
 
+static int proc_udp_hash_entries(struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv4.sysctl_udp_child_hash_entries);
+	int udp_hash_entries;
+	struct ctl_table tbl;
+
+	udp_hash_entries = net->ipv4.udp_table->mask + 1;
+
+	/* A negative number indicates that the child netns
+	 * shares the global udp_table.
+	 */
+	if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
+		udp_hash_entries *= -1;
+
+	tbl.data = &udp_hash_entries;
+	tbl.maxlen = sizeof(int);
+
+	return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
+static int proc_udp_child_hash_entries(struct ctl_table *table, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int udp_child_hash_entries;
+	int ret;
+
+	ret = proc_douintvec(table, write, buffer, lenp, ppos);
+	if (!write || ret)
+		return ret;
+
+	udp_child_hash_entries = READ_ONCE(*(unsigned int *)table->data);
+	if (udp_child_hash_entries)
+		udp_child_hash_entries = roundup_pow_of_two(udp_child_hash_entries);
+
+	WRITE_ONCE(*(unsigned int *)table->data, udp_child_hash_entries);
+
+	return 0;
+}
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
 					  void *buffer, size_t *lenp,
@@ -1378,6 +1419,21 @@  static struct ctl_table ipv4_net_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_INT_MAX,
 	},
+	{
+		.procname	= "udp_hash_entries",
+		.data		= &init_net.ipv4.sysctl_udp_child_hash_entries,
+		.mode		= 0444,
+		.proc_handler	= proc_udp_hash_entries,
+	},
+	{
+		.procname	= "udp_child_hash_entries",
+		.data		= &init_net.ipv4.sysctl_udp_child_hash_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_udp_child_hash_entries,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_INT_MAX,
+	},
 	{
 		.procname	= "udp_rmem_min",
 		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f4825e38762a..c41306225305 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3309,8 +3309,77 @@  static int __net_init udp_sysctl_init(struct net *net)
 	return 0;
 }
 
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{
+	struct udp_table *udptable;
+	int i;
+
+	udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+	if (!udptable)
+		goto out;
+
+	udptable->hash = kvmalloc_array(hash_entries * 2,
+					sizeof(struct udp_hslot), GFP_KERNEL);
+	if (!udptable->hash)
+		goto free_table;
+
+	udptable->hash2 = udptable->hash + hash_entries;
+	udptable->mask = hash_entries - 1;
+	udptable->log = ilog2(hash_entries);
+
+	for (i = 0; i < hash_entries; i++) {
+		INIT_HLIST_HEAD(&udptable->hash[i].head);
+		udptable->hash[i].count = 0;
+		spin_lock_init(&udptable->hash[i].lock);
+
+		INIT_HLIST_HEAD(&udptable->hash2[i].head);
+		udptable->hash2[i].count = 0;
+		spin_lock_init(&udptable->hash2[i].lock);
+	}
+
+	return udptable;
+
+free_table:
+	kfree(udptable);
+out:
+	return NULL;
+}
+
+static int __net_init udp_pernet_table_init(struct net *net, struct net *old_net)
+{
+	struct udp_table *udptable;
+	unsigned int hash_entries;
+
+	hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+	if (!hash_entries)
+		goto out;
+
+	udptable = udp_pernet_table_alloc(hash_entries);
+	if (udptable)
+		net->ipv4.udp_table = udptable;
+	else
+		pr_warn("Failed to allocate UDP hash table (entries: %u) "
+			"for a netns, fallback to use the global one\n",
+			hash_entries);
+out:
+	return 0;
+}
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+	struct udp_table *udptable = net->ipv4.udp_table;
+
+	if (udptable == &udp_table)
+		return;
+
+	kvfree(udptable->hash);
+	kfree(udptable);
+}
+
 static struct pernet_operations __net_initdata udp_sysctl_ops = {
 	.init	= udp_sysctl_init,
+	.init2	= udp_pernet_table_init,
+	.exit	= udp_pernet_table_free,
 };
 
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)