diff mbox series

[RFC,net-next,02/10] ipv4: Add a sysctl to control multipath hash fields

Message ID 20210502162257.3472453-3-idosch@idosch.org (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Add support for custom multipath hash | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 10 maintainers not CCed: dsahern@kernel.org pablo@netfilter.org yoshfuji@linux-ipv6.org andreas.a.roeseler@gmail.com linux-doc@vger.kernel.org edumazet@google.com corbet@lwn.net fw@strlen.de amcohen@nvidia.com weiwan@google.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 6699 this patch: 6699
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 6912 this patch: 6912
netdev/header_inline success Link

Commit Message

Ido Schimmel May 2, 2021, 4:22 p.m. UTC
From: Ido Schimmel <idosch@nvidia.com>

A subsequent patch will add a new multipath hash policy where the packet
fields used for multipath hash calculation are determined by user space.
This patch adds a sysctl that allows user space to set these fields.

The packet fields are represented using a bitmap and are common between
IPv4 and IPv6 to allow user space to use the same numbering across both
protocols. For example, to hash based on standard 5-tuple:

 # sysctl -w net.ipv4.fib_multipath_hash_fields=0-2,4-5
 net.ipv4.fib_multipath_hash_fields = 0-2,4-5

More fields can be added in the future, if needed.

The 'need_outer' and 'need_inner' variables are set in the control path
to indicate whether dissection of the outer or inner flow is needed.
They will be used by a subsequent patch to allow the data path to avoid
dissection of the outer or inner flow when not needed.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
---
 Documentation/networking/ip-sysctl.rst | 29 ++++++++++++++++
 include/net/ip_fib.h                   | 46 ++++++++++++++++++++++++++
 include/net/netns/ipv4.h               |  4 +++
 net/ipv4/fib_frontend.c                | 24 ++++++++++++++
 net/ipv4/sysctl_net_ipv4.c             | 32 ++++++++++++++++++
 5 files changed, 135 insertions(+)

Comments

David Ahern May 4, 2021, 2:33 a.m. UTC | #1
On 5/2/21 10:22 AM, Ido Schimmel wrote:
> From: Ido Schimmel <idosch@nvidia.com>
> 
> A subsequent patch will add a new multipath hash policy where the packet
> fields used for multipath hash calculation are determined by user space.
> This patch adds a sysctl that allows user space to set these fields.
> 
> The packet fields are represented using a bitmap and are common between
> IPv4 and IPv6 to allow user space to use the same numbering across both
> protocols. For example, to hash based on standard 5-tuple:
> 
>  # sysctl -w net.ipv4.fib_multipath_hash_fields=0-2,4-5
>  net.ipv4.fib_multipath_hash_fields = 0-2,4-5
> 
> More fields can be added in the future, if needed.
> 
> The 'need_outer' and 'need_inner' variables are set in the control path
> to indicate whether dissection of the outer or inner flow is needed.
> They will be used by a subsequent patch to allow the data path to avoid
> dissection of the outer or inner flow when not needed.
> 
> Signed-off-by: Ido Schimmel <idosch@nvidia.com>
> ---
>  Documentation/networking/ip-sysctl.rst | 29 ++++++++++++++++
>  include/net/ip_fib.h                   | 46 ++++++++++++++++++++++++++
>  include/net/netns/ipv4.h               |  4 +++
>  net/ipv4/fib_frontend.c                | 24 ++++++++++++++
>  net/ipv4/sysctl_net_ipv4.c             | 32 ++++++++++++++++++
>  5 files changed, 135 insertions(+)
> 
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index c2ecc9894fd0..8ab61f4edf02 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -100,6 +100,35 @@ fib_multipath_hash_policy - INTEGER
>  	- 1 - Layer 4
>  	- 2 - Layer 3 or inner Layer 3 if present
>  
> +fib_multipath_hash_fields - list of comma separated ranges
> +	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
> +	fields used for multipath hash calculation are determined by this
> +	sysctl.
> +
> +	The format used for both input and output is a comma separated list of
> +	ranges (e.g., "0-2" for source IP, destination IP and IP protocol).
> +	Writing to the file will clear all previous ranges and update the
> +	current list with the input.
> +
> +	Possible fields are:
> +
> +	== ============================
> +	 0 Source IP address
> +	 1 Destination IP address
> +	 2 IP protocol
> +	 3 Unused
> +	 4 Source port
> +	 5 Destination port
> +	 6 Inner source IP address
> +	 7 Inner destination IP address
> +	 8 Inner IP protocol
> +	 9 Inner Flow Label
> +	10 Inner source port
> +	11 Inner destination port
> +	== ============================
> +
> +	Default: 0-2 (source IP, destination IP and IP protocol)

since you are already requiring a name to id conversion, why not just
use a bitmask here as the input? if the value is a 32-bit bitmask do you
need bitmap_zalloc and its overhead?

Also, you could implement the current default using this scheme since
you have the default fields as the current L3 policy.
Ido Schimmel May 5, 2021, 7:48 a.m. UTC | #2
On Mon, May 03, 2021 at 08:33:40PM -0600, David Ahern wrote:
> On 5/2/21 10:22 AM, Ido Schimmel wrote:
> > From: Ido Schimmel <idosch@nvidia.com>
> > 
> > A subsequent patch will add a new multipath hash policy where the packet
> > fields used for multipath hash calculation are determined by user space.
> > This patch adds a sysctl that allows user space to set these fields.
> > 
> > The packet fields are represented using a bitmap and are common between
> > IPv4 and IPv6 to allow user space to use the same numbering across both
> > protocols. For example, to hash based on standard 5-tuple:
> > 
> >  # sysctl -w net.ipv4.fib_multipath_hash_fields=0-2,4-5
> >  net.ipv4.fib_multipath_hash_fields = 0-2,4-5
> > 
> > More fields can be added in the future, if needed.
> > 
> > The 'need_outer' and 'need_inner' variables are set in the control path
> > to indicate whether dissection of the outer or inner flow is needed.
> > They will be used by a subsequent patch to allow the data path to avoid
> > dissection of the outer or inner flow when not needed.
> > 
> > Signed-off-by: Ido Schimmel <idosch@nvidia.com>
> > ---
> >  Documentation/networking/ip-sysctl.rst | 29 ++++++++++++++++
> >  include/net/ip_fib.h                   | 46 ++++++++++++++++++++++++++
> >  include/net/netns/ipv4.h               |  4 +++
> >  net/ipv4/fib_frontend.c                | 24 ++++++++++++++
> >  net/ipv4/sysctl_net_ipv4.c             | 32 ++++++++++++++++++
> >  5 files changed, 135 insertions(+)
> > 
> > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> > index c2ecc9894fd0..8ab61f4edf02 100644
> > --- a/Documentation/networking/ip-sysctl.rst
> > +++ b/Documentation/networking/ip-sysctl.rst
> > @@ -100,6 +100,35 @@ fib_multipath_hash_policy - INTEGER
> >  	- 1 - Layer 4
> >  	- 2 - Layer 3 or inner Layer 3 if present
> >  
> > +fib_multipath_hash_fields - list of comma separated ranges
> > +	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
> > +	fields used for multipath hash calculation are determined by this
> > +	sysctl.
> > +
> > +	The format used for both input and output is a comma separated list of
> > +	ranges (e.g., "0-2" for source IP, destination IP and IP protocol).
> > +	Writing to the file will clear all previous ranges and update the
> > +	current list with the input.
> > +
> > +	Possible fields are:
> > +
> > +	== ============================
> > +	 0 Source IP address
> > +	 1 Destination IP address
> > +	 2 IP protocol
> > +	 3 Unused
> > +	 4 Source port
> > +	 5 Destination port
> > +	 6 Inner source IP address
> > +	 7 Inner destination IP address
> > +	 8 Inner IP protocol
> > +	 9 Inner Flow Label
> > +	10 Inner source port
> > +	11 Inner destination port
> > +	== ============================
> > +
> > +	Default: 0-2 (source IP, destination IP and IP protocol)
> 
> since you are already requiring a name to id conversion, why not just
> use a bitmask here as the input? if the value is a 32-bit bitmask do you
> need bitmap_zalloc and its overhead?

A bitmask was what I originally planned to use, but a bitmap seemed to
provide a better user interface. In practice, it is probably not a big
deal given that most people will just put the relevant value in
/etc/sysctl.d/ and forget about it. Will try the bitmask option.

> Also, you could implement the current default using this scheme since
> you have the default fields as the current L3 policy.

Will reply in patch #3.
diff mbox series

Patch

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index c2ecc9894fd0..8ab61f4edf02 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -100,6 +100,35 @@  fib_multipath_hash_policy - INTEGER
 	- 1 - Layer 4
 	- 2 - Layer 3 or inner Layer 3 if present
 
+fib_multipath_hash_fields - list of comma separated ranges
+	When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
+	fields used for multipath hash calculation are determined by this
+	sysctl.
+
+	The format used for both input and output is a comma separated list of
+	ranges (e.g., "0-2" for source IP, destination IP and IP protocol).
+	Writing to the file will clear all previous ranges and update the
+	current list with the input.
+
+	Possible fields are:
+
+	== ============================
+	 0 Source IP address
+	 1 Destination IP address
+	 2 IP protocol
+	 3 Unused
+	 4 Source port
+	 5 Destination port
+	 6 Inner source IP address
+	 7 Inner destination IP address
+	 8 Inner IP protocol
+	 9 Inner Flow Label
+	10 Inner source port
+	11 Inner destination port
+	== ============================
+
+	Default: 0-2 (source IP, destination IP and IP protocol)
+
 fib_sync_mem - UNSIGNED INTEGER
 	Amount of dirty memory from fib entries that can be backlogged before
 	synchronize_rcu is forced.
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a914f33f3ed5..d70a4c524bef 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -466,6 +466,52 @@  int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig);
 
+/* Fields used for sysctl_fib_multipath_hash_fields.
+ * Common to IPv4 and IPv6.
+ */
+enum {
+	FIB_MULTIPATH_HASH_FIELD_SRC_IP,
+	FIB_MULTIPATH_HASH_FIELD_DST_IP,
+	FIB_MULTIPATH_HASH_FIELD_IP_PROTO,
+	FIB_MULTIPATH_HASH_FIELD_FLOWLABEL,
+	FIB_MULTIPATH_HASH_FIELD_SRC_PORT,
+	FIB_MULTIPATH_HASH_FIELD_DST_PORT,
+	FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP,
+	FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP,
+	FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO,
+	FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL,
+	FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT,
+	FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT,
+
+	/* Add new fields above. This is user API. */
+	__FIB_MULTIPATH_HASH_FIELD_CNT,
+};
+
+#define FIB_MULTIPATH_HASH_TEST_FIELD(_field, _hash_fields)		      \
+	test_bit(FIB_MULTIPATH_HASH_FIELD_##_field, _hash_fields)
+
+static inline bool
+fib_multipath_hash_need_outer(const unsigned long *hash_fields)
+{
+	return FIB_MULTIPATH_HASH_TEST_FIELD(SRC_IP, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(DST_IP, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(IP_PROTO, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(FLOWLABEL, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(SRC_PORT, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(DST_PORT, hash_fields);
+}
+
+static inline bool
+fib_multipath_hash_need_inner(const unsigned long *hash_fields)
+{
+	return FIB_MULTIPATH_HASH_TEST_FIELD(INNER_SRC_IP, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(INNER_DST_IP, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(INNER_IP_PROTO, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(INNER_FLOWLABEL, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(INNER_SRC_PORT, hash_fields) ||
+	       FIB_MULTIPATH_HASH_TEST_FIELD(INNER_DST_PORT, hash_fields);
+}
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index f6af8d96d3c6..d0fcd968be44 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -210,6 +210,10 @@  struct netns_ipv4 {
 #endif
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
+	unsigned long *sysctl_fib_multipath_hash_fields;
+	u8 fib_multipath_hash_fields_need_outer:1,
+	   fib_multipath_hash_fields_need_inner:1,
+	   unused:6;
 	u8 sysctl_fib_multipath_use_neigh;
 	u8 sysctl_fib_multipath_hash_policy;
 #endif
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 84bb707bd88d..f685e84b03b6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1516,6 +1516,23 @@  static int __net_init ip_fib_net_init(struct net *net)
 	if (err)
 		return err;
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	net->ipv4.sysctl_fib_multipath_hash_fields =
+		bitmap_zalloc(__FIB_MULTIPATH_HASH_FIELD_CNT, GFP_KERNEL);
+	if (!net->ipv4.sysctl_fib_multipath_hash_fields)
+		goto err_hash_fields_alloc;
+
+	/* Default to 3-tuple */
+	set_bit(FIB_MULTIPATH_HASH_FIELD_SRC_IP,
+		net->ipv4.sysctl_fib_multipath_hash_fields);
+	set_bit(FIB_MULTIPATH_HASH_FIELD_DST_IP,
+		net->ipv4.sysctl_fib_multipath_hash_fields);
+	set_bit(FIB_MULTIPATH_HASH_FIELD_IP_PROTO,
+		net->ipv4.sysctl_fib_multipath_hash_fields);
+	net->ipv4.fib_multipath_hash_fields_need_outer = 1;
+	net->ipv4.fib_multipath_hash_fields_need_inner = 0;
+#endif
+
 	/* Avoid false sharing : Use at least a full cache line */
 	size = max_t(size_t, size, L1_CACHE_BYTES);
 
@@ -1533,6 +1550,10 @@  static int __net_init ip_fib_net_init(struct net *net)
 err_rules_init:
 	kfree(net->ipv4.fib_table_hash);
 err_table_hash_alloc:
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	bitmap_free(net->ipv4.sysctl_fib_multipath_hash_fields);
+err_hash_fields_alloc:
+#endif
 	fib4_notifier_exit(net);
 	return err;
 }
@@ -1568,6 +1589,9 @@  static void ip_fib_net_exit(struct net *net)
 #endif
 	rtnl_unlock();
 	kfree(net->ipv4.fib_table_hash);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	bitmap_free(net->ipv4.sysctl_fib_multipath_hash_fields);
+#endif
 	fib4_notifier_exit(net);
 }
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a62934b9f15a..0db7e68c38cd 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -19,6 +19,7 @@ 
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
+#include <net/ip_fib.h>
 #include <net/route.h>
 #include <net/tcp.h>
 #include <net/udp.h>
@@ -461,6 +462,30 @@  static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
 
 	return ret;
 }
+
+static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	unsigned long *hash_fields;
+	struct net *net;
+	int ret;
+
+	net = container_of(table->data, struct net,
+			   ipv4.sysctl_fib_multipath_hash_fields);
+	ret = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (!write || ret)
+		goto out;
+
+	hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
+	net->ipv4.fib_multipath_hash_fields_need_outer =
+		fib_multipath_hash_need_outer(hash_fields);
+	net->ipv4.fib_multipath_hash_fields_need_inner =
+		fib_multipath_hash_need_inner(hash_fields);
+
+out:
+	return ret;
+}
 #endif
 
 static struct ctl_table ipv4_table[] = {
@@ -1052,6 +1077,13 @@  static struct ctl_table ipv4_net_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+	{
+		.procname	= "fib_multipath_hash_fields",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,
+		.maxlen		= __FIB_MULTIPATH_HASH_FIELD_CNT,
+		.mode		= 0644,
+		.proc_handler	= proc_fib_multipath_hash_fields,
+	},
 #endif
 	{
 		.procname	= "ip_unprivileged_port_start",