diff mbox series

[net-next,2/2] net/smc: Unbind r/w buffer size from clcsock and make them tunable

Message ID 1663642482-31639-1-git-send-email-guwen@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Separate SMC parameter settings from TCP sysctls | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4577 this patch: 4577
netdev/cc_maintainers warning 4 maintainers not CCed: tonylu@linux.alibaba.com linux-doc@vger.kernel.org dust.li@linux.alibaba.com corbet@lwn.net
netdev/build_clang success Errors and warnings before: 1114 this patch: 1114
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4764 this patch: 4764
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Wen Gu Sept. 20, 2022, 2:54 a.m. UTC
From: Tony Lu <tonylu@linux.alibaba.com>

Currently, SMC uses smc->sk.sk_{rcv|snd}buf to size its send buffer and
RMB, and those values are taken from tcp_{w|r}mem via the clcsock.

The buffer sizes of a TCP socket don't fit SMC well. SMC-R/-D generally
need larger buffers than TCP to reach higher performance, because they
use different underlying devices and paths.

So this patch unbinds the SMC buffer sizes from TCP and introduces two
sysctl knobs, wmem and rmem, to tune them independently. The knobs are
per net namespace, so they also work for containers.

Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
---
 Documentation/networking/smc-sysctl.rst | 18 ++++++++++++++++++
 include/net/netns/smc.h                 |  2 ++
 net/smc/af_smc.c                        |  5 ++---
 net/smc/smc_core.c                      |  8 ++++----
 net/smc/smc_sysctl.c                    | 20 ++++++++++++++++++++
 5 files changed, 46 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst
index f8c3d59..2e45bbe 100644
--- a/Documentation/networking/smc-sysctl.rst
+++ b/Documentation/networking/smc-sysctl.rst
@@ -41,3 +41,21 @@  smcr_testlink_time - INTEGER
 	value is (INT_MAX / HZ) seconds, the minimum value is 1 second.
 
 	Default: 30 seconds.
+
+wmem - INTEGER
+	Initial size of send buffer used by SMC sockets.
+	The default value inherits from net.ipv4.tcp_wmem[1].
+
+	The minimum value is 16KiB. There is no hard limit on the maximum,
+	but at most 512KiB is allowed for SMC-R and 1MiB for SMC-D.
+
+	Default: 16K
+
+rmem - INTEGER
+	Initial size of receive buffer (RMB) used by SMC sockets.
+	The default value inherits from net.ipv4.tcp_rmem[1].
+
+	The minimum value is 16KiB. There is no hard limit on the maximum,
+	but at most 512KiB is allowed for SMC-R and 1MiB for SMC-D.
+
+	Default: 128K
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index d295e2c..582212a 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -20,5 +20,7 @@  struct netns_smc {
 	unsigned int			sysctl_autocorking_size;
 	unsigned int			sysctl_smcr_buf_type;
 	int				sysctl_smcr_testlink_time;
+	int				sysctl_wmem;
+	int				sysctl_rmem;
 };
 #endif
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0939cc3..e44ca70 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -379,6 +379,8 @@  static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 	sk->sk_state = SMC_INIT;
 	sk->sk_destruct = smc_destruct;
 	sk->sk_protocol = protocol;
+	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
+	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
 	smc = smc_sk(sk);
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_WORK(&smc->connect_work, smc_connect_work);
@@ -3253,9 +3255,6 @@  static int __smc_create(struct net *net, struct socket *sock, int protocol,
 		smc->clcsock = clcsock;
 	}
 
-	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
-	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
-
 out:
 	return rc;
 }
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ebf56cd..ea41f22 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -2307,10 +2307,10 @@  static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 
 	if (is_rmb)
 		/* use socket recv buffer size (w/o overhead) as start value */
-		sk_buf_size = smc->sk.sk_rcvbuf / 2;
+		sk_buf_size = smc->sk.sk_rcvbuf;
 	else
 		/* use socket send buffer size (w/o overhead) as start value */
-		sk_buf_size = smc->sk.sk_sndbuf / 2;
+		sk_buf_size = smc->sk.sk_sndbuf;
 
 	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
 	     bufsize_short >= 0; bufsize_short--) {
@@ -2369,7 +2369,7 @@  static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 	if (is_rmb) {
 		conn->rmb_desc = buf_desc;
 		conn->rmbe_size_short = bufsize_short;
-		smc->sk.sk_rcvbuf = bufsize * 2;
+		smc->sk.sk_rcvbuf = bufsize;
 		atomic_set(&conn->bytes_to_rcv, 0);
 		conn->rmbe_update_limit =
 			smc_rmb_wnd_update_limit(buf_desc->len);
@@ -2377,7 +2377,7 @@  static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
 	} else {
 		conn->sndbuf_desc = buf_desc;
-		smc->sk.sk_sndbuf = bufsize * 2;
+		smc->sk.sk_sndbuf = bufsize;
 		atomic_set(&conn->sndbuf_space, bufsize);
 	}
 	return 0;
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index 7f68520..0046a88 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -21,6 +21,8 @@ 
 
 static int smcr_testlink_time_min = 1;
 static int smcr_testlink_time_max = (INT_MAX / HZ);
+static int min_sndbuf = SMC_BUF_MIN_SIZE;
+static int min_rcvbuf = SMC_BUF_MIN_SIZE;
 
 static struct ctl_table smc_table[] = {
 	{
@@ -48,6 +50,22 @@ 
 		.extra1		= &smcr_testlink_time_min,
 		.extra2		= &smcr_testlink_time_max,
 	},
+	{
+		.procname	= "wmem",
+		.data		= &init_net.smc.sysctl_wmem,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_sndbuf,
+	},
+	{
+		.procname	= "rmem",
+		.data		= &init_net.smc.sysctl_rmem,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_rcvbuf,
+	},
 	{  }
 };
 
@@ -74,6 +92,8 @@  int __net_init smc_sysctl_net_init(struct net *net)
 	net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
 	net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
 	net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
+	WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]));
+	WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]));
 
 	return 0;