diff mbox series

[20/20] lnet: socklnd: limit retries on conns_per_peer mismatch

Message ID 1665783491-13827-21-git-send-email-jsimmons@infradead.org (mailing list archive)
State New, archived
Headers show
Series lustre: backport OpenSFS work as of Oct 14, 2022 | expand

Commit Message

James Simmons Oct. 14, 2022, 9:38 p.m. UTC
From: Serguei Smirnov <ssmirnov@whamcloud.com>

If the connection initiator has a higher conns_per_peer setting than
its peer, don't keep trying to create extra connections forever, as
the peer will keep rejecting them. A few retries should suffice to
resolve a valid connection race.

Fixes: 511ace4a ("lnet: socklnd: add conns_per_peer parameter")
WC-bug-id: https://jira.whamcloud.com/browse/LU-16191
Lustre-commit: da893c6c9707ca3b2 ("LU-16191 socklnd: limit retries on conns_per_peer mismatch")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48664
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 net/lnet/klnds/socklnd/socklnd.c    |  1 +
 net/lnet/klnds/socklnd/socklnd.h    |  4 ++++
 net/lnet/klnds/socklnd/socklnd_cb.c | 25 +++++++++++++++++++------
 3 files changed, 24 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/net/lnet/klnds/socklnd/socklnd.c b/net/lnet/klnds/socklnd/socklnd.c
index 9c8b75f0b2a2..00e33c88dfaa 100644
--- a/net/lnet/klnds/socklnd/socklnd.c
+++ b/net/lnet/klnds/socklnd/socklnd.c
@@ -144,6 +144,7 @@  ksocknal_create_conn_cb(struct sockaddr *addr)
 	conn_cb->ksnr_blki_conn_count = 0;
 	conn_cb->ksnr_blko_conn_count = 0;
 	conn_cb->ksnr_max_conns = 0;
+	conn_cb->ksnr_busy_retry_count = 0;
 
 	return conn_cb;
 }
diff --git a/net/lnet/klnds/socklnd/socklnd.h b/net/lnet/klnds/socklnd/socklnd.h
index dcb4b2952f8e..bb68a3df596a 100644
--- a/net/lnet/klnds/socklnd/socklnd.h
+++ b/net/lnet/klnds/socklnd/socklnd.h
@@ -379,6 +379,7 @@  struct ksock_conn {
 };
 
 #define SOCKNAL_CONN_COUNT_MAX_BITS	8	/* max conn count bits */
+#define SOCKNAL_MAX_BUSY_RETRIES	3
 
 struct ksock_conn_cb {
 	struct list_head	ksnr_connd_list;	/* chain on ksnr_connd_routes */
@@ -407,6 +408,9 @@  struct ksock_conn_cb {
 	unsigned int		ksnr_max_conns;		/* conns_per_peer at
 							 * peer creation
 							 */
+	unsigned int		ksnr_busy_retry_count;	/* counts retry attempts
+							 * due to EALREADY rc
+							 */
 };
 
 #define SOCKNAL_KEEPALIVE_PING	1	/* cookie for keepalive ping */
diff --git a/net/lnet/klnds/socklnd/socklnd_cb.c b/net/lnet/klnds/socklnd/socklnd_cb.c
index b2da535fbfbe..f358875a2afe 100644
--- a/net/lnet/klnds/socklnd/socklnd_cb.c
+++ b/net/lnet/klnds/socklnd/socklnd_cb.c
@@ -1785,7 +1785,7 @@  ksocknal_connect(struct ksock_conn_cb *conn_cb)
 {
 	LIST_HEAD(zombies);
 	struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer;
-	int type;
+	int type = SOCKLND_CONN_NONE;
 	int wanted;
 	struct socket *sock;
 	time64_t deadline;
@@ -1863,14 +1863,18 @@  ksocknal_connect(struct ksock_conn_cb *conn_cb)
 			goto failed;
 		}
 
-		/*
-		 * A +ve RC means I have to retry because I lost the connection
+		if (rc == EALREADY && conn_cb->ksnr_conn_count > 0)
+			conn_cb->ksnr_busy_retry_count += 1;
+		else
+			conn_cb->ksnr_busy_retry_count = 0;
+
+		/* A +ve RC means I have to retry because I lost the connection
 		 * race or I have to renegotiate protocol version
 		 */
-		retry_later = (rc);
+		retry_later = (rc != 0);
 		if (retry_later)
-			CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n",
-			       libcfs_nidstr(&peer_ni->ksnp_id.nid));
+			CDEBUG(D_NET, "peer_ni %s: conn race, retry later. rc %d\n",
+			       libcfs_nidstr(&peer_ni->ksnp_id.nid), rc);
 
 		write_lock_bh(&ksocknal_data.ksnd_global_lock);
 	}
@@ -1878,6 +1882,15 @@  ksocknal_connect(struct ksock_conn_cb *conn_cb)
 	conn_cb->ksnr_scheduled = 0;
 	conn_cb->ksnr_connecting = 0;
 
+	if (conn_cb->ksnr_busy_retry_count >= SOCKNAL_MAX_BUSY_RETRIES &&
+	    type > SOCKLND_CONN_NONE) {
+		/* After so many retries due to EALREADY assume that
+		 * the peer doesn't support as many connections as we want
+		 */
+		conn_cb->ksnr_connected |= BIT(type);
+		retry_later = false;
+	}
+
 	if (retry_later) {
 		/*
 		 * re-queue for attention; this frees me up to handle