diff mbox series

[net-next,v4,01/10] net/smc: remove locks smc_client_lgr_pending and smc_server_lgr_pending

Message ID 1666529042-40828-2-git-send-email-alibuda@linux.alibaba.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series optimize the parallelism of SMC-R connections | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 2 maintainers not CCed: edumazet@google.com pabeni@redhat.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 2

Commit Message

D. Wythe Oct. 23, 2022, 12:43 p.m. UTC
From: "D. Wythe" <alibuda@linux.alibaba.com>

This patch attempts to remove locks named smc_client_lgr_pending and
smc_server_lgr_pending, which aim to serialize the creation of link
group. However, once link group existed already, those locks are
meaningless, worse still, they make incoming connections have to be
queued one after the other.

Now, the creation of link group is no longer generated by competition,
but allocated through following strategy.

1. Try to find a suitable link group, if successd, current connection
is considered as NON first contact connection. ends.

2. Check the number of connections currently waiting for a suitable
link group to be created, if it is not less that the number of link
groups to be created multiplied by (SMC_RMBS_PER_LGR_MAX - 1), then
increase the number of link groups to be created, current connection
is considered as the first contact connection. ends.

3. Increase the number of connections currently waiting, and wait
for woken up.

4. Decrease the number of connections currently waiting, goto 1.

We wake up the connection that was put to sleep in stage 3 through
the SMC link state change event. Once the link moves out of the
SMC_LNK_ACTIVATING state, decrease the number of link groups to
be created, and then wake up at most (SMC_RMBS_PER_LGR_MAX - 1)
connections.

In the iplementation, we introduce the concept of lnk cluster, which is
a collection of links with the same characteristics (see
smcr_lnk_cluster_cmpfn() with more details), which makes it possible to
wake up efficiently in the scenario of N v.s 1.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 net/smc/af_smc.c   |  41 ++----
 net/smc/smc_core.c | 375 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/smc/smc_core.h |  10 ++
 3 files changed, 392 insertions(+), 34 deletions(-)
diff mbox series

Patch

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e44ca70..e369ab6 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -53,13 +53,6 @@ 
 #include "smc_tracepoint.h"
 #include "smc_sysctl.h"
 
-static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
-						 * creation on server
-						 */
-static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
-						 * creation on client
-						 */
-
 static struct workqueue_struct	*smc_tcp_ls_wq;	/* wq for tcp listen work */
 struct workqueue_struct	*smc_hs_wq;	/* wq for handshake work */
 struct workqueue_struct	*smc_close_wq;	/* wq for close work */
@@ -1196,10 +1189,8 @@  static int smc_connect_rdma(struct smc_sock *smc,
 	if (reason_code)
 		return reason_code;
 
-	mutex_lock(&smc_client_lgr_pending);
 	reason_code = smc_conn_create(smc, ini);
 	if (reason_code) {
-		mutex_unlock(&smc_client_lgr_pending);
 		return reason_code;
 	}
 
@@ -1291,7 +1282,6 @@  static int smc_connect_rdma(struct smc_sock *smc,
 		if (reason_code)
 			goto connect_abort;
 	}
-	mutex_unlock(&smc_client_lgr_pending);
 
 	smc_copy_sock_settings_to_clc(smc);
 	smc->connect_nonblock = 0;
@@ -1301,7 +1291,6 @@  static int smc_connect_rdma(struct smc_sock *smc,
 	return 0;
 connect_abort:
 	smc_conn_abort(smc, ini->first_contact_local);
-	mutex_unlock(&smc_client_lgr_pending);
 	smc->connect_nonblock = 0;
 
 	return reason_code;
@@ -1347,11 +1336,8 @@  static int smc_connect_ism(struct smc_sock *smc,
 	}
 	ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
 
-	/* there is only one lgr role for SMC-D; use server lock */
-	mutex_lock(&smc_server_lgr_pending);
 	rc = smc_conn_create(smc, ini);
 	if (rc) {
-		mutex_unlock(&smc_server_lgr_pending);
 		return rc;
 	}
 
@@ -1378,7 +1364,6 @@  static int smc_connect_ism(struct smc_sock *smc,
 				  aclc->hdr.version, eid, NULL);
 	if (rc)
 		goto connect_abort;
-	mutex_unlock(&smc_server_lgr_pending);
 
 	smc_copy_sock_settings_to_clc(smc);
 	smc->connect_nonblock = 0;
@@ -1388,7 +1373,6 @@  static int smc_connect_ism(struct smc_sock *smc,
 	return 0;
 connect_abort:
 	smc_conn_abort(smc, ini->first_contact_local);
-	mutex_unlock(&smc_server_lgr_pending);
 	smc->connect_nonblock = 0;
 
 	return rc;
@@ -1504,6 +1488,9 @@  static int __smc_connect(struct smc_sock *smc)
 
 	SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
 	smc_connect_ism_vlan_cleanup(smc, ini);
+	if (ini->first_contact_local)
+		smc_lgr_decision_maker_on_first_contact_done(ini, true /* success */);
+
 	kfree(buf);
 	kfree(ini);
 	return 0;
@@ -1512,6 +1499,8 @@  static int __smc_connect(struct smc_sock *smc)
 	smc_connect_ism_vlan_cleanup(smc, ini);
 	kfree(buf);
 fallback:
+	if (ini->first_contact_local)
+		smc_lgr_decision_maker_on_first_contact_done(ini, false /* fail */);
 	kfree(ini);
 	return smc_connect_decline_fallback(smc, rc, version);
 }
@@ -2378,7 +2367,6 @@  static void smc_listen_work(struct work_struct *work)
 	if (rc)
 		goto out_decl;
 
-	mutex_lock(&smc_server_lgr_pending);
 	smc_close_init(new_smc);
 	smc_rx_init(new_smc);
 	smc_tx_init(new_smc);
@@ -2386,18 +2374,14 @@  static void smc_listen_work(struct work_struct *work)
 	/* determine ISM or RoCE device used for connection */
 	rc = smc_listen_find_device(new_smc, pclc, ini);
 	if (rc)
-		goto out_unlock;
+		goto out_decl;
 
 	/* send SMC Accept CLC message */
 	accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
 	rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
 				 accept_version, ini->negotiated_eid);
 	if (rc)
-		goto out_unlock;
-
-	/* SMC-D does not need this lock any more */
-	if (ini->is_smcd)
-		mutex_unlock(&smc_server_lgr_pending);
+		goto out_decl;
 
 	/* receive SMC Confirm CLC message */
 	memset(buf, 0, sizeof(*buf));
@@ -2405,8 +2389,6 @@  static void smc_listen_work(struct work_struct *work)
 	rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
 			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
 	if (rc) {
-		if (!ini->is_smcd)
-			goto out_unlock;
 		goto out_decl;
 	}
 
@@ -2415,17 +2397,18 @@  static void smc_listen_work(struct work_struct *work)
 		rc = smc_listen_rdma_finish(new_smc, cclc,
 					    ini->first_contact_local, ini);
 		if (rc)
-			goto out_unlock;
-		mutex_unlock(&smc_server_lgr_pending);
+			goto out_decl;
 	}
 	smc_conn_save_peer_info(new_smc, cclc);
 	smc_listen_out_connected(new_smc);
 	SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
+	if (ini->first_contact_local)
+		smc_lgr_decision_maker_on_first_contact_done(ini, true /* success */);
 	goto out_free;
 
-out_unlock:
-	mutex_unlock(&smc_server_lgr_pending);
 out_decl:
+	if (ini && ini->first_contact_local)
+		smc_lgr_decision_maker_on_first_contact_done(ini, false /* fail */);
 	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
 			   proposal_version);
 out_free:
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index e6ee797..fff41c3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -46,6 +46,282 @@  struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 	.num = 0,
 };
 
+struct smc_lgr_decision_maker {
+	struct rhash_head	rnode;	/* node for rhashtable */
+	struct wait_queue_head	wq;	/* queue for connection that have been
+					 * decided to wait
+					 */
+	spinlock_t		lock;	/* protection for decision maker */
+	refcount_t		ref;	/* refcount for decision maker */
+	int			type;	/* smc type */
+	int			role;	/* smc role */
+	unsigned long		pending_capability;
+					/* maximum pending number of connections that
+					 * need to wait.
+					 */
+	unsigned long		conns_pending;
+					/* the number of pending connections */
+	u32			lacking_first_contact;
+					/* indicate that the connection
+					 * should perform first contact.
+					 */
+};
+
+struct smcr_lgr_decision_maker {
+	struct smc_lgr_decision_maker	ldm;
+	u8	peer_systemid[SMC_SYSTEMID_LEN];
+	u8	peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
+	u8	peer_gid[SMC_GID_SIZE];	/* gid of peer*/
+	int	clcqpn;
+};
+
+struct smcd_lgr_decision_maker {
+	struct smc_lgr_decision_maker  ldm;
+	u64	peer_gid;
+	struct	smcd_dev *dev;
+};
+
+static int smcr_lgr_decision_maker_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct smc_init_info *ini = arg->key;
+	const struct smcr_lgr_decision_maker *rldm = obj;
+
+	if (ini->role != rldm->ldm.role)
+		return 1;
+
+	if (memcmp(ini->peer_systemid, rldm->peer_systemid, SMC_SYSTEMID_LEN))
+		return 1;
+
+	if (memcmp(ini->peer_gid, rldm->peer_gid, SMC_GID_SIZE))
+		return 1;
+
+	if ((ini->role == SMC_SERV || ini->ib_clcqpn == rldm->clcqpn) &&
+	    (ini->smcr_version == SMC_V2 ||
+	    !memcmp(ini->peer_mac, rldm->peer_mac, ETH_ALEN)))
+		return 0;
+
+	return 1;
+}
+
+static u32 smcr_lgr_decision_maker_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct smcr_lgr_decision_maker *rldm = data;
+
+	return jhash2((u32 *)rldm->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed)
+		+ ((rldm->ldm.role == SMC_SERV) ? 0 : rldm->clcqpn);
+}
+
+static u32 smcr_lgr_decision_maker_arg_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct smc_init_info *ini = data;
+
+	return jhash2((u32 *)ini->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed)
+		+ ((ini->role == SMC_SERV) ? 0 : ini->ib_clcqpn);
+}
+
+static void smcr_lgr_decision_maker_init(struct smc_lgr_decision_maker *ldm,
+					 struct smc_init_info *ini)
+{
+	struct smcr_lgr_decision_maker *rldm = (struct smcr_lgr_decision_maker *)ldm;
+
+	memcpy(rldm->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN);
+	memcpy(rldm->peer_gid, ini->peer_gid, SMC_GID_SIZE);
+	memcpy(rldm->peer_mac, ini->peer_mac, ETH_ALEN);
+	rldm->clcqpn = ini->ib_clcqpn;
+}
+
+static int smcd_lgr_decision_maker_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct smc_init_info *ini = arg->key;
+	const struct smcd_lgr_decision_maker *dldm = obj;
+
+	if (ini->role != dldm->ldm.role)
+		return 1;
+
+	if (ini->ism_peer_gid[ini->ism_selected] != dldm->peer_gid)
+		return 1;
+
+	if (ini->ism_dev[ini->ism_selected] != dldm->dev)
+		return 1;
+
+	return 0;
+}
+
+static u32 smcd_lgr_decision_maker_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct smcd_lgr_decision_maker *dlcm = data;
+
+	return jhash2((u32 *)&dlcm->peer_gid, sizeof(dlcm->peer_gid) / sizeof(u32), seed);
+}
+
+static u32 smcd_lgr_decision_maker_arg_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct smc_init_info *ini = data;
+	u64 select_gid;
+
+	select_gid = ini->ism_peer_gid[ini->ism_selected];
+	return jhash2((u32 *)&select_gid, sizeof(select_gid) / sizeof(u32), seed);
+}
+
+static void smcd_lgr_decision_maker_init(struct smc_lgr_decision_maker *ldm,
+					 struct smc_init_info *ini)
+{
+	struct smcd_lgr_decision_maker *dldm = (struct smcd_lgr_decision_maker *)ldm;
+
+	dldm->peer_gid = ini->ism_peer_gid[ini->ism_selected];
+	dldm->dev = ini->ism_dev[ini->ism_selected];
+}
+
+struct smc_lgr_decision_builder {
+	struct rhashtable	map;
+	spinlock_t	map_lock;	/* protect map */
+	struct rhashtable_params	default_params;
+					/* how to serach and insert decision maker by ini */
+	void (*init)(struct smc_lgr_decision_maker *ldm, struct smc_init_info *ini);
+					/* init maker by ini */
+	u32	sz;			/* size */
+};
+
+static struct smc_lgr_decision_builder	smc_lgr_decision_set[SMC_TYPE_D + 1] = {
+	/* SMC_TYPE_R = 0 */
+	{
+		.sz		= sizeof(struct smcr_lgr_decision_maker),
+		.init		= smcr_lgr_decision_maker_init,
+		.map_lock	= __SPIN_LOCK_UNLOCKED(smc_lgr_decision_set[SMC_TYPE_R].map_lock),
+		.default_params = {
+			.head_offset = offsetof(struct smc_lgr_decision_maker, rnode),
+			.key_len = sizeof(struct smc_init_info),
+			.obj_cmpfn = smcr_lgr_decision_maker_cmpfn,
+			.obj_hashfn = smcr_lgr_decision_maker_hashfn,
+			.hashfn = smcr_lgr_decision_maker_arg_hashfn,
+			.automatic_shrinking = true,
+		},
+	},
+	/* SMC_TYPE_D = 1 */
+	{
+		.sz		= sizeof(struct smcd_lgr_decision_maker),
+		.init		= smcd_lgr_decision_maker_init,
+		.map_lock	= __SPIN_LOCK_UNLOCKED(smc_lgr_decision_set[SMC_TYPE_D].map_lock),
+		.default_params = {
+			.head_offset = offsetof(struct smc_lgr_decision_maker, rnode),
+			.key_len = sizeof(struct smc_init_info),
+			.obj_cmpfn = smcd_lgr_decision_maker_cmpfn,
+			.obj_hashfn = smcd_lgr_decision_maker_hashfn,
+			.hashfn = smcd_lgr_decision_maker_arg_hashfn,
+			.automatic_shrinking = true,
+		},
+	},
+};
+
+/* hold a reference for smc_lgr_decision_maker */
+static inline void smc_lgr_decision_maker_hold(struct smc_lgr_decision_maker *ldm)
+{
+	if (likely(ldm))
+		refcount_inc(&ldm->ref);
+}
+
+/* release a reference for smc_lgr_decision_maker */
+static inline void smc_lgr_decision_maker_put(struct smc_lgr_decision_maker *ldm)
+{
+	bool do_free = false;
+	int type;
+
+	if (unlikely(!ldm))
+		return;
+
+	if (refcount_dec_not_one(&ldm->ref))
+		return;
+
+	type = ldm->type;
+
+	spin_lock_bh(&smc_lgr_decision_set[type].map_lock);
+	/* last ref */
+	if (refcount_dec_and_test(&ldm->ref)) {
+		do_free = true;
+		rhashtable_remove_fast(&smc_lgr_decision_set[type].map, &ldm->rnode,
+				       smc_lgr_decision_set[type].default_params);
+	}
+	spin_unlock_bh(&smc_lgr_decision_set[type].map_lock);
+	if (do_free)
+		kfree(ldm);
+}
+
+static struct smc_lgr_decision_maker *
+smc_get_or_create_lgr_decision_maker(struct smc_init_info *ini)
+{
+	struct smc_lgr_decision_maker *ldm;
+	int err, type;
+
+	type = ini->is_smcd ? SMC_TYPE_D : SMC_TYPE_R;
+
+	spin_lock_bh(&smc_lgr_decision_set[type].map_lock);
+	ldm = rhashtable_lookup_fast(&smc_lgr_decision_set[type].map, ini,
+				     smc_lgr_decision_set[type].default_params);
+	if (!ldm) {
+		ldm = kzalloc(smc_lgr_decision_set[type].sz, GFP_ATOMIC);
+		if (unlikely(!ldm))
+			goto fail;
+
+		/* common init */
+		spin_lock_init(&ldm->lock);
+		init_waitqueue_head(&ldm->wq);
+		refcount_set(&ldm->ref, 1);
+		ldm->type = type;
+		ldm->role = ini->role;
+
+		/* init */
+		if (smc_lgr_decision_set[type].init)
+			smc_lgr_decision_set[type].init(ldm, ini);
+
+		err = rhashtable_insert_fast(&smc_lgr_decision_set[type].map,
+					     &ldm->rnode,
+					     smc_lgr_decision_set[type].default_params);
+		if (unlikely(err)) {
+			pr_warn_ratelimited("smc: rhashtable_insert_fast failed (%d)", err);
+			kfree(ldm);
+			ldm = NULL;
+		}
+	} else {
+		smc_lgr_decision_maker_hold(ldm);
+	}
+fail:
+	spin_unlock_bh(&smc_lgr_decision_set[type].map_lock);
+	return ldm;
+}
+
+void smc_lgr_decision_maker_on_first_contact_done(struct smc_init_info *ini, bool success)
+{
+	struct smc_lgr_decision_maker *ldm;
+	int nr;
+
+	if (unlikely(!ini->first_contact_local))
+		return;
+
+	/* get lgr decision maker */
+	ldm = ini->ldm;
+
+	if (unlikely(!ldm))
+		return;
+
+	spin_lock_bh(&ldm->lock);
+	ldm->pending_capability -= (SMC_RMBS_PER_LGR_MAX - 1);
+	nr = SMC_RMBS_PER_LGR_MAX - 1;
+	if (unlikely(!success) && ldm->role == SMC_SERV) {
+		ldm->lacking_first_contact++;
+		/* only to wake up one connection to perfrom
+		 * first contact in server side, client MUST wake up
+		 * all to decline.
+		 */
+		nr = 1;
+	}
+	spin_unlock_bh(&ldm->lock);
+	if (nr)
+		wake_up_nr(&ldm->wq, nr);
+
+	/* hold in smc_lgr_create */
+	smc_lgr_decision_maker_put(ldm);
+}
+
 static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
 static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
 
@@ -756,6 +1032,7 @@  int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
 	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
+	rwlock_init(&lnk->rtokens_lock);
 	lnk->link_idx = link_idx;
 	lnk->wr_rx_id_compl = 0;
 	smc_ibdev_cnt_inc(lnk);
@@ -914,6 +1191,11 @@  static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		atomic_inc(&lgr_cnt);
 	}
 	smc->conn.lgr = lgr;
+
+	lgr->ldm = ini->ldm;
+	/* smc_lgr_decision_maker_put in __smc_lgr_free() */
+	smc_lgr_decision_maker_hold(lgr->ldm);
+
 	spin_lock_bh(lgr_lock);
 	list_add_tail(&lgr->list, lgr_list);
 	spin_unlock_bh(lgr_lock);
@@ -1363,6 +1645,9 @@  static void __smc_lgr_free(struct smc_link_group *lgr)
 		if (!atomic_dec_return(&lgr_cnt))
 			wake_up(&lgrs_deleted);
 	}
+	/* smc_lgr_decision_maker_hold in smc_lgr_create() */
+	if (lgr->ldm)
+		smc_lgr_decision_maker_put(lgr->ldm);
 	kfree(lgr);
 }
 
@@ -1851,11 +2136,13 @@  int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 {
 	struct smc_connection *conn = &smc->conn;
 	struct net *net = sock_net(&smc->sk);
+	DECLARE_WAITQUEUE(wait, current);
+	struct smc_lgr_decision_maker *ldm = NULL;
 	struct list_head *lgr_list;
 	struct smc_link_group *lgr;
 	enum smc_lgr_role role;
 	spinlock_t *lgr_lock;
-	int rc = 0;
+	int rc = 0, timeo = CLC_WAIT_TIME;
 
 	lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
 				  &smc_lgr_list.list;
@@ -1863,12 +2150,24 @@  int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 				  &smc_lgr_list.lock;
 	ini->first_contact_local = 1;
 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
-	if (role == SMC_CLNT && ini->first_contact_peer)
+	ini->role = role;
+
+	ldm = smc_get_or_create_lgr_decision_maker(ini);
+	if (unlikely(!ldm))
+		return SMC_CLC_DECL_INTERR;
+
+	if (role == SMC_CLNT && ini->first_contact_peer) {
+		spin_lock_bh(&ldm->lock);
+		ldm->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1);
+		spin_unlock_bh(&ldm->lock);
 		/* create new link group as well */
 		goto create;
+	}
 
 	/* determine if an existing link group can be reused */
 	spin_lock_bh(lgr_lock);
+	spin_lock(&ldm->lock);
+again:
 	list_for_each_entry(lgr, lgr_list, list) {
 		write_lock_bh(&lgr->conns_lock);
 		if ((ini->is_smcd ?
@@ -1895,9 +2194,42 @@  int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 		}
 		write_unlock_bh(&lgr->conns_lock);
 	}
+	/* make decision */
+	if (ini->first_contact_local) {
+		if (ldm->pending_capability > ldm->conns_pending) {
+			ldm->conns_pending++;
+			add_wait_queue(&ldm->wq, &wait);
+			spin_unlock(&ldm->lock);
+			spin_unlock_bh(lgr_lock);
+			set_current_state(TASK_INTERRUPTIBLE);
+			/* need to wait at least once first contact done */
+			timeo = schedule_timeout(timeo);
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&ldm->wq, &wait);
+			spin_lock_bh(lgr_lock);
+			spin_lock(&ldm->lock);
+
+			ldm->conns_pending--;
+			if (likely(timeo && !ldm->lacking_first_contact))
+				goto again;
+
+			/* lnk create failed, only server side can raise
+			 * a new first contact. client side here will
+			 * fallback by SMC_CLC_DECL_SYNCERR.
+			 */
+			if (role == SMC_SERV && ldm->lacking_first_contact)
+				ldm->lacking_first_contact--;
+		}
+		if (role == SMC_SERV) {
+			/* first_contact */
+			ldm->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1);
+		}
+	}
+
+	spin_unlock(&ldm->lock);
 	spin_unlock_bh(lgr_lock);
 	if (rc)
-		return rc;
+		goto out;
 
 	if (role == SMC_CLNT && !ini->first_contact_peer &&
 	    ini->first_contact_local) {
@@ -1905,11 +2237,15 @@  int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 		 * a new one
 		 * send out_of_sync decline, reason synchr. error
 		 */
-		return SMC_CLC_DECL_SYNCERR;
+		rc = SMC_CLC_DECL_SYNCERR;
+		goto out;
 	}
 
 create:
 	if (ini->first_contact_local) {
+		ini->ldm = ldm;
+		/* smc_lgr_decision_maker_put in first_contact_done() */
+		smc_lgr_decision_maker_hold(ldm);
 		rc = smc_lgr_create(smc, ini);
 		if (rc)
 			goto out;
@@ -1942,6 +2278,8 @@  int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 #endif
 
 out:
+	/* smc_lgr_decision_maker_hold in smc_get_or_create_lgr_decision_make() */
+	smc_lgr_decision_maker_put(ldm);
 	return rc;
 }
 
@@ -2504,19 +2842,24 @@  int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
 	u32 rkey = ntohl(nw_rkey);
 	int i;
 
+	write_lock_bh(&lnk->rtokens_lock);
 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
 		if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
 		    lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr &&
 		    test_bit(i, lgr->rtokens_used_mask)) {
 			/* already in list */
+			write_unlock_bh(&lnk->rtokens_lock);
 			return i;
 		}
 	}
 	i = smc_rmb_reserve_rtoken_idx(lgr);
-	if (i < 0)
+	if (i < 0) {
+		write_unlock_bh(&lnk->rtokens_lock);
 		return i;
+	}
 	lgr->rtokens[i][lnk->link_idx].rkey = rkey;
 	lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr;
+	write_unlock_bh(&lnk->rtokens_lock);
 	return i;
 }
 
@@ -2527,6 +2870,7 @@  int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
 	u32 rkey = ntohl(nw_rkey);
 	int i, j;
 
+	write_lock_bh(&lnk->rtokens_lock);
 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
 		if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
 		    test_bit(i, lgr->rtokens_used_mask)) {
@@ -2535,9 +2879,11 @@  int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
 				lgr->rtokens[i][j].dma_addr = 0;
 			}
 			clear_bit(i, lgr->rtokens_used_mask);
+			write_unlock_bh(&lnk->rtokens_lock);
 			return 0;
 		}
 	}
+	write_unlock_bh(&lnk->rtokens_lock);
 	return -ENOENT;
 }
 
@@ -2603,12 +2949,31 @@  static int smc_core_reboot_event(struct notifier_block *this,
 
 int __init smc_core_init(void)
 {
+	int i;
+
+	/* init smc lgr decision maker builder */
+	for (i = 0; i <= SMC_TYPE_D; i++)
+		rhashtable_init(&smc_lgr_decision_set[i].map,
+				&smc_lgr_decision_set[i].default_params);
+
 	return register_reboot_notifier(&smc_reboot_notifier);
 }
 
+static void smc_lgr_decision_maker_free_cb(void *ptr, void *arg)
+{
+	kfree(ptr);
+}
+
 /* Called (from smc_exit) when module is removed */
 void smc_core_exit(void)
 {
+	int i;
+
 	unregister_reboot_notifier(&smc_reboot_notifier);
 	smc_lgrs_shutdown();
+
+	/* destroy smc lgr decision maker builder */
+	for (i = 0; i <= SMC_TYPE_D; i++)
+		rhashtable_free_and_destroy(&smc_lgr_decision_set[i].map,
+					    smc_lgr_decision_maker_free_cb, NULL);
 }
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 285f9bd..ad22751 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -15,6 +15,7 @@ 
 #include <linux/atomic.h>
 #include <linux/smc.h>
 #include <linux/pci.h>
+#include <linux/rhashtable.h>
 #include <rdma/ib_verbs.h>
 #include <net/genetlink.h>
 
@@ -107,6 +108,7 @@  struct smc_link {
 	u32			wr_tx_cnt;	/* number of WR send buffers */
 	wait_queue_head_t	wr_tx_wait;	/* wait for free WR send buf */
 	atomic_t		wr_tx_refcnt;	/* tx refs to link */
+	rwlock_t		rtokens_lock;
 
 	struct smc_wr_buf	*wr_rx_bufs;	/* WR recv payload buffers */
 	struct ib_recv_wr	*wr_rx_ibs;	/* WR recv meta data */
@@ -244,6 +246,8 @@  struct smc_llc_flow {
 	struct smc_llc_qentry *qentry;
 };
 
+struct smc_lgr_decision_maker;
+
 struct smc_link_group {
 	struct list_head	list;
 	struct rb_root		conns_all;	/* connection tree */
@@ -335,6 +339,8 @@  struct smc_link_group {
 						/* peer triggered shutdownn */
 		};
 	};
+	struct smc_lgr_decision_maker	*ldm;
+						/* who decides to create this lgr */
 };
 
 struct smc_clc_msg_local;
@@ -373,6 +379,7 @@  struct smc_init_info {
 	unsigned short		vlan_id;
 	u32			rc;
 	u8			negotiated_eid[SMC_MAX_EID_LEN];
+	struct smc_lgr_decision_maker *ldm;
 	/* SMC-R */
 	u8			smcr_version;
 	u8			check_smcrv2;
@@ -391,6 +398,7 @@  struct smc_init_info {
 	u8			ism_offered_cnt; /* # of ISM devices offered */
 	u8			ism_selected;    /* index of selected ISM dev*/
 	u8			smcd_version;
+	u8			role;
 };
 
 /* Find the connection associated with the given alert token in the link group.
@@ -559,6 +567,8 @@  struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
 int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb);
 int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
 
+void smc_lgr_decision_maker_on_first_contact_done(struct smc_init_info *ini, bool success);
+
 static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
 {
 	return link->lgr;