@@ -588,28 +588,32 @@ static inline int kiblnd_timeout(void)
return dev->ibd_can_failover;
}
-#define kiblnd_conn_addref(conn) \
-do { \
- CDEBUG(D_NET, "conn[%p] (%d)++\n", \
- (conn), atomic_read(&(conn)->ibc_refcount)); \
- atomic_inc(&(conn)->ibc_refcount); \
-} while (0)
-
-#define kiblnd_conn_decref(conn) \
-do { \
- unsigned long flags; \
- \
- CDEBUG(D_NET, "conn[%p] (%d)--\n", \
- (conn), atomic_read(&(conn)->ibc_refcount)); \
- LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \
- if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \
- spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \
- list_add_tail(&(conn)->ibc_list, \
- &kiblnd_data.kib_connd_zombies); \
- wake_up(&kiblnd_data.kib_connd_waitq); \
- spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
- } \
-} while (0)
+static inline void kiblnd_conn_addref(struct kib_conn *conn)
+{
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+ CDEBUG(D_NET, "conn[%p] (%d)++\n",
+ conn, atomic_read(&conn->ibc_refcount));
+#endif
+ atomic_inc(&(conn)->ibc_refcount);
+}
+
+static inline void kiblnd_conn_decref(struct kib_conn *conn)
+{
+ unsigned long flags;
+
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+ CDEBUG(D_NET, "conn[%p] (%d)--\n",
+ conn, atomic_read(&conn->ibc_refcount));
+#endif
+ LASSERT_ATOMIC_POS(&conn->ibc_refcount);
+ if (atomic_dec_and_test(&conn->ibc_refcount)) {
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ list_add_tail(&conn->ibc_list,
+ &kiblnd_data.kib_connd_zombies);
+ wake_up(&kiblnd_data.kib_connd_waitq);
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+ }
+}
void kiblnd_destroy_peer(struct kref *kref);
@@ -971,3 +975,33 @@ void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
int delayed, struct iov_iter *to, unsigned int rlen);
unsigned int kiblnd_get_dev_prio(struct lnet_ni *ni, unsigned int dev_idx);
+
+#define kiblnd_dump_conn_dbg(conn) \
+({ \
+ if (conn && conn->ibc_cmid) \
+ CDEBUG(D_NET, \
+ "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id %p qp_num 0x%x device_name %s\n", \
+ conn, \
+ conn->ibc_state, \
+ conn->ibc_noops_posted, \
+ conn->ibc_nsends_posted, \
+ conn->ibc_credits, \
+ conn->ibc_outstanding_credits, \
+ conn->ibc_reserved_credits, \
+ conn->ibc_comms_error, \
+ conn->ibc_cmid, \
+ conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0, \
+ conn->ibc_cmid->qp ? (conn->ibc_cmid->qp->device ? dev_name(&conn->ibc_cmid->qp->device->dev) : "NULL") : "NULL"); \
+ else if (conn) \
+ CDEBUG(D_NET, \
+ "conn %p state %d nposted %d/%d c/o/r %d/%d/%d ce %d : cm_id NULL\n", \
+ conn, \
+ conn->ibc_state, \
+ conn->ibc_noops_posted, \
+ conn->ibc_nsends_posted, \
+ conn->ibc_credits, \
+ conn->ibc_outstanding_credits, \
+ conn->ibc_reserved_credits, \
+ conn->ibc_comms_error \
+ ); \
+})
@@ -337,9 +337,12 @@ static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
- CDEBUG(D_NET, "Received %x[%d] from %s\n",
+ CDEBUG(D_NET, "Received %x[%d] nob %u cm_id %p qp_num 0x%x\n",
msg->ibm_type, credits,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ msg->ibm_nob,
+ conn->ibc_cmid,
+ conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0);
+ kiblnd_dump_conn_dbg(conn);
if (credits) {
/* Have I received credits that will let me send? */
@@ -760,8 +763,11 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
}
if (credit && !conn->ibc_credits) { /* no credits */
- CDEBUG(D_NET, "%s: no credits\n",
- libcfs_nid2str(peer_ni->ibp_nid));
+ CDEBUG(D_NET, "%s: no credits cm_id %p qp_num 0x%x\n",
+ libcfs_nid2str(peer_ni->ibp_nid),
+ conn->ibc_cmid,
+ conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0);
+ kiblnd_dump_conn_dbg(conn);
return -EAGAIN;
}
@@ -790,12 +796,22 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
kiblnd_tx_done(tx);
spin_lock(&conn->ibc_lock);
- CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+ CDEBUG(D_NET, "%s(%d): redundant or enough NOOP cm_id %p qp_num 0x%x\n",
libcfs_nid2str(peer_ni->ibp_nid),
- conn->ibc_noops_posted);
+ conn->ibc_noops_posted,
+ conn->ibc_cmid,
+ conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0);
+ kiblnd_dump_conn_dbg(conn);
return 0;
}
+ CDEBUG(D_NET, "Transmit %x[%d] nob %u cm_id %p qp_num 0x%x\n",
+ msg->ibm_type, credit,
+ msg->ibm_nob,
+ conn->ibc_cmid,
+ conn->ibc_cmid->qp ? conn->ibc_cmid->qp->qp_num : 0);
+ kiblnd_dump_conn_dbg(conn);
+
kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver,
conn->ibc_outstanding_credits,
peer_ni->ibp_nid, conn->ibc_incarnation);
@@ -1000,6 +1016,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_DROPPED;
tx->tx_waiting = 0; /* don't wait for peer_ni */
tx->tx_status = -EIO;
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+ kiblnd_dump_conn_dbg(conn);
+#endif
}
idle = !tx->tx_sending && /* This is the final callback */
@@ -1982,10 +2001,12 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
list_empty(&conn->ibc_tx_queue_rsrvd) &&
list_empty(&conn->ibc_tx_queue_nocred) &&
list_empty(&conn->ibc_active_txs)) {
- CDEBUG(D_NET, "closing conn to %s\n",
+ CDEBUG(D_NET, "closing conn %p to %s\n",
+ conn,
libcfs_nid2str(peer_ni->ibp_nid));
} else {
- CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+ CNETERR("Closing conn %p to %s: error %d%s%s%s%s%s\n",
+ conn,
libcfs_nid2str(peer_ni->ibp_nid), error,
list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
@@ -2660,11 +2681,11 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
cp.retry_count = *kiblnd_tunables.kib_retry_count;
cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
- CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+ CDEBUG(D_NET, "Accept %s conn %p\n", libcfs_nid2str(nid), conn);
rc = rdma_accept(cmid, &cp);
if (rc) {
- CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+ CNETERR("Can't accept %s: %d cm_id %p\n", libcfs_nid2str(nid), rc, cmid);
rej.ibr_version = version;
rej.ibr_why = IBLND_REJECT_FATAL;
@@ -3085,10 +3106,13 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
rc = rdma_connect(cmid, &cp);
if (rc) {
- CERROR("Can't connect to %s: %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), rc);
+ CNETERR("Can't connect to %s: %d cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), rc, cmid);
kiblnd_connreq_done(conn, rc);
kiblnd_conn_decref(conn);
+ } else {
+ CDEBUG(D_NET, "Connected to %s: cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), cmid);
}
return 0;
@@ -3112,13 +3136,13 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
rc = kiblnd_passive_connect(cmid,
(void *)KIBLND_CONN_PARAM(event),
KIBLND_CONN_PARAM_LEN(event));
- CDEBUG(D_NET, "connreq: %d\n", rc);
+ CDEBUG(D_NET, "connreq: %d cm_id %p\n", rc, cmid);
return rc;
case RDMA_CM_EVENT_ADDR_ERROR:
peer_ni = (struct kib_peer_ni *)cmid->context;
- CNETERR("%s: ADDR ERROR %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), event->status);
+ CNETERR("%s: ADDR ERROR %d cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), event->status, cmid);
kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH);
kiblnd_peer_decref(peer_ni);
return -EHOSTUNREACH; /* rc destroys cmid */
@@ -3126,13 +3150,13 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
case RDMA_CM_EVENT_ADDR_RESOLVED:
peer_ni = (struct kib_peer_ni *)cmid->context;
- CDEBUG(D_NET, "%s Addr resolved: %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), event->status);
+ CDEBUG(D_NET, "%s Addr resolved: %d cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), event->status, cmid);
if (event->status) {
- CNETERR("Can't resolve address for %s: %d\n",
+ CNETERR("Can't resolve address for %s: %d cm_id %p\n",
libcfs_nid2str(peer_ni->ibp_nid),
- event->status);
+ event->status, cmid);
rc = event->status;
} else {
rc = rdma_resolve_route(cmid,
@@ -3151,8 +3175,8 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
}
/* Can't initiate route resolution */
- CERROR("Can't resolve route for %s: %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), rc);
+ CNETERR("Can't resolve route for %s: %d cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), rc, cmid);
}
kiblnd_peer_connect_failed(peer_ni, 1, rc);
kiblnd_peer_decref(peer_ni);
@@ -3160,8 +3184,8 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
case RDMA_CM_EVENT_ROUTE_ERROR:
peer_ni = (struct kib_peer_ni *)cmid->context;
- CNETERR("%s: ROUTE ERROR %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), event->status);
+ CNETERR("%s: ROUTE ERROR %d cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), event->status, cmid);
kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH);
kiblnd_peer_decref(peer_ni);
return -EHOSTUNREACH; /* rc destroys cmid */
@@ -3174,17 +3198,15 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
if (!event->status)
return kiblnd_active_connect(cmid);
- CNETERR("Can't resolve route for %s: %d\n",
- libcfs_nid2str(peer_ni->ibp_nid), event->status);
+ CNETERR("Can't resolve route for %s: %d cm_id %p\n",
+ libcfs_nid2str(peer_ni->ibp_nid), event->status, cmid);
kiblnd_peer_connect_failed(peer_ni, 1, event->status);
kiblnd_peer_decref(peer_ni);
return event->status; /* rc destroys cmid */
case RDMA_CM_EVENT_UNREACHABLE:
- CNETERR("%s: UNREACHABLE %d, ibc_state: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- event->status,
- conn->ibc_state);
+ CNETERR("%s: UNREACHABLE %d cm_id %p conn %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status, cmid, conn);
LASSERT(conn->ibc_state != IBLND_CONN_ESTABLISHED &&
conn->ibc_state != IBLND_CONN_INIT);
if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
@@ -3198,8 +3220,8 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
conn = (struct kib_conn *)cmid->context;
LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
- CNETERR("%s: CONNECT ERROR %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+ CNETERR("%s: CONNECT ERROR %d cm_id %p conn %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status, cmid, conn);
kiblnd_connreq_done(conn, -ENOTCONN);
kiblnd_conn_decref(conn);
return 0;
@@ -3211,9 +3233,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
LBUG();
case IBLND_CONN_PASSIVE_WAIT:
- CERROR("%s: REJECTED %d\n",
+ CERROR("%s: REJECTED %d cm_id %p\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid),
- event->status);
+ event->status, cmid);
kiblnd_connreq_done(conn, -ECONNRESET);
break;
@@ -3233,14 +3255,14 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
LBUG();
case IBLND_CONN_PASSIVE_WAIT:
- CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CDEBUG(D_NET, "ESTABLISHED (passive): %s cm_id %p conn %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), cmid, conn);
kiblnd_connreq_done(conn, 0);
break;
case IBLND_CONN_ACTIVE_CONNECT:
- CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CDEBUG(D_NET, "ESTABLISHED(active): %s cm_id %p conn %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), cmid, conn);
kiblnd_check_connreply(conn,
(void *)KIBLND_CONN_PARAM(event),
KIBLND_CONN_PARAM_LEN(event));
@@ -3255,8 +3277,8 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
case RDMA_CM_EVENT_DISCONNECTED:
conn = (struct kib_conn *)cmid->context;
if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- CERROR("%s DISCONNECTED\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CERROR("%s DISCONNECTED cm_id %p conn %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), cmid, conn);
kiblnd_connreq_done(conn, -ECONNRESET);
} else {
kiblnd_close_conn(conn, 0);
@@ -3372,6 +3394,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
conn->ibc_credits,
conn->ibc_outstanding_credits,
conn->ibc_reserved_credits);
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+ kiblnd_dump_conn_dbg(conn);
+#endif
list_add(&conn->ibc_connd_list, &closes);
} else {
list_add(&conn->ibc_connd_list, &checksends);
@@ -3425,7 +3450,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
LASSERT(!in_interrupt());
LASSERT(current == kiblnd_data.kib_connd);
LASSERT(conn->ibc_state == IBLND_CONN_CLOSING);
-
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+ kiblnd_dump_conn_dbg(conn);
+#endif
rdma_disconnect(conn->ibc_cmid);
kiblnd_finalise_conn(conn);
@@ -3716,6 +3743,7 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
(conn->ibc_nrx > 0 ||
conn->ibc_nsends_posted > 0)) {
kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+ kiblnd_dump_conn_dbg(conn);
conn->ibc_scheduled = 1;
list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
@@ -3788,8 +3816,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
rc = ib_req_notify_cq(conn->ibc_cq,
IB_CQ_NEXT_COMP);
if (rc < 0) {
- CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ CWARN("%s: ib_req_notify_cq failed: %d, closing connection %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ rc, conn);
kiblnd_close_conn(conn, -EIO);
kiblnd_conn_decref(conn);
spin_lock_irqsave(&sched->ibs_lock,
@@ -3810,9 +3839,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
}
if (rc < 0) {
- CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
+ CWARN("%s: ib_poll_cq failed: %d, closing connection %p\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid),
- rc);
+ rc, conn);
kiblnd_close_conn(conn, -EIO);
kiblnd_conn_decref(conn);
spin_lock_irqsave(&sched->ibs_lock, flags);