@@ -445,6 +445,7 @@ void lnet_res_lh_initialize(struct lnet_res_container *rec,
rspt = kzalloc(sizeof(*rspt), GFP_NOFS);
lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->rst_alloc++;
lnet_net_unlock(cpt);
return rspt;
}
@@ -454,6 +455,7 @@ void lnet_res_lh_initialize(struct lnet_res_container *rec,
{
kfree(rspt);
lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->rst_alloc--;
lnet_net_unlock(cpt);
}
@@ -278,11 +278,24 @@ struct lnet_ping_info {
struct lnet_counters {
__u32 msgs_alloc;
__u32 msgs_max;
+ __u32 rst_alloc;
__u32 errors;
__u32 send_count;
__u32 recv_count;
__u32 route_count;
__u32 drop_count;
+ __u32 resend_count;
+ __u32 response_timeout_count;
+ __u32 local_interrupt_count;
+ __u32 local_dropped_count;
+ __u32 local_aborted_count;
+ __u32 local_no_route_count;
+ __u32 local_timeout_count;
+ __u32 local_error_count;
+ __u32 remote_dropped_count;
+ __u32 remote_error_count;
+ __u32 remote_timeout_count;
+ __u32 network_timeout_count;
__u64 send_length;
__u64 recv_length;
__u64 route_length;
@@ -694,7 +694,20 @@ static void lnet_assert_wire_constants(void)
cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
counters->msgs_max += ctr->msgs_max;
counters->msgs_alloc += ctr->msgs_alloc;
+ counters->rst_alloc += ctr->rst_alloc;
counters->errors += ctr->errors;
+ counters->resend_count += ctr->resend_count;
+ counters->response_timeout_count += ctr->response_timeout_count;
+ counters->local_interrupt_count += ctr->local_interrupt_count;
+ counters->local_dropped_count += ctr->local_dropped_count;
+ counters->local_aborted_count += ctr->local_aborted_count;
+ counters->local_no_route_count += ctr->local_no_route_count;
+ counters->local_timeout_count += ctr->local_timeout_count;
+ counters->local_error_count += ctr->local_error_count;
+ counters->remote_dropped_count += ctr->remote_dropped_count;
+ counters->remote_error_count += ctr->remote_error_count;
+ counters->remote_timeout_count += ctr->remote_timeout_count;
+ counters->network_timeout_count += ctr->network_timeout_count;
counters->send_count += ctr->send_count;
counters->recv_count += ctr->recv_count;
counters->route_count += ctr->route_count;
@@ -2501,6 +2501,10 @@ struct lnet_mt_event_info {
md->md_rspt_ptr = NULL;
lnet_res_unlock(i);
+ lnet_net_lock(i);
+ the_lnet.ln_counters[i]->response_timeout_count++;
+ lnet_net_unlock(i);
+
list_del_init(&rspt->rspt_on_list);
CDEBUG(D_NET,
@@ -2567,6 +2571,11 @@ struct lnet_mt_event_info {
lnet_peer_ni_decref_locked(lpni);
lnet_net_unlock(cpt);
+ CDEBUG(D_NET, "resending %s->%s: %s recovery %d\n",
+ libcfs_nid2str(src_nid),
+ libcfs_id2str(msg->msg_target),
+ lnet_msgtyp2str(msg->msg_type),
+ msg->msg_recovery);
rc = lnet_send(src_nid, msg, LNET_NID_ANY);
if (rc) {
CERROR("Error sending %s to %s: %d\n",
@@ -2576,6 +2585,8 @@ struct lnet_mt_event_info {
lnet_finalize(msg, rc);
}
lnet_net_lock(cpt);
+ if (!rc)
+ the_lnet.ln_counters[cpt]->resend_count++;
}
}
}
@@ -546,41 +546,52 @@
{
struct lnet_ni *ni = msg->msg_txni;
struct lnet_peer_ni *lpni = msg->msg_txpeer;
+ struct lnet_counters *counters = the_lnet.ln_counters[0];
switch (hstatus) {
case LNET_MSG_STATUS_LOCAL_INTERRUPT:
atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+ counters->local_interrupt_count++;
break;
case LNET_MSG_STATUS_LOCAL_DROPPED:
atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+ counters->local_dropped_count++;
break;
case LNET_MSG_STATUS_LOCAL_ABORTED:
atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+ counters->local_aborted_count++;
break;
case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+ counters->local_no_route_count++;
break;
case LNET_MSG_STATUS_LOCAL_TIMEOUT:
atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+ counters->local_timeout_count++;
break;
case LNET_MSG_STATUS_LOCAL_ERROR:
atomic_inc(&ni->ni_hstats.hlt_local_error);
+ counters->local_error_count++;
break;
case LNET_MSG_STATUS_REMOTE_DROPPED:
if (lpni)
atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+ counters->remote_dropped_count++;
break;
case LNET_MSG_STATUS_REMOTE_ERROR:
if (lpni)
atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+ counters->remote_error_count++;
break;
case LNET_MSG_STATUS_REMOTE_TIMEOUT:
if (lpni)
atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+ counters->remote_timeout_count++;
break;
case LNET_MSG_STATUS_NETWORK_TIMEOUT:
if (lpni)
atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
+ counters->network_timeout_count++;
break;
case LNET_MSG_STATUS_OK:
break;
@@ -601,6 +612,10 @@
enum lnet_msg_hstatus hstatus = msg->msg_health_status;
bool lo = false;
+ /* if we're shutting down no point in handling health. */
+ if (the_lnet.ln_state != LNET_STATE_RUNNING)
+ return -1;
+
LASSERT(msg->msg_txni);
/* if we're sending to the LOLND then the msg_txpeer will not be
@@ -611,15 +626,18 @@
else
lo = true;
- lnet_incr_hstats(msg, hstatus);
-
if (hstatus != LNET_MSG_STATUS_OK &&
ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
return -1;
- /* if we're shutting down no point in handling health. */
- if (the_lnet.ln_state != LNET_STATE_RUNNING)
- return -1;
+ /* stats are only incremented for errors so avoid wasting time
+ * incrementing statistics if there is no error.
+ */
+ if (hstatus != LNET_MSG_STATUS_OK) {
+ lnet_net_lock(0);
+ lnet_incr_hstats(msg, hstatus);
+ lnet_net_unlock(0);
+ }
CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
libcfs_nid2str(msg->msg_txni->ni_nid),