@@ -338,6 +338,22 @@ struct lnet_element_stats {
struct lnet_comm_count el_drop_stats;
};
+struct lnet_health_local_stats { /* local health failure counters, embedded in struct lnet_ni as ni_hstats */
+ atomic_t hlt_local_interrupt; /* bumped on LNET_MSG_STATUS_LOCAL_INTERRUPT */
+ atomic_t hlt_local_dropped; /* bumped on LNET_MSG_STATUS_LOCAL_DROPPED */
+ atomic_t hlt_local_aborted; /* bumped on LNET_MSG_STATUS_LOCAL_ABORTED */
+ atomic_t hlt_local_no_route; /* bumped on LNET_MSG_STATUS_LOCAL_NO_ROUTE */
+ atomic_t hlt_local_timeout; /* bumped on LNET_MSG_STATUS_LOCAL_TIMEOUT */
+ atomic_t hlt_local_error; /* bumped on LNET_MSG_STATUS_LOCAL_ERROR */
+};
+
+struct lnet_health_remote_stats { /* remote health failure counters, embedded in struct lnet_peer_ni as lpni_hstats */
+ atomic_t hlt_remote_dropped; /* bumped on LNET_MSG_STATUS_REMOTE_DROPPED */
+ atomic_t hlt_remote_timeout; /* bumped on LNET_MSG_STATUS_REMOTE_TIMEOUT */
+ atomic_t hlt_remote_error; /* bumped on LNET_MSG_STATUS_REMOTE_ERROR */
+ atomic_t hlt_network_timeout; /* bumped on LNET_MSG_STATUS_NETWORK_TIMEOUT */
+};
+
struct lnet_net {
/* chain on the ln_nets */
struct list_head net_list;
@@ -426,6 +442,7 @@ struct lnet_ni {
/* NI statistics */
struct lnet_element_stats ni_stats;
+ struct lnet_health_local_stats ni_hstats;
/* physical device CPT */
int ni_dev_cpt;
@@ -511,6 +528,7 @@ struct lnet_peer_ni {
struct list_head lpni_rtr_list;
/* statistics kept on each peer NI */
struct lnet_element_stats lpni_stats;
+ struct lnet_health_remote_stats lpni_hstats;
/* spin lock protecting credits and lpni_txq / lpni_rtrq */
spinlock_t lpni_lock;
/* # tx credits available */
@@ -541,6 +541,54 @@
lnet_net_unlock(0);
}
+static void
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{
+	struct lnet_peer_ni *peer = msg->msg_txpeer;
+	atomic_t *cnt = NULL;
+
+	/* local errors are counted on the tx NI, all others on the tx peer */
+	switch (hstatus) {
+	case LNET_MSG_STATUS_OK:
+		break;
+	case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+		cnt = &msg->msg_txni->ni_hstats.hlt_local_interrupt;
+		break;
+	case LNET_MSG_STATUS_LOCAL_DROPPED:
+		cnt = &msg->msg_txni->ni_hstats.hlt_local_dropped;
+		break;
+	case LNET_MSG_STATUS_LOCAL_ABORTED:
+		cnt = &msg->msg_txni->ni_hstats.hlt_local_aborted;
+		break;
+	case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+		cnt = &msg->msg_txni->ni_hstats.hlt_local_no_route;
+		break;
+	case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+		cnt = &msg->msg_txni->ni_hstats.hlt_local_timeout;
+		break;
+	case LNET_MSG_STATUS_LOCAL_ERROR:
+		cnt = &msg->msg_txni->ni_hstats.hlt_local_error;
+		break;
+	case LNET_MSG_STATUS_REMOTE_DROPPED:
+		cnt = peer ? &peer->lpni_hstats.hlt_remote_dropped : NULL;
+		break;
+	case LNET_MSG_STATUS_REMOTE_ERROR:
+		cnt = peer ? &peer->lpni_hstats.hlt_remote_error : NULL;
+		break;
+	case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+		cnt = peer ? &peer->lpni_hstats.hlt_remote_timeout : NULL;
+		break;
+	case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+		cnt = peer ? &peer->lpni_hstats.hlt_network_timeout : NULL;
+		break;
+	default:
+		LBUG();
+	}
+	/* cnt is still NULL for OK and for peer statuses without a tx peer */
+	if (cnt)
+		atomic_inc(cnt);
+}
+
/* Do a health check on the message:
* return -1 if we're not going to handle the error or
* if we've reached the maximum number of retries.
@@ -553,8 +601,6 @@
enum lnet_msg_hstatus hstatus = msg->msg_health_status;
bool lo = false;
- /* TODO: lnet_incr_hstats(hstatus); */
-
LASSERT(msg->msg_txni);
/* if we're sending to the LOLND then the msg_txpeer will not be
@@ -565,6 +611,8 @@
else
lo = true;
+ lnet_incr_hstats(msg, hstatus);
+
if (hstatus != LNET_MSG_STATUS_OK &&
ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
return -1;