@@ -52,6 +52,12 @@
#define LNET_MAX_IOV (LNET_MAX_PAYLOAD >> PAGE_SHIFT)
+/*
+ * Maximum health value of an NI.
+ * All local and peer NIs are created with their health set to this value.
+ */
+#define LNET_MAX_HEALTH_VALUE 1000
+
/* forward refs */
struct lnet_libmd;
@@ -388,6 +394,15 @@ struct lnet_ni {
u32 ni_seq;
/*
+ * Health value of this NI.
+ * Initialized to LNET_MAX_HEALTH_VALUE.
+ * The value is decremented every time we fail to send a message
+ * over this NI because of an NI-specific failure.
+ * The value is incremented when we successfully send a message.
+ */
+ atomic_t ni_healthv;
+
+ /*
* equivalent interfaces to use
* This is an array because socklnd bonding can still be configured
*/
@@ -1817,6 +1817,7 @@ static void lnet_push_target_fini(void)
atomic_set(&ni->ni_tx_credits,
lnet_ni_tq_credits(ni) * ni->ni_ncpts);
+ atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE);
CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
libcfs_nid2str(ni->ni_nid),
@@ -1276,6 +1276,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
struct lnet_ni *ni = NULL;
unsigned int shortest_distance;
int best_credits;
+ int best_healthv;
/* If there is no peer_ni that we can send to on this network,
* then there is no point in looking for a new best_ni here.
@@ -1286,20 +1287,21 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
if (!best_ni) {
shortest_distance = UINT_MAX;
best_credits = INT_MIN;
+ best_healthv = 0;
} else {
shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
best_ni->ni_dev_cpt);
best_credits = atomic_read(&best_ni->ni_tx_credits);
+ best_healthv = atomic_read(&best_ni->ni_healthv);
}
while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
unsigned int distance;
int ni_credits;
-
- if (!lnet_is_ni_healthy_locked(ni))
- continue;
+ int ni_healthv;
ni_credits = atomic_read(&ni->ni_tx_credits);
+ ni_healthv = atomic_read(&ni->ni_healthv);
/*
* calculate the distance from the CPT on which
@@ -1325,21 +1327,24 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
distance = lnet_numa_range;
/*
- * Select on shorter distance, then available
+ * Select on health, shorter distance, available
* credits, then round-robin.
*/
- if (distance > shortest_distance) {
+ if (ni_healthv < best_healthv) {
+ continue;
+ } else if (distance > shortest_distance) {
continue;
} else if (distance < shortest_distance) {
shortest_distance = distance;
} else if (ni_credits < best_credits) {
continue;
} else if (ni_credits == best_credits) {
- if (best_ni && (best_ni)->ni_seq <= ni->ni_seq)
+ if (best_ni && best_ni->ni_seq <= ni->ni_seq)
continue;
}
best_ni = ni;
best_credits = ni_credits;
+ best_healthv = ni_healthv;
}
CDEBUG(D_NET, "selected best_ni %s\n",