@@ -678,7 +678,8 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
* may drop the lnet_net_lock
*/
static int
-lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
+lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp,
+ struct lnet_msg *msg)
{
time64_t now = ktime_get_seconds();
@@ -689,6 +690,13 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
return 1;
/*
+ * If we're resending a message, let's attempt to send it even if
+ * the peer is down to fulfill our resend quota on the message
+ */
+ if (msg->msg_retry_count > 0)
+ return 1;
+
+ /*
* Peer appears dead, but we should avoid frequent NI queries (at
* most once per lnet_queryinterval seconds).
*/
@@ -746,7 +754,7 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
/* NB 'lp' is always the next hop */
if (!(msg->msg_target.pid & LNET_PID_USERFLAG) &&
- !lnet_peer_alive_locked(ni, lp)) {
+ !lnet_peer_alive_locked(ni, lp, msg)) {
the_lnet.ln_counters[cpt]->drop_count++;
the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
lnet_net_unlock(cpt);
@@ -1042,7 +1042,13 @@ int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg)
}
rcd = rtr->lpni_rcd;
- if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis)
+
/* The response to the router checker ping could've timed out and
* the MD handle (rcd_mdh) might've been invalidated, in which case
* the router checker data must be refreshed before it is used.
*/
+ if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis ||
+ LNetMDHandleIsInvalid(rcd->rcd_mdh))
rcd = lnet_update_rc_data_locked(rtr);
if (!rcd)
return;