@@ -672,6 +672,9 @@ struct lnet_peer {
/* tasks waiting on discovery of this peer */
wait_queue_head_t lp_dc_waitq;
+
+ /* cached peer aliveness */
+ bool lp_alive;
};
/*
@@ -216,6 +216,10 @@
init_waitqueue_head(&lp->lp_dc_waitq);
spin_lock_init(&lp->lp_lock);
lp->lp_primary_nid = nid;
+ if (lnet_peers_start_down())
+ lp->lp_alive = false;
+ else
+ lp->lp_alive = true;
/* all peers created on a router should have health on
* if it's not already on.
@@ -179,7 +179,9 @@ static int rtr_sensitivity_set(const char *val,
return check_routers_before_use;
}
-/* A net is alive if at least one gateway NI on the network is alive. */
+/* The peer_net of a gateway is alive if at least one of the peer_ni's on
+ * that peer_net is alive.
+ */
static bool
lnet_is_gateway_net_alive(struct lnet_peer_net *lpn)
{
@@ -200,6 +202,9 @@ bool lnet_is_gateway_alive(struct lnet_peer *gw)
{
struct lnet_peer_net *lpn;
+ if (!gw->lp_alive)
+ return false;
+
list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
if (!lnet_is_gateway_net_alive(lpn))
return false;
@@ -219,7 +224,10 @@ bool lnet_is_route_alive(struct lnet_route *route)
struct lnet_peer *gw = route->lr_gateway;
struct lnet_peer_net *llpn;
struct lnet_peer_net *rlpn;
- bool route_alive;
+
+ /* If the gateway is down then all routes are considered down */
+ if (!gw->lp_alive)
+ return false;
/* if discovery is disabled then rely on the cached aliveness
* information. This is handicapped information which we log when
@@ -230,36 +238,34 @@ bool lnet_is_route_alive(struct lnet_route *route)
if (lnet_is_discovery_disabled(gw))
return route->lr_alive;
- /* check the gateway's interfaces on the route rnet to make sure
- * that the gateway is viable.
- */
+ /* check the gateway's interfaces on the local network */
llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
if (!llpn)
return false;
- route_alive = lnet_is_gateway_net_alive(llpn);
+ if (!lnet_is_gateway_net_alive(llpn))
+ return false;
if (avoid_asym_router_failure) {
+ /* Check the gateway's interfaces on the remote network */
rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
if (!rlpn)
return false;
- route_alive = route_alive &&
- lnet_is_gateway_net_alive(rlpn);
+ if (!lnet_is_gateway_net_alive(rlpn))
+ return false;
}
- if (!route_alive)
- return route_alive;
-
spin_lock(&gw->lp_lock);
if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) {
+ spin_unlock(&gw->lp_lock);
if (gw->lp_rtr_refcount > 0)
CERROR("peer %s is being used as a gateway but routing feature is not turned on\n",
libcfs_nid2str(gw->lp_primary_nid));
- route_alive = false;
+ return false;
}
spin_unlock(&gw->lp_lock);
- return route_alive;
+ return true;
}
void
@@ -409,21 +415,22 @@ bool lnet_is_route_alive(struct lnet_route *route)
spin_lock(&lp->lp_lock);
lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY;
lp->lp_state |= LNET_PEER_RTR_DISCOVERED;
+ lp->lp_alive = lp->lp_dc_error == 0;
spin_unlock(&lp->lp_lock);
/* Router discovery successful? All peer information would've been
* updated already. No need to do any more processing
*/
- if (!lp->lp_dc_error)
+ if (lp->lp_alive)
return;
- /* discovery failed? then we need to set the status of each lpni
- * to DOWN. It will be updated the next time we discover the
- * router. For router peer NIs not on local networks, we never send
- * messages directly to them, so their health will always remain
- * at maximum. We can only tell if they are up or down from the
- * status returned in the PING response. If we fail to get that
- * status in our scheduled router discovery, then we'll assume
- * it's down until we're told otherwise.
+
+ /* We do not send messages directly to the remote interfaces
+ * of an LNet router. As such, we rely on the PING response
+ * to determine the up/down status of these interfaces. If
+ * a PING response is not received, or some other problem with
+ * discovery occurs that prevents us from getting this status,
+ * we assume all interfaces are down until we're able to
+ * determine otherwise.
*/
CDEBUG(D_NET, "%s: Router discovery failed %d\n",
libcfs_nid2str(lp->lp_primary_nid), lp->lp_dc_error);
@@ -1629,6 +1636,7 @@ bool lnet_router_checker_active(void)
lnet_peer_ni_decref_locked(lpni);
if (lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer) {
lp = lpni->lpni_peer_net->lpn_peer;
+ lp->lp_alive = alive;
list_for_each_entry(route, &lp->lp_routes, lr_gwlist)
lnet_set_route_aliveness(route, alive);
}