@@ -1040,4 +1040,16 @@ u32 lnet_sum_stats(struct lnet_element_stats *stats,
void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
struct lnet_element_stats *stats);
+/*
+ * Record the new cached aliveness of @route and log the transition when
+ * the value actually changes.
+ *
+ * The previous value is fetched and replaced in a single atomic_xchg(),
+ * so the comparison below uses the value that was stored immediately
+ * before this update.
+ */
+static inline void
+lnet_set_route_aliveness(struct lnet_route *route, bool alive)
+{
+	bool was_alive = atomic_xchg(&route->lr_alive, alive);
+
+	/* Nothing to report when the state did not change */
+	if (was_alive == alive)
+		return;
+
+	CERROR("route to %s through %s has gone from %s to %s\n",
+	       libcfs_net2str(route->lr_net),
+	       libcfs_nid2str(route->lr_gateway->lp_primary_nid),
+	       was_alive ? "up" : "down",
+	       alive ? "up" : "down");
+}
#endif
@@ -868,7 +868,7 @@ struct lnet_route {
/* route priority */
unsigned int lr_priority;
/* cached route aliveness */
- bool lr_alive;
+ atomic_t lr_alive;
/* this route is single-hop */
bool lr_single_hop;
};
@@ -4360,7 +4360,12 @@ void lnet_monitor_thr_stop(void)
goto drop;
}
- if (lnet_drop_asym_route && for_me &&
+ /* If this message was forwarded to us from a router then we may need
+ * to update router aliveness or check for an asymmetrical route
+ * (or both)
+ */
+ if (((lnet_drop_asym_route && for_me) ||
+ !lpni->lpni_peer_net->lpn_peer->lp_alive) &&
LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) {
u32 src_net_id = LNET_NIDNET(src_nid);
struct lnet_peer *gw = lpni->lpni_peer_net->lpn_peer;
@@ -4370,10 +4375,24 @@ void lnet_monitor_thr_stop(void)
list_for_each_entry(route, &gw->lp_routes, lr_gwlist) {
if (route->lr_net == src_net_id) {
found = true;
- break;
+ /* If we're transitioning the gateway from
+ * dead -> alive, and discovery is disabled
+ * locally or on the gateway, then we need to
+ * update the cached route aliveness for each
+ * route to the src_nid's net.
+ *
+ * Otherwise, we're only checking for a
+ * symmetrical route, and we can break out
+ * of the loop.
+ */
+ if (!gw->lp_alive &&
+ lnet_is_discovery_disabled(gw))
+ lnet_set_route_aliveness(route, true);
+ else
+ break;
}
}
- if (!found) {
+ if (lnet_drop_asym_route && for_me && !found) {
lnet_net_unlock(cpt);
/* we would not use from_nid to route a message to
* src_nid
@@ -4385,6 +4404,18 @@ void lnet_monitor_thr_stop(void)
kfree(msg);
goto drop;
}
+ if (!gw->lp_alive) {
+ struct lnet_peer_net *lpn;
+ struct lnet_peer_ni *lpni2;
+
+ gw->lp_alive = true;
+ /* Mark all remote NIs on src_nid's net UP */
+ lpn = lnet_peer_get_net_locked(gw, src_net_id);
+ if (lpn)
+ list_for_each_entry(lpni2, &lpn->lpn_peer_nis,
+ lpni_peer_nis)
+ lpni2->lpni_ns_status = LNET_NI_STATUS_UP;
+ }
}
lpni->lpni_last_alive = ktime_get_seconds();
@@ -303,7 +303,7 @@ bool lnet_is_route_alive(struct lnet_route *route)
* enabled.
*/
if (lnet_is_discovery_disabled(gw))
- return route->lr_alive;
+ return atomic_read(&route->lr_alive) == 1;
/* check the gateway's interfaces on the local network */
llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
@@ -394,21 +394,6 @@ bool lnet_is_route_alive(struct lnet_route *route)
}
/* Must hold net_lock/EX */
-static inline void
-lnet_set_route_aliveness(struct lnet_route *route, bool alive)
-{
- /* Log when there's a state change */
- if (route->lr_alive != alive) {
- CERROR("route to %s through %s has gone from %s to %s\n",
- libcfs_net2str(route->lr_net),
- libcfs_nid2str(route->lr_gateway->lp_primary_nid),
- (route->lr_alive) ? "up" : "down",
- alive ? "up" : "down");
- route->lr_alive = alive;
- }
-}
-
-/* Must hold net_lock/EX */
void
lnet_router_discovery_ping_reply(struct lnet_peer *lp)
{
@@ -706,6 +691,10 @@ static void lnet_shuffle_seed(void)
route->lr_nid = gateway;
route->lr_priority = priority;
route->lr_hops = hops;
+ if (lnet_peers_start_down())
+ atomic_set(&route->lr_alive, 0);
+ else
+ atomic_set(&route->lr_alive, 1);
lnet_net_lock(LNET_LOCK_EX);
@@ -1770,14 +1759,8 @@ bool lnet_router_checker_active(void)
*/
if (lnet_is_discovery_disabled(lp)) {
list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
- if (route->lr_nid == lpni->lpni_nid &&
- route->lr_alive != alive) {
- lnet_net_unlock(0);
- lnet_net_lock(LNET_LOCK_EX);
+ if (route->lr_nid == lpni->lpni_nid)
lnet_set_route_aliveness(route, alive);
- lnet_net_unlock(LNET_LOCK_EX);
- lnet_net_lock(0);
- }
}
}
}