diff mbox series

[25/26] lnet: o2iblnd: clear fatal error on successful failover

Message ID 1627933851-7603-31-git-send-email-jsimmons@infradead.org (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

James Simmons Aug. 2, 2021, 7:50 p.m. UTC
From: Serguei Smirnov <ssmirnov@whamcloud.com>

In an IB bonding configuration, a link-down event causes the fatal
error flag to be set on the bonded interface, so LNet does not select
it for tx, e.g. when just one of the two cables is pulled.
This change allows the interface status to be restored after a
successful failover, provided the link is reported as up.

WC-bug-id: https://jira.whamcloud.com/browse/LU-14806
Lustre-commit: 4668283cd13079dd ("LU-14806 o2iblnd: clear fatal error on successful failover")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/44139
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 net/lnet/klnds/o2iblnd/o2iblnd.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)
diff mbox series

Patch

diff --git a/net/lnet/klnds/o2iblnd/o2iblnd.c b/net/lnet/klnds/o2iblnd/o2iblnd.c
index 3141953..686581a 100644
--- a/net/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/net/lnet/klnds/o2iblnd/o2iblnd.c
@@ -1487,6 +1487,21 @@  static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
 	}
 }
 
+static int kiblnd_get_link_status(struct net_device *dev)
+{
+	int ret = -1;
+
+	LASSERT(dev);
+
+	if (!netif_running(dev))
+		ret = 0;
+	/* get_link is optional; without it ret keeps its initial -1 */
+	else if (dev->ethtool_ops->get_link)
+		ret = dev->ethtool_ops->get_link(dev);
+
+	return ret;
+}
+
 static int
 kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts,
 			struct kib_net *net,
@@ -2347,6 +2362,7 @@  int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 	struct ib_pd *pd;
 	struct kib_net *net;
 	struct sockaddr_in addr;
+	struct net_device *netdev;
 	unsigned long flags;
 	int rc = 0;
 	int i;
@@ -2467,11 +2483,18 @@  int kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 	if (hdev)
 		kiblnd_hdev_decref(hdev);
 
-	if (rc)
+	if (rc) {
 		dev->ibd_failed_failover++;
-	else
+	} else {
 		dev->ibd_failed_failover = 0;
 
+		rcu_read_lock();
+		netdev = dev_get_by_name_rcu(ns, dev->ibd_ifname);
+		if (netdev && (kiblnd_get_link_status(netdev) == 1))
+			kiblnd_set_ni_fatal_on(dev->ibd_hdev, 0);
+		rcu_read_unlock();
+	}
+
 	return rc;
 }