===================================================================
@@ -785,6 +785,7 @@ static void mcast_event_handler(struct i
case IB_EVENT_PORT_ERR:
case IB_EVENT_LID_CHANGE:
case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_DEVICE_FATAL:
case IB_EVENT_CLIENT_REREGISTER:
mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
break;
===================================================================
@@ -443,6 +443,7 @@ static void ib_sa_event(struct ib_event_
event->event == IB_EVENT_LID_CHANGE ||
event->event == IB_EVENT_PKEY_CHANGE ||
event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_DEVICE_FATAL ||
event->event == IB_EVENT_CLIENT_REREGISTER) {
unsigned long flags;
struct ib_sa_device *sa_dev =
===================================================================
@@ -289,6 +289,7 @@ void ipoib_event(struct ib_event_handler
queue_work(ipoib_workqueue, &priv->flush_light);
} else if (record->event == IB_EVENT_PORT_ERR ||
record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_DEVICE_FATAL ||
record->event == IB_EVENT_LID_CHANGE) {
queue_work(ipoib_workqueue, &priv->flush_normal);
} else if (record->event == IB_EVENT_PKEY_CHANGE) {
This patch avoids the following hang:

kernel: Call Trace:
kernel: [C0000000FF9E34D0] [C0000000FF9E3560] 0xc0000000ff9e3560 (unreliable)
kernel: [C0000000FF9E36A0] [C00000000001070C] .__switch_to+0x124/0x148
kernel: [C0000000FF9E3730] [C0000000003E6D30] .schedule+0xc10/0xdc4
kernel: [C0000000FF9E3840] [C0000000003E7024] .wait_for_completion+0xcc/0x150
kernel: [C0000000FF9E3900] [D000000000882288] .mcast_remove_one+0x8c/0xe8 [ib_sa]
kernel: [C0000000FF9E39A0] [D0000000004E404C] .ib_unregister_device+0x64/0x15c [ib_core]
kernel: [C0000000FF9E3A40] [D000000000542A4C] .mlx4_ib_remove+0x50/0x148 [mlx4_ib]
kernel: [C0000000FF9E3AD0] [D0000000004A6EBC] .mlx4_remove_device+0xa0/0xf0 [mlx4_core]
kernel: [C0000000FF9E3B60] [D0000000004A73F0] .mlx4_unregister_device+0x44/0xa8 [mlx4_core]
kernel: [C0000000FF9E3BF0] [D0000000004AA0A8] .mlx4_remove_one+0x40/0x1bc [mlx4_core]
kernel: [C0000000FF9E3C80] [D0000000004AA240] .mlx4_pci_err_detected+0x1c/0x48 [mlx4_core]
kernel: [C0000000FF9E3D10] [C000000000053E84] .eeh_report_error+0x70/0xb4
kernel: [C0000000FF9E3DA0] [C0000000001DCB18] .pci_walk_bus+0xf8/0x168
kernel: [C0000000FF9E3E50] [C000000000054254] .handle_eeh_events+0x1a8/0x3d0
kernel: [C0000000FF9E3F00] [C000000000054580] .eeh_event_handler+0xc0/0x160
kernel: [C0000000FF9E3F90] [C000000000027A3C] .kernel_thread+0x4c/0x68

Add the IB_EVENT_DEVICE_FATAL event to the ib_sa, multicast and ipoib
event handlers so that, on a fatal device error, the event handlers move
multicast groups that are in the joined state out of that state. This
decrements the counter that otherwise causes this hang.

Signed-off-by: Carol Soto <clsoto@linux.vnet.ibm.com>
---
 drivers/infiniband/core/multicast.c        | 1 +
 drivers/infiniband/core/sa_query.c         | 1 +
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 1 +
 3 files changed, 3 insertions(+)