diff mbox series

[for-rc,3/4] IB/hfi1: Fix early init panic

Message ID 20211129192003.101968.33612.stgit@awfm-01.cornelisnetworks.com (mailing list archive)
State Accepted
Delegated to: Jason Gunthorpe
Headers show
Series Some more RC fixes for 5.16 | expand

Commit Message

Dennis Dalessandro Nov. 29, 2021, 7:20 p.m. UTC
From: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>

The following trace can be observed after an init failure,
such as a firmware load failure:

[   18.421033] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[   18.430189] PGD 0 P4D 0
[   18.433435] Oops: 0010 [#1] SMP PTI
[   18.437715] CPU: 0 PID: 537 Comm: kworker/0:3 Tainted: G           OE    --------- -  - 4.18.0-240.el8.x86_64 #1
[   18.461788] Workqueue: events work_for_cpu_fn
[   18.467104] RIP: 0010:0x0
[   18.470493] Code: Bad RIP value.
[   18.474549] RSP: 0000:ffffae5f878a3c98 EFLAGS: 00010046
[   18.480819] RAX: 0000000000000000 RBX: ffff95e48e025c00 RCX: 0000000000000000
[   18.489243] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff95e48e025c00
[   18.497655] RBP: ffff95e4bf3660a4 R08: 0000000000000000 R09: ffffffff86d5e100
[   18.506069] R10: ffff95e49e1de600 R11: 0000000000000001 R12: ffff95e4bf366180
[   18.514478] R13: ffff95e48e025c00 R14: ffff95e4bf366028 R15: ffff95e4bf366000
[   18.522869] FS:  0000000000000000(0000) GS:ffff95e4df200000(0000) knlGS:0000000000000000
[   18.532369] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   18.539238] CR2: ffffffffffffffd6 CR3: 0000000f86a0a003 CR4: 00000000001606f0
[   18.547660] Call Trace:
[   18.550862]  receive_context_interrupt+0x1f/0x40 [hfi1]
[   18.557165]  __free_irq+0x201/0x300
[   18.561528]  free_irq+0x2e/0x60
[   18.565497]  pci_free_irq+0x18/0x30
[   18.569846]  msix_free_irq.part.2+0x46/0x80 [hfi1]
[   18.575662]  msix_clean_up_interrupts+0x2b/0x70 [hfi1]
[   18.581846]  hfi1_init_dd+0x640/0x1a90 [hfi1]
[   18.587170]  do_init_one.isra.19+0x34d/0x680 [hfi1]
[   18.593058]  local_pci_probe+0x41/0x90
[   18.597684]  work_for_cpu_fn+0x16/0x20
[   18.602332]  process_one_work+0x1a7/0x360
[   18.607256]  worker_thread+0x1cf/0x390
[   18.611872]  ? create_worker+0x1a0/0x1a0
[   18.616694]  kthread+0x112/0x130
[   18.620737]  ? kthread_flush_work_fn+0x10/0x10
[   18.626147]  ret_from_fork+0x35/0x40
[   18.655466] CR2: 0000000000000000
[   18.659703] ---[ end trace 40218ba9776cac37 ]---

The free_irq() results in a callback to the registered
interrupt handler, and rcd->do_interrupt is NULL because
the receive context data structures are not fully
initialized.

Fix by ensuring that do_interrupt is always assigned, and by adding
guards in the slow path handler and in __hfi1_rcd_eoi_intr() that detect
a partially initialized receive context and no-op the receive.

Cc: stable@vger.kernel.org
Fixes: b0ba3c18d6bf ("IB/hfi1: Move normal functions from hfi1_devdata to const array")
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
---
 drivers/infiniband/hw/hfi1/chip.c   |    2 ++
 drivers/infiniband/hw/hfi1/driver.c |    2 ++
 drivers/infiniband/hw/hfi1/init.c   |    5 ++---
 3 files changed, 6 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index ec37f4f..f1245c9 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -8415,6 +8415,8 @@  static void receive_interrupt_common(struct hfi1_ctxtdata *rcd)
  */
 static void __hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd)
 {
+	if (!rcd->rcvhdrq)
+		return;
 	clear_recv_intr(rcd);
 	if (check_packet_present(rcd))
 		force_recv_intr(rcd);
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index 61f341c..e2c634a 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1012,6 +1012,8 @@  int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
 	struct hfi1_packet packet;
 	int skip_pkt = 0;
 
+	if (!rcd->rcvhdrq)
+		return RCV_PKT_OK;
 	/* Control context will always use the slow path interrupt handler */
 	needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
 
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 8e1236b..6422dd6 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -113,7 +113,6 @@  static int hfi1_create_kctxt(struct hfi1_devdata *dd,
 	rcd->fast_handler = get_dma_rtail_setting(rcd) ?
 				handle_receive_interrupt_dma_rtail :
 				handle_receive_interrupt_nodma_rtail;
-	rcd->slow_handler = handle_receive_interrupt;
 
 	hfi1_set_seq_cnt(rcd, 1);
 
@@ -334,6 +333,8 @@  int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
 		rcd->numa_id = numa;
 		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
 		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
+		rcd->slow_handler = handle_receive_interrupt;
+		rcd->do_interrupt = rcd->slow_handler;
 		rcd->msix_intr = CCE_NUM_MSIX_VECTORS;
 
 		mutex_init(&rcd->exp_mutex);
@@ -898,8 +899,6 @@  int hfi1_init(struct hfi1_devdata *dd, int reinit)
 		if (!rcd)
 			continue;
 
-		rcd->do_interrupt = &handle_receive_interrupt;
-
 		lastfail = hfi1_create_rcvhdrq(dd, rcd);
 		if (!lastfail)
 			lastfail = hfi1_setup_eagerbufs(rcd);