Message ID | 20210908072846.10011-5-njavali@marvell.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | qla2xxx driver bug fixes | expand |
> On Sep 8, 2021, at 2:28 AM, Nilesh Javali <njavali@marvell.com> wrote: > > From: Arun Easi <aeasi@marvell.com> > > System crash was seen when I/O was run against an NVMe target and when I/O > aborts were occurring. > > Crash stack is: > > -- relevant crash stack -- > BUG: kernel NULL pointer dereference, address: 0000000000000010 > : > #6 [ffffae1f8666bdd0] page_fault at ffffffffa740122e > [exception RIP: qla_nvme_abort_work+339] > RIP: ffffffffc0f592e3 RSP: ffffae1f8666be80 RFLAGS: 00010297 > RAX: 0000000000000000 RBX: ffff9b581fc8af80 RCX: ffffffffc0f83bd0 > RDX: 0000000000000001 RSI: ffff9b5839c6c7c8 RDI: 0000000008000000 > RBP: ffff9b6832f85000 R8: ffffffffc0f68160 R9: ffffffffc0f70652 > R10: ffffae1f862ffdc8 R11: 0000000000000300 R12: 000000000000010d > R13: 0000000000000000 R14: ffff9b5839cea000 R15: 0ffff9b583fab170 > ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 > #7 [ffffae1f8666be98] process_one_work at ffffffffa6aba184 > #8 [ffffae1f8666bed8] worker_thread at ffffffffa6aba39d > #9 [ffffae1f8666bf10] kthread at ffffffffa6ac06ed > > The crash was due to a stale SRB structure access after it was aborted. > Fixed the issue by removing stale access. 
> Add the following: Fixes: 2cabf10dbbe38 ("scsi: qla2xxx: Fix hang on NVMe command timeouts") Cc: stable@vger.kernel.org > Signed-off-by: Arun Easi <aeasi@marvell.com> > Signed-off-by: Nilesh Javali <njavali@marvell.com> > --- > drivers/scsi/qla2xxx/qla_nvme.c | 14 ++++++++++++-- > 1 file changed, 12 insertions(+), 2 deletions(-) > > diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c > index 1c5da2dbd6f9..877b2b625020 100644 > --- a/drivers/scsi/qla2xxx/qla_nvme.c > +++ b/drivers/scsi/qla2xxx/qla_nvme.c > @@ -228,6 +228,8 @@ static void qla_nvme_abort_work(struct work_struct *work) > fc_port_t *fcport = sp->fcport; > struct qla_hw_data *ha = fcport->vha->hw; > int rval, abts_done_called = 1; > + bool io_wait_for_abort_done; > + uint32_t handle; > > ql_dbg(ql_dbg_io, fcport->vha, 0xffff, > "%s called for sp=%p, hndl=%x on fcport=%p desc=%p deleted=%d\n", > @@ -244,12 +246,20 @@ static void qla_nvme_abort_work(struct work_struct *work) > goto out; > } > > + /* > + * sp may not be valid after abort_command if return code is either > + * SUCCESS or ERR_FROM_FW codes, so cache the value here. > + */ > + io_wait_for_abort_done = ql2xabts_wait_nvme && > + QLA_ABTS_WAIT_ENABLED(sp); > + handle = sp->handle; > + > rval = ha->isp_ops->abort_command(sp); > > ql_dbg(ql_dbg_io, fcport->vha, 0x212b, > "%s: %s command for sp=%p, handle=%x on fcport=%p rval=%x\n", > __func__, (rval != QLA_SUCCESS) ? "Failed to abort" : "Aborted", > - sp, sp->handle, fcport, rval); > + sp, handle, fcport, rval); > > /* > * If async tmf is enabled, the abort callback is called only on > @@ -264,7 +274,7 @@ static void qla_nvme_abort_work(struct work_struct *work) > * are waited until ABTS complete. This kref is decreased > * at qla24xx_abort_sp_done function. > */ > - if (abts_done_called && ql2xabts_wait_nvme && QLA_ABTS_WAIT_ENABLED(sp)) > + if (abts_done_called && io_wait_for_abort_done) > return; > out: > /* kref_get was done before work was schedule. 
*/ > -- > 2.19.0.rc0 > Otherwise Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com> -- Himanshu Madhani Oracle Linux Engineering
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 1c5da2dbd6f9..877b2b625020 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -228,6 +228,8 @@ static void qla_nvme_abort_work(struct work_struct *work) fc_port_t *fcport = sp->fcport; struct qla_hw_data *ha = fcport->vha->hw; int rval, abts_done_called = 1; + bool io_wait_for_abort_done; + uint32_t handle; ql_dbg(ql_dbg_io, fcport->vha, 0xffff, "%s called for sp=%p, hndl=%x on fcport=%p desc=%p deleted=%d\n", @@ -244,12 +246,20 @@ static void qla_nvme_abort_work(struct work_struct *work) goto out; } + /* + * sp may not be valid after abort_command if return code is either + * SUCCESS or ERR_FROM_FW codes, so cache the value here. + */ + io_wait_for_abort_done = ql2xabts_wait_nvme && + QLA_ABTS_WAIT_ENABLED(sp); + handle = sp->handle; + rval = ha->isp_ops->abort_command(sp); ql_dbg(ql_dbg_io, fcport->vha, 0x212b, "%s: %s command for sp=%p, handle=%x on fcport=%p rval=%x\n", __func__, (rval != QLA_SUCCESS) ? "Failed to abort" : "Aborted", - sp, sp->handle, fcport, rval); + sp, handle, fcport, rval); /* * If async tmf is enabled, the abort callback is called only on @@ -264,7 +274,7 @@ static void qla_nvme_abort_work(struct work_struct *work) * are waited until ABTS complete. This kref is decreased * at qla24xx_abort_sp_done function. */ - if (abts_done_called && ql2xabts_wait_nvme && QLA_ABTS_WAIT_ENABLED(sp)) + if (abts_done_called && io_wait_for_abort_done) return; out: /* kref_get was done before work was schedule. */