Message ID | 014738b6-698e-4ea1-82f9-287378bfec19@CMEXHTCAS2.ad.emulex.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Devesh, This looks a lot better. I still have a couple of small suggestions, though. On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> wrote: > If the rdma_create_qp fails to create qp due to device firmware being in invalid state > xprtrdma still tries to destroy the non-existant qp and ends up in a NULL pointer reference > crash. > Adding proper checks for vaidating QP pointer avoids this to happen. > > Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> > --- > net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- > 1 files changed, 25 insertions(+), 4 deletions(-) > > diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c > index 9372656..902ac78 100644 > --- a/net/sunrpc/xprtrdma/verbs.c > +++ b/net/sunrpc/xprtrdma/verbs.c > @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) > if (ep->rep_connected != 0) { > struct rpcrdma_xprt *xprt; > retry: > - rc = rpcrdma_ep_disconnect(ep, ia); > - if (rc && rc != -ENOTCONN) > - dprintk("RPC: %s: rpcrdma_ep_disconnect" > + if (ia->ri_id->qp) { > + rc = rpcrdma_ep_disconnect(ep, ia); > + if (rc && rc != -ENOTCONN) > + dprintk("RPC: %s: rpcrdma_ep_disconnect" > " status %i\n", __func__, rc); > + } > rpcrdma_clean_cq(ep->rep_cq); > > xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); > @@ -859,7 +861,9 @@ retry: > goto out; > } > /* END TEMP */ > - rdma_destroy_qp(ia->ri_id); > + if (ia->ri_id->qp) { > + rdma_destroy_qp(ia->ri_id); > + } Nit: No need for braces here. 
> rdma_destroy_id(ia->ri_id); > ia->ri_id = id; > } > @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, > frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; > DECR_CQCOUNT(&r_xprt->rx_ep); > > + if (!ia->ri_is->qp) { > + rc = -EINVAL; > + while (i--) > + rpcrdma_unmap_one(ia, --seg); > + goto out; > + } Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not just do if (ia->ri_is->qp) rc = ib_post_send(…) else rc = -EINVAL; BTW: can we not simply test for ia->ri_is->qp before we even call rpcrdma_map_one() and hence bail out before we have to do any cleanup? > + > rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); > > if (rc) { > @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, > seg1->mr_len = len; > } > *nsegs = i; > +out: > return rc; > } > > @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, > invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; > DECR_CQCOUNT(&r_xprt->rx_ep); > > + if (!ia->ri_id->qp) > + return -EINVAL; > + > rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); > if (rc) > dprintk("RPC: %s: failed ib_post_send for invalidate," > @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, > send_wr.send_flags = IB_SEND_SIGNALED; > } > > + if (!ia->ri_id->qp) > + return -EINVAL; > + > rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); > if (rc) > dprintk("RPC: %s: ib_post_send returned %i\n", __func__, > @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, > rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); > > DECR_CQCOUNT(ep); > + > + if (!ia->ri_id->qp) > + return -EINVAL; > rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); > > if (rc) > -- > 1.7.1 >
On Apr 9, 2014, at 4:22 PM, Trond Myklebust <trond.myklebust@primarydata.com> wrote: > Hi Devesh, > > This looks a lot better. I still have a couple of small suggestions, though. > > On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> wrote: > >> If the rdma_create_qp fails to create qp due to device firmware being in invalid state >> xprtrdma still tries to destroy the non-existant qp and ends up in a NULL pointer reference >> crash. >> Adding proper checks for vaidating QP pointer avoids this to happen. >> >> Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> >> --- >> net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- >> 1 files changed, 25 insertions(+), 4 deletions(-) >> >> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c >> index 9372656..902ac78 100644 >> --- a/net/sunrpc/xprtrdma/verbs.c >> +++ b/net/sunrpc/xprtrdma/verbs.c >> @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) >> if (ep->rep_connected != 0) { >> struct rpcrdma_xprt *xprt; >> retry: >> - rc = rpcrdma_ep_disconnect(ep, ia); >> - if (rc && rc != -ENOTCONN) >> - dprintk("RPC: %s: rpcrdma_ep_disconnect" >> + if (ia->ri_id->qp) { >> + rc = rpcrdma_ep_disconnect(ep, ia); >> + if (rc && rc != -ENOTCONN) >> + dprintk("RPC: %s: rpcrdma_ep_disconnect" >> " status %i\n", __func__, rc); >> + } >> rpcrdma_clean_cq(ep->rep_cq); >> >> xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); >> @@ -859,7 +861,9 @@ retry: >> goto out; >> } >> /* END TEMP */ >> - rdma_destroy_qp(ia->ri_id); >> + if (ia->ri_id->qp) { >> + rdma_destroy_qp(ia->ri_id); >> + } > > Nit: No need for braces here. > >> rdma_destroy_id(ia->ri_id); >> ia->ri_id = id; >> } >> @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >> frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >> DECR_CQCOUNT(&r_xprt->rx_ep); I don’t think you can DECR_CQCOUNT, then exit without posting the send. 
That will screw up the completion counter and result in a transport hang, won’t it? >> >> + if (!ia->ri_is->qp) { >> + rc = -EINVAL; >> + while (i--) >> + rpcrdma_unmap_one(ia, --seg); >> + goto out; >> + } > > Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not just do > > if (ia->ri_is->qp) > rc = ib_post_send(…) > else > rc = -EINVAL; > > BTW: can we not simply test for ia->ri_is->qp before we even call rpcrdma_map_one() and hence bail out before we have to do any cleanup? > >> + >> rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); >> >> if (rc) { >> @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >> seg1->mr_len = len; >> } >> *nsegs = i; >> +out: >> return rc; >> } >> >> @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, >> invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >> DECR_CQCOUNT(&r_xprt->rx_ep); Ditto. >> >> + if (!ia->ri_id->qp) >> + return -EINVAL; >> + >> rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); >> if (rc) >> dprintk("RPC: %s: failed ib_post_send for invalidate," >> @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, >> send_wr.send_flags = IB_SEND_SIGNALED; >> } Ditto. >> >> + if (!ia->ri_id->qp) >> + return -EINVAL; >> + >> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); >> if (rc) >> dprintk("RPC: %s: ib_post_send returned %i\n", __func__, >> @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, >> rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); >> >> DECR_CQCOUNT(ep); And here. 
>> + >> + if (!ia->ri_id->qp) >> + return -EINVAL; >> rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); >> >> if (rc) >> -- >> 1.7.1 >> > > _________________________________ > Trond Myklebust > Linux NFS client maintainer, PrimaryData > trond.myklebust@primarydata.com > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chuck and Trond I will resend a v2 for this. What if ib_post_send() fails with an immediate error? In that case also DECR_CQCOUNT() will be called but no completion will be reported. Will that not cause any problems? Also in rpcrdma_register_frmr_external() I am seeing DECR_CQCOUNT is called twice. First at line 1538 (unlikely however) and second at line 1562. Shouldn't it be only at 1562? -----Original Message----- From: Chuck Lever [mailto:chuck.lever@oracle.com] Sent: Thursday, April 10, 2014 1:57 AM To: Devesh Sharma Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks On Apr 9, 2014, at 4:22 PM, Trond Myklebust <trond.myklebust@primarydata.com> wrote: > Hi Devesh, > > This looks a lot better. I still have a couple of small suggestions, though. > > On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> wrote: > >> If the rdma_create_qp fails to create qp due to device firmware being >> in invalid state xprtrdma still tries to destroy the non-existant qp >> and ends up in a NULL pointer reference crash. >> Adding proper checks for vaidating QP pointer avoids this to happen. 
>> >> Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> >> --- >> net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- >> 1 files changed, 25 insertions(+), 4 deletions(-) >> >> diff --git a/net/sunrpc/xprtrdma/verbs.c >> b/net/sunrpc/xprtrdma/verbs.c index 9372656..902ac78 100644 >> --- a/net/sunrpc/xprtrdma/verbs.c >> +++ b/net/sunrpc/xprtrdma/verbs.c >> @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) >> if (ep->rep_connected != 0) { >> struct rpcrdma_xprt *xprt; >> retry: >> - rc = rpcrdma_ep_disconnect(ep, ia); >> - if (rc && rc != -ENOTCONN) >> - dprintk("RPC: %s: rpcrdma_ep_disconnect" >> + if (ia->ri_id->qp) { >> + rc = rpcrdma_ep_disconnect(ep, ia); >> + if (rc && rc != -ENOTCONN) >> + dprintk("RPC: %s: rpcrdma_ep_disconnect" >> " status %i\n", __func__, rc); >> + } >> rpcrdma_clean_cq(ep->rep_cq); >> >> xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); @@ -859,7 >> +861,9 @@ retry: >> goto out; >> } >> /* END TEMP */ >> - rdma_destroy_qp(ia->ri_id); >> + if (ia->ri_id->qp) { >> + rdma_destroy_qp(ia->ri_id); >> + } > > Nit: No need for braces here. > >> rdma_destroy_id(ia->ri_id); >> ia->ri_id = id; >> } >> @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >> frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >> DECR_CQCOUNT(&r_xprt->rx_ep); I don't think you can DECR_CQCOUNT, then exit without posting the send. That will screw up the completion counter and result in a transport hang, won't it? >> >> + if (!ia->ri_is->qp) { >> + rc = -EINVAL; >> + while (i--) >> + rpcrdma_unmap_one(ia, --seg); >> + goto out; >> + } > > Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not > just do > > if (ia->ri_is->qp) > rc = ib_post_send(...) > else > rc = -EINVAL; > > BTW: can we not simply test for ia->ri_is->qp before we even call rpcrdma_map_one() and hence bail out before we have to do any cleanup? 
> >> + >> rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); >> >> if (rc) { >> @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >> seg1->mr_len = len; >> } >> *nsegs = i; >> +out: >> return rc; >> } >> >> @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, >> invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >> DECR_CQCOUNT(&r_xprt->rx_ep); Ditto. >> >> + if (!ia->ri_id->qp) >> + return -EINVAL; >> + >> rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); >> if (rc) >> dprintk("RPC: %s: failed ib_post_send for invalidate," >> @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, >> send_wr.send_flags = IB_SEND_SIGNALED; >> } Ditto. >> >> + if (!ia->ri_id->qp) >> + return -EINVAL; >> + >> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); >> if (rc) >> dprintk("RPC: %s: ib_post_send returned %i\n", __func__, >> @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, >> rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); >> >> DECR_CQCOUNT(ep); And here. >> + >> + if (!ia->ri_id->qp) >> + return -EINVAL; >> rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); >> >> if (rc) >> -- >> 1.7.1 >> > > _________________________________ > Trond Myklebust > Linux NFS client maintainer, PrimaryData > trond.myklebust@primarydata.com > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" > in the body of a message to majordomo@vger.kernel.org More majordomo > info at http://vger.kernel.org/majordomo-info.html -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 9, 2014, at 7:56 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > Hi Chuk and Trond > > I will resend a v2 for this. > What if ib_post_send() fails with immidate error, I that case also DECR_CQCOUNT() will be called but no completion will be reported. Will that not cause any problems? We should investigate whether an error return from ib_post_{send,recv} means there will be no completion. But I’ve never seen these verbs fail in practice, so I’m not in a hurry to make work for anyone! ;-) However it seems to me the new (!ia->ri_id->qp) checks outside the connect logic are unnecessary. Clearly, as you noticed, the ib_post_{send,recv} verbs do not check that their “qp" argument is NULL before dereferencing it. But I don’t understand how xprtrdma can post any operation if the transport isn’t connected. In other words, how would it be possible to call rpcrdma_ep_post_recv() if the connect had failed and there was no QP? If disconnect wipes ia->ri_id->qp while there are still operations in progress, that would be the real bug. > Also in rpcrdma_register_frmr_external() I am seeing DECT_CQCOUNT is called twice > First at line 1538 (unlikely however) and second at line 1562. Shouldn't it be only at 1562? if (seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID) then rpcrdma_register_frmr_external() posts two Work Requests (LOCAL_INV then FAST_REG_MR) with one ib_post_send(). Thus it is correct to DECR_CQCOUNT twice in that case because each WR will trigger a separate completion event. > -----Original Message----- > From: Chuck Lever [mailto:chuck.lever@oracle.com] > Sent: Thursday, April 10, 2014 1:57 AM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 9, 2014, at 4:22 PM, Trond Myklebust <trond.myklebust@primarydata.com> wrote: > >> Hi Devesh, >> >> This looks a lot better. I still have a couple of small suggestions, though. 
>> >> On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> wrote: >> >>> If the rdma_create_qp fails to create qp due to device firmware being >>> in invalid state xprtrdma still tries to destroy the non-existant qp >>> and ends up in a NULL pointer reference crash. >>> Adding proper checks for vaidating QP pointer avoids this to happen. >>> >>> Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> >>> --- >>> net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- >>> 1 files changed, 25 insertions(+), 4 deletions(-) >>> >>> diff --git a/net/sunrpc/xprtrdma/verbs.c >>> b/net/sunrpc/xprtrdma/verbs.c index 9372656..902ac78 100644 >>> --- a/net/sunrpc/xprtrdma/verbs.c >>> +++ b/net/sunrpc/xprtrdma/verbs.c >>> @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) >>> if (ep->rep_connected != 0) { >>> struct rpcrdma_xprt *xprt; >>> retry: >>> - rc = rpcrdma_ep_disconnect(ep, ia); >>> - if (rc && rc != -ENOTCONN) >>> - dprintk("RPC: %s: rpcrdma_ep_disconnect" >>> + if (ia->ri_id->qp) { >>> + rc = rpcrdma_ep_disconnect(ep, ia); >>> + if (rc && rc != -ENOTCONN) >>> + dprintk("RPC: %s: rpcrdma_ep_disconnect" >>> " status %i\n", __func__, rc); >>> + } >>> rpcrdma_clean_cq(ep->rep_cq); >>> >>> xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); @@ -859,7 >>> +861,9 @@ retry: >>> goto out; >>> } >>> /* END TEMP */ >>> - rdma_destroy_qp(ia->ri_id); >>> + if (ia->ri_id->qp) { >>> + rdma_destroy_qp(ia->ri_id); >>> + } >> >> Nit: No need for braces here. >> >>> rdma_destroy_id(ia->ri_id); >>> ia->ri_id = id; >>> } >>> @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >>> frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >>> DECR_CQCOUNT(&r_xprt->rx_ep); > > I don't think you can DECR_CQCOUNT, then exit without posting the send. That will screw up the completion counter and result in a transport hang, won't it? 
> >>> >>> + if (!ia->ri_is->qp) { >>> + rc = -EINVAL; >>> + while (i--) >>> + rpcrdma_unmap_one(ia, --seg); >>> + goto out; >>> + } >> >> Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not >> just do >> >> if (ia->ri_is->qp) >> rc = ib_post_send(...) >> else >> rc = -EINVAL; >> >> BTW: can we not simply test for ia->ri_is->qp before we even call rpcrdma_map_one() and hence bail out before we have to do any cleanup? >> >>> + >>> rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); >>> >>> if (rc) { >>> @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >>> seg1->mr_len = len; >>> } >>> *nsegs = i; >>> +out: >>> return rc; >>> } >>> >>> @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, >>> invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >>> DECR_CQCOUNT(&r_xprt->rx_ep); > > Ditto. > >>> >>> + if (!ia->ri_id->qp) >>> + return -EINVAL; >>> + >>> rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); >>> if (rc) >>> dprintk("RPC: %s: failed ib_post_send for invalidate," >>> @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, >>> send_wr.send_flags = IB_SEND_SIGNALED; >>> } > > Ditto. > >>> >>> + if (!ia->ri_id->qp) >>> + return -EINVAL; >>> + >>> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); >>> if (rc) >>> dprintk("RPC: %s: ib_post_send returned %i\n", __func__, >>> @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, >>> rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); >>> >>> DECR_CQCOUNT(ep); > > And here. 
> >>> + >>> + if (!ia->ri_id->qp) >>> + return -EINVAL; >>> rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); >>> >>> if (rc) >>> -- >>> 1.7.1 >>> >> >> _________________________________ >> Trond Myklebust >> Linux NFS client maintainer, PrimaryData >> trond.myklebust@primarydata.com >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-nfs" >> in the body of a message to majordomo@vger.kernel.org More majordomo >> info at http://vger.kernel.org/majordomo-info.html > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/9/2014 7:26 PM, Chuck Lever wrote: > On Apr 9, 2014, at 7:56 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > >> Hi Chuk and Trond >> >> I will resend a v2 for this. >> What if ib_post_send() fails with immidate error, I that case also DECR_CQCOUNT() will be called but no completion will be reported. Will that not cause any problems? > We should investigate whether an error return from ib_post_{send,recv} means there will be no completion. But I’ve never seen these verbs fail in practice, so I’m not in a hurry to make work for anyone! ;-) A synchronous failure from ib_post_* means the WR (or at least one of them if there were > 1) failed and did not get submitted to HW. So there will be no completion for those that failed. > However it seems to me the new (!ia->ri_id->qp) checks outside the connect logic are unnecessary. > > Clearly, as you noticed, the ib_post_{send,recv} verbs do not check that their “qp" argument is NULL before dereferencing it. > > But I don’t understand how xprtrdma can post any operation if the transport isn’t connected. In other words, how would it be possible to call rpcrdma_ep_post_recv() if the connect had failed and there was no QP? > > If disconnect wipes ia->ri_id->qp while there are still operations in progress, that would be the real bug. > > >> Also in rpcrdma_register_frmr_external() I am seeing DECT_CQCOUNT is called twice >> First at line 1538 (unlikely however) and second at line 1562. Shouldn't it be only at 1562? > if (seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID) then rpcrdma_register_frmr_external() posts two Work Requests (LOCAL_INV then FAST_REG_MR) with one ib_post_send(). Thus it is correct to DECR_CQCOUNT twice in that case because each WR will trigger a separate completion event. 
> > >> -----Original Message----- >> From: Chuck Lever [mailto:chuck.lever@oracle.com] >> Sent: Thursday, April 10, 2014 1:57 AM >> To: Devesh Sharma >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >> >> >> On Apr 9, 2014, at 4:22 PM, Trond Myklebust <trond.myklebust@primarydata.com> wrote: >> >>> Hi Devesh, >>> >>> This looks a lot better. I still have a couple of small suggestions, though. >>> >>> On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> wrote: >>> >>>> If the rdma_create_qp fails to create qp due to device firmware being >>>> in invalid state xprtrdma still tries to destroy the non-existant qp >>>> and ends up in a NULL pointer reference crash. >>>> Adding proper checks for vaidating QP pointer avoids this to happen. >>>> >>>> Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> >>>> --- >>>> net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- >>>> 1 files changed, 25 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/net/sunrpc/xprtrdma/verbs.c >>>> b/net/sunrpc/xprtrdma/verbs.c index 9372656..902ac78 100644 >>>> --- a/net/sunrpc/xprtrdma/verbs.c >>>> +++ b/net/sunrpc/xprtrdma/verbs.c >>>> @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) >>>> if (ep->rep_connected != 0) { >>>> struct rpcrdma_xprt *xprt; >>>> retry: >>>> - rc = rpcrdma_ep_disconnect(ep, ia); >>>> - if (rc && rc != -ENOTCONN) >>>> - dprintk("RPC: %s: rpcrdma_ep_disconnect" >>>> + if (ia->ri_id->qp) { >>>> + rc = rpcrdma_ep_disconnect(ep, ia); >>>> + if (rc && rc != -ENOTCONN) >>>> + dprintk("RPC: %s: rpcrdma_ep_disconnect" >>>> " status %i\n", __func__, rc); >>>> + } >>>> rpcrdma_clean_cq(ep->rep_cq); >>>> >>>> xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); @@ -859,7 >>>> +861,9 @@ retry: >>>> goto out; >>>> } >>>> /* END TEMP */ >>>> - rdma_destroy_qp(ia->ri_id); >>>> + if (ia->ri_id->qp) { >>>> + 
rdma_destroy_qp(ia->ri_id); >>>> + } >>> Nit: No need for braces here. >>> >>>> rdma_destroy_id(ia->ri_id); >>>> ia->ri_id = id; >>>> } >>>> @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >>>> frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >>>> DECR_CQCOUNT(&r_xprt->rx_ep); >> I don't think you can DECR_CQCOUNT, then exit without posting the send. That will screw up the completion counter and result in a transport hang, won't it? >> >>>> + if (!ia->ri_is->qp) { >>>> + rc = -EINVAL; >>>> + while (i--) >>>> + rpcrdma_unmap_one(ia, --seg); >>>> + goto out; >>>> + } >>> Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not >>> just do >>> >>> if (ia->ri_is->qp) >>> rc = ib_post_send(...) >>> else >>> rc = -EINVAL; >>> >>> BTW: can we not simply test for ia->ri_is->qp before we even call rpcrdma_map_one() and hence bail out before we have to do any cleanup? >>> >>>> + >>>> rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); >>>> >>>> if (rc) { >>>> @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >>>> seg1->mr_len = len; >>>> } >>>> *nsegs = i; >>>> +out: >>>> return rc; >>>> } >>>> >>>> @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, >>>> invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >>>> DECR_CQCOUNT(&r_xprt->rx_ep); >> Ditto. >> >>>> + if (!ia->ri_id->qp) >>>> + return -EINVAL; >>>> + >>>> rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); >>>> if (rc) >>>> dprintk("RPC: %s: failed ib_post_send for invalidate," >>>> @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, >>>> send_wr.send_flags = IB_SEND_SIGNALED; >>>> } >> Ditto. 
>> >>>> + if (!ia->ri_id->qp) >>>> + return -EINVAL; >>>> + >>>> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); >>>> if (rc) >>>> dprintk("RPC: %s: ib_post_send returned %i\n", __func__, >>>> @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, >>>> rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); >>>> >>>> DECR_CQCOUNT(ep); >> And here. >> >>>> + >>>> + if (!ia->ri_id->qp) >>>> + return -EINVAL; >>>> rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); >>>> >>>> if (rc) >>>> -- >>>> 1.7.1 >>>> >>> _________________________________ >>> Trond Myklebust >>> Linux NFS client maintainer, PrimaryData >>> trond.myklebust@primarydata.com >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" >>> in the body of a message to majordomo@vger.kernel.org More majordomo >>> info at http://vger.kernel.org/majordomo-info.html >> -- >> Chuck Lever >> chuck[dot]lever[at]oracle[dot]com >> >> >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Chuck Lever [mailto:chuck.lever@oracle.com] > Sent: Thursday, April 10, 2014 5:56 AM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 9, 2014, at 7:56 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> > wrote: > > > Hi Chuk and Trond > > > > I will resend a v2 for this. > > What if ib_post_send() fails with immidate error, I that case also > DECR_CQCOUNT() will be called but no completion will be reported. Will that > not cause any problems? > > We should investigate whether an error return from ib_post_{send,recv} > means there will be no completion. But I've never seen these verbs fail in > practice, so I'm not in a hurry to make work for anyone! ;-) Any verb can fail, maybe because the system is under memory pressure? > > However it seems to me the new (!ia->ri_id->qp) checks outside the connect > logic are unnecessary. > > Clearly, as you noticed, the ib_post_{send,recv} verbs do not check that their > "qp" argument is NULL before dereferencing it. > > But I don't understand how xprtrdma can post any operation if the transport > isn't connected. In other words, how would it be possible to call > rpcrdma_ep_post_recv() if the connect had failed and there was no QP? > > If disconnect wipes ia->ri_id->qp while there are still operations in progress, > that would be the real bug. Yes! But I have seen one more kernel oops where the QP is destroyed and xprtrdma still tries to post a LOCAL_INV WR on a NULL QP pointer, and hence the system crashes. So, I think what you mentioned is really happening. > > > > Also in rpcrdma_register_frmr_external() I am seeing DECT_CQCOUNT is > > called twice First at line 1538 (unlikely however) and second at line 1562. > Shouldn't it be only at 1562? 
> > if (seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID) then > rpcrdma_register_frmr_external() posts two Work Requests (LOCAL_INV > then FAST_REG_MR) with one ib_post_send(). Thus it is correct to > DECR_CQCOUNT twice in that case because each WR will trigger a separate > completion event. Oh! I missed that. > > > > -----Original Message----- > > From: Chuck Lever [mailto:chuck.lever@oracle.com] > > Sent: Thursday, April 10, 2014 1:57 AM > > To: Devesh Sharma > > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > > Myklebust > > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > > > > On Apr 9, 2014, at 4:22 PM, Trond Myklebust > <trond.myklebust@primarydata.com> wrote: > > > >> Hi Devesh, > >> > >> This looks a lot better. I still have a couple of small suggestions, though. > >> > >> On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> > wrote: > >> > >>> If the rdma_create_qp fails to create qp due to device firmware > >>> being in invalid state xprtrdma still tries to destroy the > >>> non-existant qp and ends up in a NULL pointer reference crash. > >>> Adding proper checks for vaidating QP pointer avoids this to happen. 
> >>> > >>> Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> > >>> --- > >>> net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- > >>> 1 files changed, 25 insertions(+), 4 deletions(-) > >>> > >>> diff --git a/net/sunrpc/xprtrdma/verbs.c > >>> b/net/sunrpc/xprtrdma/verbs.c index 9372656..902ac78 100644 > >>> --- a/net/sunrpc/xprtrdma/verbs.c > >>> +++ b/net/sunrpc/xprtrdma/verbs.c > >>> @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, > struct rpcrdma_ia *ia) > >>> if (ep->rep_connected != 0) { > >>> struct rpcrdma_xprt *xprt; > >>> retry: > >>> - rc = rpcrdma_ep_disconnect(ep, ia); > >>> - if (rc && rc != -ENOTCONN) > >>> - dprintk("RPC: %s: rpcrdma_ep_disconnect" > >>> + if (ia->ri_id->qp) { > >>> + rc = rpcrdma_ep_disconnect(ep, ia); > >>> + if (rc && rc != -ENOTCONN) > >>> + dprintk("RPC: %s: > rpcrdma_ep_disconnect" > >>> " status %i\n", __func__, rc); > >>> + } > >>> rpcrdma_clean_cq(ep->rep_cq); > >>> > >>> xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); @@ - > 859,7 > >>> +861,9 @@ retry: > >>> goto out; > >>> } > >>> /* END TEMP */ > >>> - rdma_destroy_qp(ia->ri_id); > >>> + if (ia->ri_id->qp) { > >>> + rdma_destroy_qp(ia->ri_id); > >>> + } > >> > >> Nit: No need for braces here. > >> > >>> rdma_destroy_id(ia->ri_id); > >>> ia->ri_id = id; > >>> } > >>> @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct > rpcrdma_mr_seg *seg, > >>> frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr- > >rkey; > >>> DECR_CQCOUNT(&r_xprt->rx_ep); > > > > I don't think you can DECR_CQCOUNT, then exit without posting the send. > That will screw up the completion counter and result in a transport hang, > won't it? > > > >>> > >>> + if (!ia->ri_is->qp) { > >>> + rc = -EINVAL; > >>> + while (i--) > >>> + rpcrdma_unmap_one(ia, --seg); > >>> + goto out; > >>> + } > >> > >> Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not > >> just do > >> > >> if (ia->ri_is->qp) > >> rc = ib_post_send(...) 
> >> else > >> rc = -EINVAL; > >> > >> BTW: can we not simply test for ia->ri_is->qp before we even call > rpcrdma_map_one() and hence bail out before we have to do any cleanup? > >> > >>> + > >>> rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); > >>> > >>> if (rc) { > >>> @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct > rpcrdma_mr_seg *seg, > >>> seg1->mr_len = len; > >>> } > >>> *nsegs = i; > >>> +out: > >>> return rc; > >>> } > >>> > >>> @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct > rpcrdma_mr_seg *seg, > >>> invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw- > >r.frmr.fr_mr->rkey; > >>> DECR_CQCOUNT(&r_xprt->rx_ep); > > > > Ditto. > > > >>> > >>> + if (!ia->ri_id->qp) > >>> + return -EINVAL; > >>> + > >>> rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); > >>> if (rc) > >>> dprintk("RPC: %s: failed ib_post_send for invalidate," > >>> @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, > >>> send_wr.send_flags = IB_SEND_SIGNALED; > >>> } > > > > Ditto. > > > >>> > >>> + if (!ia->ri_id->qp) > >>> + return -EINVAL; > >>> + > >>> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); > >>> if (rc) > >>> dprintk("RPC: %s: ib_post_send returned %i\n", > __func__, > >>> @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, > >>> rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); > >>> > >>> DECR_CQCOUNT(ep); > > > > And here. 
> > > >>> + > >>> + if (!ia->ri_id->qp) > >>> + return -EINVAL; > >>> rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); > >>> > >>> if (rc) > >>> -- > >>> 1.7.1 > >>> > >> > >> _________________________________ > >> Trond Myklebust > >> Linux NFS client maintainer, PrimaryData > >> trond.myklebust@primarydata.com > >> > >> -- > >> To unsubscribe from this list: send the line "unsubscribe linux-nfs" > >> in the body of a message to majordomo@vger.kernel.org More > majordomo > >> info at http://vger.kernel.org/majordomo-info.html > > > > -- > > Chuck Lever > > chuck[dot]lever[at]oracle[dot]com > > > > > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-rdma" > > in the body of a message to majordomo@vger.kernel.org More > majordomo > > info at http://vger.kernel.org/majordomo-info.html > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 10, 2014, at 11:01 AM, Steve Wise <swise@opengridcomputing.com> wrote: > On 4/9/2014 7:26 PM, Chuck Lever wrote: >> On Apr 9, 2014, at 7:56 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: >> >>> Hi Chuk and Trond >>> >>> I will resend a v2 for this. >>> What if ib_post_send() fails with immidate error, I that case also DECR_CQCOUNT() will be called but no completion will be reported. Will that not cause any problems? >> We should investigate whether an error return from ib_post_{send,recv} means there will be no completion. But I’ve never seen these verbs fail in practice, so I’m not in a hurry to make work for anyone! ;-) > > A synchronous failure from ib_post_* means the WR (or at least one of them if there were > 1) failed and did not get submitted to HW. So there will be no completion for those that failed. OK. Our post operations are largely single WRs. Before we address CQCOUNT in error cases, we’d have to deal with chained WRs. Chained WRs are used only when rpcrdma_register_frmr_external() finds an MR that hasn’t been invalidated. That’s actually working around a FRMR re-use bug (commit 5c635e09). If the underlying re-use problem was fixed, we could get rid of the chained WR in register_frmr_external() (and we wouldn’t need completions at all for FAST_REG_MR). But at 100,000 feet, if a post operation fails, that seems like a very serious issue. I wonder whether we would be better off disconnecting and starting over in those cases. > >> However it seems to me the new (!ia->ri_id->qp) checks outside the connect logic are unnecessary. >> >> Clearly, as you noticed, the ib_post_{send,recv} verbs do not check that their “qp" argument is NULL before dereferencing it. >> >> But I don’t understand how xprtrdma can post any operation if the transport isn’t connected. In other words, how would it be possible to call rpcrdma_ep_post_recv() if the connect had failed and there was no QP? 
>> >> If disconnect wipes ia->ri_id->qp while there are still operations in progress, that would be the real bug. >> >> >>> Also in rpcrdma_register_frmr_external() I am seeing DECT_CQCOUNT is called twice >>> First at line 1538 (unlikely however) and second at line 1562. Shouldn't it be only at 1562? >> if (seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID) then rpcrdma_register_frmr_external() posts two Work Requests (LOCAL_INV then FAST_REG_MR) with one ib_post_send(). Thus it is correct to DECR_CQCOUNT twice in that case because each WR will trigger a separate completion event. >> >> >>> -----Original Message----- >>> From: Chuck Lever [mailto:chuck.lever@oracle.com] >>> Sent: Thursday, April 10, 2014 1:57 AM >>> To: Devesh Sharma >>> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust >>> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >>> >>> >>> On Apr 9, 2014, at 4:22 PM, Trond Myklebust <trond.myklebust@primarydata.com> wrote: >>> >>>> Hi Devesh, >>>> >>>> This looks a lot better. I still have a couple of small suggestions, though. >>>> >>>> On Apr 9, 2014, at 14:40, Devesh Sharma <devesh.sharma@emulex.com> wrote: >>>> >>>>> If the rdma_create_qp fails to create qp due to device firmware being >>>>> in invalid state xprtrdma still tries to destroy the non-existant qp >>>>> and ends up in a NULL pointer reference crash. >>>>> Adding proper checks for vaidating QP pointer avoids this to happen. 
>>>>> >>>>> Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> >>>>> --- >>>>> net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- >>>>> 1 files changed, 25 insertions(+), 4 deletions(-) >>>>> >>>>> diff --git a/net/sunrpc/xprtrdma/verbs.c >>>>> b/net/sunrpc/xprtrdma/verbs.c index 9372656..902ac78 100644 >>>>> --- a/net/sunrpc/xprtrdma/verbs.c >>>>> +++ b/net/sunrpc/xprtrdma/verbs.c >>>>> @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) >>>>> if (ep->rep_connected != 0) { >>>>> struct rpcrdma_xprt *xprt; >>>>> retry: >>>>> - rc = rpcrdma_ep_disconnect(ep, ia); >>>>> - if (rc && rc != -ENOTCONN) >>>>> - dprintk("RPC: %s: rpcrdma_ep_disconnect" >>>>> + if (ia->ri_id->qp) { >>>>> + rc = rpcrdma_ep_disconnect(ep, ia); >>>>> + if (rc && rc != -ENOTCONN) >>>>> + dprintk("RPC: %s: rpcrdma_ep_disconnect" >>>>> " status %i\n", __func__, rc); >>>>> + } >>>>> rpcrdma_clean_cq(ep->rep_cq); >>>>> >>>>> xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); @@ -859,7 >>>>> +861,9 @@ retry: >>>>> goto out; >>>>> } >>>>> /* END TEMP */ >>>>> - rdma_destroy_qp(ia->ri_id); >>>>> + if (ia->ri_id->qp) { >>>>> + rdma_destroy_qp(ia->ri_id); >>>>> + } >>>> Nit: No need for braces here. >>>> >>>>> rdma_destroy_id(ia->ri_id); >>>>> ia->ri_id = id; >>>>> } >>>>> @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >>>>> frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >>>>> DECR_CQCOUNT(&r_xprt->rx_ep); >>> I don't think you can DECR_CQCOUNT, then exit without posting the send. That will screw up the completion counter and result in a transport hang, won't it? >>> >>>>> + if (!ia->ri_is->qp) { >>>>> + rc = -EINVAL; >>>>> + while (i--) >>>>> + rpcrdma_unmap_one(ia, --seg); >>>>> + goto out; >>>>> + } >>>> Instead of duplicating the rpcrdma_unmap_one() cleanup here, why not >>>> just do >>>> >>>> if (ia->ri_is->qp) >>>> rc = ib_post_send(...) 
>>>> else >>>> rc = -EINVAL; >>>> >>>> BTW: can we not simply test for ia->ri_is->qp before we even call rpcrdma_map_one() and hence bail out before we have to do any cleanup? >>>> >>>>> + >>>>> rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); >>>>> >>>>> if (rc) { >>>>> @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, >>>>> seg1->mr_len = len; >>>>> } >>>>> *nsegs = i; >>>>> +out: >>>>> return rc; >>>>> } >>>>> >>>>> @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, >>>>> invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; >>>>> DECR_CQCOUNT(&r_xprt->rx_ep); >>> Ditto. >>> >>>>> + if (!ia->ri_id->qp) >>>>> + return -EINVAL; >>>>> + >>>>> rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); >>>>> if (rc) >>>>> dprintk("RPC: %s: failed ib_post_send for invalidate," >>>>> @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, >>>>> send_wr.send_flags = IB_SEND_SIGNALED; >>>>> } >>> Ditto. >>> >>>>> + if (!ia->ri_id->qp) >>>>> + return -EINVAL; >>>>> + >>>>> rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); >>>>> if (rc) >>>>> dprintk("RPC: %s: ib_post_send returned %i\n", __func__, >>>>> @@ -1951,6 +1969,9 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, >>>>> rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); >>>>> >>>>> DECR_CQCOUNT(ep); >>> And here. 
>>> >>>>> + >>>>> + if (!ia->ri_id->qp) >>>>> + return -EINVAL; >>>>> rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); >>>>> >>>>> if (rc) >>>>> -- >>>>> 1.7.1 >>>>> >>>> _________________________________ >>>> Trond Myklebust >>>> Linux NFS client maintainer, PrimaryData >>>> trond.myklebust@primarydata.com >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" >>>> in the body of a message to majordomo@vger.kernel.org More majordomo >>>> info at http://vger.kernel.org/majordomo-info.html >>> -- >>> Chuck Lever >>> chuck[dot]lever[at]oracle[dot]com >>> >>> >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> -- >> Chuck Lever >> chuck[dot]lever[at]oracle[dot]com >> >> >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 10, 2014, at 1:42 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: >> However it seems to me the new (!ia->ri_id->qp) checks outside the connect >> logic are unnecessary. >> >> Clearly, as you noticed, the ib_post_{send,recv} verbs do not check that their >> "qp" argument is NULL before dereferencing it. >> >> But I don't understand how xprtrdma can post any operation if the transport >> isn't connected. In other words, how would it be possible to call >> rpcrdma_ep_post_recv() if the connect had failed and there was no QP? >> >> If disconnect wipes ia->ri_id->qp while there are still operations in progress, >> that would be the real bug. > Yes!, But I have seen one more kernel oops where QP is destroyed and xprtrdma still try to post in LOCAL_INV > WR on a NULL QP pointer and hence system crashes. So, I think what you mentioned is really happening. I’d like to see the crash data (back trace, etc), if you’ve got it. -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Alright here it is: <3>ocrdma_mbx_create_qp(0) rq_err <3>ocrdma_mbx_create_qp(0) sq_err <3>ocrdma_create_qp(0) error=-1 <1>BUG: unable to handle kernel NULL pointer dereference at (null) <1>IP: [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4>PGD 455942067 PUD 458356067 PMD 0 <4>Oops: 0000 [#1] SMP <4>last sysfs file: /sys/devices/pci0000:80/0000:80:03.0/0000:8b:00.1/class <4>CPU 1 <4>Modules linked in: nfs fscache xprtrdma(U) ocrdma(U) fuse ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter ip_tables bridge nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 sunrpc target_core_iblock target_core_file target_core_pscsi target_core_mod configfs bnx2fc cnic uio fcoe libfcoe 8021q garp libfc stp llc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) mlx4_en(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) vfat fat vhost_net macvtap macvlan tun kvm_intel kvm uinput sg cdc_ether usbnet mii microcode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support shpchp igb ptp pps_core ioatdma dca be2net(U) ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif lpfc scsi_transport_fc scsi_tgt ahci wmi megaraid_sas dm_mirror dm_region_hash dm_log dm_mod [last unloaded: speedstep_lib] <4> <4>Pid: 9204, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 IBM System x3650 M4 -[7915AC1]-/00J6528 <4>RIP: 0010:[<ffffffffa078e8ac>] [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4>RSP: 0018:ffff8804551877f8 EFLAGS: 00010217 <4>RAX: ffff880462243800 RBX: ffff88045646a028 RCX: 0000000000000000 <4>RDX: ffff880455187860 RSI: ffff8804551877f8 RDI: 0000000000000000 <4>RBP: ffff880455187888 R08: 0000000000000000 R09: 0000000000000000 <4>R10: 0000000000000000 R11: 
0000000000000000 R12: ffff88047601c598 <4>R13: ffff88047601c000 R14: ffff88045646a068 R15: 0000000000000000 <4>FS: 00007fd669be07a0(0000) GS:ffff880028220000(0000) knlGS:0000000000000000 <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b <4>CR2: 0000000000000000 CR3: 00000004557de000 CR4: 00000000000407e0 <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 <4>Process ls (pid: 9204, threadinfo ffff880455186000, task ffff880456735540) <4>Stack: <4> 0000000000000000 ffff88045584a700 0000000000000000 0000000a00000000 <4><d> 080424b400000002 0000000000000000 0000000000000000 0000000000000000 <4><d> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 <4>Call Trace: <4> [<ffffffffa078ea03>] rpcrdma_deregister_external+0x113/0x2d0 [xprtrdma] <4> [<ffffffffa078c4fc>] xprt_rdma_free+0x8c/0x210 [xprtrdma] <4> [<ffffffff81082014>] ? mod_timer+0x144/0x220 <4> [<ffffffffa07bba60>] xprt_release+0xc0/0x220 [sunrpc] <4> [<ffffffffa07c2f5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] <4> [<ffffffffa07c3a84>] __rpc_execute+0x174/0x350 [sunrpc] <4> [<ffffffff81096b47>] ? bit_waitqueue+0x17/0xd0 <4> [<ffffffffa07c3cc1>] rpc_execute+0x61/0xa0 [sunrpc] <4> [<ffffffffa07ba3a5>] rpc_run_task+0x75/0x90 [sunrpc] <4> [<ffffffffa07ba4c2>] rpc_call_sync+0x42/0x70 [sunrpc] <4> [<ffffffffa08b6f6d>] nfs3_rpc_wrapper.clone.0+0x3d/0xd0 [nfs] <4> [<ffffffffa08b734c>] nfs3_proc_access+0xbc/0x180 [nfs] <4> [<ffffffffa089f1e9>] nfs_do_access+0x199/0x3c0 [nfs] <4> [<ffffffffa07c6305>] ? generic_lookup_cred+0x15/0x20 [sunrpc] <4> [<ffffffffa07c52e0>] ? rpcauth_lookupcred+0x70/0xc0 [sunrpc] <4> [<ffffffffa089f4b8>] nfs_permission+0xa8/0x1e0 [nfs] <4> [<ffffffff8119053d>] __link_path_walk+0xad/0x1030 <4> [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 <4> [<ffffffff8132b1fa>] ? 
copy_termios+0x6a/0x80 <4> [<ffffffff8119174a>] path_walk+0x6a/0xe0 <4> [<ffffffff8119191b>] do_path_lookup+0x5b/0xa0 <4> [<ffffffff811925a7>] user_path_at+0x57/0xa0 <4> [<ffffffff81194ed2>] ? vfs_ioctl+0x22/0xa0 <4> [<ffffffff811869bc>] vfs_fstatat+0x3c/0x80 <4> [<ffffffff81085151>] ? do_sigaction+0x91/0x1d0 <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 <4> [<ffffffff8151311e>] ? do_page_fault+0x3e/0xa0 <4> [<ffffffff815104d5>] ? page_fault+0x25/0x30 <4> [<ffffffff8100b072>] system_call_fastpath+0x16/0x1b <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 8d 05 00 <1>RIP [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4> RSP <ffff8804551877f8> <4>CR2: 0000000000000000 > -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Chuck Lever > Sent: Thursday, April 10, 2014 11:21 PM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 10, 2014, at 1:42 PM, Devesh Sharma > <Devesh.Sharma@Emulex.Com> wrote: > > >> However it seems to me the new (!ia->ri_id->qp) checks outside the > >> connect logic are unnecessary. > >> > >> Clearly, as you noticed, the ib_post_{send,recv} verbs do not check > >> that their "qp" argument is NULL before dereferencing it. > >> > >> But I don't understand how xprtrdma can post any operation if the > >> transport isn't connected. In other words, how would it be possible > >> to call > >> rpcrdma_ep_post_recv() if the connect had failed and there was no QP? > >> > >> If disconnect wipes ia->ri_id->qp while there are still operations in > >> progress, that would be the real bug. 
> > Yes!, But I have seen one more kernel oops where QP is destroyed and > > xprtrdma still try to post in LOCAL_INV WR on a NULL QP pointer and hence > system crashes. So, I think what you mentioned is really happening. > > I'd like to see the crash data (back trace, etc), if you've got it. > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the > body of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Chuck Lever [mailto:chuck.lever@oracle.com] > Sent: Thursday, April 10, 2014 12:44 PM > To: Steve Wise > Cc: Devesh Sharma; Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 10, 2014, at 11:01 AM, Steve Wise <swise@opengridcomputing.com> wrote: > > > On 4/9/2014 7:26 PM, Chuck Lever wrote: > >> On Apr 9, 2014, at 7:56 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > >> > >>> Hi Chuk and Trond > >>> > >>> I will resend a v2 for this. > >>> What if ib_post_send() fails with immidate error, I that case also DECR_CQCOUNT() will > be called but no completion will be reported. Will that not cause any problems? > >> We should investigate whether an error return from ib_post_{send,recv} means there will > be no completion. But I've never seen these verbs fail in practice, so I'm not in a hurry to make > work for anyone! ;-) > > > > A synchronous failure from ib_post_* means the WR (or at least one of them if there were > > 1) failed and did not get submitted to HW. So there will be no completion for those that failed. > > OK. > > Our post operations are largely single WRs. Before we address CQCOUNT in error cases, we'd > have to deal with chained WRs. > > Chained WRs are used only when rpcrdma_register_frmr_external() finds an MR that hasn't > been invalidated. That's actually working around a FRMR re-use bug (commit 5c635e09). If the > underlying re-use problem was fixed, we could get rid of the chained WR in > register_frmr_external() (and we wouldn't need completions at all for FAST_REG_MR). > > But at 100,000 feet, if a post operation fails, that seems like a very serious issue. I wonder > whether we would be better off disconnecting and starting over in those cases. > I agree. The application is responsible to flow-control its posting of WRs to the SQs/RQs. 
So we should never see sync failures with ib_post_* due to over-running the queues. However, if the QP moves out of RTS for whatever reason, then a multi-threaded application could encounter sync failures because the QP exited RTS. Anyway, I agree: if there are any failures with ib_post_*, the application should kill the connection, (LOG SOMETHING!), and set up a new connection. My 2 centimes. :) Steve -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Devesh- On Apr 10, 2014, at 1:54 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > Alright here it is: > > <3>ocrdma_mbx_create_qp(0) rq_err > <3>ocrdma_mbx_create_qp(0) sq_err > <3>ocrdma_create_qp(0) error=-1 > <1>BUG: unable to handle kernel NULL pointer dereference at (null) > <1>IP: [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] As near as I can ascertain, the RDMA connection is torn down while an NFS workload is running, and a new connection cannot be completely set up. Can you try this: 1. On your client, # rpcdebug -m rpc -s call xprt sched trans 2. Reproduce the failure 3. Post the relevant contents of /var/log/messages (like the last RPC request or two before the BUG) And post the relevant line in /proc/mounts corresponding to your test NFS/RDMA mount. > <4>PGD 455942067 PUD 458356067 PMD 0 > <4>Oops: 0000 [#1] SMP > <4>last sysfs file: /sys/devices/pci0000:80/0000:80:03.0/0000:8b:00.1/class > <4>CPU 1 > <4>Modules linked in: nfs fscache xprtrdma(U) ocrdma(U) fuse ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter ip_tables bridge nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 sunrpc target_core_iblock target_core_file target_core_pscsi target_core_mod configfs bnx2fc cnic uio fcoe libfcoe 8021q garp libfc stp llc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) mlx4_en(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) vfat fat vhost_net macvtap macvlan tun kvm_intel kvm uinput sg cdc_ether usbnet mii microcode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support shpchp igb ptp pps_core ioatdma dca be2net(U) ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif lpfc scsi_transport_fc scsi_tgt ahci wm i megaraid_sas dm_mirror dm_region_hash dm_log 
dm_mod [last unloaded: speedstep_lib] > <4> > <4>Pid: 9204, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 IBM System x3650 M4 -[7915AC1]-/00J6528 > <4>RIP: 0010:[<ffffffffa078e8ac>] [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > <4>RSP: 0018:ffff8804551877f8 EFLAGS: 00010217 > <4>RAX: ffff880462243800 RBX: ffff88045646a028 RCX: 0000000000000000 > <4>RDX: ffff880455187860 RSI: ffff8804551877f8 RDI: 0000000000000000 > <4>RBP: ffff880455187888 R08: 0000000000000000 R09: 0000000000000000 > <4>R10: 0000000000000000 R11: 0000000000000000 R12: ffff88047601c598 > <4>R13: ffff88047601c000 R14: ffff88045646a068 R15: 0000000000000000 > <4>FS: 00007fd669be07a0(0000) GS:ffff880028220000(0000) knlGS:0000000000000000 > <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > <4>CR2: 0000000000000000 CR3: 00000004557de000 CR4: 00000000000407e0 > <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > <4>Process ls (pid: 9204, threadinfo ffff880455186000, task ffff880456735540) > <4>Stack: > <4> 0000000000000000 ffff88045584a700 0000000000000000 0000000a00000000 > <4><d> 080424b400000002 0000000000000000 0000000000000000 0000000000000000 > <4><d> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > <4>Call Trace: > <4> [<ffffffffa078ea03>] rpcrdma_deregister_external+0x113/0x2d0 [xprtrdma] > <4> [<ffffffffa078c4fc>] xprt_rdma_free+0x8c/0x210 [xprtrdma] > <4> [<ffffffff81082014>] ? mod_timer+0x144/0x220 > <4> [<ffffffffa07bba60>] xprt_release+0xc0/0x220 [sunrpc] > <4> [<ffffffffa07c2f5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] > <4> [<ffffffffa07c3a84>] __rpc_execute+0x174/0x350 [sunrpc] > <4> [<ffffffff81096b47>] ? 
bit_waitqueue+0x17/0xd0 > <4> [<ffffffffa07c3cc1>] rpc_execute+0x61/0xa0 [sunrpc] > <4> [<ffffffffa07ba3a5>] rpc_run_task+0x75/0x90 [sunrpc] > <4> [<ffffffffa07ba4c2>] rpc_call_sync+0x42/0x70 [sunrpc] > <4> [<ffffffffa08b6f6d>] nfs3_rpc_wrapper.clone.0+0x3d/0xd0 [nfs] > <4> [<ffffffffa08b734c>] nfs3_proc_access+0xbc/0x180 [nfs] > <4> [<ffffffffa089f1e9>] nfs_do_access+0x199/0x3c0 [nfs] > <4> [<ffffffffa07c6305>] ? generic_lookup_cred+0x15/0x20 [sunrpc] > <4> [<ffffffffa07c52e0>] ? rpcauth_lookupcred+0x70/0xc0 [sunrpc] > <4> [<ffffffffa089f4b8>] nfs_permission+0xa8/0x1e0 [nfs] > <4> [<ffffffff8119053d>] __link_path_walk+0xad/0x1030 > <4> [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 > <4> [<ffffffff8132b1fa>] ? copy_termios+0x6a/0x80 > <4> [<ffffffff8119174a>] path_walk+0x6a/0xe0 > <4> [<ffffffff8119191b>] do_path_lookup+0x5b/0xa0 > <4> [<ffffffff811925a7>] user_path_at+0x57/0xa0 > <4> [<ffffffff81194ed2>] ? vfs_ioctl+0x22/0xa0 > <4> [<ffffffff811869bc>] vfs_fstatat+0x3c/0x80 > <4> [<ffffffff81085151>] ? do_sigaction+0x91/0x1d0 > <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 > <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 > <4> [<ffffffff8151311e>] ? do_page_fault+0x3e/0xa0 > <4> [<ffffffff815104d5>] ? 
page_fault+0x25/0x30 > <4> [<ffffffff8100b072>] system_call_fastpath+0x16/0x1b > <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 8d 05 00 > <1>RIP [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > <4> RSP <ffff8804551877f8> > <4>CR2: 0000000000000000 > >> -----Original Message----- >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- >> owner@vger.kernel.org] On Behalf Of Chuck Lever >> Sent: Thursday, April 10, 2014 11:21 PM >> To: Devesh Sharma >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >> >> >> On Apr 10, 2014, at 1:42 PM, Devesh Sharma >> <Devesh.Sharma@Emulex.Com> wrote: >> >>>> However it seems to me the new (!ia->ri_id->qp) checks outside the >>>> connect logic are unnecessary. >>>> >>>> Clearly, as you noticed, the ib_post_{send,recv} verbs do not check >>>> that their "qp" argument is NULL before dereferencing it. >>>> >>>> But I don't understand how xprtrdma can post any operation if the >>>> transport isn't connected. In other words, how would it be possible >>>> to call >>>> rpcrdma_ep_post_recv() if the connect had failed and there was no QP? >>>> >>>> If disconnect wipes ia->ri_id->qp while there are still operations in >>>> progress, that would be the real bug. >>> Yes!, But I have seen one more kernel oops where QP is destroyed and >>> xprtrdma still try to post in LOCAL_INV WR on a NULL QP pointer and hence >> system crashes. So, I think what you missioned is really happening. >> >> I'd like to see the crash data (back trace, etc), if you've got it. 
>> >> -- >> Chuck Lever >> chuck[dot]lever[at]oracle[dot]com >> >> >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the >> body of a message to majordomo@vger.kernel.org More majordomo info at >> http://vger.kernel.org/majordomo-info.html -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chuck, Yes that is the case, Following is the trace I got. <4>RPC: 355 setting alarm for 60000 ms <4>RPC: 355 sync task going to sleep <4>RPC: xprt_rdma_connect_worker: reconnect <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 <3>ocrdma_mbx_create_qp(0) rq_err <3>ocrdma_mbx_create_qp(0) sq_err <3>ocrdma_create_qp(0) error=-1 <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 <4>RPC: 355 __rpc_wake_up_task (now 4296956756) <4>RPC: 355 disabling timer <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" <4>RPC: __rpc_wake_up_task done <4>RPC: xprt_rdma_connect_worker: exit <4>RPC: 355 sync task resuming <4>RPC: 355 xprt_connect_status: error 1 connecting to server 192.168.1.1 <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") <4>RPC: 355 call_connect_status (status -5) <4>RPC: 355 return 0, status -5 <4>RPC: 355 release task <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") <4>RPC: xprt_rdma_free: called on 0x(null) <1>BUG: unable to handle kernel NULL pointer dereference at (null) <1>IP: [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4>PGD 454554067 PUD 4665b7067 PMD 0 <4>Oops: 0000 [#1] SMP <4>last sysfs file: /sys/devices/pci0000:00/0000:00:03.0/0000:03:00.0/infiniband/ocrdma0/fwerr <4>CPU 6 <4>Modules linked in: xprtrdma(U) nfs lockd fscache auth_rpcgss nfs_acl ocrdma(U) be2net(U) ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter ip_tables bridge stp llc autofs4 des_generic ecb md4 nls_utf8 cifs sunrpc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) mlx4_en(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) vhost_net macvtap macvlan tun kvm uinput power_meter sg 
microcode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ptp pps_core ioatdma dca i7core_edac edac_core ext3 jbd mbcache sr_mod cdrom sd_mod crc_t10dif usb_storage pata_acpi ata_generic ata_piix mptsas mptscsih mptbase scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod [last unloaded: be2net] <4> <4>Pid: 3597, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 Cisco Systems Inc R210-2121605W/R210-2121605W <4>RIP: 0010:[<ffffffffa05b68ac>] [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4>RSP: 0018:ffff880465aff9a8 EFLAGS: 00010217 <4>RAX: ffff8804673fcc00 RBX: ffff880466578028 RCX: 0000000000000000 <4>RDX: ffff880465affa10 RSI: ffff880465aff9a8 RDI: 0000000000000000 <4>RBP: ffff880465affa38 R08: 0000000000000000 R09: 0000000000000000 <4>R10: 000000000000000f R11: 000000000000000f R12: ffff880454578598 <4>R13: ffff880454578000 R14: ffff880466578068 R15: 0000000000000000 <4>FS: 00007fe61f3107a0(0000) GS:ffff8800368c0000(0000) knlGS:0000000000000000 <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b <4>CR2: 0000000000000000 CR3: 000000046520a000 CR4: 00000000000007e0 <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 <4>Process ls (pid: 3597, threadinfo ffff880465afe000, task ffff8804639a5500) <4>Stack: <4> 0000000000000000 ffff880462287370 0000000000000000 0000000a00000000 <4><d> 0802dd3b00000002 0000000000000000 0000000000000000 0000000000000000 <4><d> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 <4>Call Trace: <4> [<ffffffff8109c97f>] ? up+0x2f/0x50 <4> [<ffffffffa05b6a03>] rpcrdma_deregister_external+0x113/0x2d0 [xprtrdma] <4> [<ffffffff8150d0d1>] ? printk+0x41/0x48 <4> [<ffffffffa05b44fc>] xprt_rdma_free+0x8c/0x210 [xprtrdma] <4> [<ffffffff81082014>] ? 
mod_timer+0x144/0x220 <4> [<ffffffffa05c8a60>] xprt_release+0xc0/0x220 [sunrpc] <4> [<ffffffffa05cff5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] <4> [<ffffffffa05d0a84>] __rpc_execute+0x174/0x350 [sunrpc] <4> [<ffffffff8150d0d1>] ? printk+0x41/0x48 <4> [<ffffffff81096b47>] ? bit_waitqueue+0x17/0xd0 <4> [<ffffffffa05d0cc1>] rpc_execute+0x61/0xa0 [sunrpc] <4> [<ffffffffa05c73a5>] rpc_run_task+0x75/0x90 [sunrpc] <4> [<ffffffffa05c74c2>] rpc_call_sync+0x42/0x70 [sunrpc] <4> [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 <4> [<ffffffffa074e030>] _nfs4_call_sync+0x30/0x40 [nfs] <4> [<ffffffffa07461dc>] _nfs4_proc_getattr+0xac/0xc0 [nfs] <4> [<ffffffffa07494be>] nfs4_proc_getattr+0x4e/0x70 [nfs] <4> [<ffffffffa072f3e3>] __nfs_revalidate_inode+0xe3/0x220 [nfs] <4> [<ffffffffa072fdb6>] nfs_getattr+0xb6/0x120 [nfs] <4> [<ffffffff81186951>] vfs_getattr+0x51/0x80 <4> [<ffffffff811869e0>] vfs_fstatat+0x60/0x80 <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 <4> [<ffffffff810dc817>] ? 
audit_syscall_entry+0x1d7/0x200 <4> [<ffffffff8100b072>] system_call_fastpath+0x16/0x1b <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 dd 03 00 <1>RIP [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4> RSP <ffff880465aff9a8> <4>CR2: 0000000000000000 > -----Original Message----- > From: Chuck Lever [mailto:chuck.lever@oracle.com] > Sent: Friday, April 11, 2014 1:24 AM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > Hi Devesh- > > On Apr 10, 2014, at 1:54 PM, Devesh Sharma > <Devesh.Sharma@Emulex.Com> wrote: > > > Alright here it is: > > > > <3>ocrdma_mbx_create_qp(0) rq_err > > <3>ocrdma_mbx_create_qp(0) sq_err > > <3>ocrdma_create_qp(0) error=-1 > > <1>BUG: unable to handle kernel NULL pointer dereference at (null) > > <1>IP: [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 > > [xprtrdma] > > As near as I can ascertain, the RDMA connection is torn down while an NFS > workload is running, and a new connection cannot be completely set up. > > Can you try this: > > 1. On your client, # rpcdebug -m rpc -s call xprt sched trans > > 2. Reproduce the failure > > 3. Post the relevant contents of /var/log/messages (like the last RPC request > or two before the BUG) > > And post the relevant line in /proc/mounts corresponding to your test > NFS/RDMA mount. 
> > > > <4>PGD 455942067 PUD 458356067 PMD 0 > > <4>Oops: 0000 [#1] SMP > > <4>last sysfs file: > > /sys/devices/pci0000:80/0000:80:03.0/0000:8b:00.1/class > > <4>CPU 1 > > <4>Modules linked in: nfs fscache xprtrdma(U) ocrdma(U) fuse > > ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE > > iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state > > nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter > > ip_tables bridge nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 > > sunrpc target_core_iblock target_core_file target_core_pscsi > > target_core_mod configfs bnx2fc cnic uio fcoe libfcoe 8021q garp libfc > > stp llc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) > > ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c > > iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) mlx4_en(U) > > mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) > > vfat fat vhost_net macvtap macvlan tun kvm_intel kvm uinput sg > > cdc_ether usbnet mii microcode i2c_i801 i2c_core iTCO_wdt > > iTCO_vendor_support shpchp igb ptp pps_core ioatdma dca be2net(U) > ext4 > > mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif lpfc scsi_transport_fc > > scsi_tgt ahci wmi megaraid_sas dm_mirror dm_region_hash dm_log > dm_mod > > [last unloaded: speedstep_lib] <4> > > <4>Pid: 9204, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 IBM System > > x3650 M4 -[7915AC1]-/00J6528 > > <4>RIP: 0010:[<ffffffffa078e8ac>] [<ffffffffa078e8ac>] > > rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > > <4>RSP: 0018:ffff8804551877f8 EFLAGS: 00010217 > > <4>RAX: ffff880462243800 RBX: ffff88045646a028 RCX: 0000000000000000 > > <4>RDX: ffff880455187860 RSI: ffff8804551877f8 RDI: 0000000000000000 > > <4>RBP: ffff880455187888 R08: 0000000000000000 R09: 0000000000000000 > > <4>R10: 0000000000000000 R11: 0000000000000000 R12: ffff88047601c598 > > <4>R13: ffff88047601c000 R14: ffff88045646a068 R15: 0000000000000000 > > <4>FS: 00007fd669be07a0(0000) 
GS:ffff880028220000(0000) > > knlGS:0000000000000000 > > <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > > <4>CR2: 0000000000000000 CR3: 00000004557de000 CR4: 00000000000407e0 > > <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > > <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > > <4>Process ls (pid: 9204, threadinfo ffff880455186000, task > > ffff880456735540) > > <4>Stack: > > <4> 0000000000000000 ffff88045584a700 0000000000000000 > > 0000000a00000000 <4><d> 080424b400000002 0000000000000000 > > 0000000000000000 0000000000000000 <4><d> 0000000000000000 > > 0000000000000000 0000000000000000 0000000000000000 <4>Call Trace: > > <4> [<ffffffffa078ea03>] rpcrdma_deregister_external+0x113/0x2d0 > > [xprtrdma] <4> [<ffffffffa078c4fc>] xprt_rdma_free+0x8c/0x210 > > [xprtrdma] <4> [<ffffffff81082014>] ? mod_timer+0x144/0x220 <4> > > [<ffffffffa07bba60>] xprt_release+0xc0/0x220 [sunrpc] <4> > > [<ffffffffa07c2f5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] <4> > > [<ffffffffa07c3a84>] __rpc_execute+0x174/0x350 [sunrpc] <4> > > [<ffffffff81096b47>] ? bit_waitqueue+0x17/0xd0 <4> > > [<ffffffffa07c3cc1>] rpc_execute+0x61/0xa0 [sunrpc] <4> > > [<ffffffffa07ba3a5>] rpc_run_task+0x75/0x90 [sunrpc] <4> > > [<ffffffffa07ba4c2>] rpc_call_sync+0x42/0x70 [sunrpc] <4> > > [<ffffffffa08b6f6d>] nfs3_rpc_wrapper.clone.0+0x3d/0xd0 [nfs] <4> > > [<ffffffffa08b734c>] nfs3_proc_access+0xbc/0x180 [nfs] <4> > > [<ffffffffa089f1e9>] nfs_do_access+0x199/0x3c0 [nfs] <4> > > [<ffffffffa07c6305>] ? generic_lookup_cred+0x15/0x20 [sunrpc] <4> > > [<ffffffffa07c52e0>] ? rpcauth_lookupcred+0x70/0xc0 [sunrpc] <4> > > [<ffffffffa089f4b8>] nfs_permission+0xa8/0x1e0 [nfs] <4> > > [<ffffffff8119053d>] __link_path_walk+0xad/0x1030 <4> > > [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 <4> > > [<ffffffff8132b1fa>] ? 
copy_termios+0x6a/0x80 <4> [<ffffffff8119174a>] > > path_walk+0x6a/0xe0 <4> [<ffffffff8119191b>] > do_path_lookup+0x5b/0xa0 > > <4> [<ffffffff811925a7>] user_path_at+0x57/0xa0 <4> > > [<ffffffff81194ed2>] ? vfs_ioctl+0x22/0xa0 <4> [<ffffffff811869bc>] > > vfs_fstatat+0x3c/0x80 <4> [<ffffffff81085151>] ? > > do_sigaction+0x91/0x1d0 <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 > > <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 <4> > > [<ffffffff8151311e>] ? do_page_fault+0x3e/0xa0 <4> > > [<ffffffff815104d5>] ? page_fault+0x25/0x30 <4> [<ffffffff8100b072>] > > system_call_fastpath+0x16/0x1b > > <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff > > ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 > > <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 8d 05 00 > > <1>RIP [<ffffffffa078e8ac>] > > rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4> RSP > > <ffff8804551877f8> > > <4>CR2: 0000000000000000 > > > >> -----Original Message----- > >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > >> owner@vger.kernel.org] On Behalf Of Chuck Lever > >> Sent: Thursday, April 10, 2014 11:21 PM > >> To: Devesh Sharma > >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > >> Myklebust > >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > >> > >> > >> On Apr 10, 2014, at 1:42 PM, Devesh Sharma > <Devesh.Sharma@Emulex.Com> > >> wrote: > >> > >>>> However it seems to me the new (!ia->ri_id->qp) checks outside the > >>>> connect logic are unnecessary. > >>>> > >>>> Clearly, as you noticed, the ib_post_{send,recv} verbs do not check > >>>> that their "qp" argument is NULL before dereferencing it. > >>>> > >>>> But I don't understand how xprtrdma can post any operation if the > >>>> transport isn't connected. In other words, how would it be possible > >>>> to call > >>>> rpcrdma_ep_post_recv() if the connect had failed and there was no > QP? 
> >>>> > >>>> If disconnect wipes ia->ri_id->qp while there are still operations > >>>> in progress, that would be the real bug. > >>> Yes!, But I have seen one more kernel oops where QP is destroyed and > >>> xprtrdma still try to post in LOCAL_INV WR on a NULL QP pointer and > >>> hence > >> system crashes. So, I think what you missioned is really happening. > >> > >> I'd like to see the crash data (back trace, etc), if you've got it. > >> > >> -- > >> Chuck Lever > >> chuck[dot]lever[at]oracle[dot]com > >> > >> > >> > >> -- > >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" > >> in the body of a message to majordomo@vger.kernel.org More > majordomo > >> info at http://vger.kernel.org/majordomo-info.html > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 11, 2014, at 7:51 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > Hi Chuck, > Yes that is the case, Following is the trace I got. > > <4>RPC: 355 setting alarm for 60000 ms > <4>RPC: 355 sync task going to sleep > <4>RPC: xprt_rdma_connect_worker: reconnect > <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 > <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 > <3>ocrdma_mbx_create_qp(0) rq_err > <3>ocrdma_mbx_create_qp(0) sq_err > <3>ocrdma_create_qp(0) error=-1 > <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 > <4>RPC: 355 __rpc_wake_up_task (now 4296956756) > <4>RPC: 355 disabling timer > <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" > <4>RPC: __rpc_wake_up_task done > <4>RPC: xprt_rdma_connect_worker: exit > <4>RPC: 355 sync task resuming > <4>RPC: 355 xprt_connect_status: error 1 connecting to server 192.168.1.1 xprtrdma’s connect worker is returning “1” instead of a negative errno. That’s the bug that triggers this chain of events. RPC tasks waiting for the reconnect are awoken. xprt_connect_status() doesn’t recognize a tk_status of “1”, so it turns it into -EIO, and kills each waiting RPC task. > <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > <4>RPC: 355 call_connect_status (status -5) > <4>RPC: 355 return 0, status -5 > <4>RPC: 355 release task > <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > <4>RPC: xprt_rdma_free: called on 0x(null) And as part of exiting, the RPC task has to free its buffer. Not exactly sure why req->rl_nchunks is not zero for an NFSv4 GETATTR. This is why rpcrdma_deregister_external() is invoked here. Eventually this gets around to attempting to post a LOCAL_INV WR with ->qp set to NULL, and the panic below occurs. But xprtrdma has gone off the rails well before this (see above). I’ll look at this more on Monday. 
> <1>BUG: unable to handle kernel NULL pointer dereference at (null) > <1>IP: [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > <4>PGD 454554067 PUD 4665b7067 PMD 0 > <4>Oops: 0000 [#1] SMP > <4>last sysfs file: /sys/devices/pci0000:00/0000:00:03.0/0000:03:00.0/infiniband/ocrdma0/fwerr > <4>CPU 6 > <4>Modules linked in: xprtrdma(U) nfs lockd fscache auth_rpcgss nfs_acl ocrdma(U) be2net(U) ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter ip_tables bridge stp llc autofs4 des_generic ecb md4 nls_utf8 cifs sunrpc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) mlx4_en(U) mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) vhost_net macvtap macvlan tun kvm uinput power_meter sg microcode i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support igb ptp pps_core ioatdma dca i7core_edac edac_core ext3 jbd mbcache sr_mod cdrom sd_mod crc_t10dif usb_storage pata_acpi ata_generic ata_piix mptsas mptscsih mptbase scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod [last unloaded: be2net] > <4> > <4>Pid: 3597, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 Cisco Systems Inc R210-2121605W/R210-2121605W > <4>RIP: 0010:[<ffffffffa05b68ac>] [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > <4>RSP: 0018:ffff880465aff9a8 EFLAGS: 00010217 > <4>RAX: ffff8804673fcc00 RBX: ffff880466578028 RCX: 0000000000000000 > <4>RDX: ffff880465affa10 RSI: ffff880465aff9a8 RDI: 0000000000000000 > <4>RBP: ffff880465affa38 R08: 0000000000000000 R09: 0000000000000000 > <4>R10: 000000000000000f R11: 000000000000000f R12: ffff880454578598 > <4>R13: ffff880454578000 R14: ffff880466578068 R15: 0000000000000000 > <4>FS: 00007fe61f3107a0(0000) GS:ffff8800368c0000(0000) 
knlGS:0000000000000000 > <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > <4>CR2: 0000000000000000 CR3: 000000046520a000 CR4: 00000000000007e0 > <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > <4>Process ls (pid: 3597, threadinfo ffff880465afe000, task ffff8804639a5500) > <4>Stack: > <4> 0000000000000000 ffff880462287370 0000000000000000 0000000a00000000 > <4><d> 0802dd3b00000002 0000000000000000 0000000000000000 0000000000000000 > <4><d> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > <4>Call Trace: > <4> [<ffffffff8109c97f>] ? up+0x2f/0x50 > <4> [<ffffffffa05b6a03>] rpcrdma_deregister_external+0x113/0x2d0 [xprtrdma] > <4> [<ffffffff8150d0d1>] ? printk+0x41/0x48 > <4> [<ffffffffa05b44fc>] xprt_rdma_free+0x8c/0x210 [xprtrdma] > <4> [<ffffffff81082014>] ? mod_timer+0x144/0x220 > <4> [<ffffffffa05c8a60>] xprt_release+0xc0/0x220 [sunrpc] > <4> [<ffffffffa05cff5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] > <4> [<ffffffffa05d0a84>] __rpc_execute+0x174/0x350 [sunrpc] > <4> [<ffffffff8150d0d1>] ? printk+0x41/0x48 > <4> [<ffffffff81096b47>] ? bit_waitqueue+0x17/0xd0 > <4> [<ffffffffa05d0cc1>] rpc_execute+0x61/0xa0 [sunrpc] > <4> [<ffffffffa05c73a5>] rpc_run_task+0x75/0x90 [sunrpc] > <4> [<ffffffffa05c74c2>] rpc_call_sync+0x42/0x70 [sunrpc] > <4> [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 > <4> [<ffffffffa074e030>] _nfs4_call_sync+0x30/0x40 [nfs] > <4> [<ffffffffa07461dc>] _nfs4_proc_getattr+0xac/0xc0 [nfs] > <4> [<ffffffffa07494be>] nfs4_proc_getattr+0x4e/0x70 [nfs] > <4> [<ffffffffa072f3e3>] __nfs_revalidate_inode+0xe3/0x220 [nfs] > <4> [<ffffffffa072fdb6>] nfs_getattr+0xb6/0x120 [nfs] > <4> [<ffffffff81186951>] vfs_getattr+0x51/0x80 > <4> [<ffffffff811869e0>] vfs_fstatat+0x60/0x80 > <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 > <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 > <4> [<ffffffff810dc817>] ? 
audit_syscall_entry+0x1d7/0x200 > <4> [<ffffffff8100b072>] system_call_fastpath+0x16/0x1b > <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 dd 03 00 > <1>RIP [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > <4> RSP <ffff880465aff9a8> > <4>CR2: 0000000000000000 > >> -----Original Message----- >> From: Chuck Lever [mailto:chuck.lever@oracle.com] >> Sent: Friday, April 11, 2014 1:24 AM >> To: Devesh Sharma >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >> >> Hi Devesh- >> >> On Apr 10, 2014, at 1:54 PM, Devesh Sharma >> <Devesh.Sharma@Emulex.Com> wrote: >> >>> Alright here it is: >>> >>> <3>ocrdma_mbx_create_qp(0) rq_err >>> <3>ocrdma_mbx_create_qp(0) sq_err >>> <3>ocrdma_create_qp(0) error=-1 >>> <1>BUG: unable to handle kernel NULL pointer dereference at (null) >>> <1>IP: [<ffffffffa078e8ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 >>> [xprtrdma] >> >> As near as I can ascertain, the RDMA connection is torn down while an NFS >> workload is running, and a new connection cannot be completely set up. >> >> Can you try this: >> >> 1. On your client, # rpcdebug -m rpc -s call xprt sched trans >> >> 2. Reproduce the failure >> >> 3. Post the relevant contents of /var/log/messages (like the last RPC request >> or two before the BUG) >> >> And post the relevant line in /proc/mounts corresponding to your test >> NFS/RDMA mount. 
>> >> >>> <4>PGD 455942067 PUD 458356067 PMD 0 >>> <4>Oops: 0000 [#1] SMP >>> <4>last sysfs file: >>> /sys/devices/pci0000:80/0000:80:03.0/0000:8b:00.1/class >>> <4>CPU 1 >>> <4>Modules linked in: nfs fscache xprtrdma(U) ocrdma(U) fuse >>> ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE >>> iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state >>> nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter >>> ip_tables bridge nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 >>> sunrpc target_core_iblock target_core_file target_core_pscsi >>> target_core_mod configfs bnx2fc cnic uio fcoe libfcoe 8021q garp libfc >>> stp llc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) >>> ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c >>> iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) mlx4_en(U) >>> mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) >>> vfat fat vhost_net macvtap macvlan tun kvm_intel kvm uinput sg >>> cdc_ether usbnet mii microcode i2c_i801 i2c_core iTCO_wdt >>> iTCO_vendor_support shpchp igb ptp pps_core ioatdma dca be2net(U) >> ext4 >>> mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif lpfc scsi_transport_fc >>> scsi_tgt ahci wmi megaraid_sas dm_mirror dm_region_hash dm_log >> dm_mod >>> [last unloaded: speedstep_lib] <4> >>> <4>Pid: 9204, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 IBM System >>> x3650 M4 -[7915AC1]-/00J6528 >>> <4>RIP: 0010:[<ffffffffa078e8ac>] [<ffffffffa078e8ac>] >>> rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] >>> <4>RSP: 0018:ffff8804551877f8 EFLAGS: 00010217 >>> <4>RAX: ffff880462243800 RBX: ffff88045646a028 RCX: 0000000000000000 >>> <4>RDX: ffff880455187860 RSI: ffff8804551877f8 RDI: 0000000000000000 >>> <4>RBP: ffff880455187888 R08: 0000000000000000 R09: 0000000000000000 >>> <4>R10: 0000000000000000 R11: 0000000000000000 R12: ffff88047601c598 >>> <4>R13: ffff88047601c000 R14: ffff88045646a068 R15: 0000000000000000 >>> <4>FS: 00007fd669be07a0(0000) 
GS:ffff880028220000(0000) >>> knlGS:0000000000000000 >>> <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b >>> <4>CR2: 0000000000000000 CR3: 00000004557de000 CR4: 00000000000407e0 >>> <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 >>> <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 >>> <4>Process ls (pid: 9204, threadinfo ffff880455186000, task >>> ffff880456735540) >>> <4>Stack: >>> <4> 0000000000000000 ffff88045584a700 0000000000000000 >>> 0000000a00000000 <4><d> 080424b400000002 0000000000000000 >>> 0000000000000000 0000000000000000 <4><d> 0000000000000000 >>> 0000000000000000 0000000000000000 0000000000000000 <4>Call Trace: >>> <4> [<ffffffffa078ea03>] rpcrdma_deregister_external+0x113/0x2d0 >>> [xprtrdma] <4> [<ffffffffa078c4fc>] xprt_rdma_free+0x8c/0x210 >>> [xprtrdma] <4> [<ffffffff81082014>] ? mod_timer+0x144/0x220 <4> >>> [<ffffffffa07bba60>] xprt_release+0xc0/0x220 [sunrpc] <4> >>> [<ffffffffa07c2f5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] <4> >>> [<ffffffffa07c3a84>] __rpc_execute+0x174/0x350 [sunrpc] <4> >>> [<ffffffff81096b47>] ? bit_waitqueue+0x17/0xd0 <4> >>> [<ffffffffa07c3cc1>] rpc_execute+0x61/0xa0 [sunrpc] <4> >>> [<ffffffffa07ba3a5>] rpc_run_task+0x75/0x90 [sunrpc] <4> >>> [<ffffffffa07ba4c2>] rpc_call_sync+0x42/0x70 [sunrpc] <4> >>> [<ffffffffa08b6f6d>] nfs3_rpc_wrapper.clone.0+0x3d/0xd0 [nfs] <4> >>> [<ffffffffa08b734c>] nfs3_proc_access+0xbc/0x180 [nfs] <4> >>> [<ffffffffa089f1e9>] nfs_do_access+0x199/0x3c0 [nfs] <4> >>> [<ffffffffa07c6305>] ? generic_lookup_cred+0x15/0x20 [sunrpc] <4> >>> [<ffffffffa07c52e0>] ? rpcauth_lookupcred+0x70/0xc0 [sunrpc] <4> >>> [<ffffffffa089f4b8>] nfs_permission+0xa8/0x1e0 [nfs] <4> >>> [<ffffffff8119053d>] __link_path_walk+0xad/0x1030 <4> >>> [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 <4> >>> [<ffffffff8132b1fa>] ? 
copy_termios+0x6a/0x80 <4> [<ffffffff8119174a>] >>> path_walk+0x6a/0xe0 <4> [<ffffffff8119191b>] >> do_path_lookup+0x5b/0xa0 >>> <4> [<ffffffff811925a7>] user_path_at+0x57/0xa0 <4> >>> [<ffffffff81194ed2>] ? vfs_ioctl+0x22/0xa0 <4> [<ffffffff811869bc>] >>> vfs_fstatat+0x3c/0x80 <4> [<ffffffff81085151>] ? >>> do_sigaction+0x91/0x1d0 <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 >>> <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 <4> >>> [<ffffffff8151311e>] ? do_page_fault+0x3e/0xa0 <4> >>> [<ffffffff815104d5>] ? page_fault+0x25/0x30 <4> [<ffffffff8100b072>] >>> system_call_fastpath+0x16/0x1b >>> <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff >>> ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 >>> <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 8d 05 00 >>> <1>RIP [<ffffffffa078e8ac>] >>> rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4> RSP >>> <ffff8804551877f8> >>> <4>CR2: 0000000000000000 >>> >>>> -----Original Message----- >>>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- >>>> owner@vger.kernel.org] On Behalf Of Chuck Lever >>>> Sent: Thursday, April 10, 2014 11:21 PM >>>> To: Devesh Sharma >>>> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond >>>> Myklebust >>>> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >>>> >>>> >>>> On Apr 10, 2014, at 1:42 PM, Devesh Sharma >> <Devesh.Sharma@Emulex.Com> >>>> wrote: >>>> >>>>>> However it seems to me the new (!ia->ri_id->qp) checks outside the >>>>>> connect logic are unnecessary. >>>>>> >>>>>> Clearly, as you noticed, the ib_post_{send,recv} verbs do not check >>>>>> that their "qp" argument is NULL before dereferencing it. >>>>>> >>>>>> But I don't understand how xprtrdma can post any operation if the >>>>>> transport isn't connected. In other words, how would it be possible >>>>>> to call >>>>>> rpcrdma_ep_post_recv() if the connect had failed and there was no >> QP? 
>>>>>> >>>>>> If disconnect wipes ia->ri_id->qp while there are still operations >>>>>> in progress, that would be the real bug. >>>>> Yes!, But I have seen one more kernel oops where QP is destroyed and >>>>> xprtrdma still try to post in LOCAL_INV WR on a NULL QP pointer and >>>>> hence >>>> system crashes. So, I think what you missioned is really happening. >>>> >>>> I'd like to see the crash data (back trace, etc), if you've got it. >>>> >>>> -- >>>> Chuck Lever >>>> chuck[dot]lever[at]oracle[dot]com >>>> >>>> >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" >>>> in the body of a message to majordomo@vger.kernel.org More >> majordomo >>>> info at http://vger.kernel.org/majordomo-info.html >> >> -- >> Chuck Lever >> chuck[dot]lever[at]oracle[dot]com >> >> > -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Devesh- On Apr 13, 2014, at 12:01 AM, Chuck Lever <chuck.lever@oracle.com> wrote: > > On Apr 11, 2014, at 7:51 PM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > >> Hi Chuck, >> Yes that is the case, Following is the trace I got. >> >> <4>RPC: 355 setting alarm for 60000 ms >> <4>RPC: 355 sync task going to sleep >> <4>RPC: xprt_rdma_connect_worker: reconnect >> <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 >> <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 >> <3>ocrdma_mbx_create_qp(0) rq_err >> <3>ocrdma_mbx_create_qp(0) sq_err >> <3>ocrdma_create_qp(0) error=-1 >> <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 >> <4>RPC: 355 __rpc_wake_up_task (now 4296956756) >> <4>RPC: 355 disabling timer >> <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" >> <4>RPC: __rpc_wake_up_task done >> <4>RPC: xprt_rdma_connect_worker: exit >> <4>RPC: 355 sync task resuming >> <4>RPC: 355 xprt_connect_status: error 1 connecting to server 192.168.1.1 > > xprtrdma’s connect worker is returning “1” instead of a negative errno. > That’s the bug that triggers this chain of events. rdma_create_qp() has returned -EPERM. There’s very little xprtrdma can do if the provider won’t even create a QP. That seems like a rare and fatal problem. For the moment, I’m inclined to think that a panic is correct behavior, since there are outstanding registered memory regions that cannot be cleaned up without a QP (see below). > RPC tasks waiting for the reconnect are awoken. xprt_connect_status() doesn’t > recognize a tk_status of “1”, so it turns it into -EIO, and kills each waiting > RPC task. >> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") >> <4>RPC: 355 call_connect_status (status -5) >> <4>RPC: 355 return 0, status -5 >> <4>RPC: 355 release task >> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") >> <4>RPC: xprt_rdma_free: called on 0x(null) > > And as part of exiting, the RPC task has to free its buffer. 
> > Not exactly sure why req->rl_nchunks is not zero for an NFSv4 GETATTR. > This is why rpcrdma_deregister_external() is invoked here. > > Eventually this gets around to attempting to post a LOCAL_INV WR with > ->qp set to NULL, and the panic below occurs. This is a somewhat different problem. Not only do we need to have a good ->qp here, but it has to be connected and in the ready-to-send state before LOCAL_INV work requests can be posted. The implication of this is that if a server disconnects (server crash or network partition), the client is stuck waiting for the server to come back before it can deregister memory and retire outstanding RPC requests. This is bad for ^C or soft timeouts or umount … when the server is unavailable. So I feel we need better clean-up when the client cannot reconnect. Probably deregistering RPC chunk MR’s before finally tearing down the old QP is what is necessary. I’ll play around with this idea. -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chuck > -----Original Message----- > From: Chuck Lever [mailto:chuck.lever@oracle.com] > Sent: Tuesday, April 15, 2014 2:24 AM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > Hi Devesh- > > > On Apr 13, 2014, at 12:01 AM, Chuck Lever <chuck.lever@oracle.com> wrote: > > > > > On Apr 11, 2014, at 7:51 PM, Devesh Sharma > <Devesh.Sharma@Emulex.Com> wrote: > > > >> Hi Chuck, > >> Yes that is the case, Following is the trace I got. > >> > >> <4>RPC: 355 setting alarm for 60000 ms > >> <4>RPC: 355 sync task going to sleep > >> <4>RPC: xprt_rdma_connect_worker: reconnect > >> <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 > >> <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 > >> <3>ocrdma_mbx_create_qp(0) rq_err > >> <3>ocrdma_mbx_create_qp(0) sq_err > >> <3>ocrdma_create_qp(0) error=-1 > >> <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 > >> <4>RPC: 355 __rpc_wake_up_task (now 4296956756) > >> <4>RPC: 355 disabling timer > >> <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" > >> <4>RPC: __rpc_wake_up_task done > >> <4>RPC: xprt_rdma_connect_worker: exit > >> <4>RPC: 355 sync task resuming > >> <4>RPC: 355 xprt_connect_status: error 1 connecting to server > 192.168.1.1 > > > > xprtrdma's connect worker is returning "1" instead of a negative errno. > > That's the bug that triggers this chain of events. > > rdma_create_qp() has returned -EPERM. There's very little xprtrdma can do > if the provider won't even create a QP. That seems like a rare and fatal > problem. > > For the moment, I'm inclined to think that a panic is correct behavior, since > there are outstanding registered memory regions that cannot be cleaned up > without a QP (see below). Well, I think the system should still remain alive. This will definitely cause a memory leak. But QP create failure does not mean system should also crash. 
I think for the time being it is worth to put Null pointer checks to prevent system from crash. > > > > RPC tasks waiting for the reconnect are awoken. xprt_connect_status() > > doesn't recognize a tk_status of "1", so it turns it into -EIO, and > > kills each waiting RPC task. > > >> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > >> <4>RPC: 355 call_connect_status (status -5) > >> <4>RPC: 355 return 0, status -5 > >> <4>RPC: 355 release task > >> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > >> <4>RPC: xprt_rdma_free: called on 0x(null) > > > > And as part of exiting, the RPC task has to free its buffer. > > > > Not exactly sure why req->rl_nchunks is not zero for an NFSv4 GETATTR. > > This is why rpcrdma_deregister_external() is invoked here. > > > > Eventually this gets around to attempting to post a LOCAL_INV WR with > > ->qp set to NULL, and the panic below occurs. > > This is a somewhat different problem. > > Not only do we need to have a good ->qp here, but it has to be connected > and in the ready-to-send state before LOCAL_INV work requests can be > posted. > > The implication of this is that if a server disconnects (server crash or network > partition), the client is stuck waiting for the server to come back before it can > deregister memory and retire outstanding RPC requests. This is a real problem to solve. In the existing state of xprtrdma code. Even a Server reboot will cause Client to crash. > > This is bad for ^C or soft timeouts or umount ... when the server is > unavailable. > > So I feel we need better clean-up when the client cannot reconnect. Unreg old frmrs with the help of new QP? Until the new QP is created with same PD and FRMR is bound to PD and not to QP. > Probably deregistering RPC chunk MR's before finally tearing down the old > QP is what is necessary. We need a scheme that handles Memory registrations separately from connection establishment and do book-keeping of which region is Registered and which one is not. 
Once the new connection is back, either start using the old mem-regions as is, or invalidate the old ones and re-register on the new QP. What is the existing scheme xprtrdma is following? Is it the same? I think it is possible to create an FRMR on qp->qp_num = x while invalidating on qp->qp_num = y, as long as qpx.pd == qpy.pd > > I'll play around with this idea. > > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Chuck Lever > Sent: Sunday, April 13, 2014 9:31 AM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 11, 2014, at 7:51 PM, Devesh Sharma > <Devesh.Sharma@Emulex.Com> wrote: > > > Hi Chuck, > > Yes that is the case, Following is the trace I got. > > > > <4>RPC: 355 setting alarm for 60000 ms > > <4>RPC: 355 sync task going to sleep > > <4>RPC: xprt_rdma_connect_worker: reconnect > > <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 > > <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 > > <3>ocrdma_mbx_create_qp(0) rq_err > > <3>ocrdma_mbx_create_qp(0) sq_err > > <3>ocrdma_create_qp(0) error=-1 > > <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 > > <4>RPC: 355 __rpc_wake_up_task (now 4296956756) > > <4>RPC: 355 disabling timer > > <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" > > <4>RPC: __rpc_wake_up_task done > > <4>RPC: xprt_rdma_connect_worker: exit > > <4>RPC: 355 sync task resuming > > <4>RPC: 355 xprt_connect_status: error 1 connecting to server 192.168.1.1 > > xprtrdma's connect worker is returning "1" instead of a negative errno. > That's the bug that triggers this chain of events. default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status -------------->Mind the Minus(-) sign here., xprt->servername); xprt_release_write(xprt, task); task->tk_status = -EIO; So, ep_connect _is_ returning -EPERM, but xprt is printing -(-EPERM). Off-course EPERM is not handled here which turns into -EIO. > > RPC tasks waiting for the reconnect are awoken. xprt_connect_status() > doesn't recognize a tk_status of "1", so it turns it into -EIO, and kills each > waiting RPC task. 
> > > <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > > <4>RPC: 355 call_connect_status (status -5) > > <4>RPC: 355 return 0, status -5 > > <4>RPC: 355 release task > > <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > > <4>RPC: xprt_rdma_free: called on 0x(null) > > And as part of exiting, the RPC task has to free its buffer. > > Not exactly sure why req->rl_nchunks is not zero for an NFSv4 GETATTR. > This is why rpcrdma_deregister_external() is invoked here. > > Eventually this gets around to attempting to post a LOCAL_INV WR with > ->qp set to NULL, and the panic below occurs. But xprtrdma has gone off > the rails well before this (see above). > > I'll look at this more on Monday. > > > > <1>BUG: unable to handle kernel NULL pointer dereference at (null) > > <1>IP: [<ffffffffa05b68ac>] rpcrdma_deregister_frmr_external+0x9c/0xe0 > > [xprtrdma] <4>PGD 454554067 PUD 4665b7067 PMD 0 > > <4>Oops: 0000 [#1] SMP > > <4>last sysfs file: > > /sys/devices/pci0000:00/0000:00:03.0/0000:03:00.0/infiniband/ocrdma0/f > > werr > > <4>CPU 6 > > <4>Modules linked in: xprtrdma(U) nfs lockd fscache auth_rpcgss > > nfs_acl ocrdma(U) be2net(U) ip6table_filter ip6_tables ebtable_nat > > ebtables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 > > nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM > > iptable_mangle iptable_filter ip_tables bridge stp llc autofs4 > > des_generic ecb md4 nls_utf8 cifs sunrpc rdma_ucm(U) rdma_cm(U) > > iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) > > ib_umad(U) iw_nes(U) libcrc32c iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) > > cxgb3(U) mdio ib_qib(U) mlx4_en(U) mlx4_ib(U) mlx4_core(U) > ib_mthca(U) > > ib_mad(U) ib_core(U) compat(U) vhost_net macvtap macvlan tun kvm > > uinput power_meter sg microcode i2c_i801 i2c_core iTCO_wdt > > iTCO_vendor_support igb ptp pps_core ioatdma dca i7core_edac > edac_core > > ext3 jbd mbcache sr_mod cdrom sd_mod crc_t10dif usb_storage pata_acpi > > ata_generic ata_piix mptsas 
mptscsih mptbase scsi_transport_sas > > dm_mirror dm_region_hash dm_log dm_mod [last unloaded: be2net] <4> > > <4>Pid: 3597, comm: ls Not tainted 2.6.32-358.el6.x86_64 #1 Cisco > > Systems Inc R210-2121605W/R210-2121605W > > <4>RIP: 0010:[<ffffffffa05b68ac>] [<ffffffffa05b68ac>] > > rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > > <4>RSP: 0018:ffff880465aff9a8 EFLAGS: 00010217 > > <4>RAX: ffff8804673fcc00 RBX: ffff880466578028 RCX: 0000000000000000 > > <4>RDX: ffff880465affa10 RSI: ffff880465aff9a8 RDI: 0000000000000000 > > <4>RBP: ffff880465affa38 R08: 0000000000000000 R09: 0000000000000000 > > <4>R10: 000000000000000f R11: 000000000000000f R12: ffff880454578598 > > <4>R13: ffff880454578000 R14: ffff880466578068 R15: 0000000000000000 > > <4>FS: 00007fe61f3107a0(0000) GS:ffff8800368c0000(0000) > > knlGS:0000000000000000 > > <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > > <4>CR2: 0000000000000000 CR3: 000000046520a000 CR4: 00000000000007e0 > > <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > > <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > > <4>Process ls (pid: 3597, threadinfo ffff880465afe000, task > > ffff8804639a5500) > > <4>Stack: > > <4> 0000000000000000 ffff880462287370 0000000000000000 > > 0000000a00000000 <4><d> 0802dd3b00000002 0000000000000000 > > 0000000000000000 0000000000000000 <4><d> 0000000000000000 > > 0000000000000000 0000000000000000 0000000000000000 <4>Call Trace: > > <4> [<ffffffff8109c97f>] ? up+0x2f/0x50 <4> [<ffffffffa05b6a03>] > > rpcrdma_deregister_external+0x113/0x2d0 [xprtrdma] <4> > > [<ffffffff8150d0d1>] ? printk+0x41/0x48 <4> [<ffffffffa05b44fc>] > > xprt_rdma_free+0x8c/0x210 [xprtrdma] <4> [<ffffffff81082014>] ? > > mod_timer+0x144/0x220 <4> [<ffffffffa05c8a60>] > xprt_release+0xc0/0x220 > > [sunrpc] <4> [<ffffffffa05cff5d>] rpc_release_resources_task+0x1d/0x50 > > [sunrpc] <4> [<ffffffffa05d0a84>] __rpc_execute+0x174/0x350 [sunrpc] > > <4> [<ffffffff8150d0d1>] ? 
printk+0x41/0x48 <4> [<ffffffff81096b47>] ? > > bit_waitqueue+0x17/0xd0 <4> [<ffffffffa05d0cc1>] > rpc_execute+0x61/0xa0 > > [sunrpc] <4> [<ffffffffa05c73a5>] rpc_run_task+0x75/0x90 [sunrpc] <4> > > [<ffffffffa05c74c2>] rpc_call_sync+0x42/0x70 [sunrpc] <4> > > [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 <4> > > [<ffffffffa074e030>] _nfs4_call_sync+0x30/0x40 [nfs] <4> > > [<ffffffffa07461dc>] _nfs4_proc_getattr+0xac/0xc0 [nfs] <4> > > [<ffffffffa07494be>] nfs4_proc_getattr+0x4e/0x70 [nfs] <4> > > [<ffffffffa072f3e3>] __nfs_revalidate_inode+0xe3/0x220 [nfs] <4> > > [<ffffffffa072fdb6>] nfs_getattr+0xb6/0x120 [nfs] <4> > > [<ffffffff81186951>] vfs_getattr+0x51/0x80 <4> [<ffffffff811869e0>] > > vfs_fstatat+0x60/0x80 <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 <4> > > [<ffffffff81186b54>] sys_newstat+0x24/0x50 <4> [<ffffffff810dc817>] ? > > audit_syscall_entry+0x1d7/0x200 <4> [<ffffffff8100b072>] > > system_call_fastpath+0x16/0x1b > > <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff > > ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 > > <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 dd 03 00 > > <1>RIP [<ffffffffa05b68ac>] > > rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4> RSP > > <ffff880465aff9a8> > > <4>CR2: 0000000000000000 > > > >> -----Original Message----- > >> From: Chuck Lever [mailto:chuck.lever@oracle.com] > >> Sent: Friday, April 11, 2014 1:24 AM > >> To: Devesh Sharma > >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > >> Myklebust > >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > >> > >> Hi Devesh- > >> > >> On Apr 10, 2014, at 1:54 PM, Devesh Sharma > <Devesh.Sharma@Emulex.Com> > >> wrote: > >> > >>> Alright here it is: > >>> > >>> <3>ocrdma_mbx_create_qp(0) rq_err > >>> <3>ocrdma_mbx_create_qp(0) sq_err > >>> <3>ocrdma_create_qp(0) error=-1 > >>> <1>BUG: unable to handle kernel NULL pointer dereference at (null) > >>> <1>IP: [<ffffffffa078e8ac>] 
> >>> rpcrdma_deregister_frmr_external+0x9c/0xe0 > >>> [xprtrdma] > >> > >> As near as I can ascertain, the RDMA connection is torn down while an > >> NFS workload is running, and a new connection cannot be completely set > up. > >> > >> Can you try this: > >> > >> 1. On your client, # rpcdebug -m rpc -s call xprt sched trans > >> > >> 2. Reproduce the failure > >> > >> 3. Post the relevant contents of /var/log/messages (like the last RPC > >> request or two before the BUG) > >> > >> And post the relevant line in /proc/mounts corresponding to your test > >> NFS/RDMA mount. > >> > >> > >>> <4>PGD 455942067 PUD 458356067 PMD 0 > >>> <4>Oops: 0000 [#1] SMP > >>> <4>last sysfs file: > >>> /sys/devices/pci0000:80/0000:80:03.0/0000:8b:00.1/class > >>> <4>CPU 1 > >>> <4>Modules linked in: nfs fscache xprtrdma(U) ocrdma(U) fuse > >>> ip6table_filter ip6_tables ebtable_nat ebtables ipt_MASQUERADE > >>> iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state > >>> nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter > >>> ip_tables bridge nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 > >>> sunrpc target_core_iblock target_core_file target_core_pscsi > >>> target_core_mod configfs bnx2fc cnic uio fcoe libfcoe 8021q garp > >>> libfc stp llc rdma_ucm(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) > >>> ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) iw_nes(U) libcrc32c > >>> iw_cxgb4(U) cxgb4(U) iw_cxgb3(U) cxgb3(U) mdio ib_qib(U) > mlx4_en(U) > >>> mlx4_ib(U) mlx4_core(U) ib_mthca(U) ib_mad(U) ib_core(U) compat(U) > >>> vfat fat vhost_net macvtap macvlan tun kvm_intel kvm uinput sg > >>> cdc_ether usbnet mii microcode i2c_i801 i2c_core iTCO_wdt > >>> iTCO_vendor_support shpchp igb ptp pps_core ioatdma dca be2net(U) > >> ext4 > >>> mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif lpfc scsi_transport_fc > >>> scsi_tgt ahci wmi megaraid_sas dm_mirror dm_region_hash dm_log > >> dm_mod > >>> [last unloaded: speedstep_lib] <4> > >>> <4>Pid: 9204, comm: ls Not tainted 
2.6.32-358.el6.x86_64 #1 IBM > >>> System > >>> x3650 M4 -[7915AC1]-/00J6528 > >>> <4>RIP: 0010:[<ffffffffa078e8ac>] [<ffffffffa078e8ac>] > >>> rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] > >>> <4>RSP: 0018:ffff8804551877f8 EFLAGS: 00010217 > >>> <4>RAX: ffff880462243800 RBX: ffff88045646a028 RCX: 0000000000000000 > >>> <4>RDX: ffff880455187860 RSI: ffff8804551877f8 RDI: 0000000000000000 > >>> <4>RBP: ffff880455187888 R08: 0000000000000000 R09: > 0000000000000000 > >>> <4>R10: 0000000000000000 R11: 0000000000000000 R12: ffff88047601c598 > >>> <4>R13: ffff88047601c000 R14: ffff88045646a068 R15: 0000000000000000 > >>> <4>FS: 00007fd669be07a0(0000) GS:ffff880028220000(0000) > >>> knlGS:0000000000000000 > >>> <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > >>> <4>CR2: 0000000000000000 CR3: 00000004557de000 CR4: > 00000000000407e0 > >>> <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > >>> <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: > 0000000000000400 > >>> <4>Process ls (pid: 9204, threadinfo ffff880455186000, task > >>> ffff880456735540) > >>> <4>Stack: > >>> <4> 0000000000000000 ffff88045584a700 0000000000000000 > >>> 0000000a00000000 <4><d> 080424b400000002 0000000000000000 > >>> 0000000000000000 0000000000000000 <4><d> 0000000000000000 > >>> 0000000000000000 0000000000000000 0000000000000000 <4>Call Trace: > >>> <4> [<ffffffffa078ea03>] rpcrdma_deregister_external+0x113/0x2d0 > >>> [xprtrdma] <4> [<ffffffffa078c4fc>] xprt_rdma_free+0x8c/0x210 > >>> [xprtrdma] <4> [<ffffffff81082014>] ? mod_timer+0x144/0x220 <4> > >>> [<ffffffffa07bba60>] xprt_release+0xc0/0x220 [sunrpc] <4> > >>> [<ffffffffa07c2f5d>] rpc_release_resources_task+0x1d/0x50 [sunrpc] > >>> <4> [<ffffffffa07c3a84>] __rpc_execute+0x174/0x350 [sunrpc] <4> > >>> [<ffffffff81096b47>] ? 
bit_waitqueue+0x17/0xd0 <4> > >>> [<ffffffffa07c3cc1>] rpc_execute+0x61/0xa0 [sunrpc] <4> > >>> [<ffffffffa07ba3a5>] rpc_run_task+0x75/0x90 [sunrpc] <4> > >>> [<ffffffffa07ba4c2>] rpc_call_sync+0x42/0x70 [sunrpc] <4> > >>> [<ffffffffa08b6f6d>] nfs3_rpc_wrapper.clone.0+0x3d/0xd0 [nfs] <4> > >>> [<ffffffffa08b734c>] nfs3_proc_access+0xbc/0x180 [nfs] <4> > >>> [<ffffffffa089f1e9>] nfs_do_access+0x199/0x3c0 [nfs] <4> > >>> [<ffffffffa07c6305>] ? generic_lookup_cred+0x15/0x20 [sunrpc] <4> > >>> [<ffffffffa07c52e0>] ? rpcauth_lookupcred+0x70/0xc0 [sunrpc] <4> > >>> [<ffffffffa089f4b8>] nfs_permission+0xa8/0x1e0 [nfs] <4> > >>> [<ffffffff8119053d>] __link_path_walk+0xad/0x1030 <4> > >>> [<ffffffff81143a17>] ? handle_pte_fault+0x487/0xb50 <4> > >>> [<ffffffff8132b1fa>] ? copy_termios+0x6a/0x80 <4> > >>> [<ffffffff8119174a>] > >>> path_walk+0x6a/0xe0 <4> [<ffffffff8119191b>] > >> do_path_lookup+0x5b/0xa0 > >>> <4> [<ffffffff811925a7>] user_path_at+0x57/0xa0 <4> > >>> [<ffffffff81194ed2>] ? vfs_ioctl+0x22/0xa0 <4> [<ffffffff811869bc>] > >>> vfs_fstatat+0x3c/0x80 <4> [<ffffffff81085151>] ? > >>> do_sigaction+0x91/0x1d0 <4> [<ffffffff81186b2b>] vfs_stat+0x1b/0x20 > >>> <4> [<ffffffff81186b54>] sys_newstat+0x24/0x50 <4> > >>> [<ffffffff8151311e>] ? do_page_fault+0x3e/0xa0 <4> > >>> [<ffffffff815104d5>] ? 
page_fault+0x25/0x30 <4> [<ffffffff8100b072>] > >>> system_call_fastpath+0x16/0x1b > >>> <4>Code: 48 89 85 78 ff ff ff 48 8b 40 08 8b 40 1c 89 45 94 b8 ff ff > >>> ff ff f0 41 0f c1 85 e0 05 00 00 49 8b 04 24 48 8d 55 d8 48 8b 78 10 > >>> <48> 8b 07 ff 90 b0 01 00 00 85 c0 89 c3 74 09 80 3d 56 8d 05 00 > >>> <1>RIP [<ffffffffa078e8ac>] > >>> rpcrdma_deregister_frmr_external+0x9c/0xe0 [xprtrdma] <4> RSP > >>> <ffff8804551877f8> > >>> <4>CR2: 0000000000000000 > >>> > >>>> -----Original Message----- > >>>> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > >>>> owner@vger.kernel.org] On Behalf Of Chuck Lever > >>>> Sent: Thursday, April 10, 2014 11:21 PM > >>>> To: Devesh Sharma > >>>> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > >>>> Myklebust > >>>> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > >>>> > >>>> > >>>> On Apr 10, 2014, at 1:42 PM, Devesh Sharma > >> <Devesh.Sharma@Emulex.Com> > >>>> wrote: > >>>> > >>>>>> However it seems to me the new (!ia->ri_id->qp) checks outside > >>>>>> the connect logic are unnecessary. > >>>>>> > >>>>>> Clearly, as you noticed, the ib_post_{send,recv} verbs do not > >>>>>> check that their "qp" argument is NULL before dereferencing it. > >>>>>> > >>>>>> But I don't understand how xprtrdma can post any operation if the > >>>>>> transport isn't connected. In other words, how would it be > >>>>>> possible to call > >>>>>> rpcrdma_ep_post_recv() if the connect had failed and there was no > >> QP? > >>>>>> > >>>>>> If disconnect wipes ia->ri_id->qp while there are still > >>>>>> operations in progress, that would be the real bug. > >>>>> Yes!, But I have seen one more kernel oops where QP is destroyed > >>>>> and xprtrdma still try to post in LOCAL_INV WR on a NULL QP > >>>>> pointer and hence > >>>> system crashes. So, I think what you missioned is really happening. > >>>> > >>>> I'd like to see the crash data (back trace, etc), if you've got it. 
> >>>> > >>>> -- > >>>> Chuck Lever > >>>> chuck[dot]lever[at]oracle[dot]com > >>>> > >>>> > >>>> > >>>> -- > >>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" > >>>> in the body of a message to majordomo@vger.kernel.org More > >> majordomo > >>>> info at http://vger.kernel.org/majordomo-info.html > >> > >> -- > >> Chuck Lever > >> chuck[dot]lever[at]oracle[dot]com > >> > >> > > > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the > body of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 14, 2014, at 6:46 PM, Devesh Sharma <devesh.sharma@emulex.com> wrote: > Hi Chuck > >> -----Original Message----- >> From: Chuck Lever [mailto:chuck.lever@oracle.com] >> Sent: Tuesday, April 15, 2014 2:24 AM >> To: Devesh Sharma >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >> >> Hi Devesh- >> >> >> On Apr 13, 2014, at 12:01 AM, Chuck Lever <chuck.lever@oracle.com> wrote: >> >>> >>> On Apr 11, 2014, at 7:51 PM, Devesh Sharma >> <Devesh.Sharma@Emulex.Com> wrote: >>> >>>> Hi Chuck, >>>> Yes that is the case, Following is the trace I got. >>>> >>>> <4>RPC: 355 setting alarm for 60000 ms >>>> <4>RPC: 355 sync task going to sleep >>>> <4>RPC: xprt_rdma_connect_worker: reconnect >>>> <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 >>>> <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 >>>> <3>ocrdma_mbx_create_qp(0) rq_err >>>> <3>ocrdma_mbx_create_qp(0) sq_err >>>> <3>ocrdma_create_qp(0) error=-1 >>>> <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 >>>> <4>RPC: 355 __rpc_wake_up_task (now 4296956756) >>>> <4>RPC: 355 disabling timer >>>> <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" >>>> <4>RPC: __rpc_wake_up_task done >>>> <4>RPC: xprt_rdma_connect_worker: exit >>>> <4>RPC: 355 sync task resuming >>>> <4>RPC: 355 xprt_connect_status: error 1 connecting to server >> 192.168.1.1 >>> >>> xprtrdma's connect worker is returning "1" instead of a negative errno. >>> That's the bug that triggers this chain of events. >> >> rdma_create_qp() has returned -EPERM. There's very little xprtrdma can do >> if the provider won't even create a QP. That seems like a rare and fatal >> problem. >> >> For the moment, I'm inclined to think that a panic is correct behavior, since >> there are outstanding registered memory regions that cannot be cleaned up >> without a QP (see below). > Well, I think the system should still remain alive. 
Sure, in the long run. I'm not suggesting we leave it this way. > This will definatly cause a memory leak. But QP create failure does not mean system should also crash. It's more than leaked memory. A permanent QP creation failure can leave pages in the page cache registered and pinned, as I understand it. > I think for the time being it is worth to put Null pointer checks to prevent system from crash. Common practice in the Linux kernel is to avoid unnecessary NULL checks. Work-around fixes are typically rejected, and not with a happy face either. Once the connection tear-down code is fixed, it should be clear where NULL checks need to go. >> >>> RPC tasks waiting for the reconnect are awoken. xprt_connect_status() >>> doesn't recognize a tk_status of "1", so it turns it into -EIO, and >>> kills each waiting RPC task. >> >>>> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") >>>> <4>RPC: 355 call_connect_status (status -5) >>>> <4>RPC: 355 return 0, status -5 >>>> <4>RPC: 355 release task >>>> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") >>>> <4>RPC: xprt_rdma_free: called on 0x(null) >>> >>> And as part of exiting, the RPC task has to free its buffer. >>> >>> Not exactly sure why req->rl_nchunks is not zero for an NFSv4 GETATTR. >>> This is why rpcrdma_deregister_external() is invoked here. >>> >>> Eventually this gets around to attempting to post a LOCAL_INV WR with >>> ->qp set to NULL, and the panic below occurs. >> >> This is a somewhat different problem. >> >> Not only do we need to have a good ->qp here, but it has to be connected >> and in the ready-to-send state before LOCAL_INV work requests can be >> posted. >> >> The implication of this is that if a server disconnects (server crash or network >> partition), the client is stuck waiting for the server to come back before it can >> deregister memory and retire outstanding RPC requests. > This is a real problem to solve. In the existing state of xprtrdma code. 
Even a Server reboot will cause > Client to crash. I don't see how that can happen if the HCA/provider manages to create a fresh QP successfully and then rdma_connect() succeeds. A soft timeout or a ^C while the server is rebooting might be a problem. >> >> This is bad for ^C or soft timeouts or umount ... when the server is >> unavailable. >> >> So I feel we need better clean-up when the client cannot reconnect. > Unreg old frmrs with the help of new QP? Until the new QP is created with same PD and FRMR is bound to PD and not to QP. >> Probably deregistering RPC chunk MR's before finally tearing down the old >> QP is what is necessary. > > We need a scheme that handles Memory registrations separately from connection establishment and do book-keeping of which region is Registered and which one is not. > Once the new connection is back. Either start using old mem-regions as it is, or invalidate old and re-register on the new QP. > What is the existing scheme xprtrdma is following? Is it the same? This is what is going on now. Clearly, when managing its own memory resources, the client should never depend on the server ever coming back. The proposal is to deregister _before_ the old QP is torn down, using ib_dereg_mr() in the connect worker process. All RPC requests on that connection should be sleeping waiting for the reconnect to complete. If chunks are created and marshaled during xprt_transmit(), the waiting RPC requests should simply re-register when they are ready to be sent again. > I think it is possible to create FRMR on qp->qp_num = x while invalidate on qp->qp_num = y until qpx.pd == qpy.pd
> -----Original Message----- > From: Chuck Lever [mailto:chuck.lever@oracle.com] > Sent: Tuesday, April 15, 2014 6:10 AM > To: Devesh Sharma > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 14, 2014, at 6:46 PM, Devesh Sharma <devesh.sharma@emulex.com> > wrote: > > > Hi Chuck > > > >> -----Original Message----- > >> From: Chuck Lever [mailto:chuck.lever@oracle.com] > >> Sent: Tuesday, April 15, 2014 2:24 AM > >> To: Devesh Sharma > >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > >> Myklebust > >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > >> > >> Hi Devesh- > >> > >> > >> On Apr 13, 2014, at 12:01 AM, Chuck Lever <chuck.lever@oracle.com> > wrote: > >> > >>> > >>> On Apr 11, 2014, at 7:51 PM, Devesh Sharma > >> <Devesh.Sharma@Emulex.Com> wrote: > >>> > >>>> Hi Chuck, > >>>> Yes that is the case, Following is the trace I got. > >>>> > >>>> <4>RPC: 355 setting alarm for 60000 ms > >>>> <4>RPC: 355 sync task going to sleep > >>>> <4>RPC: xprt_rdma_connect_worker: reconnect > >>>> <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 > >>>> <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 > >>>> <3>ocrdma_mbx_create_qp(0) rq_err > >>>> <3>ocrdma_mbx_create_qp(0) sq_err > >>>> <3>ocrdma_create_qp(0) error=-1 > >>>> <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 > >>>> <4>RPC: 355 __rpc_wake_up_task (now 4296956756) > >>>> <4>RPC: 355 disabling timer > >>>> <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" > >>>> <4>RPC: __rpc_wake_up_task done > >>>> <4>RPC: xprt_rdma_connect_worker: exit > >>>> <4>RPC: 355 sync task resuming > >>>> <4>RPC: 355 xprt_connect_status: error 1 connecting to server > >> 192.168.1.1 > >>> > >>> xprtrdma's connect worker is returning "1" instead of a negative errno. > >>> That's the bug that triggers this chain of events. > >> > >> rdma_create_qp() has returned -EPERM. 
There's very little xprtrdma > >> can do if the provider won't even create a QP. That seems like a rare > >> and fatal problem. > >> > >> For the moment, I'm inclined to think that a panic is correct > >> behavior, since there are outstanding registered memory regions that > >> cannot be cleaned up without a QP (see below). > > Well, I think the system should still remain alive. > > Sure, in the long run. I'm not suggesting we leave it this way. Okay, Agreed. > > > This will definatly cause a memory leak. But QP create failure does not > mean system should also crash. > > It's more than leaked memory. A permanent QP creation failure can leave > pages in the page cache registered and pinned, as I understand it. Yes! true. > > > I think for the time being it is worth to put Null pointer checks to prevent > system from crash. > > Common practice in the Linux kernel is to avoid unnecessary NULL checks. > Work-around fixes are typically rejected, and not with a happy face either. > > Once the connection tear-down code is fixed, it should be clear where NULL > checks need to go. Okay. > > >> > >>> RPC tasks waiting for the reconnect are awoken. > >>> xprt_connect_status() doesn't recognize a tk_status of "1", so it > >>> turns it into -EIO, and kills each waiting RPC task. > >> > >>>> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > >>>> <4>RPC: 355 call_connect_status (status -5) > >>>> <4>RPC: 355 return 0, status -5 > >>>> <4>RPC: 355 release task > >>>> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > >>>> <4>RPC: xprt_rdma_free: called on 0x(null) > >>> > >>> And as part of exiting, the RPC task has to free its buffer. > >>> > >>> Not exactly sure why req->rl_nchunks is not zero for an NFSv4 GETATTR. > >>> This is why rpcrdma_deregister_external() is invoked here. > >>> > >>> Eventually this gets around to attempting to post a LOCAL_INV WR > >>> with > >>> ->qp set to NULL, and the panic below occurs. > >> > >> This is a somewhat different problem. 
> >> > >> Not only do we need to have a good ->qp here, but it has to be > >> connected and in the ready-to-send state before LOCAL_INV work > >> requests can be posted. > >> > >> The implication of this is that if a server disconnects (server crash > >> or network partition), the client is stuck waiting for the server to > >> come back before it can deregister memory and retire outstanding RPC > requests. > > This is a real problem to solve. In the existing state of xprtrdma > > code. Even a Server reboot will cause Client to crash. > > I don't see how that can happen if the HCA/provider manages to create a > fresh QP successfully and then rdma_connect() succeeds. Okay yes, since QP creation will still succeed. > > A soft timeout or a ^C while the server is rebooting might be a problem. > > >> > >> This is bad for ^C or soft timeouts or umount ... when the server is > >> unavailable. > >> > >> So I feel we need better clean-up when the client cannot reconnect. > > Unreg old frmrs with the help of new QP? Until the new QP is created with > same PD and FRMR is bound to PD and not to QP. > >> Probably deregistering RPC chunk MR's before finally tearing down the > >> old QP is what is necessary. > > > > We need a scheme that handles Memory registrations separately from > connection establishment and do book-keeping of which region is Registered > and which one is not. > > Once the new connection is back. Either start using old mem-regions as it is, > or invalidate old and re-register on the new QP. > > What is the existing scheme xprtrdma is following? Is it the same? > > This is what is going on now. Clearly, when managing its own memory > resources, the client should never depend on the server ever coming back. > > The proposal is to deregister _before_ the old QP is torn down, using > ib_dereg_mr() in the connect worker process. All RPC requests on that > connection should be sleeping waiting for the reconnect to complete. 
> > If chunks are created and marshaled during xprt_transmit(), the waiting RPC > requests should simply re-register when they are ready to be sent again. > Ok, I will try to change this and test; I may take a week's time to understand and roll out V3. > > I think it is possible to create FRMR on qp->qp_num = x while > > invalidate on qp->qp_num = y until qpx.pd == qpy.pd > > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chuck Following is the complete call trace of a typical NFS-RDMA transaction while mounting a share. It is unavoidable to stop calling post-send in case the QP is not created. Therefore, applying checks to the connection state is a must while registering/deregistering FRMRs on-the-fly. The unconnected state of QP implies don't call post_send/post_recv from any context. call_start nfs4 proc GETATTR (sync) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_reserve (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 reserved req ffff8804678b8800 xid 53abc98d Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_event_process: event rep ffff88046230f980 status 0 opcode 7 length 48 Apr 23 20:00:34 neo03-el64 kernel: RPC: wake_up_next(ffff880465ae6190 "xprt_sending") Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_reserveresult (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_refresh (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 looking up UNIX cred Apr 23 20:00:34 neo03-el64 kernel: RPC: looking up UNIX cred Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 refreshing UNIX cred ffff880467b2cec0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_refreshresult (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_allocate (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: xprt_rdma_allocate: size 1052 too large for buffer[1024]: prog 100003 vers 4 proc 1 Apr 23 20:00:34 neo03-el64 kernel: RPC: xprt_rdma_allocate: size 1052, request 0xffff8804650e2000 ------------->>>>>> A new buffer is allocated from the Pre-Created Buffer pool, since the pre-created buffer is too small to hold the requested data size <<<<<<<----------------- ------------->>>>>> allocate new, do book-keeping and create phys_mr for the newly allocated buffer.
Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_bind (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_connect xprt ffff880465ae6000 is connected Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_transmit (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 xprt_prepare_transmit Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 xprt_cwnd_limited cong = 0 cwnd = 4096 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 rpc_xdr_encode (status 0) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 marshaling UNIX cred ffff880467b2cec0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 using AUTH_UNIX cred ffff880467b2cec0 to wrap rpc data Apr 23 20:00:34 neo03-el64 kernel: encode_compound: tag= Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 xprt_transmit(120) Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_inline_pullup: pad 0 destp 0xffff8804650e37d8 len 120 hdrlen 120 Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_register_frmr_external: Using frmr ffff88046230ef30 to map 1 segments ---------------->>>>>>>>> This is where post_send is called for FRMR creations. If xprt is not connected, even then the post_send call continues with FRMR creation. --------------->>>>>>>>>> if the QP is connected, call post_send; else fail the reg-call, submit the buffers back to the pools, and start over with call_bind() at RPC layer.
Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_create_chunks: reply chunk elem 592@0x4650e392c:0x805f505 (last) Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_marshal_req: reply chunk: hdrlen 48 rpclen 120 padlen 0 headerp 0xffff8804650e3100 base 0xffff8804650e3760 lkey 0x0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 xmit complete Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 sleep_on(queue "xprt_pending" time 4296808435) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 added to queue ffff880465ae6258 "xprt_pending" Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 setting alarm for 60000 ms Apr 23 20:00:34 neo03-el64 kernel: RPC: wake_up_next(ffff880465ae6190 "xprt_sending") Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 sync task going to sleep Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_event_process: event rep ffff88046230ef30 status 0 opcode 8 length 48 Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_event_process: event rep ffff88046502a000 status 0 opcode 80 length 48 ---------------->>>>>>>>>> If the completion is Flush, Update the QP connection state immediately, don't wait for tasklet to schedule. 
Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_reply_handler: reply 0xffff88046502a000 completes request 0xffff8804650e2000 Apr 23 20:00:34 neo03-el64 kernel: RPC request 0xffff8804678b8800 xid 0x8dc9ab53 Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_count_chunks: chunk 212@0x4650e392c:0x805f505 Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_reply_handler: xprt_complete_rqst(0xffff880465ae6000, 0xffff8804678b8800, 212) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 xid 53abc98d complete (212 bytes received) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 __rpc_wake_up_task (now 4296808436) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 disabling timer Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 removed from queue ffff880465ae6258 "xprt_pending" Apr 23 20:00:34 neo03-el64 kernel: RPC: __rpc_wake_up_task done Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 sync task resuming Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_status (status 212) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_decode (status 212) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 validating UNIX cred ffff880467b2cec0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 using AUTH_UNIX cred ffff880467b2cec0 to unwrap rpc data Apr 23 20:00:34 neo03-el64 kernel: decode_attr_type: type=040000 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_change: change attribute=952326959717679104 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_size: file size=4096 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_fsid: fsid=(0x0/0x0) Apr 23 20:00:34 neo03-el64 kernel: decode_attr_fileid: fileid=2 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_fs_locations: fs_locations done, error = 0 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_mode: file mode=0555 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_nlink: nlink=32 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_owner: uid=0 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_group: gid=0 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_rdev: rdev=(0x0:0x0) Apr 23 20:00:34 neo03-el64 kernel: 
decode_attr_space_used: space used=8192 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_time_access: atime=1398288115 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_time_metadata: ctime=1398290189 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_time_modify: mtime=1398290189 Apr 23 20:00:34 neo03-el64 kernel: decode_attr_mounted_on_fileid: fileid=0 Apr 23 20:00:34 neo03-el64 kernel: decode_getfattr_attrs: xdr returned 0 Apr 23 20:00:34 neo03-el64 kernel: decode_getfattr_generic: xdr returned 0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 call_decode result 0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 return 0, status 0 Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 release task Apr 23 20:00:34 neo03-el64 kernel: RPC: wake_up_next(ffff880465ae6190 "xprt_sending") Apr 23 20:00:34 neo03-el64 kernel: RPC: xprt_rdma_free: called on 0xffff88046502a000 --------->>>>>xprt_rdma_free calls ib_post_send irrespective of QP connection state. Apply check here as-well. ------------->>>>>>>>> xprt_rdma_free internally tries to invalidate FRMRs, If QP is not connected, free-up buffer without invalidation modify the state of frmr.state = INVALID. 
Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 release request ffff8804678b8800 Apr 23 20:00:34 neo03-el64 kernel: RPC: wake_up_next(ffff880465ae6320 "xprt_backlog") Apr 23 20:00:34 neo03-el64 kernel: RPC: rpc_release_client(ffff8804651c1e00) Apr 23 20:00:34 neo03-el64 kernel: RPC: 178 freeing task Apr 23 20:00:34 neo03-el64 kernel: NFS: nfs_fhget(0:21/2 ct=1) Apr 23 20:00:34 neo03-el64 kernel: <-- nfs4_get_root() Apr 23 20:00:34 neo03-el64 kernel: RPC: looking up Generic cred Apr 23 20:00:34 neo03-el64 kernel: RPC: rpcrdma_event_process: event rep ffff88046230ef30 status 0 opcode 7 length 48 ----------->>>>> New Task Initialised<<<<<<<<<--------------- Apr 23 20:00:34 neo03-el64 kernel: RPC: new task initialized, procpid 3491 > -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Devesh Sharma > Sent: Tuesday, April 15, 2014 11:56 PM > To: Chuck Lever > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond Myklebust > Subject: RE: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > > > -----Original Message----- > > From: Chuck Lever [mailto:chuck.lever@oracle.com] > > Sent: Tuesday, April 15, 2014 6:10 AM > > To: Devesh Sharma > > Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > > Myklebust > > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > > > > On Apr 14, 2014, at 6:46 PM, Devesh Sharma > <devesh.sharma@emulex.com> > > wrote: > > > > > Hi Chuck > > > > > >> -----Original Message----- > > >> From: Chuck Lever [mailto:chuck.lever@oracle.com] > > >> Sent: Tuesday, April 15, 2014 2:24 AM > > >> To: Devesh Sharma > > >> Cc: Linux NFS Mailing List; linux-rdma@vger.kernel.org; Trond > > >> Myklebust > > >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > >> > > >> Hi Devesh- > > >> > > >> > > >> On Apr 13, 2014, at 12:01 AM, Chuck Lever <chuck.lever@oracle.com> > > wrote: > > >> > > >>> > > >>> On Apr 11, 2014, at 7:51 
PM, Devesh Sharma > > >> <Devesh.Sharma@Emulex.Com> wrote: > > >>> > > >>>> Hi Chuck, > > >>>> Yes that is the case, Following is the trace I got. > > >>>> > > >>>> <4>RPC: 355 setting alarm for 60000 ms > > >>>> <4>RPC: 355 sync task going to sleep > > >>>> <4>RPC: xprt_rdma_connect_worker: reconnect > > >>>> <4>RPC: rpcrdma_ep_disconnect: rdma_disconnect -1 > > >>>> <4>RPC: rpcrdma_ep_connect: rpcrdma_ep_disconnect status -1 > > >>>> <3>ocrdma_mbx_create_qp(0) rq_err > > >>>> <3>ocrdma_mbx_create_qp(0) sq_err > > >>>> <3>ocrdma_create_qp(0) error=-1 > > >>>> <4>RPC: rpcrdma_ep_connect: rdma_create_qp failed -1 > > >>>> <4>RPC: 355 __rpc_wake_up_task (now 4296956756) > > >>>> <4>RPC: 355 disabling timer > > >>>> <4>RPC: 355 removed from queue ffff880454578258 "xprt_pending" > > >>>> <4>RPC: __rpc_wake_up_task done > > >>>> <4>RPC: xprt_rdma_connect_worker: exit > > >>>> <4>RPC: 355 sync task resuming > > >>>> <4>RPC: 355 xprt_connect_status: error 1 connecting to server > > >> 192.168.1.1 > > >>> > > >>> xprtrdma's connect worker is returning "1" instead of a negative errno. > > >>> That's the bug that triggers this chain of events. > > >> > > >> rdma_create_qp() has returned -EPERM. There's very little xprtrdma > > >> can do if the provider won't even create a QP. That seems like a > > >> rare and fatal problem. > > >> > > >> For the moment, I'm inclined to think that a panic is correct > > >> behavior, since there are outstanding registered memory regions > > >> that cannot be cleaned up without a QP (see below). > > > Well, I think the system should still remain alive. > > > > Sure, in the long run. I'm not suggesting we leave it this way. > Okay, Agreed. > > > > > This will definatly cause a memory leak. But QP create failure does > > > not > > mean system should also crash. > > > > It's more than leaked memory. A permanent QP creation failure can > > leave pages in the page cache registered and pinned, as I understand it. > Yes! true. 
> > > > > I think for the time being it is worth to put Null pointer checks to > > > prevent > > system from crash. > > > > Common practice in the Linux kernel is to avoid unnecessary NULL checks. > > Work-around fixes are typically rejected, and not with a happy face either. > > > > Once the connection tear-down code is fixed, it should be clear where > > NULL checks need to go. > Okay. > > > > >> > > >>> RPC tasks waiting for the reconnect are awoken. > > >>> xprt_connect_status() doesn't recognize a tk_status of "1", so it > > >>> turns it into -EIO, and kills each waiting RPC task. > > >> > > >>>> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > > >>>> <4>RPC: 355 call_connect_status (status -5) > > >>>> <4>RPC: 355 return 0, status -5 > > >>>> <4>RPC: 355 release task > > >>>> <4>RPC: wake_up_next(ffff880454578190 "xprt_sending") > > >>>> <4>RPC: xprt_rdma_free: called on 0x(null) > > >>> > > >>> And as part of exiting, the RPC task has to free its buffer. > > >>> > > >>> Not exactly sure why req->rl_nchunks is not zero for an NFSv4 > GETATTR. > > >>> This is why rpcrdma_deregister_external() is invoked here. > > >>> > > >>> Eventually this gets around to attempting to post a LOCAL_INV WR > > >>> with > > >>> ->qp set to NULL, and the panic below occurs. > > >> > > >> This is a somewhat different problem. > > >> > > >> Not only do we need to have a good ->qp here, but it has to be > > >> connected and in the ready-to-send state before LOCAL_INV work > > >> requests can be posted. > > >> > > >> The implication of this is that if a server disconnects (server > > >> crash or network partition), the client is stuck waiting for the > > >> server to come back before it can deregister memory and retire > > >> outstanding RPC > > requests. > > > This is a real problem to solve. In the existing state of xprtrdma > > > code. Even a Server reboot will cause Client to crash. 
> > > > I don't see how that can happen if the HCA/provider manages to create > > a fresh QP successfully and then rdma_connect() succeeds. > Okay yes, since QP creation will still succeed. > > > > A soft timeout or a ^C while the server is rebooting might be a problem. > > > > >> > > >> This is bad for ^C or soft timeouts or umount ... when the server > > >> is unavailable. > > >> > > >> So I feel we need better clean-up when the client cannot reconnect. > > > Unreg old frmrs with the help of new QP? Until the new QP is created > > > with > > same PD and FRMR is bound to PD and not to QP. > > >> Probably deregistering RPC chunk MR's before finally tearing down > > >> the old QP is what is necessary. > > > > > > We need a scheme that handles Memory registrations separately from > > connection establishment and do book-keeping of which region is > > Registered and which one is not. > > > Once the new connection is back. Either start using old mem-regions > > > as it is, > > or invalidate old and re-register on the new QP. > > > What is the existing scheme xprtrdma is following? Is it the same? > > > > This is what is going on now. Clearly, when managing its own memory > > resources, the client should never depend on the server ever coming back. > > > > The proposal is to deregister _before_ the old QP is torn down, using > > ib_dereg_mr() in the connect worker process. All RPC requests on that > > connection should be sleeping waiting for the reconnect to complete. > > > > If chunks are created and marshaled during xprt_transmit(), the > > waiting RPC requests should simply re-register when they are ready to be > sent again. > > > Ok, I will try to change this and test, I may take a week's time to understand > and rollout V3. 
> > > > I think it is possible to create FRMR on qp->qp_num = x while > > > invalidate on qp->qp_num = y until qpx.pd == qpy.pd > > > > -- > > Chuck Lever > > chuck[dot]lever[at]oracle[dot]com > > > > > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the > body of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/24/2014 2:30 AM, Devesh Sharma wrote: > Hi Chuck > > Following is the complete call trace of a typical NFS-RDMA transaction while mounting a share. > It is unavoidable to stop calling post-send in case it is not created. Therefore, applying checks to the connection state is a must > While registering/deregistering frmrs on-the-fly. The unconnected state of QP implies don't call post_send/post_recv from any context. > Long thread... didn't follow it all. If I understand correctly this race comes only for *cleanup* (LINV) of FRMR registration while teardown flow destroyed the QP. I think this might be disappear if for each registration you post LINV+FRMR. This is assuming that a situation where trying to post Fastreg on a "bad" QP can never happen (usually since teardown flow typically suspends outgoing commands). Sagi. -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 24, 2014, at 3:12 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote: > On 4/24/2014 2:30 AM, Devesh Sharma wrote: >> Hi Chuck >> >> Following is the complete call trace of a typical NFS-RDMA transaction while mounting a share. >> It is unavoidable to stop calling post-send in case it is not created. Therefore, applying checks to the connection state is a must >> While registering/deregistering frmrs on-the-fly. The unconnected state of QP implies don't call post_send/post_recv from any context. >> > > Long thread... didn't follow it all. I think you got the gist of it. > If I understand correctly this race comes only for *cleanup* (LINV) of FRMR registration while teardown flow destroyed the QP. > I think this might be disappear if for each registration you post LINV+FRMR. > This is assuming that a situation where trying to post Fastreg on a "bad" QP can > never happen (usually since teardown flow typically suspends outgoing commands). That’s typically true for “hard” NFS mounts. But “soft” NFS mounts wake RPCs after a timeout while the transport is disconnected, in order to kill them. At that point, deregistration still needs to succeed somehow. IMO there are three related problems. 1. rpcrdma_ep_connect() is allowing RPC tasks to be awoken while there is no QP at all (->qp is NULL). The woken RPC tasks are trying to deregister buffers that may include page cache pages, and it’s oopsing because ->qp is NULL. That’s a logic bug in rpcrdma_ep_connect(), and I have an idea how to address it. 2. If a QP is present but disconnected, posting LOCAL_INV won’t work. That leaves buffers (and page cache pages, potentially) registered. That could be addressed with LINV+FRMR. But... 3. The client should not leave page cache pages registered indefinitely. Both LINV+FRMR and our current approach depends on having a working QP _at_ _some_ _point_ … but the client simply can’t depend on that. 
What happens if an NFS server is, say, destroyed by fire while there are active client mount points? What if the HCA’s firmware is permanently not allowing QP creation? Here's a relevant comment in rpcrdma_ep_connect(): 815 /* TEMP TEMP TEMP - fail if new device: 816 * Deregister/remarshal *all* requests! 817 * Close and recreate adapter, pd, etc! 818 * Re-determine all attributes still sane! 819 * More stuff I haven't thought of! 820 * Rrrgh! 821 */ xprtrdma does not do this today. When a new device is created, all existing RPC requests could be deregistered and re-marshalled. As far as I can tell, rpcrdma_ep_connect() is executing in a synchronous context (the connect worker) and we can simply use dereg_mr, as long as later, when the RPCs are re-driven, they know they need to re-marshal. I’ll try some things today. -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Thanks Chuck for summarizing. One more issue is being added to the list below. > -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Chuck Lever > Sent: Thursday, April 24, 2014 8:31 PM > To: Sagi Grimberg > Cc: Devesh Sharma; Linux NFS Mailing List; linux-rdma@vger.kernel.org; > Trond Myklebust > Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks > > > On Apr 24, 2014, at 3:12 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> > wrote: > > > On 4/24/2014 2:30 AM, Devesh Sharma wrote: > >> Hi Chuck > >> > >> Following is the complete call trace of a typical NFS-RDMA transaction > while mounting a share. > >> It is unavoidable to stop calling post-send in case it is not > >> created. Therefore, applying checks to the connection state is a must > While registering/deregistering frmrs on-the-fly. The unconnected state of > QP implies don't call post_send/post_recv from any context. > >> > > > > Long thread... didn't follow it all. > > I think you got the gist of it. > > > If I understand correctly this race comes only for *cleanup* (LINV) of FRMR > registration while teardown flow destroyed the QP. > > I think this might be disappear if for each registration you post LINV+FRMR. > > This is assuming that a situation where trying to post Fastreg on a > > "bad" QP can never happen (usually since teardown flow typically suspends > outgoing commands). > > That's typically true for "hard" NFS mounts. But "soft" NFS mounts wake > RPCs after a timeout while the transport is disconnected, in order to kill > them. At that point, deregistration still needs to succeed somehow. > > IMO there are three related problems. > > 1. rpcrdma_ep_connect() is allowing RPC tasks to be awoken while > there is no QP at all (->qp is NULL). The woken RPC tasks are > trying to deregister buffers that may include page cache pages, > and it's oopsing because ->qp is NULL. 
> > That's a logic bug in rpcrdma_ep_connect(), and I have an idea > how to address it. > > 2. If a QP is present but disconnected, posting LOCAL_INV won't work. > That leaves buffers (and page cache pages, potentially) registered. > That could be addressed with LINV+FRMR. But... > > 3. The client should not leave page cache pages registered indefinitely. > Both LINV+FRMR and our current approach depends on having a working > QP _at_ _some_ _point_ ... but the client simply can't depend on that. > What happens if an NFS server is, say, destroyed by fire while there > are active client mount points? What if the HCA's firmware is > permanently not allowing QP creation? Addition to the list 4. If rdma traffic is in progress and the network link goes down and comes back up after some time (t > 10 secs ), The rpcrdma_ep_connect() does not destroys the existing QP because rpcrdma_create_id fails (rdma_resolve_addr fails). Now, once the connect worker thread Gets rescheduled again, every time CM fails with establishment error. Finally, after multiple tries CM fails with rdma_cm_event = 15 and entire recovery thread sits silently forever and kernel reports user app is blocked for more than 120 secs. > > Here's a relevant comment in rpcrdma_ep_connect(): > > 815 /* TEMP TEMP TEMP - fail if new device: > 816 * Deregister/remarshal *all* requests! > 817 * Close and recreate adapter, pd, etc! > 818 * Re-determine all attributes still sane! > 819 * More stuff I haven't thought of! > 820 * Rrrgh! > 821 */ > > xprtrdma does not do this today. > > When a new device is created, all existing RPC requests could be > deregistered and re-marshalled. As far as I can tell, > rpcrdma_ep_connect() is executing in a synchronous context (the connect > worker) and we can simply use dereg_mr, as long as later, when the RPCs are > re-driven, they know they need to re-marshal. > > I'll try some things today. 
> > -- > Chuck Lever > chuck[dot]lever[at]oracle[dot]com > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the > body of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 24, 2014, at 11:48 AM, Devesh Sharma <Devesh.Sharma@Emulex.Com> wrote: > Thanks Chuck for summarizing. > One more issue is being added to the list below. > >> -----Original Message----- >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- >> owner@vger.kernel.org] On Behalf Of Chuck Lever >> Sent: Thursday, April 24, 2014 8:31 PM >> To: Sagi Grimberg >> Cc: Devesh Sharma; Linux NFS Mailing List; linux-rdma@vger.kernel.org; >> Trond Myklebust >> Subject: Re: [PATCH V1] NFS-RDMA: fix qp pointer validation checks >> >> >> On Apr 24, 2014, at 3:12 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> >> wrote: >> >>> On 4/24/2014 2:30 AM, Devesh Sharma wrote: >>>> Hi Chuck >>>> >>>> Following is the complete call trace of a typical NFS-RDMA transaction >> while mounting a share. >>>> It is unavoidable to stop calling post-send in case it is not >>>> created. Therefore, applying checks to the connection state is a must >> While registering/deregistering frmrs on-the-fly. The unconnected state of >> QP implies don't call post_send/post_recv from any context. >>>> >>> >>> Long thread... didn't follow it all. >> >> I think you got the gist of it. >> >>> If I understand correctly this race comes only for *cleanup* (LINV) of FRMR >> registration while teardown flow destroyed the QP. >>> I think this might be disappear if for each registration you post LINV+FRMR. >>> This is assuming that a situation where trying to post Fastreg on a >>> "bad" QP can never happen (usually since teardown flow typically suspends >> outgoing commands). >> >> That's typically true for "hard" NFS mounts. But "soft" NFS mounts wake >> RPCs after a timeout while the transport is disconnected, in order to kill >> them. At that point, deregistration still needs to succeed somehow. >> >> IMO there are three related problems. >> >> 1. rpcrdma_ep_connect() is allowing RPC tasks to be awoken while >> there is no QP at all (->qp is NULL). 
The woken RPC tasks are >> trying to deregister buffers that may include page cache pages, >> and it's oopsing because ->qp is NULL. >> >> That's a logic bug in rpcrdma_ep_connect(), and I have an idea >> how to address it. >> >> 2. If a QP is present but disconnected, posting LOCAL_INV won't work. >> That leaves buffers (and page cache pages, potentially) registered. >> That could be addressed with LINV+FRMR. But... >> >> 3. The client should not leave page cache pages registered indefinitely. >> Both LINV+FRMR and our current approach depends on having a working >> QP _at_ _some_ _point_ ... but the client simply can't depend on that. >> What happens if an NFS server is, say, destroyed by fire while there >> are active client mount points? What if the HCA's firmware is >> permanently not allowing QP creation? > Addition to the list > 4. If rdma traffic is in progress and the network link goes down and comes back up after some time (t > 10 secs ), > The rpcrdma_ep_connect() does not destroys the existing QP because rpcrdma_create_id fails (rdma_resolve_addr fails). > Now, once the connect worker thread Gets rescheduled again, every time CM fails with establishment error. Finally, after multiple tries > CM fails with rdma_cm_event = 15 and entire recovery thread sits silently forever and kernel reports user app is blocked for more than 120 secs. I think I see that now. I should be able to address it with the fixes for 1. -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/24/2014 6:01 PM, Chuck Lever wrote: > On Apr 24, 2014, at 3:12 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote: > >> On 4/24/2014 2:30 AM, Devesh Sharma wrote: >>> Hi Chuck >>> >>> Following is the complete call trace of a typical NFS-RDMA transaction while mounting a share. >>> It is unavoidable to stop calling post-send in case it is not created. Therefore, applying checks to the connection state is a must >>> While registering/deregistering frmrs on-the-fly. The unconnected state of QP implies don't call post_send/post_recv from any context. >>> >> Long thread... didn't follow it all. > I think you got the gist of it. > >> If I understand correctly this race comes only for *cleanup* (LINV) of FRMR registration while teardown flow destroyed the QP. >> I think this might be disappear if for each registration you post LINV+FRMR. >> This is assuming that a situation where trying to post Fastreg on a "bad" QP can >> never happen (usually since teardown flow typically suspends outgoing commands). > That’s typically true for “hard” NFS mounts. But “soft” NFS mounts > wake RPCs after a timeout while the transport is disconnected, in > order to kill them. At that point, deregistration still needs to > succeed somehow. Not sure I understand, Can you please explain why deregistration will not succeed? AFAIK You are allowed to register FRMR and then deregister it without having to invalidate it. Can you please explain why you logically connected LINV with deregistration? > > IMO there are three related problems. > > 1. rpcrdma_ep_connect() is allowing RPC tasks to be awoken while > there is no QP at all (->qp is NULL). The woken RPC tasks are > trying to deregister buffers that may include page cache pages, > and it’s oopsing because ->qp is NULL. > > That’s a logic bug in rpcrdma_ep_connect(), and I have an idea > how to address it. Why not first create a new id+qp and assign them - and then destroy the old id+qp? 
see SRP related section: ib_srp.x:srp_create_target_ib() Anyway it is indeed important to guarantee that no xmit flows happens concurrently to that, and cleanups are processed synchronously and in-order. > > 2. If a QP is present but disconnected, posting LOCAL_INV won’t work. > That leaves buffers (and page cache pages, potentially) registered. > That could be addressed with LINV+FRMR. But... > > 3. The client should not leave page cache pages registered indefinitely. > Both LINV+FRMR and our current approach depends on having a working > QP _at_ _some_ _point_ … but the client simply can’t depend on that. > What happens if an NFS server is, say, destroyed by fire while there > are active client mount points? What if the HCA’s firmware is > permanently not allowing QP creation? Again, I don't understand why you can't dereg_mr(). How about allocating the FRMR pool *after* the QP was successfully created/connected (makes sense as the FRMRs are not usable until then), and destroy/cleanup the pool before the QP is disconnected/destroyed. it also makes sense as they must match PDs. > > Here's a relevant comment in rpcrdma_ep_connect(): > > 815 /* TEMP TEMP TEMP - fail if new device: > 816 * Deregister/remarshal *all* requests! > 817 * Close and recreate adapter, pd, etc! > 818 * Re-determine all attributes still sane! > 819 * More stuff I haven't thought of! > 820 * Rrrgh! > 821 */ > > xprtrdma does not do this today. > > When a new device is created, all existing RPC requests could be > deregistered and re-marshalled. As far as I can tell, > rpcrdma_ep_connect() is executing in a synchronous context (the connect > worker) and we can simply use dereg_mr, as long as later, when the RPCs > are re-driven, they know they need to re-marshal. Agree. Sagi. -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 27, 2014, at 6:12 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote: > On 4/24/2014 6:01 PM, Chuck Lever wrote: >> On Apr 24, 2014, at 3:12 AM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote: >> >>> On 4/24/2014 2:30 AM, Devesh Sharma wrote: >>>> Hi Chuck >>>> >>>> Following is the complete call trace of a typical NFS-RDMA transaction while mounting a share. >>>> It is unavoidable to stop calling post-send in case it is not created. Therefore, applying checks to the connection state is a must >>>> While registering/deregistering frmrs on-the-fly. The unconnected state of QP implies don't call post_send/post_recv from any context. >>>> >>> Long thread... didn't follow it all. >> I think you got the gist of it. >> >>> If I understand correctly this race comes only for *cleanup* (LINV) of FRMR registration while teardown flow destroyed the QP. >>> I think this might be disappear if for each registration you post LINV+FRMR. >>> This is assuming that a situation where trying to post Fastreg on a "bad" QP can >>> never happen (usually since teardown flow typically suspends outgoing commands). >> That’s typically true for “hard” NFS mounts. But “soft” NFS mounts >> wake RPCs after a timeout while the transport is disconnected, in >> order to kill them. At that point, deregistration still needs to >> succeed somehow. > > Not sure I understand, Can you please explain why deregistration will not succeed? > AFAIK You are allowed to register FRMR and then deregister it without having to invalidate it. > > Can you please explain why you logically connected LINV with deregistration? Confusion. Sorry. > >> >> IMO there are three related problems. >> >> 1. rpcrdma_ep_connect() is allowing RPC tasks to be awoken while >> there is no QP at all (->qp is NULL). The woken RPC tasks are >> trying to deregister buffers that may include page cache pages, >> and it’s oopsing because ->qp is NULL. 
>> >> That’s a logic bug in rpcrdma_ep_connect(), and I have an idea >> how to address it. > > Why not first create a new id+qp and assign them - and then destroy the old id+qp? > see SRP related section: ib_srp.x:srp_create_target_ib() > > Anyway it is indeed important to guarantee that no xmit flows happens concurrently to that, > and cleanups are processed synchronously and in-order. I posted a patch on Friday that should provide that serialization. > >> >> 2. If a QP is present but disconnected, posting LOCAL_INV won’t work. >> That leaves buffers (and page cache pages, potentially) registered. >> That could be addressed with LINV+FRMR. But... >> >> 3. The client should not leave page cache pages registered indefinitely. >> Both LINV+FRMR and our current approach depends on having a working >> QP _at_ _some_ _point_ … but the client simply can’t depend on that. >> What happens if an NFS server is, say, destroyed by fire while there >> are active client mount points? What if the HCA’s firmware is >> permanently not allowing QP creation? > > Again, I don't understand why you can't dereg_mr(). > > How about allocating the FRMR pool *after* the QP was successfully created/connected (makes sense as the FRMRs are > not usable until then), and destroy/cleanup the pool before the QP is disconnected/destroyed. it also makes sense as they > must match PDs. It’s not about deregistration, but rather about invalidation, I was confused. xprt_rdma_free() invalidates and then frees the chunks on RPC chunk lists. We just need to see that those invalidations can be successful while the transport is disconnected. I understand that even in the error state, a QP should allow consumers to post send WRs to invalidate FRMRs…? The other case is whether the consumer can _replace_ a QP with a fresh one, and still have invalidations succeed, even if the transport remains disconnected, once waiting RPCs are awoken. 
An alternative would be to invalidate all waiting RPC chunk lists on a transport as soon as the QP goes to error state but before it is destroyed, and fastreg the chunks again when waiting RPCs are remarshalled. I think the goals are: 1. Avoid fastreg on an FRMR that is already valid 2. Avoid leaving FRMRs valid indefinitely (preferably just long enough to execute the RPC request, and no longer) > >> >> Here's a relevant comment in rpcrdma_ep_connect(): >> >> 815 /* TEMP TEMP TEMP - fail if new device: >> 816 * Deregister/remarshal *all* requests! >> 817 * Close and recreate adapter, pd, etc! >> 818 * Re-determine all attributes still sane! >> 819 * More stuff I haven't thought of! >> 820 * Rrrgh! >> 821 */ >> >> xprtrdma does not do this today. >> >> When a new device is created, all existing RPC requests could be >> deregistered and re-marshalled. As far as I can tell, >> rpcrdma_ep_connect() is executing in a synchronous context (the connect >> worker) and we can simply use dereg_mr, as long as later, when the RPCs >> are re-driven, they know they need to re-marshal. > > Agree. > > Sagi. > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/27/2014 3:37 PM, Chuck Lever wrote: <SNIP> >> Why not first create a new id+qp and assign them - and then destroy the old id+qp? >> see SRP related section: ib_srp.x:srp_create_target_ib() >> >> Anyway it is indeed important to guarantee that no xmit flows happens concurrently to that, >> and cleanups are processed synchronously and in-order. > I posted a patch on Friday that should provide that serialization. > >>> 2. If a QP is present but disconnected, posting LOCAL_INV won’t work. >>> That leaves buffers (and page cache pages, potentially) registered. >>> That could be addressed with LINV+FRMR. But... >>> >>> 3. The client should not leave page cache pages registered indefinitely. >>> Both LINV+FRMR and our current approach depends on having a working >>> QP _at_ _some_ _point_ … but the client simply can’t depend on that. >>> What happens if an NFS server is, say, destroyed by fire while there >>> are active client mount points? What if the HCA’s firmware is >>> permanently not allowing QP creation? >> Again, I don't understand why you can't dereg_mr(). >> >> How about allocating the FRMR pool *after* the QP was successfully created/connected (makes sense as the FRMRs are >> not usable until then), and destroy/cleanup the pool before the QP is disconnected/destroyed. it also makes sense as they >> must match PDs. > It’s not about deregistration, but rather about invalidation, I was > confused. OK got it. > xprt_rdma_free() invalidates and then frees the chunks on RPC chunk > lists. We just need to see that those invalidations can be successful > while the transport is disconnected. They can't be completed though. Can't you just skip invalidation? will be done when FRMR is reused. Sorry to be difficult, but I still don't understand why invalidation must occur in this case. > I understand that even in the error state, a QP should allow consumers > to post send WRs to invalidate FRMRs…? Its allowed, they won't execute though (I'll be surprised if they will). 
AFAIK posting on a QP in error state has only one use-case - post a colored WQE to know that FLUSH errors has ended. > > The other case is whether the consumer can _replace_ a QP with a fresh > one, and still have invalidations succeed, even if the transport remains > disconnected, once waiting RPCs are awoken. Which invalidations succeeded and which didn't are known - so I don't see the problem here. The only thing is the corner case that Steve indicated wrt flush errors, but if I recall correctly posting LINV on an invalidated MR is allowed. > > An alternative would be to invalidate all waiting RPC chunk lists on a > transport as soon as the QP goes to error state but before it is > destroyed, and fastreg the chunks again when waiting RPCs are > remarshalled. > > I think the goals are: > > 1. Avoid fastreg on an FRMR that is already valid > > 2. Avoid leaving FRMRs valid indefinitely (preferably just long enough > to execute the RPC request, and no longer) (1) is a non-issue for INV+FASTREG. Can you please explain your concern on (2)? is it security (server can keep doing RDMA)? because you have remote invalidate for that (server can implement SEND+INVALIDATE). Having said that, I probably don't see the full picture here like you guys so I might be missing some things. Sagi. -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9372656..902ac78 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -831,10 +831,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc && rc != -ENOTCONN) - dprintk("RPC: %s: rpcrdma_ep_disconnect" + if (ia->ri_id->qp) { + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc && rc != -ENOTCONN) + dprintk("RPC: %s: rpcrdma_ep_disconnect" " status %i\n", __func__, rc); + } rpcrdma_clean_cq(ep->rep_cq); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); @@ -859,7 +861,9 @@ retry: goto out; } /* END TEMP */ - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id->qp) { + rdma_destroy_qp(ia->ri_id); + } rdma_destroy_id(ia->ri_id); ia->ri_id = id; } @@ -1557,6 +1561,13 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + if (!ia->ri_id->qp) { + rc = -EINVAL; + while (i--) + rpcrdma_unmap_one(ia, --seg); + goto out; + } + rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); if (rc) { @@ -1571,6 +1582,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, seg1->mr_len = len; } *nsegs = i; +out: return rc; } @@ -1592,6 +1604,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + if (!ia->ri_id->qp) + return -EINVAL; + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); if (rc) dprintk("RPC: %s: failed ib_post_send for invalidate," @@ -1923,6 +1938,9 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr.send_flags = IB_SEND_SIGNALED; } + if (!ia->ri_id->qp) + return -EINVAL; + rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); if (rc) dprintk("RPC: %s: ib_post_send returned %i\n", __func__, @@ -1951,6 +1969,9 @@
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); DECR_CQCOUNT(ep); + + if (!ia->ri_id->qp) + return -EINVAL; rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); if (rc)
If the rdma_create_qp fails to create qp due to device firmware being in invalid state xprtrdma still tries to destroy the non-existent qp and ends up in a NULL pointer dereference crash. Adding proper checks for validating the QP pointer prevents this from happening. Signed-off-by: Devesh Sharma <devesh.sharma@emulex.com> --- net/sunrpc/xprtrdma/verbs.c | 29 +++++++++++++++++++++++++---- 1 files changed, 25 insertions(+), 4 deletions(-)