Message ID | 1427971145-26943-4-git-send-email-ogerlitz@mellanox.com (mailing list archive) |
---|---|
State | Rejected |
Headers | show |
> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz@mellanox.com> wrote: > > From: Erez Shitrit <erezsh@mellanox.com> > > As the result of a completion error the QP can moved to SQE state by > the hardware. Since it's not the Error state, there are no flushes > and hence the driver doesn't know about that. > > The fix creates a task that after completion with error which is not a > flush tracks the QP state and if it is in SQE state moves it back to RTS. I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6? Otherwise, this is a “never should happen” situation, yes? > Signed-off-by: Erez Shitrit <erezsh@mellanox.com> > Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> > --- > drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ > drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 ++++++++++++++++++++++++++++++- > 2 files changed, 63 insertions(+), 1 deletions(-) > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h > index 769044c..2703d9a 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib.h > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h > @@ -299,6 +299,11 @@ struct ipoib_neigh_table { > struct completion deleted; > }; > > +struct ipoib_qp_state_validate { > + struct work_struct work; > + struct ipoib_dev_priv *priv; > +}; Why did you choose to do an allocated work struct? All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm. In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static. Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated? And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp? > /* > * Device private locking: network stack tx_lock protects members used > * in TX fast path, lock protects everything else. lock nests inside > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c > index 29b376d..63b92cb 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c > @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, > } > } > > +/* > + * As the result of a completion error the QP Can be transferred to SQE states. > + * The function checks if the (send)QP is in SQE state and > + * moves it back to RTS state, that in order to have it functional again. > + */ > +static void ipoib_qp_state_validate_work(struct work_struct *work) > +{ > + struct ipoib_qp_state_validate *qp_work = > + container_of(work, struct ipoib_qp_state_validate, work); > + > + struct ipoib_dev_priv *priv = qp_work->priv; > + struct ib_qp_attr qp_attr; > + struct ib_qp_init_attr query_init_attr; > + int ret; > + > + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); > + if (ret) { > + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", > + __func__, ret); > + goto free_res; > + } > + pr_info("%s: QP: 0x%x is in state: %d\n", > + __func__, priv->qp->qp_num, qp_attr.qp_state); > + > + /* currently support only in SQE->RTS transition*/ > + if (qp_attr.qp_state == IB_QPS_SQE) { > + qp_attr.qp_state = IB_QPS_RTS; > + > + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); > + if (ret) { > + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", > + ret, priv->qp->qp_num); > + goto free_res; > + } > + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", > + __func__, priv->qp->qp_num); > + } else { > + pr_warn("QP (%d) will stay in state: %d\n", > + priv->qp->qp_num, qp_attr.qp_state); > + } > + > +free_res: > + kfree(qp_work); > +} > + > static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) > { > struct ipoib_dev_priv *priv = netdev_priv(dev); > @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) > netif_wake_queue(dev); > > if (wc->status != IB_WC_SUCCESS && > - wc->status != IB_WC_WR_FLUSH_ERR) > + wc->status != IB_WC_WR_FLUSH_ERR) { > + struct ipoib_qp_state_validate *qp_work; > ipoib_warn(priv, "failed send event " > "(status=%d, wrid=%d vend_err %x)\n", > wc->status, wr_id, wc->vendor_err); > + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); > + if (!qp_work) { > + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", > + __func__, priv->qp->qp_num); > + return; > + } > + > + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); > + qp_work->priv = priv; > + queue_work(priv->wq, &qp_work->work); > + } > } > > static int poll_tx(struct ipoib_dev_priv *priv) > -- > 1.7.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html — Doug Ledford <dledford@redhat.com> GPG Key ID: 0E572FDD
Hi Doug (now with "reply to all" :) ) On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford@redhat.com> wrote: > >> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz@mellanox.com> wrote: >> >> From: Erez Shitrit <erezsh@mellanox.com> >> >> As the result of a completion error the QP can moved to SQE state by >> the hardware. Since it's not the Error state, there are no flushes >> and hence the driver doesn't know about that. >> >> The fix creates a task that after completion with error which is not a >> flush tracks the QP state and if it is in SQE state moves it back to RTS. > > I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6? Otherwise, this is a “never should happen” situation, yes? > Not only. it happened in the case that described in patch#6 and also in some other errors that caused the FW/HW to decide move the UD QO to SQE state (in the category of "bad thing happened.." like damaged WQE, FW problem, etc.) The patch originally was written to solve such cases which we really saw, at testing, customers etc. and not for patch#6 >> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> >> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> >> --- >> drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ >> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 ++++++++++++++++++++++++++++++- >> 2 files changed, 63 insertions(+), 1 deletions(-) >> >> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h >> index 769044c..2703d9a 100644 >> --- a/drivers/infiniband/ulp/ipoib/ipoib.h >> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h >> @@ -299,6 +299,11 @@ struct ipoib_neigh_table { >> struct completion deleted; >> }; >> >> +struct ipoib_qp_state_validate { >> + struct work_struct work; >> + struct ipoib_dev_priv *priv; >> +}; > > Why did you choose to do an allocated work struct? All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm. In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static. Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated? And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp? > It looks (at least for me) more elegant to create "on-the-fly" work create/queue and delete at the end (BTW, I saw it many times, other than ipoib). There is no need for that in the CM mode, there the connection and the QP are destroyed, and new one will be created, only in UD mode the QP is forever. >> /* >> * Device private locking: network stack tx_lock protects members used >> * in TX fast path, lock protects everything else. lock nests inside >> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> index 29b376d..63b92cb 100644 >> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, >> } >> } >> >> +/* >> + * As the result of a completion error the QP Can be transferred to SQE states. >> + * The function checks if the (send)QP is in SQE state and >> + * moves it back to RTS state, that in order to have it functional again. >> + */ >> +static void ipoib_qp_state_validate_work(struct work_struct *work) >> +{ >> + struct ipoib_qp_state_validate *qp_work = >> + container_of(work, struct ipoib_qp_state_validate, work); >> + >> + struct ipoib_dev_priv *priv = qp_work->priv; >> + struct ib_qp_attr qp_attr; >> + struct ib_qp_init_attr query_init_attr; >> + int ret; >> + >> + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); >> + if (ret) { >> + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", >> + __func__, ret); >> + goto free_res; >> + } >> + pr_info("%s: QP: 0x%x is in state: %d\n", >> + __func__, priv->qp->qp_num, qp_attr.qp_state); >> + >> + /* currently support only in SQE->RTS transition*/ >> + if (qp_attr.qp_state == IB_QPS_SQE) { >> + qp_attr.qp_state = IB_QPS_RTS; >> + >> + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); >> + if (ret) { >> + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", >> + ret, priv->qp->qp_num); >> + goto free_res; >> + } >> + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", >> + __func__, priv->qp->qp_num); >> + } else { >> + pr_warn("QP (%d) will stay in state: %d\n", >> + priv->qp->qp_num, qp_attr.qp_state); >> + } >> + >> +free_res: >> + kfree(qp_work); >> +} >> + >> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >> { >> struct ipoib_dev_priv *priv = netdev_priv(dev); >> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >> netif_wake_queue(dev); >> >> if (wc->status != IB_WC_SUCCESS && >> - wc->status != IB_WC_WR_FLUSH_ERR) >> + wc->status != IB_WC_WR_FLUSH_ERR) { >> + struct ipoib_qp_state_validate *qp_work; >> ipoib_warn(priv, "failed send event " >> "(status=%d, wrid=%d vend_err %x)\n", >> wc->status, wr_id, wc->vendor_err); >> + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); >> + if (!qp_work) { >> + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", >> + __func__, priv->qp->qp_num); >> + return; >> + } >> + >> + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); >> + qp_work->priv = priv; >> + queue_work(priv->wq, &qp_work->work); >> + } >> } >> >> static int poll_tx(struct ipoib_dev_priv *priv) >> -- >> 1.7.1 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > — > Doug Ledford <dledford@redhat.com> > GPG Key ID: 0E572FDD > > > > > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > owner@vger.kernel.org] On Behalf Of Or Gerlitz > Sent: Thursday, April 02, 2015 4:09 PM > To: Roland Dreier; Doug Ledford > Cc: linux-rdma@vger.kernel.org; Erez Shitrit; Tal Alon; Amir Vadai; Or Gerlitz > Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state > > From: Erez Shitrit <erezsh@mellanox.com> > > As the result of a completion error the QP can moved to SQE state by the > hardware. Since it's not the Error state, there are no flushes and hence the > driver doesn't know about that. As per spec, QP transition to SQE causes flush completion for subsequent WQEs, the description is telling other way. Am I missing something? > > The fix creates a task that after completion with error which is not a flush > tracks the QP state and if it is in SQE state moves it back to RTS. > > Signed-off-by: Erez Shitrit <erezsh@mellanox.com> > Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> > --- > drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ > drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 > ++++++++++++++++++++++++++++++- > 2 files changed, 63 insertions(+), 1 deletions(-) > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h > b/drivers/infiniband/ulp/ipoib/ipoib.h > index 769044c..2703d9a 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib.h > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h > @@ -299,6 +299,11 @@ struct ipoib_neigh_table { > struct completion deleted; > }; > > +struct ipoib_qp_state_validate { > + struct work_struct work; > + struct ipoib_dev_priv *priv; > +}; > + > /* > * Device private locking: network stack tx_lock protects members used > * in TX fast path, lock protects everything else. lock nests inside diff --git > a/drivers/infiniband/ulp/ipoib/ipoib_ib.c > b/drivers/infiniband/ulp/ipoib/ipoib_ib.c > index 29b376d..63b92cb 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c > @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, > } > } > > +/* > + * As the result of a completion error the QP Can be transferred to SQE states. > + * The function checks if the (send)QP is in SQE state and > + * moves it back to RTS state, that in order to have it functional again. > + */ > +static void ipoib_qp_state_validate_work(struct work_struct *work) { > + struct ipoib_qp_state_validate *qp_work = > + container_of(work, struct ipoib_qp_state_validate, work); > + > + struct ipoib_dev_priv *priv = qp_work->priv; > + struct ib_qp_attr qp_attr; > + struct ib_qp_init_attr query_init_attr; > + int ret; > + > + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); > + if (ret) { > + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", > + __func__, ret); > + goto free_res; > + } > + pr_info("%s: QP: 0x%x is in state: %d\n", > + __func__, priv->qp->qp_num, qp_attr.qp_state); > + > + /* currently support only in SQE->RTS transition*/ > + if (qp_attr.qp_state == IB_QPS_SQE) { > + qp_attr.qp_state = IB_QPS_RTS; > + > + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); > + if (ret) { > + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", > + ret, priv->qp->qp_num); > + goto free_res; > + } > + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to > IB_QPS_RTS\n", > + __func__, priv->qp->qp_num); > + } else { > + pr_warn("QP (%d) will stay in state: %d\n", > + priv->qp->qp_num, qp_attr.qp_state); > + } > + > +free_res: > + kfree(qp_work); > +} > + > static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { > struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22 > @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc > *wc) > netif_wake_queue(dev); > > if (wc->status != IB_WC_SUCCESS && > - wc->status != IB_WC_WR_FLUSH_ERR) > + wc->status != IB_WC_WR_FLUSH_ERR) { > + struct ipoib_qp_state_validate *qp_work; > ipoib_warn(priv, "failed send event " > "(status=%d, wrid=%d vend_err %x)\n", > wc->status, wr_id, wc->vendor_err); > + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); > + if (!qp_work) { > + ipoib_warn(priv, "%s Failed alloc > ipoib_qp_state_validate for qp: 0x%x\n", > + __func__, priv->qp->qp_num); > + return; > + } > + > + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); > + qp_work->priv = priv; > + queue_work(priv->wq, &qp_work->work); > + } > } > > static int poll_tx(struct ipoib_dev_priv *priv) > -- > 1.7.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body > of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/15/2015 8:20 PM, Devesh Sharma wrote: >> -----Original Message----- >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- >> owner@vger.kernel.org] On Behalf Of Or Gerlitz >> Sent: Thursday, April 02, 2015 4:09 PM >> To: Roland Dreier; Doug Ledford >> Cc: linux-rdma@vger.kernel.org; Erez Shitrit; Tal Alon; Amir Vadai; Or Gerlitz >> Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state >> >> From: Erez Shitrit <erezsh@mellanox.com> >> >> As the result of a completion error the QP can moved to SQE state by the >> hardware. Since it's not the Error state, there are no flushes and hence the >> driver doesn't know about that. > As per spec, QP transition to SQE causes flush completion for subsequent WQEs, the description is telling other way. Am I missing something? No you are not :) . the description tries to say the following: the driver cannot distinguish between IB_WC_WR_FLUSH_ERR that threated as IB_WC_SUCCESS to IB_WC_WR_FLUSH_ERR that comes after other errors that take the QP to SQE, The driver must recognize the first error that is not IB_WC_WR_FLUSH_ERR and handle accordingly. For example, the driver can take the QP to ERROR state as a part of its life cycle (drain it, driver down, etc.) and at these situations many IB_WC_WR_FLUSH_ERR return and no need to change the state of the QP, it is already under the handling of the driver, which is not when other error comes. this is the intention of that patch, to return the QP back to life after that (un-handed) cases. >> The fix creates a task that after completion with error which is not a flush >> tracks the QP state and if it is in SQE state moves it back to RTS. >> >> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> >> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> >> --- >> drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ >> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 >> ++++++++++++++++++++++++++++++- >> 2 files changed, 63 insertions(+), 1 deletions(-) >> >> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h >> b/drivers/infiniband/ulp/ipoib/ipoib.h >> index 769044c..2703d9a 100644 >> --- a/drivers/infiniband/ulp/ipoib/ipoib.h >> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h >> @@ -299,6 +299,11 @@ struct ipoib_neigh_table { >> struct completion deleted; >> }; >> >> +struct ipoib_qp_state_validate { >> + struct work_struct work; >> + struct ipoib_dev_priv *priv; >> +}; >> + >> /* >> * Device private locking: network stack tx_lock protects members used >> * in TX fast path, lock protects everything else. lock nests inside diff --git >> a/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> index 29b376d..63b92cb 100644 >> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, >> } >> } >> >> +/* >> + * As the result of a completion error the QP Can be transferred to SQE states. >> + * The function checks if the (send)QP is in SQE state and >> + * moves it back to RTS state, that in order to have it functional again. >> + */ >> +static void ipoib_qp_state_validate_work(struct work_struct *work) { >> + struct ipoib_qp_state_validate *qp_work = >> + container_of(work, struct ipoib_qp_state_validate, work); >> + >> + struct ipoib_dev_priv *priv = qp_work->priv; >> + struct ib_qp_attr qp_attr; >> + struct ib_qp_init_attr query_init_attr; >> + int ret; >> + >> + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); >> + if (ret) { >> + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", >> + __func__, ret); >> + goto free_res; >> + } >> + pr_info("%s: QP: 0x%x is in state: %d\n", >> + __func__, priv->qp->qp_num, qp_attr.qp_state); >> + >> + /* currently support only in SQE->RTS transition*/ >> + if (qp_attr.qp_state == IB_QPS_SQE) { >> + qp_attr.qp_state = IB_QPS_RTS; >> + >> + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); >> + if (ret) { >> + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", >> + ret, priv->qp->qp_num); >> + goto free_res; >> + } >> + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to >> IB_QPS_RTS\n", >> + __func__, priv->qp->qp_num); >> + } else { >> + pr_warn("QP (%d) will stay in state: %d\n", >> + priv->qp->qp_num, qp_attr.qp_state); >> + } >> + >> +free_res: >> + kfree(qp_work); >> +} >> + >> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { >> struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22 >> @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc >> *wc) >> netif_wake_queue(dev); >> >> if (wc->status != IB_WC_SUCCESS && >> - wc->status != IB_WC_WR_FLUSH_ERR) >> + wc->status != IB_WC_WR_FLUSH_ERR) { >> + struct ipoib_qp_state_validate *qp_work; >> ipoib_warn(priv, "failed send event " >> "(status=%d, wrid=%d vend_err %x)\n", >> wc->status, wr_id, wc->vendor_err); >> + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); >> + if (!qp_work) { >> + ipoib_warn(priv, "%s Failed alloc >> ipoib_qp_state_validate for qp: 0x%x\n", >> + __func__, priv->qp->qp_num); >> + return; >> + } >> + >> + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); >> + qp_work->priv = priv; >> + queue_work(priv->wq, &qp_work->work); >> + } >> } >> >> static int poll_tx(struct ipoib_dev_priv *priv) >> -- >> 1.7.1 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body >> of a message to majordomo@vger.kernel.org More majordomo info at >> http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> -----Original Message----- > From: Erez Shitrit [mailto:erezsh@dev.mellanox.co.il] > Sent: Thursday, April 16, 2015 12:14 PM > To: Devesh Sharma; Or Gerlitz; Roland Dreier; Doug Ledford > Cc: linux-rdma@vger.kernel.org; Erez Shitrit; Tal Alon; Amir Vadai > Subject: Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state > > On 4/15/2015 8:20 PM, Devesh Sharma wrote: > >> -----Original Message----- > >> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma- > >> owner@vger.kernel.org] On Behalf Of Or Gerlitz > >> Sent: Thursday, April 02, 2015 4:09 PM > >> To: Roland Dreier; Doug Ledford > >> Cc: linux-rdma@vger.kernel.org; Erez Shitrit; Tal Alon; Amir Vadai; > >> Or Gerlitz > >> Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state > >> > >> From: Erez Shitrit <erezsh@mellanox.com> > >> > >> As the result of a completion error the QP can moved to SQE state by > >> the hardware. Since it's not the Error state, there are no flushes > >> and hence the driver doesn't know about that. > > As per spec, QP transition to SQE causes flush completion for subsequent > WQEs, the description is telling other way. Am I missing something? > > No you are not :) . the description tries to say the following: the driver cannot > distinguish between IB_WC_WR_FLUSH_ERR that threated as IB_WC_SUCCESS > to IB_WC_WR_FLUSH_ERR that comes after other errors that take the QP to > SQE, The driver must recognize the first error that is not > IB_WC_WR_FLUSH_ERR and handle accordingly. > For example, the driver can take the QP to ERROR state as a part of its life > cycle (drain it, driver down, etc.) and at these situations many > IB_WC_WR_FLUSH_ERR return and no need to change the state of the QP, it is > already under the handling of the driver, which is not when other error comes. > this is the intention of that patch, to return the QP back to life after that (un- > handed) cases. Okay, makes sense to me. > > >> The fix creates a task that after completion with error which is not > >> a flush tracks the QP state and if it is in SQE state moves it back to RTS. > >> > >> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> > >> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> > >> --- > >> drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ > >> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 > >> ++++++++++++++++++++++++++++++- > >> 2 files changed, 63 insertions(+), 1 deletions(-) > >> > >> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h > >> b/drivers/infiniband/ulp/ipoib/ipoib.h > >> index 769044c..2703d9a 100644 > >> --- a/drivers/infiniband/ulp/ipoib/ipoib.h > >> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h > >> @@ -299,6 +299,11 @@ struct ipoib_neigh_table { > >> struct completion deleted; > >> }; > >> > >> +struct ipoib_qp_state_validate { > >> + struct work_struct work; > >> + struct ipoib_dev_priv *priv; > >> +}; > >> + > >> /* > >> * Device private locking: network stack tx_lock protects members used > >> * in TX fast path, lock protects everything else. lock nests > >> inside diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c > >> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c > >> index 29b376d..63b92cb 100644 > >> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c > >> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c > >> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device > *ca, > >> } > >> } > >> > >> +/* > >> + * As the result of a completion error the QP Can be transferred to SQE > states. > >> + * The function checks if the (send)QP is in SQE state and > >> + * moves it back to RTS state, that in order to have it functional again. > >> + */ > >> +static void ipoib_qp_state_validate_work(struct work_struct *work) { > >> + struct ipoib_qp_state_validate *qp_work = > >> + container_of(work, struct ipoib_qp_state_validate, work); > >> + > >> + struct ipoib_dev_priv *priv = qp_work->priv; > >> + struct ib_qp_attr qp_attr; > >> + struct ib_qp_init_attr query_init_attr; > >> + int ret; > >> + > >> + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); > >> + if (ret) { > >> + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", > >> + __func__, ret); > >> + goto free_res; > >> + } > >> + pr_info("%s: QP: 0x%x is in state: %d\n", > >> + __func__, priv->qp->qp_num, qp_attr.qp_state); > >> + > >> + /* currently support only in SQE->RTS transition*/ > >> + if (qp_attr.qp_state == IB_QPS_SQE) { > >> + qp_attr.qp_state = IB_QPS_RTS; > >> + > >> + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); > >> + if (ret) { > >> + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", > >> + ret, priv->qp->qp_num); > >> + goto free_res; > >> + } > >> + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to > >> IB_QPS_RTS\n", > >> + __func__, priv->qp->qp_num); > >> + } else { > >> + pr_warn("QP (%d) will stay in state: %d\n", > >> + priv->qp->qp_num, qp_attr.qp_state); > >> + } > >> + > >> +free_res: > >> + kfree(qp_work); > >> +} > >> + > >> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc > *wc) { > >> struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22 > >> @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct > >> ib_wc > >> *wc) > >> netif_wake_queue(dev); > >> > >> if (wc->status != IB_WC_SUCCESS && > >> - wc->status != IB_WC_WR_FLUSH_ERR) > >> + wc->status != IB_WC_WR_FLUSH_ERR) { > >> + struct ipoib_qp_state_validate *qp_work; > >> ipoib_warn(priv, "failed send event " > >> "(status=%d, wrid=%d vend_err %x)\n", > >> wc->status, wr_id, wc->vendor_err); > >> + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); > >> + if (!qp_work) { > >> + ipoib_warn(priv, "%s Failed alloc > >> ipoib_qp_state_validate for qp: 0x%x\n", > >> + __func__, priv->qp->qp_num); > >> + return; > >> + } > >> + > >> + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); > >> + qp_work->priv = priv; > >> + queue_work(priv->wq, &qp_work->work); > >> + } > >> } > >> > >> static int poll_tx(struct ipoib_dev_priv *priv) > >> -- > >> 1.7.1 > >> > >> -- > >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" > >> in the body of a message to majordomo@vger.kernel.org More majordomo > >> info at http://vger.kernel.org/majordomo-info.html > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-rdma" > > in the body of a message to majordomo@vger.kernel.org More majordomo > > info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Apr 15, 2015, at 9:29 AM, Erez Shitrit <erezsh@dev.mellanox.co.il> wrote: > > Hi Doug > > (now with "reply to all" :) ) > > On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford@redhat.com> wrote: >> >>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz@mellanox.com> wrote: >>> >>> From: Erez Shitrit <erezsh@mellanox.com> >>> >>> As the result of a completion error the QP can moved to SQE state by >>> the hardware. Since it's not the Error state, there are no flushes >>> and hence the driver doesn't know about that. >>> >>> The fix creates a task that after completion with error which is not a >>> flush tracks the QP state and if it is in SQE state moves it back to RTS. >> >> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6? Otherwise, this is a “never should happen” situation, yes? >> > > Not only. it happened in the case that described in patch#6 and also > in some other errors that caused the FW/HW to decide move the UD QO to > SQE state (in the category of "bad thing happened.." like damaged > WQE, FW problem, etc.) > The patch originally was written to solve such cases which we really > saw, at testing, customers etc. and not for patch#6 > >>> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> >>> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> >>> --- >>> drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ >>> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 ++++++++++++++++++++++++++++++- >>> 2 files changed, 63 insertions(+), 1 deletions(-) >>> >>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h >>> index 769044c..2703d9a 100644 >>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h >>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h >>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table { >>> struct completion deleted; >>> }; >>> >>> +struct ipoib_qp_state_validate { >>> + struct work_struct work; >>> + struct ipoib_dev_priv *priv; >>> +}; >> >> Why did you choose to do an allocated work struct? All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm. In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static. Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated? And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp? >> > > It looks (at least for me) more elegant to create "on-the-fly" work > create/queue and delete at the end (BTW, I saw it many times, other > than ipoib). > There is no need for that in the CM mode, there the connection and the > QP are destroyed, and new one will be created, only in UD mode the QP > is forever. I knew it wasn’t needed in CM mode currently, but I didn’t know if there was a plan in your mind to change the way CM worked so that it tried to rescue a queue pair instead of destroying and starting over. However, I really think we need to be consistent in this driver. If we allocate our work struct in one and only one place, then we run into the problem where someone working on the driver in the future has to try and remember “Gee, is this the one function that allocates our work struct and we must then free it to avoid leaking memory, or was that a different function?” I would overlook that if we had a plan that involved this function ever operating on anything other than the UD QP, in which case you would have to allocate a struct to designate the QP in question, and you would have to allocate a struct per instance because it would technically be possible to have more than one QP in an SQE state at once. But if that’s not the plan, then please rework this patch to add the work struct to the ipoib private dev like we do with all the other work structs. I’d like to swap this one out of my tree for your new one ASAP. > >>> /* >>> * Device private locking: network stack tx_lock protects members used >>> * in TX fast path, lock protects everything else. lock nests inside >>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>> index 29b376d..63b92cb 100644 >>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, >>> } >>> } >>> >>> +/* >>> + * As the result of a completion error the QP Can be transferred to SQE states. >>> + * The function checks if the (send)QP is in SQE state and >>> + * moves it back to RTS state, that in order to have it functional again. >>> + */ >>> +static void ipoib_qp_state_validate_work(struct work_struct *work) >>> +{ >>> + struct ipoib_qp_state_validate *qp_work = >>> + container_of(work, struct ipoib_qp_state_validate, work); >>> + >>> + struct ipoib_dev_priv *priv = qp_work->priv; >>> + struct ib_qp_attr qp_attr; >>> + struct ib_qp_init_attr query_init_attr; >>> + int ret; >>> + >>> + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); >>> + if (ret) { >>> + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", >>> + __func__, ret); >>> + goto free_res; >>> + } >>> + pr_info("%s: QP: 0x%x is in state: %d\n", >>> + __func__, priv->qp->qp_num, qp_attr.qp_state); >>> + >>> + /* currently support only in SQE->RTS transition*/ >>> + if (qp_attr.qp_state == IB_QPS_SQE) { >>> + qp_attr.qp_state = IB_QPS_RTS; >>> + >>> + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); >>> + if (ret) { >>> + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", >>> + ret, priv->qp->qp_num); >>> + goto free_res; >>> + } >>> + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", >>> + __func__, priv->qp->qp_num); >>> + } else { >>> + pr_warn("QP (%d) will stay in state: %d\n", >>> + priv->qp->qp_num, qp_attr.qp_state); >>> + } >>> + >>> +free_res: >>> + kfree(qp_work); >>> +} >>> + >>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >>> { >>> struct ipoib_dev_priv *priv = netdev_priv(dev); >>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >>> netif_wake_queue(dev); >>> >>> if (wc->status != IB_WC_SUCCESS && >>> - wc->status != IB_WC_WR_FLUSH_ERR) >>> + wc->status != IB_WC_WR_FLUSH_ERR) { >>> + struct ipoib_qp_state_validate *qp_work; >>> ipoib_warn(priv, "failed send event " >>> "(status=%d, wrid=%d vend_err %x)\n", >>> wc->status, wr_id, wc->vendor_err); >>> + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); >>> + if (!qp_work) { >>> + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", >>> + __func__, priv->qp->qp_num); >>> + return; >>> + } >>> + >>> + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); >>> + qp_work->priv = priv; >>> + queue_work(priv->wq, &qp_work->work); >>> + } >>> } >>> >>> static int poll_tx(struct ipoib_dev_priv *priv) >>> -- >>> 1.7.1 >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> — >> Doug Ledford <dledford@redhat.com> >> GPG Key ID: 0E572FDD >> >> >> >> >> > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html — Doug Ledford <dledford@redhat.com> GPG Key ID: 0E572FDD
On Thu, Apr 16, 2015 at 4:29 PM, Doug Ledford <dledford@redhat.com> wrote: > >> On Apr 15, 2015, at 9:29 AM, Erez Shitrit <erezsh@dev.mellanox.co.il> wrote: >> >> Hi Doug >> >> (now with "reply to all" :) ) >> >> On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford@redhat.com> wrote: >>> >>>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz@mellanox.com> wrote: >>>> >>>> From: Erez Shitrit <erezsh@mellanox.com> >>>> >>>> As the result of a completion error the QP can moved to SQE state by >>>> the hardware. Since it's not the Error state, there are no flushes >>>> and hence the driver doesn't know about that. >>>> >>>> The fix creates a task that after completion with error which is not a >>>> flush tracks the QP state and if it is in SQE state moves it back to RTS. >>> >>> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6? Otherwise, this is a “never should happen” situation, yes? >>> >> >> Not only. it happened in the case that described in patch#6 and also >> in some other errors that caused the FW/HW to decide move the UD QO to >> SQE state (in the category of "bad thing happened.." like damaged >> WQE, FW problem, etc.) >> The patch originally was written to solve such cases which we really >> saw, at testing, customers etc. and not for patch#6 >> >>>> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> >>>> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> >>>> --- >>>> drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ >>>> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 ++++++++++++++++++++++++++++++- >>>> 2 files changed, 63 insertions(+), 1 deletions(-) >>>> >>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h >>>> index 769044c..2703d9a 100644 >>>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h >>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h >>>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table { >>>> struct completion deleted; >>>> }; >>>> >>>> +struct ipoib_qp_state_validate { >>>> + struct work_struct work; >>>> + struct ipoib_dev_priv *priv; >>>> +}; >>> >>> Why did you choose to do an allocated work struct? All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm. In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static. Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated? And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp? >>> >> >> It looks (at least for me) more elegant to create "on-the-fly" work >> create/queue and delete at the end (BTW, I saw it many times, other >> than ipoib). >> There is no need for that in the CM mode, there the connection and the >> QP are destroyed, and new one will be created, only in UD mode the QP >> is forever. > > I knew it wasn’t needed in CM mode currently, but I didn’t know if there was a plan in your mind to change the way CM worked so that it tried to rescue a queue pair instead of destroying and starting over. > > However, I really think we need to be consistent in this driver. If we allocate our work struct in one and only one place, then we run into the problem where someone working on the driver in the future has to try and remember “Gee, is this the one function that allocates our work struct and we must then free it to avoid leaking memory, or was that a different function?” I would overlook that if we had a plan that involved this function ever operating on anything other than the UD QP, in which case you would have to allocate a struct to designate the QP in question, and you would have to allocate a struct per instance because it would technically be possible to have more than one QP in an SQE state at once. But if that’s not the plan, then please rework this patch to add the work struct to the ipoib private dev like we do with all the other work structs. I’d like to swap this one out of my tree for your new one ASAP. > There is a plan to have more than one UD QP per device, in order to have multi-queue interfaces (not according to the CM QP but with the same idea that you said, many objects that can use different work each). So, i think that this way to allocate work object is still reasonable. >> >>>> /* >>>> * Device private locking: network stack tx_lock protects members used >>>> * in TX fast path, lock protects everything else. lock nests inside >>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>>> index 29b376d..63b92cb 100644 >>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, >>>> } >>>> } >>>> >>>> +/* >>>> + * As the result of a completion error the QP Can be transferred to SQE states. >>>> + * The function checks if the (send)QP is in SQE state and >>>> + * moves it back to RTS state, that in order to have it functional again. >>>> + */ >>>> +static void ipoib_qp_state_validate_work(struct work_struct *work) >>>> +{ >>>> + struct ipoib_qp_state_validate *qp_work = >>>> + container_of(work, struct ipoib_qp_state_validate, work); >>>> + >>>> + struct ipoib_dev_priv *priv = qp_work->priv; >>>> + struct ib_qp_attr qp_attr; >>>> + struct ib_qp_init_attr query_init_attr; >>>> + int ret; >>>> + >>>> + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); >>>> + if (ret) { >>>> + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", >>>> + __func__, ret); >>>> + goto free_res; >>>> + } >>>> + pr_info("%s: QP: 0x%x is in state: %d\n", >>>> + __func__, priv->qp->qp_num, qp_attr.qp_state); >>>> + >>>> + /* currently support only in SQE->RTS transition*/ >>>> + if (qp_attr.qp_state == IB_QPS_SQE) { >>>> + qp_attr.qp_state = IB_QPS_RTS; >>>> + >>>> + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); >>>> + if (ret) { >>>> + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", >>>> + ret, priv->qp->qp_num); >>>> + goto free_res; >>>> + } >>>> + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", >>>> + __func__, priv->qp->qp_num); >>>> + } else { >>>> + pr_warn("QP (%d) will stay in state: %d\n", >>>> + priv->qp->qp_num, qp_attr.qp_state); >>>> + } >>>> + >>>> +free_res: >>>> + kfree(qp_work); >>>> +} >>>> + >>>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >>>> { >>>> struct ipoib_dev_priv *priv = netdev_priv(dev); >>>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >>>> netif_wake_queue(dev); >>>> >>>> if (wc->status != IB_WC_SUCCESS && >>>> - wc->status != IB_WC_WR_FLUSH_ERR) >>>> + wc->status != IB_WC_WR_FLUSH_ERR) { >>>> + struct ipoib_qp_state_validate *qp_work; >>>> ipoib_warn(priv, "failed send event " >>>> "(status=%d, wrid=%d vend_err %x)\n", >>>> wc->status, wr_id, wc->vendor_err); >>>> + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); >>>> + if (!qp_work) { >>>> + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", >>>> + __func__, priv->qp->qp_num); >>>> + return; >>>> + } >>>> + >>>> + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); >>>> + qp_work->priv = priv; >>>> + queue_work(priv->wq, &qp_work->work); >>>> + } >>>> } >>>> >>>> static int poll_tx(struct ipoib_dev_priv *priv) >>>> -- >>>> 1.7.1 >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>>> the body of a message to majordomo@vger.kernel.org >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>> >>> — >>> Doug Ledford <dledford@redhat.com> >>> GPG Key ID: 0E572FDD >>> >>> >>> >>> >>> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > — > Doug Ledford <dledford@redhat.com> > GPG Key ID: 0E572FDD > > > > > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> On Apr 16, 2015, at 10:32 AM, Erez Shitrit <erezsh@dev.mellanox.co.il> wrote: > > On Thu, Apr 16, 2015 at 4:29 PM, Doug Ledford <dledford@redhat.com> wrote: >> >>> On Apr 15, 2015, at 9:29 AM, Erez Shitrit <erezsh@dev.mellanox.co.il> wrote: >>> >>> Hi Doug >>> >>> (now with "reply to all" :) ) >>> >>> On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford@redhat.com> wrote: >>>> >>>>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz@mellanox.com> wrote: >>>>> >>>>> From: Erez Shitrit <erezsh@mellanox.com> >>>>> >>>>> As the result of a completion error the QP can moved to SQE state by >>>>> the hardware. Since it's not the Error state, there are no flushes >>>>> and hence the driver doesn't know about that. >>>>> >>>>> The fix creates a task that after completion with error which is not a >>>>> flush tracks the QP state and if it is in SQE state moves it back to RTS. >>>> >>>> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6? Otherwise, this is a “never should happen” situation, yes? >>>> >>> >>> Not only. it happened in the case that described in patch#6 and also >>> in some other errors that caused the FW/HW to decide move the UD QO to >>> SQE state (in the category of "bad thing happened.." like damaged >>> WQE, FW problem, etc.) >>> The patch originally was written to solve such cases which we really >>> saw, at testing, customers etc. and not for patch#6 >>> >>>>> Signed-off-by: Erez Shitrit <erezsh@mellanox.com> >>>>> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> >>>>> --- >>>>> drivers/infiniband/ulp/ipoib/ipoib.h | 5 +++ >>>>> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 59 ++++++++++++++++++++++++++++++- >>>>> 2 files changed, 63 insertions(+), 1 deletions(-) >>>>> >>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h >>>>> index 769044c..2703d9a 100644 >>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h >>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h >>>>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table { >>>>> struct completion deleted; >>>>> }; >>>>> >>>>> +struct ipoib_qp_state_validate { >>>>> + struct work_struct work; >>>>> + struct ipoib_dev_priv *priv; >>>>> +}; >>>> >>>> Why did you choose to do an allocated work struct? All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm. In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static. Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated? And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp? >>>> >>> >>> It looks (at least for me) more elegant to create "on-the-fly" work >>> create/queue and delete at the end (BTW, I saw it many times, other >>> than ipoib). >>> There is no need for that in the CM mode, there the connection and the >>> QP are destroyed, and new one will be created, only in UD mode the QP >>> is forever. >> >> I knew it wasn’t needed in CM mode currently, but I didn’t know if there was a plan in your mind to change the way CM worked so that it tried to rescue a queue pair instead of destroying and starting over. >> >> However, I really think we need to be consistent in this driver. If we allocate our work struct in one and only one place, then we run into the problem where someone working on the driver in the future has to try and remember “Gee, is this the one function that allocates our work struct and we must then free it to avoid leaking memory, or was that a different function?” I would overlook that if we had a plan that involved this function ever operating on anything other than the UD QP, in which case you would have to allocate a struct to designate the QP in question, and you would have to allocate a struct per instance because it would technically be possible to have more than one QP in an SQE state at once. But if that’s not the plan, then please rework this patch to add the work struct to the ipoib private dev like we do with all the other work structs. I’d like to swap this one out of my tree for your new one ASAP. >> > > There is a plan to have more than one UD QP per device, in order to > have multi-queue interfaces (not according to the CM QP but with the > same idea that you said, many objects that can use different work > each). > So, i think that this way to allocate work object is still reasonable. OK, then yes, that changes things. The patch as it stands is good then. > >>> >>>>> /* >>>>> * Device private locking: network stack tx_lock protects members used >>>>> * in TX fast path, lock protects everything else. lock nests inside >>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>>>> index 29b376d..63b92cb 100644 >>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c >>>>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, >>>>> } >>>>> } >>>>> >>>>> +/* >>>>> + * As the result of a completion error the QP Can be transferred to SQE states. >>>>> + * The function checks if the (send)QP is in SQE state and >>>>> + * moves it back to RTS state, that in order to have it functional again. >>>>> + */ >>>>> +static void ipoib_qp_state_validate_work(struct work_struct *work) >>>>> +{ >>>>> + struct ipoib_qp_state_validate *qp_work = >>>>> + container_of(work, struct ipoib_qp_state_validate, work); >>>>> + >>>>> + struct ipoib_dev_priv *priv = qp_work->priv; >>>>> + struct ib_qp_attr qp_attr; >>>>> + struct ib_qp_init_attr query_init_attr; >>>>> + int ret; >>>>> + >>>>> + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); >>>>> + if (ret) { >>>>> + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", >>>>> + __func__, ret); >>>>> + goto free_res; >>>>> + } >>>>> + pr_info("%s: QP: 0x%x is in state: %d\n", >>>>> + __func__, priv->qp->qp_num, qp_attr.qp_state); >>>>> + >>>>> + /* currently support only in SQE->RTS transition*/ >>>>> + if (qp_attr.qp_state == IB_QPS_SQE) { >>>>> + qp_attr.qp_state = IB_QPS_RTS; >>>>> + >>>>> + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); >>>>> + if (ret) { >>>>> + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", >>>>> + ret, priv->qp->qp_num); >>>>> + goto free_res; >>>>> + } >>>>> + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", >>>>> + __func__, priv->qp->qp_num); >>>>> + } else { >>>>> + pr_warn("QP (%d) will stay in state: %d\n", >>>>> + priv->qp->qp_num, qp_attr.qp_state); >>>>> + } >>>>> + >>>>> +free_res: >>>>> + kfree(qp_work); >>>>> +} >>>>> + >>>>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >>>>> { >>>>> struct ipoib_dev_priv *priv = netdev_priv(dev); >>>>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) >>>>> netif_wake_queue(dev); >>>>> >>>>> if (wc->status != IB_WC_SUCCESS && >>>>> - wc->status != IB_WC_WR_FLUSH_ERR) >>>>> + wc->status != IB_WC_WR_FLUSH_ERR) { >>>>> + struct ipoib_qp_state_validate *qp_work; >>>>> ipoib_warn(priv, "failed send event " >>>>> "(status=%d, wrid=%d vend_err %x)\n", >>>>> wc->status, wr_id, wc->vendor_err); >>>>> + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); >>>>> + if (!qp_work) { >>>>> + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", >>>>> + __func__, priv->qp->qp_num); >>>>> + return; >>>>> + } >>>>> + >>>>> + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); >>>>> + qp_work->priv = priv; >>>>> + queue_work(priv->wq, &qp_work->work); >>>>> + } >>>>> } >>>>> >>>>> static int poll_tx(struct ipoib_dev_priv *priv) >>>>> -- >>>>> 1.7.1 >>>>> >>>>> -- >>>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>>>> the body of a message to majordomo@vger.kernel.org >>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>>> >>>> — >>>> Doug Ledford <dledford@redhat.com> >>>> GPG Key ID: 0E572FDD >>>> >>>> >>>> >>>> >>>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> — >> Doug Ledford <dledford@redhat.com> >> GPG Key ID: 0E572FDD — Doug Ledford <dledford@redhat.com> GPG Key ID: 0E572FDD
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 769044c..2703d9a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -299,6 +299,11 @@ struct ipoib_neigh_table { struct completion deleted; }; +struct ipoib_qp_state_validate { + struct work_struct work; + struct ipoib_dev_priv *priv; +}; + /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 29b376d..63b92cb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, } } +/* + * As the result of a completion error the QP Can be transferred to SQE states. + * The function checks if the (send)QP is in SQE state and + * moves it back to RTS state, that in order to have it functional again. + */ +static void ipoib_qp_state_validate_work(struct work_struct *work) +{ + struct ipoib_qp_state_validate *qp_work = + container_of(work, struct ipoib_qp_state_validate, work); + + struct ipoib_dev_priv *priv = qp_work->priv; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", + __func__, ret); + goto free_res; + } + pr_info("%s: QP: 0x%x is in state: %d\n", + __func__, priv->qp->qp_num, qp_attr.qp_state); + + /* currently support only in SQE->RTS transition*/ + if (qp_attr.qp_state == IB_QPS_SQE) { + qp_attr.qp_state = IB_QPS_RTS; + + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); + if (ret) { + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", + ret, priv->qp->qp_num); + goto free_res; + } + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", + __func__, priv->qp->qp_num); + } else { + pr_warn("QP (%d) will stay in state: %d\n", + priv->qp->qp_num, qp_attr.qp_state); + } + +free_res: + kfree(qp_work); +} + static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_wake_queue(dev); if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_qp_state_validate *qp_work; ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); + if (!qp_work) { + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", + __func__, priv->qp->qp_num); + return; + } + + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); + qp_work->priv = priv; + queue_work(priv->wq, &qp_work->work); + } } static int poll_tx(struct ipoib_dev_priv *priv)