
[1/3] IB: new common API for draining a queue pair

Message ID 2da1db58d642789e8df154e34d622a37295d1ba3.1454709317.git.swise@chelsio.com (mailing list archive)
State Superseded

Commit Message

Steve Wise Feb. 5, 2016, 9:13 p.m. UTC
From: Steve Wise <swise@opengridcomputing.com>

Add a provider-specific drain_qp function for providers needing special
drain logic.

Add static function __ib_drain_qp() which posts no-op WRs to the RQ and
SQ and blocks until their completions are processed.  This ensures the
application's completions have all been processed.

Add API function ib_drain_qp(), which calls the provider-specific drain
if it exists, or __ib_drain_qp() otherwise.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h         |  2 ++
 2 files changed, 74 insertions(+)
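
For context, a minimal sketch of how a consumer might use the new API during
connection teardown (illustrative only; cm_id, send_cq and recv_cq are the
consumer's own objects, and the queues/CQs must leave room for the drain WRs,
as discussed in the comments below):

    /* Move the QP toward the error state, then drain before freeing. */
    rdma_disconnect(cm_id);
    ib_drain_qp(qp);        /* blocks until both drain CQEs are reaped */
    ib_destroy_qp(qp);
    ib_free_cq(send_cq);
    ib_free_cq(recv_cq);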

Comments

Chuck Lever III Feb. 5, 2016, 9:49 p.m. UTC | #1
> On Feb 5, 2016, at 4:13 PM, Steve Wise <swise@opengridcomputing.com> wrote:
> 
> From: Steve Wise <swise@opengridcomputing.com.com>
> 
> Add provider-specific drain_qp function for providers needing special
> drain logic.
> 
> Add static function __ib_drain_qp() which posts noop WRs to the RQ and
> SQ and blocks until their completions are processed.  This ensures the
> applications completions have all been processed.
> 
> Add API function ib_drain_qp() which calls the provider-specific drain
> if it exists or __ib_drain_qp().
> 
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> ---
> drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
> include/rdma/ib_verbs.h         |  2 ++
> 2 files changed, 74 insertions(+)
> 
> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index 5af6d02..31b82cd 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -1657,3 +1657,75 @@ next_page:
> 	return i;
> }
> EXPORT_SYMBOL(ib_sg_to_pages);
> +
> +struct ib_drain_cqe {
> +	struct ib_cqe cqe;
> +	struct completion done;
> +};
> +
> +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
> +						cqe);
> +
> +	complete(&cqe->done);
> +}
> +
> +/*
> + * Post a WR and block until its completion is reaped for both the RQ and SQ.
> + */
> +static void __ib_drain_qp(struct ib_qp *qp)
> +{
> +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> +	struct ib_drain_cqe rdrain, sdrain;
> +	struct ib_recv_wr rwr = {}, *bad_rwr;
> +	struct ib_send_wr swr = {}, *bad_swr;
> +	int ret;
> +
> +	rwr.wr_cqe = &rdrain.cqe;
> +	rdrain.cqe.done = ib_drain_qp_done;
> +	init_completion(&rdrain.done);
> +
> +	swr.wr_cqe = &sdrain.cqe;
> +	sdrain.cqe.done = ib_drain_qp_done;

OK. ib_cqe is what hooks the completion events for these
blank WRs, so those completions are never exposed to the
RDMA consumer.

But does a consumer have to bump its SQE and RQE count
when allocating its CQs, or is that done automatically
by ib_alloc_cq() ?



> +	init_completion(&sdrain.done);
> +
> +	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +		return;
> +	}
> +
> +	ret = ib_post_recv(qp, &rwr, &bad_rwr);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
> +		return;
> +	}
> +
> +	ret = ib_post_send(qp, &swr, &bad_swr);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
> +		return;
> +	}
> +
> +	wait_for_completion(&rdrain.done);
> +	wait_for_completion(&sdrain.done);
> +}
> +
> +/**
> + * ib_drain_qp() - Block until all CQEs have been consumed by the
> + *		   application.
> + * @qp:            queue pair to drain
> + *
> + * If the device has a provider-specific drain function, then
> + * call that.  Otherwise call the generic drain function
> + * __ib_drain_qp().
> + */
> +void ib_drain_qp(struct ib_qp *qp)
> +{
> +	if (qp->device->drain_qp)
> +		qp->device->drain_qp(qp);
> +	else
> +		__ib_drain_qp(qp);
> +}
> +EXPORT_SYMBOL(ib_drain_qp);
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 284b00c..d8533ab 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1846,6 +1846,7 @@ struct ib_device {
> 	int			   (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
> 						      struct ib_mr_status *mr_status);
> 	void			   (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
> +	void			   (*drain_qp)(struct ib_qp *qp);
> 
> 	struct ib_dma_mapping_ops   *dma_ops;
> 
> @@ -3094,4 +3095,5 @@ int ib_sg_to_pages(struct ib_mr *mr,
> 		   int sg_nents,
> 		   int (*set_page)(struct ib_mr *, u64));
> 
> +void ib_drain_qp(struct ib_qp *qp);
> #endif /* IB_VERBS_H */
> -- 
> 2.7.0
> 

--
Chuck Lever




Steve Wise Feb. 5, 2016, 9:53 p.m. UTC | #2
> > From: Steve Wise <swise@opengridcomputing.com.com>
> >
> > Add provider-specific drain_qp function for providers needing special
> > drain logic.
> >
> > Add static function __ib_drain_qp() which posts noop WRs to the RQ and
> > SQ and blocks until their completions are processed.  This ensures the
> > applications completions have all been processed.
> >
> > Add API function ib_drain_qp() which calls the provider-specific drain
> > if it exists or __ib_drain_qp().
> >
> > Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> > ---
> > drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
> > include/rdma/ib_verbs.h         |  2 ++
> > 2 files changed, 74 insertions(+)
> >
> > diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> > index 5af6d02..31b82cd 100644
> > --- a/drivers/infiniband/core/verbs.c
> > +++ b/drivers/infiniband/core/verbs.c
> > @@ -1657,3 +1657,75 @@ next_page:
> > 	return i;
> > }
> > EXPORT_SYMBOL(ib_sg_to_pages);
> > +
> > +struct ib_drain_cqe {
> > +	struct ib_cqe cqe;
> > +	struct completion done;
> > +};
> > +
> > +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
> > +{
> > +	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
> > +						cqe);
> > +
> > +	complete(&cqe->done);
> > +}
> > +
> > +/*
> > + * Post a WR and block until its completion is reaped for both the RQ and SQ.
> > + */
> > +static void __ib_drain_qp(struct ib_qp *qp)
> > +{
> > +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> > +	struct ib_drain_cqe rdrain, sdrain;
> > +	struct ib_recv_wr rwr = {}, *bad_rwr;
> > +	struct ib_send_wr swr = {}, *bad_swr;
> > +	int ret;
> > +
> > +	rwr.wr_cqe = &rdrain.cqe;
> > +	rdrain.cqe.done = ib_drain_qp_done;
> > +	init_completion(&rdrain.done);
> > +
> > +	swr.wr_cqe = &sdrain.cqe;
> > +	sdrain.cqe.done = ib_drain_qp_done;
> 
> OK. ib_cqe is what hooks the completion events for these
> blank WRs, so those completions are never exposed to the
> RDMA consumer.
> 

Right, which means only consumers that use the new style CQ processing can make use of this.
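
For illustration, "new style" here means the consumer's own work requests
dispatch their completions through wr_cqe rather than wr_id, roughly like the
following (my_ctx and my_send_done are purely illustrative names):

    struct my_ctx {
        struct ib_cqe cqe;      /* wr->wr_cqe points here */
        /* ... consumer state ... */
    };

    static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
    {
        struct my_ctx *ctx = container_of(wc->wr_cqe, struct my_ctx, cqe);

        /* handle the completion ... */
    }

    /* when posting: */
    ctx->cqe.done = my_send_done;
    wr.wr_cqe = &ctx->cqe;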

> But does a consumer have to bump its SQE and RQE count
> when allocating its CQs, or is that done automatically
> by ib_alloc_cq() ?
>

The consumer has to make sure there is room in the SQ, RQ and CQ.  Going forward, we could enhance QP and CQ allocation to allow the
consumer to specify it wants drain capability so the consumer doesn't have to do this. It could be done under the covers.  In fact,
if we did that, then ib_destroy_qp() could do the drain if need be.
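
Concretely, a rough sketch of the accounting a consumer would do today,
leaving one extra slot on each queue for the drain WRs (device, max_send_wr,
max_recv_wr and the polling context are the consumer's own choices and purely
illustrative):

    send_cq = ib_alloc_cq(device, NULL, max_send_wr + 1, 0, IB_POLL_SOFTIRQ);
    recv_cq = ib_alloc_cq(device, NULL, max_recv_wr + 1, 0, IB_POLL_SOFTIRQ);

    memset(&init_attr, 0, sizeof(init_attr));
    init_attr.cap.max_send_wr = max_send_wr + 1;    /* +1 for the send drain WR */
    init_attr.cap.max_recv_wr = max_recv_wr + 1;    /* +1 for the recv drain WR */
    init_attr.send_cq = send_cq;
    init_attr.recv_cq = recv_cq;
    /* ... remaining init_attr fields as usual, then ib_create_qp() ... */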
 


Chuck Lever III Feb. 5, 2016, 10 p.m. UTC | #3
> On Feb 5, 2016, at 4:53 PM, Steve Wise <swise@opengridcomputing.com> wrote:
> 
>>> From: Steve Wise <swise@opengridcomputing.com.com>
>>> 
>>> Add provider-specific drain_qp function for providers needing special
>>> drain logic.
>>> 
>>> Add static function __ib_drain_qp() which posts noop WRs to the RQ and
>>> SQ and blocks until their completions are processed.  This ensures the
>>> applications completions have all been processed.
>>> 
>>> Add API function ib_drain_qp() which calls the provider-specific drain
>>> if it exists or __ib_drain_qp().
>>> 
>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>> ---
>>> drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
>>> include/rdma/ib_verbs.h         |  2 ++
>>> 2 files changed, 74 insertions(+)
>>> 
>>> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
>>> index 5af6d02..31b82cd 100644
>>> --- a/drivers/infiniband/core/verbs.c
>>> +++ b/drivers/infiniband/core/verbs.c
>>> @@ -1657,3 +1657,75 @@ next_page:
>>> 	return i;
>>> }
>>> EXPORT_SYMBOL(ib_sg_to_pages);
>>> +
>>> +struct ib_drain_cqe {
>>> +	struct ib_cqe cqe;
>>> +	struct completion done;
>>> +};
>>> +
>>> +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
>>> +{
>>> +	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
>>> +						cqe);
>>> +
>>> +	complete(&cqe->done);
>>> +}
>>> +
>>> +/*
>>> + * Post a WR and block until its completion is reaped for both the RQ and SQ.
>>> + */
>>> +static void __ib_drain_qp(struct ib_qp *qp)
>>> +{
>>> +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>>> +	struct ib_drain_cqe rdrain, sdrain;
>>> +	struct ib_recv_wr rwr = {}, *bad_rwr;
>>> +	struct ib_send_wr swr = {}, *bad_swr;
>>> +	int ret;
>>> +
>>> +	rwr.wr_cqe = &rdrain.cqe;
>>> +	rdrain.cqe.done = ib_drain_qp_done;
>>> +	init_completion(&rdrain.done);
>>> +
>>> +	swr.wr_cqe = &sdrain.cqe;
>>> +	sdrain.cqe.done = ib_drain_qp_done;
>> 
>> OK. ib_cqe is what hooks the completion events for these
>> blank WRs, so those completions are never exposed to the
>> RDMA consumer.
>> 
> 
> Right, which means only consumers that use the new style CQ processing can make use of this.
> 
>> But does a consumer have to bump its SQE and RQE count
>> when allocating its CQs, or is that done automatically
>> by ib_alloc_cq() ?
>> 
> 
> The consumer has to make sure there is room in the SQ, RQ and CQ.

The documenting comment in front of ib_drain_qp() should
mention this, and it should also state the requirement to
use ib_alloc_cq().
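
Something along these lines, for example (wording illustrative):

    /**
     * ib_drain_qp() - Block until all CQEs have been consumed by the
     *                 application.
     * @qp:            queue pair to drain
     *
     * The caller must ensure there is room in the SQ, RQ and their CQs for
     * the drain work requests and completions, and must have allocated the
     * CQs with ib_alloc_cq(), since the drain completions are reaped via
     * wr_cqe and never exposed to the application.
     *
     * If the device has a provider-specific drain function, call that;
     * otherwise call the generic __ib_drain_qp().
     */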


> Going forward, we could enhance QP and CQ allocation to allow the
> consumer to specify it wants drain capability so the consumer doesn't have to do this. It could be done under the covers.  In fact,
> if we did that, then ib_destroy_qp() could do the drain if need be.


--
Chuck Lever




Chuck Lever III Feb. 5, 2016, 10:20 p.m. UTC | #4
> On Feb 5, 2016, at 5:00 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
> 
>> 
>> On Feb 5, 2016, at 4:53 PM, Steve Wise <swise@opengridcomputing.com> wrote:
>> 
>>>> From: Steve Wise <swise@opengridcomputing.com.com>
>>>> 
>>>> Add provider-specific drain_qp function for providers needing special
>>>> drain logic.
>>>> 
>>>> Add static function __ib_drain_qp() which posts noop WRs to the RQ and
>>>> SQ and blocks until their completions are processed.  This ensures the
>>>> applications completions have all been processed.
>>>> 
>>>> Add API function ib_drain_qp() which calls the provider-specific drain
>>>> if it exists or __ib_drain_qp().
>>>> 
>>>> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
>>>> ---
>>>> drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
>>>> include/rdma/ib_verbs.h         |  2 ++
>>>> 2 files changed, 74 insertions(+)
>>>> 
>>>> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
>>>> index 5af6d02..31b82cd 100644
>>>> --- a/drivers/infiniband/core/verbs.c
>>>> +++ b/drivers/infiniband/core/verbs.c
>>>> @@ -1657,3 +1657,75 @@ next_page:
>>>> 	return i;
>>>> }
>>>> EXPORT_SYMBOL(ib_sg_to_pages);
>>>> +
>>>> +struct ib_drain_cqe {
>>>> +	struct ib_cqe cqe;
>>>> +	struct completion done;
>>>> +};
>>>> +
>>>> +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
>>>> +{
>>>> +	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
>>>> +						cqe);
>>>> +
>>>> +	complete(&cqe->done);
>>>> +}
>>>> +
>>>> +/*
>>>> + * Post a WR and block until its completion is reaped for both the RQ and SQ.
>>>> + */
>>>> +static void __ib_drain_qp(struct ib_qp *qp)
>>>> +{
>>>> +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>>>> +	struct ib_drain_cqe rdrain, sdrain;
>>>> +	struct ib_recv_wr rwr = {}, *bad_rwr;
>>>> +	struct ib_send_wr swr = {}, *bad_swr;
>>>> +	int ret;
>>>> +
>>>> +	rwr.wr_cqe = &rdrain.cqe;
>>>> +	rdrain.cqe.done = ib_drain_qp_done;
>>>> +	init_completion(&rdrain.done);
>>>> +
>>>> +	swr.wr_cqe = &sdrain.cqe;
>>>> +	sdrain.cqe.done = ib_drain_qp_done;
>>> 
>>> OK. ib_cqe is what hooks the completion events for these
>>> blank WRs, so those completions are never exposed to the
>>> RDMA consumer.
>>> 
>> 
>> Right, which means only consumers that use the new style CQ processing can make use of this.
>> 
>>> But does a consumer have to bump its SQE and RQE count
>>> when allocating its CQs, or is that done automatically
>>> by ib_alloc_cq() ?
>>> 
>> 
>> The consumer has to make sure there is room in the SQ, RQ and CQ.
> 
> The documenting comment in front of ib_drain_qp() should
> mention this, and it should also state the requirement to
> use ib_alloc_cq().

And otherwise, I'm happy to see this work!

Reviewed-by: Chuck Lever <chuck.lever@oracle.com>


>> Going forward, we could enhance QP and CQ allocation to allow the
>> consumer to specify it wants drain capability so the consumer doesn't have to do this. It could be done under the covers.  In fact,
>> if we did that, then ib_destroy_qp() could do the drain if need be.
> 
> 
> --
> Chuck Lever
> 
> 
> 
> 

--
Chuck Lever




Devesh Sharma Feb. 6, 2016, 4:10 p.m. UTC | #5
Hi Steve,


On Sat, Feb 6, 2016 at 2:43 AM, Steve Wise <swise@opengridcomputing.com> wrote:
> From: Steve Wise <swise@opengridcomputing.com.com>
>
> Add provider-specific drain_qp function for providers needing special
> drain logic.
>
> Add static function __ib_drain_qp() which posts noop WRs to the RQ and
> SQ and blocks until their completions are processed.  This ensures the
> applications completions have all been processed.
>
> Add API function ib_drain_qp() which calls the provider-specific drain
> if it exists or __ib_drain_qp().
>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> ---
>  drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h         |  2 ++
>  2 files changed, 74 insertions(+)
>
> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index 5af6d02..31b82cd 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -1657,3 +1657,75 @@ next_page:
>         return i;
>  }
>  EXPORT_SYMBOL(ib_sg_to_pages);
> +
> +struct ib_drain_cqe {
> +       struct ib_cqe cqe;
> +       struct completion done;
> +};
> +
> +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +       struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
> +                                               cqe);
> +
> +       complete(&cqe->done);
> +}
> +
> +/*
> + * Post a WR and block until its completion is reaped for both the RQ and SQ.
> + */
> +static void __ib_drain_qp(struct ib_qp *qp)
> +{
> +       struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> +       struct ib_drain_cqe rdrain, sdrain;
> +       struct ib_recv_wr rwr = {}, *bad_rwr;
> +       struct ib_send_wr swr = {}, *bad_swr;
> +       int ret;
> +
> +       rwr.wr_cqe = &rdrain.cqe;
> +       rdrain.cqe.done = ib_drain_qp_done;
> +       init_completion(&rdrain.done);
> +
> +       swr.wr_cqe = &sdrain.cqe;
> +       sdrain.cqe.done = ib_drain_qp_done;
> +       init_completion(&sdrain.done);
> +
> +       ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> +       if (ret) {
> +               WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +               return;
> +       }

I was wondering whether we really need this modify_qp here. Generally,
drain_qp is called on a QP that has already errored out. In a graceful
tear-down, rdma_disconnect takes care of modifying the QP to error, so by
the time drain_qp runs the QP is already in the error state.

> +
> +       ret = ib_post_recv(qp, &rwr, &bad_rwr);
> +       if (ret) {
> +               WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
> +               return;
> +       }
> +
> +       ret = ib_post_send(qp, &swr, &bad_swr);
> +       if (ret) {
> +               WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
> +               return;
> +       }
> +
> +       wait_for_completion(&rdrain.done);
> +       wait_for_completion(&sdrain.done);
> +}
> +
> +/**
> + * ib_drain_qp() - Block until all CQEs have been consumed by the
> + *                application.
> + * @qp:            queue pair to drain
> + *
> + * If the device has a provider-specific drain function, then
> + * call that.  Otherwise call the generic drain function
> + * __ib_drain_qp().
> + */
> +void ib_drain_qp(struct ib_qp *qp)
> +{
> +       if (qp->device->drain_qp)
> +               qp->device->drain_qp(qp);
> +       else
> +               __ib_drain_qp(qp);
> +}
> +EXPORT_SYMBOL(ib_drain_qp);
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 284b00c..d8533ab 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1846,6 +1846,7 @@ struct ib_device {
>         int                        (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
>                                                       struct ib_mr_status *mr_status);
>         void                       (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
> +       void                       (*drain_qp)(struct ib_qp *qp);
>
>         struct ib_dma_mapping_ops   *dma_ops;
>
> @@ -3094,4 +3095,5 @@ int ib_sg_to_pages(struct ib_mr *mr,
>                    int sg_nents,
>                    int (*set_page)(struct ib_mr *, u64));
>
> +void ib_drain_qp(struct ib_qp *qp);
>  #endif /* IB_VERBS_H */
> --
> 2.7.0
>
Leon Romanovsky Feb. 6, 2016, 5:08 p.m. UTC | #6
On Fri, Feb 05, 2016 at 01:13:16PM -0800, Steve Wise wrote:
> From: Steve Wise <swise@opengridcomputing.com.com>
> 
> Add provider-specific drain_qp function for providers needing special
> drain logic.
> 
> Add static function __ib_drain_qp() which posts noop WRs to the RQ and
> SQ and blocks until their completions are processed.  This ensures the
> applications completions have all been processed.
> 
> Add API function ib_drain_qp() which calls the provider-specific drain
> if it exists or __ib_drain_qp().
> 
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> ---
>  drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_verbs.h         |  2 ++
>  2 files changed, 74 insertions(+)
> 

 ...

> +	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +		return;
> +	}
> +
> +	ret = ib_post_recv(qp, &rwr, &bad_rwr);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
> +		return;
> +	}
> +
> +	ret = ib_post_send(qp, &swr, &bad_swr);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
> +		return;

You are returning here even though the recv drain WR was posted
successfully, so you could still safely wait for the rdrain completion.
Is this done on purpose?

> +	}
> +
> +	wait_for_completion(&rdrain.done);
> +	wait_for_completion(&sdrain.done);
> +}
> +
Sagi Grimberg Feb. 7, 2016, 11:51 a.m. UTC | #7
>> +	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>> +	if (ret) {
>> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>> +		return;
>> +	}
>> +
>> +	ret = ib_post_recv(qp, &rwr, &bad_rwr);
>> +	if (ret) {
>> +		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
>> +		return;
>> +	}
>> +
>> +	ret = ib_post_send(qp, &swr, &bad_swr);
>> +	if (ret) {
>> +		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
>> +		return;
>
> You are returning here despite the fact that recv queue drained
> successfully and you can wait for completion of rdrain safely.
> Is it done on purpose?

Good catch, the completion structures are on-stack.

Steve, you need to wait for the completion of the
successful post in this case...
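
Something like the following in the send-post error path would cover it
(sketch only, mirroring the existing code):

    ret = ib_post_send(qp, &swr, &bad_swr);
    if (ret) {
        WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
        /*
         * The recv drain WR was already posted, so its completion
         * must still be reaped before the on-stack rdrain goes out
         * of scope.
         */
        wait_for_completion(&rdrain.done);
        return;
    }

    wait_for_completion(&rdrain.done);
    wait_for_completion(&sdrain.done);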
Sagi Grimberg Feb. 7, 2016, 11:53 a.m. UTC | #8
>> +/*
>> + * Post a WR and block until its completion is reaped for both the RQ and SQ.
>> + */
>> +static void __ib_drain_qp(struct ib_qp *qp)
>> +{
>> +       struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>> +       struct ib_drain_cqe rdrain, sdrain;
>> +       struct ib_recv_wr rwr = {}, *bad_rwr;
>> +       struct ib_send_wr swr = {}, *bad_swr;
>> +       int ret;
>> +
>> +       rwr.wr_cqe = &rdrain.cqe;
>> +       rdrain.cqe.done = ib_drain_qp_done;
>> +       init_completion(&rdrain.done);
>> +
>> +       swr.wr_cqe = &sdrain.cqe;
>> +       sdrain.cqe.done = ib_drain_qp_done;
>> +       init_completion(&sdrain.done);
>> +
>> +       ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>> +       if (ret) {
>> +               WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>> +               return;
>> +       }
>
> I was thinking if we really need this modify_qp here. generally
> drain_qp is called on
> an error'ed out QP. In a graceful tear down rdma_disconnect takes care
> to modify-qp
> to error. while in error state qp is already in error state.

We could make it conditional, but that would mean passing in a flag, and
I don't see any harm in keeping it as it is.

It would be better to have the modify_qp implementers do
nothing for an ERROR -> ERROR modify...
Devesh Sharma Feb. 8, 2016, 10:37 a.m. UTC | #9
On Sun, Feb 7, 2016 at 5:23 PM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
>
>>> +/*
>>> + * Post a WR and block until its completion is reaped for both the RQ
>>> and SQ.
>>> + */
>>> +static void __ib_drain_qp(struct ib_qp *qp)
>>> +{
>>> +       struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>>> +       struct ib_drain_cqe rdrain, sdrain;
>>> +       struct ib_recv_wr rwr = {}, *bad_rwr;
>>> +       struct ib_send_wr swr = {}, *bad_swr;
>>> +       int ret;
>>> +
>>> +       rwr.wr_cqe = &rdrain.cqe;
>>> +       rdrain.cqe.done = ib_drain_qp_done;
>>> +       init_completion(&rdrain.done);
>>> +
>>> +       swr.wr_cqe = &sdrain.cqe;
>>> +       sdrain.cqe.done = ib_drain_qp_done;
>>> +       init_completion(&sdrain.done);
>>> +
>>> +       ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>>> +       if (ret) {
>>> +               WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>>> +               return;
>>> +       }
>>
>>
>> I was thinking if we really need this modify_qp here. generally
>> drain_qp is called on
>> an error'ed out QP. In a graceful tear down rdma_disconnect takes care
>> to modify-qp
>> to error. while in error state qp is already in error state.
>
>
> We could get it conditional, but I don't see any harm done
> in having it as it would mean a passed in flag.
>

Since the spec allows this, I would say it's okay to let it be there.

> It would be better to have the modify_qp implementers do
> nothing for a ERROR -> ERROR modify...

The driver implementer has a choice here, which is okay.
Steve Wise Feb. 8, 2016, 3:23 p.m. UTC | #10
> >> OK. ib_cqe is what hooks the completion events for these
> >> blank WRs, so those completions are never exposed to the
> >> RDMA consumer.
> >>
> >
> > Right, which means only consumers that use the new style CQ processing can make use of this.
> >
> >> But does a consumer have to bump its SQE and RQE count
> >> when allocating its CQs, or is that done automatically
> >> by ib_alloc_cq() ?
> >>
> >
> > The consumer has to make sure there is room in the SQ, RQ and CQ.
> 
> The documenting comment in front of ib_drain_qp() should
> mention this, and it should also state the requirement to
> use ib_alloc_cq().
>

Will do.
 


Steve Wise Feb. 8, 2016, 3:24 p.m. UTC | #11
> -----Original Message-----
> From: Sagi Grimberg [mailto:sagig@dev.mellanox.co.il]
> Sent: Sunday, February 07, 2016 5:52 AM
> To: Steve Wise; linux-rdma@vger.kernel.org
> Subject: Re: [PATCH 1/3] IB: new common API for draining a queue pair
> 
> 
> >> +	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> >> +	if (ret) {
> >> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> >> +		return;
> >> +	}
> >> +
> >> +	ret = ib_post_recv(qp, &rwr, &bad_rwr);
> >> +	if (ret) {
> >> +		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
> >> +		return;
> >> +	}
> >> +
> >> +	ret = ib_post_send(qp, &swr, &bad_swr);
> >> +	if (ret) {
> >> +		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
> >> +		return;
> >
> > You are returning here despite the fact that recv queue drained
> > successfully and you can wait for completion of rdrain safely.
> > Is it done on purpose?
> 
> Good catch, the completion structures are on-stack.
> 
> Steve, you need to wait for the completion of the
> successful post in this case...

Yup.  Will do.

Steve Wise Feb. 8, 2016, 3:34 p.m. UTC | #12
> >> +/*
> >> + * Post a WR and block until its completion is reaped for both the RQ and SQ.
> >> + */
> >> +static void __ib_drain_qp(struct ib_qp *qp)
> >> +{
> >> +       struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> >> +       struct ib_drain_cqe rdrain, sdrain;
> >> +       struct ib_recv_wr rwr = {}, *bad_rwr;
> >> +       struct ib_send_wr swr = {}, *bad_swr;
> >> +       int ret;
> >> +
> >> +       rwr.wr_cqe = &rdrain.cqe;
> >> +       rdrain.cqe.done = ib_drain_qp_done;
> >> +       init_completion(&rdrain.done);
> >> +
> >> +       swr.wr_cqe = &sdrain.cqe;
> >> +       sdrain.cqe.done = ib_drain_qp_done;
> >> +       init_completion(&sdrain.done);
> >> +
> >> +       ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> >> +       if (ret) {
> >> +               WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> >> +               return;
> >> +       }
> >
> > I was thinking if we really need this modify_qp here. generally
> > drain_qp is called on
> > an error'ed out QP. In a graceful tear down rdma_disconnect takes care
> > to modify-qp
> > to error. while in error state qp is already in error state.
> 
> We could get it conditional, but I don't see any harm done
> in having it as it would mean a passed in flag.
> 
> It would be better to have the modify_qp implementers do
> nothing for a ERROR -> ERROR modify...

The IBTA spec says ERROR->ERROR is allowed, so nobody should be failing that transition. iWARP, however, has to be different. :)  It says any attempt to transition out of ERROR to anything other than IDLE results in an immediate error.  But since iWARP providers will need their own handler anyway, I think we should just leave the modify->ERROR in as-is.  Or perhaps ignore the return code.
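
For illustration only (not part of this patch), skipping the modify when the
QP is already in ERROR might look roughly like this, assuming ib_query_qp()
is acceptable in this path (variable names mirror __ib_drain_qp, with an
added init_attr):

    struct ib_qp_attr attr = {};
    struct ib_qp_init_attr init_attr;

    ret = ib_query_qp(qp, &attr, IB_QP_STATE, &init_attr);
    if (ret) {
        WARN_ONCE(ret, "failed to query QP state: %d\n", ret);
        return;
    }

    if (attr.qp_state != IB_QPS_ERR) {
        attr.qp_state = IB_QPS_ERR;
        ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
        if (ret) {
            WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
            return;
        }
    }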

Steve.



Steve Wise Feb. 8, 2016, 3:53 p.m. UTC | #13
> On Sun, Feb 7, 2016 at 5:23 PM, Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
> >
> >>> +/*
> >>> + * Post a WR and block until its completion is reaped for both the RQ
> >>> and SQ.
> >>> + */
> >>> +static void __ib_drain_qp(struct ib_qp *qp)
> >>> +{
> >>> +       struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> >>> +       struct ib_drain_cqe rdrain, sdrain;
> >>> +       struct ib_recv_wr rwr = {}, *bad_rwr;
> >>> +       struct ib_send_wr swr = {}, *bad_swr;
> >>> +       int ret;
> >>> +
> >>> +       rwr.wr_cqe = &rdrain.cqe;
> >>> +       rdrain.cqe.done = ib_drain_qp_done;
> >>> +       init_completion(&rdrain.done);
> >>> +
> >>> +       swr.wr_cqe = &sdrain.cqe;
> >>> +       sdrain.cqe.done = ib_drain_qp_done;
> >>> +       init_completion(&sdrain.done);
> >>> +
> >>> +       ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> >>> +       if (ret) {
> >>> +               WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> >>> +               return;
> >>> +       }
> >>
> >>
> >> I was thinking if we really need this modify_qp here. generally
> >> drain_qp is called on
> >> an error'ed out QP. In a graceful tear down rdma_disconnect takes care
> >> to modify-qp
> >> to error. while in error state qp is already in error state.
> >
> >
> > We could get it conditional, but I don't see any harm done
> > in having it as it would mean a passed in flag.
> >
> 
> Since Spec allows this, I would say okay let it be there.
> 
> > It would be better to have the modify_qp implementers do
> > nothing for a ERROR -> ERROR modify...
> 
> Driver implementer has a choice here which is okay.

Ok, I'll leave it as-is then.  Thanks for the reviews!




Patch

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 5af6d02..31b82cd 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1657,3 +1657,75 @@  next_page:
 	return i;
 }
 EXPORT_SYMBOL(ib_sg_to_pages);
+
+struct ib_drain_cqe {
+	struct ib_cqe cqe;
+	struct completion done;
+};
+
+static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
+						cqe);
+
+	complete(&cqe->done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for both the RQ and SQ.
+ */
+static void __ib_drain_qp(struct ib_qp *qp)
+{
+	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+	struct ib_drain_cqe rdrain, sdrain;
+	struct ib_recv_wr rwr = {}, *bad_rwr;
+	struct ib_send_wr swr = {}, *bad_swr;
+	int ret;
+
+	rwr.wr_cqe = &rdrain.cqe;
+	rdrain.cqe.done = ib_drain_qp_done;
+	init_completion(&rdrain.done);
+
+	swr.wr_cqe = &sdrain.cqe;
+	sdrain.cqe.done = ib_drain_qp_done;
+	init_completion(&sdrain.done);
+
+	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+	if (ret) {
+		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
+		return;
+	}
+
+	ret = ib_post_recv(qp, &rwr, &bad_rwr);
+	if (ret) {
+		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+		return;
+	}
+
+	ret = ib_post_send(qp, &swr, &bad_swr);
+	if (ret) {
+		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+		return;
+	}
+
+	wait_for_completion(&rdrain.done);
+	wait_for_completion(&sdrain.done);
+}
+
+/**
+ * ib_drain_qp() - Block until all CQEs have been consumed by the
+ *		   application.
+ * @qp:            queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that.  Otherwise call the generic drain function
+ * __ib_drain_qp().
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+	if (qp->device->drain_qp)
+		qp->device->drain_qp(qp);
+	else
+		__ib_drain_qp(qp);
+}
+EXPORT_SYMBOL(ib_drain_qp);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 284b00c..d8533ab 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1846,6 +1846,7 @@  struct ib_device {
 	int			   (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
 						      struct ib_mr_status *mr_status);
 	void			   (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
+	void			   (*drain_qp)(struct ib_qp *qp);
 
 	struct ib_dma_mapping_ops   *dma_ops;
 
@@ -3094,4 +3095,5 @@  int ib_sg_to_pages(struct ib_mr *mr,
 		   int sg_nents,
 		   int (*set_page)(struct ib_mr *, u64));
 
+void ib_drain_qp(struct ib_qp *qp);
 #endif /* IB_VERBS_H */