
[v3,1/9] RDMA/core: Add implicit per-device completion queue pools

Message ID 20171108095742.25365-2-sagi@grimberg.me (mailing list archive)
State Changes Requested
Headers show

Commit Message

Sagi Grimberg Nov. 8, 2017, 9:57 a.m. UTC
Allow a ULP to ask the core to implicitly assign a completion
queue to a queue-pair based on a least-used search of the per-device
CQ pools. The device CQ pools grow lazily with every QP creation.

In addition, expose an affinity hint for queue-pair creation.
If passed, the core will attempt to attach a CQ whose completion
vector is directed to the CPU core given by the affinity hint.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/infiniband/core/core_priv.h |   6 ++
 drivers/infiniband/core/cq.c        | 193 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/device.c    |   4 +
 drivers/infiniband/core/verbs.c     |  69 +++++++++++--
 include/rdma/ib_verbs.h             |  31 ++++--
 5 files changed, 291 insertions(+), 12 deletions(-)
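
For context on how a consumer is expected to use this, below is a minimal,
hypothetical ULP sketch built on the flags and fields added by this patch
(the helper name and queue depths are made up, not taken from any driver):

	#include <rdma/ib_verbs.h>

	/* Hypothetical ULP helper: let the core assign pooled CQs. */
	static struct ib_qp *ulp_create_qp(struct ib_pd *pd, int cpu)
	{
		struct ib_qp_init_attr init_attr = {};

		init_attr.qp_type          = IB_QPT_RC;
		init_attr.cap.max_send_wr  = 128;
		init_attr.cap.max_recv_wr  = 128;
		init_attr.cap.max_send_sge = 1;
		init_attr.cap.max_recv_sge = 1;
		init_attr.poll_ctx         = IB_POLL_SOFTIRQ;

		/* let the core pick (or lazily allocate) CQs from the device pool */
		init_attr.create_flags = IB_QP_CREATE_ASSIGN_CQS;

		/* optionally steer completions toward the core driving this queue */
		if (cpu >= 0) {
			init_attr.create_flags |= IB_QP_CREATE_AFFINITY_HINT;
			init_attr.affinity_hint = cpu;
		}

		return ib_create_qp(pd, &init_attr);
	}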

Comments

Max Gurtovoy Nov. 9, 2017, 10:45 a.m. UTC | #1
On 11/8/2017 11:57 AM, Sagi Grimberg wrote:
> Allow a ULP to ask the core to implicitly assign a completion
> queue to a queue-pair based on a least-used search on a per-device
> cq pools. The device CQ pools grow in a lazy fashion with every
> QP creation.
> 
> In addition, expose an affinity hint for a queue pair creation.
> If passed, the core will attempt to attach a CQ with a completion
> vector that is directed to the cpu core as the affinity hint
> provided.
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>   drivers/infiniband/core/core_priv.h |   6 ++
>   drivers/infiniband/core/cq.c        | 193 ++++++++++++++++++++++++++++++++++++
>   drivers/infiniband/core/device.c    |   4 +
>   drivers/infiniband/core/verbs.c     |  69 +++++++++++--
>   include/rdma/ib_verbs.h             |  31 ++++--
>   5 files changed, 291 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
> index a1d687a664f8..4f6cd4cf5116 100644
> --- a/drivers/infiniband/core/core_priv.h
> +++ b/drivers/infiniband/core/core_priv.h
> @@ -179,6 +179,12 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
>   	return netdev_has_upper_dev_all_rcu(dev, upper);
>   }
>   
> +void ib_init_cq_pools(struct ib_device *dev);
> +void ib_purge_cq_pools(struct ib_device *dev);
> +struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
> +		enum ib_poll_context poll_ctx, int affinity_hint);
> +void ib_put_cq(struct ib_cq *cq, unsigned int nr_cqe);
> +
>   int addr_init(void);
>   void addr_cleanup(void);
>   
> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> index f2ae75fa3128..8b9f9be5386b 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -15,6 +15,9 @@
>   #include <linux/slab.h>
>   #include <rdma/ib_verbs.h>
>   
> +/* XXX: wild guess - should not be too large or too small to avoid wastage */
> +#define IB_CQE_BATCH			1024
> +
>   /* # of WCs to poll for with a single call to ib_poll_cq */
>   #define IB_POLL_BATCH			16
>   
> @@ -149,6 +152,8 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
>   	cq->cq_context = private;
>   	cq->poll_ctx = poll_ctx;
>   	atomic_set(&cq->usecnt, 0);
> +	cq->cqe_used = 0;
> +	cq->comp_vector = comp_vector;
>   
>   	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
>   	if (!cq->wc)
> @@ -194,6 +199,8 @@ void ib_free_cq(struct ib_cq *cq)
>   
>   	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
>   		return;
> +	if (WARN_ON_ONCE(cq->cqe_used != 0))
> +		return;
>   
>   	switch (cq->poll_ctx) {
>   	case IB_POLL_DIRECT:
> @@ -213,3 +220,189 @@ void ib_free_cq(struct ib_cq *cq)
>   	WARN_ON_ONCE(ret);
>   }
>   EXPORT_SYMBOL(ib_free_cq);
> +
> +void ib_init_cq_pools(struct ib_device *dev)
> +{
> +	int i;
> +
> +	spin_lock_init(&dev->cq_lock);
> +	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++)
> +		INIT_LIST_HEAD(&dev->cq_pools[i]);
> +}
> +
> +void ib_purge_cq_pools(struct ib_device *dev)
> +{
> +	struct ib_cq *cq, *n;
> +	LIST_HEAD(tmp_list);
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&dev->cq_lock, flags);
> +		list_splice_init(&dev->cq_pools[i], &tmp_list);
> +		spin_unlock_irqrestore(&dev->cq_lock, flags);
> +	}
> +
> +	list_for_each_entry_safe(cq, n, &tmp_list, pool_entry)
> +		ib_free_cq(cq);
> +}
> +
> +/**
> + * ib_find_vector_affinity() - Find the first completion vector mapped to a given
> + *     cpu core affinity
> + * @device:            rdma device
> + * @cpu:               cpu for the corresponding completion vector affinity
> + * @vector:            output target completion vector
> + *
> + * If the device expose vector affinity we will search each of the vectors
> + * and if we find one that gives us the desired cpu core we return true
> + * and assign @vector to the corresponding completion vector. Otherwise
> + * we return false. We stop at the first appropriate completion vector
> + * we find as we don't have any preference for multiple vectors with the
> + * same affinity.
> + */
> +static bool ib_find_vector_affinity(struct ib_device *device, int cpu,
> +		unsigned int *vector)
> +{
> +	bool found = false;
> +	unsigned int c;
> +	int vec;
> +
> +	if (cpu == -1)
> +		goto out;
> +
> +	for (vec = 0; vec < device->num_comp_vectors; vec++) {
> +		const struct cpumask *mask;
> +
> +		mask = ib_get_vector_affinity(device, vec);
> +		if (!mask)
> +			goto out;
> +
> +		for_each_cpu(c, mask) {
> +			if (c == cpu) {
> +				*vector = vec;
> +				found = true;
> +				goto out;
> +			}
> +		}
> +	}
> +
> +out:
> +	return found;
> +}
> +
> +static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
> +		enum ib_poll_context poll_ctx)
> +{
> +	LIST_HEAD(tmp_list);
> +	struct ib_cq *cq;
> +	unsigned long flags;
> +	int nr_cqs, ret, i;
> +
> +	/*
> +	 * Allocated at least as many CQEs as requested, and otherwise
> +	 * a reasonable batch size so that we can share CQs between
> +	 * multiple users instead of allocating a larger number of CQs.
> +	 */
> +	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));

did you mean min() ?

> +	nr_cqs = min_t(int, dev->num_comp_vectors, num_possible_cpus());
> +	for (i = 0; i < nr_cqs; i++) {
> +		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
> +		if (IS_ERR(cq)) {
> +			ret = PTR_ERR(cq);
> +			pr_err("%s: failed to create CQ ret=%d\n",
> +				__func__, ret);
> +			goto out_free_cqs;
> +		}
> +		list_add_tail(&cq->pool_entry, &tmp_list);
> +	}
> +
> +	spin_lock_irqsave(&dev->cq_lock, flags);
> +	list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
> +	spin_unlock_irqrestore(&dev->cq_lock, flags);
> +
> +	return 0;
> +
> +out_free_cqs:
> +	list_for_each_entry(cq, &tmp_list, pool_entry)
> +		ib_free_cq(cq);
> +	return ret;
> +}
> +
> +/*
> + * ib_find_get_cq() - Find the least used completion queue that matches
> + *     a given affinity hint (or least used for wild card affinity)
> + *     and fits nr_cqe
> + * @dev:              rdma device
> + * @nr_cqe:           number of needed cqe entries
> + * @poll_ctx:         cq polling context
> + * @affinity_hint:    affinity hint (-1) for wild-card assignment
> + *
> + * Finds a cq that satisfies @affinity_hint and @nr_cqe requirements and claim
> + * entries in it for us. In case there is no available cq, allocate a new cq
> + * with the requirements and add it to the device pool.
> + */
> +struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
> +		enum ib_poll_context poll_ctx, int affinity_hint)
> +{
> +	struct ib_cq *cq, *found;
> +	unsigned long flags;
> +	int vector, ret;
> +
> +	if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
> +		/*
> +		 * Couldn't find matching vector affinity so project
> +		 * the affinty to the device completion vector range
> +		 */
> +		vector = affinity_hint % dev->num_comp_vectors;
> +	}
> +
> +restart:
> +	/*
> +	 * Find the least used CQ with correct affinity and
> +	 * enough free cq entries
> +	 */
> +	found = NULL;
> +	spin_lock_irqsave(&dev->cq_lock, flags);
> +	list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) {
> +		if (vector != -1 && vector != cq->comp_vector)

how vector can be -1 ?

> +			continue;
> +		if (cq->cqe_used + nr_cqe > cq->cqe)
> +			continue;
> +		if (found && cq->cqe_used >= found->cqe_used)
> +			continue;
> +		found = cq;
> +	}
> +
> +	if (found) {
> +		found->cqe_used += nr_cqe;
> +		spin_unlock_irqrestore(&dev->cq_lock, flags);
> +		return found;
> +	}
> +	spin_unlock_irqrestore(&dev->cq_lock, flags);
> +
> +	/*
> +	 * Didn't find a match or ran out of CQs,
> +	 * device pool, allocate a new array of CQs.
> +	 */
> +	ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	/* Now search again */
> +	goto restart;
> +}
> +
> +void ib_put_cq(struct ib_cq *cq, unsigned int nr_cqe)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cq->device->cq_lock, flags);
> +	cq->cqe_used -= nr_cqe;
> +	WARN_ON_ONCE(cq->cqe_used < 0);
> +	spin_unlock_irqrestore(&cq->device->cq_lock, flags);
> +}
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 84fc32a2c8b3..c828845c46d8 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -468,6 +468,8 @@ int ib_register_device(struct ib_device *device,
>   		device->dma_device = parent;
>   	}
>   
> +	ib_init_cq_pools(device);
> +
>   	mutex_lock(&device_mutex);
>   
>   	if (strchr(device->name, '%')) {
> @@ -590,6 +592,8 @@ void ib_unregister_device(struct ib_device *device)
>   	up_write(&lists_rwsem);
>   
>   	device->reg_state = IB_DEV_UNREGISTERED;
> +
> +	ib_purge_cq_pools(device);
>   }
>   EXPORT_SYMBOL(ib_unregister_device);
>   
> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index de57d6c11a25..fcc9ecba6741 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -793,14 +793,16 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>   			   struct ib_qp_init_attr *qp_init_attr)
>   {
>   	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
> +	struct ib_cq *cq = NULL;
>   	struct ib_qp *qp;
> -	int ret;
> +	u32 nr_cqes = 0;
> +	int ret = -EINVAL;
>   
>   	if (qp_init_attr->rwq_ind_tbl &&
>   	    (qp_init_attr->recv_cq ||
>   	    qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
>   	    qp_init_attr->cap.max_recv_sge))
> -		return ERR_PTR(-EINVAL);
> +		goto out;
>   
>   	/*
>   	 * If the callers is using the RDMA API calculate the resources
> @@ -811,9 +813,51 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>   	if (qp_init_attr->cap.max_rdma_ctxs)
>   		rdma_rw_init_qp(device, qp_init_attr);
>   
> +	if (qp_init_attr->create_flags & IB_QP_CREATE_ASSIGN_CQS) {
> +		int affinity = -1;
> +
> +		if (WARN_ON(qp_init_attr->recv_cq))
> +			goto out;
> +		if (WARN_ON(qp_init_attr->send_cq))
> +			goto out;
> +
> +		if (qp_init_attr->create_flags & IB_QP_CREATE_AFFINITY_HINT)
> +			affinity = qp_init_attr->affinity_hint;
> +
> +		nr_cqes = qp_init_attr->cap.max_recv_wr +
> +			  qp_init_attr->cap.max_send_wr;
> +		if (nr_cqes) {

what will happen if nr_cqes == 0 in that case ?

> +			cq = ib_find_get_cq(device, nr_cqes,
> +					    qp_init_attr->poll_ctx, affinity);
> +			if (IS_ERR(cq)) {
> +				ret = PTR_ERR(cq);
> +				goto out;
> +			}
> +
> +			if (qp_init_attr->cap.max_send_wr)
> +				qp_init_attr->send_cq = cq;
> +
> +			if (qp_init_attr->cap.max_recv_wr) {
> +				qp_init_attr->recv_cq = cq;
> +
> +				/*
> +				 * Low-level drivers expect max_recv_wr == 0
> +				 * for the SRQ case:
> +				 */
> +				if (qp_init_attr->srq)
> +					qp_init_attr->cap.max_recv_wr = 0;
> +			}
> +		}
> +
> +		qp_init_attr->create_flags &=
> +			~(IB_QP_CREATE_ASSIGN_CQS | IB_QP_CREATE_AFFINITY_HINT);
> +	}
> +
>   	qp = device->create_qp(pd, qp_init_attr, NULL);
> -	if (IS_ERR(qp))
> -		return qp;
> +	if (IS_ERR(qp)) {
> +		ret = PTR_ERR(qp);
> +		goto out_put_cq;
> +	}
>   
>   	ret = ib_create_qp_security(qp, device);
>   	if (ret) {
> @@ -826,6 +870,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>   	qp->uobject    = NULL;
>   	qp->qp_type    = qp_init_attr->qp_type;
>   	qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
> +	qp->nr_cqes    = nr_cqes;
>   
>   	atomic_set(&qp->usecnt, 0);
>   	qp->mrs_used = 0;
> @@ -865,8 +910,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>   		ret = rdma_rw_init_mrs(qp, qp_init_attr);
>   		if (ret) {
>   			pr_err("failed to init MR pool ret= %d\n", ret);
> -			ib_destroy_qp(qp);
> -			return ERR_PTR(ret);
> +			goto out_destroy_qp;
>   		}
>   	}
>   
> @@ -880,6 +924,14 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>   				 device->attrs.max_sge_rd);
>   
>   	return qp;
> +
> +out_destroy_qp:
> +	ib_destroy_qp(qp);
> +out_put_cq:
> +	if (cq)
> +		ib_put_cq(cq, nr_cqes);
> +out:
> +	return ERR_PTR(ret);
>   }
>   EXPORT_SYMBOL(ib_create_qp);
>   
> @@ -1478,6 +1530,11 @@ int ib_destroy_qp(struct ib_qp *qp)
>   			atomic_dec(&ind_tbl->usecnt);
>   		if (sec)
>   			ib_destroy_qp_security_end(sec);
> +
> +		if (qp->nr_cqes) {
> +			WARN_ON_ONCE(rcq && rcq != scq);
> +			ib_put_cq(scq, qp->nr_cqes);
> +		}
>   	} else {
>   		if (sec)
>   			ib_destroy_qp_security_abort(sec);
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index bdb1279a415b..56d42e753eb4 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1098,11 +1098,22 @@ enum ib_qp_create_flags {
>   	IB_QP_CREATE_SCATTER_FCS		= 1 << 8,
>   	IB_QP_CREATE_CVLAN_STRIPPING		= 1 << 9,
>   	IB_QP_CREATE_SOURCE_QPN			= 1 << 10,
> +
> +	/* only used by the core, not passed to low-level drivers */
> +	IB_QP_CREATE_ASSIGN_CQS			= 1 << 24,
> +	IB_QP_CREATE_AFFINITY_HINT		= 1 << 25,
> +
>   	/* reserve bits 26-31 for low level drivers' internal use */
>   	IB_QP_CREATE_RESERVED_START		= 1 << 26,
>   	IB_QP_CREATE_RESERVED_END		= 1 << 31,
>   };
>   
> +enum ib_poll_context {
> +	IB_POLL_SOFTIRQ,	/* poll from softirq context */
> +	IB_POLL_WORKQUEUE,	/* poll from workqueue */
> +	IB_POLL_DIRECT,		/* caller context, no hw completions */
> +};
> +
>   /*
>    * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
>    * callback to destroy the passed in QP.
> @@ -1124,6 +1135,13 @@ struct ib_qp_init_attr {
>   	 * Only needed for special QP types, or when using the RW API.
>   	 */
>   	u8			port_num;
> +
> +	/*
> +	 * Only needed when not passing in explicit CQs.
> +	 */
> +	enum ib_poll_context	poll_ctx;
> +	int			affinity_hint;
> +
>   	struct ib_rwq_ind_table *rwq_ind_tbl;
>   	u32			source_qpn;
>   };
> @@ -1536,12 +1554,6 @@ struct ib_ah {
>   
>   typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
>   
> -enum ib_poll_context {
> -	IB_POLL_DIRECT,		/* caller context, no hw completions */
> -	IB_POLL_SOFTIRQ,	/* poll from softirq context */
> -	IB_POLL_WORKQUEUE,	/* poll from workqueue */
> -};
> -
>   struct ib_cq {
>   	struct ib_device       *device;
>   	struct ib_uobject      *uobject;
> @@ -1549,9 +1561,12 @@ struct ib_cq {
>   	void                  (*event_handler)(struct ib_event *, void *);
>   	void                   *cq_context;
>   	int               	cqe;
> +	unsigned int		cqe_used;
>   	atomic_t          	usecnt; /* count number of work queues */
>   	enum ib_poll_context	poll_ctx;
> +	int			comp_vector;
>   	struct ib_wc		*wc;
> +	struct list_head	pool_entry;
>   	union {
>   		struct irq_poll		iop;
>   		struct work_struct	work;
> @@ -1731,6 +1746,7 @@ struct ib_qp {
>   	struct ib_rwq_ind_table *rwq_ind_tbl;
>   	struct ib_qp_security  *qp_sec;
>   	u8			port;
> +	u32			nr_cqes;
>   };
>   
>   struct ib_mr {
> @@ -2338,6 +2354,9 @@ struct ib_device {
>   
>   	u32                          index;
>   
> +	spinlock_t		     cq_lock;

maybe can be called cq_pools_lock (cq_lock is general) ?

> +	struct list_head	     cq_pools[IB_POLL_WORKQUEUE + 1];

maybe it's better to add and use IB_POLL_LAST ?

> +
>   	/**
>   	 * The following mandatory functions are used only at device
>   	 * registration.  Keep functions such as these at the end of this
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg Nov. 9, 2017, 5:31 p.m. UTC | #2
>> +static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
>> +        enum ib_poll_context poll_ctx)
>> +{
>> +    LIST_HEAD(tmp_list);
>> +    struct ib_cq *cq;
>> +    unsigned long flags;
>> +    int nr_cqs, ret, i;
>> +
>> +    /*
>> +     * Allocated at least as many CQEs as requested, and otherwise
>> +     * a reasonable batch size so that we can share CQs between
>> +     * multiple users instead of allocating a larger number of CQs.
>> +     */
>> +    nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));
> 
> did you mean min() ?

No, I meant max. If we choose the CQ size, we choose the min between the
default and the device capability, if the user chooses, we rely that it
asked for no more than the device capability (and if not, allocation
will fail, as it should).

>> +restart:
>> +    /*
>> +     * Find the least used CQ with correct affinity and
>> +     * enough free cq entries
>> +     */
>> +    found = NULL;
>> +    spin_lock_irqsave(&dev->cq_lock, flags);
>> +    list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) {
>> +        if (vector != -1 && vector != cq->comp_vector)
> 
> how vector can be -1 ?

-1 is a wild-card affinity hint value that chooses the least used cq
(see ib_create_qp).

>> @@ -811,9 +813,51 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>>       if (qp_init_attr->cap.max_rdma_ctxs)
>>           rdma_rw_init_qp(device, qp_init_attr);
>> +    if (qp_init_attr->create_flags & IB_QP_CREATE_ASSIGN_CQS) {
>> +        int affinity = -1;
>> +
>> +        if (WARN_ON(qp_init_attr->recv_cq))
>> +            goto out;
>> +        if (WARN_ON(qp_init_attr->send_cq))
>> +            goto out;
>> +
>> +        if (qp_init_attr->create_flags & IB_QP_CREATE_AFFINITY_HINT)
>> +            affinity = qp_init_attr->affinity_hint;
>> +
>> +        nr_cqes = qp_init_attr->cap.max_recv_wr +
>> +              qp_init_attr->cap.max_send_wr;
>> +        if (nr_cqes) {
> 
> what will happen if nr_cqes == 0 in that case ?

The same thing that would happen without this code I think. This is
creating a qp without ability to post send and/or receive work requests.

>> @@ -2338,6 +2354,9 @@ struct ib_device {
>>       u32                          index;
>> +    spinlock_t             cq_lock;
> 
> maybe can be called cq_pools_lock (cq_lock is general) ?

I can change that.

>> +    struct list_head         cq_pools[IB_POLL_WORKQUEUE + 1];
> 
> maybe it's better to add and use IB_POLL_LAST ?

Yea, I can change that.
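
A sketch of what that might look like (not part of the posted patch; whether
IB_POLL_DIRECT keeps an unused pool slot, or a separate "last pool type"
constant is introduced instead, is still an open choice):

	enum ib_poll_context {
		IB_POLL_SOFTIRQ,	/* poll from softirq context */
		IB_POLL_WORKQUEUE,	/* poll from workqueue */
		IB_POLL_DIRECT,		/* caller context, no hw completions */
		IB_POLL_LAST,		/* number of contexts, keep last */
	};

	/* in struct ib_device; with this layout ib_find_get_cq() would need an
	 * explicit IB_POLL_DIRECT check instead of the current array-size test */
	struct list_head	cq_pools[IB_POLL_LAST];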
Bart Van Assche Nov. 9, 2017, 5:33 p.m. UTC | #3
On Thu, 2017-11-09 at 19:31 +0200, Sagi Grimberg wrote:
> > > +static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
> > > +        enum ib_poll_context poll_ctx)
> > > +{
> > > +    LIST_HEAD(tmp_list);
> > > +    struct ib_cq *cq;
> > > +    unsigned long flags;
> > > +    int nr_cqs, ret, i;
> > > +
> > > +    /*
> > > +     * Allocated at least as many CQEs as requested, and otherwise
> > > +     * a reasonable batch size so that we can share CQs between
> > > +     * multiple users instead of allocating a larger number of CQs.
> > > +     */
> > > +    nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));
> > 
> > did you mean min() ?
> 
> No, I meant max. If we choose the CQ size, we choose the min between the
> default and the device capability, if the user chooses, we rely that it
> asked for no more than the device capability (and if not, allocation
> will fail, as it should).

Hello Sagi,

How about the following:

	min(dev->attrs.max_cqe, max(nr_cqes, IB_CQE_BATCH))

Bart.
Sagi Grimberg Nov. 13, 2017, 8:28 p.m. UTC | #4
> Hello Sagi,
> 
> How about the following:
> 
> 	min(dev->attrs.max_cqe, max(nr_cqes, IB_CQE_BATCH))

That would work too, thanks Bart!
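
For completeness, the two expressions differ only in how an oversized request
is handled (sketch using the names from the patch):

	/* posted patch: honour a request larger than IB_CQE_BATCH and let
	 * ib_alloc_cq() fail if it also exceeds the device capability */
	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));

	/* Bart's suggestion: round small requests up to the batch size, but
	 * always clamp the result to what the device can actually allocate */
	nr_cqes = min(dev->attrs.max_cqe, max(nr_cqes, IB_CQE_BATCH));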
Bart Van Assche Nov. 14, 2017, 4:28 p.m. UTC | #5
On Wed, 2017-11-08 at 11:57 +0200, Sagi Grimberg wrote:
> +struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
> +               enum ib_poll_context poll_ctx, int affinity_hint)
> +{
> +       struct ib_cq *cq, *found;
> +       unsigned long flags;
> +       int vector, ret;
> +
> +       if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
> +               return ERR_PTR(-EINVAL);
> +
> +       if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
> +               /*
> +                * Couldn't find matching vector affinity so project
> +                * the affinty to the device completion vector range
> +                */
> +               vector = affinity_hint % dev->num_comp_vectors;
> +       }

So depending on whether or not the HCA driver implements .get_vector_affinity()
either pci_irq_get_affinity() is used or "vector = affinity_hint %
dev->num_comp_vectors"? Sorry but I think that kind of differences makes it
unnecessarily hard for ULP maintainers to provide predictable performance and
consistent behavior across HCAs.

Bart.
Sagi Grimberg Nov. 20, 2017, 12:31 p.m. UTC | #6
>> +struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
>> +               enum ib_poll_context poll_ctx, int affinity_hint)
>> +{
>> +       struct ib_cq *cq, *found;
>> +       unsigned long flags;
>> +       int vector, ret;
>> +
>> +       if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
>> +               return ERR_PTR(-EINVAL);
>> +
>> +       if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
>> +               /*
>> +                * Couldn't find matching vector affinity so project
>> +                * the affinty to the device completion vector range
>> +                */
>> +               vector = affinity_hint % dev->num_comp_vectors;
>> +       }
> 
> So depending on whether or not the HCA driver implements .get_vector_affinity()
> either pci_irq_get_affinity() is used or "vector = affinity_hint %
> dev->num_comp_vectors"? Sorry but I think that kind of differences makes it
> unnecessarily hard for ULP maintainers to provide predictable performance and
> consistent behavior across HCAs.

Well, as a ULP maintainer I think that in the lack of
.get_vector_affinity() I would do that same thing as this code. srp
itself is doing the same thing in srp_create_target()
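
The fallback being discussed amounts to the same modulo projection ULPs do
today when spreading queues over completion vectors; a rough sketch of the
idea (not a quote of srp_create_target(), the helper name is made up):

	/*
	 * Fold a CPU hint into the device's completion vector range when the
	 * driver cannot report per-vector affinity.  A negative hint is kept
	 * as-is and acts as the "any vector" wild card in ib_find_get_cq().
	 */
	static int project_comp_vector(struct ib_device *dev, int affinity_hint)
	{
		if (affinity_hint < 0)
			return -1;	/* wild card: least-used CQ on any vector */

		return affinity_hint % dev->num_comp_vectors;
	}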
Jason Gunthorpe Dec. 11, 2017, 11:50 p.m. UTC | #7
On Wed, Nov 08, 2017 at 11:57:34AM +0200, Sagi Grimberg wrote:
> Allow a ULP to ask the core to implicitly assign a completion
> queue to a queue-pair based on a least-used search on a per-device
> cq pools. The device CQ pools grow in a lazy fashion with every
> QP creation.
> 
> In addition, expose an affinity hint for a queue pair creation.
> If passed, the core will attempt to attach a CQ with a completion
> vector that is directed to the cpu core as the affinity hint
> provided.
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Sagi, Bart,

Did we reach a conclusion on this? Is v3 the series to take, and
should it all go through the RDMA tree? It looks like there are some
missing acks for that??

I think there was also an unapplied comment from bart in the
patchworks notes...

Could you please add some commit messages when you resend it? Not sure
I should be accepting such large commits with empty messages???

Thanks,
Jason
Jason Gunthorpe Jan. 3, 2018, 5:47 p.m. UTC | #8
On Mon, Dec 11, 2017 at 04:50:44PM -0700, Jason Gunthorpe wrote:
> On Wed, Nov 08, 2017 at 11:57:34AM +0200, Sagi Grimberg wrote:
> > Allow a ULP to ask the core to implicitly assign a completion
> > queue to a queue-pair based on a least-used search on a per-device
> > cq pools. The device CQ pools grow in a lazy fashion with every
> > QP creation.
> > 
> > In addition, expose an affinity hint for a queue pair creation.
> > If passed, the core will attempt to attach a CQ with a completion
> > vector that is directed to the cpu core as the affinity hint
> > provided.
> > 
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> 
> Sagi, Bart,
> 
> Did we reach a conclusion on this? Is v3 the series to take, and
> should it all go through the RDMA tree? It looks like there are some
> missing acks for that??
> 
> I think there was also an unapplied comment from bart in the
> patchworks notes...
> 
> Could you please add some commit messages when you resend it? Not sure
> I should be accepting such large commits with empty messages???

Hearing nothing, I've dropped this series off patchworks. Please
resend it when you are ready.

Jason

Patch

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a1d687a664f8..4f6cd4cf5116 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -179,6 +179,12 @@  static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
 	return netdev_has_upper_dev_all_rcu(dev, upper);
 }
 
+void ib_init_cq_pools(struct ib_device *dev);
+void ib_purge_cq_pools(struct ib_device *dev);
+struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
+		enum ib_poll_context poll_ctx, int affinity_hint);
+void ib_put_cq(struct ib_cq *cq, unsigned int nr_cqe);
+
 int addr_init(void);
 void addr_cleanup(void);
 
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index f2ae75fa3128..8b9f9be5386b 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -15,6 +15,9 @@ 
 #include <linux/slab.h>
 #include <rdma/ib_verbs.h>
 
+/* XXX: wild guess - should not be too large or too small to avoid wastage */
+#define IB_CQE_BATCH			1024
+
 /* # of WCs to poll for with a single call to ib_poll_cq */
 #define IB_POLL_BATCH			16
 
@@ -149,6 +152,8 @@  struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 	cq->cq_context = private;
 	cq->poll_ctx = poll_ctx;
 	atomic_set(&cq->usecnt, 0);
+	cq->cqe_used = 0;
+	cq->comp_vector = comp_vector;
 
 	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
 	if (!cq->wc)
@@ -194,6 +199,8 @@  void ib_free_cq(struct ib_cq *cq)
 
 	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
 		return;
+	if (WARN_ON_ONCE(cq->cqe_used != 0))
+		return;
 
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
@@ -213,3 +220,189 @@  void ib_free_cq(struct ib_cq *cq)
 	WARN_ON_ONCE(ret);
 }
 EXPORT_SYMBOL(ib_free_cq);
+
+void ib_init_cq_pools(struct ib_device *dev)
+{
+	int i;
+
+	spin_lock_init(&dev->cq_lock);
+	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++)
+		INIT_LIST_HEAD(&dev->cq_pools[i]);
+}
+
+void ib_purge_cq_pools(struct ib_device *dev)
+{
+	struct ib_cq *cq, *n;
+	LIST_HEAD(tmp_list);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dev->cq_lock, flags);
+		list_splice_init(&dev->cq_pools[i], &tmp_list);
+		spin_unlock_irqrestore(&dev->cq_lock, flags);
+	}
+
+	list_for_each_entry_safe(cq, n, &tmp_list, pool_entry)
+		ib_free_cq(cq);
+}
+
+/**
+ * ib_find_vector_affinity() - Find the first completion vector mapped to a given
+ *     cpu core affinity
+ * @device:            rdma device
+ * @cpu:               cpu for the corresponding completion vector affinity
+ * @vector:            output target completion vector
+ *
+ * If the device expose vector affinity we will search each of the vectors
+ * and if we find one that gives us the desired cpu core we return true
+ * and assign @vector to the corresponding completion vector. Otherwise
+ * we return false. We stop at the first appropriate completion vector
+ * we find as we don't have any preference for multiple vectors with the
+ * same affinity.
+ */
+static bool ib_find_vector_affinity(struct ib_device *device, int cpu,
+		unsigned int *vector)
+{
+	bool found = false;
+	unsigned int c;
+	int vec;
+
+	if (cpu == -1)
+		goto out;
+
+	for (vec = 0; vec < device->num_comp_vectors; vec++) {
+		const struct cpumask *mask;
+
+		mask = ib_get_vector_affinity(device, vec);
+		if (!mask)
+			goto out;
+
+		for_each_cpu(c, mask) {
+			if (c == cpu) {
+				*vector = vec;
+				found = true;
+				goto out;
+			}
+		}
+	}
+
+out:
+	return found;
+}
+
+static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
+		enum ib_poll_context poll_ctx)
+{
+	LIST_HEAD(tmp_list);
+	struct ib_cq *cq;
+	unsigned long flags;
+	int nr_cqs, ret, i;
+
+	/*
+	 * Allocated at least as many CQEs as requested, and otherwise
+	 * a reasonable batch size so that we can share CQs between
+	 * multiple users instead of allocating a larger number of CQs.
+	 */
+	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));
+	nr_cqs = min_t(int, dev->num_comp_vectors, num_possible_cpus());
+	for (i = 0; i < nr_cqs; i++) {
+		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
+		if (IS_ERR(cq)) {
+			ret = PTR_ERR(cq);
+			pr_err("%s: failed to create CQ ret=%d\n",
+				__func__, ret);
+			goto out_free_cqs;
+		}
+		list_add_tail(&cq->pool_entry, &tmp_list);
+	}
+
+	spin_lock_irqsave(&dev->cq_lock, flags);
+	list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
+	spin_unlock_irqrestore(&dev->cq_lock, flags);
+
+	return 0;
+
+out_free_cqs:
+	list_for_each_entry(cq, &tmp_list, pool_entry)
+		ib_free_cq(cq);
+	return ret;
+}
+
+/*
+ * ib_find_get_cq() - Find the least used completion queue that matches
+ *     a given affinity hint (or least used for wild card affinity)
+ *     and fits nr_cqe
+ * @dev:              rdma device
+ * @nr_cqe:           number of needed cqe entries
+ * @poll_ctx:         cq polling context
+ * @affinity_hint:    affinity hint (-1) for wild-card assignment
+ *
+ * Finds a cq that satisfies @affinity_hint and @nr_cqe requirements and claim
+ * entries in it for us. In case there is no available cq, allocate a new cq
+ * with the requirements and add it to the device pool.
+ */
+struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
+		enum ib_poll_context poll_ctx, int affinity_hint)
+{
+	struct ib_cq *cq, *found;
+	unsigned long flags;
+	int vector, ret;
+
+	if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
+		return ERR_PTR(-EINVAL);
+
+	if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
+		/*
+		 * Couldn't find matching vector affinity so project
+		 * the affinty to the device completion vector range
+		 */
+		vector = affinity_hint % dev->num_comp_vectors;
+	}
+
+restart:
+	/*
+	 * Find the least used CQ with correct affinity and
+	 * enough free cq entries
+	 */
+	found = NULL;
+	spin_lock_irqsave(&dev->cq_lock, flags);
+	list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) {
+		if (vector != -1 && vector != cq->comp_vector)
+			continue;
+		if (cq->cqe_used + nr_cqe > cq->cqe)
+			continue;
+		if (found && cq->cqe_used >= found->cqe_used)
+			continue;
+		found = cq;
+	}
+
+	if (found) {
+		found->cqe_used += nr_cqe;
+		spin_unlock_irqrestore(&dev->cq_lock, flags);
+		return found;
+	}
+	spin_unlock_irqrestore(&dev->cq_lock, flags);
+
+	/*
+	 * Didn't find a match or ran out of CQs,
+	 * device pool, allocate a new array of CQs.
+	 */
+	ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Now search again */
+	goto restart;
+}
+
+void ib_put_cq(struct ib_cq *cq, unsigned int nr_cqe)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->device->cq_lock, flags);
+	cq->cqe_used -= nr_cqe;
+	WARN_ON_ONCE(cq->cqe_used < 0);
+	spin_unlock_irqrestore(&cq->device->cq_lock, flags);
+}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 84fc32a2c8b3..c828845c46d8 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -468,6 +468,8 @@  int ib_register_device(struct ib_device *device,
 		device->dma_device = parent;
 	}
 
+	ib_init_cq_pools(device);
+
 	mutex_lock(&device_mutex);
 
 	if (strchr(device->name, '%')) {
@@ -590,6 +592,8 @@  void ib_unregister_device(struct ib_device *device)
 	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
+
+	ib_purge_cq_pools(device);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index de57d6c11a25..fcc9ecba6741 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -793,14 +793,16 @@  struct ib_qp *ib_create_qp(struct ib_pd *pd,
 			   struct ib_qp_init_attr *qp_init_attr)
 {
 	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+	struct ib_cq *cq = NULL;
 	struct ib_qp *qp;
-	int ret;
+	u32 nr_cqes = 0;
+	int ret = -EINVAL;
 
 	if (qp_init_attr->rwq_ind_tbl &&
 	    (qp_init_attr->recv_cq ||
 	    qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
 	    qp_init_attr->cap.max_recv_sge))
-		return ERR_PTR(-EINVAL);
+		goto out;
 
 	/*
 	 * If the callers is using the RDMA API calculate the resources
@@ -811,9 +813,51 @@  struct ib_qp *ib_create_qp(struct ib_pd *pd,
 	if (qp_init_attr->cap.max_rdma_ctxs)
 		rdma_rw_init_qp(device, qp_init_attr);
 
+	if (qp_init_attr->create_flags & IB_QP_CREATE_ASSIGN_CQS) {
+		int affinity = -1;
+
+		if (WARN_ON(qp_init_attr->recv_cq))
+			goto out;
+		if (WARN_ON(qp_init_attr->send_cq))
+			goto out;
+
+		if (qp_init_attr->create_flags & IB_QP_CREATE_AFFINITY_HINT)
+			affinity = qp_init_attr->affinity_hint;
+
+		nr_cqes = qp_init_attr->cap.max_recv_wr +
+			  qp_init_attr->cap.max_send_wr;
+		if (nr_cqes) {
+			cq = ib_find_get_cq(device, nr_cqes,
+					    qp_init_attr->poll_ctx, affinity);
+			if (IS_ERR(cq)) {
+				ret = PTR_ERR(cq);
+				goto out;
+			}
+
+			if (qp_init_attr->cap.max_send_wr)
+				qp_init_attr->send_cq = cq;
+
+			if (qp_init_attr->cap.max_recv_wr) {
+				qp_init_attr->recv_cq = cq;
+
+				/*
+				 * Low-level drivers expect max_recv_wr == 0
+				 * for the SRQ case:
+				 */
+				if (qp_init_attr->srq)
+					qp_init_attr->cap.max_recv_wr = 0;
+			}
+		}
+
+		qp_init_attr->create_flags &=
+			~(IB_QP_CREATE_ASSIGN_CQS | IB_QP_CREATE_AFFINITY_HINT);
+	}
+
 	qp = device->create_qp(pd, qp_init_attr, NULL);
-	if (IS_ERR(qp))
-		return qp;
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto out_put_cq;
+	}
 
 	ret = ib_create_qp_security(qp, device);
 	if (ret) {
@@ -826,6 +870,7 @@  struct ib_qp *ib_create_qp(struct ib_pd *pd,
 	qp->uobject    = NULL;
 	qp->qp_type    = qp_init_attr->qp_type;
 	qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
+	qp->nr_cqes    = nr_cqes;
 
 	atomic_set(&qp->usecnt, 0);
 	qp->mrs_used = 0;
@@ -865,8 +910,7 @@  struct ib_qp *ib_create_qp(struct ib_pd *pd,
 		ret = rdma_rw_init_mrs(qp, qp_init_attr);
 		if (ret) {
 			pr_err("failed to init MR pool ret= %d\n", ret);
-			ib_destroy_qp(qp);
-			return ERR_PTR(ret);
+			goto out_destroy_qp;
 		}
 	}
 
@@ -880,6 +924,14 @@  struct ib_qp *ib_create_qp(struct ib_pd *pd,
 				 device->attrs.max_sge_rd);
 
 	return qp;
+
+out_destroy_qp:
+	ib_destroy_qp(qp);
+out_put_cq:
+	if (cq)
+		ib_put_cq(cq, nr_cqes);
+out:
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(ib_create_qp);
 
@@ -1478,6 +1530,11 @@  int ib_destroy_qp(struct ib_qp *qp)
 			atomic_dec(&ind_tbl->usecnt);
 		if (sec)
 			ib_destroy_qp_security_end(sec);
+
+		if (qp->nr_cqes) {
+			WARN_ON_ONCE(rcq && rcq != scq);
+			ib_put_cq(scq, qp->nr_cqes);
+		}
 	} else {
 		if (sec)
 			ib_destroy_qp_security_abort(sec);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index bdb1279a415b..56d42e753eb4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1098,11 +1098,22 @@  enum ib_qp_create_flags {
 	IB_QP_CREATE_SCATTER_FCS		= 1 << 8,
 	IB_QP_CREATE_CVLAN_STRIPPING		= 1 << 9,
 	IB_QP_CREATE_SOURCE_QPN			= 1 << 10,
+
+	/* only used by the core, not passed to low-level drivers */
+	IB_QP_CREATE_ASSIGN_CQS			= 1 << 24,
+	IB_QP_CREATE_AFFINITY_HINT		= 1 << 25,
+
 	/* reserve bits 26-31 for low level drivers' internal use */
 	IB_QP_CREATE_RESERVED_START		= 1 << 26,
 	IB_QP_CREATE_RESERVED_END		= 1 << 31,
 };
 
+enum ib_poll_context {
+	IB_POLL_SOFTIRQ,	/* poll from softirq context */
+	IB_POLL_WORKQUEUE,	/* poll from workqueue */
+	IB_POLL_DIRECT,		/* caller context, no hw completions */
+};
+
 /*
  * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
  * callback to destroy the passed in QP.
@@ -1124,6 +1135,13 @@  struct ib_qp_init_attr {
 	 * Only needed for special QP types, or when using the RW API.
 	 */
 	u8			port_num;
+
+	/*
+	 * Only needed when not passing in explicit CQs.
+	 */
+	enum ib_poll_context	poll_ctx;
+	int			affinity_hint;
+
 	struct ib_rwq_ind_table *rwq_ind_tbl;
 	u32			source_qpn;
 };
@@ -1536,12 +1554,6 @@  struct ib_ah {
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
-enum ib_poll_context {
-	IB_POLL_DIRECT,		/* caller context, no hw completions */
-	IB_POLL_SOFTIRQ,	/* poll from softirq context */
-	IB_POLL_WORKQUEUE,	/* poll from workqueue */
-};
-
 struct ib_cq {
 	struct ib_device       *device;
 	struct ib_uobject      *uobject;
@@ -1549,9 +1561,12 @@  struct ib_cq {
 	void                  (*event_handler)(struct ib_event *, void *);
 	void                   *cq_context;
 	int               	cqe;
+	unsigned int		cqe_used;
 	atomic_t          	usecnt; /* count number of work queues */
 	enum ib_poll_context	poll_ctx;
+	int			comp_vector;
 	struct ib_wc		*wc;
+	struct list_head	pool_entry;
 	union {
 		struct irq_poll		iop;
 		struct work_struct	work;
@@ -1731,6 +1746,7 @@  struct ib_qp {
 	struct ib_rwq_ind_table *rwq_ind_tbl;
 	struct ib_qp_security  *qp_sec;
 	u8			port;
+	u32			nr_cqes;
 };
 
 struct ib_mr {
@@ -2338,6 +2354,9 @@  struct ib_device {
 
 	u32                          index;
 
+	spinlock_t		     cq_lock;
+	struct list_head	     cq_pools[IB_POLL_WORKQUEUE + 1];
+
 	/**
 	 * The following mandatory functions are used only at device
 	 * registration.  Keep functions such as these at the end of this