Message ID | 20171108095742.25365-2-sagi@grimberg.me (mailing list archive)
State      | Changes Requested
On 11/8/2017 11:57 AM, Sagi Grimberg wrote:
> Allow a ULP to ask the core to implicitly assign a completion
> queue to a queue-pair based on a least-used search on per-device
> cq pools. The device CQ pools grow in a lazy fashion with every
> QP creation.
>
> In addition, expose an affinity hint for a queue pair creation.
> If passed, the core will attempt to attach a CQ with a completion
> vector that is directed to the cpu core as the affinity hint
> provided.
>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

[...]

> +static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
> +		enum ib_poll_context poll_ctx)
> +{
[...]
> +	/*
> +	 * Allocated at least as many CQEs as requested, and otherwise
> +	 * a reasonable batch size so that we can share CQs between
> +	 * multiple users instead of allocating a larger number of CQs.
> +	 */
> +	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));

did you mean min()?

[...]

> +restart:
> +	/*
> +	 * Find the least used CQ with correct affinity and
> +	 * enough free cq entries
> +	 */
> +	found = NULL;
> +	spin_lock_irqsave(&dev->cq_lock, flags);
> +	list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) {
> +		if (vector != -1 && vector != cq->comp_vector)

how can vector be -1?

[...]

> +		nr_cqes = qp_init_attr->cap.max_recv_wr +
> +			  qp_init_attr->cap.max_send_wr;
> +		if (nr_cqes) {

what will happen if nr_cqes == 0 in that case?

[...]

> +	spinlock_t cq_lock;

maybe it can be called cq_pools_lock (cq_lock is too generic)?

> +	struct list_head cq_pools[IB_POLL_WORKQUEUE + 1];

maybe it's better to add and use IB_POLL_LAST?
>> +static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
>> +		enum ib_poll_context poll_ctx)
>> +{
>> +	LIST_HEAD(tmp_list);
>> +	struct ib_cq *cq;
>> +	unsigned long flags;
>> +	int nr_cqs, ret, i;
>> +
>> +	/*
>> +	 * Allocated at least as many CQEs as requested, and otherwise
>> +	 * a reasonable batch size so that we can share CQs between
>> +	 * multiple users instead of allocating a larger number of CQs.
>> +	 */
>> +	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));
>
> did you mean min()?

No, I meant max. If we choose the CQ size, we choose the min between the
default and the device capability; if the user chooses, we rely on it
having asked for no more than the device capability (and if not,
allocation will fail, as it should).

>> +restart:
>> +	/*
>> +	 * Find the least used CQ with correct affinity and
>> +	 * enough free cq entries
>> +	 */
>> +	found = NULL;
>> +	spin_lock_irqsave(&dev->cq_lock, flags);
>> +	list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) {
>> +		if (vector != -1 && vector != cq->comp_vector)
>
> how can vector be -1?

-1 is a wild-card affinity hint value that chooses the least used cq
(see ib_create_qp).

>> @@ -811,9 +813,51 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
>>  	if (qp_init_attr->cap.max_rdma_ctxs)
>>  		rdma_rw_init_qp(device, qp_init_attr);
>> +	if (qp_init_attr->create_flags & IB_QP_CREATE_ASSIGN_CQS) {
>> +		int affinity = -1;
>> +
>> +		if (WARN_ON(qp_init_attr->recv_cq))
>> +			goto out;
>> +		if (WARN_ON(qp_init_attr->send_cq))
>> +			goto out;
>> +
>> +		if (qp_init_attr->create_flags & IB_QP_CREATE_AFFINITY_HINT)
>> +			affinity = qp_init_attr->affinity_hint;
>> +
>> +		nr_cqes = qp_init_attr->cap.max_recv_wr +
>> +			  qp_init_attr->cap.max_send_wr;
>> +		if (nr_cqes) {
>
> what will happen if nr_cqes == 0 in that case?

The same thing that would happen without this code, I think. This is
creating a qp without the ability to post send and/or receive work
requests.

>> @@ -2338,6 +2354,9 @@ struct ib_device {
>>  	u32 index;
>> +	spinlock_t cq_lock;
>
> maybe it can be called cq_pools_lock (cq_lock is too generic)?

I can change that.

>> +	struct list_head cq_pools[IB_POLL_WORKQUEUE + 1];
>
> maybe it's better to add and use IB_POLL_LAST?

Yea, I can change that.
On Thu, 2017-11-09 at 19:31 +0200, Sagi Grimberg wrote:
> > > +static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
> > > +		enum ib_poll_context poll_ctx)
> > > +{
> > > +	LIST_HEAD(tmp_list);
> > > +	struct ib_cq *cq;
> > > +	unsigned long flags;
> > > +	int nr_cqs, ret, i;
> > > +
> > > +	/*
> > > +	 * Allocated at least as many CQEs as requested, and otherwise
> > > +	 * a reasonable batch size so that we can share CQs between
> > > +	 * multiple users instead of allocating a larger number of CQs.
> > > +	 */
> > > +	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));
> >
> > did you mean min()?
>
> No, I meant max. If we choose the CQ size, we choose the min between the
> default and the device capability; if the user chooses, we rely on it
> having asked for no more than the device capability (and if not,
> allocation will fail, as it should).

Hello Sagi,

How about the following:

	min(dev->attrs.max_cqe, max(nr_cqes, IB_CQE_BATCH))

Bart.
> Hello Sagi,
>
> How about the following:
>
> 	min(dev->attrs.max_cqe, max(nr_cqes, IB_CQE_BATCH))

That would work too, thanks Bart!
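For clarity, the two expressions only disagree when the caller asks for
more CQEs than the device supports: the original form passes the
oversized request through so the subsequent ib_alloc_cq() fails against
dev->attrs.max_cqe, while Bart's form clamps it to the device limit. A
minimal standalone sketch (plain C; max_cqe and the request sizes below
are made-up sample values, not attributes of any real HCA):

#include <stdio.h>

#define IB_CQE_BATCH	1024
#define min(a, b)	((a) < (b) ? (a) : (b))
#define max(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	int max_cqe = 4096;			/* assumed dev->attrs.max_cqe */
	int requests[] = { 16, 2048, 8192 };	/* assumed nr_cqes values */
	int i;

	for (i = 0; i < 3; i++) {
		int nr_cqes = requests[i];
		/* original patch: an oversized request stays oversized,
		 * so the later CQ allocation fails loudly */
		int orig = max(nr_cqes, min(max_cqe, IB_CQE_BATCH));
		/* Bart's variant: an oversized request is clamped */
		int variant = min(max_cqe, max(nr_cqes, IB_CQE_BATCH));

		printf("nr_cqes=%5d -> original=%5d variant=%5d\n",
		       nr_cqes, orig, variant);
	}
	return 0;
}

Both forms agree for any request at or below the device limit (16 and
2048 above both yield 1024 and 2048 respectively); the choice only
decides whether an over-limit request (8192 above) fails or is
silently truncated to 4096.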
On Wed, 2017-11-08 at 11:57 +0200, Sagi Grimberg wrote:
> +struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
> +		enum ib_poll_context poll_ctx, int affinity_hint)
> +{
> +	struct ib_cq *cq, *found;
> +	unsigned long flags;
> +	int vector, ret;
> +
> +	if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
> +		/*
> +		 * Couldn't find matching vector affinity so project
> +		 * the affinty to the device completion vector range
> +		 */
> +		vector = affinity_hint % dev->num_comp_vectors;
> +	}

So depending on whether or not the HCA driver implements
.get_vector_affinity(), either pci_irq_get_affinity() is used or
"vector = affinity_hint % dev->num_comp_vectors"? Sorry, but I think
that kind of difference makes it unnecessarily hard for ULP maintainers
to provide predictable performance and consistent behavior across HCAs.

Bart.
>> +struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
>> +		enum ib_poll_context poll_ctx, int affinity_hint)
>> +{
>> +	struct ib_cq *cq, *found;
>> +	unsigned long flags;
>> +	int vector, ret;
>> +
>> +	if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
>> +		/*
>> +		 * Couldn't find matching vector affinity so project
>> +		 * the affinty to the device completion vector range
>> +		 */
>> +		vector = affinity_hint % dev->num_comp_vectors;
>> +	}
>
> So depending on whether or not the HCA driver implements
> .get_vector_affinity(), either pci_irq_get_affinity() is used or
> "vector = affinity_hint % dev->num_comp_vectors"? Sorry, but I think
> that kind of difference makes it unnecessarily hard for ULP maintainers
> to provide predictable performance and consistent behavior across HCAs.

Well, as a ULP maintainer I think that in the absence of
.get_vector_affinity() I would do the same thing as this code.
srp itself is doing the same thing in srp_create_target().
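To make the two paths being debated concrete, the patch's vector
selection boils down to the following sketch (a condensation for
illustration only; pick_comp_vector() is a hypothetical helper, not a
function the patch adds):

/* Hypothetical condensation of the patch's completion vector choice. */
static int pick_comp_vector(struct ib_device *dev, int cpu_hint)
{
	unsigned int vec;

	/*
	 * Preferred path: the HCA driver implements
	 * .get_vector_affinity(), so we can find a completion vector
	 * whose IRQ affinity mask actually contains the hinted cpu.
	 */
	if (ib_find_vector_affinity(dev, cpu_hint, &vec))
		return vec;

	/*
	 * Fallback: spread hints across the vector range, the same
	 * projection srp_create_target() uses.  Note a wild-card hint
	 * of -1 survives this step (-1 % n == -1 in C), which is how
	 * the pool search ends up seeing vector == -1.
	 */
	return cpu_hint % dev->num_comp_vectors;
}

The trade-off Bart points out is that only the first path guarantees
anything about which cpu the chosen vector actually interrupts on.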
On Wed, Nov 08, 2017 at 11:57:34AM +0200, Sagi Grimberg wrote:
> Allow a ULP to ask the core to implicitly assign a completion
> queue to a queue-pair based on a least-used search on per-device
> cq pools. The device CQ pools grow in a lazy fashion with every
> QP creation.
>
> In addition, expose an affinity hint for a queue pair creation.
> If passed, the core will attempt to attach a CQ with a completion
> vector that is directed to the cpu core as the affinity hint
> provided.
>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Sagi, Bart,

Did we reach a conclusion on this? Is v3 the series to take, and should
it all go through the RDMA tree? It looks like there are some missing
acks for that.

I think there was also an unapplied comment from Bart in the patchworks
notes...

Could you please add some commit messages when you resend it? Not sure
I should be accepting such large commits with empty messages.

Thanks,
Jason
On Mon, Dec 11, 2017 at 04:50:44PM -0700, Jason Gunthorpe wrote:
> On Wed, Nov 08, 2017 at 11:57:34AM +0200, Sagi Grimberg wrote:
> > Allow a ULP to ask the core to implicitly assign a completion
> > queue to a queue-pair based on a least-used search on per-device
> > cq pools. The device CQ pools grow in a lazy fashion with every
> > QP creation.
> >
> > In addition, expose an affinity hint for a queue pair creation.
> > If passed, the core will attempt to attach a CQ with a completion
> > vector that is directed to the cpu core as the affinity hint
> > provided.
> >
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>
> Sagi, Bart,
>
> Did we reach a conclusion on this? Is v3 the series to take, and should
> it all go through the RDMA tree? It looks like there are some missing
> acks for that.
>
> I think there was also an unapplied comment from Bart in the patchworks
> notes...
>
> Could you please add some commit messages when you resend it? Not sure
> I should be accepting such large commits with empty messages.

Hearing nothing, I've dropped this series off patchworks. Please resend
it when you are ready.

Jason
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a1d687a664f8..4f6cd4cf5116 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -179,6 +179,12 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
 	return netdev_has_upper_dev_all_rcu(dev, upper);
 }
 
+void ib_init_cq_pools(struct ib_device *dev);
+void ib_purge_cq_pools(struct ib_device *dev);
+struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
+		enum ib_poll_context poll_ctx, int affinity_hint);
+void ib_put_cq(struct ib_cq *cq, unsigned int nr_cqe);
+
 int addr_init(void);
 void addr_cleanup(void);
 
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index f2ae75fa3128..8b9f9be5386b 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -15,6 +15,9 @@
 #include <linux/slab.h>
 #include <rdma/ib_verbs.h>
 
+/* XXX: wild guess - should not be too large or too small to avoid wastage */
+#define IB_CQE_BATCH	1024
+
 /* # of WCs to poll for with a single call to ib_poll_cq */
 #define IB_POLL_BATCH	16
 
@@ -149,6 +152,8 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 	cq->cq_context = private;
 	cq->poll_ctx = poll_ctx;
 	atomic_set(&cq->usecnt, 0);
+	cq->cqe_used = 0;
+	cq->comp_vector = comp_vector;
 
 	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
 	if (!cq->wc)
@@ -194,6 +199,8 @@ void ib_free_cq(struct ib_cq *cq)
 
 	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
 		return;
+	if (WARN_ON_ONCE(cq->cqe_used != 0))
+		return;
 
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
@@ -213,3 +220,189 @@ void ib_free_cq(struct ib_cq *cq)
 	WARN_ON_ONCE(ret);
 }
 EXPORT_SYMBOL(ib_free_cq);
+
+void ib_init_cq_pools(struct ib_device *dev)
+{
+	int i;
+
+	spin_lock_init(&dev->cq_lock);
+	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++)
+		INIT_LIST_HEAD(&dev->cq_pools[i]);
+}
+
+void ib_purge_cq_pools(struct ib_device *dev)
+{
+	struct ib_cq *cq, *n;
+	LIST_HEAD(tmp_list);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dev->cq_lock, flags);
+		list_splice_init(&dev->cq_pools[i], &tmp_list);
+		spin_unlock_irqrestore(&dev->cq_lock, flags);
+	}
+
+	list_for_each_entry_safe(cq, n, &tmp_list, pool_entry)
+		ib_free_cq(cq);
+}
+
+/**
+ * ib_find_vector_affinity() - Find the first completion vector mapped to a given
+ *     cpu core affinity
+ * @device: rdma device
+ * @cpu: cpu for the corresponding completion vector affinity
+ * @vector: output target completion vector
+ *
+ * If the device expose vector affinity we will search each of the vectors
+ * and if we find one that gives us the desired cpu core we return true
+ * and assign @vector to the corresponding completion vector. Otherwise
+ * we return false. We stop at the first appropriate completion vector
+ * we find as we don't have any preference for multiple vectors with the
+ * same affinity.
+ */
+static bool ib_find_vector_affinity(struct ib_device *device, int cpu,
+		unsigned int *vector)
+{
+	bool found = false;
+	unsigned int c;
+	int vec;
+
+	if (cpu == -1)
+		goto out;
+
+	for (vec = 0; vec < device->num_comp_vectors; vec++) {
+		const struct cpumask *mask;
+
+		mask = ib_get_vector_affinity(device, vec);
+		if (!mask)
+			goto out;
+
+		for_each_cpu(c, mask) {
+			if (c == cpu) {
+				*vector = vec;
+				found = true;
+				goto out;
+			}
+		}
+	}
+
+out:
+	return found;
+}
+
+static int ib_alloc_cqs(struct ib_device *dev, int nr_cqes,
+		enum ib_poll_context poll_ctx)
+{
+	LIST_HEAD(tmp_list);
+	struct ib_cq *cq;
+	unsigned long flags;
+	int nr_cqs, ret, i;
+
+	/*
+	 * Allocated at least as many CQEs as requested, and otherwise
+	 * a reasonable batch size so that we can share CQs between
+	 * multiple users instead of allocating a larger number of CQs.
+	 */
+	nr_cqes = max(nr_cqes, min(dev->attrs.max_cqe, IB_CQE_BATCH));
+	nr_cqs = min_t(int, dev->num_comp_vectors, num_possible_cpus());
+	for (i = 0; i < nr_cqs; i++) {
+		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
+		if (IS_ERR(cq)) {
+			ret = PTR_ERR(cq);
+			pr_err("%s: failed to create CQ ret=%d\n",
+				__func__, ret);
+			goto out_free_cqs;
+		}
+		list_add_tail(&cq->pool_entry, &tmp_list);
+	}
+
+	spin_lock_irqsave(&dev->cq_lock, flags);
+	list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
+	spin_unlock_irqrestore(&dev->cq_lock, flags);
+
+	return 0;
+
+out_free_cqs:
+	list_for_each_entry(cq, &tmp_list, pool_entry)
+		ib_free_cq(cq);
+	return ret;
+}
+
+/*
+ * ib_find_get_cq() - Find the least used completion queue that matches
+ *     a given affinity hint (or least used for wild card affinity)
+ *     and fits nr_cqe
+ * @dev: rdma device
+ * @nr_cqe: number of needed cqe entries
+ * @poll_ctx: cq polling context
+ * @affinity_hint: affinity hint (-1) for wild-card assignment
+ *
+ * Finds a cq that satisfies @affinity_hint and @nr_cqe requirements and claim
+ * entries in it for us. In case there is no available cq, allocate a new cq
+ * with the requirements and add it to the device pool.
+ */
+struct ib_cq *ib_find_get_cq(struct ib_device *dev, unsigned int nr_cqe,
+		enum ib_poll_context poll_ctx, int affinity_hint)
+{
+	struct ib_cq *cq, *found;
+	unsigned long flags;
+	int vector, ret;
+
+	if (poll_ctx >= ARRAY_SIZE(dev->cq_pools))
+		return ERR_PTR(-EINVAL);
+
+	if (!ib_find_vector_affinity(dev, affinity_hint, &vector)) {
+		/*
+		 * Couldn't find matching vector affinity so project
+		 * the affinty to the device completion vector range
+		 */
+		vector = affinity_hint % dev->num_comp_vectors;
+	}
+
+restart:
+	/*
+	 * Find the least used CQ with correct affinity and
+	 * enough free cq entries
+	 */
+	found = NULL;
+	spin_lock_irqsave(&dev->cq_lock, flags);
+	list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) {
+		if (vector != -1 && vector != cq->comp_vector)
+			continue;
+		if (cq->cqe_used + nr_cqe > cq->cqe)
+			continue;
+		if (found && cq->cqe_used >= found->cqe_used)
+			continue;
+		found = cq;
+	}
+
+	if (found) {
+		found->cqe_used += nr_cqe;
+		spin_unlock_irqrestore(&dev->cq_lock, flags);
+		return found;
+	}
+	spin_unlock_irqrestore(&dev->cq_lock, flags);
+
+	/*
+	 * Didn't find a match or ran out of CQs in the
+	 * device pool, allocate a new array of CQs.
+	 */
+	ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Now search again */
+	goto restart;
+}
+
+void ib_put_cq(struct ib_cq *cq, unsigned int nr_cqe)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->device->cq_lock, flags);
+	cq->cqe_used -= nr_cqe;
+	WARN_ON_ONCE(cq->cqe_used < 0);
+	spin_unlock_irqrestore(&cq->device->cq_lock, flags);
+}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 84fc32a2c8b3..c828845c46d8 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -468,6 +468,8 @@ int ib_register_device(struct ib_device *device,
 		device->dma_device = parent;
 	}
 
+	ib_init_cq_pools(device);
+
 	mutex_lock(&device_mutex);
 
 	if (strchr(device->name, '%')) {
@@ -590,6 +592,8 @@ void ib_unregister_device(struct ib_device *device)
 	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
+
+	ib_purge_cq_pools(device);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index de57d6c11a25..fcc9ecba6741 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -793,14 +793,16 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 			   struct ib_qp_init_attr *qp_init_attr)
 {
 	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+	struct ib_cq *cq = NULL;
 	struct ib_qp *qp;
-	int ret;
+	u32 nr_cqes = 0;
+	int ret = -EINVAL;
 
 	if (qp_init_attr->rwq_ind_tbl &&
 	    (qp_init_attr->recv_cq ||
 	    qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
 	    qp_init_attr->cap.max_recv_sge))
-		return ERR_PTR(-EINVAL);
+		goto out;
 
 	/*
 	 * If the callers is using the RDMA API calculate the resources
@@ -811,9 +813,51 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 	if (qp_init_attr->cap.max_rdma_ctxs)
 		rdma_rw_init_qp(device, qp_init_attr);
 
+	if (qp_init_attr->create_flags & IB_QP_CREATE_ASSIGN_CQS) {
+		int affinity = -1;
+
+		if (WARN_ON(qp_init_attr->recv_cq))
+			goto out;
+		if (WARN_ON(qp_init_attr->send_cq))
+			goto out;
+
+		if (qp_init_attr->create_flags & IB_QP_CREATE_AFFINITY_HINT)
+			affinity = qp_init_attr->affinity_hint;
+
+		nr_cqes = qp_init_attr->cap.max_recv_wr +
+			  qp_init_attr->cap.max_send_wr;
+		if (nr_cqes) {
+			cq = ib_find_get_cq(device, nr_cqes,
+					qp_init_attr->poll_ctx, affinity);
+			if (IS_ERR(cq)) {
+				ret = PTR_ERR(cq);
+				goto out;
+			}
+
+			if (qp_init_attr->cap.max_send_wr)
+				qp_init_attr->send_cq = cq;
+
+			if (qp_init_attr->cap.max_recv_wr) {
+				qp_init_attr->recv_cq = cq;
+
+				/*
+				 * Low-level drivers expect max_recv_wr == 0
+				 * for the SRQ case:
+				 */
+				if (qp_init_attr->srq)
+					qp_init_attr->cap.max_recv_wr = 0;
+			}
+		}
+
+		qp_init_attr->create_flags &=
+			~(IB_QP_CREATE_ASSIGN_CQS | IB_QP_CREATE_AFFINITY_HINT);
+	}
+
 	qp = device->create_qp(pd, qp_init_attr, NULL);
-	if (IS_ERR(qp))
-		return qp;
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto out_put_cq;
+	}
 
 	ret = ib_create_qp_security(qp, device);
 	if (ret) {
@@ -826,6 +870,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 	qp->uobject = NULL;
 	qp->qp_type = qp_init_attr->qp_type;
 	qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
+	qp->nr_cqes = nr_cqes;
 
 	atomic_set(&qp->usecnt, 0);
 	qp->mrs_used = 0;
@@ -865,8 +910,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 		ret = rdma_rw_init_mrs(qp, qp_init_attr);
 		if (ret) {
 			pr_err("failed to init MR pool ret= %d\n", ret);
-			ib_destroy_qp(qp);
-			return ERR_PTR(ret);
+			goto out_destroy_qp;
 		}
 	}
 
@@ -880,6 +924,14 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 			device->attrs.max_sge_rd);
 
 	return qp;
+
+out_destroy_qp:
+	ib_destroy_qp(qp);
+out_put_cq:
+	if (cq)
+		ib_put_cq(cq, nr_cqes);
+out:
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(ib_create_qp);
 
@@ -1478,6 +1530,11 @@ int ib_destroy_qp(struct ib_qp *qp)
 			atomic_dec(&ind_tbl->usecnt);
 		if (sec)
 			ib_destroy_qp_security_end(sec);
+
+		if (qp->nr_cqes) {
+			WARN_ON_ONCE(rcq && rcq != scq);
+			ib_put_cq(scq, qp->nr_cqes);
+		}
 	} else {
 		if (sec)
 			ib_destroy_qp_security_abort(sec);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index bdb1279a415b..56d42e753eb4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1098,11 +1098,22 @@ enum ib_qp_create_flags {
 	IB_QP_CREATE_SCATTER_FCS		= 1 << 8,
 	IB_QP_CREATE_CVLAN_STRIPPING		= 1 << 9,
 	IB_QP_CREATE_SOURCE_QPN			= 1 << 10,
+
+	/* only used by the core, not passed to low-level drivers */
+	IB_QP_CREATE_ASSIGN_CQS			= 1 << 24,
+	IB_QP_CREATE_AFFINITY_HINT		= 1 << 25,
+
 	/* reserve bits 26-31 for low level drivers' internal use */
 	IB_QP_CREATE_RESERVED_START		= 1 << 26,
 	IB_QP_CREATE_RESERVED_END		= 1 << 31,
 };
 
+enum ib_poll_context {
+	IB_POLL_SOFTIRQ,	/* poll from softirq context */
+	IB_POLL_WORKQUEUE,	/* poll from workqueue */
+	IB_POLL_DIRECT,		/* caller context, no hw completions */
+};
+
 /*
  * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
  * callback to destroy the passed in QP.
@@ -1124,6 +1135,13 @@ struct ib_qp_init_attr {
 	 * Only needed for special QP types, or when using the RW API.
 	 */
 	u8 port_num;
+
+	/*
+	 * Only needed when not passing in explicit CQs.
+	 */
+	enum ib_poll_context poll_ctx;
+	int affinity_hint;
+
 	struct ib_rwq_ind_table *rwq_ind_tbl;
 	u32 source_qpn;
 };
@@ -1536,12 +1554,6 @@ struct ib_ah {
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
-enum ib_poll_context {
-	IB_POLL_DIRECT,		/* caller context, no hw completions */
-	IB_POLL_SOFTIRQ,	/* poll from softirq context */
-	IB_POLL_WORKQUEUE,	/* poll from workqueue */
-};
-
 struct ib_cq {
 	struct ib_device *device;
 	struct ib_uobject *uobject;
@@ -1549,9 +1561,12 @@ struct ib_cq {
 	void (*event_handler)(struct ib_event *, void *);
 	void *cq_context;
 	int cqe;
+	unsigned int cqe_used;
 	atomic_t usecnt;	/* count number of work queues */
 	enum ib_poll_context poll_ctx;
+	int comp_vector;
 	struct ib_wc *wc;
+	struct list_head pool_entry;
 	union {
 		struct irq_poll iop;
 		struct work_struct work;
@@ -1731,6 +1746,7 @@ struct ib_qp {
 	struct ib_rwq_ind_table *rwq_ind_tbl;
 	struct ib_qp_security *qp_sec;
 	u8 port;
+	u32 nr_cqes;
 };
 
 struct ib_mr {
@@ -2338,6 +2354,9 @@ struct ib_device {
 
 	u32 index;
 
+	spinlock_t cq_lock;
+	struct list_head cq_pools[IB_POLL_WORKQUEUE + 1];
+
 	/**
 	 * The following mandatory functions are used only at device
 	 * registration. Keep functions such as these at the end of this
Allow a ULP to ask the core to implicitly assign a completion
queue to a queue-pair based on a least-used search on per-device
cq pools. The device CQ pools grow in a lazy fashion with every
QP creation.

In addition, expose an affinity hint for a queue pair creation.
If passed, the core will attempt to attach a CQ with a completion
vector that is directed to the cpu core as the affinity hint
provided.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/infiniband/core/core_priv.h |   6 ++
 drivers/infiniband/core/cq.c        | 193 ++++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/device.c    |   4 +
 drivers/infiniband/core/verbs.c     |  69 +++++++++++--
 include/rdma/ib_verbs.h             |  31 ++++--
 5 files changed, 291 insertions(+), 12 deletions(-)
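For a sense of how the interface is meant to be consumed, the following
sketch shows a ULP creating a QP with core-assigned CQs. ulp_create_qp()
and queue_depth are hypothetical, and the attribute values are just one
plausible choice; only the two create flags and the poll_ctx /
affinity_hint fields come from this patch:

/* Hypothetical ULP-side caller of the interface added by this patch. */
static struct ib_qp *ulp_create_qp(struct ib_pd *pd, u32 queue_depth, int cpu)
{
	struct ib_qp_init_attr attr = {};

	attr.qp_type = IB_QPT_RC;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.cap.max_send_wr = queue_depth;
	attr.cap.max_recv_wr = queue_depth;
	attr.cap.max_send_sge = 1;
	attr.cap.max_recv_sge = 1;

	/*
	 * Leave send_cq/recv_cq NULL and ask the core to pick (or
	 * lazily allocate) a pooled CQ with room for both work queues,
	 * steered towards the given cpu when the driver exposes
	 * completion vector affinity.
	 */
	attr.create_flags = IB_QP_CREATE_ASSIGN_CQS |
			    IB_QP_CREATE_AFFINITY_HINT;
	attr.poll_ctx = IB_POLL_SOFTIRQ;
	attr.affinity_hint = cpu;

	return ib_create_qp(pd, &attr);
}

On teardown a plain ib_destroy_qp() suffices: it returns the qp->nr_cqes
reservation to the pool via ib_put_cq(), so the ULP never touches the
shared CQ directly.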