[rdma-core,2/2] rdma-core/cxgb4: Add support for user mode srqs

Message ID 20180710072208.28686-3-rajur@chelsio.com (mailing list archive)
State Not Applicable

Commit Message

Raju Rangoju July 10, 2018, 7:22 a.m. UTC
- Added create/destroy/modify routines to support user mode SRQs
- Added the post_srq_recv function
- Updated the poll_cq code to handle SRQs
- Handled user mode SRQ_LIMIT events
- Handled flushed SRQ buffers

Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
---
 providers/cxgb4/cq.c    | 139 +++++++++++++++++++++++++++---
 providers/cxgb4/dev.c   |   2 +
 providers/cxgb4/qp.c    | 129 +++++++++++++++++++++++++++-
 providers/cxgb4/t4.h    |   6 +-
 providers/cxgb4/verbs.c | 219 +++++++++++++++++++++++++++++++++++++++---------
 5 files changed, 437 insertions(+), 58 deletions(-)
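Editor's note, not part of the patch: for context, here is a minimal libibverbs sketch of the user mode SRQ path these provider hooks back, namely creating an SRQ, posting a receive buffer to it, and attaching a QP to it. All setup objects (pd, cq, mr, buf) are assumed to exist, error handling is trimmed, and the function name is illustrative only.

#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Hedged demo: pd, cq, mr and the registered buffer are assumed to be
 * set up elsewhere; traffic exchange and CQ polling are omitted. */
static int srq_demo(struct ibv_pd *pd, struct ibv_cq *cq,
		    struct ibv_mr *mr, void *buf, uint32_t len)
{
	struct ibv_srq_init_attr srq_init = {
		.attr = { .max_wr = 128, .max_sge = 1 },
	};
	struct ibv_qp_init_attr qp_init;
	struct ibv_recv_wr wr, *bad_wr;
	struct ibv_sge sge;
	struct ibv_srq *srq;
	struct ibv_qp *qp;

	/* Ends up in c4iw_create_srq(): maps the SRQ queue and doorbell. */
	srq = ibv_create_srq(pd, &srq_init);
	if (!srq)
		return -1;

	/* Post one receive buffer to the shared queue (c4iw_post_srq_recv()). */
	sge.addr = (uintptr_t)buf;
	sge.length = len;
	sge.lkey = mr->lkey;
	memset(&wr, 0, sizeof wr);
	wr.wr_id = 0xcafe;
	wr.sg_list = &sge;
	wr.num_sge = 1;
	if (ibv_post_srq_recv(srq, &wr, &bad_wr))
		goto err;

	/* A QP created with .srq set shares the SRQ instead of owning an RQ. */
	memset(&qp_init, 0, sizeof qp_init);
	qp_init.send_cq = cq;
	qp_init.recv_cq = cq;
	qp_init.srq = srq;
	qp_init.qp_type = IBV_QPT_RC;
	qp_init.cap.max_send_wr = 16;
	qp_init.cap.max_send_sge = 1;
	qp = ibv_create_qp(pd, &qp_init);
	if (!qp)
		goto err;

	/* ... connect, exchange traffic, poll the CQ ... */

	ibv_destroy_qp(qp);
	ibv_destroy_srq(srq);
	return 0;
err:
	ibv_destroy_srq(srq);
	return -1;
}

Completions for SRQ buffers are reaped through the normal ibv_poll_cq() path, which is what the poll_cq()/reap_srq_cqe() changes in the cq.c hunks below service.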

Patch

diff --git a/providers/cxgb4/cq.c b/providers/cxgb4/cq.c
index bb4f6447..2421e2fb 100644
--- a/providers/cxgb4/cq.c
+++ b/providers/cxgb4/cq.c
@@ -40,7 +40,7 @@ 
 #include "libcxgb4.h"
 #include "cxgb4-abi.h"
 
-static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
+static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq, u32 srqidx)
 {
 	union t4_cqe cqe = {};
 	__be64 *gen = GEN_ADDR(&cqe);
@@ -53,6 +53,9 @@  static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
 				 V_CQE_SWCQE(1) |
 				 V_CQE_QPID(wq->sq.qid));
 	*gen = htobe64(V_CQE_GENBIT((u64)cq->gen));
+	if (srqidx)
+		cqe.b64.u.srcqe.abs_rqe_idx = htobe32(srqidx);
+
 	memcpy(Q_ENTRY(cq->sw_queue, cq->sw_pidx), &cqe, CQE_SIZE(&cqe));
 	t4_swcq_produce(cq);
 }
@@ -66,7 +69,7 @@  int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
 	PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__,
 	     wq, cq, wq->rq.in_use, count);
 	while (in_use--) {
-		insert_recv_cqe(wq, cq);
+		insert_recv_cqe(wq, cq, 0);
 		flushed++;
 	}
 	return flushed;
@@ -354,6 +357,78 @@  static void dump_cqe(void *arg)
 
 }
 
+static void post_pending_srq_wrs(struct t4_srq *srq)
+{
+	struct t4_srq_pending_wr *pwr;
+	u16 idx = 0;
+
+	while (srq->pending_in_use) {
+
+		assert(!srq->sw_rq[srq->pidx].valid);
+
+		pwr = &srq->pending_wrs[srq->pending_cidx];
+		srq->sw_rq[srq->pidx].wr_id = pwr->wr_id;
+		srq->sw_rq[srq->pidx].valid = 1;
+
+		PDBG("%s posting pending cidx %u pidx %u wq_pidx %u "
+				"in_use %u rq_size %u wr_id %llx\n", __func__,
+				srq->cidx, srq->pidx,
+				srq->wq_pidx, srq->in_use, srq->size,
+				(unsigned long long)pwr->wr_id);
+
+		c4iw_copy_wr_to_srq(srq, &pwr->wqe, pwr->len16);
+		t4_srq_consume_pending_wr(srq);
+		t4_srq_produce(srq, pwr->len16);
+		idx += DIV_ROUND_UP(pwr->len16*16, T4_EQ_ENTRY_SIZE);
+	}
+
+	if (idx) {
+		t4_ring_srq_db(srq, idx, pwr->len16, &pwr->wqe);
+		srq->queue[srq->size].status.host_wq_pidx =
+			srq->wq_pidx;
+	}
+}
+
+static u64 reap_srq_cqe(union t4_cqe *hw_cqe, struct t4_srq *srq)
+{
+	int rel_idx = CQE_ABS_RQE_IDX(&hw_cqe->b64) - srq->rqt_abs_idx;
+	u64 wr_id;
+
+	BUG_ON(rel_idx >= srq->size);
+
+	assert(srq->sw_rq[rel_idx].valid);
+	srq->sw_rq[rel_idx].valid = 0;
+	wr_id = srq->sw_rq[rel_idx].wr_id;
+
+	if (rel_idx == srq->cidx) {
+		PDBG("%s in order cqe rel_idx %u cidx %u pidx %u wq_pidx %u "
+				"in_use %u rq_size %u wr_id %llx\n", __func__,
+				rel_idx, srq->cidx, srq->pidx,
+				srq->wq_pidx, srq->in_use, srq->size,
+				(unsigned long long)srq->sw_rq[rel_idx].wr_id);
+		t4_srq_consume(srq);
+		while (srq->ooo_count && !srq->sw_rq[srq->cidx].valid) {
+			PDBG("%s eat ooo cidx %u pidx %u wq_pidx %u "
+					"in_use %u rq_size %u ooo_count %u wr_id %llx\n", __func__,
+					srq->cidx, srq->pidx,
+					srq->wq_pidx, srq->in_use, srq->size, srq->ooo_count,
+					(unsigned long long)srq->sw_rq[srq->cidx].wr_id);
+			t4_srq_consume_ooo(srq);
+		}
+		if (srq->ooo_count == 0 && srq->pending_in_use)
+			post_pending_srq_wrs(srq);
+	} else {
+		BUG_ON(srq->in_use == 0);
+		PDBG("%s ooo cqe rel_idx %u cidx %u pidx %u wq_pidx %u "
+				"in_use %u rq_size %u ooo_count %u wr_id %llx\n", __func__,
+				rel_idx, srq->cidx, srq->pidx,
+				srq->wq_pidx, srq->in_use, srq->size, srq->ooo_count,
+				(unsigned long long)srq->sw_rq[rel_idx].wr_id);
+		t4_srq_produce_ooo(srq);
+	}
+	return wr_id;
+}
+
 /*
  * poll_cq
  *
@@ -370,8 +445,9 @@  static void dump_cqe(void *arg)
  *    -EAGAIN       CQE skipped, try again.
  *    -EOVERFLOW    CQ overflow detected.
  */
-static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, union t4_cqe *cqe,
-	           u8 *cqe_flushed, u64 *cookie, u32 *credit)
+static int poll_cq(struct t4_wq *wq, struct t4_cq *cq,
+		   union t4_cqe *cqe, u8 *cqe_flushed,
+		   u64 *cookie, u32 *credit, struct t4_srq *srq)
 {
 	int ret = 0;
 	union t4_cqe *hw_cqe, read_cqe;
@@ -495,7 +571,7 @@  static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, union t4_cqe *cqe,
 		 * error.
 		 */
 
-		if (t4_rq_empty(wq)) {
+		if (srq ? t4_srq_empty(srq) : t4_rq_empty(wq)) {
 			t4_set_wq_in_error(wq);
 			ret = -EAGAIN;
 			goto skip_cqe;
@@ -563,11 +639,15 @@  proc_cqe:
 		*cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
 		t4_sq_consume(wq);
 	} else {
-		PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
-		BUG_ON(wq->rq.cidx >= wq->rq.size);
-		*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
-		BUG_ON(t4_rq_empty(wq));
-		t4_rq_consume(wq);
+		if (!srq) {
+			PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
+			BUG_ON(wq->rq.cidx >= wq->rq.size);
+			*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
+			BUG_ON(t4_rq_empty(wq));
+			t4_rq_consume(wq);
+		} else
+			*cookie = reap_srq_cqe(hw_cqe, srq);
+		wq->rq.msn++;
 		goto skip_cqe;
 	}
 
@@ -590,6 +670,18 @@  skip_cqe:
 	return ret;
 }
 
+static void generate_srq_limit_event(struct c4iw_srq *srq)
+{
+	struct ibv_modify_srq cmd;
+	struct ibv_srq_attr attr = {0};
+	int ret;
+
+	srq->armed = 0;
+	ret = ibv_cmd_modify_srq(&srq->ibv_srq, &attr, 0, &cmd, sizeof cmd);
+	if (ret)
+		fprintf(stderr, "Failure to send srq_limit event - ret %d errno %d\n", ret, errno);
+}
+
 /*
  * Get one cq entry from c4iw and map it to openib.
  *
@@ -602,6 +694,7 @@  skip_cqe:
 static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 {
 	struct c4iw_qp *qhp = NULL;
+	struct c4iw_srq *srq = NULL;
 	struct t4_cqe_common *com;
 	union t4_cqe uninitialized_var(cqe), *rd_cqe;
 	struct t4_wq *wq;
@@ -637,8 +730,12 @@  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 	else {
 		pthread_spin_lock(&qhp->lock);
 		wq = &(qhp->wq);
+		srq = qhp->srq;
+		if (srq)
+			pthread_spin_lock(&srq->lock);
 	}
-	ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
+	ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit,
+		      srq ? &srq->wq : NULL);
 	if (ret)
 		goto out;
 
@@ -649,6 +746,13 @@  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 	wc->vendor_err = CQE_STATUS(com);
 	wc->wc_flags = 0;
 
+	/*
+	 * Simulate a SRQ_LIMIT_REACHED HW notification if required.
+	 */
+	if (srq && !(srq->flags & T4_SRQ_LIMIT_SUPPORT) && srq->armed &&
+			srq->wq.in_use < srq->srq_limit)
+		generate_srq_limit_event(srq);
+
 	PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
 	     "lo 0x%x cookie 0x%llx\n", __func__,
 	     CQE_QPID(com), CQE_TYPE(com),
@@ -747,8 +851,11 @@  static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc)
 			chp->cq.cqid, CQE_QPID(com), CQE_TYPE(com),
 			CQE_OPCODE(com), CQE_STATUS(com));
 out:
-	if (wq)
+	if (wq) {
 		pthread_spin_unlock(&qhp->lock);
+		if (srq)
+			pthread_spin_unlock(&srq->lock);
+	}
 	return ret;
 }
 
@@ -792,3 +899,11 @@  int c4iw_arm_cq(struct ibv_cq *ibcq, int solicited)
 	pthread_spin_unlock(&chp->lock);
 	return ret;
 }
+
+void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx)
+{
+	struct c4iw_cq * rchp = to_c4iw_cq(qhp->ibv_qp.recv_cq);
+
+	/* create a SRQ RECV CQE for srqidx */
+	insert_recv_cqe(&qhp->wq, &rchp->cq, srqidx);
+}
diff --git a/providers/cxgb4/dev.c b/providers/cxgb4/dev.c
index b1870219..3479e561 100644
--- a/providers/cxgb4/dev.c
+++ b/providers/cxgb4/dev.c
@@ -84,6 +84,7 @@  static const struct verbs_context_ops  c4iw_ctx_common_ops = {
 	.create_srq = c4iw_create_srq,
 	.modify_srq = c4iw_modify_srq,
 	.destroy_srq = c4iw_destroy_srq,
+	.query_srq = c4iw_query_srq,
 	.create_qp = c4iw_create_qp,
 	.modify_qp = c4iw_modify_qp,
 	.destroy_qp = c4iw_destroy_qp,
@@ -456,6 +457,7 @@  static struct verbs_device *c4iw_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
 	dev->abi_version = sysfs_dev->abi_ver;
 	list_node_init(&dev->list);
 
+	list_head_init(&dev->srq_list);
 	PDBG("%s device claimed\n", __FUNCTION__);
 	list_add_tail(&devices, &dev->list);
 #ifdef STALL_DETECTION
diff --git a/providers/cxgb4/qp.c b/providers/cxgb4/qp.c
index 5d90510c..eadfc6d5 100644
--- a/providers/cxgb4/qp.c
+++ b/providers/cxgb4/qp.c
@@ -92,6 +92,23 @@  static void copy_wr_to_rq(struct t4_wq *wq, union t4_recv_wr *wqe, u8 len16)
 	}
 }
 
+void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16)
+{
+	u64 *src, *dst;
+
+	src = (u64 *)wqe;
+	dst = (u64 *)((u8 *)srq->queue + srq->wq_pidx * T4_EQ_ENTRY_SIZE);
+	while (len16) {
+		*dst++ = *src++;
+		if (dst >= (u64 *)&srq->queue[srq->size])
+			dst = (u64 *)srq->queue;
+		*dst++ = *src++;
+		if (dst >= (u64 *)&srq->queue[srq->size])
+			dst = (u64 *)srq->queue;
+		len16--;
+	}
+}
+
 static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp,
 		      struct ibv_send_wr *wr, int max, u32 *plenp)
 {
@@ -277,6 +294,19 @@  static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
 	return 0;
 }
 
+static int build_srq_recv(union t4_recv_wr *wqe, struct ibv_recv_wr *wr,
+		u8 *len16)
+{
+	int ret;
+
+	ret = build_isgl(&wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
+	if (ret)
+		return ret;
+	*len16 = DIV_ROUND_UP(sizeof wqe->recv +
+			wr->num_sge * sizeof(struct fw_ri_sge), 16);
+	return 0;
+}
+
 static void ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 idx)
 {
 	struct ibv_modify_qp cmd = {};
@@ -406,6 +436,89 @@  int c4iw_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 	return err;
 }
 
+static void defer_srq_wr(struct t4_srq *srq, union t4_recv_wr *wqe, uint64_t wr_id, u8 len16)
+{
+	struct t4_srq_pending_wr *pwr = &srq->pending_wrs[srq->pending_pidx];
+
+	PDBG("%s cidx %u pidx %u wq_pidx %u in_use %u ooo_count %u wr_id 0x%llx "
+			"pending_cidx %u pending_pidx %u pending_in_use %u\n",
+			__func__, srq->cidx, srq->pidx, srq->wq_pidx,
+			srq->in_use, srq->ooo_count, (unsigned long long)wr_id, srq->pending_cidx,
+			srq->pending_pidx, srq->pending_in_use);
+	pwr->wr_id = wr_id;
+	pwr->len16 = len16;
+	memcpy(&pwr->wqe, wqe, len16*16);
+	t4_srq_produce_pending_wr(srq);
+}
+
+int c4iw_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr,
+		struct ibv_recv_wr **bad_wr)
+{
+	int err = 0;
+	struct c4iw_srq *srq;
+	union t4_recv_wr *wqe, lwqe;
+	u32 num_wrs;
+	u8 len16 = 0;
+	u16 idx = 0;
+
+	srq = to_c4iw_srq(ibsrq);
+	pthread_spin_lock(&srq->lock);
+	INC_STAT(srq_recv);
+	num_wrs = t4_srq_avail(&srq->wq);
+	if (num_wrs == 0) {
+		pthread_spin_unlock(&srq->lock);
+		return -ENOMEM;
+	}
+	while (wr) {
+		if (wr->num_sge > T4_MAX_RECV_SGE) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+		wqe = &lwqe;
+		if (num_wrs)
+			err = build_srq_recv(wqe, wr, &len16);
+		else
+			err = -ENOMEM;
+		if (err) {
+			*bad_wr = wr;
+			break;
+		}
+
+		wqe->recv.opcode = FW_RI_RECV_WR;
+		wqe->recv.r1 = 0;
+		wqe->recv.wrid = srq->wq.pidx;
+		wqe->recv.r2[0] = 0;
+		wqe->recv.r2[1] = 0;
+		wqe->recv.r2[2] = 0;
+		wqe->recv.len16 = len16;
+
+		if (srq->wq.ooo_count || srq->wq.pending_in_use || srq->wq.sw_rq[srq->wq.pidx].valid)
+			defer_srq_wr(&srq->wq, wqe, wr->wr_id, len16);
+		else {
+			srq->wq.sw_rq[srq->wq.pidx].wr_id = wr->wr_id;
+			srq->wq.sw_rq[srq->wq.pidx].valid = 1;
+			c4iw_copy_wr_to_srq(&srq->wq, wqe, len16);
+			PDBG("%s cidx %u pidx %u wq_pidx %u in_use %u "
+					"wr_id 0x%llx \n", __func__, srq->wq.cidx,
+					srq->wq.pidx, srq->wq.wq_pidx, srq->wq.in_use,
+					(unsigned long long)wr->wr_id);
+			t4_srq_produce(&srq->wq, len16);
+			idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+		}
+		wr = wr->next;
+		num_wrs--;
+	}
+
+	if (idx) {
+		t4_ring_srq_db(&srq->wq, idx, len16, wqe);
+		srq->wq.queue[srq->wq.size].status.host_wq_pidx =
+			srq->wq.wq_pidx;
+	}
+	pthread_spin_unlock(&srq->lock);
+	return err;
+}
+
 int c4iw_post_receive(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 			   struct ibv_recv_wr **bad_wr)
 {
@@ -491,8 +604,10 @@  static void update_qp_state(struct c4iw_qp *qhp)
 void c4iw_flush_qp(struct c4iw_qp *qhp)
 {
 	struct c4iw_cq *rchp, *schp;
+	u32 srqidx;
 	int count;
 
+	srqidx = t4_wq_srqidx(&qhp->wq);
 	rchp = to_c4iw_cq(qhp->ibv_qp.recv_cq);
 	schp = to_c4iw_cq(qhp->ibv_qp.send_cq);
 
@@ -515,16 +630,26 @@  void c4iw_flush_qp(struct c4iw_qp *qhp)
 	qhp->wq.flushed = 1;
 	t4_set_wq_in_error(&qhp->wq);
 
+	if (qhp->srq)
+		pthread_spin_lock(&qhp->srq->lock);
+
+	if (srqidx)
+		c4iw_flush_srqidx(qhp, srqidx);
+
 	update_qp_state(qhp);
 
 	c4iw_flush_hw_cq(rchp, qhp);
-	c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
-	c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+	if (!qhp->srq) {
+		c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
+		c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+	}
 
 	if (schp != rchp)
 		c4iw_flush_hw_cq(schp, qhp);
 
 	c4iw_flush_sq(qhp);
+	if (qhp->srq)
+		pthread_spin_unlock(&qhp->srq->lock);
 
 	pthread_spin_unlock(&qhp->lock);
 	if (schp != rchp)
diff --git a/providers/cxgb4/t4.h b/providers/cxgb4/t4.h
index 67f411d9..08f29fa7 100644
--- a/providers/cxgb4/t4.h
+++ b/providers/cxgb4/t4.h
@@ -373,6 +373,7 @@  struct t4_sq {
 
 struct t4_swrqe {
 	u64 wr_id;
+	int valid;
 };
 
 struct t4_rq {
@@ -440,7 +441,6 @@  static inline void t4_rq_produce(struct t4_wq *wq, u8 len16)
 static inline void t4_rq_consume(struct t4_wq *wq)
 {
 	wq->rq.in_use--;
-	wq->rq.msn++;
 	if (++wq->rq.cidx == wq->rq.size)
 		wq->rq.cidx = 0;
 	assert((wq->rq.cidx != wq->rq.pidx) || wq->rq.in_use == 0);
@@ -566,7 +566,7 @@  static inline void t4_srq_consume(struct t4_srq *srq)
 
 static inline int t4_wq_in_error(struct t4_wq *wq)
 {
-	return wq->error || wq->rq.queue[wq->rq.size].status.qp_err;
+	return wq->error || *wq->qp_errp;
 }
 
 static inline u32 t4_wq_srqidx(struct t4_wq *wq)
@@ -742,7 +742,7 @@  static inline void t4_ring_srq_db(struct t4_srq *srq, u16 inc, u8 len16,
 
 static inline void t4_set_wq_in_error(struct t4_wq *wq)
 {
-	wq->rq.queue[wq->rq.size].status.qp_err = 1;
+	*wq->qp_errp = 1;
 }
 
 extern int c4iw_abi_version;
diff --git a/providers/cxgb4/verbs.c b/providers/cxgb4/verbs.c
index a8935def..e43992e8 100644
--- a/providers/cxgb4/verbs.c
+++ b/providers/cxgb4/verbs.c
@@ -286,24 +286,141 @@  int c4iw_destroy_cq(struct ibv_cq *ibcq)
 struct ibv_srq *c4iw_create_srq(struct ibv_pd *pd,
 				struct ibv_srq_init_attr *attr)
 {
+	struct c4iw_dev *dev = to_c4iw_dev(pd->context->device);
+	struct uc4iw_create_srq_resp resp;
+	unsigned long segment_offset;
+	struct ibv_create_srq cmd;
+	struct c4iw_srq *srq;
+	void *dbva;
+	int ret;
+
+	PDBG("%s enter\n", __func__);
+	srq = calloc(1, sizeof *srq);
+	if (!srq)
+		goto err;
+
+	ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd,
+			sizeof cmd, &resp.ibv_resp, sizeof resp);
+	if (ret)
+		goto err_free_srq_mem;
+
+	PDBG("%s srq id 0x%x srq key %" PRIx64 " srq db/gts key %" PRIx64
+			" qid_mask 0x%x\n", __func__,
+			resp.srqid, resp.srq_key, resp.srq_db_gts_key,
+			resp.qid_mask);
+
+	srq->rhp = dev;
+	srq->wq.qid = resp.srqid;
+	srq->wq.size = resp.srq_size;
+	srq->wq.memsize = resp.srq_memsize;
+	srq->wq.rqt_abs_idx = resp.rqt_abs_idx;
+	srq->flags = resp.flags;
+	pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE);
+
+	dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED,
+			pd->context->cmd_fd, resp.srq_db_gts_key);
+	if (dbva == MAP_FAILED)
+		goto err_destroy_srq;
+	srq->wq.udb = dbva;
+
+	segment_offset = 128 * (srq->wq.qid & resp.qid_mask);
+	if (segment_offset < c4iw_page_size) {
+		srq->wq.udb += segment_offset / 4;
+		srq->wq.wc_reg_available = 1;
+	} else
+		srq->wq.bar2_qid = srq->wq.qid & resp.qid_mask;
+	srq->wq.udb += 2;
+
+	srq->wq.queue = mmap(NULL, srq->wq.memsize,
+			PROT_WRITE, MAP_SHARED,
+			pd->context->cmd_fd, resp.srq_key);
+	if (srq->wq.queue == MAP_FAILED)
+		goto err_unmap_udb;
+
+	srq->wq.sw_rq = calloc(srq->wq.size, sizeof (struct t4_swrqe));
+	if (!srq->wq.sw_rq)
+		goto err_unmap_queue;
+	srq->wq.pending_wrs = calloc(srq->wq.size, sizeof *srq->wq.pending_wrs);
+	if (!srq->wq.pending_wrs)
+		goto err_free_sw_rq;
+
+	pthread_spin_lock(&dev->lock);
+	list_add_tail(&dev->srq_list, &srq->list);
+	pthread_spin_unlock(&dev->lock);
+
+	PDBG("%s srq dbva %p srq qva %p srq depth %u srq memsize %lu\n",
+			__func__, srq->wq.udb, srq->wq.queue,
+			srq->wq.size, srq->wq.memsize);
+
+	INC_STAT(srq);
+	return &srq->ibv_srq;
+err_free_sw_rq:
+	free(srq->wq.sw_rq);
+err_unmap_queue:
+	munmap((void *)srq->wq.queue, srq->wq.memsize);
+err_unmap_udb:
+	munmap(MASKED(srq->wq.udb), c4iw_page_size);
+err_destroy_srq:
+	(void)ibv_cmd_destroy_srq(&srq->ibv_srq);
+err_free_srq_mem:
+	free(srq);
+err:
+
 	return NULL;
 }
 
-int c4iw_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr,
+int c4iw_modify_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr,
 		    int attr_mask)
 {
-	return ENOSYS;
+	struct c4iw_srq *srq = to_c4iw_srq(ibsrq);
+	struct ibv_modify_srq cmd;
+	int ret;
+
+	/* XXX no support for this yet */
+	if (attr_mask & IBV_SRQ_MAX_WR)
+		return ENOSYS;
+
+	ret = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, &cmd, sizeof cmd);
+	if (!ret) {
+		if (attr_mask & IBV_SRQ_LIMIT) {
+			srq->armed = 1;
+			srq->srq_limit = attr->srq_limit;
+		}
+	}
+	return ret;
 }
 
-int c4iw_destroy_srq(struct ibv_srq *srq)
+int c4iw_destroy_srq(struct ibv_srq *ibsrq)
 {
-	return ENOSYS;
+	int ret;
+	struct c4iw_srq *srq = to_c4iw_srq(ibsrq);
+
+	PDBG("%s enter qp %p\n", __func__, ibsrq);
+
+	ret = ibv_cmd_destroy_srq(ibsrq);
+	if (ret) {
+		return ret;
+	}
+
+	pthread_spin_lock(&srq->rhp->lock);
+	list_del(&srq->list);
+	pthread_spin_unlock(&srq->rhp->lock);
+
+	munmap(MASKED(srq->wq.udb), c4iw_page_size);
+	munmap(srq->wq.queue, srq->wq.memsize);
+
+	free(srq->wq.pending_wrs);
+	free(srq->wq.sw_rq);
+	free(srq);
+	return 0;
+
 }
 
-int c4iw_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr,
-		       struct ibv_recv_wr **bad_wr)
+int c4iw_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr)
 {
-	return ENOSYS;
+	struct ibv_query_srq cmd;
+
+	return ibv_cmd_query_srq(ibsrq, attr, &cmd, sizeof cmd);
 }
 
 static struct ibv_qp *create_qp_v0(struct ibv_pd *pd,
@@ -372,7 +489,7 @@  static struct ibv_qp *create_qp_v0(struct ibv_pd *pd,
 	if (!qhp->wq.sq.sw_sq)
 		goto err7;
 
-	qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (uint64_t));
+	qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (struct t4_swrqe));
 	if (!qhp->wq.rq.sw_rq)
 		goto err8;
 
@@ -445,9 +562,12 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 	qhp->wq.sq.flags = resp.flags & C4IW_QPF_ONCHIP ? T4_SQ_ONCHIP : 0;
 	qhp->wq.sq.flush_cidx = -1;
 	qhp->wq.rq.msn = 1;
-	qhp->wq.rq.qid = resp.rqid;
-	qhp->wq.rq.size = resp.rq_size;
-	qhp->wq.rq.memsize = resp.rq_memsize;
+	qhp->srq = to_c4iw_srq(attr->srq);
+	if (!attr->srq) {
+		qhp->wq.rq.qid = resp.rqid;
+		qhp->wq.rq.size = resp.rq_size;
+		qhp->wq.rq.memsize = resp.rq_memsize;
+	}
 	if (ma_wr && resp.sq_memsize < (resp.sq_size + 1) *
 	    sizeof *qhp->wq.sq.queue + 16*sizeof(__be64) ) {
 		ma_wr = 0;
@@ -479,35 +599,39 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 	if (qhp->wq.sq.queue == MAP_FAILED)
 		goto err4;
 
-	dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED,
-		    pd->context->cmd_fd, resp.rq_db_gts_key);
-	if (dbva == MAP_FAILED)
-		goto err5;
-	qhp->wq.rq.udb = dbva;
-	if (!dev_is_t4(qhp->rhp)) {
-		unsigned long segment_offset = 128 * (qhp->wq.rq.qid &
-						      qhp->wq.qid_mask);
-
-		if (segment_offset < c4iw_page_size) {
-			qhp->wq.rq.udb += segment_offset / 4;
-			qhp->wq.rq.wc_reg_available = 1;
-		} else
-			qhp->wq.rq.bar2_qid = qhp->wq.rq.qid & qhp->wq.qid_mask;
-		qhp->wq.rq.udb += 2;
+	if (!attr->srq) {
+		dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED,
+				pd->context->cmd_fd, resp.rq_db_gts_key);
+		if (dbva == MAP_FAILED)
+			goto err5;
+		qhp->wq.rq.udb = dbva;
+		if (!dev_is_t4(qhp->rhp)) {
+			unsigned long segment_offset = 128 * (qhp->wq.rq.qid &
+					qhp->wq.qid_mask);
+
+			if (segment_offset < c4iw_page_size) {
+				qhp->wq.rq.udb += segment_offset / 4;
+				qhp->wq.rq.wc_reg_available = 1;
+			} else
+				qhp->wq.rq.bar2_qid = qhp->wq.rq.qid & qhp->wq.qid_mask;
+			qhp->wq.rq.udb += 2;
+		}
+		qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize,
+				PROT_WRITE, MAP_SHARED,
+				pd->context->cmd_fd, resp.rq_key);
+		if (qhp->wq.rq.queue == MAP_FAILED)
+			goto err6;
 	}
-	qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize,
-			    PROT_WRITE, MAP_SHARED,
-			    pd->context->cmd_fd, resp.rq_key);
-	if (qhp->wq.rq.queue == MAP_FAILED)
-		goto err6;
 
 	qhp->wq.sq.sw_sq = calloc(qhp->wq.sq.size, sizeof (struct t4_swsqe));
 	if (!qhp->wq.sq.sw_sq)
 		goto err7;
 
-	qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (uint64_t));
-	if (!qhp->wq.rq.sw_rq)
-		goto err8;
+	if (!attr->srq) {
+		qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof (struct t4_swrqe));
+		if (!qhp->wq.rq.sw_rq)
+			goto err8;
+	}
 
 	if (t4_sq_onchip(&qhp->wq)) {
 		qhp->wq.sq.ma_sync = mmap(NULL, c4iw_page_size, PROT_WRITE,
@@ -520,11 +644,18 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 
 	if (ctx->status_page_size) {
 		qhp->wq.db_offp = &ctx->status_page->db_off;
-	} else {
+	} else if (!attr->srq) {
 		qhp->wq.db_offp = 
 			&qhp->wq.rq.queue[qhp->wq.rq.size].status.db_off;
 	}
 
+	if (!attr->srq)
+		qhp->wq.qp_errp = &qhp->wq.rq.queue[qhp->wq.rq.size].status.qp_err;
+	else {
+		qhp->wq.qp_errp = &qhp->wq.sq.queue[qhp->wq.sq.size].status.qp_err;
+		qhp->wq.srqidxp = &qhp->wq.sq.queue[qhp->wq.sq.size].status.srqidx;
+	}
+
 	PDBG("%s sq dbva %p sq qva %p sq depth %u sq memsize %lu "
 	       " rq dbva %p rq qva %p rq depth %u rq memsize %lu\n",
 	     __func__,
@@ -541,13 +672,16 @@  static struct ibv_qp *create_qp(struct ibv_pd *pd,
 	INC_STAT(qp);
 	return &qhp->ibv_qp;
 err9:
-	free(qhp->wq.rq.sw_rq);
+	if (!attr->srq)
+		free(qhp->wq.rq.sw_rq);
 err8:
 	free(qhp->wq.sq.sw_sq);
 err7:
-	munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize);
+	if (!attr->srq)
+		munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize);
 err6:
-	munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
+	if (!attr->srq)
+		munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
 err5:
 	munmap((void *)qhp->wq.sq.queue, qhp->wq.sq.memsize);
 err4:
@@ -619,15 +753,18 @@  int c4iw_destroy_qp(struct ibv_qp *ibqp)
 		munmap((void *)qhp->wq.sq.ma_sync, c4iw_page_size);
 	}
 	munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size);
-	munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
 	munmap(qhp->wq.sq.queue, qhp->wq.sq.memsize);
-	munmap(qhp->wq.rq.queue, qhp->wq.rq.memsize);
+	if (!qhp->srq) {
+		munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size);
+		munmap(qhp->wq.rq.queue, qhp->wq.rq.memsize);
+	}
 
 	pthread_spin_lock(&dev->lock);
 	dev->qpid2ptr[qhp->wq.sq.qid] = NULL;
 	pthread_spin_unlock(&dev->lock);
 
-	free(qhp->wq.rq.sw_rq);
+	if (!qhp->srq)
+		free(qhp->wq.rq.sw_rq);
 	free(qhp->wq.sq.sw_sq);
 	free(qhp);
 	return 0;
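
Editor's note, not part of the patch: the SRQ limit handling above is driven from the application through the standard verbs API. Arming is done with ibv_modify_srq() and IBV_SRQ_LIMIT (which sets srq->armed and srq->srq_limit in c4iw_modify_srq()), and the low-watermark notification is delivered as an IBV_EVENT_SRQ_LIMIT_REACHED asynchronous event, whether raised by hardware or simulated by generate_srq_limit_event() in the poll path. The sketch below only shows the generic verbs calls involved; the helper names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Arm the SRQ limit: ask for a notification once the number of posted,
 * unconsumed receive WRs drops below 'limit'. */
static int arm_srq_limit(struct ibv_srq *srq, uint32_t limit)
{
	struct ibv_srq_attr attr;

	memset(&attr, 0, sizeof attr);
	attr.srq_limit = limit;
	return ibv_modify_srq(srq, &attr, IBV_SRQ_LIMIT);
}

/* Block for one async event and handle the SRQ limit case by (for example)
 * reposting receive buffers; other event types are simply acknowledged. */
static void drain_one_async_event(struct ibv_context *ctx)
{
	struct ibv_async_event event;

	if (ibv_get_async_event(ctx, &event))
		return;
	if (event.event_type == IBV_EVENT_SRQ_LIMIT_REACHED)
		fprintf(stderr, "SRQ %p is low on receive buffers, repost now\n",
			(void *)event.element.srq);
	ibv_ack_async_event(&event);
}

The limit must be re-armed after each notification, which matches the provider clearing srq->armed in generate_srq_limit_event() above.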