diff mbox series

[16/20] RDMA/rw: Use IB_WR_REG_MR_INTEGRITY for PI handover

Message ID 1559222731-16715-17-git-send-email-maxg@mellanox.com (mailing list archive)
State Superseded
Headers show
Series Introduce new API for T10-PI offload | expand

Commit Message

Max Gurtovoy May 30, 2019, 1:25 p.m. UTC
From: Israel Rukshin <israelr@mellanox.com>

Replace the old signature handover API with the new one. The new API
simplifes PI handover code complexity for ULPs and improve performance.
For RW API it will reduce the maximum number of work requests per task
and the need of dealing with multiple MRs (and their registrations and
invalidations) per task. All the mappings and registration of the data
and the protection buffers is done by the LLD using a single WR and a
special MR type (IB_MR_TYPE_INTEGRITY) for the PI handover operation.

The setup of the tested benchmark (using iSER ULP):
 - 2 servers with 24 cores (1 initiator and 1 target)
 - ConnectX-4/ConnectX-5 adapters
 - 24 target sessions with 1 LUN each
 - ramdisk backstore
 - PI active

Performance results running fio (24 jobs, 128 iodepth) using
write_generate=1 and read_verify=1 (w/w.o patch):

bs      IOPS(read)        IOPS(write)
----    ----------        ----------
512   1243.3K/1182.3K    1725.1K/1680.2K
4k    571233/528835      743293/748259
32k   72388/71086        71789/93573

Using write_generate=0 and read_verify=0 (w/w.o patch):
bs      IOPS(read)        IOPS(write)
----    ----------        ----------
512   1572.1K/1427.2K    1823.5K/1724.3K
4k    921992/916194      753772/768267
32k   75052/73960        73180/95484

There is a performance degradation when writing big block sizes.
Degradation is caused by the complexity of combining multiple
indirections and perform RDMA READ operation from it. This will be
fixed in the following patches by reducing the indirections if
possible.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
---
 drivers/infiniband/core/rw.c            | 195 +++++++++++++++-----------------
 drivers/infiniband/ulp/isert/ib_isert.c |   4 +-
 include/rdma/rw.h                       |   9 --
 3 files changed, 91 insertions(+), 117 deletions(-)

Comments

Christoph Hellwig June 4, 2019, 7:53 a.m. UTC | #1
> -	if (reg->mr->need_inval) {
> -		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
> -		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
> -		reg->inv_wr.next = &reg->reg_wr.wr;
> -		count++;
> -	} else {
> -		reg->inv_wr.next = NULL;
> -	}
> +	count += rdma_rw_inv_key(reg);

Splitting out this helper seems separate, would be nice to have it as
a prep patch separate from this big one.

> +	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));

Why do we need to do a struct copy here instead of setting up a pointer?

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Sagi Grimberg June 5, 2019, 10:55 p.m. UTC | #2
>> +	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
> 
> Why do we need to do a struct copy here instead of setting up a pointer?

Yea, can't we use the mr->sig_attrs directly?
Max Gurtovoy June 5, 2019, 11:26 p.m. UTC | #3
On 6/6/2019 1:55 AM, Sagi Grimberg wrote:
>
>>> + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct 
>>> ib_sig_attrs));
>>
>> Why do we need to do a struct copy here instead of setting up a pointer?
>
> Yea, can't we use the mr->sig_attrs directly?

No, the MR is internal to RW API and the sig_attrs comes from isert. I 
don't want to loose the pointer I've allocated. it will cause a leak.
Sagi Grimberg June 5, 2019, 11:47 p.m. UTC | #4
>>>> + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct 
>>>> ib_sig_attrs));
>>>
>>> Why do we need to do a struct copy here instead of setting up a pointer?
>>
>> Yea, can't we use the mr->sig_attrs directly?
> 
> No, the MR is internal to RW API and the sig_attrs comes from isert. I 
> don't want to loose the pointer I've allocated. it will cause a leak.

Yes, that makes sense Max.
diff mbox series

Patch

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index f825990bacfa..ffbd9dc98b5f 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -59,10 +59,34 @@  static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
 	return false;
 }
 
-static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
+					   bool pi_support)
 {
+	u32 max_pages;
+
+	if (pi_support)
+		max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
+	else
+		max_pages = dev->attrs.max_fast_reg_page_list_len;
+
 	/* arbitrary limit to avoid allocating gigantic resources */
-	return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+	return min_t(u32, max_pages, 256);
+}
+
+static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
+{
+	int count = 0;
+
+	if (reg->mr->need_inval) {
+		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+		reg->inv_wr.next = &reg->reg_wr.wr;
+		count++;
+	} else {
+		reg->inv_wr.next = NULL;
+	}
+
+	return count;
 }
 
 /* Caller must have zero-initialized *reg. */
@@ -70,7 +94,8 @@  static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
 		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
 		u32 sg_cnt, u32 offset)
 {
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->signature_en);
 	u32 nents = min(sg_cnt, pages_per_mr);
 	int count = 0, ret;
 
@@ -78,14 +103,7 @@  static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
 	if (!reg->mr)
 		return -EAGAIN;
 
-	if (reg->mr->need_inval) {
-		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
-		reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
-		reg->inv_wr.next = &reg->reg_wr.wr;
-		count++;
-	} else {
-		reg->inv_wr.next = NULL;
-	}
+	count += rdma_rw_inv_key(reg);
 
 	ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
 	if (ret < 0 || ret < nents) {
@@ -110,7 +128,8 @@  static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct rdma_rw_reg_ctx *prev = NULL;
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->signature_en);
 	int i, j, ret = 0, count = 0;
 
 	ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
@@ -351,10 +370,10 @@  int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->signature_en);
 	struct ib_rdma_wr *rdma_wr;
-	struct ib_send_wr *prev_wr = NULL;
-	int count = 0, ret;
+	int count = 0, ret, n;
 
 	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
 		pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n",
@@ -367,75 +386,59 @@  int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		return -ENOMEM;
 	sg_cnt = ret;
 
-	ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
-	if (!ret) {
-		ret = -ENOMEM;
-		goto out_unmap_sg;
+	if (prot_sg_cnt) {
+		ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+		if (!ret) {
+			ret = -ENOMEM;
+			goto out_unmap_sg;
+		}
+		prot_sg_cnt = ret;
 	}
-	prot_sg_cnt = ret;
 
 	ctx->type = RDMA_RW_SIG_MR;
 	ctx->nr_ops = 1;
-	ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
-	if (!ctx->sig) {
+	ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL);
+	if (!ctx->reg) {
 		ret = -ENOMEM;
 		goto out_unmap_prot_sg;
 	}
 
-	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
-	if (ret < 0)
-		goto out_free_ctx;
-	count += ret;
-	prev_wr = &ctx->sig->data.reg_wr.wr;
-
-	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
-				  prot_sg, prot_sg_cnt, 0);
-	if (ret < 0)
-		goto out_destroy_data_mr;
-	count += ret;
-
-	if (ctx->sig->prot.inv_wr.next)
-		prev_wr->next = &ctx->sig->prot.inv_wr;
-	else
-		prev_wr->next = &ctx->sig->prot.reg_wr.wr;
-	prev_wr = &ctx->sig->prot.reg_wr.wr;
-
-	ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
-	if (!ctx->sig->sig_mr) {
+	ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+	if (!ctx->reg->mr) {
 		ret = -EAGAIN;
-		goto out_destroy_prot_mr;
+		goto out_free_ctx;
 	}
 
-	if (ctx->sig->sig_mr->need_inval) {
-		memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+	count += rdma_rw_inv_key(ctx->reg);
 
-		ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
-		ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
 
-		prev_wr->next = &ctx->sig->sig_inv_wr;
-		prev_wr = &ctx->sig->sig_inv_wr;
+	n = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg,
+			    prot_sg_cnt, NULL, SZ_4K);
+	if (unlikely(n != sg_cnt + prot_sg_cnt)) {
+		pr_err("failed to map sg (%d/%d)\n", n, sg_cnt + prot_sg_cnt);
+		ret = n < 0 ? n : -EINVAL;
+		goto out_destroy_sig_mr;
 	}
 
-	ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
-	ctx->sig->sig_wr.wr.wr_cqe = NULL;
-	ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
-	ctx->sig->sig_wr.wr.num_sge = 1;
-	ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
-	ctx->sig->sig_wr.sig_attrs = sig_attrs;
-	ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
-	if (prot_sg_cnt)
-		ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
-	prev_wr->next = &ctx->sig->sig_wr.wr;
-	prev_wr = &ctx->sig->sig_wr.wr;
+	ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
+	ctx->reg->reg_wr.wr.wr_cqe = NULL;
+	ctx->reg->reg_wr.wr.num_sge = 0;
+	ctx->reg->reg_wr.wr.send_flags = 0;
+	ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+	if (rdma_protocol_iwarp(qp->device, port_num))
+		ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+	ctx->reg->reg_wr.mr = ctx->reg->mr;
+	ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
 	count++;
 
-	ctx->sig->sig_sge.addr = 0;
-	ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
-	if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
-		ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+	ctx->reg->sge.addr = ctx->reg->mr->iova;
+	ctx->reg->sge.length = ctx->reg->mr->length;
+	if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
+		ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
 
-	rdma_wr = &ctx->sig->data.wr;
-	rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+	rdma_wr = &ctx->reg->wr;
+	rdma_wr->wr.sg_list = &ctx->reg->sge;
 	rdma_wr->wr.num_sge = 1;
 	rdma_wr->remote_addr = remote_addr;
 	rdma_wr->rkey = rkey;
@@ -443,21 +446,18 @@  int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
 	else
 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-	prev_wr->next = &rdma_wr->wr;
-	prev_wr = &rdma_wr->wr;
+	ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
 	count++;
 
 	return count;
 
-out_destroy_prot_mr:
-	if (prot_sg_cnt)
-		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
-out_destroy_data_mr:
-	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_destroy_sig_mr:
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
 out_free_ctx:
-	kfree(ctx->sig);
+	kfree(ctx->reg);
 out_unmap_prot_sg:
-	ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+	if (prot_sg_cnt)
+		ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
 	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
 	return ret;
@@ -500,22 +500,8 @@  struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 
 	switch (ctx->type) {
 	case RDMA_RW_SIG_MR:
-		rdma_rw_update_lkey(&ctx->sig->data, true);
-		if (ctx->sig->prot.mr)
-			rdma_rw_update_lkey(&ctx->sig->prot, true);
-	
-		ctx->sig->sig_mr->need_inval = true;
-		ib_update_fast_reg_key(ctx->sig->sig_mr,
-			ib_inc_rkey(ctx->sig->sig_mr->lkey));
-		ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
-
-		if (ctx->sig->data.inv_wr.next)
-			first_wr = &ctx->sig->data.inv_wr;
-		else
-			first_wr = &ctx->sig->data.reg_wr.wr;
-		last_wr = &ctx->sig->data.wr.wr;
-		break;
 	case RDMA_RW_MR:
+		/* fallthrough */
 		for (i = 0; i < ctx->nr_ops; i++) {
 			rdma_rw_update_lkey(&ctx->reg[i],
 				ctx->reg[i].wr.wr.opcode !=
@@ -632,16 +618,12 @@  void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
 		return;
 
-	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
-	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+	kfree(ctx->reg);
 
-	if (ctx->sig->prot.mr) {
-		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+	if (prot_sg_cnt)
 		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
-	}
-
-	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
-	kfree(ctx->sig);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
 
@@ -662,7 +644,7 @@  unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
 	unsigned int mr_pages;
 
 	if (rdma_rw_can_use_mr(device, port_num))
-		mr_pages = rdma_rw_fr_page_list_len(device);
+		mr_pages = rdma_rw_fr_page_list_len(device, false);
 	else
 		mr_pages = device->attrs.max_sge_rd;
 	return DIV_ROUND_UP(maxpages, mr_pages);
@@ -688,9 +670,8 @@  void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 	 * we'll need two additional MRs for the registrations and the
 	 * invalidation.
 	 */
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-		factor += 6;	/* (inv + reg) * (data + prot + sig) */
-	else if (rdma_rw_can_use_mr(dev, attr->port_num))
+	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN ||
+	    rdma_rw_can_use_mr(dev, attr->port_num))
 		factor += 2;	/* inv + reg */
 
 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
@@ -706,20 +687,22 @@  void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 nr_mrs = 0, nr_sig_mrs = 0;
+	u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
 	int ret = 0;
 
 	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
-		nr_mrs = attr->cap.max_rdma_ctxs * 2;
+		nr_mrs = attr->cap.max_rdma_ctxs;
+		max_num_sg = rdma_rw_fr_page_list_len(dev, true);
 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
 		nr_mrs = attr->cap.max_rdma_ctxs;
+		max_num_sg = rdma_rw_fr_page_list_len(dev, false);
 	}
 
 	if (nr_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
 				IB_MR_TYPE_MEM_REG,
-				rdma_rw_fr_page_list_len(dev), 0);
+				max_num_sg, 0);
 		if (ret) {
 			pr_err("%s: failed to allocated %d MRs\n",
 				__func__, nr_mrs);
@@ -729,7 +712,7 @@  int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 
 	if (nr_sig_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
-				IB_MR_TYPE_SIGNATURE, 2, 0);
+				IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
 		if (ret) {
 			pr_err("%s: failed to allocated %d SIG MRs\n",
 				__func__, nr_sig_mrs);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index ffef4ac152ca..8d8c6d64291f 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -1677,7 +1677,7 @@  isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 
 	isert_dbg("Cmd %p\n", isert_cmd);
 
-	ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
+	ret = isert_check_pi_status(cmd, isert_cmd->rw.reg->mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 
 	if (ret) {
@@ -1723,7 +1723,7 @@  isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 	iscsit_stop_dataout_timer(cmd);
 
 	if (isert_prot_cmd(isert_conn, se_cmd))
-		ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr);
+		ret = isert_check_pi_status(se_cmd, isert_cmd->rw.reg->mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 	cmd->write_data_done = 0;
 
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index a3cbbc7b6417..bcb221241b5d 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -47,15 +47,6 @@  struct rdma_rw_ctx {
 			struct ib_send_wr	inv_wr;
 			struct ib_mr		*mr;
 		} *reg;
-
-		struct {
-			struct rdma_rw_reg_ctx	data;
-			struct rdma_rw_reg_ctx	prot;
-			struct ib_send_wr	sig_inv_wr;
-			struct ib_mr		*sig_mr;
-			struct ib_sge		sig_sge;
-			struct ib_sig_handover_wr sig_wr;
-		} *sig;
 	};
 };