Message ID: 20180416150046.22630-1-sergeygo@mellanox.com (mailing list archive)
State: Rejected
Delegated to: Doug Ledford
Jason/Doug/Leon, can you please apply this patch ? Sergey made the needed changes. -Max. On 4/16/2018 6:00 PM, Sergey Gorenko wrote: > Some users complain about RNR errors on the target, when heavy > high-priority tasks run on the initiator. After the > investigation, we found out that the receive WRs were exhausted, > because the initiator could not post them on time. > > Receive work reqeusts are posted in chunks of min_posted_rx to > reduce the number of hits to the HCA. The WRs are posted in the > receive completion handler when the number of free receive buffers > reaches min_posted_rx. But on a high-loaded host, receive CQEs > processing can be delayed and all receive WRs will be exhausted. > In this case, the target will get an RNR error. > > To avoid this, we post receive WR, as soon as at least one receive > buffer is freed. This increases the number of hits to the HCA, but > test results show that performance degradation is not significant. > > Performance results running fio (8 jobs, 64 iodepth) using ramdisk > (w/w.o patch): > > bs IOPS(randread) IOPS(randwrite) > ------ --------------- --------------- > 512 329.4K / 340.3K 379.1K / 387.7K > 1K 333.4K / 337.6K 364.2K / 370.0K > 2K 321.1K / 323.5K 334.2K / 337.8K > 4K 300.7K / 302.9K 290.2K / 291.6K > 8K 235.9K / 237.1K 228.2K / 228.8K > 16K 176.3K / 177.0K 126.3K / 125.9K > 32K 115.2K / 115.4K 82.2K / 82.0K > 64K 70.7K / 70.7K 47.8K / 47.6K > 128K 38.5K / 38.6K 25.8K / 25.7K > 256K 20.7K / 20.7K 13.7K / 13.6K > 512K 10.0K / 10.0K 7.0K / 7.0K > > Signed-off-by: Sergey Gorenko <sergeygo@mellanox.com> > Signed-off-by: Vladimir Neyelov <vladimirn@mellanox.com> > Reviewed-by: Max Gurtovoy <maxg@mellanox.com> > --- > Changes from v2: > - Propagate return code in iser_post_rx_bufs() as suggested by Max. > Changes from v1: > - Follow Leon's suggestion, the changelog is placed after "---" and > after all Signed-off-by/Reviewed-by. > Changes from v0: > - Follow Sagi's suggestion, iser_post_rx_bufs() is refactored. Removed > unnecessary variable and centralized the exit from the function. > --- > drivers/infiniband/ulp/iser/iscsi_iser.h | 14 +----- > drivers/infiniband/ulp/iser/iser_initiator.c | 64 ++++++++++++---------------- > drivers/infiniband/ulp/iser/iser_verbs.c | 39 +++++------------ > 3 files changed, 40 insertions(+), 77 deletions(-) > > diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h > index c1ae4aeae2f9..925996ddcf9f 100644 > --- a/drivers/infiniband/ulp/iser/iscsi_iser.h > +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h > @@ -123,8 +123,6 @@ > > #define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) > > -#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) > - > /* the max TX (send) WR supported by the iSER QP is defined by * > * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * > * to have at max for SCSI command. 
The tx posting & completion handling code * > @@ -452,9 +450,7 @@ struct iser_fr_pool { > * > * @cma_id: rdma_cm connection maneger handle > * @qp: Connection Queue-pair > - * @post_recv_buf_count: post receive counter > * @sig_count: send work request signal count > - * @rx_wr: receive work request for batch posts > * @device: reference to iser device > * @comp: iser completion context > * @fr_pool: connection fast registration poool > @@ -463,9 +459,7 @@ struct iser_fr_pool { > struct ib_conn { > struct rdma_cm_id *cma_id; > struct ib_qp *qp; > - int post_recv_buf_count; > u8 sig_count; > - struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; > struct iser_device *device; > struct iser_comp *comp; > struct iser_fr_pool fr_pool; > @@ -482,8 +476,6 @@ struct ib_conn { > * @state: connection logical state > * @qp_max_recv_dtos: maximum number of data outs, corresponds > * to max number of post recvs > - * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) > - * @min_posted_rx: (qp_max_recv_dtos >> 2) > * @max_cmds: maximum cmds allowed for this connection > * @name: connection peer portal > * @release_work: deffered work for release job > @@ -494,7 +486,6 @@ struct ib_conn { > * (state is ISER_CONN_UP) > * @conn_list: entry in ig conn list > * @login_desc: login descriptor > - * @rx_desc_head: head of rx_descs cyclic buffer > * @rx_descs: rx buffers array (cyclic buffer) > * @num_rx_descs: number of rx descriptors > * @scsi_sg_tablesize: scsi host sg_tablesize > @@ -505,8 +496,6 @@ struct iser_conn { > struct iscsi_endpoint *ep; > enum iser_conn_state state; > unsigned qp_max_recv_dtos; > - unsigned qp_max_recv_dtos_mask; > - unsigned min_posted_rx; > u16 max_cmds; > char name[ISER_OBJECT_NAME_SIZE]; > struct work_struct release_work; > @@ -516,7 +505,6 @@ struct iser_conn { > struct completion up_completion; > struct list_head conn_list; > struct iser_login_desc login_desc; > - unsigned int rx_desc_head; > struct iser_rx_desc *rx_descs; > u32 num_rx_descs; > unsigned short scsi_sg_tablesize; > @@ -638,7 +626,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, > enum iser_data_dir cmd_dir); > > int iser_post_recvl(struct iser_conn *iser_conn); > -int iser_post_recvm(struct iser_conn *iser_conn, int count); > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc); > int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, > bool signal); > > diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c > index df49c4eb67f7..5fc648f9afe1 100644 > --- a/drivers/infiniband/ulp/iser/iser_initiator.c > +++ b/drivers/infiniband/ulp/iser/iser_initiator.c > @@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, > struct iser_device *device = ib_conn->device; > > iser_conn->qp_max_recv_dtos = session->cmds_max; > - iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ > - iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; > > if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, > iser_conn->scsi_sg_tablesize)) > @@ -279,7 +277,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, > rx_sg->lkey = device->pd->local_dma_lkey; > } > > - iser_conn->rx_desc_head = 0; > return 0; > > rx_desc_dma_map_failed: > @@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn) > static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) > { > struct iser_conn *iser_conn = conn->dd_data; > - struct ib_conn 
*ib_conn = &iser_conn->ib_conn; > struct iscsi_session *session = conn->session; > + int err = 0; > + int i; > > iser_dbg("req op %x flags %x\n", req->opcode, req->flags); > /* check if this is the last login - going to full feature phase */ > if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) > - return 0; > - > - /* > - * Check that there is one posted recv buffer > - * (for the last login response). > - */ > - WARN_ON(ib_conn->post_recv_buf_count != 1); > + goto out; > > if (session->discovery_sess) { > iser_info("Discovery session, re-using login RX buffer\n"); > - return 0; > - } else > - iser_info("Normal session, posting batch of RX %d buffers\n", > - iser_conn->min_posted_rx); > + goto out; > + } > > - /* Initial post receive buffers */ > - if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) > - return -ENOMEM; > + iser_info("Normal session, posting batch of RX %d buffers\n", > + iser_conn->qp_max_recv_dtos - 1); > > - return 0; > + /* > + * Initial post receive buffers. > + * There is one already posted recv buffer (for the last login > + * response). Therefore, the first recv buffer is skipped here. > + */ > + for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) { > + err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]); > + if (err) > + goto out; > + } > +out: > + return err; > } > > static inline bool iser_signal_comp(u8 sig_count) > @@ -585,7 +585,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) > desc->rsp_dma, ISER_RX_LOGIN_SIZE, > DMA_FROM_DEVICE); > > - ib_conn->post_recv_buf_count--; > + if (iser_conn->iscsi_conn->session->discovery_sess) > + return; > + > + /* Post the first RX buffer that is skipped in iser_post_rx_bufs() */ > + iser_post_recvm(iser_conn, iser_conn->rx_descs); > } > > static inline void > @@ -645,8 +649,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) > struct iser_conn *iser_conn = to_iser_conn(ib_conn); > struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); > struct iscsi_hdr *hdr; > - int length; > - int outstanding, count, err; > + int length, err; > > if (unlikely(wc->status != IB_WC_SUCCESS)) { > iser_err_comp(wc, "task_rsp"); > @@ -675,20 +678,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) > desc->dma_addr, ISER_RX_PAYLOAD_SIZE, > DMA_FROM_DEVICE); > > - /* decrementing conn->post_recv_buf_count only --after-- freeing the * > - * task eliminates the need to worry on tasks which are completed in * > - * parallel to the execution of iser_conn_term. 
So the code that waits * > - * for the posted rx bufs refcount to become zero handles everything */ > - ib_conn->post_recv_buf_count--; > - > - outstanding = ib_conn->post_recv_buf_count; > - if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { > - count = min(iser_conn->qp_max_recv_dtos - outstanding, > - iser_conn->min_posted_rx); > - err = iser_post_recvm(iser_conn, count); > - if (err) > - iser_err("posting %d rx bufs err %d\n", count, err); > - } > + err = iser_post_recvm(iser_conn, desc); > + if (err) > + iser_err("posting rx buffer err %d\n", err); > } > > void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) > diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c > index 56b7240a3fc3..2019500a2425 100644 > --- a/drivers/infiniband/ulp/iser/iser_verbs.c > +++ b/drivers/infiniband/ulp/iser/iser_verbs.c > @@ -936,7 +936,6 @@ void iser_conn_init(struct iser_conn *iser_conn) > INIT_LIST_HEAD(&iser_conn->conn_list); > mutex_init(&iser_conn->state_mutex); > > - ib_conn->post_recv_buf_count = 0; > ib_conn->reg_cqe.done = iser_reg_comp; > } > > @@ -1020,44 +1019,28 @@ int iser_post_recvl(struct iser_conn *iser_conn) > wr.num_sge = 1; > wr.next = NULL; > > - ib_conn->post_recv_buf_count++; > ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); > - if (ib_ret) { > + if (ib_ret) > iser_err("ib_post_recv failed ret=%d\n", ib_ret); > - ib_conn->post_recv_buf_count--; > - } > > return ib_ret; > } > > -int iser_post_recvm(struct iser_conn *iser_conn, int count) > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc) > { > struct ib_conn *ib_conn = &iser_conn->ib_conn; > - unsigned int my_rx_head = iser_conn->rx_desc_head; > - struct iser_rx_desc *rx_desc; > - struct ib_recv_wr *wr, *wr_failed; > - int i, ib_ret; > - > - for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { > - rx_desc = &iser_conn->rx_descs[my_rx_head]; > - rx_desc->cqe.done = iser_task_rsp; > - wr->wr_cqe = &rx_desc->cqe; > - wr->sg_list = &rx_desc->rx_sg; > - wr->num_sge = 1; > - wr->next = wr + 1; > - my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; > - } > + struct ib_recv_wr wr, *wr_failed; > + int ib_ret; > > - wr--; > - wr->next = NULL; /* mark end of work requests list */ > + rx_desc->cqe.done = iser_task_rsp; > + wr.wr_cqe = &rx_desc->cqe; > + wr.sg_list = &rx_desc->rx_sg; > + wr.num_sge = 1; > + wr.next = NULL; > > - ib_conn->post_recv_buf_count += count; > - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); > - if (ib_ret) { > + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); > + if (ib_ret) > iser_err("ib_post_recv failed ret=%d\n", ib_ret); > - ib_conn->post_recv_buf_count -= count; > - } else > - iser_conn->rx_desc_head = my_rx_head; > > return ib_ret; > } > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
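The core of the change, condensed: instead of letting free receive buffers accumulate until min_posted_rx of them can be posted as one chained batch, the completion handler now reposts the single buffer it has just consumed. A minimal sketch of that pattern (illustration only, not the driver source; the *_example name is made up and error handling is reduced to the return code):

/*
 * Repost exactly one receive WR for the buffer that just completed,
 * rather than waiting for a batch of min_posted_rx buffers to build up.
 * Called from the receive completion path once the buffer is free again.
 */
static int repost_one_rx_example(struct ib_qp *qp, struct iser_rx_desc *rx_desc)
{
        struct ib_recv_wr wr, *bad_wr;

        rx_desc->cqe.done = iser_task_rsp;      /* route the CQE back to the rx handler */
        wr.wr_cqe  = &rx_desc->cqe;
        wr.sg_list = &rx_desc->rx_sg;           /* the single buffer being recycled */
        wr.num_sge = 1;
        wr.next    = NULL;                      /* one WR per post, no chaining */

        return ib_post_recv(qp, &wr, &bad_wr);  /* non-zero: the QP did not accept the WR */
}

The trade-off discussed in the rest of the thread is exactly this: one doorbell per buffer instead of one per min_posted_rx buffers, in exchange for never letting the receive queue run empty while CQE processing is delayed.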
Acked-by: Sagi Grimberg <sagi@grimberg.me>
On Mon, 2018-04-30 at 13:31 +0300, Max Gurtovoy wrote: > Jason/Doug/Leon, > can you please apply this patch ? > Sergey made the needed changes. > > -Max. > > On 4/16/2018 6:00 PM, Sergey Gorenko wrote: > > Some users complain about RNR errors on the target, when heavy > > high-priority tasks run on the initiator. After the > > investigation, we found out that the receive WRs were exhausted, > > because the initiator could not post them on time. > > > > Receive work reqeusts are posted in chunks of min_posted_rx to > > reduce the number of hits to the HCA. The WRs are posted in the > > receive completion handler when the number of free receive buffers > > reaches min_posted_rx. But on a high-loaded host, receive CQEs > > processing can be delayed and all receive WRs will be exhausted. > > In this case, the target will get an RNR error. > > > > To avoid this, we post receive WR, as soon as at least one receive > > buffer is freed. This increases the number of hits to the HCA, but > > test results show that performance degradation is not significant. > > > > Performance results running fio (8 jobs, 64 iodepth) using ramdisk > > (w/w.o patch): > > > > bs IOPS(randread) IOPS(randwrite) > > ------ --------------- --------------- > > 512 329.4K / 340.3K 379.1K / 387.7K 3% performance drop > > 1K 333.4K / 337.6K 364.2K / 370.0K 1.4% performance drop > > 2K 321.1K / 323.5K 334.2K / 337.8K .75% performance drop I know you said the performance hit was not significant, and by the time you get to 2k reads/writes, I agree with you, but at the low end, I'm not sure you can call 3% "not significant". Is a 3% performance hit better than the transient rnr error? And is this the only solution available? > > 4K 300.7K / 302.9K 290.2K / 291.6K > > 8K 235.9K / 237.1K 228.2K / 228.8K > > 16K 176.3K / 177.0K 126.3K / 125.9K > > 32K 115.2K / 115.4K 82.2K / 82.0K > > 64K 70.7K / 70.7K 47.8K / 47.6K > > 128K 38.5K / 38.6K 25.8K / 25.7K > > 256K 20.7K / 20.7K 13.7K / 13.6K > > 512K 10.0K / 10.0K 7.0K / 7.0K > > > > Signed-off-by: Sergey Gorenko <sergeygo@mellanox.com> > > Signed-off-by: Vladimir Neyelov <vladimirn@mellanox.com> > > Reviewed-by: Max Gurtovoy <maxg@mellanox.com> > > --- > > Changes from v2: > > - Propagate return code in iser_post_rx_bufs() as suggested by Max. > > Changes from v1: > > - Follow Leon's suggestion, the changelog is placed after "---" and > > after all Signed-off-by/Reviewed-by. > > Changes from v0: > > - Follow Sagi's suggestion, iser_post_rx_bufs() is refactored. Removed > > unnecessary variable and centralized the exit from the function. > > --- > > drivers/infiniband/ulp/iser/iscsi_iser.h | 14 +----- > > drivers/infiniband/ulp/iser/iser_initiator.c | 64 ++++++++++++---------------- > > drivers/infiniband/ulp/iser/iser_verbs.c | 39 +++++------------ > > 3 files changed, 40 insertions(+), 77 deletions(-) > > > > diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h > > index c1ae4aeae2f9..925996ddcf9f 100644 > > --- a/drivers/infiniband/ulp/iser/iscsi_iser.h > > +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h > > @@ -123,8 +123,6 @@ > > > > #define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) > > > > -#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) > > - > > /* the max TX (send) WR supported by the iSER QP is defined by * > > * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * > > * to have at max for SCSI command. 
The tx posting & completion handling code * > > @@ -452,9 +450,7 @@ struct iser_fr_pool { > > * > > * @cma_id: rdma_cm connection maneger handle > > * @qp: Connection Queue-pair > > - * @post_recv_buf_count: post receive counter > > * @sig_count: send work request signal count > > - * @rx_wr: receive work request for batch posts > > * @device: reference to iser device > > * @comp: iser completion context > > * @fr_pool: connection fast registration poool > > @@ -463,9 +459,7 @@ struct iser_fr_pool { > > struct ib_conn { > > struct rdma_cm_id *cma_id; > > struct ib_qp *qp; > > - int post_recv_buf_count; > > u8 sig_count; > > - struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; > > struct iser_device *device; > > struct iser_comp *comp; > > struct iser_fr_pool fr_pool; > > @@ -482,8 +476,6 @@ struct ib_conn { > > * @state: connection logical state > > * @qp_max_recv_dtos: maximum number of data outs, corresponds > > * to max number of post recvs > > - * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) > > - * @min_posted_rx: (qp_max_recv_dtos >> 2) > > * @max_cmds: maximum cmds allowed for this connection > > * @name: connection peer portal > > * @release_work: deffered work for release job > > @@ -494,7 +486,6 @@ struct ib_conn { > > * (state is ISER_CONN_UP) > > * @conn_list: entry in ig conn list > > * @login_desc: login descriptor > > - * @rx_desc_head: head of rx_descs cyclic buffer > > * @rx_descs: rx buffers array (cyclic buffer) > > * @num_rx_descs: number of rx descriptors > > * @scsi_sg_tablesize: scsi host sg_tablesize > > @@ -505,8 +496,6 @@ struct iser_conn { > > struct iscsi_endpoint *ep; > > enum iser_conn_state state; > > unsigned qp_max_recv_dtos; > > - unsigned qp_max_recv_dtos_mask; > > - unsigned min_posted_rx; > > u16 max_cmds; > > char name[ISER_OBJECT_NAME_SIZE]; > > struct work_struct release_work; > > @@ -516,7 +505,6 @@ struct iser_conn { > > struct completion up_completion; > > struct list_head conn_list; > > struct iser_login_desc login_desc; > > - unsigned int rx_desc_head; > > struct iser_rx_desc *rx_descs; > > u32 num_rx_descs; > > unsigned short scsi_sg_tablesize; > > @@ -638,7 +626,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, > > enum iser_data_dir cmd_dir); > > > > int iser_post_recvl(struct iser_conn *iser_conn); > > -int iser_post_recvm(struct iser_conn *iser_conn, int count); > > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc); > > int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, > > bool signal); > > > > diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c > > index df49c4eb67f7..5fc648f9afe1 100644 > > --- a/drivers/infiniband/ulp/iser/iser_initiator.c > > +++ b/drivers/infiniband/ulp/iser/iser_initiator.c > > @@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, > > struct iser_device *device = ib_conn->device; > > > > iser_conn->qp_max_recv_dtos = session->cmds_max; > > - iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ > > - iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; > > > > if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, > > iser_conn->scsi_sg_tablesize)) > > @@ -279,7 +277,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, > > rx_sg->lkey = device->pd->local_dma_lkey; > > } > > > > - iser_conn->rx_desc_head = 0; > > return 0; > > > > rx_desc_dma_map_failed: > > @@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct 
iser_conn *iser_conn) > > static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) > > { > > struct iser_conn *iser_conn = conn->dd_data; > > - struct ib_conn *ib_conn = &iser_conn->ib_conn; > > struct iscsi_session *session = conn->session; > > + int err = 0; > > + int i; > > > > iser_dbg("req op %x flags %x\n", req->opcode, req->flags); > > /* check if this is the last login - going to full feature phase */ > > if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) > > - return 0; > > - > > - /* > > - * Check that there is one posted recv buffer > > - * (for the last login response). > > - */ > > - WARN_ON(ib_conn->post_recv_buf_count != 1); > > + goto out; > > > > if (session->discovery_sess) { > > iser_info("Discovery session, re-using login RX buffer\n"); > > - return 0; > > - } else > > - iser_info("Normal session, posting batch of RX %d buffers\n", > > - iser_conn->min_posted_rx); > > + goto out; > > + } > > > > - /* Initial post receive buffers */ > > - if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) > > - return -ENOMEM; > > + iser_info("Normal session, posting batch of RX %d buffers\n", > > + iser_conn->qp_max_recv_dtos - 1); > > > > - return 0; > > + /* > > + * Initial post receive buffers. > > + * There is one already posted recv buffer (for the last login > > + * response). Therefore, the first recv buffer is skipped here. > > + */ > > + for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) { > > + err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]); > > + if (err) > > + goto out; > > + } > > +out: > > + return err; > > } > > > > static inline bool iser_signal_comp(u8 sig_count) > > @@ -585,7 +585,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) > > desc->rsp_dma, ISER_RX_LOGIN_SIZE, > > DMA_FROM_DEVICE); > > > > - ib_conn->post_recv_buf_count--; > > + if (iser_conn->iscsi_conn->session->discovery_sess) > > + return; > > + > > + /* Post the first RX buffer that is skipped in iser_post_rx_bufs() */ > > + iser_post_recvm(iser_conn, iser_conn->rx_descs); > > } > > > > static inline void > > @@ -645,8 +649,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) > > struct iser_conn *iser_conn = to_iser_conn(ib_conn); > > struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); > > struct iscsi_hdr *hdr; > > - int length; > > - int outstanding, count, err; > > + int length, err; > > > > if (unlikely(wc->status != IB_WC_SUCCESS)) { > > iser_err_comp(wc, "task_rsp"); > > @@ -675,20 +678,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) > > desc->dma_addr, ISER_RX_PAYLOAD_SIZE, > > DMA_FROM_DEVICE); > > > > - /* decrementing conn->post_recv_buf_count only --after-- freeing the * > > - * task eliminates the need to worry on tasks which are completed in * > > - * parallel to the execution of iser_conn_term. 
So the code that waits * > > - * for the posted rx bufs refcount to become zero handles everything */ > > - ib_conn->post_recv_buf_count--; > > - > > - outstanding = ib_conn->post_recv_buf_count; > > - if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { > > - count = min(iser_conn->qp_max_recv_dtos - outstanding, > > - iser_conn->min_posted_rx); > > - err = iser_post_recvm(iser_conn, count); > > - if (err) > > - iser_err("posting %d rx bufs err %d\n", count, err); > > - } > > + err = iser_post_recvm(iser_conn, desc); > > + if (err) > > + iser_err("posting rx buffer err %d\n", err); > > } > > > > void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) > > diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c > > index 56b7240a3fc3..2019500a2425 100644 > > --- a/drivers/infiniband/ulp/iser/iser_verbs.c > > +++ b/drivers/infiniband/ulp/iser/iser_verbs.c > > @@ -936,7 +936,6 @@ void iser_conn_init(struct iser_conn *iser_conn) > > INIT_LIST_HEAD(&iser_conn->conn_list); > > mutex_init(&iser_conn->state_mutex); > > > > - ib_conn->post_recv_buf_count = 0; > > ib_conn->reg_cqe.done = iser_reg_comp; > > } > > > > @@ -1020,44 +1019,28 @@ int iser_post_recvl(struct iser_conn *iser_conn) > > wr.num_sge = 1; > > wr.next = NULL; > > > > - ib_conn->post_recv_buf_count++; > > ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); > > - if (ib_ret) { > > + if (ib_ret) > > iser_err("ib_post_recv failed ret=%d\n", ib_ret); > > - ib_conn->post_recv_buf_count--; > > - } > > > > return ib_ret; > > } > > > > -int iser_post_recvm(struct iser_conn *iser_conn, int count) > > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc) > > { > > struct ib_conn *ib_conn = &iser_conn->ib_conn; > > - unsigned int my_rx_head = iser_conn->rx_desc_head; > > - struct iser_rx_desc *rx_desc; > > - struct ib_recv_wr *wr, *wr_failed; > > - int i, ib_ret; > > - > > - for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { > > - rx_desc = &iser_conn->rx_descs[my_rx_head]; > > - rx_desc->cqe.done = iser_task_rsp; > > - wr->wr_cqe = &rx_desc->cqe; > > - wr->sg_list = &rx_desc->rx_sg; > > - wr->num_sge = 1; > > - wr->next = wr + 1; > > - my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; > > - } > > + struct ib_recv_wr wr, *wr_failed; > > + int ib_ret; > > > > - wr--; > > - wr->next = NULL; /* mark end of work requests list */ > > + rx_desc->cqe.done = iser_task_rsp; > > + wr.wr_cqe = &rx_desc->cqe; > > + wr.sg_list = &rx_desc->rx_sg; > > + wr.num_sge = 1; > > + wr.next = NULL; > > > > - ib_conn->post_recv_buf_count += count; > > - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); > > - if (ib_ret) { > > + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); > > + if (ib_ret) > > iser_err("ib_post_recv failed ret=%d\n", ib_ret); > > - ib_conn->post_recv_buf_count -= count; > > - } else > > - iser_conn->rx_desc_head = my_rx_head; > > > > return ib_ret; > > } > >
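For reference, Doug's percentages follow from the randread column of the table above: at bs=512, (340.3K - 329.4K) / 340.3K is roughly 3.2% (about 2.2% on the randwrite side); at 1K roughly 1.2%; at 2K under 1%. From 4K upward the with/without numbers differ by well under 1% and occasionally favor the patched driver.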
On 5/1/2018 8:04 PM, Doug Ledford wrote: > On Mon, 2018-04-30 at 13:31 +0300, Max Gurtovoy wrote: >> Jason/Doug/Leon, >> can you please apply this patch ? >> Sergey made the needed changes. >> >> -Max. >> >> On 4/16/2018 6:00 PM, Sergey Gorenko wrote: >>> Some users complain about RNR errors on the target, when heavy >>> high-priority tasks run on the initiator. After the >>> investigation, we found out that the receive WRs were exhausted, >>> because the initiator could not post them on time. >>> >>> Receive work reqeusts are posted in chunks of min_posted_rx to >>> reduce the number of hits to the HCA. The WRs are posted in the >>> receive completion handler when the number of free receive buffers >>> reaches min_posted_rx. But on a high-loaded host, receive CQEs >>> processing can be delayed and all receive WRs will be exhausted. >>> In this case, the target will get an RNR error. >>> >>> To avoid this, we post receive WR, as soon as at least one receive >>> buffer is freed. This increases the number of hits to the HCA, but >>> test results show that performance degradation is not significant. >>> >>> Performance results running fio (8 jobs, 64 iodepth) using ramdisk >>> (w/w.o patch): >>> >>> bs IOPS(randread) IOPS(randwrite) >>> ------ --------------- --------------- >>> 512 329.4K / 340.3K 379.1K / 387.7K > > 3% performance drop > >>> 1K 333.4K / 337.6K 364.2K / 370.0K > > 1.4% performance drop > >>> 2K 321.1K / 323.5K 334.2K / 337.8K > > .75% performance drop > > I know you said the performance hit was not significant, and by the time > you get to 2k reads/writes, I agree with you, but at the low end, I'm > not sure you can call 3% "not significant". > > Is a 3% performance hit better than the transient rnr error? And is > this the only solution available? This problem here is the fact that iSER is using 1 QP per session and then you see 3% hit. I guess if we run a test using 64 targets and 64 sessions (64 QPs) then the locality of the completions would have smooth this hit away. I agree that 3% hit per this scenario is not optimal. I also hope that the work that IdanB and myself did with mlx5 inline KLM/MTT will balance this one as well (Currently it should be pushed in Leon/Jason PR. We saw > x3.5 improvement for small IOs in NVME-oF) - Sergey, we can check this out in our lab. For conclusion, IMO we have few improvments in other patches/drivers that can balance this 3% hit for small IOs and we can also make optimizations for performance in the future (e.g. adaptive CQ moderation, likely/unlikely prefixes, etc..). This is exactly how NVME-oF process the recv completions and we don't complain there :). Also, we recently fixed the post send in NVME-oF to signal on each completion and this caused a hit in the performance either but we overcome it using other improvments (BTW, this signaling fix is on our plate for iSER as well). > >>> 4K 300.7K / 302.9K 290.2K / 291.6K >>> 8K 235.9K / 237.1K 228.2K / 228.8K >>> 16K 176.3K / 177.0K 126.3K / 125.9K >>> 32K 115.2K / 115.4K 82.2K / 82.0K >>> 64K 70.7K / 70.7K 47.8K / 47.6K >>> 128K 38.5K / 38.6K 25.8K / 25.7K >>> 256K 20.7K / 20.7K 13.7K / 13.6K >>> 512K 10.0K / 10.0K 7.0K / 7.0K >>> >>> Signed-off-by: Sergey Gorenko <sergeygo@mellanox.com> >>> Signed-off-by: Vladimir Neyelov <vladimirn@mellanox.com> >>> Reviewed-by: Max Gurtovoy <maxg@mellanox.com> >>> --- -Max. 
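The selective signaling Max refers to (retained by this patch through sig_count and the signal argument of iser_post_send()) means a send completion is requested only every Nth post, so the send queue is reaped lazily; NVMe-oF was changed to signal every send. A rough sketch of the every-Nth policy, with illustrative names rather than the driver's own constants:

/* Illustration only: request a signaled completion once per
 * SIGNAL_EVERY_N_EXAMPLE send posts; the name is made up for this sketch. */
#define SIGNAL_EVERY_N_EXAMPLE 32

static bool signal_this_send_example(u8 *sig_count)
{
        /* u8 wraps at 256, a multiple of 32, so the cadence stays regular */
        return ((*sig_count)++ % SIGNAL_EVERY_N_EXAMPLE) == 0;
}

/* When building the send WR:
 *      wr.send_flags = signal_this_send_example(&ib_conn->sig_count) ?
 *                      IB_SEND_SIGNALED : 0;
 */

Signaling every send removes the deferred reaping but costs a CQE per WR, which is the kind of per-operation overhead Max expects other improvements (inline KLM/MTT, adaptive CQ moderation) to absorb.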
On Wed, 2018-05-02 at 13:27 +0300, Max Gurtovoy wrote: > > > > Performance results running fio (8 jobs, 64 iodepth) using ramdisk > > > > (w/w.o patch): > > > > > > > > bs IOPS(randread) IOPS(randwrite) > > > > ------ --------------- --------------- > > > > 512 329.4K / 340.3K 379.1K / 387.7K > > > > 3% performance drop > > > > > > 1K 333.4K / 337.6K 364.2K / 370.0K > > > > 1.4% performance drop > > > > > > 2K 321.1K / 323.5K 334.2K / 337.8K > > > > .75% performance drop > > > > I know you said the performance hit was not significant, and by the time > > you get to 2k reads/writes, I agree with you, but at the low end, I'm > > not sure you can call 3% "not significant". > > > > Is a 3% performance hit better than the transient rnr error? And is > > this the only solution available? > > This problem here is the fact that iSER is using 1 QP per session and > then you see 3% hit. I guess if we run a test using 64 targets and 64 > sessions (64 QPs) then the locality of the completions would have smooth > this hit away. Yeah, but if you had 64 QPs then your pre-patch numbers would have been higher too due to greater spreading of the load, so it might have still come out at 3%. > I agree that 3% hit per this scenario is not optimal. I > also hope that the work that IdanB and myself did with mlx5 inline > KLM/MTT will balance this one as well (Currently it should be pushed in > Leon/Jason PR. We saw > x3.5 improvement for small IOs in NVME-oF) - > Sergey, we can check this out in our lab. Your talking apples to oranges here. An across the board performance hit and an adapter specific performance increase to offset it. I'm all for the performance benefit of the inlining, but it's orthogonal to the performance hit of this change.
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index c1ae4aeae2f9..925996ddcf9f 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -123,8 +123,6 @@ #define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) -#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) - /* the max TX (send) WR supported by the iSER QP is defined by * * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * * to have at max for SCSI command. The tx posting & completion handling code * @@ -452,9 +450,7 @@ struct iser_fr_pool { * * @cma_id: rdma_cm connection maneger handle * @qp: Connection Queue-pair - * @post_recv_buf_count: post receive counter * @sig_count: send work request signal count - * @rx_wr: receive work request for batch posts * @device: reference to iser device * @comp: iser completion context * @fr_pool: connection fast registration poool @@ -463,9 +459,7 @@ struct iser_fr_pool { struct ib_conn { struct rdma_cm_id *cma_id; struct ib_qp *qp; - int post_recv_buf_count; u8 sig_count; - struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; struct iser_device *device; struct iser_comp *comp; struct iser_fr_pool fr_pool; @@ -482,8 +476,6 @@ struct ib_conn { * @state: connection logical state * @qp_max_recv_dtos: maximum number of data outs, corresponds * to max number of post recvs - * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) - * @min_posted_rx: (qp_max_recv_dtos >> 2) * @max_cmds: maximum cmds allowed for this connection * @name: connection peer portal * @release_work: deffered work for release job @@ -494,7 +486,6 @@ struct ib_conn { * (state is ISER_CONN_UP) * @conn_list: entry in ig conn list * @login_desc: login descriptor - * @rx_desc_head: head of rx_descs cyclic buffer * @rx_descs: rx buffers array (cyclic buffer) * @num_rx_descs: number of rx descriptors * @scsi_sg_tablesize: scsi host sg_tablesize @@ -505,8 +496,6 @@ struct iser_conn { struct iscsi_endpoint *ep; enum iser_conn_state state; unsigned qp_max_recv_dtos; - unsigned qp_max_recv_dtos_mask; - unsigned min_posted_rx; u16 max_cmds; char name[ISER_OBJECT_NAME_SIZE]; struct work_struct release_work; @@ -516,7 +505,6 @@ struct iser_conn { struct completion up_completion; struct list_head conn_list; struct iser_login_desc login_desc; - unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; u32 num_rx_descs; unsigned short scsi_sg_tablesize; @@ -638,7 +626,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); int iser_post_recvl(struct iser_conn *iser_conn); -int iser_post_recvm(struct iser_conn *iser_conn, int count); +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc); int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, bool signal); diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index df49c4eb67f7..5fc648f9afe1 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, struct iser_device *device = ib_conn->device; iser_conn->qp_max_recv_dtos = session->cmds_max; - iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ - iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, iser_conn->scsi_sg_tablesize)) @@ -279,7 +277,6 @@ int 
iser_alloc_rx_descriptors(struct iser_conn *iser_conn, rx_sg->lkey = device->pd->local_dma_lkey; } - iser_conn->rx_desc_head = 0; return 0; rx_desc_dma_map_failed: @@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn) static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) { struct iser_conn *iser_conn = conn->dd_data; - struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iscsi_session *session = conn->session; + int err = 0; + int i; iser_dbg("req op %x flags %x\n", req->opcode, req->flags); /* check if this is the last login - going to full feature phase */ if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) - return 0; - - /* - * Check that there is one posted recv buffer - * (for the last login response). - */ - WARN_ON(ib_conn->post_recv_buf_count != 1); + goto out; if (session->discovery_sess) { iser_info("Discovery session, re-using login RX buffer\n"); - return 0; - } else - iser_info("Normal session, posting batch of RX %d buffers\n", - iser_conn->min_posted_rx); + goto out; + } - /* Initial post receive buffers */ - if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) - return -ENOMEM; + iser_info("Normal session, posting batch of RX %d buffers\n", + iser_conn->qp_max_recv_dtos - 1); - return 0; + /* + * Initial post receive buffers. + * There is one already posted recv buffer (for the last login + * response). Therefore, the first recv buffer is skipped here. + */ + for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) { + err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]); + if (err) + goto out; + } +out: + return err; } static inline bool iser_signal_comp(u8 sig_count) @@ -585,7 +585,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) desc->rsp_dma, ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - ib_conn->post_recv_buf_count--; + if (iser_conn->iscsi_conn->session->discovery_sess) + return; + + /* Post the first RX buffer that is skipped in iser_post_rx_bufs() */ + iser_post_recvm(iser_conn, iser_conn->rx_descs); } static inline void @@ -645,8 +649,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) struct iser_conn *iser_conn = to_iser_conn(ib_conn); struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); struct iscsi_hdr *hdr; - int length; - int outstanding, count, err; + int length, err; if (unlikely(wc->status != IB_WC_SUCCESS)) { iser_err_comp(wc, "task_rsp"); @@ -675,20 +678,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) desc->dma_addr, ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - /* decrementing conn->post_recv_buf_count only --after-- freeing the * - * task eliminates the need to worry on tasks which are completed in * - * parallel to the execution of iser_conn_term. 
So the code that waits * - * for the posted rx bufs refcount to become zero handles everything */ - ib_conn->post_recv_buf_count--; - - outstanding = ib_conn->post_recv_buf_count; - if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { - count = min(iser_conn->qp_max_recv_dtos - outstanding, - iser_conn->min_posted_rx); - err = iser_post_recvm(iser_conn, count); - if (err) - iser_err("posting %d rx bufs err %d\n", count, err); - } + err = iser_post_recvm(iser_conn, desc); + if (err) + iser_err("posting rx buffer err %d\n", err); } void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 56b7240a3fc3..2019500a2425 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -936,7 +936,6 @@ void iser_conn_init(struct iser_conn *iser_conn) INIT_LIST_HEAD(&iser_conn->conn_list); mutex_init(&iser_conn->state_mutex); - ib_conn->post_recv_buf_count = 0; ib_conn->reg_cqe.done = iser_reg_comp; } @@ -1020,44 +1019,28 @@ int iser_post_recvl(struct iser_conn *iser_conn) wr.num_sge = 1; wr.next = NULL; - ib_conn->post_recv_buf_count++; ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); - if (ib_ret) { + if (ib_ret) iser_err("ib_post_recv failed ret=%d\n", ib_ret); - ib_conn->post_recv_buf_count--; - } return ib_ret; } -int iser_post_recvm(struct iser_conn *iser_conn, int count) +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc) { struct ib_conn *ib_conn = &iser_conn->ib_conn; - unsigned int my_rx_head = iser_conn->rx_desc_head; - struct iser_rx_desc *rx_desc; - struct ib_recv_wr *wr, *wr_failed; - int i, ib_ret; - - for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { - rx_desc = &iser_conn->rx_descs[my_rx_head]; - rx_desc->cqe.done = iser_task_rsp; - wr->wr_cqe = &rx_desc->cqe; - wr->sg_list = &rx_desc->rx_sg; - wr->num_sge = 1; - wr->next = wr + 1; - my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; - } + struct ib_recv_wr wr, *wr_failed; + int ib_ret; - wr--; - wr->next = NULL; /* mark end of work requests list */ + rx_desc->cqe.done = iser_task_rsp; + wr.wr_cqe = &rx_desc->cqe; + wr.sg_list = &rx_desc->rx_sg; + wr.num_sge = 1; + wr.next = NULL; - ib_conn->post_recv_buf_count += count; - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); - if (ib_ret) { + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); + if (ib_ret) iser_err("ib_post_recv failed ret=%d\n", ib_ret); - ib_conn->post_recv_buf_count -= count; - } else - iser_conn->rx_desc_head = my_rx_head; return ib_ret; }