diff mbox series

[net-next,v2,1/2] net/smc: send cdc msg inline if qp has sufficient inline space

Message ID 20220514102739.41252-2-guangguan.wang@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series net/smc: send and write inline optimization for smc | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 90 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Guangguan Wang May 14, 2022, 10:27 a.m. UTC
As a cdc msg is 44B long, cdc msgs can be sent inline on
most rdma devices, which helps reduce send latency.

In my test environment, which consists of 2 VMs running on the same
physical host and whose NICs (ConnectX-4Lx) are working in
SR-IOV mode, qperf shows a 0.4us-0.7us improvement in latency.

Test command:
server: smc_run taskset -c 1 qperf
client: smc_run taskset -c 1 qperf <server ip> -oo \
		msg_size:1:2K:*2 -t 30 -vu tcp_lat

The results are shown below:
msgsize     before       after
1B          11.9 us      11.2 us (-0.7 us)
2B          11.7 us      11.2 us (-0.5 us)
4B          11.7 us      11.3 us (-0.4 us)
8B          11.6 us      11.2 us (-0.4 us)
16B         11.7 us      11.3 us (-0.4 us)
32B         11.7 us      11.3 us (-0.4 us)
64B         11.7 us      11.2 us (-0.5 us)
128B        11.6 us      11.2 us (-0.4 us)
256B        11.8 us      11.2 us (-0.6 us)
512B        11.8 us      11.4 us (-0.4 us)
1KB         11.9 us      11.4 us (-0.5 us)
2KB         12.1 us      11.5 us (-0.6 us)

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>
---
 net/smc/smc_ib.c | 1 +
 net/smc/smc_wr.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

Comments

Tony Lu May 15, 2022, 5:10 p.m. UTC | #1
On Sat, May 14, 2022 at 06:27:38PM +0800, Guangguan Wang wrote:
> As cdc msg's length is 44B, cdc msgs can be sent inline in
> most rdma devices, which can help reducing sending latency.
> 
> In my test environment, which are 2 VMs running on the same
> physical host and whose NICs(ConnectX-4Lx) are working on
> SR-IOV mode, qperf shows 0.4us-0.7us improvement in latency.
> 
> Test command:
> server: smc_run taskset -c 1 qperf
> client: smc_run taskset -c 1 qperf <server ip> -oo \
> 		msg_size:1:2K:*2 -t 30 -vu tcp_lat
> 
> The results shown below:
> msgsize     before       after
> 1B          11.9 us      11.2 us (-0.7 us)
> 2B          11.7 us      11.2 us (-0.5 us)
> 4B          11.7 us      11.3 us (-0.4 us)
> 8B          11.6 us      11.2 us (-0.4 us)
> 16B         11.7 us      11.3 us (-0.4 us)
> 32B         11.7 us      11.3 us (-0.4 us)
> 64B         11.7 us      11.2 us (-0.5 us)
> 128B        11.6 us      11.2 us (-0.4 us)
> 256B        11.8 us      11.2 us (-0.6 us)
> 512B        11.8 us      11.4 us (-0.4 us)
> 1KB         11.9 us      11.4 us (-0.5 us)
> 2KB         12.1 us      11.5 us (-0.6 us)
> 
> Reported-by: kernel test robot <lkp@intel.com>

You don't need to add this tag; it credits whoever found the issue.
A Tested-by tag would be reasonable.

> Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>

Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>

Thanks,
Tony Lu

> ---
>  net/smc/smc_ib.c | 1 +
>  net/smc/smc_wr.c | 5 ++++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
> index a3e2d3b89568..dcda4165d107 100644
> --- a/net/smc/smc_ib.c
> +++ b/net/smc/smc_ib.c
> @@ -671,6 +671,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
>  			.max_recv_wr = SMC_WR_BUF_CNT * 3,
>  			.max_send_sge = SMC_IB_MAX_SEND_SGE,
>  			.max_recv_sge = sges_per_buf,
> +			.max_inline_data = 0,
>  		},
>  		.sq_sig_type = IB_SIGNAL_REQ_WR,
>  		.qp_type = IB_QPT_RC,
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 24be1d03fef9..26f8f240d9e8 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
>  static void smc_wr_init_sge(struct smc_link *lnk)
>  {
>  	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
> +	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
>  	u32 i;
>  
>  	for (i = 0; i < lnk->wr_tx_cnt; i++) {
> -		lnk->wr_tx_sges[i].addr =
> +		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
>  			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
>  		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
>  		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
> @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>  		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
>  		lnk->wr_tx_ibs[i].send_flags =
>  			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
> +		if (send_inline)
> +			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
>  		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
>  		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
>  		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
> -- 
> 2.24.3 (Apple Git-128)
diff mbox series

Patch

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a3e2d3b89568..dcda4165d107 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -671,6 +671,7 @@  int smc_ib_create_queue_pair(struct smc_link *lnk)
 			.max_recv_wr = SMC_WR_BUF_CNT * 3,
 			.max_send_sge = SMC_IB_MAX_SEND_SGE,
 			.max_recv_sge = sges_per_buf,
+			.max_inline_data = 0,
 		},
 		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type = IB_QPT_RC,
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 24be1d03fef9..26f8f240d9e8 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -554,10 +554,11 @@  void smc_wr_remember_qp_attr(struct smc_link *lnk)
 static void smc_wr_init_sge(struct smc_link *lnk)
 {
 	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
+	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
 	u32 i;
 
 	for (i = 0; i < lnk->wr_tx_cnt; i++) {
-		lnk->wr_tx_sges[i].addr =
+		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
 			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
 		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
 		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
@@ -575,6 +576,8 @@  static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
 		lnk->wr_tx_ibs[i].send_flags =
 			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		if (send_inline)
+			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =