
[net-next,1/2] net/smc: send cdc msg inline if qp has sufficient inline space

Message ID 20220513071551.22065-2-guangguan.wang@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series net/smc: send and write inline optimization for smc

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 0 this patch: 2
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Guangguan Wang May 13, 2022, 7:15 a.m. UTC
As the cdc msg's length is 44B, cdc msgs can be sent inline on
most RDMA devices, which can help reduce send latency.

In my test environment, which consists of 2 VMs running on the same
physical host and whose NICs (ConnectX-4 Lx) work in SR-IOV mode,
qperf shows a 0.4us-0.7us improvement in latency.

Test command:
server: smc_run taskset -c 1 qperf
client: smc_run taskset -c 1 qperf <server ip> -oo \
		msg_size:1:2K:*2 -t 30 -vu tcp_lat

The results are shown below:
msgsize     before       after
1B          11.9 us      11.2 us (-0.7 us)
2B          11.7 us      11.2 us (-0.5 us)
4B          11.7 us      11.3 us (-0.4 us)
8B          11.6 us      11.2 us (-0.4 us)
16B         11.7 us      11.3 us (-0.4 us)
32B         11.7 us      11.3 us (-0.4 us)
64B         11.7 us      11.2 us (-0.5 us)
128B        11.6 us      11.2 us (-0.4 us)
256B        11.8 us      11.2 us (-0.6 us)
512B        11.8 us      11.4 us (-0.4 us)
1KB         11.9 us      11.4 us (-0.5 us)
2KB         12.1 us      11.5 us (-0.6 us)

Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>
---
 net/smc/smc_ib.c | 1 +
 net/smc/smc_wr.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

Comments

kernel test robot May 13, 2022, 10:42 a.m. UTC | #1
Hi Guangguan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Guangguan-Wang/net-smc-send-and-write-inline-optimization-for-smc/20220513-151715
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git b67fd3d9d94223b424674f45eeadeff58b4b03ef
config: nios2-allyesconfig (https://download.01.org/0day-ci/archive/20220513/202205131842.j3oh7PXI-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/bac726bf950dac20959af52c6884b7bb07772dac
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Guangguan-Wang/net-smc-send-and-write-inline-optimization-for-smc/20220513-151715
        git checkout bac726bf950dac20959af52c6884b7bb07772dac
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=nios2 SHELL=/bin/bash net/smc/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   net/smc/smc_wr.c: In function 'smc_wr_init_sge':
>> net/smc/smc_wr.c:561:57: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
     561 |                 lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
         |                                                         ^


vim +561 net/smc/smc_wr.c

   553	
   554	static void smc_wr_init_sge(struct smc_link *lnk)
   555	{
   556		int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
   557		bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
   558		u32 i;
   559	
   560		for (i = 0; i < lnk->wr_tx_cnt; i++) {
 > 561			lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
   562				lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
   563			lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
   564			lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
   565			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
   566				lnk->roce_pd->local_dma_lkey;
   567			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
   568				lnk->roce_pd->local_dma_lkey;
   569			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
   570				lnk->roce_pd->local_dma_lkey;
   571			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
   572				lnk->roce_pd->local_dma_lkey;
   573			lnk->wr_tx_ibs[i].next = NULL;
   574			lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
   575			lnk->wr_tx_ibs[i].num_sge = 1;
   576			lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
   577			lnk->wr_tx_ibs[i].send_flags =
   578				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   579			if (send_inline)
   580				lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
   581			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
   582			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
   583			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
   584				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
   585			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
   586				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
   587		}
   588	
   589		if (lnk->lgr->smc_version == SMC_V2) {
   590			lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
   591			lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
   592			lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
   593	
   594			lnk->wr_tx_v2_ib->next = NULL;
   595			lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
   596			lnk->wr_tx_v2_ib->num_sge = 1;
   597			lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
   598			lnk->wr_tx_v2_ib->send_flags =
   599				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   600		}
   601	
   602		/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
   603		 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
   604		 * and the same buffer for all sges. When a larger message arrived then
   605		 * the content of the first small sge is copied to the beginning of
   606		 * the larger spillover buffer, allowing easy data mapping.
   607		 */
   608		for (i = 0; i < lnk->wr_rx_cnt; i++) {
   609			int x = i * sges_per_buf;
   610	
   611			lnk->wr_rx_sges[x].addr =
   612				lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
   613			lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
   614			lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
   615			if (lnk->lgr->smc_version == SMC_V2) {
   616				lnk->wr_rx_sges[x + 1].addr =
   617						lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
   618				lnk->wr_rx_sges[x + 1].length =
   619						SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
   620				lnk->wr_rx_sges[x + 1].lkey =
   621						lnk->roce_pd->local_dma_lkey;
   622			}
   623			lnk->wr_rx_ibs[i].next = NULL;
   624			lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
   625			lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
   626		}
   627		lnk->wr_reg.wr.next = NULL;
   628		lnk->wr_reg.wr.num_sge = 0;
   629		lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
   630		lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
   631		lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
   632	}
   633
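
A minimal sketch of one way the 32-bit cast warning could be addressed (an
assumption on my part, not the posted patch): cast the buffer pointer through
uintptr_t before widening it to u64, so the conversion is well-defined on
32-bit targets as well:

		/* sketch only: avoid -Wpointer-to-int-cast on 32-bit builds
		 * by going through uintptr_t instead of casting straight to u64
		 */
		lnk->wr_tx_sges[i].addr = send_inline ?
			(u64)(uintptr_t)(&lnk->wr_tx_bufs[i]) :
			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;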
Leon Romanovsky May 14, 2022, 6:02 a.m. UTC | #2
On Fri, May 13, 2022 at 03:15:50PM +0800, Guangguan Wang wrote:
> As the cdc msg's length is 44B, cdc msgs can be sent inline on
> most RDMA devices, which can help reduce send latency.
> 
> In my test environment, which consists of 2 VMs running on the same
> physical host and whose NICs (ConnectX-4 Lx) work in SR-IOV mode,
> qperf shows a 0.4us-0.7us improvement in latency.
> 
> Test command:
> server: smc_run taskset -c 1 qperf
> client: smc_run taskset -c 1 qperf <server ip> -oo \
> 		msg_size:1:2K:*2 -t 30 -vu tcp_lat
> 
> The results are shown below:
> msgsize     before       after
> 1B          11.9 us      11.2 us (-0.7 us)
> 2B          11.7 us      11.2 us (-0.5 us)
> 4B          11.7 us      11.3 us (-0.4 us)
> 8B          11.6 us      11.2 us (-0.4 us)
> 16B         11.7 us      11.3 us (-0.4 us)
> 32B         11.7 us      11.3 us (-0.4 us)
> 64B         11.7 us      11.2 us (-0.5 us)
> 128B        11.6 us      11.2 us (-0.4 us)
> 256B        11.8 us      11.2 us (-0.6 us)
> 512B        11.8 us      11.4 us (-0.4 us)
> 1KB         11.9 us      11.4 us (-0.5 us)
> 2KB         12.1 us      11.5 us (-0.6 us)
> 
> Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>
> ---
>  net/smc/smc_ib.c | 1 +
>  net/smc/smc_wr.c | 5 ++++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
> index a3e2d3b89568..1dcce9e4f4ca 100644
> --- a/net/smc/smc_ib.c
> +++ b/net/smc/smc_ib.c
> @@ -671,6 +671,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
>  			.max_recv_wr = SMC_WR_BUF_CNT * 3,
>  			.max_send_sge = SMC_IB_MAX_SEND_SGE,
>  			.max_recv_sge = sges_per_buf,
> +			.max_inline_data = SMC_WR_TX_SIZE,
>  		},
>  		.sq_sig_type = IB_SIGNAL_REQ_WR,
>  		.qp_type = IB_QPT_RC,
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 24be1d03fef9..8a2f9a561197 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
>  static void smc_wr_init_sge(struct smc_link *lnk)
>  {
>  	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
> +	bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);

When will it be false? You are creating QPs with max_inline_data == SMC_WR_TX_SIZE?

>  	u32 i;
>  
>  	for (i = 0; i < lnk->wr_tx_cnt; i++) {
> -		lnk->wr_tx_sges[i].addr =
> +		lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
>  			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
>  		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
>  		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
> @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>  		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
>  		lnk->wr_tx_ibs[i].send_flags =
>  			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
> +		if (send_inline)
> +			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;

> If you try to transfer data == SMC_WR_TX_SIZE, you will get an -ENOMEM error.
IB drivers check that length < qp->max_inline_data.

Thanks

>  		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
>  		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
>  		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
> -- 
> 2.24.3 (Apple Git-128)
>
Guangguan Wang May 14, 2022, 9:36 a.m. UTC | #3
On 2022/5/14 14:02, Leon Romanovsky wrote:
>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>> index 24be1d03fef9..8a2f9a561197 100644
>> --- a/net/smc/smc_wr.c
>> +++ b/net/smc/smc_wr.c
>> @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
>>  static void smc_wr_init_sge(struct smc_link *lnk)
>>  {
>>  	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
>> +	bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
> 
> When will it be false? You are creating QPs with max_inline_data == SMC_WR_TX_SIZE?
> 
>>  	u32 i;
>>  
>>  	for (i = 0; i < lnk->wr_tx_cnt; i++) {
>> -		lnk->wr_tx_sges[i].addr =
>> +		lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
>>  			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
>>  		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
>>  		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
>> @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>  		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
>>  		lnk->wr_tx_ibs[i].send_flags =
>>  			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
>> +		if (send_inline)
>> +			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
> 
> If you try to transfer data == SMC_WR_TX_SIZE, you will get -ENOMEM error.
> IB drivers check that length < qp->max_inline_data.
> 
> Thanks
> 

Got it. 

I should create QPs with max_inline_data == 0, and get the actual max_inline_data by query_qp.
And I should use lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE to decide whether to send inline or not.

Thank you.
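
As a rough sketch of that approach (the helper name smc_wr_can_send_inline is
hypothetical, and this is not the actual follow-up patch): leave
.max_inline_data at 0 when creating the QP, read back the value the device
actually granted via ib_query_qp(), and only set IB_SEND_INLINE when that
value is strictly larger than SMC_WR_TX_SIZE:

	static bool smc_wr_can_send_inline(struct smc_link *lnk)
	{
		struct ib_qp_init_attr init_attr;
		struct ib_qp_attr attr;

		/* .max_inline_data left at 0 in smc_ib_create_queue_pair();
		 * query what the device actually granted
		 */
		if (ib_query_qp(lnk->roce_qp, &attr, IB_QP_CAP, &init_attr))
			return false;

		lnk->qp_attr.cap.max_inline_data = attr.cap.max_inline_data;
		/* strict '>' because drivers reject inline sends whose length
		 * equals max_inline_data, per the comment above
		 */
		return lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE;
	}

smc_wr_init_sge() could then use this return value as its send_inline flag.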

Patch

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a3e2d3b89568..1dcce9e4f4ca 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -671,6 +671,7 @@  int smc_ib_create_queue_pair(struct smc_link *lnk)
 			.max_recv_wr = SMC_WR_BUF_CNT * 3,
 			.max_send_sge = SMC_IB_MAX_SEND_SGE,
 			.max_recv_sge = sges_per_buf,
+			.max_inline_data = SMC_WR_TX_SIZE,
 		},
 		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type = IB_QPT_RC,
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 24be1d03fef9..8a2f9a561197 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -554,10 +554,11 @@  void smc_wr_remember_qp_attr(struct smc_link *lnk)
 static void smc_wr_init_sge(struct smc_link *lnk)
 {
 	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
+	bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
 	u32 i;
 
 	for (i = 0; i < lnk->wr_tx_cnt; i++) {
-		lnk->wr_tx_sges[i].addr =
+		lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
 			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
 		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
 		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
@@ -575,6 +576,8 @@  static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
 		lnk->wr_tx_ibs[i].send_flags =
 			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		if (send_inline)
+			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =