[net-next,v2,10/17] nvme: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage

Message ID 20230617121146.716077-11-dhowells@redhat.com (mailing list archive)
State New
Series [net-next,v2,01/17] net: Copy slab data for sendmsg(MSG_SPLICE_PAGES)

Commit Message

David Howells June 17, 2023, 12:11 p.m. UTC
When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather than
performing several sendmsg and sendpage calls to transmit header, data
pages and trailer.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Sagi Grimberg <sagi@grimberg.me>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-nvme@lists.infradead.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #2)
     - Wrap lines at 80.

 drivers/nvme/host/tcp.c   | 46 ++++++++++++++++++++-------------------
 drivers/nvme/target/tcp.c | 46 ++++++++++++++++++++++++---------------
 2 files changed, 53 insertions(+), 39 deletions(-)

Comments

Willem de Bruijn June 18, 2023, 4:47 p.m. UTC | #1
David Howells wrote:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Keith Busch <kbusch@kernel.org>
> cc: Jens Axboe <axboe@fb.com>
> cc: Christoph Hellwig <hch@lst.de>
> cc: Sagi Grimberg <sagi@grimberg.me>
> cc: Chaitanya Kulkarni <kch@nvidia.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: linux-nvme@lists.infradead.org
> cc: netdev@vger.kernel.org
> ---
> 
> Notes:
>     ver #2)
>      - Wrap lines at 80.
> 
>  drivers/nvme/host/tcp.c   | 46 ++++++++++++++++++++-------------------
>  drivers/nvme/target/tcp.c | 46 ++++++++++++++++++++++++---------------
>  2 files changed, 53 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index bf0230442d57..6f31cdbb696a 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -997,25 +997,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>  	u32 h2cdata_left = req->h2cdata_left;
>  
>  	while (true) {
> +		struct bio_vec bvec;
> +		struct msghdr msg = {
> +			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
> +		};
>  		struct page *page = nvme_tcp_req_cur_page(req);
>  		size_t offset = nvme_tcp_req_cur_offset(req);
>  		size_t len = nvme_tcp_req_cur_length(req);
>  		bool last = nvme_tcp_pdu_last_send(req, len);
>  		int req_data_sent = req->data_sent;
> -		int ret, flags = MSG_DONTWAIT;
> +		int ret;
>  
>  		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
> -			flags |= MSG_EOR;
> +			msg.msg_flags |= MSG_EOR;
>  		else
> -			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +			msg.msg_flags |= MSG_MORE;
>  
> -		if (sendpage_ok(page)) {
> -			ret = kernel_sendpage(queue->sock, page, offset, len,
> -					flags);
> -		} else {
> -			ret = sock_no_sendpage(queue->sock, page, offset, len,
> -					flags);
> -		}
> +		bvec_set_page(&bvec, page, len, offset);
> +		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +		ret = sock_sendmsg(queue->sock, &msg);
>  		if (ret <= 0)
>  			return ret;
>  
> @@ -1054,22 +1054,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>  {
>  	struct nvme_tcp_queue *queue = req->queue;
>  	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
> +	struct bio_vec bvec;
> +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
>  	bool inline_data = nvme_tcp_has_inline_data(req);
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>  	int len = sizeof(*pdu) + hdgst - req->offset;
> -	int flags = MSG_DONTWAIT;
>  	int ret;
>  
>  	if (inline_data || nvme_tcp_queue_more(queue))
> -		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +		msg.msg_flags |= MSG_MORE;
>  	else
> -		flags |= MSG_EOR;
> +		msg.msg_flags |= MSG_EOR;
>  
>  	if (queue->hdr_digest && !req->offset)
>  		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>  
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,  flags);
> +	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +	ret = sock_sendmsg(queue->sock, &msg);
>  	if (unlikely(ret <= 0))
>  		return ret;
>

    struct bio_vec bvec;
    struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };

    ..

    bvec_set_virt
    iov_iter_bvec
    sock_sendmsg

is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.
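
For illustration, one plausible shape for such a wrapper is sketched below.
The helper name and signature are hypothetical (nothing like it is defined in
the posted series); it simply packages the recurring bvec_set_virt +
iov_iter_bvec + sock_sendmsg sequence:

    /*
     * Hypothetical helper, not part of the series: splice a single
     * kernel-virtual buffer into a socket via sendmsg(MSG_SPLICE_PAGES).
     */
    static inline int sock_sendmsg_virt(struct socket *sock, void *buf,
                                        size_t len, unsigned int msg_flags)
    {
            struct bio_vec bvec;
            struct msghdr msg = {
                    .msg_flags = MSG_SPLICE_PAGES | msg_flags,
            };

            bvec_set_virt(&bvec, buf, len);
            iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
            return sock_sendmsg(sock, &msg);
    }

A bvec_set_page() variant would differ only in taking a page/offset pair
instead of a virtual address.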
David Howells June 18, 2023, 5:28 p.m. UTC | #2
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

>     struct bio_vec bvec;
>     struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
> 
>     ..
> 
>     bvec_set_virt
>     iov_iter_bvec
>     sock_sendmsg
> 
> is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.

I dunno.  I'm trying to move towards aggregating multiple pages in a bvec
before calling sendmsg if possible rather than doing it one page at a time,
but it's easier and more obvious in some places than others.

David
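
For concreteness, aggregation along those lines might look roughly like the
sketch below. The iterator helpers (nvme_more_to_send() and friends) are
stand-ins for the driver's real request-walking logic, not functions that
exist in the tree:

    /*
     * Sketch only: batch several pages into one bvec array and make a
     * single sendmsg(MSG_SPLICE_PAGES) call for the lot.
     */
    struct bio_vec bvecs[16];
    struct msghdr msg = {
            .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
    };
    size_t total = 0;
    int i, ret;

    for (i = 0; i < ARRAY_SIZE(bvecs) && nvme_more_to_send(req); i++) {
            bvec_set_page(&bvecs[i], nvme_next_page(req),
                          nvme_next_len(req), nvme_next_offset(req));
            total += bvecs[i].bv_len;
    }

    iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvecs, i, total);
    ret = sock_sendmsg(queue->sock, &msg);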
Sagi Grimberg June 19, 2023, 8:25 a.m. UTC | #3
>>      struct bio_vec bvec;
>>      struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
>>
>>      ..
>>
>>      bvec_set_virt
>>      iov_iter_bvec
>>      sock_sendmsg
>>
>> is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.
> 
> I dunno.  I'm trying to move towards aggregating multiple pages in a bvec
> before calling sendmsg if possible rather than doing it one page at a time,
> but it's easier and more obvious in some places than others.

That would be great to do, but nvme needs to calculate a data digest
and doing that in a separate scan of the payload is not very cache
friendly...

There is also the fact that the payload may be sent in portions 
asynchronously driven by how the controller wants to accept them,
so there is some complexity there.

But worth looking at for sure.
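
To illustrate the cache-locality point: the digest can be folded into the
per-page send loop so each page is only touched once. A rough sketch, with
the caveats that nvme-tcp actually computes its CRC32C data digest through
the crypto ahash API rather than the crc32c() library helper used here, and
that the iterator helpers are hypothetical:

    u32 ddgst = ~0;

    while (nvme_more_to_send(req)) {
            struct page *page = nvme_next_page(req);
            size_t off = nvme_next_offset(req);
            size_t len = nvme_next_len(req);
            void *p = kmap_local_page(page);

            /* Digest the chunk while it is still hot in the cache... */
            ddgst = crc32c(ddgst, p + off, len);
            kunmap_local(p);

            /* ...then splice the same page out, as in the patch. */
    }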

The patch looks good to me, taking it to run some tests
(from sendpage-3-frag branch in your kernel.org tree correct?)

For now, you can add:
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
David Howells June 19, 2023, 9:28 a.m. UTC | #4
Sagi Grimberg <sagi@grimberg.me> wrote:

> The patch looks good to me, taking it to run some tests
> (from sendpage-3-frag branch in your kernel.org tree correct?)

Yep, but you'll need patch 1 also, and patch 2 might help with seeing what's
going on.

David
Willem de Bruijn June 19, 2023, 11:46 a.m. UTC | #5
David Howells wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> 
> >     struct bio_vec bvec;
> >     struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
> > 
> >     ..
> > 
> >     bvec_set_virt
> >     iov_iter_bvec
> >     sock_sendmsg
> > 
> > is a frequent pattern. Does it make sense to define a wrapper? Same for bvec_set_page.
> 
> I dunno.  I'm trying to move towards aggregating multiple pages in a bvec
> before calling sendmsg if possible rather than doing it one page at a time,
> but it's easier and more obvious in some places than others.

Ok. I just wanted to have your opinion. Fine to leave as is.

Acked-by: Willem de Bruijn <willemb@google.com>
Sagi Grimberg June 20, 2023, 1 p.m. UTC | #6
>>>      struct bio_vec bvec;
>>>      struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | ... };
>>>
>>>      ..
>>>
>>>      bvec_set_virt
>>>      iov_iter_bvec
>>>      sock_sendmsg
>>>
>>> is a frequent pattern. Does it make sense to define a wrapper? Same 
>>> for bvec_set_page.
>>
>> I dunno.  I'm trying to move towards aggregating multiple pages in a bvec
>> before calling sendmsg if possible rather than doing it one page at a 
>> time,
>> but it's easier and more obvious in some places than others.
> 
> That would be great to do, but nvme needs to calculate a data digest
> and doing that in a separate scan of the payload is not very cache
> friendly...
> 
> There is also the fact that the payload may be sent in portions 
> asynchronously driven by how the controller wants to accept them,
> so there is some complexity there.
> 
> But worth looking at for sure.
> 
> The patch looks good to me, taking it to run some tests
> (from sendpage-3-frag branch in your kernel.org tree correct?)
> 
> For now, you can add:
> Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

Patches seem to hold up.
Tested-by: Sagi Grimberg <sagi@grimberg.me>

However, if possible, can you please split the nvme/host and nvme/target
changes? We try to separate host side and target side changes into
different patches.

Patch

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..6f31cdbb696a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -997,25 +997,25 @@  static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 	u32 h2cdata_left = req->h2cdata_left;
 
 	while (true) {
+		struct bio_vec bvec;
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = nvme_tcp_req_cur_page(req);
 		size_t offset = nvme_tcp_req_cur_offset(req);
 		size_t len = nvme_tcp_req_cur_length(req);
 		bool last = nvme_tcp_pdu_last_send(req, len);
 		int req_data_sent = req->data_sent;
-		int ret, flags = MSG_DONTWAIT;
+		int ret;
 
 		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
-			flags |= MSG_EOR;
+			msg.msg_flags |= MSG_EOR;
 		else
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		if (sendpage_ok(page)) {
-			ret = kernel_sendpage(queue->sock, page, offset, len,
-					flags);
-		} else {
-			ret = sock_no_sendpage(queue->sock, page, offset, len,
-					flags);
-		}
+		bvec_set_page(&bvec, page, len, offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+		ret = sock_sendmsg(queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -1054,22 +1054,24 @@  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (inline_data || nvme_tcp_queue_more(queue))
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len,  flags);
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1093,6 +1095,8 @@  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) - req->offset + hdgst;
 	int ret;
@@ -1101,13 +1105,11 @@  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
 	if (!req->h2cdata_left)
-		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
-	else
-		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE);
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index ed98df72c76b..868aa4de2e4c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -576,13 +576,17 @@  static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
 
 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
 {
+	struct msghdr msg = {
+		.msg_flags = MSG_DONTWAIT | MSG_MORE | MSG_SPLICE_PAGES,
+	};
+	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
 	int ret;
 
-	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
-			offset_in_page(cmd->data_pdu) + cmd->offset,
-			left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
+	bvec_set_virt(&bvec, (void *)cmd->data_pdu + cmd->offset, left);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+	ret = sock_sendmsg(cmd->queue->sock, &msg);
 	if (ret <= 0)
 		return ret;
 
@@ -603,17 +607,21 @@  static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 	int ret;
 
 	while (cmd->cur_sg) {
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = sg_page(cmd->cur_sg);
+		struct bio_vec bvec;
 		u32 left = cmd->cur_sg->length - cmd->offset;
-		int flags = MSG_DONTWAIT;
 
 		if ((!last_in_batch && cmd->queue->send_list_len) ||
 		    cmd->wbytes_done + left < cmd->req.transfer_len ||
 		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
-					left, flags);
+		bvec_set_page(&bvec, page, left, cmd->offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+		ret = sock_sendmsg(cmd->queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -649,18 +657,20 @@  static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 		bool last_in_batch)
 {
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (!last_in_batch && cmd->queue->send_list_len)
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
-	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
-		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
+	bvec_set_virt(&bvec, (void *)cmd->rsp_pdu + cmd->offset, left);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+	ret = sock_sendmsg(cmd->queue->sock, &msg);
 	if (ret <= 0)
 		return ret;
 	cmd->offset += ret;
@@ -677,18 +687,20 @@  static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 
 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 {
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (!last_in_batch && cmd->queue->send_list_len)
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
-	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
-		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
+	bvec_set_virt(&bvec, (void *)cmd->r2t_pdu + cmd->offset, left);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+	ret = sock_sendmsg(cmd->queue->sock, &msg);
 	if (ret <= 0)
 		return ret;
 	cmd->offset += ret;