
[RFC,v2,37/48] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()

Message ID 20230329141354.516864-38-dhowells@redhat.com (mailing list archive)
State New, archived

Commit Message

David Howells March 29, 2023, 2:13 p.m. UTC
Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
transmitting data.  For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: ceph-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---
 net/ceph/messenger_v2.c | 89 +++++++++--------------------------------
 1 file changed, 18 insertions(+), 71 deletions(-)
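
In outline, the change collapses the two old send paths (do_sendmsg() and
do_try_sendpage()) into a single sock_sendmsg() call over the connection's
bvec iterator, with MSG_SPLICE_PAGES set when the data is page-backed.  A
minimal sketch of that shape, condensed from the patch below (WARN_ON and
dout() tracing omitted):

	static int ceph_tcp_send(struct ceph_connection *con)
	{
		struct msghdr msg = {
			.msg_iter	= con->v2.out_iter,	/* expected to be ITER_BVEC */
			.msg_flags	= CEPH_MSG_FLAGS,
		};
		int ret;

		/* Ask the transport to splice the pages rather than copy them. */
		if (con->v2.out_iter_sendpage)
			msg.msg_flags |= MSG_SPLICE_PAGES;

		ret = sock_sendmsg(con->sock, &msg);
		if (ret > 0)
			iov_iter_advance(&con->v2.out_iter, ret);	/* consume what was sent */
		else if (ret == -EAGAIN)
			ret = 0;	/* socket full, caller needs to wait */
		return ret;
	}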

Comments

Xiubo Li March 30, 2023, 1:45 a.m. UTC | #1
David,

BTW, will these two patches depend on the others in this patch series?

I am planning to run a test with these two later.

Thanks

- Xiubo

On 29/03/2023 22:13, David Howells wrote:
> Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
> transmitting data.  For the moment, this can only transmit one page at a
> time because of the architecture of net/ceph/, but if
> write_partial_message_data() can be given a bvec[] at a time by the
> iteration code, this would allow pages to be sent in a batch.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Ilya Dryomov <idryomov@gmail.com>
> cc: Xiubo Li <xiubli@redhat.com>
> cc: Jeff Layton <jlayton@kernel.org>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: ceph-devel@vger.kernel.org
> cc: netdev@vger.kernel.org
> ---
>   net/ceph/messenger_v2.c | 89 +++++++++--------------------------------
>   1 file changed, 18 insertions(+), 71 deletions(-)
>
> diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
> index 301a991dc6a6..1637a0c21126 100644
> --- a/net/ceph/messenger_v2.c
> +++ b/net/ceph/messenger_v2.c
> @@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
>   	return ret;
>   }
>   
> -static int do_sendmsg(struct socket *sock, struct iov_iter *it)
> -{
> -	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> -	int ret;
> -
> -	msg.msg_iter = *it;
> -	while (iov_iter_count(it)) {
> -		ret = sock_sendmsg(sock, &msg);
> -		if (ret <= 0) {
> -			if (ret == -EAGAIN)
> -				ret = 0;
> -			return ret;
> -		}
> -
> -		iov_iter_advance(it, ret);
> -	}
> -
> -	WARN_ON(msg_data_left(&msg));
> -	return 1;
> -}
> -
> -static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
> -{
> -	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> -	struct bio_vec bv;
> -	int ret;
> -
> -	if (WARN_ON(!iov_iter_is_bvec(it)))
> -		return -EINVAL;
> -
> -	while (iov_iter_count(it)) {
> -		/* iov_iter_iovec() for ITER_BVEC */
> -		bvec_set_page(&bv, it->bvec->bv_page,
> -			      min(iov_iter_count(it),
> -				  it->bvec->bv_len - it->iov_offset),
> -			      it->bvec->bv_offset + it->iov_offset);
> -
> -		/*
> -		 * sendpage cannot properly handle pages with
> -		 * page_count == 0, we need to fall back to sendmsg if
> -		 * that's the case.
> -		 *
> -		 * Same goes for slab pages: skb_can_coalesce() allows
> -		 * coalescing neighboring slab objects into a single frag
> -		 * which triggers one of hardened usercopy checks.
> -		 */
> -		if (sendpage_ok(bv.bv_page)) {
> -			ret = sock->ops->sendpage(sock, bv.bv_page,
> -						  bv.bv_offset, bv.bv_len,
> -						  CEPH_MSG_FLAGS);
> -		} else {
> -			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
> -			ret = sock_sendmsg(sock, &msg);
> -		}
> -		if (ret <= 0) {
> -			if (ret == -EAGAIN)
> -				ret = 0;
> -			return ret;
> -		}
> -
> -		iov_iter_advance(it, ret);
> -	}
> -
> -	return 1;
> -}
> -
>   /*
>    * Write as much as possible.  The socket is expected to be corked,
>    * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
>    *
>    * Return:
> - *   1 - done, nothing (else) to write
> + *  >0 - done, nothing (else) to write
>    *   0 - socket is full, need to wait
>    *  <0 - error
>    */
>   static int ceph_tcp_send(struct ceph_connection *con)
>   {
> +	struct msghdr msg = {
> +		.msg_iter	= con->v2.out_iter,
> +		.msg_flags	= CEPH_MSG_FLAGS,
> +	};
>   	int ret;
>   
> +	if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> +		return -EINVAL;
> +
> +	if (con->v2.out_iter_sendpage)
> +		msg.msg_flags |= MSG_SPLICE_PAGES;
> +
>   	dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
>   	     iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
> -	if (con->v2.out_iter_sendpage)
> -		ret = do_try_sendpage(con->sock, &con->v2.out_iter);
> -	else
> -		ret = do_sendmsg(con->sock, &con->v2.out_iter);
> +
> +	ret = sock_sendmsg(con->sock, &msg);
> +	if (ret > 0)
> +		iov_iter_advance(&con->v2.out_iter, ret);
> +	else if (ret == -EAGAIN)
> +		ret = 0;
> +
>   	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
>   	     iov_iter_count(&con->v2.out_iter));
>   	return ret;
>
David Howells March 30, 2023, 6:48 a.m. UTC | #2
Xiubo Li <xiubli@redhat.com> wrote:

> BTW, will these two patches depend on the others in this patch series?

Yes.  You'll need patches that affect TCP at least so that TCP supports
MSG_SPLICE_PAGES, so 04-08 and perhaps 09.  It's also on top of the patches
that remove ITER_PIPE on my iov-extract branch, but I don't think that should
affect you.

David
Xiubo Li March 31, 2023, 1:05 p.m. UTC | #3
On 3/30/23 02:48, David Howells wrote:
> Xiubo Li <xiubli@redhat.com> wrote:
>
>> BTW, will these two patches depend on the others in this patch series?
> Yes.  You'll need patches that affect TCP at least so that TCP supports
> MSG_SPLICE_PAGES, so 04-08 and perhaps 09.  It's also on top of the patches
> that remove ITER_PIPE on my iov-extract branch, but I don't think that should
> affect you.

Okay, I will check that.

Thanks.


> David
>
Xiubo Li April 3, 2023, 3:27 a.m. UTC | #4
On 3/30/23 14:48, David Howells wrote:
> Xiubo Li <xiubli@redhat.com> wrote:
>
>> BTW, will these two patches depend on the others in this patch series?
> Yes.  You'll need patches that affect TCP at least so that TCP supports
> MSG_SPLICE_PAGES, so 04-08 and perhaps 09.  It's also on top of the patches
> that remove ITER_PIPE on my iov-extract branch, but I don't think that should
> affect you.

I asked because I could only see these two ceph-relevant patches currently.

Thanks

- Xiubo


> David
>
David Howells April 3, 2023, 8:32 a.m. UTC | #5
Xiubo Li <xiubli@redhat.com> wrote:

> On 3/30/23 14:48, David Howells wrote:
> > Xiubo Li <xiubli@redhat.com> wrote:
> >
> >> BTW, will these two patches depend on the others in this patch series?
> > Yes.  You'll need patches that affect TCP at least so that TCP supports
> > MSG_SPLICE_PAGES, so 04-08 and perhaps 09.  It's also on top of the
> > patches that remove ITER_PIPE on my iov-extract branch, but I don't think
> > that should affect you.
> 
> I asked because I could only see these two ceph-relevant patches currently.

Depends on how you define 'relevant', I guess.  Only two patches modify ceph
directly, but there's a dependency: to make those work, TCP needs altering
also.

David
Xiubo Li April 10, 2023, 12:38 a.m. UTC | #6
On 4/3/23 16:32, David Howells wrote:
> Xiubo Li <xiubli@redhat.com> wrote:
>
>> On 3/30/23 14:48, David Howells wrote:
>>> Xiubo Li <xiubli@redhat.com> wrote:
>>>
>>>> BTW, will these two patches depend on the others in this patch series?
>>> Yes.  You'll need patches that affect TCP at least so that TCP supports
>>> MSG_SPLICE_PAGES, so 04-08 and perhaps 09.  It's also on top of the
>>> patches that remove ITER_PIPE on my iov-extract branch, but I don't think
>>> that should affect you.
>> I asked because I could only see these two ceph-relevant patches currently.
> Depends on how you define 'relevant', I guess.  Only two patches modify ceph
> directly, but there's a dependency: to make those work, TCP needs altering
> also.

Okay. Will run the test today. I was reinstalling my laptop last week.

Thanks

- Xiubo

> David
>

Patch

diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 301a991dc6a6..1637a0c21126 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -117,91 +117,38 @@  static int ceph_tcp_recv(struct ceph_connection *con)
 	return ret;
 }
 
-static int do_sendmsg(struct socket *sock, struct iov_iter *it)
-{
-	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
-	int ret;
-
-	msg.msg_iter = *it;
-	while (iov_iter_count(it)) {
-		ret = sock_sendmsg(sock, &msg);
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				ret = 0;
-			return ret;
-		}
-
-		iov_iter_advance(it, ret);
-	}
-
-	WARN_ON(msg_data_left(&msg));
-	return 1;
-}
-
-static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
-{
-	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
-	struct bio_vec bv;
-	int ret;
-
-	if (WARN_ON(!iov_iter_is_bvec(it)))
-		return -EINVAL;
-
-	while (iov_iter_count(it)) {
-		/* iov_iter_iovec() for ITER_BVEC */
-		bvec_set_page(&bv, it->bvec->bv_page,
-			      min(iov_iter_count(it),
-				  it->bvec->bv_len - it->iov_offset),
-			      it->bvec->bv_offset + it->iov_offset);
-
-		/*
-		 * sendpage cannot properly handle pages with
-		 * page_count == 0, we need to fall back to sendmsg if
-		 * that's the case.
-		 *
-		 * Same goes for slab pages: skb_can_coalesce() allows
-		 * coalescing neighboring slab objects into a single frag
-		 * which triggers one of hardened usercopy checks.
-		 */
-		if (sendpage_ok(bv.bv_page)) {
-			ret = sock->ops->sendpage(sock, bv.bv_page,
-						  bv.bv_offset, bv.bv_len,
-						  CEPH_MSG_FLAGS);
-		} else {
-			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-			ret = sock_sendmsg(sock, &msg);
-		}
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				ret = 0;
-			return ret;
-		}
-
-		iov_iter_advance(it, ret);
-	}
-
-	return 1;
-}
-
 /*
  * Write as much as possible.  The socket is expected to be corked,
  * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
  *
  * Return:
- *   1 - done, nothing (else) to write
+ *  >0 - done, nothing (else) to write
  *   0 - socket is full, need to wait
  *  <0 - error
  */
 static int ceph_tcp_send(struct ceph_connection *con)
 {
+	struct msghdr msg = {
+		.msg_iter	= con->v2.out_iter,
+		.msg_flags	= CEPH_MSG_FLAGS,
+	};
 	int ret;
 
+	if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
+		return -EINVAL;
+
+	if (con->v2.out_iter_sendpage)
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
 	dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
 	     iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
-	if (con->v2.out_iter_sendpage)
-		ret = do_try_sendpage(con->sock, &con->v2.out_iter);
-	else
-		ret = do_sendmsg(con->sock, &con->v2.out_iter);
+
+	ret = sock_sendmsg(con->sock, &msg);
+	if (ret > 0)
+		iov_iter_advance(&con->v2.out_iter, ret);
+	else if (ret == -EAGAIN)
+		ret = 0;
+
 	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
 	     iov_iter_count(&con->v2.out_iter));
 	return ret;