[net-next,v3,10/18] nvme/host: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage

Message ID 20230620145338.1300897-11-dhowells@redhat.com (mailing list archive)
State New
Series splice, net: Switch over users of sendpage() and remove it

Commit Message

David Howells June 20, 2023, 2:53 p.m. UTC
When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather than
performing several sendmsg and sendpage calls to transmit header, data
pages and trailer.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Willem de Bruijn <willemb@google.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-nvme@lists.infradead.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #2)
     - Wrap lines at 80.
    
    ver #3)
     - Split nvme/host from nvme/target changes.

 drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 22 deletions(-)
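
In outline, the conversion replaces the per-fragment kernel_sendpage()
and sock_no_sendpage() calls with a single bvec-backed sendmsg(). A
minimal sketch of the pattern (the helper name is hypothetical; the PDU
header paths below use bvec_set_virt() on the kmalloc'd header instead
of bvec_set_page()):

#include <linux/bvec.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

static int splice_send_page(struct socket *sock, struct page *page,
			    size_t offset, size_t len,
			    unsigned int extra_flags)
{
	struct bio_vec bvec;
	struct msghdr msg = {
		/* extra_flags carries MSG_MORE or MSG_EOR as needed. */
		.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES | extra_flags,
	};

	/* Describe the fragment and aim a BVEC iterator at it. */
	bvec_set_page(&bvec, page, len, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);

	return sock_sendmsg(sock, &msg);
}

With MSG_SPLICE_PAGES set, TCP may take references on the pages and
splice them into the socket rather than copying the data, so the pages
have to be suitable for that (see sendpage_ok() and the discussion
further down this thread).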

Comments

Sagi Grimberg June 21, 2023, 10:15 a.m. UTC | #1
One comment:

The title format in nvme-tcp is:

"nvme-tcp: ..." for host patches, and
"nvmet-tcp: ..." for target patches.

But this can be fixed up when applying the patch set.

Other than that, for both nvme patches:
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

What tree will this be going from btw?

On 6/20/23 17:53, David Howells wrote:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
David Howells June 21, 2023, 12:35 p.m. UTC | #2
Sagi Grimberg <sagi@grimberg.me> wrote:

> What tree will this be going from btw?

It's aimed at net-next, as mentioned in the subject line.

Thanks,
David
Sagi Grimberg June 21, 2023, 2:05 p.m. UTC | #3
>> What tree will this be going from btw?
> 
> It's aimed at net-next, as mentioned in the subject line.

ok, thanks.
Aurelien Aptel June 29, 2023, 2:45 p.m. UTC | #4
Hi David,

David Howells <dhowells@redhat.com> writes:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.

This series makes my kernel crash.

From the current net-next main branch:

commit 9ae440b8fdd6772b6c007fa3d3766530a09c9045 (HEAD)
Merge: b545a13ca9b2 b848b26c6672
Author: Jakub Kicinski <kuba@kernel.org>
Date:   Sat Jun 24 15:50:21 2023 -0700

    Merge branch 'splice-net-switch-over-users-of-sendpage-and-remove-it'


Steps to reproduce:

* connect a remote nvme null block device (nvmet) with 1 IO queue to keep
  things simple
* open /dev/nvme0n1 with O_RDWR|O_DIRECT|O_SYNC
* write() an 8k or 4k buffer
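
A minimal userspace sketch of those steps, assuming the attached null
device shows up as /dev/nvme0n1 (the path, open flags and write size
come from the report above; the rest is scaffolding):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd;

	/* O_DIRECT needs an aligned buffer; 8k as in the report. */
	if (posix_memalign(&buf, 4096, 8192))
		return 1;
	memset(buf, 0xab, 8192);

	fd = open("/dev/nvme0n1", O_RDWR | O_DIRECT | O_SYNC);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* This write reaches nvme_tcp_try_send_data(), where the
	 * reported oops fires. */
	if (write(fd, buf, 8192) < 0)
		perror("write");

	close(fd);
	free(buf);
	return 0;
}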

Trace:

[  311.766163] BUG: kernel NULL pointer dereference, address: 0000000000000008
[  311.768136] #PF: supervisor read access in kernel mode
[  311.769327] #PF: error_code(0x0000) - not-present page
[  311.770393] PGD 148988067 P4D 0
[  311.771074] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  311.771978] CPU: 0 PID: 180 Comm: kworker/0:1H Not tainted 6.4.0-rc7+ #27
[  311.773380] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
[  311.774808] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
[  311.775547] RIP: 0010:skb_splice_from_iter+0xf1/0x370
[  311.776176] Code: 8b 45 88 4d 89 fa 4d 89 e7 45 89 ec 44 89 e3 41 83
               c4 01 83 fb 07 0f 87 56 02 00 00 48 8b 5c dd 90 41 bd 00 10 00 00 49 29
               c5 <48> 8b 53 08 4d 39 f5 4d 0f 47 ee f6 c2 01 0f 85 c7 01 00 00 66 90
[  311.778472] RSP: 0018:ff633e24c0747b08 EFLAGS: 00010206
[  311.779115] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
[  311.780007] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff633e24c0747d30
[  311.780861] RBP: ff633e24c0747bb0 R08: ff633e24c0747d40 R09: 000000006db29140
[  311.781748] R10: ff3001bd00a22800 R11: 0000000008000000 R12: 0000000000000001
[  311.782631] R13: 0000000000001000 R14: 0000000000001000 R15: 0000000000000000
[  311.783506] FS:  0000000000000000(0000) GS:ff3001be77800000(0000) knlGS:0000000000000000
[  311.784494] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  311.785197] CR2: 0000000000000008 CR3: 0000000107f5c001 CR4: 0000000000771ef0
[  311.786076] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  311.786948] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  311.787822] PKRU: 55555554
[  311.788165] Call Trace:
[  311.788480]  <TASK>
[  311.788756]  ? show_regs+0x6e/0x80
[  311.789189]  ? __die+0x29/0x70
[  311.789577]  ? page_fault_oops+0x154/0x4a0
[  311.790097]  ? ip_output+0x7c/0x110
[  311.790541]  ? __sys_socketpair+0x1b4/0x280
[  311.791065]  ? __pfx_ip_finish_output+0x10/0x10
[  311.791640]  ? do_user_addr_fault+0x360/0x770
[  311.792184]  ? exc_page_fault+0x7d/0x190
[  311.792677]  ? asm_exc_page_fault+0x2b/0x30
[  311.793198]  ? skb_splice_from_iter+0xf1/0x370
[  311.793748]  ? skb_splice_from_iter+0xb7/0x370
[  311.794312]  ? __sk_mem_schedule+0x34/0x50
[  311.794824]  tcp_sendmsg_locked+0x3a6/0xdd0
[  311.795344]  ? tcp_push+0x10c/0x120
[  311.795789]  tcp_sendmsg+0x31/0x50
[  311.796213]  inet_sendmsg+0x47/0x80
[  311.796655]  sock_sendmsg+0x99/0xb0
[  311.797095]  ? inet_sendmsg+0x47/0x80
[  311.797557]  nvme_tcp_try_send_data+0x149/0x490 [nvme_tcp]
[  311.798242]  ? kvm_clock_get_cycles+0xd/0x20
[  311.799181]  nvme_tcp_try_send+0x1b7/0x300 [nvme_tcp]
[  311.800133]  nvme_tcp_io_work+0x40/0xc0 [nvme_tcp]
[  311.801044]  process_one_work+0x21c/0x430
[  311.801847]  worker_thread+0x54/0x3e0
[  311.802611]  ? __pfx_worker_thread+0x10/0x10
[  311.803433]  kthread+0xf8/0x130
[  311.804116]  ? __pfx_kthread+0x10/0x10
[  311.804865]  ret_from_fork+0x29/0x50
[  311.805596]  </TASK>
[  311.806165] Modules linked in: mlx5_ib ib_uverbs ib_core nvme_tcp
 mlx5_core mlxfw psample pci_hyperv_intf rpcsec_gss_krb5 nfsv3
 auth_rpcgss nfs_acl nfsv4 nfs lockd grace fscache netfs nvme_fabrics
 nvme_core nvme_common intel_rapl_msr intel_rapl_common
 intel_uncore_frequency_common nfit kvm_intel kvm rapl input_leds
 serio_raw sunrpc binfmt_misc qemu_fw_cfg sch_fq_codel dm_multipath
 scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr ramoops reed_solomon
 efi_pstore virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic
 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor
 async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear
 hid_generic usbhid hid qxl drm_ttm_helper ttm crct10dif_pclmul
 crc32_pclmul ghash_clmulni_intel drm_kms_helper sha512_ssse3
 syscopyarea sysfillrect sysimgblt aesni_intel crypto_simd i2c_i801 ahci
 cryptd psmouse drm virtio_net i2c_smbus libahci lpc_ich net_failover
 xhci_pci virtio_blk failover xhci_pci_renesas [last unloaded: ib_core]
[  311.818698] CR2: 0000000000000008
[  311.819437] ---[ end trace 0000000000000000 ]---

Cheers,
Sagi Grimberg June 29, 2023, 2:49 p.m. UTC | #5
> Hi David,
> 
> David Howells <dhowells@redhat.com> writes:
>> When transmitting data, call down into TCP using a single sendmsg with
>> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
>> performing several sendmsg and sendpage calls to transmit header, data
>> pages and trailer.
> 
> This series makes my kernel crash.
> 
> From the current net-next main branch:
> 
> commit 9ae440b8fdd6772b6c007fa3d3766530a09c9045 (HEAD)
> Merge: b545a13ca9b2 b848b26c6672
> Author: Jakub Kicinski <kuba@kernel.org>
> Date:   Sat Jun 24 15:50:21 2023 -0700
> 
>      Merge branch 'splice-net-switch-over-users-of-sendpage-and-remove-it'
> 
> 
> Steps to reproduce:
> 
> * connect a remote nvme null block device (nvmet) with 1 IO queue to keep
>    things simple
> * open /dev/nvme0n1 with O_RDWR|O_DIRECT|O_SYNC
> * write() an 8k or 4k buffer

Most likely this also reproduces with blktests?
https://github.com/osandov/blktests

simple way to check is to run:
nvme_trtype=tcp ./check nvme

This runs nvme tests over nvme-tcp.

No need for network, disk or anything. It runs
both nvme and nvmet over the lo device.
Aurelien Aptel June 29, 2023, 3:02 p.m. UTC | #6
Sagi Grimberg <sagi@grimberg.me> writes:
> Most likely this also reproduces with blktests?
> https://github.com/osandov/blktests
>
> simple way to check is to run:
> nvme_trtype=tcp ./check nvme
>
> This runs nvme tests over nvme-tcp.

Yes, it crashes similarly during test 10:

root@ktest:~/blktests# nvme_trtype=tcp ./check nvme
nvme/002 (create many subsystems and test discovery)         [not run]
    nvme_trtype=tcp is not supported in this test
nvme/003 (test if we're sending keep-alives to a discovery controller) [passed]
    runtime    ...  10.389s
nvme/004 (test nvme and nvmet UUID NS descriptors)           [passed]
    runtime    ...  1.264s
nvme/005 (reset local loopback target)                       [passed]
    runtime    ...  1.403s
nvme/006 (create an NVMeOF target with a block device-backed ns) [passed]
    runtime    ...  0.129s
nvme/007 (create an NVMeOF target with a file-backed ns)     [passed]
    runtime    ...  0.083s
nvme/008 (create an NVMeOF host with a block device-backed ns) [passed]
    runtime    ...  1.239s
nvme/009 (create an NVMeOF host with a file-backed ns)       [passed]
    runtime    ...  1.229s
nvme/010 (run data verification fio job on NVMeOF block device-backed
ns)

Same trace, null ptr deref at skb_splice_from_iter+0x10a/0x370

Cheers,
David Howells June 29, 2023, 9:23 p.m. UTC | #7
Sagi Grimberg <sagi@grimberg.me> wrote:

> simple way to check is to run:
> nvme_trtype=tcp ./check nvme

It says a lot of:

nvme/002 (create many subsystems and test discovery)         [not run]
    nvme is not available
    nvme_trtype=tcp is not supported in this test
nvme/003 (test if we're sending keep-alives to a discovery controller) [not run]
    nvme is not available
nvme/004 (test nvme and nvmet UUID NS descriptors)           [not run]
    nvme is not available
nvme/005 (reset local loopback target)                       [not run]
    nvme is not available
...

I have the following NVMe config:

# NVME Support
CONFIG_NVME_COMMON=y
CONFIG_NVME_CORE=y
CONFIG_BLK_DEV_NVME=y
CONFIG_NVME_MULTIPATH=y
# CONFIG_NVME_VERBOSE_ERRORS is not set
# CONFIG_NVME_HWMON is not set
CONFIG_NVME_FABRICS=y
# CONFIG_NVME_RDMA is not set
# CONFIG_NVME_FC is not set
CONFIG_NVME_TCP=y
CONFIG_NVME_AUTH=y
CONFIG_NVME_TARGET=y
CONFIG_NVME_TARGET_PASSTHRU=y
CONFIG_NVME_TARGET_LOOP=y
# CONFIG_NVME_TARGET_RDMA is not set
# CONFIG_NVME_TARGET_FC is not set
CONFIG_NVME_TARGET_TCP=y
CONFIG_NVME_TARGET_AUTH=y
# end of NVME Support
CONFIG_RTC_NVMEM=y
CONFIG_NVMEM=y
# CONFIG_NVMEM_SYSFS is not set
# CONFIG_NVMEM_LAYOUT_SL28_VPD is not set
# CONFIG_NVMEM_LAYOUT_ONIE_TLV is not set
# CONFIG_NVMEM_RMEM is not set

Can you tell me what I'm missing?

David
Sagi Grimberg June 29, 2023, 9:33 p.m. UTC | #8
> Sagi Grimberg <sagi@grimberg.me> wrote:
> 
>> simple way to check is to run:
>> nvme_trtype=tcp ./check nvme
> 
> It says a lot of:
> 
> nvme/002 (create many subsystems and test discovery)         [not run]
>      nvme is not available
>      nvme_trtype=tcp is not supported in this test
> nvme/003 (test if we're sending keep-alives to a discovery controller) [not run]
>      nvme is not available
> nvme/004 (test nvme and nvmet UUID NS descriptors)           [not run]
>      nvme is not available
> nvme/005 (reset local loopback target)                       [not run]
>      nvme is not available
> ...
> 
> I have the following NVMe config:
> 
> # NVME Support
> CONFIG_NVME_COMMON=y
> CONFIG_NVME_CORE=y
> CONFIG_BLK_DEV_NVME=y
> CONFIG_NVME_MULTIPATH=y
> # CONFIG_NVME_VERBOSE_ERRORS is not set
> # CONFIG_NVME_HWMON is not set
> CONFIG_NVME_FABRICS=y
> # CONFIG_NVME_RDMA is not set
> # CONFIG_NVME_FC is not set
> CONFIG_NVME_TCP=y
> CONFIG_NVME_AUTH=y
> CONFIG_NVME_TARGET=y
> CONFIG_NVME_TARGET_PASSTHRU=y
> CONFIG_NVME_TARGET_LOOP=y
> # CONFIG_NVME_TARGET_RDMA is not set
> # CONFIG_NVME_TARGET_FC is not set
> CONFIG_NVME_TARGET_TCP=y
> CONFIG_NVME_TARGET_AUTH=y
> # end of NVME Support
> CONFIG_RTC_NVMEM=y
> CONFIG_NVMEM=y
> # CONFIG_NVMEM_SYSFS is not set
> # CONFIG_NVMEM_LAYOUT_SL28_VPD is not set
> # CONFIG_NVMEM_LAYOUT_ONIE_TLV is not set
> # CONFIG_NVMEM_RMEM is not set
> 
> Can you tell me what I'm missing?

install nvme-cli.

nvme/010 is enough to reproduce I think:
nvme_trtype=tcp ./check nvme/010
David Howells June 29, 2023, 9:34 p.m. UTC | #9
Meh:

                if (!sendpage_ok(page))
-                       msg.msg_flags &= ~MSG_SPLICE_PAGES,
+                       msg.msg_flags &= ~MSG_SPLICE_PAGES;
 
                bvec_set_page(&bvec, page, len, offset);

David
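
To spell out the one-character fix: the trailing comma makes the
following bvec_set_page() statement part of the if body, so the bio_vec
is only initialized when sendpage_ok() fails. For ordinary pages it is
left uninitialized, which fits the NULL dereference in
skb_splice_from_iter() reported above. A reduced, self-contained
illustration with hypothetical names:

#include <stdio.h>

int main(void)
{
	int ok = 1, flags = 0x2, bvec_ready = 0;

	if (!ok)
		flags &= ~0x2,     /* comma operator, not a semicolon... */
		bvec_ready = 1;    /* ...so this runs only when !ok */

	/* With ok == 1, bvec_ready is still 0 here - the analogue of
	 * the uninitialized bio_vec that the socket code then walked. */
	printf("flags=%#x bvec_ready=%d\n", flags, bvec_ready);
	return 0;
}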
Jakub Kicinski June 29, 2023, 11:43 p.m. UTC | #10
On Thu, 29 Jun 2023 22:34:59 +0100 David Howells wrote:
>                 if (!sendpage_ok(page))
> -                       msg.msg_flags &= ~MSG_SPLICE_PAGES,
> +                       msg.msg_flags &= ~MSG_SPLICE_PAGES;

Let me CC llvm@ in case someone there is willing to make
the compiler warn about this.
Nathan Chancellor June 30, 2023, 4:10 p.m. UTC | #11
On Thu, Jun 29, 2023 at 04:43:18PM -0700, Jakub Kicinski wrote:
> On Thu, 29 Jun 2023 22:34:59 +0100 David Howells wrote:
> >                 if (!sendpage_ok(page))
> > -                       msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > +                       msg.msg_flags &= ~MSG_SPLICE_PAGES;
> 
> Let me CC llvm@ in case someone there is willing to make
> the compiler warn about this.

Turns out clang already has a warning for this, -Wcomma:

  drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
   1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
        |                                                           ^
  drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
   1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
        |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        |                         (void)(                           )
  1 error generated.

Let me do some wider build testing to see if it is viable to turn this
on for the whole kernel because it seems worth it, at least in this
case. There are a lot of cases where a warning won't be emitted (see the
original upstream review for a list: https://reviews.llvm.org/D3976) but
something is better than nothing, right? :)
Jakub Kicinski June 30, 2023, 4:14 p.m. UTC | #12
On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > Let me CC llvm@ in case someone there is willing to make
> > the compiler warn about this.
> 
> Turns out clang already has a warning for this, -Wcomma:
> 
>   drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
>    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
>         |                                                           ^
>   drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
>    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
>         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>         |                         (void)(                           )
>   1 error generated.
> 
> Let me do some wider build testing to see if it is viable to turn this
> on for the whole kernel because it seems worth it, at least in this
> case. There are a lot of cases where a warning won't be emitted (see the
> original upstream review for a list: https://reviews.llvm.org/D3976) but
> something is better than nothing, right? :)

Ah, neat. Misleading indentation is another possible angle, I reckon,
but not sure if that's enabled/possible to enable for the entire kernel
either :( We test-build with W=1 in networking, FWIW, so W=1 would be
enough for us.
Nathan Chancellor June 30, 2023, 7:28 p.m. UTC | #13
On Fri, Jun 30, 2023 at 09:14:42AM -0700, Jakub Kicinski wrote:
> On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > > Let me CC llvm@ in case someone there is willing to make
> > > the compiler warn about this.
> > 
> > Turns out clang already has a warning for this, -Wcomma:
> > 
> >   drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
> >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> >         |                                                           ^
> >   drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
> >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> >         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >         |                         (void)(                           )
> >   1 error generated.
> > 
> > Let me do some wider build testing to see if it is viable to turn this
> > on for the whole kernel because it seems worth it, at least in this
> > case. There are a lot of cases where a warning won't be emitted (see the
> > original upstream review for a list: https://reviews.llvm.org/D3976) but
> > something is better than nothing, right? :)

Well, that was a pipe dream :/ In ARCH=arm multi_v7_defconfig alone,
there are 289 unique instances of the warning (although a good number
have multiple instances per line, so it is not quite as bad as it seems,
but still bad):

$ rg -- -Wcomma arm-multi_v7_defconfig.log | sort | uniq -c | wc -l
289

https://gist.github.com/nathanchance/907867e0a7adffc877fd39fd08853801

Probably not a good sign of the signal to noise ratio, I looked through
a good handful and all the cases I saw were not interesting... Perhaps
the warning could be tuned further to become useful for the kernel but
in its current form, it is definitely a no-go :/

> Ah, neat. Misleading indentation is another possible angle, I reckon,
> but not sure if that's enabled/possible to enable for the entire kernel

Yeah, I was surprised there was no warning for misleading indentation...
it is a part of -Wall for both clang and GCC, so it is on for the
kernel, it just appears not to trigger in this case.

> either :( We test-build with W=1 in networking, FWIW, so W=1 would be
> enough for us.

Unfortunately, even in its current form, it is way too noisy for W=1, as
the qualifier for W=1 is "do not occur too often". Probably could be
placed under W=2 but it still has the problem of wading through every
instance and it is basically a no-op because nobody tests with W=2.

Cheers,
Nathan
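
For reference, the silencing idiom that the note in the -Wcomma
diagnostic suggests is a void cast of the discarded left-hand
expression, along these lines (illustrative, not from the thread):

/* An intentional comma operator: the void cast tells clang that
 * throwing away the left-hand value is deliberate, so -Wcomma
 * stays quiet. */
static int reset_and_fail(int *state)
{
	return (void)(*state = 0), -1;
}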
Nick Desaulniers July 7, 2023, 8:45 p.m. UTC | #14
On Fri, Jun 30, 2023 at 12:28 PM Nathan Chancellor <nathan@kernel.org> wrote:
>
> On Fri, Jun 30, 2023 at 09:14:42AM -0700, Jakub Kicinski wrote:
> > On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > > > Let me CC llvm@ in case someone there is willing to make
> > > > the compiler warn about this.
> > >
> > > Turns out clang already has a warning for this, -Wcomma:
> > >
> > >   drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
> > >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > >         |                                                           ^
> > >   drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
> > >    1017 |                         msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > >         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > >         |                         (void)(                           )
> > >   1 error generated.
> > >
> > > Let me do some wider build testing to see if it is viable to turn this
> > > on for the whole kernel because it seems worth it, at least in this
> > > case. There are a lot of cases where a warning won't be emitted (see the
> > > original upstream review for a list: https://reviews.llvm.org/D3976) but
> > > something is better than nothing, right? :)
>
> Well, that was a pipe dream :/ In ARCH=arm multi_v7_defconfig alone,
> there are 289 unique instances of the warning (although a good number
> have multiple instances per line, so it is not quite as bad as it seems,
> but still bad):
>
> $ rg -- -Wcomma arm-multi_v7_defconfig.log | sort | uniq -c | wc -l
> 289
>
> https://gist.github.com/nathanchance/907867e0a7adffc877fd39fd08853801

It's definitely interesting to take a look at some of these cases.
Some are pretty funny IMO.

>
> Probably not a good sign of the signal to noise ratio, I looked through
> a good handful and all the cases I saw were not interesting... Perhaps
> the warning could be tuned further to become useful for the kernel but
> in its current form, it is definitely a no-go :/
>
> > Ah, neat. Misleading indentation is another possible angle, I reckon,
> > but not sure if that's enabled/possible to enable for the entire kernel
>
> Yeah, I was surprised there was no warning for misleading indentation...
> it is a part of -Wall for both clang and GCC, so it is on for the
> kernel, it just appears not to trigger in this case.
>
> > either :( We test-build with W=1 in networking, FWIW, so W=1 would be
> > enough for us.
>
> Unfortunately, even in its current form, it is way too noisy for W=1, as
> the qualifier for W=1 is "do not occur too often". Probably could be
> placed under W=2 but it still has the problem of wading through every
> instance and it is basically a no-op because nobody tests with W=2.
>
> Cheers,
> Nathan
>
Patch

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..6f31cdbb696a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -997,25 +997,25 @@  static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 	u32 h2cdata_left = req->h2cdata_left;
 
 	while (true) {
+		struct bio_vec bvec;
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = nvme_tcp_req_cur_page(req);
 		size_t offset = nvme_tcp_req_cur_offset(req);
 		size_t len = nvme_tcp_req_cur_length(req);
 		bool last = nvme_tcp_pdu_last_send(req, len);
 		int req_data_sent = req->data_sent;
-		int ret, flags = MSG_DONTWAIT;
+		int ret;
 
 		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
-			flags |= MSG_EOR;
+			msg.msg_flags |= MSG_EOR;
 		else
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		if (sendpage_ok(page)) {
-			ret = kernel_sendpage(queue->sock, page, offset, len,
-					flags);
-		} else {
-			ret = sock_no_sendpage(queue->sock, page, offset, len,
-					flags);
-		}
+		bvec_set_page(&bvec, page, len, offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+		ret = sock_sendmsg(queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -1054,22 +1054,24 @@  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (inline_data || nvme_tcp_queue_more(queue))
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len,  flags);
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1093,6 +1095,8 @@  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) - req->offset + hdgst;
 	int ret;
@@ -1101,13 +1105,11 @@  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
 	if (!req->h2cdata_left)
-		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
-	else
-		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE);
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;