Message ID | 20230620145338.1300897-11-dhowells@redhat.com (mailing list archive)
---|---
State | New
Series | splice, net: Switch over users of sendpage() and remove it
One comment: format for title in nvme-tcp is: "nvme-tcp: ..." for host
patches, and "nvmet-tcp: ..." for target patches. But this can be fixed
up when applying the patch set.

Other than that, for both nvme patches:
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

What tree will this be going from btw?

On 6/20/23 17:53, David Howells wrote:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> Tested-by: Sagi Grimberg <sagi@grimberg.me>
> Acked-by: Willem de Bruijn <willemb@google.com>
> cc: Keith Busch <kbusch@kernel.org>
> cc: Jens Axboe <axboe@fb.com>
> cc: Christoph Hellwig <hch@lst.de>
> cc: Chaitanya Kulkarni <kch@nvidia.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: linux-nvme@lists.infradead.org
> cc: netdev@vger.kernel.org
> ---
>
> Notes:
>     ver #2)
>      - Wrap lines at 80.
>
>     ver #3)
>      - Split nvme/host from nvme/target changes.
>
>  drivers/nvme/host/tcp.c | 46 +++++++++++++++++++++--------------------
>  1 file changed, 24 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index bf0230442d57..6f31cdbb696a 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -997,25 +997,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>  	u32 h2cdata_left = req->h2cdata_left;
>
>  	while (true) {
> +		struct bio_vec bvec;
> +		struct msghdr msg = {
> +			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
> +		};
>  		struct page *page = nvme_tcp_req_cur_page(req);
>  		size_t offset = nvme_tcp_req_cur_offset(req);
>  		size_t len = nvme_tcp_req_cur_length(req);
>  		bool last = nvme_tcp_pdu_last_send(req, len);
>  		int req_data_sent = req->data_sent;
> -		int ret, flags = MSG_DONTWAIT;
> +		int ret;
>
>  		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
> -			flags |= MSG_EOR;
> +			msg.msg_flags |= MSG_EOR;
>  		else
> -			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +			msg.msg_flags |= MSG_MORE;
>
> -		if (sendpage_ok(page)) {
> -			ret = kernel_sendpage(queue->sock, page, offset, len,
> -					flags);
> -		} else {
> -			ret = sock_no_sendpage(queue->sock, page, offset, len,
> -					flags);
> -		}
> +		bvec_set_page(&bvec, page, len, offset);
> +		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +		ret = sock_sendmsg(queue->sock, &msg);
>  		if (ret <= 0)
>  			return ret;
>
> @@ -1054,22 +1054,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>  {
>  	struct nvme_tcp_queue *queue = req->queue;
>  	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
> +	struct bio_vec bvec;
> +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
>  	bool inline_data = nvme_tcp_has_inline_data(req);
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>  	int len = sizeof(*pdu) + hdgst - req->offset;
> -	int flags = MSG_DONTWAIT;
>  	int ret;
>
>  	if (inline_data || nvme_tcp_queue_more(queue))
> -		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +		msg.msg_flags |= MSG_MORE;
>  	else
> -		flags |= MSG_EOR;
> +		msg.msg_flags |= MSG_EOR;
>
>  	if (queue->hdr_digest && !req->offset)
>  		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len, flags);
> +	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +	ret = sock_sendmsg(queue->sock, &msg);
>  	if (unlikely(ret <= 0))
>  		return ret;
>
> @@ -1093,6 +1095,8 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
>  {
>  	struct nvme_tcp_queue *queue = req->queue;
>  	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
> +	struct bio_vec bvec;
> +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>  	int len = sizeof(*pdu) - req->offset + hdgst;
>  	int ret;
> @@ -1101,13 +1105,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
>  		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>
>  	if (!req->h2cdata_left)
> -		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -				offset_in_page(pdu) + req->offset, len,
> -				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
> -	else
> -		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
> -				offset_in_page(pdu) + req->offset, len,
> -				MSG_DONTWAIT | MSG_MORE);
> +		msg.msg_flags |= MSG_SPLICE_PAGES;
> +
> +	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> +	ret = sock_sendmsg(queue->sock, &msg);
>  	if (unlikely(ret <= 0))
>  		return ret;
>
Sagi Grimberg <sagi@grimberg.me> wrote:
> What tree will this be going from btw?
It's aimed at net-next, as mentioned in the subject line.
Thanks,
David
>> What tree will this be going from btw?
>
> It's aimed at net-next, as mentioned in the subject line.

ok, thanks.
Hi David,

David Howells <dhowells@redhat.com> writes:
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.

This series makes my kernel crash.

From the current net-next main branch:

commit 9ae440b8fdd6772b6c007fa3d3766530a09c9045 (HEAD)
Merge: b545a13ca9b2 b848b26c6672
Author: Jakub Kicinski <kuba@kernel.org>
Date:   Sat Jun 24 15:50:21 2023 -0700

    Merge branch 'splice-net-switch-over-users-of-sendpage-and-remove-it'

Steps to reproduce:

* connect a remote nvme null block device (nvmet) with 1 IO queue to keep
  things simple
* open /dev/nvme0n1 with O_RDWR|O_DIRECT|O_SYNC
* write() a 8k buffer or 4k buffer

Trace:

[ 311.766163] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 311.768136] #PF: supervisor read access in kernel mode
[ 311.769327] #PF: error_code(0x0000) - not-present page
[ 311.770393] PGD 148988067 P4D 0
[ 311.771074] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 311.771978] CPU: 0 PID: 180 Comm: kworker/0:1H Not tainted 6.4.0-rc7+ #27
[ 311.773380] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
[ 311.774808] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp]
[ 311.775547] RIP: 0010:skb_splice_from_iter+0xf1/0x370
[ 311.776176] Code: 8b 45 88 4d 89 fa 4d 89 e7 45 89 ec 44 89 e3 41 83 c4 01 83 fb 07 0f 87 56 02 00 00 48 8b 5c dd 90 41 bd 00 10 00 00 49 29 c5 <48> 8b 53 08 4d 39 f5 4d 0f 47 ee f6 c2 01 0f 85 c7 01 00 00 66 90
[ 311.778472] RSP: 0018:ff633e24c0747b08 EFLAGS: 00010206
[ 311.779115] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
[ 311.780007] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff633e24c0747d30
[ 311.780861] RBP: ff633e24c0747bb0 R08: ff633e24c0747d40 R09: 000000006db29140
[ 311.781748] R10: ff3001bd00a22800 R11: 0000000008000000 R12: 0000000000000001
[ 311.782631] R13: 0000000000001000 R14: 0000000000001000 R15: 0000000000000000
[ 311.783506] FS:  0000000000000000(0000) GS:ff3001be77800000(0000) knlGS:0000000000000000
[ 311.784494] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 311.785197] CR2: 0000000000000008 CR3: 0000000107f5c001 CR4: 0000000000771ef0
[ 311.786076] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 311.786948] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 311.787822] PKRU: 55555554
[ 311.788165] Call Trace:
[ 311.788480]  <TASK>
[ 311.788756]  ? show_regs+0x6e/0x80
[ 311.789189]  ? __die+0x29/0x70
[ 311.789577]  ? page_fault_oops+0x154/0x4a0
[ 311.790097]  ? ip_output+0x7c/0x110
[ 311.790541]  ? __sys_socketpair+0x1b4/0x280
[ 311.791065]  ? __pfx_ip_finish_output+0x10/0x10
[ 311.791640]  ? do_user_addr_fault+0x360/0x770
[ 311.792184]  ? exc_page_fault+0x7d/0x190
[ 311.792677]  ? asm_exc_page_fault+0x2b/0x30
[ 311.793198]  ? skb_splice_from_iter+0xf1/0x370
[ 311.793748]  ? skb_splice_from_iter+0xb7/0x370
[ 311.794312]  ? __sk_mem_schedule+0x34/0x50
[ 311.794824]  tcp_sendmsg_locked+0x3a6/0xdd0
[ 311.795344]  ? tcp_push+0x10c/0x120
[ 311.795789]  tcp_sendmsg+0x31/0x50
[ 311.796213]  inet_sendmsg+0x47/0x80
[ 311.796655]  sock_sendmsg+0x99/0xb0
[ 311.797095]  ? inet_sendmsg+0x47/0x80
[ 311.797557]  nvme_tcp_try_send_data+0x149/0x490 [nvme_tcp]
[ 311.798242]  ? kvm_clock_get_cycles+0xd/0x20
[ 311.799181]  nvme_tcp_try_send+0x1b7/0x300 [nvme_tcp]
[ 311.800133]  nvme_tcp_io_work+0x40/0xc0 [nvme_tcp]
[ 311.801044]  process_one_work+0x21c/0x430
[ 311.801847]  worker_thread+0x54/0x3e0
[ 311.802611]  ? __pfx_worker_thread+0x10/0x10
[ 311.803433]  kthread+0xf8/0x130
[ 311.804116]  ? __pfx_kthread+0x10/0x10
[ 311.804865]  ret_from_fork+0x29/0x50
[ 311.805596]  </TASK>
[ 311.806165] Modules linked in: mlx5_ib ib_uverbs ib_core nvme_tcp mlx5_core mlxfw psample pci_hyperv_intf rpcsec_gss_krb5 nfsv3 auth_rpcgss nfs_acl nfsv4 nfs lockd grace fscache netfs nvme_fabrics nvme_core nvme_common intel_rapl_msr intel_rapl_common intel_uncore_frequency_common nfit kvm_intel kvm rapl input_leds serio_raw sunrpc binfmt_misc qemu_fw_cfg sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr ramoops reed_solomon efi_pstore virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear hid_generic usbhid hid qxl drm_ttm_helper ttm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel drm_kms_helper sha512_ssse3 syscopyarea sysfillrect sysimgblt aesni_intel crypto_simd i2c_i801 ahci cryptd psmouse drm virtio_net i2c_smbus libahci lpc_ich net_failover xhci_pci virtio_blk failover xhci_pci_renesas [last unloaded: ib_core]
[ 311.818698] CR2: 0000000000000008
[ 311.819437] ---[ end trace 0000000000000000 ]---

Cheers,
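The reproducer described above is just a direct, synchronous write to the block device. For reference, here is a minimal userspace sketch of those steps (the device path and the O_RDWR|O_DIRECT|O_SYNC flags are taken from the report; the buffer alignment and error handling are assumptions):

```c
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd;

	/* O_DIRECT wants an aligned buffer; write 8k as in the report. */
	if (posix_memalign(&buf, 4096, 8192))
		return 1;
	memset(buf, 0x5a, 8192);

	fd = open("/dev/nvme0n1", O_RDWR | O_DIRECT | O_SYNC);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* With the broken kernel, the host oopses while sending this write. */
	if (write(fd, buf, 8192) < 0)
		perror("write");

	close(fd);
	free(buf);
	return 0;
}
```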
> Hi David,
>
> David Howells <dhowells@redhat.com> writes:
>> When transmitting data, call down into TCP using a single sendmsg with
>> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
>> performing several sendmsg and sendpage calls to transmit header, data
>> pages and trailer.
>
> This series makes my kernel crash.
>
> From the current net-next main branch:
>
> commit 9ae440b8fdd6772b6c007fa3d3766530a09c9045 (HEAD)
> Merge: b545a13ca9b2 b848b26c6672
> Author: Jakub Kicinski <kuba@kernel.org>
> Date:   Sat Jun 24 15:50:21 2023 -0700
>
>     Merge branch 'splice-net-switch-over-users-of-sendpage-and-remove-it'
>
> Steps to reproduce:
>
> * connect a remote nvme null block device (nvmet) with 1 IO queue to keep
>   things simple
> * open /dev/nvme0n1 with O_RDWR|O_DIRECT|O_SYNC
> * write() a 8k buffer or 4k buffer

Most likely this also reproduces with blktests?
https://github.com/osandov/blktests

simple way to check is to run:
	nvme_trtype=tcp ./check nvme

This runs nvme tests over nvme-tcp. No need for network, disk or anything.
It runs both nvme and nvmet over the lo device.
Sagi Grimberg <sagi@grimberg.me> writes:
> Most likely this also reproduces with blktests?
> https://github.com/osandov/blktests
>
> simple way to check is to run:
> nvme_trtype=tcp ./check nvme
>
> This runs nvme tests over nvme-tcp.

Yes, it crashes similarly during test 10:

root@ktest:~/blktests# nvme_trtype=tcp ./check nvme
nvme/002 (create many subsystems and test discovery)           [not run]
    nvme_trtype=tcp is not supported in this test
nvme/003 (test if we're sending keep-alives to a discovery controller) [passed]
    runtime  ...  10.389s
nvme/004 (test nvme and nvmet UUID NS descriptors)             [passed]
    runtime  ...  1.264s
nvme/005 (reset local loopback target)                         [passed]
    runtime  ...  1.403s
nvme/006 (create an NVMeOF target with a block device-backed ns) [passed]
    runtime  ...  0.129s
nvme/007 (create an NVMeOF target with a file-backed ns)       [passed]
    runtime  ...  0.083s
nvme/008 (create an NVMeOF host with a block device-backed ns) [passed]
    runtime  ...  1.239s
nvme/009 (create an NVMeOF host with a file-backed ns)         [passed]
    runtime  ...  1.229s
nvme/010 (run data verification fio job on NVMeOF block device-backed ns)

Same trace, null ptr deref at skb_splice_from_iter+0x10a/0x370

Cheers,
Sagi Grimberg <sagi@grimberg.me> wrote:
> simple way to check is to run:
> nvme_trtype=tcp ./check nvme

It says a lot of:

nvme/002 (create many subsystems and test discovery)           [not run]
    nvme is not available
    nvme_trtype=tcp is not supported in this test
nvme/003 (test if we're sending keep-alives to a discovery controller) [not run]
    nvme is not available
nvme/004 (test nvme and nvmet UUID NS descriptors)             [not run]
    nvme is not available
nvme/005 (reset local loopback target)                         [not run]
    nvme is not available
...

I have the following NVMe config:

# NVME Support
CONFIG_NVME_COMMON=y
CONFIG_NVME_CORE=y
CONFIG_BLK_DEV_NVME=y
CONFIG_NVME_MULTIPATH=y
# CONFIG_NVME_VERBOSE_ERRORS is not set
# CONFIG_NVME_HWMON is not set
CONFIG_NVME_FABRICS=y
# CONFIG_NVME_RDMA is not set
# CONFIG_NVME_FC is not set
CONFIG_NVME_TCP=y
CONFIG_NVME_AUTH=y
CONFIG_NVME_TARGET=y
CONFIG_NVME_TARGET_PASSTHRU=y
CONFIG_NVME_TARGET_LOOP=y
# CONFIG_NVME_TARGET_RDMA is not set
# CONFIG_NVME_TARGET_FC is not set
CONFIG_NVME_TARGET_TCP=y
CONFIG_NVME_TARGET_AUTH=y
# end of NVME Support

CONFIG_RTC_NVMEM=y
CONFIG_NVMEM=y
# CONFIG_NVMEM_SYSFS is not set
# CONFIG_NVMEM_LAYOUT_SL28_VPD is not set
# CONFIG_NVMEM_LAYOUT_ONIE_TLV is not set
# CONFIG_NVMEM_RMEM is not set

Can you tell me what I'm missing?

David
> Sagi Grimberg <sagi@grimberg.me> wrote:
>
>> simple way to check is to run:
>> nvme_trtype=tcp ./check nvme
>
> It says a lot of:
>
> nvme/002 (create many subsystems and test discovery)           [not run]
>     nvme is not available
>     nvme_trtype=tcp is not supported in this test
> nvme/003 (test if we're sending keep-alives to a discovery controller) [not run]
>     nvme is not available
> nvme/004 (test nvme and nvmet UUID NS descriptors)             [not run]
>     nvme is not available
> nvme/005 (reset local loopback target)                         [not run]
>     nvme is not available
> ...
>
> I have the following NVMe config:
>
> # NVME Support
> CONFIG_NVME_COMMON=y
> CONFIG_NVME_CORE=y
> CONFIG_BLK_DEV_NVME=y
> CONFIG_NVME_MULTIPATH=y
> # CONFIG_NVME_VERBOSE_ERRORS is not set
> # CONFIG_NVME_HWMON is not set
> CONFIG_NVME_FABRICS=y
> # CONFIG_NVME_RDMA is not set
> # CONFIG_NVME_FC is not set
> CONFIG_NVME_TCP=y
> CONFIG_NVME_AUTH=y
> CONFIG_NVME_TARGET=y
> CONFIG_NVME_TARGET_PASSTHRU=y
> CONFIG_NVME_TARGET_LOOP=y
> # CONFIG_NVME_TARGET_RDMA is not set
> # CONFIG_NVME_TARGET_FC is not set
> CONFIG_NVME_TARGET_TCP=y
> CONFIG_NVME_TARGET_AUTH=y
> # end of NVME Support
>
> CONFIG_RTC_NVMEM=y
> CONFIG_NVMEM=y
> # CONFIG_NVMEM_SYSFS is not set
> # CONFIG_NVMEM_LAYOUT_SL28_VPD is not set
> # CONFIG_NVMEM_LAYOUT_ONIE_TLV is not set
> # CONFIG_NVMEM_RMEM is not set
>
> Can you tell me what I'm missing?

install nvme-cli.

nvme/010 is enough to reproduce I think:
	nvme_trtype=tcp ./check nvme/010
Meh:

 	if (!sendpage_ok(page))
-		msg.msg_flags &= ~MSG_SPLICE_PAGES,
+		msg.msg_flags &= ~MSG_SPLICE_PAGES;

 	bvec_set_page(&bvec, page, len, offset);

David
On Thu, 29 Jun 2023 22:34:59 +0100 David Howells wrote:
> 	if (!sendpage_ok(page))
> -		msg.msg_flags &= ~MSG_SPLICE_PAGES,
> +		msg.msg_flags &= ~MSG_SPLICE_PAGES;

Let me CC llvm@ in case someone's there is willing to make
the compiler warn about this.
On Thu, Jun 29, 2023 at 04:43:18PM -0700, Jakub Kicinski wrote:
> On Thu, 29 Jun 2023 22:34:59 +0100 David Howells wrote:
> > 	if (!sendpage_ok(page))
> > -		msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > +		msg.msg_flags &= ~MSG_SPLICE_PAGES;
>
> Let me CC llvm@ in case someone's there is willing to make
> the compiler warn about this.

Turns out clang already has a warning for this, -Wcomma:

drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
 1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
      |                                                   ^
drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
 1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
      |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                 (void)(                           )
1 error generated.

Let me do some wider build testing to see if it is viable to turn this
on for the whole kernel because it seems worth it, at least in this
case. There are a lot of cases where a warning won't be emitted (see the
original upstream review for a list: https://reviews.llvm.org/D3976) but
something is better than nothing, right? :)
On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > Let me CC llvm@ in case someone's there is willing to make
> > the compiler warn about this.
>
> Turns out clang already has a warning for this, -Wcomma:
>
> drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
>  1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
>       |                                                   ^
> drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
>  1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
>       |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>       |                 (void)(                           )
> 1 error generated.
>
> Let me do some wider build testing to see if it is viable to turn this
> on for the whole kernel because it seems worth it, at least in this
> case. There are a lot of cases where a warning won't be emitted (see the
> original upstream review for a list: https://reviews.llvm.org/D3976) but
> something is better than nothing, right? :)

Ah, neat. Misleading indentation is another possible angle, I reckon,
but not sure if that's enabled/possible to enable for the entire kernel
either :( We test-build with W=1 in networking, FWIW, so W=1 would be
enough for us.
On Fri, Jun 30, 2023 at 09:14:42AM -0700, Jakub Kicinski wrote:
> On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > > Let me CC llvm@ in case someone's there is willing to make
> > > the compiler warn about this.
> >
> > Turns out clang already has a warning for this, -Wcomma:
> >
> > drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
> >  1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
> >       |                                                   ^
> > drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
> >  1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
> >       |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >       |                 (void)(                           )
> > 1 error generated.
> >
> > Let me do some wider build testing to see if it is viable to turn this
> > on for the whole kernel because it seems worth it, at least in this
> > case. There are a lot of cases where a warning won't be emitted (see the
> > original upstream review for a list: https://reviews.llvm.org/D3976) but
> > something is better than nothing, right? :)

Well, that was a pipe dream :/ In ARCH=arm multi_v7_defconfig alone,
there are 289 unique instances of the warning (although a good number
have multiple instances per line, so it is not quite as bad as it seems,
but still bad):

$ rg -- -Wcomma arm-multi_v7_defconfig.log | sort | uniq -c | wc -l
289

https://gist.github.com/nathanchance/907867e0a7adffc877fd39fd08853801

Probably not a good sign of the signal to noise ratio, I looked through
a good handful and all the cases I saw were not interesting... Perhaps
the warning could be tuned further to become useful for the kernel but
in its current form, it is definitely a no-go :/

> Ah, neat. Misleading indentation is another possible angle, I reckon,
> but not sure if that's enabled/possible to enable for the entire kernel

Yeah, I was surprised there was no warning for misleading indentation...
it is a part of -Wall for both clang and GCC, so it is on for the
kernel, it just appears not to trigger in this case.

> either :( We test-build with W=1 in networking, FWIW, so W=1 would be
> enough for us.

Unfortunately, even in its current form, it is way too noisy for W=1, as
the qualifier for W=1 is "do not occur too often". Probably could be
placed under W=2 but it still has the problem of wading through every
instance and it is basically a no-op because nobody tests with W=2.

Cheers,
Nathan
On Fri, Jun 30, 2023 at 12:28 PM Nathan Chancellor <nathan@kernel.org> wrote:
>
> On Fri, Jun 30, 2023 at 09:14:42AM -0700, Jakub Kicinski wrote:
> > On Fri, 30 Jun 2023 09:10:43 -0700 Nathan Chancellor wrote:
> > > > Let me CC llvm@ in case someone's there is willing to make
> > > > the compiler warn about this.
> > >
> > > Turns out clang already has a warning for this, -Wcomma:
> > >
> > > drivers/nvme/host/tcp.c:1017:38: error: possible misuse of comma operator here [-Werror,-Wcomma]
> > >  1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > >       |                                                   ^
> > > drivers/nvme/host/tcp.c:1017:4: note: cast expression to void to silence warning
> > >  1017 |                 msg.msg_flags &= ~MSG_SPLICE_PAGES,
> > >       |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > >       |                 (void)(                           )
> > > 1 error generated.
> > >
> > > Let me do some wider build testing to see if it is viable to turn this
> > > on for the whole kernel because it seems worth it, at least in this
> > > case. There are a lot of cases where a warning won't be emitted (see the
> > > original upstream review for a list: https://reviews.llvm.org/D3976) but
> > > something is better than nothing, right? :)
>
> Well, that was a pipe dream :/ In ARCH=arm multi_v7_defconfig alone,
> there are 289 unique instances of the warning (although a good number
> have multiple instances per line, so it is not quite as bad as it seems,
> but still bad):
>
> $ rg -- -Wcomma arm-multi_v7_defconfig.log | sort | uniq -c | wc -l
> 289
>
> https://gist.github.com/nathanchance/907867e0a7adffc877fd39fd08853801

It's definitely interesting to take a look at some of these cases. Some
are pretty funny IMO.

> Probably not a good sign of the signal to noise ratio, I looked through
> a good handful and all the cases I saw were not interesting... Perhaps
> the warning could be tuned further to become useful for the kernel but
> in its current form, it is definitely a no-go :/
>
> > Ah, neat. Misleading indentation is another possible angle, I reckon,
> > but not sure if that's enabled/possible to enable for the entire kernel
>
> Yeah, I was surprised there was no warning for misleading indentation...
> it is a part of -Wall for both clang and GCC, so it is on for the
> kernel, it just appears not to trigger in this case.
>
> > either :( We test-build with W=1 in networking, FWIW, so W=1 would be
> > enough for us.
>
> Unfortunately, even in its current form, it is way too noisy for W=1, as
> the qualifier for W=1 is "do not occur too often". Probably could be
> placed under W=2 but it still has the problem of wading through every
> instance and it is basically a no-op because nobody tests with W=2.
>
> Cheers,
> Nathan
>
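To make the hazard -Wcomma flags concrete: the trailing comma folds the next statement into the if body, so it only executes on that branch. A small self-contained illustration using hypothetical names, not the driver code:

```c
#include <stdio.h>

#define SPLICE_FLAG 0x1

static void prep_buffer(int *buf)
{
	*buf = 42;
}

int main(void)
{
	int flags = SPLICE_FLAG, buf = 0, page_ok = 1;

	/*
	 * Buggy form: the comma operator makes prep_buffer() part of the
	 * if body, so it is silently skipped whenever page_ok is true.
	 */
	if (!page_ok)
		flags &= ~SPLICE_FLAG,

	prep_buffer(&buf);

	printf("with comma:     flags=%#x buf=%d\n", flags, buf);	/* buf == 0 */

	/* Intended form: the semicolon ends the if body, so prep_buffer() always runs. */
	flags = SPLICE_FLAG;
	buf = 0;
	if (!page_ok)
		flags &= ~SPLICE_FLAG;

	prep_buffer(&buf);

	printf("with semicolon: flags=%#x buf=%d\n", flags, buf);	/* buf == 42 */
	return 0;
}
```

Building the first form with clang and -Wcomma should produce the same "possible misuse of comma operator" diagnostic quoted above.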
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..6f31cdbb696a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -997,25 +997,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 	u32 h2cdata_left = req->h2cdata_left;
 
 	while (true) {
+		struct bio_vec bvec;
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = nvme_tcp_req_cur_page(req);
 		size_t offset = nvme_tcp_req_cur_offset(req);
 		size_t len = nvme_tcp_req_cur_length(req);
 		bool last = nvme_tcp_pdu_last_send(req, len);
 		int req_data_sent = req->data_sent;
-		int ret, flags = MSG_DONTWAIT;
+		int ret;
 
 		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
-			flags |= MSG_EOR;
+			msg.msg_flags |= MSG_EOR;
 		else
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		if (sendpage_ok(page)) {
-			ret = kernel_sendpage(queue->sock, page, offset, len,
-					flags);
-		} else {
-			ret = sock_no_sendpage(queue->sock, page, offset, len,
-					flags);
-		}
+		bvec_set_page(&bvec, page, len, offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+		ret = sock_sendmsg(queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -1054,22 +1054,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (inline_data || nvme_tcp_queue_more(queue))
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len, flags);
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1093,6 +1095,8 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) - req->offset + hdgst;
 	int ret;
@@ -1101,13 +1105,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
 	if (!req->h2cdata_left)
-		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
-	else
-		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE);
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
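Taken together with the follow-up above, the upshot of the thread is that MSG_SPLICE_PAGES may only be left set for pages that pass sendpage_ok(); otherwise the flag is cleared so sock_sendmsg() copies the data instead of splicing it by reference. A condensed sketch of that pattern as a standalone helper (the helper name and signature are illustrative, not the actual nvme-tcp code):

```c
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/bvec.h>
#include <linux/uio.h>

/*
 * Send one page fragment over @sock, splicing it by reference when the
 * page is safe to splice and falling back to an ordinary copying
 * sendmsg when it is not (e.g. slab memory).
 */
static int send_page_fragment(struct socket *sock, struct page *page,
			      size_t offset, size_t len, bool more)
{
	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
	};

	msg.msg_flags |= more ? MSG_MORE : MSG_EOR;

	/* Note the semicolon -- see the -Wcomma discussion above. */
	if (!sendpage_ok(page))
		msg.msg_flags &= ~MSG_SPLICE_PAGES;

	bvec_set_page(&bvec, page, len, offset);
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
	return sock_sendmsg(sock, &msg);
}
```

In the driver itself the same check simply sits in the nvme_tcp_try_send_data() loop ahead of bvec_set_page(), as in David's snippet earlier in the thread.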