Message ID | 9cbf0ce5-ed79-0252-fd2d-34bebaafffa3@fb.com (mailing list archive) |
---|---
State | Not Applicable, archived |
On Fri, 2017-01-27 at 01:04 -0700, Jens Axboe wrote:
> The previous patch had a bug if you didn't use a scheduler, here's a
> version that should work fine in both cases. I've also updated the
> above mentioned branch, so feel free to pull that as well and merge to
> master like before.

Booting time is back to normal with commit f3a8ab7d55bc merged with
v4.10-rc5. That's a great improvement. However, running the srp-test
software now triggers a new complaint:

[ 215.600386] sd 11:0:0:0: [sdh] Attached SCSI disk
[ 215.609485] sd 11:0:0:0: alua: port group 00 state A non-preferred supports TOlUSNA
[ 215.722900] scsi 13:0:0:0: alua: Detached
[ 215.724452] general protection fault: 0000 [#1] SMP
[ 215.724484] Modules linked in: dm_service_time ib_srp scsi_transport_srp target_core_user uio target_core_pscsi target_core_file ib_srpt target_core_iblock target_core_mod brd netconsole xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat libcrc32c nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables af_packet ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm msr configfs ib_cm iw_cm mlx4_ib ib_core sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp ipmi_ssif coretemp kvm_intel hid_generic kvm usbhid irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel mlx4_core ghash_clmulni_intel iTCO_wdt dcdbas pcbc tg3
[ 215.724629] iTCO_vendor_support ptp aesni_intel pps_core aes_x86_64 pcspkr crypto_simd libphy ipmi_si glue_helper cryptd ipmi_devintf tpm_tis devlink fjes ipmi_msghandler tpm_tis_core tpm mei_me lpc_ich mei mfd_core button shpchp wmi mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm sr_mod cdrom ehci_pci ehci_hcd usbcore usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
[ 215.724719] CPU: 9 PID: 8043 Comm: multipathd Not tainted 4.10.0-rc5-dbg+ #1
[ 215.724748] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
[ 215.724775] task: ffff8801717998c0 task.stack: ffffc90002a9c000
[ 215.724804] RIP: 0010:scsi_device_put+0xb/0x30
[ 215.724829] RSP: 0018:ffffc90002a9faa0 EFLAGS: 00010246
[ 215.724855] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88038bf85698 RCX: 0000000000000006
[ 215.724880] RDX: 0000000000000006 RSI: ffff88017179a108 RDI: ffff88038bf85698
[ 215.724906] RBP: ffffc90002a9faa8 R08: ffff880384786008 R09: 0000000100170007
[ 215.724932] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88038bf85698
[ 215.724958] R13: ffff88038919f090 R14: dead000000000100 R15: ffff88038a41dd28
[ 215.724983] FS: 00007fbf8c6cf700(0000) GS:ffff88046f440000(0000) knlGS:0000000000000000
[ 215.725010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 215.725035] CR2: 00007f1262ef3ee0 CR3: 000000044f6cc000 CR4: 00000000001406e0
[ 215.725060] Call Trace:
[ 215.725086] scsi_disk_put+0x2d/0x40
[ 215.725110] sd_release+0x3d/0xb0
[ 215.725137] __blkdev_put+0x29e/0x360
[ 215.725163] blkdev_put+0x49/0x170
[ 215.725192] dm_put_table_device+0x58/0xc0 [dm_mod]
[ 215.725219] dm_put_device+0x70/0xc0 [dm_mod]
[ 215.725269] free_priority_group+0x92/0xc0 [dm_multipath]
[ 215.725295] free_multipath+0x70/0xc0 [dm_multipath]
[ 215.725320] multipath_dtr+0x19/0x20 [dm_multipath]
[ 215.725348] dm_table_destroy+0x67/0x120 [dm_mod]
[ 215.725379] dev_suspend+0xde/0x240 [dm_mod]
[ 215.725434] ctl_ioctl+0x1f5/0x520 [dm_mod]
[ 215.725489] dm_ctl_ioctl+0xe/0x20 [dm_mod]
[ 215.725515] do_vfs_ioctl+0x8f/0x700
[ 215.725589] SyS_ioctl+0x3c/0x70
[ 215.725614] entry_SYSCALL_64_fastpath+0x18/0xad
[ 215.725641] RIP: 0033:0x7fbf8aca0667
[ 215.725665] RSP: 002b:00007fbf8c6cd668 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 215.725692] RAX: ffffffffffffffda RBX: 0000000000000046 RCX: 00007fbf8aca0667
[ 215.725716] RDX: 00007fbf8006b940 RSI: 00000000c138fd06 RDI: 0000000000000007
[ 215.725743] RBP: 0000000000000009 R08: 00007fbf8c6cb3c0 R09: 00007fbf8b68d8d8
[ 215.725768] R10: 0000000000000075 R11: 0000000000000246 R12: 00007fbf8c6cd770
[ 215.725793] R13: 0000000000000013 R14: 00000000006168f0 R15: 0000000000f74780
[ 215.725820] Code: bc 24 b8 00 00 00 e8 55 c8 1c 00 48 83 c4 08 48 89 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 00 55 48 89 e5 53 48 8b 07 48 89 fb <48> 8b 80 a8 01 00 00 48 8b 38 e8 f6 68 c5 ff 48 8d bb 38 02 00
[ 215.725903] RIP: scsi_device_put+0xb/0x30 RSP: ffffc90002a9faa0

(gdb) list *(scsi_device_put+0xb)
0xffffffff8149fc2b is in scsi_device_put (drivers/scsi/scsi.c:957).
952	 * count of the underlying LLDD module. The device is freed once the last
953	 * user vanishes.
954	 */
955	void scsi_device_put(struct scsi_device *sdev)
956	{
957		module_put(sdev->host->hostt->module);
958		put_device(&sdev->sdev_gendev);
959	}
960	EXPORT_SYMBOL(scsi_device_put);
961
(gdb) disas scsi_device_put
Dump of assembler code for function scsi_device_put:
   0xffffffff8149fc20 <+0>:	push %rbp
   0xffffffff8149fc21 <+1>:	mov %rsp,%rbp
   0xffffffff8149fc24 <+4>:	push %rbx
   0xffffffff8149fc25 <+5>:	mov (%rdi),%rax
   0xffffffff8149fc28 <+8>:	mov %rdi,%rbx
   0xffffffff8149fc2b <+11>:	mov 0x1a8(%rax),%rax
   0xffffffff8149fc32 <+18>:	mov (%rax),%rdi
   0xffffffff8149fc35 <+21>:	callq 0xffffffff810f6530 <module_put>
   0xffffffff8149fc3a <+26>:	lea 0x238(%rbx),%rdi
   0xffffffff8149fc41 <+33>:	callq 0xffffffff814714b0 <put_device>
   0xffffffff8149fc46 <+38>:	pop %rbx
   0xffffffff8149fc47 <+39>:	pop %rbp
   0xffffffff8149fc48 <+40>:	retq
End of assembler dump.
(gdb) print &((struct Scsi_Host *)0)->hostt
$2 = (struct scsi_host_template **) 0x1a8 <irq_stack_union+424>

Apparently scsi_device_put() was called for a SCSI device that had
already been freed (memory poisoning was enabled in my test). This is
something I had not seen before.

Bart.
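A note on decoding this oops: with slab poisoning enabled, the kernel fills
freed objects with the POISON_FREE byte 0x6b, so RAX: 6b6b6b6b6b6b6b6b means
the first load in scsi_device_put() -- sdev->host, the first member of
struct scsi_device -- read from an already-freed object, and the faulting
"mov 0x1a8(%rax),%rax" is the host->hostt dereference through that poisoned
pointer. The sketch below is userspace C, not kernel code; the struct
layouts are illustrative stand-ins built around the 0x1a8 offset taken from
the gdb output above.

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <stdint.h>

struct host_like {
	char pad[0x1a8];		/* stand-in for the Scsi_Host fields before ->hostt */
	void *hostt;			/* lands at offset 0x1a8, as in the gdb output */
};

struct sdev_like {
	struct host_like *host;		/* first member, like scsi_device */
};

int main(void)
{
	struct sdev_like sdev;
	uintptr_t bits;

	/* Simulate what slab poisoning does to a freed object (0x6b fill). */
	memset(&sdev, 0x6b, sizeof(sdev));
	memcpy(&bits, &sdev.host, sizeof(bits));
	printf("sdev.host read from freed memory: %#lx\n",
	       (unsigned long)bits);	/* prints 0x6b6b6b6b6b6b6b6b on x86-64 */

	/* The gdb expression &((struct Scsi_Host *)0)->hostt is offsetof(). */
	printf("offsetof(struct host_like, hostt) = %#lx\n",
	       (unsigned long)offsetof(struct host_like, hostt));	/* 0x1a8 */
	return 0;
}

The same offsetof() idiom is what the gdb expression computes: it confirms
that the faulting instruction was reading ->hostt at offset 0x1a8 of a
pointer loaded from poisoned (freed) memory.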
On 01/27/2017 09:52 AM, Bart Van Assche wrote:
> On Fri, 2017-01-27 at 01:04 -0700, Jens Axboe wrote:
>> The previous patch had a bug if you didn't use a scheduler, here's a
>> version that should work fine in both cases. I've also updated the
>> above mentioned branch, so feel free to pull that as well and merge to
>> master like before.
>
> Booting time is back to normal with commit f3a8ab7d55bc merged with
> v4.10-rc5. That's a great improvement. However, running the srp-test
> software now triggers a new complaint:
>
> [ 215.600386] sd 11:0:0:0: [sdh] Attached SCSI disk
> [ 215.609485] sd 11:0:0:0: alua: port group 00 state A non-preferred supports TOlUSNA
> [ 215.722900] scsi 13:0:0:0: alua: Detached
> [ 215.724452] general protection fault: 0000 [#1] SMP
> [ 215.724484] Modules linked in: dm_service_time ib_srp scsi_transport_srp target_core_user uio target_core_pscsi target_core_file ib_srpt target_core_iblock target_core_mod brd netconsole xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat libcrc32c nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables af_packet ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm msr configfs ib_cm iw_cm mlx4_ib ib_core sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp ipmi_ssif coretemp kvm_intel hid_generic kvm usbhid irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel mlx4_core ghash_clmulni_intel iTCO_wdt dcdbas pcbc tg3
> [ 215.724629] iTCO_vendor_support ptp aesni_intel pps_core aes_x86_64 pcspkr crypto_simd libphy ipmi_si glue_helper cryptd ipmi_devintf tpm_tis devlink fjes ipmi_msghandler tpm_tis_core tpm mei_me lpc_ich mei mfd_core button shpchp wmi mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm sr_mod cdrom ehci_pci ehci_hcd usbcore usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
> [ 215.724719] CPU: 9 PID: 8043 Comm: multipathd Not tainted 4.10.0-rc5-dbg+ #1
> [ 215.724748] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
> [ 215.724775] task: ffff8801717998c0 task.stack: ffffc90002a9c000
> [ 215.724804] RIP: 0010:scsi_device_put+0xb/0x30
> [ 215.724829] RSP: 0018:ffffc90002a9faa0 EFLAGS: 00010246
> [ 215.724855] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88038bf85698 RCX: 0000000000000006
> [ 215.724880] RDX: 0000000000000006 RSI: ffff88017179a108 RDI: ffff88038bf85698
> [ 215.724906] RBP: ffffc90002a9faa8 R08: ffff880384786008 R09: 0000000100170007
> [ 215.724932] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88038bf85698
> [ 215.724958] R13: ffff88038919f090 R14: dead000000000100 R15: ffff88038a41dd28
> [ 215.724983] FS: 00007fbf8c6cf700(0000) GS:ffff88046f440000(0000) knlGS:0000000000000000
> [ 215.725010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 215.725035] CR2: 00007f1262ef3ee0 CR3: 000000044f6cc000 CR4: 00000000001406e0
> [ 215.725060] Call Trace:
> [ 215.725086] scsi_disk_put+0x2d/0x40
> [ 215.725110] sd_release+0x3d/0xb0
> [ 215.725137] __blkdev_put+0x29e/0x360
> [ 215.725163] blkdev_put+0x49/0x170
> [ 215.725192] dm_put_table_device+0x58/0xc0 [dm_mod]
> [ 215.725219] dm_put_device+0x70/0xc0 [dm_mod]
> [ 215.725269] free_priority_group+0x92/0xc0 [dm_multipath]
> [ 215.725295] free_multipath+0x70/0xc0 [dm_multipath]
> [ 215.725320] multipath_dtr+0x19/0x20 [dm_multipath]
> [ 215.725348] dm_table_destroy+0x67/0x120 [dm_mod]
> [ 215.725379] dev_suspend+0xde/0x240 [dm_mod]
> [ 215.725434] ctl_ioctl+0x1f5/0x520 [dm_mod]
> [ 215.725489] dm_ctl_ioctl+0xe/0x20 [dm_mod]
> [ 215.725515] do_vfs_ioctl+0x8f/0x700
> [ 215.725589] SyS_ioctl+0x3c/0x70
> [ 215.725614] entry_SYSCALL_64_fastpath+0x18/0xad
> [ 215.725641] RIP: 0033:0x7fbf8aca0667
> [ 215.725665] RSP: 002b:00007fbf8c6cd668 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [ 215.725692] RAX: ffffffffffffffda RBX: 0000000000000046 RCX: 00007fbf8aca0667
> [ 215.725716] RDX: 00007fbf8006b940 RSI: 00000000c138fd06 RDI: 0000000000000007
> [ 215.725743] RBP: 0000000000000009 R08: 00007fbf8c6cb3c0 R09: 00007fbf8b68d8d8
> [ 215.725768] R10: 0000000000000075 R11: 0000000000000246 R12: 00007fbf8c6cd770
> [ 215.725793] R13: 0000000000000013 R14: 00000000006168f0 R15: 0000000000f74780
> [ 215.725820] Code: bc 24 b8 00 00 00 e8 55 c8 1c 00 48 83 c4 08 48 89 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 00 55 48 89 e5 53 48 8b 07 48 89 fb <48> 8b 80 a8 01 00 00 48 8b 38 e8 f6 68 c5 ff 48 8d bb 38 02 00
> [ 215.725903] RIP: scsi_device_put+0xb/0x30 RSP: ffffc90002a9faa0
>
> (gdb) list *(scsi_device_put+0xb)
> 0xffffffff8149fc2b is in scsi_device_put (drivers/scsi/scsi.c:957).
> 952	 * count of the underlying LLDD module. The device is freed once the last
> 953	 * user vanishes.
> 954	 */
> 955	void scsi_device_put(struct scsi_device *sdev)
> 956	{
> 957		module_put(sdev->host->hostt->module);
> 958		put_device(&sdev->sdev_gendev);
> 959	}
> 960	EXPORT_SYMBOL(scsi_device_put);
> 961
> (gdb) disas scsi_device_put
> Dump of assembler code for function scsi_device_put:
>    0xffffffff8149fc20 <+0>:	push %rbp
>    0xffffffff8149fc21 <+1>:	mov %rsp,%rbp
>    0xffffffff8149fc24 <+4>:	push %rbx
>    0xffffffff8149fc25 <+5>:	mov (%rdi),%rax
>    0xffffffff8149fc28 <+8>:	mov %rdi,%rbx
>    0xffffffff8149fc2b <+11>:	mov 0x1a8(%rax),%rax
>    0xffffffff8149fc32 <+18>:	mov (%rax),%rdi
>    0xffffffff8149fc35 <+21>:	callq 0xffffffff810f6530 <module_put>
>    0xffffffff8149fc3a <+26>:	lea 0x238(%rbx),%rdi
>    0xffffffff8149fc41 <+33>:	callq 0xffffffff814714b0 <put_device>
>    0xffffffff8149fc46 <+38>:	pop %rbx
>    0xffffffff8149fc47 <+39>:	pop %rbp
>    0xffffffff8149fc48 <+40>:	retq
> End of assembler dump.
> (gdb) print &((struct Scsi_Host *)0)->hostt
> $2 = (struct scsi_host_template **) 0x1a8 <irq_stack_union+424>
>
> Apparently scsi_device_put() was called for a SCSI device that had
> already been freed (memory poisoning was enabled in my test). This is
> something I had not seen before.

I have no idea what this is; I haven't messed with the lifetime of
devices or queues at all in that branch.
On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
> I have no idea what this is; I haven't messed with the lifetime of
> devices or queues at all in that branch.

The srp-test software passes with kernel v4.9, so something must have
changed. I'll see whether I can find some time to look further into this.

Bart.
On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
> On 01/27/2017 09:52 AM, Bart Van Assche wrote:
>> [ 215.724452] general protection fault: 0000 [#1] SMP
>> [ 215.725060] Call Trace:
>> [ 215.725086] scsi_disk_put+0x2d/0x40
>> [ 215.725110] sd_release+0x3d/0xb0
>> [ 215.725137] __blkdev_put+0x29e/0x360
>> [ 215.725163] blkdev_put+0x49/0x170
>> [ 215.725192] dm_put_table_device+0x58/0xc0 [dm_mod]
>> [ 215.725219] dm_put_device+0x70/0xc0 [dm_mod]
>> [ 215.725269] free_priority_group+0x92/0xc0 [dm_multipath]
>> [ 215.725295] free_multipath+0x70/0xc0 [dm_multipath]
>> [ 215.725320] multipath_dtr+0x19/0x20 [dm_multipath]
>> [ 215.725348] dm_table_destroy+0x67/0x120 [dm_mod]
>> [ 215.725379] dev_suspend+0xde/0x240 [dm_mod]
>> [ 215.725434] ctl_ioctl+0x1f5/0x520 [dm_mod]
>> [ 215.725489] dm_ctl_ioctl+0xe/0x20 [dm_mod]
>> [ 215.725515] do_vfs_ioctl+0x8f/0x700
>> [ 215.725589] SyS_ioctl+0x3c/0x70
>> [ 215.725614] entry_SYSCALL_64_fastpath+0x18/0xad
>
> I have no idea what this is; I haven't messed with the lifetime of
> devices or queues at all in that branch.

Hello Jens,

Running the srp-test software against kernel 4.9.6 and kernel 4.10-rc5
went fine. With your for-4.11/block branch (commit 400f73b23f457a),
however, I just ran into the following:

[ 214.555527] ------------[ cut here ]------------
[ 214.555565] WARNING: CPU: 5 PID: 13201 at kernel/locking/lockdep.c:3514 lock_release+0x346/0x480
[ 214.555588] DEBUG_LOCKS_WARN_ON(depth <= 0)
[ 214.555824] CPU: 5 PID: 13201 Comm: fio Not tainted 4.10.0-rc3-dbg+ #1
[ 214.555846] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
[ 214.555867] Call Trace:
[ 214.555889] dump_stack+0x68/0x93
[ 214.555911] __warn+0xc6/0xe0
[ 214.555953] warn_slowpath_fmt+0x4a/0x50
[ 214.555973] lock_release+0x346/0x480
[ 214.556021] aio_write+0x106/0x140
[ 214.556067] do_io_submit+0x37d/0x900
[ 214.556108] SyS_io_submit+0xb/0x10
[ 214.556131] entry_SYSCALL_64_fastpath+0x18/0xad

I will continue to try to figure out what is causing this behavior.

Bart.
> On Jan 30, 2017, at 5:12 PM, Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:
>
>> On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
>>> On 01/27/2017 09:52 AM, Bart Van Assche wrote:
>>> [ 215.724452] general protection fault: 0000 [#1] SMP
>>> [ 215.725060] Call Trace:
>>> [ 215.725086] scsi_disk_put+0x2d/0x40
>>> [ 215.725110] sd_release+0x3d/0xb0
>>> [ 215.725137] __blkdev_put+0x29e/0x360
>>> [ 215.725163] blkdev_put+0x49/0x170
>>> [ 215.725192] dm_put_table_device+0x58/0xc0 [dm_mod]
>>> [ 215.725219] dm_put_device+0x70/0xc0 [dm_mod]
>>> [ 215.725269] free_priority_group+0x92/0xc0 [dm_multipath]
>>> [ 215.725295] free_multipath+0x70/0xc0 [dm_multipath]
>>> [ 215.725320] multipath_dtr+0x19/0x20 [dm_multipath]
>>> [ 215.725348] dm_table_destroy+0x67/0x120 [dm_mod]
>>> [ 215.725379] dev_suspend+0xde/0x240 [dm_mod]
>>> [ 215.725434] ctl_ioctl+0x1f5/0x520 [dm_mod]
>>> [ 215.725489] dm_ctl_ioctl+0xe/0x20 [dm_mod]
>>> [ 215.725515] do_vfs_ioctl+0x8f/0x700
>>> [ 215.725589] SyS_ioctl+0x3c/0x70
>>> [ 215.725614] entry_SYSCALL_64_fastpath+0x18/0xad
>>>
>>
>> I have no idea what this is; I haven't messed with the lifetime of
>> devices or queues at all in that branch.
>
> Hello Jens,
>
> Running the srp-test software against kernel 4.9.6 and kernel 4.10-rc5
> went fine. With your for-4.11/block branch (commit 400f73b23f457a),
> however, I just ran into the following:
>
> [ 214.555527] ------------[ cut here ]------------
> [ 214.555565] WARNING: CPU: 5 PID: 13201 at kernel/locking/lockdep.c:3514 lock_release+0x346/0x480
> [ 214.555588] DEBUG_LOCKS_WARN_ON(depth <= 0)
> [ 214.555824] CPU: 5 PID: 13201 Comm: fio Not tainted 4.10.0-rc3-dbg+ #1
> [ 214.555846] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
> [ 214.555867] Call Trace:
> [ 214.555889] dump_stack+0x68/0x93
> [ 214.555911] __warn+0xc6/0xe0
> [ 214.555953] warn_slowpath_fmt+0x4a/0x50
> [ 214.555973] lock_release+0x346/0x480
> [ 214.556021] aio_write+0x106/0x140
> [ 214.556067] do_io_submit+0x37d/0x900
> [ 214.556108] SyS_io_submit+0xb/0x10
> [ 214.556131] entry_SYSCALL_64_fastpath+0x18/0xad
>
> I will continue to try to figure out what is causing this behavior.

That's a known bug in mainline. Pull it into 4.10-rc6,
or use my for-next where everything is already merged.
On 01/30/2017 05:38 PM, Jens Axboe wrote:
>
>> On Jan 30, 2017, at 5:12 PM, Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:
>>
>>> On Fri, 2017-01-27 at 09:56 -0700, Jens Axboe wrote:
>>>> On 01/27/2017 09:52 AM, Bart Van Assche wrote:
>>>> [ 215.724452] general protection fault: 0000 [#1] SMP
>>>> [ 215.725060] Call Trace:
>>>> [ 215.725086] scsi_disk_put+0x2d/0x40
>>>> [ 215.725110] sd_release+0x3d/0xb0
>>>> [ 215.725137] __blkdev_put+0x29e/0x360
>>>> [ 215.725163] blkdev_put+0x49/0x170
>>>> [ 215.725192] dm_put_table_device+0x58/0xc0 [dm_mod]
>>>> [ 215.725219] dm_put_device+0x70/0xc0 [dm_mod]
>>>> [ 215.725269] free_priority_group+0x92/0xc0 [dm_multipath]
>>>> [ 215.725295] free_multipath+0x70/0xc0 [dm_multipath]
>>>> [ 215.725320] multipath_dtr+0x19/0x20 [dm_multipath]
>>>> [ 215.725348] dm_table_destroy+0x67/0x120 [dm_mod]
>>>> [ 215.725379] dev_suspend+0xde/0x240 [dm_mod]
>>>> [ 215.725434] ctl_ioctl+0x1f5/0x520 [dm_mod]
>>>> [ 215.725489] dm_ctl_ioctl+0xe/0x20 [dm_mod]
>>>> [ 215.725515] do_vfs_ioctl+0x8f/0x700
>>>> [ 215.725589] SyS_ioctl+0x3c/0x70
>>>> [ 215.725614] entry_SYSCALL_64_fastpath+0x18/0xad
>>>>
>>>
>>> I have no idea what this is; I haven't messed with the lifetime of
>>> devices or queues at all in that branch.
>>
>> Hello Jens,
>>
>> Running the srp-test software against kernel 4.9.6 and kernel 4.10-rc5
>> went fine. With your for-4.11/block branch (commit 400f73b23f457a),
>> however, I just ran into the following:
>>
>> [ 214.555527] ------------[ cut here ]------------
>> [ 214.555565] WARNING: CPU: 5 PID: 13201 at kernel/locking/lockdep.c:3514 lock_release+0x346/0x480
>> [ 214.555588] DEBUG_LOCKS_WARN_ON(depth <= 0)
>> [ 214.555824] CPU: 5 PID: 13201 Comm: fio Not tainted 4.10.0-rc3-dbg+ #1
>> [ 214.555846] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
>> [ 214.555867] Call Trace:
>> [ 214.555889] dump_stack+0x68/0x93
>> [ 214.555911] __warn+0xc6/0xe0
>> [ 214.555953] warn_slowpath_fmt+0x4a/0x50
>> [ 214.555973] lock_release+0x346/0x480
>> [ 214.556021] aio_write+0x106/0x140
>> [ 214.556067] do_io_submit+0x37d/0x900
>> [ 214.556108] SyS_io_submit+0xb/0x10
>> [ 214.556131] entry_SYSCALL_64_fastpath+0x18/0xad
>>
>> I will continue to try to figure out what is causing this behavior.
>
> That's a known bug in mainline. Pull it into 4.10-rc6,
> or use my for-next where everything is already merged.

Since I'm not on the phone anymore, here is the commit that was merged
after my for-4.11/block branch was forked and which fixes this issue:

commit a12f1ae61c489076a9aeb90bddca7722bf330df3
Author: Shaohua Li <shli@fb.com>
Date:   Tue Dec 13 12:09:56 2016 -0800

    aio: fix lock dep warning

So you can just pull that in, if you want, or do what I suggested above.
On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
> That's a known bug in mainline. Pull it into 4.10-rc6,
> or use my for-next where everything is already merged.

Hello Jens,

With your for-next branch (commit c2e60b3a2602) I haven't hit any block
layer crashes so far. The only new issue I have encountered is a memory
leak triggered by the SG-IO code. These memory leak reports started to
appear after I started testing the mq-deadline scheduler. kmemleak
reported the following call stack multiple times after my tests had
finished:

unreferenced object 0xffff88041119e528 (size 192):
  comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
    [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
    [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
    [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
    [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
    [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
    [<ffffffff81336694>] sg_io+0x3d4/0x410
    [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
    [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
    [<ffffffff814b4360>] sd_ioctl+0x80/0x100
    [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
    [<ffffffff8122f388>] block_ioctl+0x38/0x40
    [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
    [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
    [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad

Bart.
On Tue, 2017-01-31 at 13:34 -0800, Bart Van Assche wrote:
> On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
>> That's a known bug in mainline. Pull it into 4.10-rc6,
>> or use my for-next where everything is already merged.
>
> Hello Jens,
>
> With your for-next branch (commit c2e60b3a2602) I haven't hit any block
> layer crashes so far. The only new issue I have encountered is a memory
> leak triggered by the SG-IO code. These memory leak reports started to
> appear after I started testing the mq-deadline scheduler. kmemleak
> reported the following call stack multiple times after my tests had
> finished:
>
> unreferenced object 0xffff88041119e528 (size 192):
>   comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
>   hex dump (first 32 bytes):
>     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>     00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
>   backtrace:
>     [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
>     [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
>     [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
>     [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
>     [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
>     [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
>     [<ffffffff81336694>] sg_io+0x3d4/0x410
>     [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
>     [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
>     [<ffffffff814b4360>] sd_ioctl+0x80/0x100
>     [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
>     [<ffffffff8122f388>] block_ioctl+0x38/0x40
>     [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
>     [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
>     [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad

After I repeated my test the above findings were confirmed: no memory
leaks were reported by kmemleak after a test with I/O scheduler "none",
and the above call stack was reported 44 times by kmemleak after a test
with I/O scheduler "mq-deadline".

Bart.
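The backtrace points at the bio allocated by bio_map_user_iov() on behalf
of an SG_IO pass-through command, and the comm "multipathd" fits: the
multipathd path checkers periodically issue such commands against every
path device. For readers who want to exercise the same kernel path
(sd_ioctl -> scsi_cmd_blk_ioctl -> sg_io -> blk_rq_map_user), here is a
minimal userspace sketch; the device path is an assumption, so point it at
a disposable test disk.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY */
	unsigned char resp[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sdh", O_RDONLY);	/* assumed device, adjust */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxferp = resp;			/* this user buffer is what */
	hdr.dxfer_len = sizeof(resp);		/* blk_rq_map_user() maps */
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0)		/* enters sg_io() in the kernel */
		perror("SG_IO");
	else
		printf("INQUIRY ok, vendor: %.8s\n", resp + 8);
	close(fd);
	return 0;
}

Running something like this in a loop on a kmemleak-enabled kernel, then
"echo scan > /sys/kernel/debug/kmemleak" followed by
"cat /sys/kernel/debug/kmemleak", should show whether the bio from this
path is being leaked under a given I/O scheduler.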
On 01/31/2017 01:55 PM, Bart Van Assche wrote:
> On Tue, 2017-01-31 at 13:34 -0800, Bart Van Assche wrote:
>> On Mon, 2017-01-30 at 17:38 -0800, Jens Axboe wrote:
>>> That's a known bug in mainline. Pull it into 4.10-rc6,
>>> or use my for-next where everything is already merged.
>>
>> Hello Jens,
>>
>> With your for-next branch (commit c2e60b3a2602) I haven't hit any block
>> layer crashes so far. The only new issue I have encountered is a memory
>> leak triggered by the SG-IO code. These memory leak reports started to
>> appear after I started testing the mq-deadline scheduler. kmemleak
>> reported the following call stack multiple times after my tests had
>> finished:
>>
>> unreferenced object 0xffff88041119e528 (size 192):
>>   comm "multipathd", pid 2353, jiffies 4295128020 (age 1332.440s)
>>   hex dump (first 32 bytes):
>>     00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
>>     00 00 00 00 00 00 00 00 12 01 00 00 00 00 00 00  ................
>>   backtrace:
>>     [<ffffffff8165e3b5>] kmemleak_alloc+0x45/0xa0
>>     [<ffffffff811cc23d>] __kmalloc+0x15d/0x2f0
>>     [<ffffffff81310e35>] bio_alloc_bioset+0x185/0x1f0
>>     [<ffffffff813117f4>] bio_map_user_iov+0x124/0x400
>>     [<ffffffff81320b7a>] blk_rq_map_user_iov+0x11a/0x210
>>     [<ffffffff81320cbd>] blk_rq_map_user+0x4d/0x60
>>     [<ffffffff81336694>] sg_io+0x3d4/0x410
>>     [<ffffffff813369d0>] scsi_cmd_ioctl+0x300/0x490
>>     [<ffffffff81336b9d>] scsi_cmd_blk_ioctl+0x3d/0x50
>>     [<ffffffff814b4360>] sd_ioctl+0x80/0x100
>>     [<ffffffff8132ddde>] blkdev_ioctl+0x51e/0x9f0
>>     [<ffffffff8122f388>] block_ioctl+0x38/0x40
>>     [<ffffffff8120097f>] do_vfs_ioctl+0x8f/0x700
>>     [<ffffffff8120102c>] SyS_ioctl+0x3c/0x70
>>     [<ffffffff8166c4aa>] entry_SYSCALL_64_fastpath+0x18/0xad
>
> After I repeated my test the above findings were confirmed: no memory
> leaks were reported by kmemleak after a test with I/O scheduler "none",
> and the above call stack was reported 44 times by kmemleak after a test
> with I/O scheduler "mq-deadline".

Interesting, I'll check this. It doesn't make sense that the scheduler
would be implicated in that, given how we run completions now. But if
kmemleak complains, then something must be up.
diff --git a/block/blk-core.c b/block/blk-core.c
index a61f1407f4f6..78daf5b6d7cb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2129,7 +2129,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	if (q->mq_ops) {
 		if (blk_queue_io_stat(q))
 			blk_account_io_start(rq, true);
-		blk_mq_sched_insert_request(rq, false, true, false);
+		blk_mq_sched_insert_request(rq, false, true, false, false);
 		return 0;
 	}
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 86656fdfa637..ed1f10165268 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -66,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_sched_insert_request(rq, at_head, true, false);
+		blk_mq_sched_insert_request(rq, at_head, true, false, false);
 		return;
 	}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d7de34ee39c2..4427896641ac 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -456,7 +456,7 @@ void blk_insert_flush(struct request *rq)
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 		if (q->mq_ops)
-			blk_mq_sched_insert_request(rq, false, true, false);
+			blk_mq_sched_insert_request(rq, false, true, false, false);
 		else
 			list_add_tail(&rq->queuelist, &q->queue_head);
 		return;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c27613de80c5..5e91743e193a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -336,6 +336,64 @@ void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+/*
+ * Add flush/fua to the queue. If we fail getting a driver tag, then
+ * punt to the requeue list. Requeue will re-invoke us from a context
+ * that's safe to block from.
+ */
+static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
+				      struct request *rq, bool can_block)
+{
+	if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
+		blk_insert_flush(rq);
+		blk_mq_run_hw_queue(hctx, true);
+	} else
+		blk_mq_add_to_requeue_list(rq, true, true);
+}
+
+void blk_mq_sched_insert_request(struct request *rq, bool at_head,
+				 bool run_queue, bool async, bool can_block)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	if (rq->tag == -1 && (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))) {
+		blk_mq_sched_insert_flush(hctx, rq, can_block);
+		return;
+	}
+
+	if (e && e->type->ops.mq.insert_requests) {
+		LIST_HEAD(list);
+
+		list_add(&rq->queuelist, &list);
+		e->type->ops.mq.insert_requests(hctx, &list, at_head);
+	} else {
+		spin_lock(&ctx->lock);
+		__blk_mq_insert_request(hctx, rq, at_head);
+		spin_unlock(&ctx->lock);
+	}
+
+	if (run_queue)
+		blk_mq_run_hw_queue(hctx, async);
+}
+
+void blk_mq_sched_insert_requests(struct request_queue *q,
+				  struct blk_mq_ctx *ctx,
+				  struct list_head *list, bool run_queue_async)
+{
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.insert_requests)
+		e->type->ops.mq.insert_requests(hctx, list, false);
+	else
+		blk_mq_insert_requests(hctx, ctx, list);
+
+	blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 				   struct blk_mq_hw_ctx *hctx,
 				   unsigned int hctx_idx)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index becbc7840364..9478aaeb48c5 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -21,6 +21,12 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
 void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
 
+void blk_mq_sched_insert_request(struct request *rq, bool at_head,
+				 bool run_queue, bool async, bool can_block);
+void blk_mq_sched_insert_requests(struct request_queue *q,
+				  struct blk_mq_ctx *ctx,
+				  struct list_head *list, bool run_queue_async);
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
 				   struct list_head *rq_list,
@@ -62,45 +68,6 @@ static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
 		e->type->ops.mq.put_rq_priv(q, rq);
 }
 
-static inline void
-blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
-			    bool async)
-{
-	struct request_queue *q = rq->q;
-	struct elevator_queue *e = q->elevator;
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	if (e && e->type->ops.mq.insert_requests) {
-		LIST_HEAD(list);
-
-		list_add(&rq->queuelist, &list);
-		e->type->ops.mq.insert_requests(hctx, &list, at_head);
-	} else {
-		spin_lock(&ctx->lock);
-		__blk_mq_insert_request(hctx, rq, at_head);
-		spin_unlock(&ctx->lock);
-	}
-
-	if (run_queue)
-		blk_mq_run_hw_queue(hctx, async);
-}
-
-static inline void
-blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
-			     struct list_head *list, bool run_queue_async)
-{
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-	struct elevator_queue *e = hctx->queue->elevator;
-
-	if (e && e->type->ops.mq.insert_requests)
-		e->type->ops.mq.insert_requests(hctx, list, false);
-	else
-		blk_mq_insert_requests(hctx, ctx, list);
-
-	blk_mq_run_hw_queue(hctx, run_queue_async);
-}
-
 static inline bool
 blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
 			 struct bio *bio)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 1b156ca79af6..78bbacd129c9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -106,6 +106,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	struct sbq_wait_state *ws;
 	DEFINE_WAIT(wait);
 	unsigned int tag_offset;
+	bool drop_ctx;
 	int tag;
 
 	if (data->flags & BLK_MQ_REQ_RESERVED) {
@@ -128,6 +129,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		return BLK_MQ_TAG_FAIL;
 
 	ws = bt_wait_ptr(bt, data->hctx);
+	drop_ctx = data->ctx == NULL;
 	do {
 		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
@@ -150,7 +152,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		blk_mq_put_ctx(data->ctx);
+		if (data->ctx)
+			blk_mq_put_ctx(data->ctx);
 
 		io_schedule();
 
@@ -166,6 +169,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
 
+	if (drop_ctx && data->ctx)
+		blk_mq_put_ctx(data->ctx);
+
 	finish_wait(&ws->wait, &wait);
 
 found_tag:
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4df397910251..888868b62018 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -568,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		blk_mq_sched_insert_request(rq, true, false, false);
+		blk_mq_sched_insert_request(rq, true, false, false, true);
 	}
 
 	while (!list_empty(&rq_list)) {
 		rq = list_entry(rq_list.next, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_sched_insert_request(rq, false, false, false);
+		blk_mq_sched_insert_request(rq, false, false, false, true);
 	}
 
 	blk_mq_run_hw_queues(q, false);
@@ -847,12 +847,11 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
-static bool blk_mq_get_driver_tag(struct request *rq,
-				  struct blk_mq_hw_ctx **hctx, bool wait)
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+			   bool wait)
 {
 	struct blk_mq_alloc_data data = {
 		.q = rq->q,
-		.ctx = rq->mq_ctx,
 		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
 		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
 	};
@@ -1395,7 +1394,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 	}
 
 insert:
-	blk_mq_sched_insert_request(rq, false, true, true);
+	blk_mq_sched_insert_request(rq, false, true, true, false);
 }
 
 /*
@@ -1446,10 +1445,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	cookie = request_to_qc_t(data.hctx, rq);
 
 	if (unlikely(is_flush_fua)) {
+		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_get_driver_tag(rq, NULL, true);
 		blk_insert_flush(rq);
-		goto run_queue;
+		blk_mq_run_hw_queue(data.hctx, true);
+		goto done;
 	}
 
 	plug = current->plug;
@@ -1502,7 +1503,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_sched_insert_request(rq, false, true,
-						!is_sync || is_flush_fua);
+						!is_sync || is_flush_fua, true);
 		goto done;
 	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1512,7 +1513,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		 * latter allows for merging opportunities and more efficient
 		 * dispatching.
 		 */
-run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
 	blk_mq_put_ctx(data.ctx);
@@ -1568,10 +1568,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	cookie = request_to_qc_t(data.hctx, rq);
 
 	if (unlikely(is_flush_fua)) {
+		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_get_driver_tag(rq, NULL, true);
 		blk_insert_flush(rq);
-		goto run_queue;
+		blk_mq_run_hw_queue(data.hctx, true);
+		goto done;
 	}
 
 	/*
@@ -1612,7 +1614,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_sched_insert_request(rq, false, true,
-						!is_sync || is_flush_fua);
+						!is_sync || is_flush_fua, true);
 		goto done;
 	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1622,7 +1624,6 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		 * latter allows for merging opportunities and more efficient
 		 * dispatching.
 		 */
-run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d19b0e75a129..d34929968071 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -34,6 +34,8 @@ void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+			   bool wait);
 
 /*
  * Internal helpers for allocating/freeing the request map
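To summarize the shape of the patch above: blk_mq_sched_insert_request()
gains a can_block flag, and a flush/FUA request that arrives without a
driver tag goes through blk_mq_sched_insert_flush(), which tries to get a
tag and, on failure, punts the request to the requeue list so that the
requeue work can retry from a context where blocking is safe (the requeue
paths pass can_block == true; the make_request and direct-issue paths pass
false). Below is a rough userspace model of that try-then-punt pattern,
not kernel code: a semaphore stands in for the driver tag space and a
worker thread stands in for blk_mq_requeue_work().

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t driver_tags;		/* stand-in for the driver tag space */

static void issue(int id)
{
	printf("issuing request %d\n", id);
}

/* Stand-in for blk_mq_requeue_work(): runs where blocking is allowed. */
static void *requeue_worker(void *arg)
{
	int id = *(int *)arg;

	sem_wait(&driver_tags);		/* can_block == true: may sleep */
	issue(id);
	return NULL;
}

/* Stand-in for blk_mq_sched_insert_flush() with can_block == false. */
static int insert_flush(int id, pthread_t *worker, int *arg)
{
	if (sem_trywait(&driver_tags) == 0) {
		issue(id);		/* got a tag without blocking */
		return 0;
	}
	*arg = id;			/* no tag: punt to the "requeue list" */
	pthread_create(worker, NULL, requeue_worker, arg);
	return 1;
}

int main(void)
{
	pthread_t worker;
	int arg, punted;

	sem_init(&driver_tags, 0, 0);	/* all tags currently in flight */
	punted = insert_flush(7, &worker, &arg);

	sem_post(&driver_tags);		/* a completion frees a tag */
	if (punted)
		pthread_join(worker, NULL);
	return 0;
}

Compile with cc -pthread; the only point is the split between a
non-blocking fast path and a deferred context that is allowed to sleep.
The kernel details (tag allocation, requeue list handling, hctx mapping)
are of course richer than this model.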