Message ID | 20230717040058.3993930-5-chengming.zhou@linux.dev (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | blk-mq: optimize flush and request size | expand |
Hi, some of our (Proxmox VE) users have been reporting crashes [1] with NULL pointer dereferences in blk_flush_complete_seq. AFAICT, all systems crashing in blk_flush_complete_seq used software RAID1, and ran our downstream kernel based on (Ubuntu) kernel 6.8. On mainline kernel 6.9, I can reproduce the crash [1] by: * compiling the kernel with CONFIG_FAIL_MAKE_REQUEST enabled * setting up root on software RAID1 * enabling the write-intent bitmap with `mdadm --grow --bitmap=internal /dev/md0`. So far, I have not been able to reproduce the crash with the write-intent bitmap disabled. * injecting write faults via CONFIG_FAIL_MAKE_REQUEST on one of the RAID1 disks * generating IO in a KVM guest with fio * waiting ~5 minutes I've used this reproducer for a bisect, which produced 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine) as the first commit with which I can reproduce the crashes. I'm not 100% sure it is this one because the reproducer is a bit flaky. But it does sound plausible, as the commit is included in our 6.8 kernel, and touches `queuelist` which is AFAICT where blk_flush_complete_seq dereferences the NULL pointer. Does anyone have an idea what could be the cause for the crash, or how to further debug this? Happy to provide more information if needed, as well as the complete reproducer (I'd need to clean it up a little bit first). Do you think this could also affect setups without software RAID? Best, Friedrich [1] [ 132.292488] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 132.293594] #PF: supervisor write access in kernel mode [ 132.294408] #PF: error_code(0x0002) - not-present page [ 132.295187] PGD 0 P4D 0 [ 132.295609] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 132.296285] CPU: 0 PID: 776 Comm: kvm Tainted: G E 6.5.0-rc2-bisect11+ #22 [ 132.297556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 132.299292] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0 [ 132.300122] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b [ 132.302946] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046 [ 132.303765] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX: ffff92d00c2fd048 [ 132.304856] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI: 0000000000000000 [ 132.305948] RBP: ffffa431407a39f8 R08: 0000000000000000 R09: 0000000000000000 [ 132.307043] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000801 [ 132.308120] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15: ffff92d00ba09e00 [ 132.309216] FS: 0000727d191b86c0(0000) GS:ffff92d137c00000(0000) knlGS:0000000000000000 [ 132.310457] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 132.311355] CR2: 0000000000000008 CR3: 000000018f088006 CR4: 0000000000372ef0 [ 132.312444] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 132.313535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 132.314633] Call Trace: [ 132.315016] <TASK> [ 132.315381] ? show_regs+0x6d/0x80 [ 132.315921] ? __die+0x24/0x80 [ 132.316421] ? page_fault_oops+0x176/0x500 [ 132.317049] ? do_user_addr_fault+0x31d/0x6a0 [ 132.317744] ? exc_page_fault+0x83/0x1b0 [ 132.318368] ? asm_exc_page_fault+0x27/0x30 [ 132.319008] ? blk_flush_complete_seq+0x296/0x2e0 [ 132.319759] ? __blk_mq_alloc_requests+0x383/0x3a0 [ 132.320505] ? wbt_wait+0xb3/0x100 [ 132.321041] blk_insert_flush+0xd1/0x220 [ 132.321664] blk_mq_submit_bio+0x564/0x690 [ 132.322294] __submit_bio+0xb3/0x1c0 [ 132.322859] submit_bio_noacct_nocheck+0x2b7/0x390 [ 132.323622] submit_bio_noacct+0x18a/0x6b0 [ 132.324241] submit_bio+0x6c/0x80 [ 132.324774] md_super_write+0xd1/0x120 [ 132.325376] write_page+0x23b/0x3f0 [ 132.325915] ? md_bitmap_wait_writes+0xda/0xf0 [ 132.326624] md_bitmap_unplug+0x9f/0x140 [ 132.327251] flush_bio_list+0x102/0x110 [raid1] [ 132.327963] raid1_unplug+0x3c/0xe0 [raid1] [ 132.328626] __blk_flush_plug+0xc1/0x130 [ 132.329238] blk_finish_plug+0x31/0x50 [ 132.329819] io_submit_sqes+0x53e/0x680 [ 132.330426] __do_sys_io_uring_enter+0x59a/0xc10 [ 132.331149] ? vfs_read+0x20a/0x360 [ 132.331699] __x64_sys_io_uring_enter+0x22/0x40 [ 132.332414] do_syscall_64+0x58/0x90 [ 132.332979] ? ksys_read+0xe6/0x100 [ 132.333530] ? exit_to_user_mode_prepare+0x49/0x220 [ 132.334292] ? syscall_exit_to_user_mode+0x1b/0x50 [ 132.335032] ? do_syscall_64+0x67/0x90 [ 132.335627] ? syscall_exit_to_user_mode+0x1b/0x50 [ 132.336381] ? do_syscall_64+0x67/0x90 [ 132.336958] ? syscall_exit_to_user_mode+0x1b/0x50 [ 132.337707] ? do_syscall_64+0x67/0x90 [ 132.338296] ? syscall_exit_to_user_mode+0x1b/0x50 [ 132.339045] ? do_syscall_64+0x67/0x90 [ 132.339646] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 132.340426] RIP: 0033:0x727d26282b95 [ 132.341011] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 [ 132.343837] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa [ 132.344990] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX: 0000727d26282b95 [ 132.346079] RDX: 0000000000000000 RSI: 0000000000000003 RDI: 0000000000000033 [ 132.347176] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09: 0000000000000008 [ 132.348272] R10: 0000000000000000 R11: 0000000000000246 R12: 0000727d0c003ac0 [ 132.349376] R13: 0000000000000000 R14: 00005a5a2313bc68 R15: 00005a5a25cf0450 [ 132.350472] </TASK> [ 132.350828] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E) kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E) cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E) uhci_hcd(E) ttm(E) [ 132.350919] ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) [ 132.365204] CR2: 0000000000000008 [ 132.365747] ---[ end trace 0000000000000000 ]--- [ 132.366468] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0 [ 132.367290] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b [ 132.370119] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046 [ 132.370935] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX: ffff92d00c2fd048 [ 132.372015] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI: 0000000000000000 [ 132.373115] RBP: ffffa431407a39f8 R08: 0000000000000000 R09: 0000000000000000 [ 132.374193] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000801 [ 132.375286] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15: ffff92d00ba09e00 [ 132.376385] FS: 0000727d191b86c0(0000) GS:ffff92d137c00000(0000) knlGS:0000000000000000 [ 132.377600] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 132.378486] CR2: 0000000000000008 CR3: 000000018f088006 CR4: 0000000000372ef0 [ 132.379564] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 132.380638] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 132.381738] note: kvm[776] exited with irqs disabled [ 132.382631] note: kvm[776] exited with preempt_count 1 [ 132.383469] ------------[ cut here ]------------ [ 132.384193] WARNING: CPU: 0 PID: 776 at kernel/exit.c:818 do_exit+0x8db/0xae0 [ 132.385307] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E) kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E) cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E) uhci_hcd(E) ttm(E) [ 132.385396] ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) [ 132.399854] CPU: 0 PID: 776 Comm: kvm Tainted: G D E 6.5.0-rc2-bisect11+ #22 [ 132.401165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 132.402987] RIP: 0010:do_exit+0x8db/0xae0 [ 132.403661] Code: e9 44 f8 ff ff 48 8b bb d8 09 00 00 31 f6 e8 fc d7 ff ff e9 f0 fd ff ff 4c 89 ee bf 05 06 00 00 e8 4a 2e 01 00 e9 70 f8 ff ff <0f> 0b e9 9e f7 ff ff 0f 0b e9 57 f7 ff ff 48 89 df e8 bf 7d 13 00 [ 132.406622] RSP: 0018:ffffa431407a3ec8 EFLAGS: 00010282 [ 132.407487] RAX: 0000000000000000 RBX: ffff92d08d53a940 RCX: 0000000000000000 [ 132.408637] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 132.409791] RBP: ffffa431407a3f20 R08: 0000000000000000 R09: 0000000000000000 [ 132.410961] R10: 0000000000000000 R11: 0000000000000000 R12: ffff92d0031cba80 [ 132.412102] R13: 0000000000000009 R14: ffff92d090d53180 R15: 0000000000000000 [ 132.413256] FS: 0000727d191b86c0(0000) GS:ffff92d137c00000(0000) knlGS:0000000000000000 [ 132.414582] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 132.415512] CR2: 0000000000000008 CR3: 000000018f088006 CR4: 0000000000372ef0 [ 132.416670] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 132.417842] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 132.418998] Call Trace: [ 132.419430] <TASK> [ 132.419788] ? show_regs+0x6d/0x80 [ 132.420352] ? __warn+0x89/0x160 [ 132.420891] ? do_exit+0x8db/0xae0 [ 132.421477] ? report_bug+0x17e/0x1b0 [ 132.422063] ? handle_bug+0x46/0x90 [ 132.422655] ? exc_invalid_op+0x18/0x80 [ 132.423299] ? asm_exc_invalid_op+0x1b/0x20 [ 132.423975] ? do_exit+0x8db/0xae0 [ 132.424565] ? do_exit+0x72/0xae0 [ 132.425131] ? _printk+0x60/0x90 [ 132.425666] make_task_dead+0x86/0x180 [ 132.426297] rewind_stack_and_make_dead+0x17/0x20 [ 132.427056] RIP: 0033:0x727d26282b95 [ 132.427681] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 [ 132.430663] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa [ 132.431895] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX: 0000727d26282b95 [ 132.433049] RDX: 0000000000000000 RSI: 0000000000000003 RDI: 0000000000000033 [ 132.434203] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09: 0000000000000008 [ 132.435372] R10: 0000000000000000 R11: 0000000000000246 R12: 0000727d0c003ac0 [ 132.436529] R13: 0000000000000000 R14: 00005a5a2313bc68 R15: 00005a5a25cf0450 [ 132.437685] </TASK> [ 132.438056] ---[ end trace 0000000000000000 ]---
Hi, On 2024/5/25 00:07, Friedrich Weber wrote: > Hi, > > some of our (Proxmox VE) users have been reporting crashes [1] with NULL > pointer dereferences in blk_flush_complete_seq. AFAICT, all systems > crashing in blk_flush_complete_seq used software RAID1, and ran our > downstream kernel based on (Ubuntu) kernel 6.8. > > On mainline kernel 6.9, I can reproduce the crash [1] by: > > * compiling the kernel with CONFIG_FAIL_MAKE_REQUEST enabled > * setting up root on software RAID1 > * enabling the write-intent bitmap with `mdadm --grow --bitmap=internal > /dev/md0`. So far, I have not been able to reproduce the crash with the > write-intent bitmap disabled. > * injecting write faults via CONFIG_FAIL_MAKE_REQUEST on one of the > RAID1 disks > * generating IO in a KVM guest with fio > * waiting ~5 minutes > > I've used this reproducer for a bisect, which produced > > 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine) > > as the first commit with which I can reproduce the crashes. I'm not 100% > sure it is this one because the reproducer is a bit flaky. But it does > sound plausible, as the commit is included in our 6.8 kernel, and > touches `queuelist` which is AFAICT where blk_flush_complete_seq > dereferences the NULL pointer. Ok, it will be better that I can reproduce it locally, will try later. > > Does anyone have an idea what could be the cause for the crash, or how > to further debug this? Happy to provide more information if needed, as > well as the complete reproducer (I'd need to clean it up a little bit > first). BUG shows it panic on 0000000000000008, not sure what it's accessing then, does it means rq->queuelist.next == 0 or something? Could you use add2line to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0 and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;", obviously fq can't be NULL. (I'm using the v6.9 kernel) Thanks. > > Do you think this could also affect setups without software RAID? > > Best, > > Friedrich > > [1] > > [ 132.292488] BUG: kernel NULL pointer dereference, address: > 0000000000000008 > [ 132.293594] #PF: supervisor write access in kernel mode > [ 132.294408] #PF: error_code(0x0002) - not-present page > [ 132.295187] PGD 0 P4D 0 > [ 132.295609] Oops: 0002 [#1] PREEMPT SMP NOPTI > [ 132.296285] CPU: 0 PID: 776 Comm: kvm Tainted: G E > 6.5.0-rc2-bisect11+ #22 > [ 132.297556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 > [ 132.299292] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0 > [ 132.300122] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 > 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 > 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b > [ 132.302946] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046 > [ 132.303765] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX: > ffff92d00c2fd048 > [ 132.304856] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI: > 0000000000000000 > [ 132.305948] RBP: ffffa431407a39f8 R08: 0000000000000000 R09: > 0000000000000000 > [ 132.307043] R10: 0000000000000000 R11: 0000000000000000 R12: > 0000000000000801 > [ 132.308120] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15: > ffff92d00ba09e00 > [ 132.309216] FS: 0000727d191b86c0(0000) GS:ffff92d137c00000(0000) > knlGS:0000000000000000 > [ 132.310457] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 132.311355] CR2: 0000000000000008 CR3: 000000018f088006 CR4: > 0000000000372ef0 > [ 132.312444] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 132.313535] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 132.314633] Call Trace: > [ 132.315016] <TASK> > [ 132.315381] ? show_regs+0x6d/0x80 > [ 132.315921] ? __die+0x24/0x80 > [ 132.316421] ? page_fault_oops+0x176/0x500 > [ 132.317049] ? do_user_addr_fault+0x31d/0x6a0 > [ 132.317744] ? exc_page_fault+0x83/0x1b0 > [ 132.318368] ? asm_exc_page_fault+0x27/0x30 > [ 132.319008] ? blk_flush_complete_seq+0x296/0x2e0 > [ 132.319759] ? __blk_mq_alloc_requests+0x383/0x3a0 > [ 132.320505] ? wbt_wait+0xb3/0x100 > [ 132.321041] blk_insert_flush+0xd1/0x220 > [ 132.321664] blk_mq_submit_bio+0x564/0x690 > [ 132.322294] __submit_bio+0xb3/0x1c0 > [ 132.322859] submit_bio_noacct_nocheck+0x2b7/0x390 > [ 132.323622] submit_bio_noacct+0x18a/0x6b0 > [ 132.324241] submit_bio+0x6c/0x80 > [ 132.324774] md_super_write+0xd1/0x120 > [ 132.325376] write_page+0x23b/0x3f0 > [ 132.325915] ? md_bitmap_wait_writes+0xda/0xf0 > [ 132.326624] md_bitmap_unplug+0x9f/0x140 > [ 132.327251] flush_bio_list+0x102/0x110 [raid1] > [ 132.327963] raid1_unplug+0x3c/0xe0 [raid1] > [ 132.328626] __blk_flush_plug+0xc1/0x130 > [ 132.329238] blk_finish_plug+0x31/0x50 > [ 132.329819] io_submit_sqes+0x53e/0x680 > [ 132.330426] __do_sys_io_uring_enter+0x59a/0xc10 > [ 132.331149] ? vfs_read+0x20a/0x360 > [ 132.331699] __x64_sys_io_uring_enter+0x22/0x40 > [ 132.332414] do_syscall_64+0x58/0x90 > [ 132.332979] ? ksys_read+0xe6/0x100 > [ 132.333530] ? exit_to_user_mode_prepare+0x49/0x220 > [ 132.334292] ? syscall_exit_to_user_mode+0x1b/0x50 > [ 132.335032] ? do_syscall_64+0x67/0x90 > [ 132.335627] ? syscall_exit_to_user_mode+0x1b/0x50 > [ 132.336381] ? do_syscall_64+0x67/0x90 > [ 132.336958] ? syscall_exit_to_user_mode+0x1b/0x50 > [ 132.337707] ? do_syscall_64+0x67/0x90 > [ 132.338296] ? syscall_exit_to_user_mode+0x1b/0x50 > [ 132.339045] ? do_syscall_64+0x67/0x90 > [ 132.339646] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 > [ 132.340426] RIP: 0033:0x727d26282b95 > [ 132.341011] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 > d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f > 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 > [ 132.343837] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX: > 00000000000001aa > [ 132.344990] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX: > 0000727d26282b95 > [ 132.346079] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > 0000000000000033 > [ 132.347176] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09: > 0000000000000008 > [ 132.348272] R10: 0000000000000000 R11: 0000000000000246 R12: > 0000727d0c003ac0 > [ 132.349376] R13: 0000000000000000 R14: 00005a5a2313bc68 R15: > 00005a5a25cf0450 > [ 132.350472] </TASK> > [ 132.350828] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) > cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) > ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) > ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) > sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) > nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E) > kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E) > polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E) > cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) > mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) > efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) > autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E) > async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) > async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) > crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E) > uhci_hcd(E) ttm(E) > [ 132.350919] ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) > [ 132.365204] CR2: 0000000000000008 > [ 132.365747] ---[ end trace 0000000000000000 ]--- > [ 132.366468] RIP: 0010:blk_flush_complete_seq+0x296/0x2e0 > [ 132.367290] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 > 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 > 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b > [ 132.370119] RSP: 0018:ffffa431407a39b8 EFLAGS: 00010046 > [ 132.370935] RAX: 0000000000000000 RBX: ffff92d00c2fd000 RCX: > ffff92d00c2fd048 > [ 132.372015] RDX: ffff92d0036dfaa0 RSI: ffff92d00c2fd048 RDI: > 0000000000000000 > [ 132.373115] RBP: ffffa431407a39f8 R08: 0000000000000000 R09: > 0000000000000000 > [ 132.374193] R10: 0000000000000000 R11: 0000000000000000 R12: > 0000000000000801 > [ 132.375286] R13: ffff92d0036dfa80 R14: ffff92d0036dfa90 R15: > ffff92d00ba09e00 > [ 132.376385] FS: 0000727d191b86c0(0000) GS:ffff92d137c00000(0000) > knlGS:0000000000000000 > [ 132.377600] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 132.378486] CR2: 0000000000000008 CR3: 000000018f088006 CR4: > 0000000000372ef0 > [ 132.379564] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 132.380638] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 132.381738] note: kvm[776] exited with irqs disabled > [ 132.382631] note: kvm[776] exited with preempt_count 1 > [ 132.383469] ------------[ cut here ]------------ > [ 132.384193] WARNING: CPU: 0 PID: 776 at kernel/exit.c:818 > do_exit+0x8db/0xae0 > [ 132.385307] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) > cifs_arc4(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) > ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) > ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) > sunrpc(E) softdog(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) > nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_pmc_core(E) > kvm_intel(E) kvm(E) irqbypass(E) crct10dif_pclmul(E) polyval_clmulni(E) > polyval_generic(E) ghash_clmulni_intel(E) aesni_intel(E) crypto_simd(E) > cryptd(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) > mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) > efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) > autofs4(E) hid_generic(E) usbhid(E) hid(E) raid10(E) raid456(E) > async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) > async_tx(E) xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) > crc32_pclmul(E) bochs(E) drm_vram_helper(E) psmouse(E) drm_ttm_helper(E) > uhci_hcd(E) ttm(E) > [ 132.385396] ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) > [ 132.399854] CPU: 0 PID: 776 Comm: kvm Tainted: G D E > 6.5.0-rc2-bisect11+ #22 > [ 132.401165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 > [ 132.402987] RIP: 0010:do_exit+0x8db/0xae0 > [ 132.403661] Code: e9 44 f8 ff ff 48 8b bb d8 09 00 00 31 f6 e8 fc d7 > ff ff e9 f0 fd ff ff 4c 89 ee bf 05 06 00 00 e8 4a 2e 01 00 e9 70 f8 ff > ff <0f> 0b e9 9e f7 ff ff 0f 0b e9 57 f7 ff ff 48 89 df e8 bf 7d 13 00 > [ 132.406622] RSP: 0018:ffffa431407a3ec8 EFLAGS: 00010282 > [ 132.407487] RAX: 0000000000000000 RBX: ffff92d08d53a940 RCX: > 0000000000000000 > [ 132.408637] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > 0000000000000000 > [ 132.409791] RBP: ffffa431407a3f20 R08: 0000000000000000 R09: > 0000000000000000 > [ 132.410961] R10: 0000000000000000 R11: 0000000000000000 R12: > ffff92d0031cba80 > [ 132.412102] R13: 0000000000000009 R14: ffff92d090d53180 R15: > 0000000000000000 > [ 132.413256] FS: 0000727d191b86c0(0000) GS:ffff92d137c00000(0000) > knlGS:0000000000000000 > [ 132.414582] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 132.415512] CR2: 0000000000000008 CR3: 000000018f088006 CR4: > 0000000000372ef0 > [ 132.416670] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 132.417842] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 132.418998] Call Trace: > [ 132.419430] <TASK> > [ 132.419788] ? show_regs+0x6d/0x80 > [ 132.420352] ? __warn+0x89/0x160 > [ 132.420891] ? do_exit+0x8db/0xae0 > [ 132.421477] ? report_bug+0x17e/0x1b0 > [ 132.422063] ? handle_bug+0x46/0x90 > [ 132.422655] ? exc_invalid_op+0x18/0x80 > [ 132.423299] ? asm_exc_invalid_op+0x1b/0x20 > [ 132.423975] ? do_exit+0x8db/0xae0 > [ 132.424565] ? do_exit+0x72/0xae0 > [ 132.425131] ? _printk+0x60/0x90 > [ 132.425666] make_task_dead+0x86/0x180 > [ 132.426297] rewind_stack_and_make_dead+0x17/0x20 > [ 132.427056] RIP: 0033:0x727d26282b95 > [ 132.427681] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 > d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f > 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 > [ 132.430663] RSP: 002b:0000727d191b2fd8 EFLAGS: 00000246 ORIG_RAX: > 00000000000001aa > [ 132.431895] RAX: ffffffffffffffda RBX: 0000727d0c0039d0 RCX: > 0000727d26282b95 > [ 132.433049] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > 0000000000000033 > [ 132.434203] RBP: 0000727d0c0039d8 R08: 0000000000000000 R09: > 0000000000000008 > [ 132.435372] R10: 0000000000000000 R11: 0000000000000246 R12: > 0000727d0c003ac0 > [ 132.436529] R13: 0000000000000000 R14: 00005a5a2313bc68 R15: > 00005a5a25cf0450 > [ 132.437685] </TASK> > [ 132.438056] ---[ end trace 0000000000000000 ]--- >
Hi Chengming, Thank you for taking a look at this! On 27/05/2024 07:09, Chengming Zhou wrote: >> I've used this reproducer for a bisect, which produced >> >> 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine) >> >> as the first commit with which I can reproduce the crashes. I'm not 100% >> sure it is this one because the reproducer is a bit flaky. But it does >> sound plausible, as the commit is included in our 6.8 kernel, and >> touches `queuelist` which is AFAICT where blk_flush_complete_seq >> dereferences the NULL pointer. > > Ok, it will be better that I can reproduce it locally, will try later. Interestingly, so far I haven't been able to reproduce the crash when generating IO on the host itself, I only got crashes when generating IO in a QEMU VM. The reproducer in more detail: - Compile Linux 6.9 with CONFIG_FAULT_INJECTION, CONFIG_FAULT_INJECTION_DEBUG_FS, CONFIG_FAIL_MAKE_REQUEST and boot it on the host - On the host, I have root on ext4 on top of a RAID1 software raid, see [0] for mdadm output. I have to enable the write-intent bitmap to reproduce the crash. - Run QEMU to start a Linux VM whose disk is backed by a qcow2 file on root, see [1] for QEMU 8.2.2 command line. The VM is running Proxmox VE here, but I don't think that matters. - Run fio inside the VM as follows: fio --name foo --size 1M --filename foo.bin --rw=write --bs=4k --iodepth 4 --ioengine libaio --numjobs 4 --time_based --runtime 600 - Inject faults on sdb1 on the host: echo 1 > /sys/block/sdb/sdb1/make-it-fail echo 1000 > /sys/kernel/debug/fail_make_request/times echo 50 > /sys/kernel/debug/fail_make_request/interval echo 5 > /sys/kernel/debug/fail_make_request/probability echo 1 > /sys/kernel/debug/fail_make_request/verbose - And usually (in somewhat 80% of cases) I get a host crash within ~5 minutes. Sometimes the software RAID declares sdb1 to be faulty and disables it, in this case I have to try again. >> Does anyone have an idea what could be the cause for the crash, or how >> to further debug this? Happy to provide more information if needed, as >> well as the complete reproducer (I'd need to clean it up a little bit >> first). > > BUG shows it panic on 0000000000000008, not sure what it's accessing then, > does it means rq->queuelist.next == 0 or something? Could you use add2line > to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0 > and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;", > obviously fq can't be NULL. (I'm using the v6.9 kernel) Sorry for the confusion, the crash dump was from a kernel compiled at 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for a kernel 6.9 crash dump. I don't know too much about kernel debugging, but I tried to get something useful out of addr2line: # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2 blk_flush_complete_seq+0x291/0x2d0 __list_del /[...]./include/linux/list.h:195 I tried to find the relevant portions in `objdump -SD blk-flush.o`, see [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to 351: 48 89 4f 08 mov %rcx,0x8(%rdi) To me this looks like part of list_move_tail(&rq->queuelist, pending); What do you think? FWIW, I've also compiled a 6.9 kernel with CONFIG_LIST_DEBUG, and when running the reproducer it started reporting corruptions, see [4] for the first one. It did not crash, though, maybe because CONFIG_LIST_DEBUG prevents the NULL pointer dereference? Hope this helps! If I can provide anything else, just let me know. Best wishes, Friedrich [0] # mdadm --detail /dev/md0 /dev/md0: Version : 1.2 Creation Time : Wed May 22 10:14:40 2024 Raid Level : raid1 Array Size : 33534976 (31.98 GiB 34.34 GB) Used Dev Size : 33534976 (31.98 GiB 34.34 GB) Raid Devices : 2 Total Devices : 2 Persistence : Superblock is persistent Intent Bitmap : Internal Update Time : Mon May 27 17:31:02 2024 State : active Active Devices : 2 Working Devices : 2 Failed Devices : 0 Spare Devices : 0 Consistency Policy : bitmap Name : reproflushfull:0 (local to host reproflushfull) UUID : fda4a959:d2dd0bef:d7094fb1:c6e3174c Events : 749 Number Major Minor RaidDevice State 0 8 1 0 active sync /dev/sda1 1 8 17 1 active sync /dev/sdb1 [1] ./qemu-8.2.2/qemu-system-x86_64 \ -accel kvm \ -name 'pve,debug-threads=on' \ -chardev 'socket,id=qmp,path=/var/run/qemu-server/100.qmp,server=on,wait=off' \ -mon 'chardev=qmp,mode=control' \ -chardev 'socket,id=qmp-event,path=/var/run/qmeventd.sock,reconnect=5' \ -mon 'chardev=qmp-event,mode=control' \ -pidfile /var/run/qemu-server/100.pid \ -smbios 'type=1,uuid=25a3012b-e72a-4121-9e57-f6f5a40ebe62' \ -smp '4,sockets=1,cores=4,maxcpus=4' \ -nodefaults \ -vnc 'unix:/var/run/qemu-server/100.vnc,password=on' \ -cpu host,+kvm_pv_eoi,+kvm_pv_unhalt \ -m 4096 \ -object 'iothread,id=iothread-virtioscsi0' \ -device 'pci-bridge,id=pci.1,chassis_nr=1,bus=pci.0,addr=0x1e' \ -device 'pci-bridge,id=pci.2,chassis_nr=2,bus=pci.0,addr=0x1f' \ -device 'pci-bridge,id=pci.3,chassis_nr=3,bus=pci.0,addr=0x5' \ -device 'vmgenid,guid=ab6d50e3-d574-4ba8-bb0c-3c0bbcc9cd2e' \ -device 'piix3-usb-uhci,id=uhci,bus=pci.0,addr=0x1.0x2' \ -device 'VGA,id=vga,bus=pci.0,addr=0x2' \ -device 'virtio-scsi-pci,id=virtioscsi0,bus=pci.3,addr=0x1,iothread=iothread-virtioscsi0' \ -drive 'file=/var/lib/vz/images/100/vm-100-disk-0.qcow2,if=none,id=drive-scsi0,format=qcow2,cache=none,aio=io_uring,detect-zeroes=on' \ -device 'scsi-hd,bus=virtioscsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0,id=scsi0,bootindex=100' \ -machine 'type=pc' [2] [ 85.282301] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 85.283454] #PF: supervisor write access in kernel mode [ 85.284289] #PF: error_code(0x0002) - not-present page [ 85.285087] PGD 0 P4D 0 [ 85.285500] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 85.286211] CPU: 0 PID: 794 Comm: kvm Tainted: G E 6.9.0-debug2 #27 [ 85.287420] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 85.289177] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0 [ 85.289994] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b [ 85.292894] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046 [ 85.293714] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX: ffff8b9c8abad048 [ 85.294809] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI: 0000000000000000 [ 85.295921] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09: 0000000000000000 [ 85.297111] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000009801 [ 85.298211] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15: ffff8b9c8341ddc0 [ 85.299353] FS: 0000776683e006c0(0000) GS:ffff8b9db7a00000(0000) knlGS:0000000000000000 [ 85.300825] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 85.301712] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4: 0000000000372ef0 [ 85.302810] Call Trace: [ 85.303242] <TASK> [ 85.303597] ? show_regs+0x6c/0x80 [ 85.304169] ? __die+0x24/0x80 [ 85.304671] ? page_fault_oops+0x175/0x5b0 [ 85.305326] ? do_user_addr_fault+0x311/0x680 [ 85.306012] ? exc_page_fault+0x82/0x1b0 [ 85.306657] ? asm_exc_page_fault+0x27/0x30 [ 85.307297] ? blk_flush_complete_seq+0x291/0x2d0 [ 85.308023] ? __blk_mq_alloc_requests+0x251/0x540 [ 85.308785] ? wbt_wait+0x33/0x100 [ 85.309620] blk_insert_flush+0xce/0x220 [ 85.310307] blk_mq_submit_bio+0x5cd/0x730 [ 85.310948] __submit_bio+0xb3/0x1c0 [ 85.311535] submit_bio_noacct_nocheck+0x2fc/0x3d0 [ 85.312296] submit_bio_noacct+0x1ba/0x6b0 [ 85.312934] ? ext4_file_write_iter+0x3b1/0x7e0 [ 85.313663] submit_bio+0xb1/0x110 [ 85.314220] md_super_write+0xcf/0x110 [ 85.314816] write_sb_page+0x148/0x300 [ 85.315422] filemap_write_page+0x5b/0x70 [ 85.316058] md_bitmap_unplug+0x99/0x1f0 [ 85.316676] flush_bio_list+0x107/0x110 [raid1] [ 85.317399] raid1_unplug+0x3c/0xf0 [raid1] [ 85.318056] __blk_flush_plug+0xbe/0x140 [ 85.318673] blk_finish_plug+0x30/0x50 [ 85.319268] io_submit_sqes+0x4c4/0x6c0 [ 85.319872] __do_sys_io_uring_enter+0x2f2/0x640 [ 85.320590] ? blk_mq_complete_request+0x26/0x40 [ 85.321326] __x64_sys_io_uring_enter+0x22/0x40 [ 85.322044] x64_sys_call+0x20b9/0x24b0 [ 85.322656] do_syscall_64+0x80/0x170 [ 85.323243] ? vring_interrupt+0x94/0x110 [ 85.323878] ? __x64_sys_ppoll+0xf2/0x170 [ 85.324501] ? handle_irq_event+0x52/0x80 [ 85.325103] ? _raw_spin_unlock+0xe/0x40 [ 85.325728] ? handle_edge_irq+0xda/0x250 [ 85.326347] ? irqentry_exit_to_user_mode+0x76/0x270 [ 85.327114] ? irqentry_exit+0x43/0x50 [ 85.327703] ? clear_bhb_loop+0x15/0x70 [ 85.328286] ? clear_bhb_loop+0x15/0x70 [ 85.328897] ? clear_bhb_loop+0x15/0x70 [ 85.329541] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 85.330326] RIP: 0033:0x776695660b95 [ 85.330908] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 [ 85.333736] RSP: 002b:0000776683dfafd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa [ 85.334911] RAX: ffffffffffffffda RBX: 000077667c0039f0 RCX: 0000776695660b95 [ 85.335999] RDX: 0000000000000000 RSI: 0000000000000003 RDI: 0000000000000033 [ 85.337111] RBP: 000077667c0039f8 R08: 0000000000000000 R09: 0000000000000008 [ 85.338210] R10: 0000000000000000 R11: 0000000000000246 R12: 000077667c003ae0 [ 85.339304] R13: 0000000000000000 R14: 000064175d5afc68 R15: 000064178a3f9450 [ 85.340408] </TASK> [ 85.340772] Modules linked in: tcp_diag(E) inet_diag(E) veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) [ 85.340859] xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E) uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E) [ 85.357137] CR2: 0000000000000008 [ 85.358209] ---[ end trace 0000000000000000 ]--- [ 85.358926] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0 [ 85.360004] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b [ 85.362891] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046 [ 85.363695] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX: ffff8b9c8abad048 [ 85.364800] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI: 0000000000000000 [ 85.365902] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09: 0000000000000000 [ 85.366993] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000009801 [ 85.368089] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15: ffff8b9c8341ddc0 [ 85.369190] FS: 0000776683e006c0(0000) GS:ffff8b9db7a00000(0000) knlGS:0000000000000000 [ 85.370440] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 85.371327] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4: 0000000000372ef0 [ 85.372440] note: kvm[794] exited with irqs disabled [ 85.373340] note: kvm[794] exited with preempt_count 1 [ 85.374238] ------------[ cut here ]------------ [ 85.374943] WARNING: CPU: 0 PID: 794 at kernel/exit.c:827 do_exit+0x8a2/0xab0 [ 85.376047] Modules linked in: tcp_diag(E) inet_diag(E) veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) [ 85.376128] xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E) uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E) [ 85.392395] CPU: 0 PID: 794 Comm: kvm Tainted: G D E 6.9.0-debug2 #27 [ 85.393594] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 85.395347] RIP: 0010:do_exit+0x8a2/0xab0 [ 85.395975] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07 00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00 [ 85.398793] RSP: 0018:ffff9c08c07f3ec8 EFLAGS: 00010286 [ 85.399625] RAX: 0000000000000000 RBX: ffff8b9c82e90000 RCX: 0000000000000000 [ 85.400720] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [3] 00000000000000c0 <blk_flush_complete_seq>: { [...] struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; 32a: 49 8d 56 01 lea 0x1(%r14),%rdx 32e: 49 c1 e6 04 shl $0x4,%r14 332: 4d 01 ee add %r13,%r14 335: 48 c1 e2 04 shl $0x4,%rdx return READ_ONCE(head->next) == head; 339: 49 8b 4e 10 mov 0x10(%r14),%rcx 33d: 4c 01 ea add %r13,%rdx if (list_empty(pending)) 340: 48 39 ca cmp %rcx,%rdx 343: 74 2b je 370 <blk_flush_complete_seq+0x2b0> __list_del(entry->prev, entry->next); 345: 48 8b 4b 50 mov 0x50(%rbx),%rcx 349: 48 8b 7b 48 mov 0x48(%rbx),%rdi list_move_tail(&rq->queuelist, pending); 34d: 48 8d 73 48 lea 0x48(%rbx),%rsi next->prev = prev; 351: 48 89 4f 08 mov %rcx,0x8(%rdi) WRITE_ONCE(prev->next, next); 355: 48 89 39 mov %rdi,(%rcx) __list_add(new, head->prev, head); 358: 49 8b 4e 18 mov 0x18(%r14),%rcx next->prev = new; 35c: 49 89 76 18 mov %rsi,0x18(%r14) new->next = next; 360: 48 89 53 48 mov %rdx,0x48(%rbx) new->prev = prev; 364: 48 89 4b 50 mov %rcx,0x50(%rbx) WRITE_ONCE(prev->next, new); 368: 48 89 31 mov %rsi,(%rcx) } 36b: e9 30 fe ff ff jmp 1a0 <blk_flush_complete_seq+0xe0> fq->flush_pending_since = jiffies; 370: 48 8b 0d 00 00 00 00 mov 0x0(%rip),%rcx # 377 <blk_flush_complete_seq+0x2b7> 377: 49 89 4d 08 mov %rcx,0x8(%r13) 37b: eb c8 jmp 345 <blk_flush_complete_seq+0x285> [4] May 27 11:12:53 reproflushfull kernel: list_del corruption. prev->next should be ffff8ad44ab63788, but was ffff8ad44ab63400. (prev=ffff8ad44ab63788) May 27 11:12:53 reproflushfull kernel: WARNING: CPU: 0 PID: 797 at lib/list_debug.c:62 __list_del_entry_valid_or_report+0xd4/0x100 May 27 11:12:53 reproflushfull kernel: Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) hid_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E) raid6_pq(E) May 27 11:12:53 reproflushfull kernel: libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) bochs(E) psmouse(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E) ttm(E) i2c_piix4(E) ehci_hcd(E) pata_acpi(E) floppy(E) May 27 11:12:53 reproflushfull kernel: CPU: 0 PID: 797 Comm: kvm Tainted: G E 6.9.0-debug #25 May 27 11:12:53 reproflushfull kernel: Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 May 27 11:12:53 reproflushfull kernel: RIP: 0010:__list_del_entry_valid_or_report+0xd4/0x100 May 27 11:12:53 reproflushfull kernel: Code: 48 89 fe 48 89 ca 48 c7 c7 78 77 9f a1 e8 84 3a 8e ff 0f 0b 31 c0 eb b0 48 89 fe 48 89 c2 48 c7 c7 b0 77 9f a1 e8 6c 3a 8e ff <0f> 0b 31 c0 eb 98 48 89 d1 48 c7 c7 f8 77 9f a1 48 89 f2 48 89 c6 May 27 11:12:53 reproflushfull kernel: RSP: 0018:ffffb1dd80673740 EFLAGS: 00010046 May 27 11:12:53 reproflushfull kernel: RAX: 0000000000000000 RBX: ffff8ad440be12c0 RCX: 0000000000000000 May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 May 27 11:12:53 reproflushfull kernel: RBP: ffffb1dd80673740 R08: 0000000000000000 R09: 0000000000000000 May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11: 0000000000000000 R12: ffff8ad44a8c1e00 May 27 11:12:53 reproflushfull kernel: R13: ffff8ad440be12c0 R14: ffff8ad44ab63788 R15: ffff8ad44ab63740 May 27 11:12:53 reproflushfull kernel: FS: 00007e8a67c006c0(0000) GS:ffff8ad577a00000(0000) knlGS:0000000000000000 May 27 11:12:53 reproflushfull kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 May 27 11:12:53 reproflushfull kernel: CR2: 00007e8a5801a008 CR3: 000000010e022001 CR4: 0000000000372ef0 May 27 11:12:53 reproflushfull kernel: Call Trace: May 27 11:12:53 reproflushfull kernel: <TASK> May 27 11:12:53 reproflushfull kernel: ? show_regs+0x6c/0x80 May 27 11:12:53 reproflushfull kernel: ? __warn+0x88/0x140 May 27 11:12:53 reproflushfull kernel: ? __list_del_entry_valid_or_report+0xd4/0x100 May 27 11:12:53 reproflushfull kernel: ? report_bug+0x182/0x1b0 May 27 11:12:53 reproflushfull kernel: ? handle_bug+0x46/0x90 May 27 11:12:53 reproflushfull kernel: ? exc_invalid_op+0x18/0x80 May 27 11:12:53 reproflushfull kernel: ? asm_exc_invalid_op+0x1b/0x20 May 27 11:12:53 reproflushfull kernel: ? __list_del_entry_valid_or_report+0xd4/0x100 May 27 11:12:53 reproflushfull kernel: blk_flush_complete_seq+0x2f3/0x360 May 27 11:12:53 reproflushfull kernel: ? wbt_wait+0x33/0x100 May 27 11:12:53 reproflushfull kernel: blk_insert_flush+0xce/0x220 May 27 11:12:53 reproflushfull kernel: blk_mq_submit_bio+0x5cd/0x730 May 27 11:12:53 reproflushfull kernel: __submit_bio+0xb3/0x1c0 May 27 11:12:53 reproflushfull kernel: submit_bio_noacct_nocheck+0x2fc/0x3d0 May 27 11:12:53 reproflushfull kernel: submit_bio_noacct+0x1ba/0x6b0 May 27 11:12:53 reproflushfull kernel: submit_bio+0xb1/0x110 May 27 11:12:53 reproflushfull kernel: md_super_write+0xcf/0x110 May 27 11:12:53 reproflushfull kernel: write_sb_page+0x148/0x300 May 27 11:12:53 reproflushfull kernel: filemap_write_page+0x5b/0x70 May 27 11:12:53 reproflushfull kernel: md_bitmap_unplug+0x99/0x1f0 May 27 11:12:53 reproflushfull kernel: flush_bio_list+0x107/0x110 [raid1] May 27 11:12:53 reproflushfull kernel: raid1_unplug+0x3c/0xf0 [raid1] May 27 11:12:53 reproflushfull kernel: __blk_flush_plug+0xd9/0x170 May 27 11:12:53 reproflushfull kernel: blk_finish_plug+0x30/0x50 May 27 11:12:53 reproflushfull kernel: io_submit_sqes+0x4c4/0x6c0 May 27 11:12:53 reproflushfull kernel: __do_sys_io_uring_enter+0x2f2/0x640 May 27 11:12:53 reproflushfull kernel: __x64_sys_io_uring_enter+0x22/0x40 May 27 11:12:53 reproflushfull kernel: x64_sys_call+0x20b9/0x24b0 May 27 11:12:53 reproflushfull kernel: do_syscall_64+0x80/0x170 May 27 11:12:53 reproflushfull kernel: ? do_mprotect_pkey+0x198/0x620 May 27 11:12:53 reproflushfull kernel: ? __memcg_slab_post_alloc_hook+0x18e/0x230 May 27 11:12:53 reproflushfull kernel: ? policy_nodemask+0x145/0x180 May 27 11:12:53 reproflushfull kernel: ? __mod_memcg_lruvec_state+0x87/0x120 May 27 11:12:53 reproflushfull kernel: ? __mod_lruvec_state+0x36/0x50 May 27 11:12:53 reproflushfull kernel: ? __lruvec_stat_mod_folio+0x70/0xc0 May 27 11:12:53 reproflushfull kernel: ? set_ptes.constprop.0+0x2b/0xb0 May 27 11:12:53 reproflushfull kernel: ? _raw_spin_unlock+0xe/0x40 May 27 11:12:53 reproflushfull kernel: ? do_anonymous_page+0x23d/0x790 May 27 11:12:53 reproflushfull kernel: ? __pte_offset_map+0x1c/0x1b0 May 27 11:12:53 reproflushfull kernel: ? __handle_mm_fault+0xc1a/0xe90 May 27 11:12:53 reproflushfull kernel: ? do_syscall_64+0x8c/0x170 May 27 11:12:53 reproflushfull kernel: ? __count_memcg_events+0x6f/0xe0 May 27 11:12:53 reproflushfull kernel: ? count_memcg_events.constprop.0+0x2a/0x50 May 27 11:12:53 reproflushfull kernel: ? handle_mm_fault+0xaf/0x340 May 27 11:12:53 reproflushfull kernel: ? do_user_addr_fault+0x365/0x680 May 27 11:12:53 reproflushfull kernel: ? irqentry_exit_to_user_mode+0x76/0x270 May 27 11:12:53 reproflushfull kernel: ? irqentry_exit+0x43/0x50 May 27 11:12:53 reproflushfull kernel: ? clear_bhb_loop+0x15/0x70 May 27 11:12:53 reproflushfull kernel: ? clear_bhb_loop+0x15/0x70 May 27 11:12:53 reproflushfull kernel: ? clear_bhb_loop+0x15/0x70 May 27 11:12:53 reproflushfull kernel: entry_SYSCALL_64_after_hwframe+0x76/0x7e May 27 11:12:53 reproflushfull kernel: RIP: 0033:0x7e8a75069b95 May 27 11:12:53 reproflushfull kernel: Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 May 27 11:12:53 reproflushfull kernel: RSP: 002b:00007e8a67bfafd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa May 27 11:12:53 reproflushfull kernel: RAX: ffffffffffffffda RBX: 00007e8a580039f0 RCX: 00007e8a75069b95 May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI: 0000000000000020 RDI: 0000000000000033 May 27 11:12:53 reproflushfull kernel: RBP: 00007e8a580039f8 R08: 0000000000000000 R09: 0000000000000008 May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11: 0000000000000246 R12: 00007e8a58003ae0 May 27 11:12:53 reproflushfull kernel: R13: 0000000000000000 R14: 000064a145ca1c68 R15: 000064a183935450 May 27 11:12:53 reproflushfull kernel: </TASK> May 27 11:12:53 reproflushfull kernel: ---[ end trace 0000000000000000 ]---
On 2024/5/28 00:04, Friedrich Weber wrote: > Hi Chengming, > > Thank you for taking a look at this! > > On 27/05/2024 07:09, Chengming Zhou wrote: >>> I've used this reproducer for a bisect, which produced >>> >>> 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine) >>> >>> as the first commit with which I can reproduce the crashes. I'm not 100% >>> sure it is this one because the reproducer is a bit flaky. But it does >>> sound plausible, as the commit is included in our 6.8 kernel, and >>> touches `queuelist` which is AFAICT where blk_flush_complete_seq >>> dereferences the NULL pointer. >> >> Ok, it will be better that I can reproduce it locally, will try later. > > Interestingly, so far I haven't been able to reproduce the crash when > generating IO on the host itself, I only got crashes when generating IO > in a QEMU VM. > > The reproducer in more detail: Thanks for these details, I will try to setup and reproduce when I back to work. > > - Compile Linux 6.9 with CONFIG_FAULT_INJECTION, [...] >> >> BUG shows it panic on 0000000000000008, not sure what it's accessing then, >> does it means rq->queuelist.next == 0 or something? Could you use add2line >> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0 >> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;", >> obviously fq can't be NULL. (I'm using the v6.9 kernel) > > Sorry for the confusion, the crash dump was from a kernel compiled at > 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for > a kernel 6.9 crash dump. > > I don't know too much about kernel debugging, but I tried to get > something useful out of addr2line: > > # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2 > blk_flush_complete_seq+0x291/0x2d0 > __list_del > /[...]./include/linux/list.h:195 > > I tried to find the relevant portions in `objdump -SD blk-flush.o`, see > [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to > > 351: 48 89 4f 08 mov %rcx,0x8(%rdi) > > To me this looks like part of > > list_move_tail(&rq->queuelist, pending); > > What do you think? Yeah, it seems correct, so the rq->queuelist.next == NULL. It can't be NULL if went through REQ_FSEQ_POSTFLUSH, so it must be REQ_FSEQ_PREFLUSH. It means we allocated a request but its queuelist is not initialized or corrupted? Anyway, I will use below changes for debugging when reproduce, and you could also try this to see if we could get something useful. :) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b4df8e5ac9e..6e3a6cd7739d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2989,6 +2989,8 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_use_cached_rq(rq, plug, bio); } + BUG_ON(rq->queuelist.next == NULL); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -3006,6 +3008,8 @@ void blk_mq_submit_bio(struct bio *bio) if (bio_zone_write_plugging(bio)) blk_zone_write_plug_init_request(rq); + BUG_ON(rq->queuelist.next == NULL); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; Thanks! > > FWIW, I've also compiled a 6.9 kernel with CONFIG_LIST_DEBUG, and when > running the reproducer it started reporting corruptions, see [4] for the > first one. It did not crash, though, maybe because CONFIG_LIST_DEBUG > prevents the NULL pointer dereference? > > Hope this helps! If I can provide anything else, just let me know. > > Best wishes, > > Friedrich > > [0] > > # mdadm --detail /dev/md0 > /dev/md0: > Version : 1.2 > Creation Time : Wed May 22 10:14:40 2024 > Raid Level : raid1 > Array Size : 33534976 (31.98 GiB 34.34 GB) > Used Dev Size : 33534976 (31.98 GiB 34.34 GB) > Raid Devices : 2 > Total Devices : 2 > Persistence : Superblock is persistent > > Intent Bitmap : Internal > > Update Time : Mon May 27 17:31:02 2024 > State : active > Active Devices : 2 > Working Devices : 2 > Failed Devices : 0 > Spare Devices : 0 > > Consistency Policy : bitmap > > Name : reproflushfull:0 (local to host reproflushfull) > UUID : fda4a959:d2dd0bef:d7094fb1:c6e3174c > Events : 749 > > Number Major Minor RaidDevice State > 0 8 1 0 active sync /dev/sda1 > 1 8 17 1 active sync /dev/sdb1 > > [1] > > ./qemu-8.2.2/qemu-system-x86_64 \ > -accel kvm \ > -name 'pve,debug-threads=on' \ > -chardev > 'socket,id=qmp,path=/var/run/qemu-server/100.qmp,server=on,wait=off' \ > -mon 'chardev=qmp,mode=control' \ > -chardev 'socket,id=qmp-event,path=/var/run/qmeventd.sock,reconnect=5' \ > -mon 'chardev=qmp-event,mode=control' \ > -pidfile /var/run/qemu-server/100.pid \ > -smbios 'type=1,uuid=25a3012b-e72a-4121-9e57-f6f5a40ebe62' \ > -smp '4,sockets=1,cores=4,maxcpus=4' \ > -nodefaults \ > -vnc 'unix:/var/run/qemu-server/100.vnc,password=on' \ > -cpu host,+kvm_pv_eoi,+kvm_pv_unhalt \ > -m 4096 \ > -object 'iothread,id=iothread-virtioscsi0' \ > -device 'pci-bridge,id=pci.1,chassis_nr=1,bus=pci.0,addr=0x1e' \ > -device 'pci-bridge,id=pci.2,chassis_nr=2,bus=pci.0,addr=0x1f' \ > -device 'pci-bridge,id=pci.3,chassis_nr=3,bus=pci.0,addr=0x5' \ > -device 'vmgenid,guid=ab6d50e3-d574-4ba8-bb0c-3c0bbcc9cd2e' \ > -device 'piix3-usb-uhci,id=uhci,bus=pci.0,addr=0x1.0x2' \ > -device 'VGA,id=vga,bus=pci.0,addr=0x2' \ > -device > 'virtio-scsi-pci,id=virtioscsi0,bus=pci.3,addr=0x1,iothread=iothread-virtioscsi0' > \ > -drive > 'file=/var/lib/vz/images/100/vm-100-disk-0.qcow2,if=none,id=drive-scsi0,format=qcow2,cache=none,aio=io_uring,detect-zeroes=on' > \ > -device > 'scsi-hd,bus=virtioscsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0,id=scsi0,bootindex=100' > \ > -machine 'type=pc' > > [2] > > [ 85.282301] BUG: kernel NULL pointer dereference, address: > 0000000000000008 > [ 85.283454] #PF: supervisor write access in kernel mode > [ 85.284289] #PF: error_code(0x0002) - not-present page > [ 85.285087] PGD 0 P4D 0 > [ 85.285500] Oops: 0002 [#1] PREEMPT SMP NOPTI > [ 85.286211] CPU: 0 PID: 794 Comm: kvm Tainted: G E > 6.9.0-debug2 #27 > [ 85.287420] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 > [ 85.289177] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0 > [ 85.289994] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 > 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 > 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b > [ 85.292894] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046 > [ 85.293714] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX: > ffff8b9c8abad048 > [ 85.294809] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI: > 0000000000000000 > [ 85.295921] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09: > 0000000000000000 > [ 85.297111] R10: 0000000000000000 R11: 0000000000000000 R12: > 0000000000009801 > [ 85.298211] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15: > ffff8b9c8341ddc0 > [ 85.299353] FS: 0000776683e006c0(0000) GS:ffff8b9db7a00000(0000) > knlGS:0000000000000000 > [ 85.300825] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 85.301712] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4: > 0000000000372ef0 > [ 85.302810] Call Trace: > [ 85.303242] <TASK> > [ 85.303597] ? show_regs+0x6c/0x80 > [ 85.304169] ? __die+0x24/0x80 > [ 85.304671] ? page_fault_oops+0x175/0x5b0 > [ 85.305326] ? do_user_addr_fault+0x311/0x680 > [ 85.306012] ? exc_page_fault+0x82/0x1b0 > [ 85.306657] ? asm_exc_page_fault+0x27/0x30 > [ 85.307297] ? blk_flush_complete_seq+0x291/0x2d0 > [ 85.308023] ? __blk_mq_alloc_requests+0x251/0x540 > [ 85.308785] ? wbt_wait+0x33/0x100 > [ 85.309620] blk_insert_flush+0xce/0x220 > [ 85.310307] blk_mq_submit_bio+0x5cd/0x730 > [ 85.310948] __submit_bio+0xb3/0x1c0 > [ 85.311535] submit_bio_noacct_nocheck+0x2fc/0x3d0 > [ 85.312296] submit_bio_noacct+0x1ba/0x6b0 > [ 85.312934] ? ext4_file_write_iter+0x3b1/0x7e0 > [ 85.313663] submit_bio+0xb1/0x110 > [ 85.314220] md_super_write+0xcf/0x110 > [ 85.314816] write_sb_page+0x148/0x300 > [ 85.315422] filemap_write_page+0x5b/0x70 > [ 85.316058] md_bitmap_unplug+0x99/0x1f0 > [ 85.316676] flush_bio_list+0x107/0x110 [raid1] > [ 85.317399] raid1_unplug+0x3c/0xf0 [raid1] > [ 85.318056] __blk_flush_plug+0xbe/0x140 > [ 85.318673] blk_finish_plug+0x30/0x50 > [ 85.319268] io_submit_sqes+0x4c4/0x6c0 > [ 85.319872] __do_sys_io_uring_enter+0x2f2/0x640 > [ 85.320590] ? blk_mq_complete_request+0x26/0x40 > [ 85.321326] __x64_sys_io_uring_enter+0x22/0x40 > [ 85.322044] x64_sys_call+0x20b9/0x24b0 > [ 85.322656] do_syscall_64+0x80/0x170 > [ 85.323243] ? vring_interrupt+0x94/0x110 > [ 85.323878] ? __x64_sys_ppoll+0xf2/0x170 > [ 85.324501] ? handle_irq_event+0x52/0x80 > [ 85.325103] ? _raw_spin_unlock+0xe/0x40 > [ 85.325728] ? handle_edge_irq+0xda/0x250 > [ 85.326347] ? irqentry_exit_to_user_mode+0x76/0x270 > [ 85.327114] ? irqentry_exit+0x43/0x50 > [ 85.327703] ? clear_bhb_loop+0x15/0x70 > [ 85.328286] ? clear_bhb_loop+0x15/0x70 > [ 85.328897] ? clear_bhb_loop+0x15/0x70 > [ 85.329541] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 85.330326] RIP: 0033:0x776695660b95 > [ 85.330908] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 > d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f > 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 > [ 85.333736] RSP: 002b:0000776683dfafd8 EFLAGS: 00000246 ORIG_RAX: > 00000000000001aa > [ 85.334911] RAX: ffffffffffffffda RBX: 000077667c0039f0 RCX: > 0000776695660b95 > [ 85.335999] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > 0000000000000033 > [ 85.337111] RBP: 000077667c0039f8 R08: 0000000000000000 R09: > 0000000000000008 > [ 85.338210] R10: 0000000000000000 R11: 0000000000000246 R12: > 000077667c003ae0 > [ 85.339304] R13: 0000000000000000 R14: 000064175d5afc68 R15: > 000064178a3f9450 > [ 85.340408] </TASK> > [ 85.340772] Modules linked in: tcp_diag(E) inet_diag(E) veth(E) > cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) > iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) > ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) > ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) > binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) > intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) > intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) > kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E) > polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) > sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) > cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) > serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) > efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) > autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) > async_pq(E) async_xor(E) async_tx(E) > [ 85.340859] xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) > crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E) > uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E) > [ 85.357137] CR2: 0000000000000008 > [ 85.358209] ---[ end trace 0000000000000000 ]--- > [ 85.358926] RIP: 0010:blk_flush_complete_seq+0x291/0x2d0 > [ 85.360004] Code: 0f b6 f6 49 8d 56 01 49 c1 e6 04 4d 01 ee 48 c1 e2 > 04 49 8b 4e 10 4c 01 ea 48 39 ca 74 2b 48 8b 4b 50 48 8b 7b 48 48 8d 73 > 48 <48> 89 4f 08 48 89 39 49 8b 4e 18 49 89 76 18 48 89 53 48 48 89 4b > [ 85.362891] RSP: 0018:ffff9c08c07f3998 EFLAGS: 00010046 > [ 85.363695] RAX: 0000000000000000 RBX: ffff8b9c8abad000 RCX: > ffff8b9c8abad048 > [ 85.364800] RDX: ffff8b9c80a3d6a0 RSI: ffff8b9c8abad048 RDI: > 0000000000000000 > [ 85.365902] RBP: ffff9c08c07f39d8 R08: 0000000000000000 R09: > 0000000000000000 > [ 85.366993] R10: 0000000000000000 R11: 0000000000000000 R12: > 0000000000009801 > [ 85.368089] R13: ffff8b9c80a3d680 R14: ffff8b9c80a3d690 R15: > ffff8b9c8341ddc0 > [ 85.369190] FS: 0000776683e006c0(0000) GS:ffff8b9db7a00000(0000) > knlGS:0000000000000000 > [ 85.370440] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 85.371327] CR2: 0000000000000008 CR3: 000000010e6e4001 CR4: > 0000000000372ef0 > [ 85.372440] note: kvm[794] exited with irqs disabled > [ 85.373340] note: kvm[794] exited with preempt_count 1 > [ 85.374238] ------------[ cut here ]------------ > [ 85.374943] WARNING: CPU: 0 PID: 794 at kernel/exit.c:827 > do_exit+0x8a2/0xab0 > [ 85.376047] Modules linked in: tcp_diag(E) inet_diag(E) veth(E) > cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) > iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) > ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) > ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) > binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) > intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) > intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) > kvm_intel(E) kvm(E) hid_generic(E) crct10dif_pclmul(E) > polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) > sha256_ssse3(E) sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) > cryptd(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) > serio_raw(E) mac_hid(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) > efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) > autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) > async_pq(E) async_xor(E) async_tx(E) > [ 85.376128] xor(E) raid6_pq(E) libcrc32c(E) raid0(E) raid1(E) > crc32_pclmul(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) psmouse(E) > uhci_hcd(E) ehci_hcd(E) ttm(E) i2c_piix4(E) pata_acpi(E) floppy(E) > [ 85.392395] CPU: 0 PID: 794 Comm: kvm Tainted: G D E > 6.9.0-debug2 #27 > [ 85.393594] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 > [ 85.395347] RIP: 0010:do_exit+0x8a2/0xab0 > [ 85.395975] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07 > 00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff > ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00 > [ 85.398793] RSP: 0018:ffff9c08c07f3ec8 EFLAGS: 00010286 > [ 85.399625] RAX: 0000000000000000 RBX: ffff8b9c82e90000 RCX: > 0000000000000000 > [ 85.400720] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > 0000000000000000 > > [3] > > 00000000000000c0 <blk_flush_complete_seq>: > { > [...] > struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; > 32a: 49 8d 56 01 lea 0x1(%r14),%rdx > 32e: 49 c1 e6 04 shl $0x4,%r14 > 332: 4d 01 ee add %r13,%r14 > 335: 48 c1 e2 04 shl $0x4,%rdx > return READ_ONCE(head->next) == head; > 339: 49 8b 4e 10 mov 0x10(%r14),%rcx > 33d: 4c 01 ea add %r13,%rdx > if (list_empty(pending)) > 340: 48 39 ca cmp %rcx,%rdx > 343: 74 2b je 370 <blk_flush_complete_seq+0x2b0> > __list_del(entry->prev, entry->next); > 345: 48 8b 4b 50 mov 0x50(%rbx),%rcx > 349: 48 8b 7b 48 mov 0x48(%rbx),%rdi > list_move_tail(&rq->queuelist, pending); > 34d: 48 8d 73 48 lea 0x48(%rbx),%rsi > next->prev = prev; > 351: 48 89 4f 08 mov %rcx,0x8(%rdi) > WRITE_ONCE(prev->next, next); > 355: 48 89 39 mov %rdi,(%rcx) > __list_add(new, head->prev, head); > 358: 49 8b 4e 18 mov 0x18(%r14),%rcx > next->prev = new; > 35c: 49 89 76 18 mov %rsi,0x18(%r14) > new->next = next; > 360: 48 89 53 48 mov %rdx,0x48(%rbx) > new->prev = prev; > 364: 48 89 4b 50 mov %rcx,0x50(%rbx) > WRITE_ONCE(prev->next, new); > 368: 48 89 31 mov %rsi,(%rcx) > } > 36b: e9 30 fe ff ff jmp 1a0 <blk_flush_complete_seq+0xe0> > fq->flush_pending_since = jiffies; > 370: 48 8b 0d 00 00 00 00 mov 0x0(%rip),%rcx # 377 > <blk_flush_complete_seq+0x2b7> > 377: 49 89 4d 08 mov %rcx,0x8(%r13) > 37b: eb c8 jmp 345 <blk_flush_complete_seq+0x285> > > [4] May 27 11:12:53 reproflushfull kernel: list_del corruption. > prev->next should be ffff8ad44ab63788, but was ffff8ad44ab63400. > (prev=ffff8ad44ab63788) > May 27 11:12:53 reproflushfull kernel: WARNING: CPU: 0 PID: 797 at > lib/list_debug.c:62 __list_del_entry_valid_or_report+0xd4/0x100 > May 27 11:12:53 reproflushfull kernel: Modules linked in: veth(E) > cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) > iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) > ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) > ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) > binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) > intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) > intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) > kvm_intel(E) kvm(E) crct10dif_pclmul(E) polyval_clmulni(E) > polyval_generic(E) hid_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E) > sha1_ssse3(E) aesni_intel(E) crypto_simd(E) usbhid(E) cryptd(E) hid(E) > rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) > serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) > dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) > raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) > async_xor(E) async_tx(E) xor(E) raid6_pq(E) > May 27 11:12:53 reproflushfull kernel: libcrc32c(E) raid0(E) raid1(E) > crc32_pclmul(E) bochs(E) psmouse(E) drm_vram_helper(E) drm_ttm_helper(E) > uhci_hcd(E) ttm(E) i2c_piix4(E) ehci_hcd(E) pata_acpi(E) floppy(E) > May 27 11:12:53 reproflushfull kernel: CPU: 0 PID: 797 Comm: kvm > Tainted: G E 6.9.0-debug #25 > May 27 11:12:53 reproflushfull kernel: Hardware name: QEMU Standard PC > (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org > 04/01/2014 > May 27 11:12:53 reproflushfull kernel: RIP: > 0010:__list_del_entry_valid_or_report+0xd4/0x100 > May 27 11:12:53 reproflushfull kernel: Code: 48 89 fe 48 89 ca 48 c7 c7 > 78 77 9f a1 e8 84 3a 8e ff 0f 0b 31 c0 eb b0 48 89 fe 48 89 c2 48 c7 c7 > b0 77 9f a1 e8 6c 3a 8e ff <0f> 0b 31 c0 eb 98 48 89 d1 48 c7 c7 f8 77 > 9f a1 48 89 f2 48 89 c6 > May 27 11:12:53 reproflushfull kernel: RSP: 0018:ffffb1dd80673740 > EFLAGS: 00010046 > May 27 11:12:53 reproflushfull kernel: RAX: 0000000000000000 RBX: > ffff8ad440be12c0 RCX: 0000000000000000 > May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI: > 0000000000000000 RDI: 0000000000000000 > May 27 11:12:53 reproflushfull kernel: RBP: ffffb1dd80673740 R08: > 0000000000000000 R09: 0000000000000000 > May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11: > 0000000000000000 R12: ffff8ad44a8c1e00 > May 27 11:12:53 reproflushfull kernel: R13: ffff8ad440be12c0 R14: > ffff8ad44ab63788 R15: ffff8ad44ab63740 > May 27 11:12:53 reproflushfull kernel: FS: 00007e8a67c006c0(0000) > GS:ffff8ad577a00000(0000) knlGS:0000000000000000 > May 27 11:12:53 reproflushfull kernel: CS: 0010 DS: 0000 ES: 0000 CR0: > 0000000080050033 > May 27 11:12:53 reproflushfull kernel: CR2: 00007e8a5801a008 CR3: > 000000010e022001 CR4: 0000000000372ef0 > May 27 11:12:53 reproflushfull kernel: Call Trace: > May 27 11:12:53 reproflushfull kernel: <TASK> > May 27 11:12:53 reproflushfull kernel: ? show_regs+0x6c/0x80 > May 27 11:12:53 reproflushfull kernel: ? __warn+0x88/0x140 > May 27 11:12:53 reproflushfull kernel: ? > __list_del_entry_valid_or_report+0xd4/0x100 > May 27 11:12:53 reproflushfull kernel: ? report_bug+0x182/0x1b0 > May 27 11:12:53 reproflushfull kernel: ? handle_bug+0x46/0x90 > May 27 11:12:53 reproflushfull kernel: ? exc_invalid_op+0x18/0x80 > May 27 11:12:53 reproflushfull kernel: ? asm_exc_invalid_op+0x1b/0x20 > May 27 11:12:53 reproflushfull kernel: ? > __list_del_entry_valid_or_report+0xd4/0x100 > May 27 11:12:53 reproflushfull kernel: blk_flush_complete_seq+0x2f3/0x360 > May 27 11:12:53 reproflushfull kernel: ? wbt_wait+0x33/0x100 > May 27 11:12:53 reproflushfull kernel: blk_insert_flush+0xce/0x220 > May 27 11:12:53 reproflushfull kernel: blk_mq_submit_bio+0x5cd/0x730 > May 27 11:12:53 reproflushfull kernel: __submit_bio+0xb3/0x1c0 > May 27 11:12:53 reproflushfull kernel: > submit_bio_noacct_nocheck+0x2fc/0x3d0 > May 27 11:12:53 reproflushfull kernel: submit_bio_noacct+0x1ba/0x6b0 > May 27 11:12:53 reproflushfull kernel: submit_bio+0xb1/0x110 > May 27 11:12:53 reproflushfull kernel: md_super_write+0xcf/0x110 > May 27 11:12:53 reproflushfull kernel: write_sb_page+0x148/0x300 > May 27 11:12:53 reproflushfull kernel: filemap_write_page+0x5b/0x70 > May 27 11:12:53 reproflushfull kernel: md_bitmap_unplug+0x99/0x1f0 > May 27 11:12:53 reproflushfull kernel: flush_bio_list+0x107/0x110 [raid1] > May 27 11:12:53 reproflushfull kernel: raid1_unplug+0x3c/0xf0 [raid1] > May 27 11:12:53 reproflushfull kernel: __blk_flush_plug+0xd9/0x170 > May 27 11:12:53 reproflushfull kernel: blk_finish_plug+0x30/0x50 > May 27 11:12:53 reproflushfull kernel: io_submit_sqes+0x4c4/0x6c0 > May 27 11:12:53 reproflushfull kernel: __do_sys_io_uring_enter+0x2f2/0x640 > May 27 11:12:53 reproflushfull kernel: __x64_sys_io_uring_enter+0x22/0x40 > May 27 11:12:53 reproflushfull kernel: x64_sys_call+0x20b9/0x24b0 > May 27 11:12:53 reproflushfull kernel: do_syscall_64+0x80/0x170 > May 27 11:12:53 reproflushfull kernel: ? do_mprotect_pkey+0x198/0x620 > May 27 11:12:53 reproflushfull kernel: ? > __memcg_slab_post_alloc_hook+0x18e/0x230 > May 27 11:12:53 reproflushfull kernel: ? policy_nodemask+0x145/0x180 > May 27 11:12:53 reproflushfull kernel: ? > __mod_memcg_lruvec_state+0x87/0x120 > May 27 11:12:53 reproflushfull kernel: ? __mod_lruvec_state+0x36/0x50 > May 27 11:12:53 reproflushfull kernel: ? __lruvec_stat_mod_folio+0x70/0xc0 > May 27 11:12:53 reproflushfull kernel: ? set_ptes.constprop.0+0x2b/0xb0 > May 27 11:12:53 reproflushfull kernel: ? _raw_spin_unlock+0xe/0x40 > May 27 11:12:53 reproflushfull kernel: ? do_anonymous_page+0x23d/0x790 > May 27 11:12:53 reproflushfull kernel: ? __pte_offset_map+0x1c/0x1b0 > May 27 11:12:53 reproflushfull kernel: ? __handle_mm_fault+0xc1a/0xe90 > May 27 11:12:53 reproflushfull kernel: ? do_syscall_64+0x8c/0x170 > May 27 11:12:53 reproflushfull kernel: ? __count_memcg_events+0x6f/0xe0 > May 27 11:12:53 reproflushfull kernel: ? > count_memcg_events.constprop.0+0x2a/0x50 > May 27 11:12:53 reproflushfull kernel: ? handle_mm_fault+0xaf/0x340 > May 27 11:12:53 reproflushfull kernel: ? do_user_addr_fault+0x365/0x680 > May 27 11:12:53 reproflushfull kernel: ? > irqentry_exit_to_user_mode+0x76/0x270 > May 27 11:12:53 reproflushfull kernel: ? irqentry_exit+0x43/0x50 > May 27 11:12:53 reproflushfull kernel: ? clear_bhb_loop+0x15/0x70 > May 27 11:12:53 reproflushfull kernel: ? clear_bhb_loop+0x15/0x70 > May 27 11:12:53 reproflushfull kernel: ? clear_bhb_loop+0x15/0x70 > May 27 11:12:53 reproflushfull kernel: > entry_SYSCALL_64_after_hwframe+0x76/0x7e > May 27 11:12:53 reproflushfull kernel: RIP: 0033:0x7e8a75069b95 > May 27 11:12:53 reproflushfull kernel: Code: 00 00 00 44 89 d0 41 b9 08 > 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 > 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 > 02 74 c2 f0 48 83 0c 24 > May 27 11:12:53 reproflushfull kernel: RSP: 002b:00007e8a67bfafd8 > EFLAGS: 00000246 ORIG_RAX: 00000000000001aa > May 27 11:12:53 reproflushfull kernel: RAX: ffffffffffffffda RBX: > 00007e8a580039f0 RCX: 00007e8a75069b95 > May 27 11:12:53 reproflushfull kernel: RDX: 0000000000000000 RSI: > 0000000000000020 RDI: 0000000000000033 > May 27 11:12:53 reproflushfull kernel: RBP: 00007e8a580039f8 R08: > 0000000000000000 R09: 0000000000000008 > May 27 11:12:53 reproflushfull kernel: R10: 0000000000000000 R11: > 0000000000000246 R12: 00007e8a58003ae0 > May 27 11:12:53 reproflushfull kernel: R13: 0000000000000000 R14: > 000064a145ca1c68 R15: 000064a183935450 > May 27 11:12:53 reproflushfull kernel: </TASK> > May 27 11:12:53 reproflushfull kernel: ---[ end trace 0000000000000000 ]--- >
On 2024/5/28 07:34, Chengming Zhou wrote: > On 2024/5/28 00:04, Friedrich Weber wrote: >> Hi Chengming, >> >> Thank you for taking a look at this! >> >> On 27/05/2024 07:09, Chengming Zhou wrote: >>>> I've used this reproducer for a bisect, which produced >>>> >>>> 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine) >>>> >>>> as the first commit with which I can reproduce the crashes. I'm not 100% >>>> sure it is this one because the reproducer is a bit flaky. But it does >>>> sound plausible, as the commit is included in our 6.8 kernel, and >>>> touches `queuelist` which is AFAICT where blk_flush_complete_seq >>>> dereferences the NULL pointer. >>> >>> Ok, it will be better that I can reproduce it locally, will try later. >> >> Interestingly, so far I haven't been able to reproduce the crash when >> generating IO on the host itself, I only got crashes when generating IO >> in a QEMU VM. >> >> The reproducer in more detail: > > Thanks for these details, I will try to setup and reproduce when I back to work. > >> >> - Compile Linux 6.9 with CONFIG_FAULT_INJECTION, > [...] >>> >>> BUG shows it panic on 0000000000000008, not sure what it's accessing then, >>> does it means rq->queuelist.next == 0 or something? Could you use add2line >>> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0 >>> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;", >>> obviously fq can't be NULL. (I'm using the v6.9 kernel) >> >> Sorry for the confusion, the crash dump was from a kernel compiled at >> 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for >> a kernel 6.9 crash dump. >> >> I don't know too much about kernel debugging, but I tried to get >> something useful out of addr2line: >> >> # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2 >> blk_flush_complete_seq+0x291/0x2d0 >> __list_del >> /[...]./include/linux/list.h:195 >> >> I tried to find the relevant portions in `objdump -SD blk-flush.o`, see >> [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to >> >> 351: 48 89 4f 08 mov %rcx,0x8(%rdi) >> >> To me this looks like part of >> >> list_move_tail(&rq->queuelist, pending); >> >> What do you think? > > Yeah, it seems correct, so the rq->queuelist.next == NULL. It can't be NULL > if went through REQ_FSEQ_POSTFLUSH, so it must be REQ_FSEQ_PREFLUSH. It means > we allocated a request but its queuelist is not initialized or corrupted? > > Anyway, I will use below changes for debugging when reproduce, and you could > also try this to see if we could get something useful. :) > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 3b4df8e5ac9e..6e3a6cd7739d 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -2989,6 +2989,8 @@ void blk_mq_submit_bio(struct bio *bio) > blk_mq_use_cached_rq(rq, plug, bio); > } > > + BUG_ON(rq->queuelist.next == NULL); > + > trace_block_getrq(bio); > > rq_qos_track(q, rq, bio); > @@ -3006,6 +3008,8 @@ void blk_mq_submit_bio(struct bio *bio) > if (bio_zone_write_plugging(bio)) > blk_zone_write_plug_init_request(rq); > > + BUG_ON(rq->queuelist.next == NULL); > + > if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) > return; > Ah, I forgot to change to your kernel version, then should be: diff --git a/block/blk-mq.c b/block/blk-mq.c index d98654869615..908fdfb62132 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } + BUG_ON(rq->queuelist.next == NULL); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } + BUG_ON(rq->queuelist.next == NULL); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return;
On 2024/5/28 07:50, Chengming Zhou wrote: > On 2024/5/28 07:34, Chengming Zhou wrote: >> On 2024/5/28 00:04, Friedrich Weber wrote: >>> Hi Chengming, >>> >>> Thank you for taking a look at this! >>> >>> On 27/05/2024 07:09, Chengming Zhou wrote: >>>>> I've used this reproducer for a bisect, which produced >>>>> >>>>> 81ada09cc25e (blk-flush: reuse rq queuelist in flush state machine) >>>>> >>>>> as the first commit with which I can reproduce the crashes. I'm not 100% >>>>> sure it is this one because the reproducer is a bit flaky. But it does >>>>> sound plausible, as the commit is included in our 6.8 kernel, and >>>>> touches `queuelist` which is AFAICT where blk_flush_complete_seq >>>>> dereferences the NULL pointer. >>>> >>>> Ok, it will be better that I can reproduce it locally, will try later. >>> >>> Interestingly, so far I haven't been able to reproduce the crash when >>> generating IO on the host itself, I only got crashes when generating IO >>> in a QEMU VM. >>> >>> The reproducer in more detail: >> >> Thanks for these details, I will try to setup and reproduce when I back to work. >> >>> >>> - Compile Linux 6.9 with CONFIG_FAULT_INJECTION, >> [...] >>>> >>>> BUG shows it panic on 0000000000000008, not sure what it's accessing then, >>>> does it means rq->queuelist.next == 0 or something? Could you use add2line >>>> to show the exact source code line that panic? I use blk_flush_complete_seq+0x296/0x2e0 >>>> and get block/blk-flush.c:190, which is "fq->flush_data_in_flight++;", >>>> obviously fq can't be NULL. (I'm using the v6.9 kernel) >>> >>> Sorry for the confusion, the crash dump was from a kernel compiled at >>> 81ada09cc25e -- with 6.9, the offset seems to be different. See [2] for >>> a kernel 6.9 crash dump. >>> >>> I don't know too much about kernel debugging, but I tried to get >>> something useful out of addr2line: >>> >>> # addr2line -f -e /usr/lib/debug/vmlinux-6.9.0-debug2 >>> blk_flush_complete_seq+0x291/0x2d0 >>> __list_del >>> /[...]./include/linux/list.h:195 >>> >>> I tried to find the relevant portions in `objdump -SD blk-flush.o`, see >>> [3]. If I'm not mistaken, blk_flush_complete_seq+0x291 should point to >>> >>> 351: 48 89 4f 08 mov %rcx,0x8(%rdi) >>> >>> To me this looks like part of >>> >>> list_move_tail(&rq->queuelist, pending); >>> >>> What do you think? >> >> Yeah, it seems correct, so the rq->queuelist.next == NULL. It can't be NULL >> if went through REQ_FSEQ_POSTFLUSH, so it must be REQ_FSEQ_PREFLUSH. It means >> we allocated a request but its queuelist is not initialized or corrupted? >> >> Anyway, I will use below changes for debugging when reproduce, and you could >> also try this to see if we could get something useful. :) >> >> diff --git a/block/blk-mq.c b/block/blk-mq.c >> index 3b4df8e5ac9e..6e3a6cd7739d 100644 >> --- a/block/blk-mq.c >> +++ b/block/blk-mq.c >> @@ -2989,6 +2989,8 @@ void blk_mq_submit_bio(struct bio *bio) >> blk_mq_use_cached_rq(rq, plug, bio); >> } >> >> + BUG_ON(rq->queuelist.next == NULL); >> + >> trace_block_getrq(bio); >> >> rq_qos_track(q, rq, bio); >> @@ -3006,6 +3008,8 @@ void blk_mq_submit_bio(struct bio *bio) >> if (bio_zone_write_plugging(bio)) >> blk_zone_write_plug_init_request(rq); >> >> + BUG_ON(rq->queuelist.next == NULL); >> + >> if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) >> return; >> > > Ah, I forgot to change to your kernel version, then should be: > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index d98654869615..908fdfb62132 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio) > return; > } > > + BUG_ON(rq->queuelist.next == NULL); > + > trace_block_getrq(bio); > > rq_qos_track(q, rq, bio); > @@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio) > return; > } > > + BUG_ON(rq->queuelist.next == NULL); > + > if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) > return; > Another possibility is that drivers may change rq->queuelist even after rq->end_io(). So add two more BUG_ON() to detect this: diff --git a/block/blk-flush.c b/block/blk-flush.c index e73dc22d05c1..0eb684a468e5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -179,7 +179,10 @@ static void blk_flush_complete_seq(struct request *rq, switch (seq) { case REQ_FSEQ_PREFLUSH: + BUG_ON(rq->queuelist.next == NULL); + fallthrough; case REQ_FSEQ_POSTFLUSH: + BUG_ON(rq->queuelist.next == NULL); /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; diff --git a/block/blk-mq.c b/block/blk-mq.c index d98654869615..908fdfb62132 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } + BUG_ON(rq->queuelist.next == NULL); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } + BUG_ON(rq->queuelist.next == NULL); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return;
Hi Chengming, Thanks again! On 28/05/2024 02:12, Chengming Zhou wrote: > Another possibility is that drivers may change rq->queuelist even after > rq->end_io(). So add two more BUG_ON() to detect this: > > diff --git a/block/blk-flush.c b/block/blk-flush.c > index e73dc22d05c1..0eb684a468e5 100644 > --- a/block/blk-flush.c > +++ b/block/blk-flush.c > @@ -179,7 +179,10 @@ static void blk_flush_complete_seq(struct request *rq, > > switch (seq) { > case REQ_FSEQ_PREFLUSH: > + BUG_ON(rq->queuelist.next == NULL); > + fallthrough; > case REQ_FSEQ_POSTFLUSH: > + BUG_ON(rq->queuelist.next == NULL); > /* queue for flush */ > if (list_empty(pending)) > fq->flush_pending_since = jiffies; > diff --git a/block/blk-mq.c b/block/blk-mq.c > index d98654869615..908fdfb62132 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -2963,6 +2963,8 @@ void blk_mq_submit_bio(struct bio *bio) > return; > } > > + BUG_ON(rq->queuelist.next == NULL); > + > trace_block_getrq(bio); > > rq_qos_track(q, rq, bio); > @@ -2977,6 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio) > return; > } > > + BUG_ON(rq->queuelist.next == NULL); > + > if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) > return; > I applied the following patch (with slightly different line numbers) to my checkout of Linux 6.9: diff --git a/block/blk-flush.c b/block/blk-flush.c index b0f314f4bc14..07e0de3b28ef 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -179,7 +179,10 @@ static void blk_flush_complete_seq(struct request *rq, switch (seq) { case REQ_FSEQ_PREFLUSH: + BUG_ON(rq->queuelist.next == NULL); + fallthrough; case REQ_FSEQ_POSTFLUSH: + BUG_ON(rq->queuelist.next == NULL); /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; diff --git a/block/blk-mq.c b/block/blk-mq.c index 32afb87efbd0..13a8f4dd1e5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2988,6 +2988,8 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_use_cached_rq(rq, plug, bio); } + BUG_ON(rq->queuelist.next == NULL); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -3002,6 +3004,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } + BUG_ON(rq->queuelist.next == NULL); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; Hope I did this correctly. With this, the reproducer triggered a BUG pretty quickly, see [0]. If I can provide anything else, just let me know. I forgot to mention earlier that I'm running the reproducer against a QEMU/KVM guest too, for convenience -- so, the "host" from my previous post is in fact running virtualized. I don't think it should be relevant here, but I'll try to run the reproducer against real hardware too. Best, Friedrich [0] [ 37.543834] kernel BUG at block/blk-mq.c:2991! [ 37.544458] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 37.545077] CPU: 4 PID: 795 Comm: kvm Tainted: G E 6.9.0-troubleshoot1-dirty #28 [ 37.546087] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 37.547389] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750 [ 37.547949] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7 e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43 [ 37.550088] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246 [ 37.550685] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX: 0000000000000000 [ 37.551502] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 37.552325] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09: 0000000000000000 [ 37.553130] R10: 0000000000000000 R11: 0000000000000000 R12: ffff971c106ee600 [ 37.553934] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15: 0000000000000001 [ 37.554747] FS: 00007bcc47e006c0(0000) GS:ffff971d37c00000(0000) knlGS:0000000000000000 [ 37.555663] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.556311] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4: 0000000000372ef0 [ 37.557117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 37.557935] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 37.558789] Call Trace: [ 37.559080] <TASK> [ 37.559355] ? show_regs+0x6c/0x80 [ 37.560276] ? die+0x37/0xa0 [ 37.560633] ? do_trap+0xda/0xf0 [ 37.561023] ? do_error_trap+0x71/0xb0 [ 37.561486] ? blk_mq_submit_bio+0x6f5/0x750 [ 37.562008] ? exc_invalid_op+0x52/0x80 [ 37.562463] ? blk_mq_submit_bio+0x6f5/0x750 [ 37.562987] ? asm_exc_invalid_op+0x1b/0x20 [ 37.563492] ? blk_mq_submit_bio+0x6f5/0x750 [ 37.564007] __submit_bio+0xb3/0x1c0 [ 37.564439] submit_bio_noacct_nocheck+0x2fc/0x3d0 [ 37.565019] submit_bio_noacct+0x1ba/0x6b0 [ 37.565508] flush_bio_list+0x71/0x110 [raid1] [ 37.566049] raid1_unplug+0x3c/0xf0 [raid1] [ 37.566539] __blk_flush_plug+0xbe/0x140 [ 37.567016] blk_finish_plug+0x30/0x50 [ 37.567466] io_submit_sqes+0x4c4/0x6c0 [ 37.567942] ? syscall_exit_to_user_mode+0x81/0x270 [ 37.568530] __do_sys_io_uring_enter+0x2f2/0x640 [ 37.569070] __x64_sys_io_uring_enter+0x22/0x40 [ 37.569626] x64_sys_call+0x20b9/0x24b0 [ 37.570071] do_syscall_64+0x80/0x170 [ 37.570525] ? vfs_read+0x240/0x380 [ 37.570937] ? ksys_read+0xe6/0x100 [ 37.571360] ? syscall_exit_to_user_mode+0x81/0x270 [ 37.571930] ? do_syscall_64+0x8c/0x170 [ 37.572402] ? syscall_exit_to_user_mode+0x81/0x270 [ 37.572969] ? do_syscall_64+0x8c/0x170 [ 37.573438] ? irqentry_exit+0x43/0x50 [ 37.573878] ? clear_bhb_loop+0x15/0x70 [ 37.574348] ? clear_bhb_loop+0x15/0x70 [ 37.574806] ? clear_bhb_loop+0x15/0x70 [ 37.575282] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 37.575891] RIP: 0033:0x7bcc58f45b95 [ 37.576335] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 [ 37.578468] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa [ 37.579359] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX: 00007bcc58f45b95 [ 37.580198] RDX: 0000000000000000 RSI: 0000000000000003 RDI: 0000000000000033 [ 37.581028] RBP: 00007bcc400039d8 R08: 0000000000000000 R09: 0000000000000008 [ 37.581864] R10: 0000000000000000 R11: 0000000000000246 R12: 00007bcc40003ac0 [ 37.582695] R13: 0000000000000000 R14: 0000591ad05f5c68 R15: 0000591b10198450 [ 37.583534] </TASK> [ 37.583812] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E) aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E) raid6_pq(E) [ 37.583875] libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E) ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) [ 37.595967] ---[ end trace 0000000000000000 ]--- [ 37.596555] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750 [ 37.597144] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7 e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43 [ 37.599320] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246 [ 37.599932] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX: 0000000000000000 [ 37.600757] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 37.601600] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09: 0000000000000000 [ 37.602450] R10: 0000000000000000 R11: 0000000000000000 R12: ffff971c106ee600 [ 37.603299] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15: 0000000000000001 [ 37.604129] FS: 00007bcc47e006c0(0000) GS:ffff971d37c00000(0000) knlGS:0000000000000000 [ 37.605079] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.605774] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4: 0000000000372ef0 [ 37.606620] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 37.607464] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 37.608335] ------------[ cut here ]------------ [ 37.608885] WARNING: CPU: 4 PID: 795 at kernel/exit.c:827 do_exit+0x8a2/0xab0 [ 37.609754] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E) aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E) pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E) raid6_pq(E) [ 37.609827] libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E) ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) [ 37.622013] CPU: 4 PID: 795 Comm: kvm Tainted: G D E 6.9.0-troubleshoot1-dirty #28 [ 37.623045] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 37.624382] RIP: 0010:do_exit+0x8a2/0xab0 [ 37.624869] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07 00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00 [ 37.627117] RSP: 0018:ffffa2b480863ec8 EFLAGS: 00010286 [ 37.627780] RAX: 0000000000000000 RBX: ffff971c05458000 RCX: 0000000000000000 [ 37.628638] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 37.629512] RBP: ffffa2b480863f20 R08: 0000000000000000 R09: 0000000000000000 [ 37.630379] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000b [ 37.631231] R13: ffff971c04c8d100 R14: ffff971c00c28000 R15: ffffa2b4808639c8 [ 37.632095] FS: 00007bcc47e006c0(0000) GS:ffff971d37c00000(0000) knlGS:0000000000000000 [ 37.633090] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.633794] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4: 0000000000372ef0 [ 37.634666] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 37.635533] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 37.636387] Call Trace: [ 37.636692] <TASK> [ 37.636963] ? show_regs+0x6c/0x80 [ 37.637393] ? __warn+0x88/0x140 [ 37.637801] ? do_exit+0x8a2/0xab0 [ 37.638197] ? report_bug+0x182/0x1b0 [ 37.638674] ? handle_bug+0x46/0x90 [ 37.639094] ? exc_invalid_op+0x18/0x80 [ 37.639577] ? asm_exc_invalid_op+0x1b/0x20 [ 37.640090] ? do_exit+0x8a2/0xab0 [ 37.640537] ? do_exit+0x6f/0xab0 [ 37.640941] ? syscall_exit_to_user_mode+0x81/0x270 [ 37.641542] make_task_dead+0x81/0x160 [ 37.641999] rewind_stack_and_make_dead+0x16/0x20 [ 37.642586] RIP: 0033:0x7bcc58f45b95 [ 37.643046] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 [ 37.645251] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001aa [ 37.646213] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX: 00007bcc58f45b95 [ 37.647117] RDX: 0000000000000000 RSI: 0000000000000003 RDI: 0000000000000033 [ 37.648018] RBP: 00007bcc400039d8 R08: 0000000000000000 R09: 0000000000000008 [ 37.648916] R10: 0000000000000000 R11: 0000000000000246 R12: 00007bcc40003ac0 [ 37.649824] R13: 0000000000000000 R14: 0000591ad05f5c68 R15: 0000591b10198450 [ 37.650760] </TASK> [ 37.651034] ---[ end trace 0000000000000000 ]---
On 2024/5/28 16:42, Friedrich Weber wrote: > Hope I did this correctly. With this, the reproducer triggered a BUG > pretty quickly, see [0]. If I can provide anything else, just let me know. Thanks for your patience, it's correct. Then how about this fix? diff --git a/block/blk-mq.c b/block/blk-mq.c index d98654869615..b2ec5c4c738f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -485,6 +485,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { + INIT_LIST_HEAD(&rq->queuelist); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } > > I forgot to mention earlier that I'm running the reproducer against a > QEMU/KVM guest too, for convenience -- so, the "host" from my previous > post is in fact running virtualized. I don't think it should be relevant > here, but I'll try to run the reproducer against real hardware too. Yeah, I think it should not be relevant. Thanks. > > Best, > > Friedrich > > [0] > > [ 37.543834] kernel BUG at block/blk-mq.c:2991! > [ 37.544458] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI > [ 37.545077] CPU: 4 PID: 795 Comm: kvm Tainted: G E > 6.9.0-troubleshoot1-dirty #28 > [ 37.546087] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 > [ 37.547389] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750 > [ 37.547949] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7 > e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff > ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43 > [ 37.550088] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246 > [ 37.550685] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX: > 0000000000000000 > [ 37.551502] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > 0000000000000000 > [ 37.552325] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09: > 0000000000000000 > [ 37.553130] R10: 0000000000000000 R11: 0000000000000000 R12: > ffff971c106ee600 > [ 37.553934] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15: > 0000000000000001 > [ 37.554747] FS: 00007bcc47e006c0(0000) GS:ffff971d37c00000(0000) > knlGS:0000000000000000 > [ 37.555663] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 37.556311] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4: > 0000000000372ef0 > [ 37.557117] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 37.557935] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 37.558789] Call Trace: > [ 37.559080] <TASK> > [ 37.559355] ? show_regs+0x6c/0x80 > [ 37.560276] ? die+0x37/0xa0 > [ 37.560633] ? do_trap+0xda/0xf0 > [ 37.561023] ? do_error_trap+0x71/0xb0 > [ 37.561486] ? blk_mq_submit_bio+0x6f5/0x750 > [ 37.562008] ? exc_invalid_op+0x52/0x80 > [ 37.562463] ? blk_mq_submit_bio+0x6f5/0x750 > [ 37.562987] ? asm_exc_invalid_op+0x1b/0x20 > [ 37.563492] ? blk_mq_submit_bio+0x6f5/0x750 > [ 37.564007] __submit_bio+0xb3/0x1c0 > [ 37.564439] submit_bio_noacct_nocheck+0x2fc/0x3d0 > [ 37.565019] submit_bio_noacct+0x1ba/0x6b0 > [ 37.565508] flush_bio_list+0x71/0x110 [raid1] > [ 37.566049] raid1_unplug+0x3c/0xf0 [raid1] > [ 37.566539] __blk_flush_plug+0xbe/0x140 > [ 37.567016] blk_finish_plug+0x30/0x50 > [ 37.567466] io_submit_sqes+0x4c4/0x6c0 > [ 37.567942] ? syscall_exit_to_user_mode+0x81/0x270 > [ 37.568530] __do_sys_io_uring_enter+0x2f2/0x640 > [ 37.569070] __x64_sys_io_uring_enter+0x22/0x40 > [ 37.569626] x64_sys_call+0x20b9/0x24b0 > [ 37.570071] do_syscall_64+0x80/0x170 > [ 37.570525] ? vfs_read+0x240/0x380 > [ 37.570937] ? ksys_read+0xe6/0x100 > [ 37.571360] ? syscall_exit_to_user_mode+0x81/0x270 > [ 37.571930] ? do_syscall_64+0x8c/0x170 > [ 37.572402] ? syscall_exit_to_user_mode+0x81/0x270 > [ 37.572969] ? do_syscall_64+0x8c/0x170 > [ 37.573438] ? irqentry_exit+0x43/0x50 > [ 37.573878] ? clear_bhb_loop+0x15/0x70 > [ 37.574348] ? clear_bhb_loop+0x15/0x70 > [ 37.574806] ? clear_bhb_loop+0x15/0x70 > [ 37.575282] entry_SYSCALL_64_after_hwframe+0x76/0x7e > [ 37.575891] RIP: 0033:0x7bcc58f45b95 > [ 37.576335] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 > d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f > 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 > [ 37.578468] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX: > 00000000000001aa > [ 37.579359] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX: > 00007bcc58f45b95 > [ 37.580198] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > 0000000000000033 > [ 37.581028] RBP: 00007bcc400039d8 R08: 0000000000000000 R09: > 0000000000000008 > [ 37.581864] R10: 0000000000000000 R11: 0000000000000246 R12: > 00007bcc40003ac0 > [ 37.582695] R13: 0000000000000000 R14: 0000591ad05f5c68 R15: > 0000591b10198450 > [ 37.583534] </TASK> > [ 37.583812] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) > cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) > cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) > ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) > iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) > bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) > intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) > intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) > crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) > ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E) > aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E) > pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) > vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) > qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) > async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) > async_tx(E) xor(E) raid6_pq(E) > [ 37.583875] libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) > psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E) > ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) > [ 37.595967] ---[ end trace 0000000000000000 ]--- > [ 37.596555] RIP: 0010:blk_mq_submit_bio+0x6f5/0x750 > [ 37.597144] Code: ff e9 aa fc ff ff e8 7a 3c a4 ff 48 89 de 4c 89 e7 > e8 9f ef ff ff e8 da 79 a4 ff e9 99 fc ff ff e8 d0 45 a7 ff e9 cd fa ff > ff <0f> 0b 0f 0b 48 8b 03 48 8b 40 68 48 8b 40 40 e9 8c fb ff ff f6 43 > [ 37.599320] RSP: 0018:ffffa2b480863a70 EFLAGS: 00010246 > [ 37.599932] RAX: 0000000000000000 RBX: ffff971c0b29d000 RCX: > 0000000000000000 > [ 37.600757] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > 0000000000000000 > [ 37.601600] RBP: ffffa2b480863ae8 R08: 0000000000000000 R09: > 0000000000000000 > [ 37.602450] R10: 0000000000000000 R11: 0000000000000000 R12: > ffff971c106ee600 > [ 37.603299] R13: ffff971c0fb328f0 R14: ffff971c03b330c0 R15: > 0000000000000001 > [ 37.604129] FS: 00007bcc47e006c0(0000) GS:ffff971d37c00000(0000) > knlGS:0000000000000000 > [ 37.605079] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 37.605774] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4: > 0000000000372ef0 > [ 37.606620] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 37.607464] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 37.608335] ------------[ cut here ]------------ > [ 37.608885] WARNING: CPU: 4 PID: 795 at kernel/exit.c:827 > do_exit+0x8a2/0xab0 > [ 37.609754] Modules linked in: veth(E) cmac(E) nls_utf8(E) cifs(E) > cifs_arc4(E) nls_ucs2_utils(E) rdma_cm(E) iw_cm(E) ib_cm(E) ib_core(E) > cifs_md4(E) netfs(E) ebtable_filter(E) ebtables(E) ip_set(E) > ip6table_raw(E) iptable_raw(E) ip6table_filter(E) ip6_tables(E) > iptable_filter(E) nf_tables(E) softdog(E) sunrpc(E) binfmt_misc(E) > bonding(E) tls(E) nfnetlink_log(E) nfnetlink(E) intel_rapl_msr(E) > intel_rapl_common(E) intel_uncore_frequency_common(E) intel_pmc_core(E) > intel_vsec(E) pmt_telemetry(E) pmt_class(E) kvm_intel(E) kvm(E) > crct10dif_pclmul(E) polyval_clmulni(E) polyval_generic(E) > ghash_clmulni_intel(E) sha256_ssse3(E) sha1_ssse3(E) hid_generic(E) > aesni_intel(E) crypto_simd(E) cryptd(E) usbhid(E) hid(E) rapl(E) > pcspkr(E) vmgenid(E) joydev(E) input_leds(E) mac_hid(E) serio_raw(E) > vhost_net(E) vhost(E) vhost_iotlb(E) tap(E) efi_pstore(E) dmi_sysfs(E) > qemu_fw_cfg(E) ip_tables(E) x_tables(E) autofs4(E) raid10(E) raid456(E) > async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) > async_tx(E) xor(E) raid6_pq(E) > [ 37.609827] libcrc32c(E) raid0(E) raid1(E) crc32_pclmul(E) > psmouse(E) bochs(E) drm_vram_helper(E) drm_ttm_helper(E) uhci_hcd(E) > ttm(E) ehci_hcd(E) i2c_piix4(E) pata_acpi(E) floppy(E) > [ 37.622013] CPU: 4 PID: 795 Comm: kvm Tainted: G D E > 6.9.0-troubleshoot1-dirty #28 > [ 37.623045] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 > [ 37.624382] RIP: 0010:do_exit+0x8a2/0xab0 > [ 37.624869] Code: 38 0c 00 00 e9 2d fe ff ff 49 8d 7d 18 e8 b6 83 07 > 00 e9 5a f8 ff ff 48 8b bb f8 09 00 00 31 f6 e8 b3 e0 ff ff e9 c1 fd ff > ff <0f> 0b e9 d4 f7 ff ff 0f 0b e9 8f f7 ff ff 4c 89 e6 bf 05 06 00 00 > [ 37.627117] RSP: 0018:ffffa2b480863ec8 EFLAGS: 00010286 > [ 37.627780] RAX: 0000000000000000 RBX: ffff971c05458000 RCX: > 0000000000000000 > [ 37.628638] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > 0000000000000000 > [ 37.629512] RBP: ffffa2b480863f20 R08: 0000000000000000 R09: > 0000000000000000 > [ 37.630379] R10: 0000000000000000 R11: 0000000000000000 R12: > 000000000000000b > [ 37.631231] R13: ffff971c04c8d100 R14: ffff971c00c28000 R15: > ffffa2b4808639c8 > [ 37.632095] FS: 00007bcc47e006c0(0000) GS:ffff971d37c00000(0000) > knlGS:0000000000000000 > [ 37.633090] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 37.633794] CR2: 000070d06e1a40a0 CR3: 000000010b6f4004 CR4: > 0000000000372ef0 > [ 37.634666] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 37.635533] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 37.636387] Call Trace: > [ 37.636692] <TASK> > [ 37.636963] ? show_regs+0x6c/0x80 > [ 37.637393] ? __warn+0x88/0x140 > [ 37.637801] ? do_exit+0x8a2/0xab0 > [ 37.638197] ? report_bug+0x182/0x1b0 > [ 37.638674] ? handle_bug+0x46/0x90 > [ 37.639094] ? exc_invalid_op+0x18/0x80 > [ 37.639577] ? asm_exc_invalid_op+0x1b/0x20 > [ 37.640090] ? do_exit+0x8a2/0xab0 > [ 37.640537] ? do_exit+0x6f/0xab0 > [ 37.640941] ? syscall_exit_to_user_mode+0x81/0x270 > [ 37.641542] make_task_dead+0x81/0x160 > [ 37.641999] rewind_stack_and_make_dead+0x16/0x20 > [ 37.642586] RIP: 0033:0x7bcc58f45b95 > [ 37.643046] Code: 00 00 00 44 89 d0 41 b9 08 00 00 00 83 c8 10 f6 87 > d0 00 00 00 01 8b bf cc 00 00 00 44 0f 45 d0 45 31 c0 b8 aa 01 00 00 0f > 05 <c3> 66 2e 0f 1f 84 00 00 00 00 00 41 83 e2 02 74 c2 f0 48 83 0c 24 > [ 37.645251] RSP: 002b:00007bcc47dfafd8 EFLAGS: 00000246 ORIG_RAX: > 00000000000001aa > [ 37.646213] RAX: ffffffffffffffda RBX: 00007bcc400039d0 RCX: > 00007bcc58f45b95 > [ 37.647117] RDX: 0000000000000000 RSI: 0000000000000003 RDI: > 0000000000000033 > [ 37.648018] RBP: 00007bcc400039d8 R08: 0000000000000000 R09: > 0000000000000008 > [ 37.648916] R10: 0000000000000000 R11: 0000000000000246 R12: > 00007bcc40003ac0 > [ 37.649824] R13: 0000000000000000 R14: 0000591ad05f5c68 R15: > 0000591b10198450 > [ 37.650760] </TASK> > [ 37.651034] ---[ end trace 0000000000000000 ]--- >
On 28/05/2024 11:09, Chengming Zhou wrote: > On 2024/5/28 16:42, Friedrich Weber wrote: >> Hope I did this correctly. With this, the reproducer triggered a BUG >> pretty quickly, see [0]. If I can provide anything else, just let me know. > > Thanks for your patience, it's correct. Then how about this fix? > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index d98654869615..b2ec5c4c738f 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -485,6 +485,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) > if (data->nr_tags > 1) { > rq = __blk_mq_alloc_requests_batch(data); > if (rq) { > + INIT_LIST_HEAD(&rq->queuelist); > blk_mq_rq_time_init(rq, alloc_time_ns); > return rq; > } > Nice, seems like with this patch applied on top of 6.9, the reproducer does not trigger crashes anymore for me! Thanks! To verify that the reproducer hits the new INIT_LIST_HEAD, I added debug prints before/after: diff --git a/block/blk-mq.c b/block/blk-mq.c index 4da581f13273..75186bb0d9c9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -485,7 +485,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { + pr_warn("before init: list: %p next: %p prev: %p\n", &rq->queuelist, rq->queuelist.next, rq->queuelist.prev); INIT_LIST_HEAD(&rq->queuelist); + pr_warn("after init: list: %p next: %p prev: %p\n", &rq->queuelist, rq->queuelist.next, rq->queuelist.prev); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } And indeed, I see quite some printouts where rq->queuelist.next differs before/after the request, e.g. May 28 16:31:25 reproflushfull kernel: before init: list: 000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f May 28 16:31:25 reproflushfull kernel: after init: list: 000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f May 28 16:31:26 reproflushfull kernel: before init: list: 000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f May 28 16:31:26 reproflushfull kernel: after init: list: 000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f I know very little about the block layer, but if I understand the commit message of the original 81ada09cc25e correctly, it's expected that queuelist needs to be reinitialized at some places. I'm just a little confused to see the same pointer 00000000aaa2e372 in two subsequent "before init" printouts for the same queuelist 000000001e0a144f. Is this expected too? Also, just out of interest: Can you estimate whether this issue is specific to software RAID setups, or could similar NULL pointer dereferences also happen in setups without software RAID? Best wishes, Friedrich
On 2024/5/28 22:40, Friedrich Weber wrote: > On 28/05/2024 11:09, Chengming Zhou wrote: >> On 2024/5/28 16:42, Friedrich Weber wrote: >>> Hope I did this correctly. With this, the reproducer triggered a BUG >>> pretty quickly, see [0]. If I can provide anything else, just let me know. >> >> Thanks for your patience, it's correct. Then how about this fix? >> >> diff --git a/block/blk-mq.c b/block/blk-mq.c >> index d98654869615..b2ec5c4c738f 100644 >> --- a/block/blk-mq.c >> +++ b/block/blk-mq.c >> @@ -485,6 +485,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) >> if (data->nr_tags > 1) { >> rq = __blk_mq_alloc_requests_batch(data); >> if (rq) { >> + INIT_LIST_HEAD(&rq->queuelist); >> blk_mq_rq_time_init(rq, alloc_time_ns); >> return rq; >> } >> > > Nice, seems like with this patch applied on top of 6.9, the reproducer > does not trigger crashes anymore for me! Thanks! Good news. Thanks. > > To verify that the reproducer hits the new INIT_LIST_HEAD, I added debug > prints before/after: [...] > And indeed, I see quite some printouts where rq->queuelist.next differs > before/after the request, e.g. > > May 28 16:31:25 reproflushfull kernel: before init: list: > 000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f > May 28 16:31:25 reproflushfull kernel: after init: list: > 000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f > May 28 16:31:26 reproflushfull kernel: before init: list: > 000000001e0a144f next: 00000000aaa2e372 prev: 000000001e0a144f > May 28 16:31:26 reproflushfull kernel: after init: list: > 000000001e0a144f next: 000000001e0a144f prev: 000000001e0a144f > > I know very little about the block layer, but if I understand the commit > message of the original 81ada09cc25e correctly, it's expected that > queuelist needs to be reinitialized at some places. I'm just a little Yes, because we use list_move_tail() in the flush sequences. Maybe we can just use list_add_tail() so we don't need the queuelist initialized. It should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH, so there isn't any move actually. But now I'm concerned that rq->queuelist maybe changed by driver after request end? Don't know if this is a reasonable behavior, or I'm afraid that the safest way is to revert the last patch. Want to know what Jens, Ming and Christoph think? > confused to see the same pointer 00000000aaa2e372 in two subsequent > "before init" printouts for the same queuelist 000000001e0a144f. Is this > expected too? Not sure, but it seems possible. This is a rq_list in the plug cache, 000000001e0a144f is a PREFLUSH request, it will be freed after request end. Then next time we again allocate it and put it in the plug cache, just before 00000000aaa2e372 again. The reason why block doesn't use 00000000aaa2e372 maybe it's from a different queue or hardware queue. But these are just my guess. > > Also, just out of interest: Can you estimate whether this issue is > specific to software RAID setups, or could similar NULL pointer > dereferences also happen in setups without software RAID? I think it can also happen without software RAID. Thanks.
On Wed, May 29, 2024 at 04:50:02PM +0800, Chengming Zhou wrote: > Yes, because we use list_move_tail() in the flush sequences. Maybe we can > just use list_add_tail() so we don't need the queuelist initialized. It > should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH, > so there isn't any move actually. Sounds good. > But now I'm concerned that rq->queuelist maybe changed by driver after > request end? How could the driver change it? > > Also, just out of interest: Can you estimate whether this issue is > > specific to software RAID setups, or could similar NULL pointer > > dereferences also happen in setups without software RAID? > > I think it can also happen without software RAID. Seems to be about batch allocation. So you either need a plug in the stacking device, or io_uring. I guess people aren't using the io_uring high performance options on devices with a write cache all that much, as that should immediately reproduce the problem.
On 2024/5/31 14:17, Christoph Hellwig wrote: > On Wed, May 29, 2024 at 04:50:02PM +0800, Chengming Zhou wrote: >> Yes, because we use list_move_tail() in the flush sequences. Maybe we can >> just use list_add_tail() so we don't need the queuelist initialized. It >> should be ok since rq can't be on any list when PREFLUSH or POSTFLUSH, >> so there isn't any move actually. > > Sounds good. Ok, I could send a fix that changes to use list_add_tail() later. > >> But now I'm concerned that rq->queuelist maybe changed by driver after >> request end? > > How could the driver change it? I don't know much about drivers. Normally, they will detach rq->queuelist from their internal list and do blk_mq_end_request(), in which we reuse this queuelist to add rq to the post-flush list. Strictly speaking, that rq's ownership still belongs to the drivers until they call blk_mq_free_request(), right? So I'm not sure if the drivers would touch rq->queuelist after blk_mq_end_request(). If the drivers don't have such behaviors, then we are good. > >>> Also, just out of interest: Can you estimate whether this issue is >>> specific to software RAID setups, or could similar NULL pointer >>> dereferences also happen in setups without software RAID? >> >> I think it can also happen without software RAID. > > Seems to be about batch allocation. So you either need a plug in > the stacking device, or io_uring. I guess people aren't using the > io_uring high performance options on devices with a write cache > all that much, as that should immediately reproduce the problem. >
diff --git a/block/blk-flush.c b/block/blk-flush.c index fedb39031647..e73dc22d05c1 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -183,14 +183,13 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_del_init(&rq->flush.list); fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); - list_add(&rq->queuelist, &q->requeue_list); + list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -202,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. */ - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -258,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -292,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ @@ -376,6 +375,11 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. + */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -386,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; - INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 67f810857634..01e8c31db665 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -178,7 +178,6 @@ struct request { struct { unsigned int seq; - struct list_head list; rq_end_io_fn *saved_end_io; } flush;