Message ID | 20170605155925.7896-2-ming.lei@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, 2017-06-05 at 23:59 +0800, Ming Lei wrote: > +static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > + struct request *rq, > + blk_qc_t *cookie, bool may_sleep) > { > struct request_queue *q = rq->q; > struct blk_mq_queue_data bd = { > .rq = rq, > .last = true, > }; > - struct blk_mq_hw_ctx *hctx; > blk_qc_t new_cookie; > int ret; > + bool run_queue = true; > + > + if (blk_mq_hctx_stopped(hctx)) { > + run_queue = false; > + goto insert; > + } > > if (q->elevator) > goto insert; > > - if (!blk_mq_get_driver_tag(rq, &hctx, false)) > + if (!blk_mq_get_driver_tag(rq, NULL, false)) > goto insert; > > new_cookie = request_to_qc_t(hctx, rq); > @@ -1439,7 +1445,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, > > __blk_mq_requeue_request(rq); > insert: > - blk_mq_sched_insert_request(rq, false, true, false, may_sleep); > + blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); > } > > static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > @@ -1447,7 +1453,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > { > if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { > rcu_read_lock(); > - __blk_mq_try_issue_directly(rq, cookie, false); > + __blk_mq_try_issue_directly(hctx, rq, cookie, false); > rcu_read_unlock(); > } else { > unsigned int srcu_idx; > @@ -1455,7 +1461,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > might_sleep(); > > srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); > - __blk_mq_try_issue_directly(rq, cookie, true); > + __blk_mq_try_issue_directly(hctx, rq, cookie, true); > srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); > } > } Hello Ming, It seems like you are assuming that the hardware queue of the rq argument passed to __blk_mq_try_issue_directly() matches the hctx argument? Sorry but I think that's an incorrect assumption. Please have a look at the following code in blk_mq_make_request(): if (same_queue_rq) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); Bart.
On Mon, Jun 05, 2017 at 09:25:29PM +0000, Bart Van Assche wrote: > On Mon, 2017-06-05 at 23:59 +0800, Ming Lei wrote: > > +static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > > + struct request *rq, > > + blk_qc_t *cookie, bool may_sleep) > > { > > struct request_queue *q = rq->q; > > struct blk_mq_queue_data bd = { > > .rq = rq, > > .last = true, > > }; > > - struct blk_mq_hw_ctx *hctx; > > blk_qc_t new_cookie; > > int ret; > > + bool run_queue = true; > > + > > + if (blk_mq_hctx_stopped(hctx)) { > > + run_queue = false; > > + goto insert; > > + } > > > > if (q->elevator) > > goto insert; > > > > - if (!blk_mq_get_driver_tag(rq, &hctx, false)) > > + if (!blk_mq_get_driver_tag(rq, NULL, false)) > > goto insert; > > > > new_cookie = request_to_qc_t(hctx, rq); > > @@ -1439,7 +1445,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, > > > > __blk_mq_requeue_request(rq); > > insert: > > - blk_mq_sched_insert_request(rq, false, true, false, may_sleep); > > + blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); > > } > > > > static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > > @@ -1447,7 +1453,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > > { > > if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { > > rcu_read_lock(); > > - __blk_mq_try_issue_directly(rq, cookie, false); > > + __blk_mq_try_issue_directly(hctx, rq, cookie, false); > > rcu_read_unlock(); > > } else { > > unsigned int srcu_idx; > > @@ -1455,7 +1461,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > > might_sleep(); > > > > srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); > > - __blk_mq_try_issue_directly(rq, cookie, true); > > + __blk_mq_try_issue_directly(hctx, rq, cookie, true); > > srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); > > } > > } > > Hello Ming, > > It seems like you are assuming that the hardware queue of the rq argument > passed to __blk_mq_try_issue_directly() matches the hctx argument? Sorry > but I think that's an incorrect assumption. Please have a look at the > following code in blk_mq_make_request(): > > if (same_queue_rq) > blk_mq_try_issue_directly(data.hctx, same_queue_rq, > &cookie); IMO it is a bug which need to be fixed, since blk_mq_try_issue_directly() may take a wrong SRCU lock and performance can be hurt under this situation. Thanks, Ming
diff --git a/block/blk-mq.c b/block/blk-mq.c index 58688205c8f4..2da2cb8d9ff4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1400,22 +1400,28 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, - bool may_sleep) +static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, bool may_sleep) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = true, }; - struct blk_mq_hw_ctx *hctx; blk_qc_t new_cookie; int ret; + bool run_queue = true; + + if (blk_mq_hctx_stopped(hctx)) { + run_queue = false; + goto insert; + } if (q->elevator) goto insert; - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; new_cookie = request_to_qc_t(hctx, rq); @@ -1439,7 +1445,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, __blk_mq_requeue_request(rq); insert: - blk_mq_sched_insert_request(rq, false, true, false, may_sleep); + blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, @@ -1447,7 +1453,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, { if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - __blk_mq_try_issue_directly(rq, cookie, false); + __blk_mq_try_issue_directly(hctx, rq, cookie, false); rcu_read_unlock(); } else { unsigned int srcu_idx; @@ -1455,7 +1461,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, might_sleep(); srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); - __blk_mq_try_issue_directly(rq, cookie, true); + __blk_mq_try_issue_directly(hctx, rq, cookie, true); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } }
If queue is stopped, we shouldn't dispatch request into driver and hardware, unfortunately the check is removed in bd166ef183c2(blk-mq-sched: add framework for MQ capable IO schedulers). This patch fixes the issue by moving the check back into __blk_mq_try_issue_directly(). This patch fixes request use-after-free[1][2] during canceling requets of NVMe in nvme_dev_disable(), which can be triggered easily during NVMe reset & remove test. [1] oops kernel log when CONFIG_BLK_DEV_INTEGRITY is on [ 103.412969] BUG: unable to handle kernel NULL pointer dereference at 000000000000000a [ 103.412980] IP: bio_integrity_advance+0x48/0xf0 [ 103.412981] PGD 275a88067 [ 103.412981] P4D 275a88067 [ 103.412982] PUD 276c43067 [ 103.412983] PMD 0 [ 103.412984] [ 103.412986] Oops: 0000 [#1] SMP [ 103.412989] Modules linked in: vfat fat intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel crypto_simd cryptd ipmi_ssif iTCO_wdt iTCO_vendor_support mxm_wmi glue_helper dcdbas ipmi_si mei_me pcspkr mei sg ipmi_devintf lpc_ich ipmi_msghandler shpchp acpi_power_meter wmi nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel nvme ahci nvme_core libahci libata tg3 i2c_core megaraid_sas ptp pps_core dm_mirror dm_region_hash dm_log dm_mod [ 103.413035] CPU: 0 PID: 102 Comm: kworker/0:2 Not tainted 4.11.0+ #1 [ 103.413036] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.2.5 09/06/2016 [ 103.413041] Workqueue: events nvme_remove_dead_ctrl_work [nvme] [ 103.413043] task: ffff9cc8775c8000 task.stack: ffffc033c252c000 [ 103.413045] RIP: 0010:bio_integrity_advance+0x48/0xf0 [ 103.413046] RSP: 0018:ffffc033c252fc10 EFLAGS: 00010202 [ 103.413048] RAX: 0000000000000000 RBX: ffff9cc8720a8cc0 RCX: ffff9cca72958240 [ 103.413049] RDX: ffff9cca72958000 RSI: 0000000000000008 RDI: ffff9cc872537f00 [ 103.413049] RBP: ffffc033c252fc28 R08: 0000000000000000 R09: ffffffffb963a0d5 [ 103.413050] R10: 000000000000063e R11: 0000000000000000 R12: ffff9cc8720a8d18 [ 103.413051] R13: 0000000000001000 R14: ffff9cc872682e00 R15: 00000000fffffffb [ 103.413053] FS: 0000000000000000(0000) GS:ffff9cc877c00000(0000) knlGS:0000000000000000 [ 103.413054] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 103.413055] CR2: 000000000000000a CR3: 0000000276c41000 CR4: 00000000001406f0 [ 103.413056] Call Trace: [ 103.413063] bio_advance+0x2a/0xe0 [ 103.413067] blk_update_request+0x76/0x330 [ 103.413072] blk_mq_end_request+0x1a/0x70 [ 103.413074] blk_mq_dispatch_rq_list+0x370/0x410 [ 103.413076] ? blk_mq_flush_busy_ctxs+0x94/0xe0 [ 103.413080] blk_mq_sched_dispatch_requests+0x173/0x1a0 [ 103.413083] __blk_mq_run_hw_queue+0x8e/0xa0 [ 103.413085] __blk_mq_delay_run_hw_queue+0x9d/0xa0 [ 103.413088] blk_mq_start_hw_queue+0x17/0x20 [ 103.413090] blk_mq_start_hw_queues+0x32/0x50 [ 103.413095] nvme_kill_queues+0x54/0x80 [nvme_core] [ 103.413097] nvme_remove_dead_ctrl_work+0x1f/0x40 [nvme] [ 103.413103] process_one_work+0x149/0x360 [ 103.413105] worker_thread+0x4d/0x3c0 [ 103.413109] kthread+0x109/0x140 [ 103.413111] ? rescuer_thread+0x380/0x380 [ 103.413113] ? kthread_park+0x60/0x60 [ 103.413120] ret_from_fork+0x2c/0x40 [ 103.413121] Code: 08 4c 8b 63 50 48 8b 80 80 00 00 00 48 8b 90 d0 03 00 00 31 c0 48 83 ba 40 02 00 00 00 48 8d 8a 40 02 00 00 48 0f 45 c1 c1 ee 09 <0f> b6 48 0a 0f b6 40 09 41 89 f5 83 e9 09 41 d3 ed 44 0f af e8 [ 103.413145] RIP: bio_integrity_advance+0x48/0xf0 RSP: ffffc033c252fc10 [ 103.413146] CR2: 000000000000000a [ 103.413157] ---[ end trace cd6875d16eb5a11e ]--- [ 103.455368] Kernel panic - not syncing: Fatal exception [ 103.459826] Kernel Offset: 0x37600000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 103.850916] ---[ end Kernel panic - not syncing: Fatal exception [ 103.857637] sched: Unexpected reschedule of offline CPU#1! [ 103.863762] ------------[ cut here ]------------ [2] kernel hang in blk_mq_freeze_queue_wait() when CONFIG_BLK_DEV_INTEGRITY is off [ 247.129825] INFO: task nvme-test:1772 blocked for more than 120 seconds. [ 247.137311] Not tainted 4.12.0-rc2.upstream+ #4 [ 247.142954] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 247.151704] Call Trace: [ 247.154445] __schedule+0x28a/0x880 [ 247.158341] schedule+0x36/0x80 [ 247.161850] blk_mq_freeze_queue_wait+0x4b/0xb0 [ 247.166913] ? remove_wait_queue+0x60/0x60 [ 247.171485] blk_freeze_queue+0x1a/0x20 [ 247.175770] blk_cleanup_queue+0x7f/0x140 [ 247.180252] nvme_ns_remove+0xa3/0xb0 [nvme_core] [ 247.185503] nvme_remove_namespaces+0x32/0x50 [nvme_core] [ 247.191532] nvme_uninit_ctrl+0x2d/0xa0 [nvme_core] [ 247.196977] nvme_remove+0x70/0x110 [nvme] [ 247.201545] pci_device_remove+0x39/0xc0 [ 247.205927] device_release_driver_internal+0x141/0x200 [ 247.211761] device_release_driver+0x12/0x20 [ 247.216531] pci_stop_bus_device+0x8c/0xa0 [ 247.221104] pci_stop_and_remove_bus_device_locked+0x1a/0x30 [ 247.227420] remove_store+0x7c/0x90 [ 247.231320] dev_attr_store+0x18/0x30 [ 247.235409] sysfs_kf_write+0x3a/0x50 [ 247.239497] kernfs_fop_write+0xff/0x180 [ 247.243867] __vfs_write+0x37/0x160 [ 247.247757] ? selinux_file_permission+0xe5/0x120 [ 247.253011] ? security_file_permission+0x3b/0xc0 [ 247.258260] vfs_write+0xb2/0x1b0 [ 247.261964] ? syscall_trace_enter+0x1d0/0x2b0 [ 247.266924] SyS_write+0x55/0xc0 [ 247.270540] do_syscall_64+0x67/0x150 [ 247.274636] entry_SYSCALL64_slow_path+0x25/0x25 [ 247.279794] RIP: 0033:0x7f5c96740840 [ 247.283785] RSP: 002b:00007ffd00e87ee8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 247.292238] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f5c96740840 [ 247.300194] RDX: 0000000000000002 RSI: 00007f5c97060000 RDI: 0000000000000001 [ 247.308159] RBP: 00007f5c97060000 R08: 000000000000000a R09: 00007f5c97059740 [ 247.316123] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f5c96a14400 [ 247.324087] R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000 [ 370.016340] INFO: task nvme-test:1772 blocked for more than 120 seconds. Fixes: bd166ef183c2(blk-mq-sched: add framework for MQ capable IO schedulers) Cc: stable@vger.kernel.org Signed-off-by: Ming Lei <ming.lei@redhat.com> --- block/blk-mq.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-)