
block: model freeze & enter queue as rwsem for supporting lockdep

Message ID 20241018013542.3013963-1-ming.lei@redhat.com (mailing list archive)
State New, archived

Commit Message

Ming Lei Oct. 18, 2024, 1:35 a.m. UTC
Recently we have received several deadlock reports[1][2][3] caused by
blk_mq_freeze_queue() and blk_enter_queue().

It turns out the two behave just like an rwsem, so model them as an rwsem
to support lockdep:

1) model blk_mq_freeze_queue() as down_write_trylock()
- it is an exclusive lock, so the dependency with blk_enter_queue() is covered
- it is a trylock because blk_mq_freeze_queue() is allowed to run concurrently

2) model blk_enter_queue() as down_read()
- it is a shared lock, so concurrent blk_enter_queue() calls are allowed
- it is a read lock, so the dependency with blk_mq_freeze_queue() is modeled
- blk_queue_exit() is often called from other contexts (such as irq), and
it can't be annotated as rwsem_release(), so simply do the release in
blk_enter_queue(); this way still covers as many cases as possible

NVMe is the only subsystem that may call blk_mq_freeze_queue() and
blk_mq_unfreeze_queue() from different contexts, so it is the only
exception to the modeling. Add a tagset flag to exclude it from
the lockdep support.

With lockdep support, such issues can be reported as soon as the
inconsistent lock order is exercised, without waiting for the real
deadlock to trigger.
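
In code terms, the modeling boils down to the following condensed sketch of
the hunks in this patch (annotation only; the existing freeze/enter logic is
elided):

	void blk_freeze_queue_start(struct request_queue *q)
	{
		/* ... existing mq_freeze_depth / percpu_ref_kill() logic ... */

		/* exclusive side, modeled as down_write_trylock() */
		/* (the patch passes the freeze depth as the subclass) */
		if (blk_queue_freeze_lockdep(q))
			rwsem_acquire(&q->q_usage_counter_map, 0, 1, _RET_IP_);
	}

	static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
	{
		/* shared side, modeled as down_read() */
		rwsem_acquire_read(&q->q_usage_counter_map, 0, 0, _RET_IP_);

		/* ... percpu_ref_tryget_live_rcu() and the pm checks ... */

		/*
		 * released right away because blk_queue_exit() often runs in
		 * another context (e.g. irq)
		 */
		rwsem_release(&q->q_usage_counter_map, _RET_IP_);
		return true;
	}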

For example, the following lockdep report can be triggered for the
scenario described in report[3].

[   45.701432] ======================================================
[   45.702621] WARNING: possible circular locking dependency detected
[   45.703829] 6.12.0-rc1_uring_dev+ #188 Not tainted
[   45.704806] ------------------------------------------------------
[   45.705903] bash/1323 is trying to acquire lock:
[   45.706153] ffff88813e075870 (&q->limits_lock){+.+.}-{3:3}, at: queue_wc_store+0x11c/0x380
[   45.706602]
               but task is already holding lock:
[   45.706927] ffff88813e075730 (&q->sysfs_lock){+.+.}-{3:3}, at: queue_attr_store+0xd9/0x170
[   45.707391]
               which lock already depends on the new lock.

[   45.707838]
               the existing dependency chain (in reverse order) is:
[   45.708238]
               -> #2 (&q->sysfs_lock){+.+.}-{3:3}:
[   45.708558]        __mutex_lock+0x177/0x10d0
[   45.708797]        queue_attr_store+0xd9/0x170
[   45.709039]        kernfs_fop_write_iter+0x39f/0x5a0
[   45.709304]        vfs_write+0x5d3/0xe80
[   45.709514]        ksys_write+0xfb/0x1d0
[   45.709723]        do_syscall_64+0x95/0x180
[   45.709946]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   45.710256]
               -> #1 (q->q_usage_counter){++++}-{0:0}:
[   45.710589]        blk_try_enter_queue+0x32/0x340
[   45.710842]        blk_queue_enter+0x97/0x4c0
[   45.711080]        blk_mq_alloc_request+0x347/0x900
[   45.711340]        scsi_execute_cmd+0x183/0xc20
[   45.711584]        read_capacity_16+0x1ce/0xbb0
[   45.711823]        sd_revalidate_disk.isra.0+0xf78/0x8b40
[   45.712119]        sd_probe+0x813/0xf40
[   45.712320]        really_probe+0x1e0/0x8a0
[   45.712542]        __driver_probe_device+0x18c/0x370
[   45.712801]        driver_probe_device+0x4a/0x120
[   45.713053]        __device_attach_driver+0x162/0x270
[   45.713327]        bus_for_each_drv+0x115/0x1a0
[   45.713572]        __device_attach_async_helper+0x1a0/0x240
[   45.713872]        async_run_entry_fn+0x97/0x4f0
[   45.714127]        process_one_work+0x85d/0x1430
[   45.714377]        worker_thread+0x5be/0xff0
[   45.714610]        kthread+0x293/0x350
[   45.714811]        ret_from_fork+0x31/0x70
[   45.715032]        ret_from_fork_asm+0x1a/0x30
[   45.715667]
               -> #0 (&q->limits_lock){+.+.}-{3:3}:
[   45.716819]        __lock_acquire+0x310a/0x6060
[   45.717476]        lock_acquire.part.0+0x122/0x360
[   45.718133]        __mutex_lock+0x177/0x10d0
[   45.718759]        queue_wc_store+0x11c/0x380
[   45.719384]        queue_attr_store+0xff/0x170
[   45.720007]        kernfs_fop_write_iter+0x39f/0x5a0
[   45.720647]        vfs_write+0x5d3/0xe80
[   45.721252]        ksys_write+0xfb/0x1d0
[   45.721847]        do_syscall_64+0x95/0x180
[   45.722433]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   45.723085]
               other info that might help us debug this:

[   45.724532] Chain exists of:
                 &q->limits_lock --> q->q_usage_counter --> &q->sysfs_lock

[   45.726122]  Possible unsafe locking scenario:

[   45.727114]        CPU0                    CPU1
[   45.727687]        ----                    ----
[   45.728270]   lock(&q->sysfs_lock);
[   45.728792]                                lock(q->q_usage_counter);
[   45.729466]                                lock(&q->sysfs_lock);
[   45.730119]   lock(&q->limits_lock);
[   45.730655]
                *** DEADLOCK ***

[   45.731983] 5 locks held by bash/1323:
[   45.732524]  #0: ffff88811a4a0450 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0xfb/0x1d0
[   45.733290]  #1: ffff88811fa35890 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x25d/0x5a0
[   45.734113]  #2: ffff888118e9fc20 (kn->active#133){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x281/0x5a0
[   45.734939]  #3: ffff88813e0751e0 (q->q_usage_counter){++++}-{0:0}, at: blk_mq_freeze_queue+0x12/0x20
[   45.735797]  #4: ffff88813e075730 (&q->sysfs_lock){+.+.}-{3:3}, at: queue_attr_store+0xd9/0x170
[   45.736626]
               stack backtrace:
[   45.737598] CPU: 9 UID: 0 PID: 1323 Comm: bash Not tainted 6.12.0-rc1_uring_dev+ #188
[   45.738388] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014
[   45.739222] Call Trace:
[   45.739762]  <TASK>
[   45.740309]  dump_stack_lvl+0x84/0xd0
[   45.740962]  print_circular_bug.cold+0x1e4/0x278
[   45.741628]  check_noncircular+0x331/0x410
[   45.742289]  ? __pfx_check_noncircular+0x10/0x10
[   45.742933]  ? lock_release+0x588/0xcc0
[   45.743536]  ? lockdep_lock+0xbe/0x1c0
[   45.744156]  ? __pfx_lockdep_lock+0x10/0x10
[   45.744785]  ? is_bpf_text_address+0x21/0x100
[   45.745442]  __lock_acquire+0x310a/0x6060
[   45.746088]  ? __pfx___lock_acquire+0x10/0x10
[   45.746735]  ? __pfx___bfs+0x10/0x10
[   45.747325]  lock_acquire.part.0+0x122/0x360
[   45.747947]  ? queue_wc_store+0x11c/0x380
[   45.748564]  ? __pfx_lock_acquire.part.0+0x10/0x10
[   45.749254]  ? trace_lock_acquire+0x12f/0x1a0
[   45.749903]  ? queue_wc_store+0x11c/0x380
[   45.750545]  ? lock_acquire+0x2f/0xb0
[   45.751168]  ? queue_wc_store+0x11c/0x380
[   45.751777]  __mutex_lock+0x177/0x10d0
[   45.752379]  ? queue_wc_store+0x11c/0x380
[   45.752988]  ? queue_wc_store+0x11c/0x380
[   45.753586]  ? __pfx___mutex_lock+0x10/0x10
[   45.754202]  ? __pfx___lock_acquire+0x10/0x10
[   45.754823]  ? lock_acquire.part.0+0x122/0x360
[   45.755456]  ? queue_wc_store+0x11c/0x380
[   45.756063]  queue_wc_store+0x11c/0x380
[   45.756653]  ? __pfx_lock_acquired+0x10/0x10
[   45.757305]  ? lock_acquire+0x2f/0xb0
[   45.757904]  ? trace_contention_end+0xd4/0x110
[   45.758524]  ? __pfx_queue_wc_store+0x10/0x10
[   45.759172]  ? queue_attr_store+0xd9/0x170
[   45.759899]  ? __pfx_autoremove_wake_function+0x10/0x10
[   45.760587]  queue_attr_store+0xff/0x170
[   45.761213]  ? sysfs_kf_write+0x41/0x170
[   45.761823]  ? __pfx_sysfs_kf_write+0x10/0x10
[   45.762461]  kernfs_fop_write_iter+0x39f/0x5a0
[   45.763109]  vfs_write+0x5d3/0xe80
[   45.763690]  ? lockdep_hardirqs_on_prepare+0x274/0x3f0
[   45.764368]  ? __pfx_vfs_write+0x10/0x10
[   45.764964]  ksys_write+0xfb/0x1d0
[   45.765544]  ? __pfx_ksys_write+0x10/0x10
[   45.766160]  ? ksys_dup3+0xce/0x2b0
[   45.766710]  do_syscall_64+0x95/0x180
[   45.767267]  ? filp_close+0x1d/0x30
[   45.767801]  ? do_dup2+0x27c/0x4f0
[   45.768334]  ? syscall_exit_to_user_mode+0x97/0x290
[   45.768930]  ? lockdep_hardirqs_on_prepare+0x274/0x3f0
[   45.769538]  ? syscall_exit_to_user_mode+0x97/0x290
[   45.770136]  ? do_syscall_64+0xa1/0x180
[   45.770665]  ? clear_bhb_loop+0x25/0x80
[   45.771206]  ? clear_bhb_loop+0x25/0x80
[   45.771735]  ? clear_bhb_loop+0x25/0x80
[   45.772261]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   45.772843] RIP: 0033:0x7f3b17be9984
[   45.773363] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d c5 06 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89
[   45.775131] RSP: 002b:00007fffaa975ba8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
[   45.775909] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f3b17be9984
[   45.776672] RDX: 0000000000000005 RSI: 0000556af3758790 RDI: 0000000000000001
[   45.777454] RBP: 00007fffaa975bd0 R08: 0000000000000073 R09: 00000000ffffffff
[   45.778251] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000005
[   45.779039] R13: 0000556af3758790 R14: 00007f3b17cc35c0 R15: 00007f3b17cc0f00
[   45.779822]  </TASK>

[1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler'
https://bugzilla.kernel.org/show_bug.cgi?id=219166

[2] del_gendisk() vs blk_queue_enter() race condition
https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/

[3] queue_freeze & queue_enter deadlock in scsi
https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c         |  3 +++
 block/blk-mq-debugfs.c   |  1 +
 block/blk-mq.c           | 15 +++++++++++++++
 block/blk.h              |  9 +++++++++
 block/genhd.c            |  3 +++
 drivers/nvme/host/core.c |  4 ++--
 include/linux/blk-mq.h   |  5 ++++-
 include/linux/blkdev.h   |  6 ++++++
 8 files changed, 43 insertions(+), 3 deletions(-)

Comments

Bart Van Assche Oct. 18, 2024, 4:57 p.m. UTC | #1
On 10/17/24 6:35 PM, Ming Lei wrote:
> Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> and blk_enter_queue().
> 
> Turns out the two are just like one rwsem, so model them as rwsem for
> supporting lockdep:
> 
> 1) model blk_mq_freeze_queue() as down_write_trylock()
> - it is exclusive lock, so dependency with blk_enter_queue() is covered
> - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently
> 
> 2) model blk_enter_queue() as down_read()
> - it is shared lock, so concurrent blk_enter_queue() are allowed
> - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> - blk_queue_exit() is often called from other contexts(such as irq), and
> it can't be annotated as rwsem_release(), so simply do it in
> blk_enter_queue(), this way still covered cases as many as possible
> 
> NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> blk_mq_unfreeze_queue() from different context, so it is the only
> exception for the modeling. Add one tagset flag to exclude it from
> the lockdep support.
> 
> With lockdep support, such kind of reports may be reported asap and
> needn't wait until the real deadlock is triggered.
> 
> For example, the following lockdep report can be triggered in the
> report[3].

Hi Ming,

Thank you for having reported this issue and for having proposed a
patch. I think the following is missing from the patch description:
(a) An analysis of which code causes the inconsistent nested lock order.
(b) A discussion of potential alternatives.

It seems unavoidable to me that some code freezes request queue(s)
before the limits are updated. Additionally, there is code that freezes
queues first (sd_revalidate_disk()), executes commands and only then
updates the limits. Hence the inconsistent order of freezing queues and
obtaining limits_lock.

The alternative (entirely untested) solution below has the following
advantages:
* No additional information has to be provided to lockdep about the
   locking order.
* No new flags are required (SKIP_FREEZE_LOCKDEP).
* No exceptions are necessary for blk_queue_exit() nor for the NVMe
   driver.

Thanks,

Bart.


diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 83b696ba0cac..50962ca037d7 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -211,6 +211,7 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count,
  	if (err)
  		return err;

+	blk_mq_freeze_queue(q);
  	/* note that the flags are inverted vs the values in the sysfs files */
  	lim = queue_limits_start_update(q);
  	if (val)
@@ -218,7 +219,6 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count,
  	else
  		lim.integrity.flags |= flag;

-	blk_mq_freeze_queue(q);
  	err = queue_limits_commit_update(q, &lim);
  	blk_mq_unfreeze_queue(q);
  	if (err)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 194417abc105..a3d2613bad1d 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1106,12 +1106,12 @@ cache_type_store(struct device *dev, struct device_attribute *attr,

  	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);

+	blk_mq_freeze_queue(disk->queue);
  	lim = queue_limits_start_update(disk->queue);
  	if (virtblk_get_cache_mode(vdev))
  		lim.features |= BLK_FEAT_WRITE_CACHE;
  	else
  		lim.features &= ~BLK_FEAT_WRITE_CACHE;
-	blk_mq_freeze_queue(disk->queue);
  	i = queue_limits_commit_update(disk->queue, &lim);
  	blk_mq_unfreeze_queue(disk->queue);
  	if (i)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index ca4bc0ac76ad..1f6ab9768ac7 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -175,9 +175,9 @@ cache_type_store(struct device *dev, struct device_attribute *attr,
  		sdkp->WCE = wce;
  		sdkp->RCD = rcd;

+		blk_mq_freeze_queue(sdkp->disk->queue);
  		lim = queue_limits_start_update(sdkp->disk->queue);
  		sd_set_flush_flag(sdkp, &lim);
-		blk_mq_freeze_queue(sdkp->disk->queue);
  		ret = queue_limits_commit_update(sdkp->disk->queue, &lim);
  		blk_mq_unfreeze_queue(sdkp->disk->queue);
  		if (ret)
@@ -481,9 +481,9 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
  	if (mode < 0)
  		return -EINVAL;

+	blk_mq_freeze_queue(sdkp->disk->queue);
  	lim = queue_limits_start_update(sdkp->disk->queue);
  	sd_config_discard(sdkp, &lim, mode);
-	blk_mq_freeze_queue(sdkp->disk->queue);
  	err = queue_limits_commit_update(sdkp->disk->queue, &lim);
  	blk_mq_unfreeze_queue(sdkp->disk->queue);
  	if (err)
@@ -592,9 +592,9 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr,
  		sdkp->max_ws_blocks = max;
  	}

+	blk_mq_freeze_queue(sdkp->disk->queue);
  	lim = queue_limits_start_update(sdkp->disk->queue);
  	sd_config_write_same(sdkp, &lim);
-	blk_mq_freeze_queue(sdkp->disk->queue);
  	err = queue_limits_commit_update(sdkp->disk->queue, &lim);
  	blk_mq_unfreeze_queue(sdkp->disk->queue);
  	if (err)
@@ -3700,7 +3700,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
  	struct scsi_disk *sdkp = scsi_disk(disk);
  	struct scsi_device *sdp = sdkp->device;
  	sector_t old_capacity = sdkp->capacity;
-	struct queue_limits lim;
+	struct queue_limits lim = {};
  	unsigned char *buffer;
  	unsigned int dev_max;
  	int err;
@@ -3724,8 +3724,6 @@ static int sd_revalidate_disk(struct gendisk *disk)

  	sd_spinup_disk(sdkp);

-	lim = queue_limits_start_update(sdkp->disk->queue);
-
  	/*
  	 * Without media there is no reason to ask; moreover, some devices
  	 * react badly if we do.
@@ -3804,7 +3802,36 @@ static int sd_revalidate_disk(struct gendisk *disk)
  	kfree(buffer);

  	blk_mq_freeze_queue(sdkp->disk->queue);
-	err = queue_limits_commit_update(sdkp->disk->queue, &lim);
+	{
+		struct queue_limits lim2 =
+			queue_limits_start_update(sdkp->disk->queue);
+		/* Keep the lim2 member assignments below in alphabetical order. */
+		lim2.alignment_offset = lim.alignment_offset;
+		lim2.atomic_write_hw_boundary = lim.atomic_write_hw_boundary;
+		lim2.atomic_write_hw_max = lim.atomic_write_hw_max;
+		lim2.atomic_write_hw_unit_max = lim.atomic_write_hw_unit_max;
+		lim2.atomic_write_hw_unit_min = lim.atomic_write_hw_unit_min;
+		lim2.chunk_sectors = lim.chunk_sectors;
+		lim2.discard_alignment = lim.discard_alignment;
+		lim2.discard_granularity = lim.discard_granularity;
+		lim2.dma_alignment = lim.dma_alignment;
+		lim2.features |= lim.features;
+		lim2.integrity = lim.integrity;
+		lim2.logical_block_size = lim.logical_block_size;
+		lim2.max_active_zones = lim.max_active_zones;
+		lim2.max_hw_discard_sectors = lim.max_hw_discard_sectors;
+		lim2.max_hw_sectors = lim.max_hw_sectors;
+		lim2.max_integrity_segments = lim.max_integrity_segments;
+		lim2.max_open_zones = lim.max_open_zones;
+		lim2.max_segment_size = lim.max_segment_size;
+		lim2.max_segments = lim.max_segments;
+		lim2.max_write_zeroes_sectors = lim.max_write_zeroes_sectors;
+		lim2.max_zone_append_sectors = lim.max_zone_append_sectors;
+		lim2.physical_block_size = lim.physical_block_size;
+		lim2.virt_boundary_mask = lim.virt_boundary_mask;
+		lim2.zone_write_granularity = lim.zone_write_granularity;
+		err = queue_limits_commit_update(sdkp->disk->queue, &lim2);
+	}
  	blk_mq_unfreeze_queue(sdkp->disk->queue);
  	if (err)
  		return err;
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 198bec87bb8e..e9c1673ce6bb 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -795,9 +795,9 @@ static int get_sectorsize(struct scsi_cd *cd)
  		set_capacity(cd->disk, cd->capacity);
  	}

+	blk_mq_freeze_queue(q);
  	lim = queue_limits_start_update(q);
  	lim.logical_block_size = sector_size;
-	blk_mq_freeze_queue(q);
  	err = queue_limits_commit_update(q, &lim);
  	blk_mq_unfreeze_queue(q);
  	return err;
kernel test robot Oct. 18, 2024, 6:45 p.m. UTC | #2
Hi Ming,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.12-rc3 next-20241018]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/block-model-freeze-enter-queue-as-rwsem-for-supporting-lockdep/20241018-093704
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20241018013542.3013963-1-ming.lei%40redhat.com
patch subject: [PATCH] block: model freeze & enter queue as rwsem for supporting lockdep
config: x86_64-allnoconfig (https://download.01.org/0day-ci/archive/20241019/202410190214.KgZovHXy-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241019/202410190214.KgZovHXy-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410190214.KgZovHXy-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> block/blk-mq.c:125:6: warning: variable 'sub_class' set but not used [-Wunused-but-set-variable]
     125 |         int sub_class;
         |             ^
   1 warning generated.


vim +/sub_class +125 block/blk-mq.c

   122	
   123	void blk_freeze_queue_start(struct request_queue *q)
   124	{
 > 125		int sub_class;
   126	
   127		mutex_lock(&q->mq_freeze_lock);
   128		sub_class = q->mq_freeze_depth;
   129		if (++q->mq_freeze_depth == 1) {
   130			percpu_ref_kill(&q->q_usage_counter);
   131			mutex_unlock(&q->mq_freeze_lock);
   132			if (queue_is_mq(q))
   133				blk_mq_run_hw_queues(q, false);
   134		} else {
   135			mutex_unlock(&q->mq_freeze_lock);
   136		}
   137		/*
   138		 * model as down_write_trylock() so that two concurrent freeze queue
   139		 * can be allowed
   140		 */
   141		if (blk_queue_freeze_lockdep(q))
   142			rwsem_acquire(&q->q_usage_counter_map, sub_class, 1, _RET_IP_);
   143	}
   144	EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
   145
Jens Axboe Oct. 19, 2024, 10:46 p.m. UTC | #3
On 10/17/24 7:35 PM, Ming Lei wrote:
> Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> and blk_enter_queue().
> 
> Turns out the two are just like one rwsem, so model them as rwsem for
> supporting lockdep:
> 
> 1) model blk_mq_freeze_queue() as down_write_trylock()
> - it is exclusive lock, so dependency with blk_enter_queue() is covered
> - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently
> 
> 2) model blk_enter_queue() as down_read()
> - it is shared lock, so concurrent blk_enter_queue() are allowed
> - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> - blk_queue_exit() is often called from other contexts(such as irq), and
> it can't be annotated as rwsem_release(), so simply do it in
> blk_enter_queue(), this way still covered cases as many as possible
> 
> NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> blk_mq_unfreeze_queue() from different context, so it is the only
> exception for the modeling. Add one tagset flag to exclude it from
> the lockdep support.
> 
> With lockdep support, such kind of reports may be reported asap and
> needn't wait until the real deadlock is triggered.

I think this is a great idea. We've had way too many issues in this
area; getting lockdep to grok it (and report issues) is the ideal way to
avoid that, and even to find issues we haven't come across yet.
Ming Lei Oct. 21, 2024, 2:20 a.m. UTC | #4
On Fri, Oct 18, 2024 at 09:57:12AM -0700, Bart Van Assche wrote:
> 
> On 10/17/24 6:35 PM, Ming Lei wrote:
> > Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> > and blk_enter_queue().
> > 
> > Turns out the two are just like one rwsem, so model them as rwsem for
> > supporting lockdep:
> > 
> > 1) model blk_mq_freeze_queue() as down_write_trylock()
> > - it is exclusive lock, so dependency with blk_enter_queue() is covered
> > - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently
> > 
> > 2) model blk_enter_queue() as down_read()
> > - it is shared lock, so concurrent blk_enter_queue() are allowed
> > - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> > - blk_queue_exit() is often called from other contexts(such as irq), and
> > it can't be annotated as rwsem_release(), so simply do it in
> > blk_enter_queue(), this way still covered cases as many as possible
> > 
> > NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> > blk_mq_unfreeze_queue() from different context, so it is the only
> > exception for the modeling. Add one tagset flag to exclude it from
> > the lockdep support.
> > 
> > With lockdep support, such kind of reports may be reported asap and
> > needn't wait until the real deadlock is triggered.
> > 
> > For example, the following lockdep report can be triggered in the
> > report[3].
> 
> Hi Ming,
> 
> Thank you for having reported this issue and for having proposed a
> patch. I think the following is missing from the patch description:
> (a) An analysis of which code causes the inconsistent nested lock order.
> (b) A discussion of potential alternatives.

I guess you might have misunderstood this patch: it just enables lockdep for
freezing & entering the queue, so that lockdep can run early verification
on patches & kernels.

The basic idea is to model .q_usage_counter as rwsem:

- treat freeze_queue as down_write_trylock()
- treat enter_queue() as down_read()

We needn't care about the relationship with other locks if the following
holds:

- .q_usage_counter can be modeled as a lock, and

- the modeling in this patch is correct.

Then the lock order can be verified by lockdep.

> 
> It seems unavoidable to me that some code freezes request queue(s)
> before the limits are updated. Additionally, there is code that freezes
> queues first (sd_revalidate_disk()), executes commands and next updates
> limits. Hence the inconsistent order of freezing queues and obtaining
> limits_lock.

This is just one specific issue that can be reported with lockdep
support; this patch only focuses on how to model freeze/enter queue as a lock,
so please move discussion of this specific locking issue to a standalone thread.

> 
> The alternative (entirely untested) solution below has the following
> advantages:
> * No additional information has to be provided to lockdep about the
>   locking order.

Can you explain a bit more? I think this patch doesn't provide
`additional` info to lockdep APIs.

> * No new flags are required (SKIP_FREEZE_LOCKDEP).

So far that isn't possible: the NVMe subsystem freezes the queue in one
context and unfreezes it from another, and this has already caused a lot
of trouble.

Removing SKIP_FREEZE_LOCKDEP would require refactoring the NVMe error
handling code path, which is not a short-term thing.

> * No exceptions are necessary for blk_queue_exit() nor for the NVMe
>   driver.

I don't think that is really possible, since a lock wouldn't be used in this
way. More importantly, we already cover plenty of cases even without taking
blk_queue_exit() into account.


Thanks, 
Ming
Ming Lei Oct. 21, 2024, 11:17 a.m. UTC | #5
On Sat, Oct 19, 2024 at 04:46:13PM -0600, Jens Axboe wrote:
> On 10/17/24 7:35 PM, Ming Lei wrote:
> > Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> > and blk_enter_queue().
> > 
> > Turns out the two are just like one rwsem, so model them as rwsem for
> > supporting lockdep:
> > 
> > 1) model blk_mq_freeze_queue() as down_write_trylock()
> > - it is exclusive lock, so dependency with blk_enter_queue() is covered
> > - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently
> > 
> > 2) model blk_enter_queue() as down_read()
> > - it is shared lock, so concurrent blk_enter_queue() are allowed
> > - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> > - blk_queue_exit() is often called from other contexts(such as irq), and
> > it can't be annotated as rwsem_release(), so simply do it in
> > blk_enter_queue(), this way still covered cases as many as possible
> > 
> > NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> > blk_mq_unfreeze_queue() from different context, so it is the only
> > exception for the modeling. Add one tagset flag to exclude it from
> > the lockdep support.
> > 
> > With lockdep support, such kind of reports may be reported asap and
> > needn't wait until the real deadlock is triggered.
> 
> I think this is a great idea. We've had way too many issues in this
> area, getting lockdep to grok it (and report issues) is the ideal way to
> avoid that, and even find issues we haven't come across yet.

So far, the main false positive is that the modeling is no longer correct
once blk_queue_start_drain() is called and the disk is marked GD_DEAD or
the queue is marked QUEUE_FLAG_DYING, since __bio_queue_enter() or
blk_queue_enter() can return immediately in that situation.


[  281.645392] ======================================================
[  281.647189] WARNING: possible circular locking dependency detected
[  281.648770] 6.11.0_nbd+ #405 Not tainted
[  281.649171] ------------------------------------------------------
[  281.649668] nvme/10551 is trying to acquire lock:
[  281.650100] ffff938a5717e3e0 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: __flush_work+0x1d6/0x4d0
[  281.650771]
               but task is already holding lock:
[  281.651442] ffff938a12206c48 (q->q_usage_counter){++++}-{0:0}, at: blk_queue_start_drain+0x12/0x40
[  281.652085]
               which lock already depends on the new lock.

[  281.653061]
               the existing dependency chain (in reverse order) is:
[  281.653820]
               -> #1 (q->q_usage_counter){++++}-{0:0}:
[  281.654525]        blk_try_enter_queue+0xc7/0x230
[  281.654951]        __submit_bio+0xa7/0x190
[  281.655339]        submit_bio_noacct_nocheck+0x1b2/0x400
[  281.655779]        __block_write_full_folio+0x1e7/0x400
[  281.656212]        write_cache_pages+0x62/0xb0
[  281.656608]        blkdev_writepages+0x56/0x90
[  281.657007]        do_writepages+0x76/0x270
[  281.657389]        __writeback_single_inode+0x5b/0x4c0
[  281.657813]        writeback_sb_inodes+0x22e/0x550
[  281.658220]        __writeback_inodes_wb+0x4c/0xf0
[  281.658617]        wb_writeback+0x193/0x3f0
[  281.658995]        wb_workfn+0x343/0x530
[  281.659353]        process_one_work+0x212/0x700
[  281.659739]        worker_thread+0x1ce/0x380
[  281.660118]        kthread+0xd2/0x110
[  281.660460]        ret_from_fork+0x31/0x50
[  281.660818]        ret_from_fork_asm+0x1a/0x30
[  281.661192]
               -> #0 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}:
[  281.661876]        __lock_acquire+0x15c0/0x23e0
[  281.662254]        lock_acquire+0xd8/0x300
[  281.662603]        __flush_work+0x1f2/0x4d0
[  281.662954]        wb_shutdown+0xa1/0xd0
[  281.663285]        bdi_unregister+0x92/0x250
[  281.663632]        del_gendisk+0x37b/0x3a0
[  281.664017]        nvme_mpath_shutdown_disk+0x58/0x60 [nvme_core]
[  281.664453]        nvme_ns_remove+0x17f/0x210 [nvme_core]
[  281.664854]        nvme_remove_namespaces+0xf7/0x150 [nvme_core]
[  281.665304]        nvme_do_delete_ctrl+0x71/0x90 [nvme_core]
[  281.665728]        nvme_delete_ctrl_sync+0x3f/0x50 [nvme_core]
[  281.666159]        nvme_sysfs_delete+0x38/0x50 [nvme_core]
[  281.666569]        kernfs_fop_write_iter+0x15c/0x210
[  281.666953]        vfs_write+0x2a7/0x540
[  281.667281]        ksys_write+0x75/0x100
[  281.667607]        do_syscall_64+0x95/0x180
[  281.667948]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  281.668352]
               other info that might help us debug this:

[  281.669122]  Possible unsafe locking scenario:

[  281.669671]        CPU0                    CPU1
[  281.670019]        ----                    ----
[  281.670358]   lock(q->q_usage_counter);
[  281.670676]                                lock((work_completion)(&(&wb->dwork)->work));
[  281.671186]                                lock(q->q_usage_counter);
[  281.671628]   lock((work_completion)(&(&wb->dwork)->work));
[  281.672056]
                *** DEADLOCK ***



thanks,
Ming
Christoph Hellwig Oct. 22, 2024, 6:18 a.m. UTC | #6
On Fri, Oct 18, 2024 at 09:35:42AM +0800, Ming Lei wrote:
> Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> and blk_enter_queue().
> 
> Turns out the two are just like one rwsem, so model them as rwsem for
> supporting lockdep:
> 
> 1) model blk_mq_freeze_queue() as down_write_trylock()
> - it is exclusive lock, so dependency with blk_enter_queue() is covered
> - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently

Is this using the right terminology?  down_write() and other locking
primitives can obviously be called concurrently; the whole point is to
synchronize the code run inside the critical section.

I think what you mean here is blk_mq_freeze_queue can be called more
than once due to a global recursion counter.

Not sure modelling it as a trylock is the right approach here,
I've added the lockdep maintainers if they have an idea.

> 
> 2) model blk_enter_queue() as down_read()
> - it is shared lock, so concurrent blk_enter_queue() are allowed
> - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> - blk_queue_exit() is often called from other contexts(such as irq), and
> it can't be annotated as rwsem_release(), so simply do it in
> blk_enter_queue(), this way still covered cases as many as possible
> 
> NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> blk_mq_unfreeze_queue() from different context, so it is the only
> exception for the modeling. Add one tagset flag to exclude it from
> the lockdep support.

rwsems have a non_owner variant for these kinds of use cases;
we should do the same for blk_mq_freeze_queue and annotate the callsite
instead of using a global flag.
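
A per-callsite variant might look roughly like this (hypothetical sketch,
not an existing API; the point is just that the lockdep annotation stays in
the default helpers while callers that unfreeze from a different task opt
out explicitly):

	/* default path: annotate freeze as taking the lockdep "write lock" */
	/* (assumes the rwsem_acquire() moves here from blk_freeze_queue_start()) */
	void blk_mq_freeze_queue(struct request_queue *q)
	{
		blk_freeze_queue_start(q);
		blk_mq_freeze_queue_wait(q);
		rwsem_acquire(&q->q_usage_counter_map, 0, 1, _RET_IP_);
	}

	/*
	 * hypothetical non-owner variant for callers (e.g. NVMe error
	 * handling) that unfreeze from another context; it skips the
	 * annotation instead of relying on a tagset-wide flag
	 */
	void blk_mq_freeze_queue_non_owner(struct request_queue *q)
	{
		blk_freeze_queue_start(q);
		blk_mq_freeze_queue_wait(q);
	}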
Christoph Hellwig Oct. 22, 2024, 6:26 a.m. UTC | #7
On Fri, Oct 18, 2024 at 09:57:12AM -0700, Bart Van Assche wrote:
> Thank you for having reported this issue and for having proposed a
> patch. I think the following is missing from the patch description:
> (a) An analysis of which code causes the inconsistent nested lock order.
> (b) A discussion of potential alternatives.
>
> It seems unavoidable to me that some code freezes request queue(s)
> before the limits are updated. Additionally, there is code that freezes
> queues first (sd_revalidate_disk()), executes commands and next updates
> limits. Hence the inconsistent order of freezing queues and obtaining
> limits_lock.
>
> The alternative (entirely untested) solution below has the following
> advantages:
> * No additional information has to be provided to lockdep about the
>   locking order.
> * No new flags are required (SKIP_FREEZE_LOCKDEP).
> * No exceptions are necessary for blk_queue_exit() nor for the NVMe
>   driver.

As mentioned by Ming, fixing the lockdep reports is a bit different
from adding the lockdep support in the first place.  But I'll chime in
on your proposal to fix the report anyway.

I think requiring the queue to be frozen before we start the queue
limits update is reasonable in general, but it is a major pain for
sd_revalidate_disk(), and unfortunately I don't think your proposed patch
works there, as it doesn't protect against concurrent updates from,
say, sysfs, and also doesn't scale to adding new limits.

One option to deal with this would be to add an update sequence to
the queue_limits.  sd_revalidate_disk would sample it where it
currently starts the update, then drop the lock and query the
device, then take the limits lock again and check the sequence.
If it matches, commit the update, otherwise retry.
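
Something along those lines, perhaps (entirely untested sketch; the
update_seq field is hypothetical, everything else uses the existing
queue_limits_* helpers):

	struct request_queue *q = sdkp->disk->queue;
	struct queue_limits lim = { }, lim2;
	unsigned int seq;
	int err;

again:
	seq = READ_ONCE(q->limits.update_seq);	/* hypothetical sequence field */

	/* query the device without holding limits_lock and fill in 'lim' */

	blk_mq_freeze_queue(q);
	lim2 = queue_limits_start_update(q);	/* takes q->limits_lock */
	if (READ_ONCE(q->limits.update_seq) != seq) {
		/* limits changed while we were talking to the device: retry */
		queue_limits_cancel_update(q);
		blk_mq_unfreeze_queue(q);
		goto again;
	}
	/* merge the values gathered in 'lim' into 'lim2', bump update_seq */
	err = queue_limits_commit_update(q, &lim2);
	blk_mq_unfreeze_queue(q);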
Peter Zijlstra Oct. 22, 2024, 7:19 a.m. UTC | #8
On Tue, Oct 22, 2024 at 08:18:05AM +0200, Christoph Hellwig wrote:
> On Fri, Oct 18, 2024 at 09:35:42AM +0800, Ming Lei wrote:
> > Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> > and blk_enter_queue().
> > 
> > Turns out the two are just like one rwsem, so model them as rwsem for
> > supporting lockdep:
> > 
> > 1) model blk_mq_freeze_queue() as down_write_trylock()
> > - it is exclusive lock, so dependency with blk_enter_queue() is covered
> > - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently
> 
> Is this using the right terminology?  down_write and other locking
> primitives obviously can run concurrently, the whole point is to
> synchronize the code run inside the criticial section.
> 
> I think what you mean here is blk_mq_freeze_queue can be called more
> than once due to a global recursion counter.
> 
> Not sure modelling it as a trylock is the right approach here,
> I've added the lockdep maintainers if they have an idea.

So lockdep supports recursive reader state, but you're looking at
recursive exclusive state?

If you achieve this using an external nest count, then it is probably (I
haven't yet had morning juice) sufficient to use the regular exclusive
state on the outermost lock / unlock pair and simply ignore the inner
locks.
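
For blk_freeze_queue_start() that would mean something like the following
(untested sketch against the hunk in this patch): annotate only the 0 -> 1
transition, and correspondingly release only when mq_freeze_depth drops
back to 0 in __blk_mq_unfreeze_queue():

	void blk_freeze_queue_start(struct request_queue *q)
	{
		bool outermost;

		mutex_lock(&q->mq_freeze_lock);
		outermost = ++q->mq_freeze_depth == 1;
		if (outermost) {
			percpu_ref_kill(&q->q_usage_counter);
			mutex_unlock(&q->mq_freeze_lock);
			if (queue_is_mq(q))
				blk_mq_run_hw_queues(q, false);
		} else {
			mutex_unlock(&q->mq_freeze_lock);
		}
		/*
		 * only the outermost freeze is visible to lockdep; recursion
		 * is already handled by mq_freeze_depth, so no trylock trick
		 * or subclass is needed
		 */
		if (outermost && blk_queue_freeze_lockdep(q))
			rwsem_acquire(&q->q_usage_counter_map, 0, 0, _RET_IP_);
	}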
Christoph Hellwig Oct. 22, 2024, 7:21 a.m. UTC | #9
On Tue, Oct 22, 2024 at 09:19:05AM +0200, Peter Zijlstra wrote:
> > Not sure modelling it as a trylock is the right approach here,
> > I've added the lockdep maintainers if they have an idea.
> 
> So lockdep supports recursive reader state, but you're looking at
> recursive exclusive state?

Yes.

> If you achieve this using an external nest count, then it is probably (I
> haven't yet had morning juice) sufficient to use the regular exclusive
> state on the outermost lock / unlock pair and simply ignore the inner
> locks.

There obviously is an external nest count here, so I guess that should
do the job.  Ming?
Bart Van Assche Oct. 22, 2024, 3:05 p.m. UTC | #10
On 10/17/24 6:35 PM, Ming Lei wrote:
>                 -> #1 (q->q_usage_counter){++++}-{0:0}:

What code in the upstream kernel associates lockdep information
with a *counter*? I haven't found it. Did I perhaps overlook something?

Thanks,

Bart.
Ming Lei Oct. 23, 2024, 3:22 a.m. UTC | #11
On Tue, Oct 22, 2024 at 08:18:05AM +0200, Christoph Hellwig wrote:
> On Fri, Oct 18, 2024 at 09:35:42AM +0800, Ming Lei wrote:
> > Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue
> > and blk_enter_queue().
> > 
> > Turns out the two are just like one rwsem, so model them as rwsem for
> > supporting lockdep:
> > 
> > 1) model blk_mq_freeze_queue() as down_write_trylock()
> > - it is exclusive lock, so dependency with blk_enter_queue() is covered
> > - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently
> 
> Is this using the right terminology?  down_write and other locking
> primitives obviously can run concurrently, the whole point is to
> synchronize the code run inside the criticial section.
> 
> I think what you mean here is blk_mq_freeze_queue can be called more
> than once due to a global recursion counter.
> 
> Not sure modelling it as a trylock is the right approach here,
> I've added the lockdep maintainers if they have an idea.

Yeah, it looks like we can just call lock_acquire() for the outermost
freeze/unfreeze.

> 
> > 
> > 2) model blk_enter_queue() as down_read()
> > - it is shared lock, so concurrent blk_enter_queue() are allowed
> > - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> > - blk_queue_exit() is often called from other contexts(such as irq), and
> > it can't be annotated as rwsem_release(), so simply do it in
> > blk_enter_queue(), this way still covered cases as many as possible
> > 
> > NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> > blk_mq_unfreeze_queue() from different context, so it is the only
> > exception for the modeling. Add one tagset flag to exclude it from
> > the lockdep support.
> 
> rwsems have a non_owner variant for these kinds of uses cases,
> we should do the same for blk_mq_freeze_queue to annoate the callsite
> instead of a global flag.
 
Here it isn't a real rwsem, and lockdep doesn't have non_owner variants
of rwsem_acquire() and rwsem_release().

Another corner case is blk_mark_disk_dead(), in which freeze & unfreeze
may also run from different task contexts.


thanks,
Ming
Christoph Hellwig Oct. 23, 2024, 6:07 a.m. UTC | #12
On Wed, Oct 23, 2024 at 11:22:55AM +0800, Ming Lei wrote:
> > > 2) model blk_enter_queue() as down_read()
> > > - it is shared lock, so concurrent blk_enter_queue() are allowed
> > > - it is read lock, so dependency with blk_mq_freeze_queue() is modeled
> > > - blk_queue_exit() is often called from other contexts(such as irq), and
> > > it can't be annotated as rwsem_release(), so simply do it in
> > > blk_enter_queue(), this way still covered cases as many as possible
> > > 
> > > NVMe is the only subsystem which may call blk_mq_freeze_queue() and
> > > blk_mq_unfreeze_queue() from different context, so it is the only
> > > exception for the modeling. Add one tagset flag to exclude it from
> > > the lockdep support.
> > 
> > rwsems have a non_owner variant for these kinds of uses cases,
> > we should do the same for blk_mq_freeze_queue to annoate the callsite
> > instead of a global flag.
>  
> Here it isn't real rwsem, and lockdep doesn't have non_owner variant
> for rwsem_acquire() and rwsem_release().

Hmm, it looks like down_read_non_owner completely skips lockdep,
which seems rather problematic.  Sure we can't really track an
owner, but having it take part in the lock chain would be extremely
useful.  Whatever we're using there should work for the freeze
protection.

> Another corner case is blk_mark_disk_dead() in which freeze & unfreeze
> may be run from different task contexts too.

Yes, this is a pretty questionable one though as we should be able
to unfreeze as soon as the dying bit is set.  Separate discussion,
though.

Either way the non-ownership should be per call and not a queue or
tagset flag.
Ming Lei Oct. 23, 2024, 7:59 a.m. UTC | #13
On Tue, Oct 22, 2024 at 08:05:32AM -0700, Bart Van Assche wrote:
> On 10/17/24 6:35 PM, Ming Lei wrote:
> >                 -> #1 (q->q_usage_counter){++++}-{0:0}:
> 
> What code in the upstream kernel associates lockdep information
> with a *counter*? I haven't found it. Did I perhaps overlook something?

Please look at fs/kernfs/dir.c; there should be more examples in the kernel.


Thanks,
Ming
Bart Van Assche Oct. 23, 2024, 6:05 p.m. UTC | #14
On 10/23/24 12:59 AM, Ming Lei wrote:
> On Tue, Oct 22, 2024 at 08:05:32AM -0700, Bart Van Assche wrote:
>> On 10/17/24 6:35 PM, Ming Lei wrote:
>>>                  -> #1 (q->q_usage_counter){++++}-{0:0}:
>>
>> What code in the upstream kernel associates lockdep information
>> with a *counter*? I haven't found it. Did I perhaps overlook something?
> 
> Please look fs/kernfs/dir.c, and there should be more in linux kernel.

From block/blk-core.c:

	error = percpu_ref_init(&q->q_usage_counter,
				blk_queue_usage_counter_release,
				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);

The per-cpu ref implementation occurs in the following source files: 
include/linux/percpu-refcount.h and lib/percpu-refcount.c. It is not
clear to me how fs/kernfs/dir.c is related to q->q_usage_counter?

Thanks,

Bart.

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index bc5e8c5eaac9..2c3ca6d405e2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -384,6 +384,7 @@  static void blk_timeout_work(struct work_struct *work)
 
 struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
 {
+	static struct lock_class_key __q_usage_counter_key;
 	struct request_queue *q;
 	int error;
 
@@ -441,6 +442,8 @@  struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
 				PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
 	if (error)
 		goto fail_stats;
+	lockdep_init_map(&q->q_usage_counter_map, "q->q_usage_counter",
+			 &__q_usage_counter_key, 0);
 
 	q->nr_requests = BLKDEV_DEFAULT_RQ;
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5463697a8442..d0edac3b8b08 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -188,6 +188,7 @@  static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(BLOCKING),
 	HCTX_FLAG_NAME(NO_SCHED),
 	HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
+	HCTX_FLAG_NAME(SKIP_FREEZE_LOCKDEP),
 };
 #undef HCTX_FLAG_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4b2c8e940f59..b1b970650641 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -122,7 +122,10 @@  void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 
 void blk_freeze_queue_start(struct request_queue *q)
 {
+	int sub_class;
+
 	mutex_lock(&q->mq_freeze_lock);
+	sub_class = q->mq_freeze_depth;
 	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
 		mutex_unlock(&q->mq_freeze_lock);
@@ -131,6 +134,12 @@  void blk_freeze_queue_start(struct request_queue *q)
 	} else {
 		mutex_unlock(&q->mq_freeze_lock);
 	}
+	/*
+	 * model as down_write_trylock() so that two concurrent queue freezes
+	 * are allowed
+	 */
+	if (blk_queue_freeze_lockdep(q))
+		rwsem_acquire(&q->q_usage_counter_map, sub_class, 1, _RET_IP_);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
@@ -188,6 +197,9 @@  void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 		wake_up_all(&q->mq_freeze_wq);
 	}
 	mutex_unlock(&q->mq_freeze_lock);
+
+	if (blk_queue_freeze_lockdep(q))
+		rwsem_release(&q->q_usage_counter_map, _RET_IP_);
 }
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -4185,6 +4197,9 @@  void blk_mq_destroy_queue(struct request_queue *q)
 	blk_queue_start_drain(q);
 	blk_mq_freeze_queue_wait(q);
 
+	/* counter pair of acquire in blk_queue_start_drain */
+	if (blk_queue_freeze_lockdep(q))
+		rwsem_release(&q->q_usage_counter_map, _RET_IP_);
 	blk_sync_queue(q);
 	blk_mq_cancel_work_sync(q);
 	blk_mq_exit_queue(q);
diff --git a/block/blk.h b/block/blk.h
index c718e4291db0..d6274f3bece9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@ 
 
 #include <linux/bio-integrity.h>
 #include <linux/blk-crypto.h>
+#include <linux/lockdep.h>
 #include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
 #include <linux/sched/sysctl.h>
 #include <linux/timekeeping.h>
@@ -43,6 +44,8 @@  void bio_await_chain(struct bio *bio);
 
 static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
 {
+	/* model as down_read() for lockdep */
+	rwsem_acquire_read(&q->q_usage_counter_map, 0, 0, _RET_IP_);
 	rcu_read_lock();
 	if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
 		goto fail;
@@ -56,12 +59,18 @@  static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
 		goto fail_put;
 
 	rcu_read_unlock();
+	/*
+	 * queue exit often happens in another context, so we simply annotate
+	 * the release here; lots of cases are still covered
+	 */
+	rwsem_release(&q->q_usage_counter_map, _RET_IP_);
 	return true;
 
 fail_put:
 	blk_queue_exit(q);
 fail:
 	rcu_read_unlock();
+	rwsem_release(&q->q_usage_counter_map, _RET_IP_);
 	return false;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index 1c05dd4c6980..4016a83a0d83 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -722,6 +722,9 @@  void del_gendisk(struct gendisk *disk)
 		blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
 		__blk_mq_unfreeze_queue(q, true);
 	} else {
+		/* counter pair of acquire in blk_queue_start_drain */
+		if (blk_queue_freeze_lockdep(q))
+			rwsem_release(&q->q_usage_counter_map, _RET_IP_);
 		if (queue_is_mq(q))
 			blk_mq_exit_queue(q);
 	}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ba6508455e18..6575f0f5a5fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4528,7 +4528,7 @@  int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
 		/* Reserved for fabric connect and keep alive */
 		set->reserved_tags = 2;
 	set->numa_node = ctrl->numa_node;
-	set->flags = BLK_MQ_F_NO_SCHED;
+	set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_SKIP_FREEZE_LOCKDEP;
 	if (ctrl->ops->flags & NVME_F_BLOCKING)
 		set->flags |= BLK_MQ_F_BLOCKING;
 	set->cmd_size = cmd_size;
@@ -4598,7 +4598,7 @@  int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
 		/* Reserved for fabric connect */
 		set->reserved_tags = 1;
 	set->numa_node = ctrl->numa_node;
-	set->flags = BLK_MQ_F_SHOULD_MERGE;
+	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SKIP_FREEZE_LOCKDEP;
 	if (ctrl->ops->flags & NVME_F_BLOCKING)
 		set->flags |= BLK_MQ_F_BLOCKING;
 	set->cmd_size = cmd_size;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4fecf46ef681..9c5c9dc0e7e2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -687,7 +687,10 @@  enum {
 	 * or shared hwqs instead of 'mq-deadline'.
 	 */
 	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 6,
-	BLK_MQ_F_ALLOC_POLICY_START_BIT = 7,
+
+	BLK_MQ_F_SKIP_FREEZE_LOCKDEP	= 1 << 7,
+
+	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 };
 #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b959da28..5f25521bf2f6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@ 
 #include <linux/uuid.h>
 #include <linux/xarray.h>
 #include <linux/file.h>
+#include <linux/lockdep.h>
 
 struct module;
 struct request_queue;
@@ -471,6 +472,9 @@  struct request_queue {
 	struct xarray		hctx_table;
 
 	struct percpu_ref	q_usage_counter;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	q_usage_counter_map;
+#endif
 
 	struct request		*last_merge;
 
@@ -635,6 +639,8 @@  void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 #define blk_queue_skip_tagset_quiesce(q) \
 	((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
+#define blk_queue_freeze_lockdep(q) \
+	!(q->tag_set->flags & BLK_MQ_F_SKIP_FREEZE_LOCKDEP)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);