[V2,1/3] block: add blk_mq_enter_no_io() and blk_mq_exit_no_io()

Message ID 20250403025214.1274650-2-ming.lei@redhat.com (mailing list archive)
State New
Series block: fix lock dependency between freeze and elevator lock

Commit Message

Ming Lei April 3, 2025, 2:52 a.m. UTC
Add blk_mq_enter_no_io() and blk_mq_exit_no_io() for preventing the queue
from handling any FS or passthrough IO while keeping the queue in the
non-frozen state.

The two added APIs can avoid many potential lock risks related to the
freeze lock and the elevator lock, because the lock implied by the new
APIs is a block-layer-local lock: it is not acquired in the IO code path,
so it cannot become connected to the many global locks taken there.

Also add memsave variants of the two APIs, since fs_reclaim is not allowed
inside a blk_mq_enter_no_io() section.
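
A minimal usage sketch, assuming a block-layer-internal caller (the
function and its critical section are illustrative, not part of this
patch):

	/*
	 * Illustrative caller: keep FS/passthrough IO out of the queue
	 * while updating queue state, without holding a full freeze.
	 */
	static void example_update_queue(struct request_queue *q)
	{
		unsigned int memflags;

		/* enter no-IO mode; also applies memalloc_noio_save() */
		memflags = blk_mq_enter_no_io_memsave(q);

		/*
		 * ... update elevator/queue state; blk_queue_enter() and
		 * __bio_queue_enter() callers wait until we are done ...
		 */

		/* leave no-IO mode and restore the memalloc flags */
		blk_mq_exit_no_io_memrestore(q, memflags);
	}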

Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c       |  6 ++++--
 block/blk-mq.c         | 27 +++++++++++++++++++++++++--
 block/blk-mq.h         | 19 +++++++++++++++++++
 block/blk.h            |  5 +++--
 include/linux/blkdev.h |  8 ++++++++
 5 files changed, 59 insertions(+), 6 deletions(-)

Comments

Christoph Hellwig April 3, 2025, 5:44 a.m. UTC | #1
On Thu, Apr 03, 2025 at 10:52:08AM +0800, Ming Lei wrote:
> Add blk_mq_enter_no_io() and blk_mq_exit_no_io() for preventing the queue
> from handling any FS or passthrough IO while keeping the queue in the
> non-frozen state.

How does that differ from the actual freeze?  Please document that
clearly in the commit log and in kerneldoc comments, and do an analysis
of which callers should do the full freeze and which the limited I/O
freeze.
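
For illustration, one possible shape for the requested kerneldoc, derived
from the commit message and the implementation below (the wording is a
sketch, not part of the patch):

	/**
	 * blk_mq_enter_no_io - stop new FS/passthrough IO from entering a queue
	 * @q: request queue
	 *
	 * Transiently freezes @q to drain in-flight IO, then releases the
	 * freeze while keeping q->no_io set, so blk_queue_enter() and
	 * __bio_queue_enter() block until blk_mq_exit_no_io() is called.
	 * Unlike a held freeze, the lock implied here is local to the
	 * block layer and is never acquired in the IO path.
	 */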

Also the name is really unfortunate - no_io has a very clear connotation
for memory allocations, so this should be using something else.

> Also add memsave variants of the two APIs, since fs_reclaim is not allowed
> inside a blk_mq_enter_no_io() section.

Please explain why.



> index ae8494d88897..d117fa18b394 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -222,8 +222,7 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
>  	bool unfreeze;
>  
>  	mutex_lock(&q->mq_freeze_lock);
> -	if (force_atomic)
> -		q->q_usage_counter.data->force_atomic = true;
> +	q->q_usage_counter.data->force_atomic = force_atomic;
>  	q->mq_freeze_depth--;
>  	WARN_ON_ONCE(q->mq_freeze_depth < 0);
>  	if (!q->mq_freeze_depth) {

This is a completely unrelated cleanup.

> +void blk_mq_enter_no_io(struct request_queue *q)
> +{
> +	blk_mq_freeze_queue_nomemsave(q);
> +	q->no_io = true;
> +	if (__blk_mq_unfreeze_queue(q, true))
> +		blk_unfreeze_release_lock(q);

So this freezes the queue, sets a flag to not do I/O, then unfreezes it.
AFAIK it is just a freeze without the automatic recursion.

But maybe I'm missing something?

> +	if ((blk_queue_pm_only(q) &&
> +	    (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) ||
> +			blk_queue_no_io(q))

The indentation is very inconsistent here.  This would look
more reasonable:

	if (blk_queue_no_io(q) ||
	    (blk_queue_pm_only(q) &&
	     (!pm || queue_rpm_status(q) == RPM_SUSPENDED))) {

Also, as this logic is duplicated, it might make sense to de-dup it.
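
A sketch of one possible de-duplication, assuming a hypothetical helper
(the blk_pm_resume_queue()-based wait conditions in blk-core.c would need
an analogous change):

	/*
	 * Hypothetical helper for the duplicated enter-blocked test in
	 * blk_try_enter_queue().
	 */
	static inline bool blk_queue_enter_blocked(struct request_queue *q,
						   bool pm)
	{
		return blk_queue_no_io(q) ||
		       (blk_queue_pm_only(q) &&
			(!pm || queue_rpm_status(q) == RPM_SUSPENDED));
	}
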
Ming Lei April 3, 2025, 10:22 a.m. UTC | #2
On Thu, Apr 03, 2025 at 07:44:27AM +0200, Christoph Hellwig wrote:
[...]
> > +void blk_mq_enter_no_io(struct request_queue *q)
> > +{
> > +	blk_mq_freeze_queue_nomemsave(q);
> > +	q->no_io = true;
> > +	if (__blk_mq_unfreeze_queue(q, true))
> > +		blk_unfreeze_release_lock(q);
> 
> So this freezes the queue, sets a flag to not do I/O, then unfreezes it.
> AFAIK it is just a freeze without the automatic recursion.
> 
> But maybe I'm missing something?

Yeah, it looks like the lockdep modeling for blk_mq_enter_no_io() is wrong,
and the part in __bio_queue_enter() is missed.

So this approach doesn't work.

Now the dependency between the freeze lock and the elevator lock looks like
one source of trouble, such as [1], which is a real deadlock risk.

And there are likely more.

[1] https://lore.kernel.org/linux-block/7755.1743228130@turing-police/#tReviewed-by


Thanks,
Ming

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 4623de79effa..a1388d675b8d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -319,7 +319,8 @@  int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 		smp_rmb();
 		wait_event(q->mq_freeze_wq,
 			   (!q->mq_freeze_depth &&
-			    blk_pm_resume_queue(pm, q)) ||
+			    blk_pm_resume_queue(pm, q) &&
+			     !blk_queue_no_io(q)) ||
 			   blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
@@ -352,7 +353,8 @@  int __bio_queue_enter(struct request_queue *q, struct bio *bio)
 		smp_rmb();
 		wait_event(q->mq_freeze_wq,
 			   (!q->mq_freeze_depth &&
-			    blk_pm_resume_queue(false, q)) ||
+			    blk_pm_resume_queue(false, q) &&
+			     !blk_queue_no_io(q)) ||
 			   test_bit(GD_DEAD, &disk->state));
 		if (test_bit(GD_DEAD, &disk->state))
 			goto dead;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ae8494d88897..d117fa18b394 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -222,8 +222,7 @@  bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 	bool unfreeze;
 
 	mutex_lock(&q->mq_freeze_lock);
-	if (force_atomic)
-		q->q_usage_counter.data->force_atomic = true;
+	q->q_usage_counter.data->force_atomic = force_atomic;
 	q->mq_freeze_depth--;
 	WARN_ON_ONCE(q->mq_freeze_depth < 0);
 	if (!q->mq_freeze_depth) {
@@ -278,6 +277,30 @@  void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map blk_mq_no_io_map =
+	STATIC_LOCKDEP_MAP_INIT("blk_mq_no_io", &blk_mq_no_io_map);
+#endif
+
+void blk_mq_enter_no_io(struct request_queue *q)
+{
+	blk_mq_freeze_queue_nomemsave(q);
+	q->no_io = true;
+	if (__blk_mq_unfreeze_queue(q, true))
+		blk_unfreeze_release_lock(q);
+
+	lock_acquire_exclusive(&blk_mq_no_io_map, 0, 0, NULL, _RET_IP_);
+}
+
+void blk_mq_exit_no_io(struct request_queue *q)
+{
+	lock_release(&blk_mq_no_io_map, _RET_IP_);
+
+	blk_mq_freeze_queue_nomemsave(q);
+	q->no_io = false;
+	blk_mq_unfreeze_queue_nomemrestore(q);
+}
+
 /**
  * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
  * @set: tag_set to wait on
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3011a78cf16a..f49070c8c05f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -452,4 +452,23 @@  static inline bool blk_mq_can_poll(struct request_queue *q)
 		q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
 }
 
+void blk_mq_enter_no_io(struct request_queue *q);
+void blk_mq_exit_no_io(struct request_queue *q);
+
+static inline unsigned int __must_check
+blk_mq_enter_no_io_memsave(struct request_queue *q)
+{
+	unsigned int memflags = memalloc_noio_save();
+
+	blk_mq_enter_no_io(q);
+	return memflags;
+}
+
+static inline void
+blk_mq_exit_no_io_memrestore(struct request_queue *q, unsigned int memflags)
+{
+	blk_mq_exit_no_io(q);
+	memalloc_noio_restore(memflags);
+}
+
 #endif
diff --git a/block/blk.h b/block/blk.h
index 006e3be433d2..7d0994c1d3ad 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -56,8 +56,9 @@  static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
 	 * The code that increments the pm_only counter must ensure that the
 	 * counter is globally visible before the queue is unfrozen.
 	 */
-	if (blk_queue_pm_only(q) &&
-	    (!pm || queue_rpm_status(q) == RPM_SUSPENDED))
+	if ((blk_queue_pm_only(q) &&
+	    (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) ||
+			blk_queue_no_io(q))
 		goto fail_put;
 
 	rcu_read_unlock();
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e39c45bc0a97..1b8fd63eee80 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -498,6 +498,13 @@  struct request_queue {
 
 	int			quiesce_depth;
 
+	/*
+	 * Prevent queue from handling IO
+	 *
+	 * keep it in same cache line with q_usage_counter
+	 */
+	bool			no_io;
+
 	struct gendisk		*disk;
 
 	/*
@@ -679,6 +686,7 @@  void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 #define blk_queue_skip_tagset_quiesce(q) \
 	((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
+#define blk_queue_no_io(q)	(q->no_io)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);