diff mbox series

loop: make autoclear operation asynchronous

Message ID de6ec247-4a2d-7c3e-3700-90604f88e901@i-love.sakura.ne.jp (mailing list archive)
State New, archived
Headers show
Series loop: make autoclear operation asynchronous | expand

Commit Message

Tetsuo Handa Dec. 1, 2021, 2:41 p.m. UTC
On 2021/11/30 21:57, Christoph Hellwig wrote:
> On Mon, Nov 29, 2021 at 07:36:27PM +0900, Tetsuo Handa wrote:
>> If the caller just want to call ioctl(LOOP_CTL_GET_FREE) followed by
>> ioctl(LOOP_CONFIGURE), deferring __loop_clr_fd() would be fine.
>>
>> But the caller might want to unmount as soon as fput(filp) from __loop_clr_fd() completes.
>> I think we need to wait for __loop_clr_fd() from lo_release() to complete.
> 
> Anything else could have a reference to this or other files as well.
> So I can't see how deferring the clear to a different context can be
> any kind of problem in practice.
> 

OK. Here is a patch.
Is this better than temporarily dropping disk->open_mutex ?

From 1405d604f1a0aa153de595f607726f0dcbe5c784 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 1 Dec 2021 23:31:20 +0900
Subject: [PATCH] loop: make autoclear operation asynchronous

syzbot is reporting a circular locking problem at __loop_clr_fd() [1],
because commit 87579e9b7d8dc36e ("loop: use worker per cgroup instead of
kworker") calls destroy_workqueue() with disk->open_mutex held.

This circular dependency cannot be broken unless we call __loop_clr_fd()
without holding disk->open_mutex. There are two approaches.

One is to temporarily drop disk->open_mutex when calling __loop_clr_fd().

  -  __loop_clr_fd(lo, true);
  +  mutex_unlock(&lo->lo_disk->open_mutex);
  +  __loop_clr_fd(lo, false);
  +  mutex_lock(&lo->lo_disk->open_mutex);

This should work because

  (a) __loop_clr_fd() can be called without disk->open_mutex held, and
      takes disk->open_mutex if needed when called by ioctl(LOOP_CLR_FD)

  (b) lo_release() is called by blkdev_put_whole() via
      bdev->bd_disk->fops->release from blkdev_put() (maybe via
      blkdev_put_part()) immediately before dropping disk->open_mutex

  (c) there is no resource to protect after dropping disk->open_mutex
      till blkdev_put() completes

are true.

The other is to defer __loop_clr_fd() to a WQ context. This should work
given that

  (d) refcount on resources accessed by __loop_clr_fd() are taken before
      blkdev_put() drops refcount

  (e) refcount on resources accessed by __loop_clr_fd() are dropped after
      __loop_clr_fd() completes

  (f) the caller is not trying to e.g. unmount as soon as returning from
      loop_release()

  (g) the WQ context does not introduce new locking problems

are true. This patch implements (d) and (e).

Link: https://syzkaller.appspot.com/bug?extid=643e4ce4b6ad1347d372 [1]
Reported-by: syzbot <syzbot+643e4ce4b6ad1347d372@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 drivers/block/loop.c | 65 ++++++++++++++++++++++++--------------------
 drivers/block/loop.h |  1 +
 2 files changed, 37 insertions(+), 29 deletions(-)

Comments

Christoph Hellwig Dec. 2, 2021, 7:22 a.m. UTC | #1
On Wed, Dec 01, 2021 at 11:41:23PM +0900, Tetsuo Handa wrote:
> OK. Here is a patch.
> Is this better than temporarily dropping disk->open_mutex ?

This looks much better, and also cleans up the horrible locking warts
in __loop_clr_fd.
Tetsuo Handa Dec. 2, 2021, 11:03 a.m. UTC | #2
On 2021/12/02 16:22, Christoph Hellwig wrote:
> On Wed, Dec 01, 2021 at 11:41:23PM +0900, Tetsuo Handa wrote:
>> OK. Here is a patch.
>> Is this better than temporarily dropping disk->open_mutex ?
> 
> This looks much better, and also cleans up the horrible locking warts
> in __loop_clr_fd.
> 

What do "the horrible locking warts" refer to? The approach below
temporarily drops disk->open_mutex. I think there is no locking difference between
synchronous and asynchronous...

Anyway, I resent
https://lkml.kernel.org/r/d1f760f9-cdb2-f40d-33d8-bfa517c731be@i-love.sakura.ne.jp
in order to apply before "loop: replace loop_validate_mutex with loop_validate_spinlock".

---
 drivers/block/loop.c | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ba76319b5544..31d3fbe67fea 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1082,7 +1082,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	return error;
 }
 
-static void __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
@@ -1153,31 +1153,15 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
 		int err;
 
-		/*
-		 * open_mutex has been held already in release path, so don't
-		 * acquire it if this function is called in such case.
-		 *
-		 * If the reread partition isn't from release path, lo_refcnt
-		 * must be at least one and it can only become zero when the
-		 * current holder is released.
-		 */
-		if (!release)
-			mutex_lock(&lo->lo_disk->open_mutex);
+		mutex_lock(&lo->lo_disk->open_mutex);
 		err = bdev_disk_changed(lo->lo_disk, false);
-		if (!release)
-			mutex_unlock(&lo->lo_disk->open_mutex);
+		mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo->lo_number, err);
 		/* Device is gone, no point in returning error */
 	}
 
-	/*
-	 * lo->lo_state is set to Lo_unbound here after above partscan has
-	 * finished. There cannot be anybody else entering __loop_clr_fd() as
-	 * Lo_rundown state protects us from all the other places trying to
-	 * change the 'lo' device.
-	 */
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
@@ -1185,11 +1169,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
 
-	/*
-	 * Need not hold lo_mutex to fput backing file. Calling fput holding
-	 * lo_mutex triggers a circular lock dependency possibility warning as
-	 * fput can take open_mutex which is usually taken before lo_mutex.
-	 */
 	fput(filp);
 }
 
@@ -1222,7 +1201,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
 
-	__loop_clr_fd(lo, false);
+	__loop_clr_fd(lo);
 	return 0;
 }
 
@@ -1747,7 +1726,9 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		__loop_clr_fd(lo, true);
+		mutex_unlock(&lo->lo_disk->open_mutex);
+		__loop_clr_fd(lo);
+		mutex_lock(&lo->lo_disk->open_mutex);
 		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
Jan Kara Dec. 2, 2021, 12:16 p.m. UTC | #3
On Wed 01-12-21 23:41:23, Tetsuo Handa wrote:
> On 2021/11/30 21:57, Christoph Hellwig wrote:
> > On Mon, Nov 29, 2021 at 07:36:27PM +0900, Tetsuo Handa wrote:
> >> If the caller just want to call ioctl(LOOP_CTL_GET_FREE) followed by
> >> ioctl(LOOP_CONFIGURE), deferring __loop_clr_fd() would be fine.
> >>
> >> But the caller might want to unmount as soon as fput(filp) from __loop_clr_fd() completes.
> >> I think we need to wait for __loop_clr_fd() from lo_release() to complete.
> > 
> > Anything else could have a reference to this or other files as well.
> > So I can't see how deferring the clear to a different context can be
> > any kind of problem in practice.
> > 
> 
> OK. Here is a patch.
> Is this better than temporarily dropping disk->open_mutex ?

The patch looks good to me. Just one suggestion for improvement:

> +static void loop_schedule_rundown(struct loop_device *lo)
> +{
> +	struct block_device *bdev = lo->lo_device;
> +	struct gendisk *disk = lo->lo_disk;
> +
> +	__module_get(disk->fops->owner);
> +	kobject_get(&bdev->bd_device.kobj);
> +	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
> +	queue_work(system_long_wq, &lo->rundown_work);
>  }

Why not scheduling this using task_work_add()? It solves the locking
context problems, has generally lower overhead than normal work (no need to
schedule), and avoids possible unexpected side-effects of releasing
loopback device later. Also task work is specifically designed so that one
task work can queue another task work so we should be fine using it.

								Honza
Tetsuo Handa Dec. 2, 2021, 2:39 p.m. UTC | #4
On 2021/12/02 21:16, Jan Kara wrote:
> Why not scheduling this using task_work_add()? It solves the locking
> context problems, has generally lower overhead than normal work (no need to
> schedule), and avoids possible unexpected side-effects of releasing
> loopback device later. Also task work is specifically designed so that one
> task work can queue another task work so we should be fine using it.

Indeed. But that will make really no difference between synchronous approach
( https://lkml.kernel.org/r/fb6adcdc-fb56-3b90-355b-3f5a81220f2b@i-love.sakura.ne.jp )
and asynchronous approach
( https://lkml.kernel.org/r/d1f760f9-cdb2-f40d-33d8-bfa517c731be@i-love.sakura.ne.jp ), for
disk->open_mutex is the only lock held when lo_release() is called.

Both approaches allow __loop_clr_fd() to run with no lock held, and both approaches
need to be aware of what actions are taken by blkdev_put() before and after dropping
disk->open_mutex. And bdev->bd_disk->fops->release() is the last action taken before
dropping disk->open_mutex.

What is so happier with preventing what will be done after disk->open_mutex is dropped
by blkdev_put() (i.e. __module_get() + kobject_get() before blkdev_put() calls
kobject_put() + module_put(), and kobject_put() + module_put() upon task_work_run()),
compared to doing things that can be done without disk->open_mutex (i.e. calling
__loop_clr_fd() without disk->open_mutex) ?
Jan Kara Dec. 2, 2021, 6:05 p.m. UTC | #5
On Thu 02-12-21 23:39:42, Tetsuo Handa wrote:
> On 2021/12/02 21:16, Jan Kara wrote:
> > Why not scheduling this using task_work_add()? It solves the locking
> > context problems, has generally lower overhead than normal work (no need to
> > schedule), and avoids possible unexpected side-effects of releasing
> > loopback device later. Also task work is specifically designed so that one
> > task work can queue another task work so we should be fine using it.
> 
> Indeed. But that will make really no difference between synchronous approach
> ( https://lkml.kernel.org/r/fb6adcdc-fb56-3b90-355b-3f5a81220f2b@i-love.sakura.ne.jp )
> and asynchronous approach
> ( https://lkml.kernel.org/r/d1f760f9-cdb2-f40d-33d8-bfa517c731be@i-love.sakura.ne.jp ), for
> disk->open_mutex is the only lock held when lo_release() is called.
> 
> Both approaches allow __loop_clr_fd() to run with no lock held, and both
> approaches need to be aware of what actions are taken by blkdev_put()
> before and after dropping disk->open_mutex. And
> bdev->bd_disk->fops->release() is the last action taken before dropping
> disk->open_mutex.
> 
> What is so happier with preventing what will be done after
> disk->open_mutex is dropped by blkdev_put() (i.e. __module_get() +
> kobject_get() before blkdev_put() calls kobject_put() + module_put(), and
> kobject_put() + module_put() upon task_work_run()), compared to doing
> things that can be done without disk->open_mutex (i.e. calling
> __loop_clr_fd() without disk->open_mutex) ?

So the advantage of using task work instead of just dropping open_mutex
before calling __loop_clr_fd() is that if something in block/bdev.c ever
changes and starts relying on open_mutex being held throughout blkdev_put()
then loop device handling will not suddently become broken. Generally it is
a bad practice to drop locks (even temporarily) upper layers have acquired.
Sometimes it is inevitable, but in this case we can avoid that... So I'd
prefer if we used task work instead of dropping open_mutex inside loop
driver. Not sure what's Christoph's opinion though, I don't feel *that*
strongly about it.

								Honza
Christoph Hellwig Dec. 3, 2021, 6:50 a.m. UTC | #6
On Thu, Dec 02, 2021 at 07:05:00PM +0100, Jan Kara wrote:
> So the advantage of using task work instead of just dropping open_mutex
> before calling __loop_clr_fd() is that if something in block/bdev.c ever
> changes and starts relying on open_mutex being held throughout blkdev_put()
> then loop device handling will not suddently become broken. Generally it is
> a bad practice to drop locks (even temporarily) upper layers have acquired.
> Sometimes it is inevitable in in this case we can avoid that... So I'd
> prefer if we used task work instead of dropping open_mutex inside loop
> driver. Not sure what's Christoph's opinion though, I don't feel *that*
> strongly about it.

Dropping the lock is a complete no go as it doesn't allow proper
reasoning about the locking scheme in the block layer.

task_work_add sounds nice, but it is currently not exported which might
be for a reason (I don't really have any experience with it).
Tetsuo Handa Dec. 3, 2021, 11:01 a.m. UTC | #7
On 2021/12/03 15:50, Christoph Hellwig wrote:
> task_work_add sounds nice, but it is currently not exported which might
> be for a reason (I don't really have any experience with it).

I didn't find a reason not to export. But generally task_work_add() users
seem to implement a fallback which uses a WQ in case task_work_add() failed
(i.e. exit_task_work() was already called from do_exit()) or task_work_add()
cannot be used (e.g. the caller is a kernel thread).

I don't know if there is possibility that a kernel thread calls blkdev_put(),
but implementing the fallback path after all requires WQ. Thus, I think that
starting from WQ only and see if something breaks is fine.
Tetsuo Handa Dec. 8, 2021, 9:56 a.m. UTC | #8
Can we apply https://lkml.kernel.org/r/d1f760f9-cdb2-f40d-33d8-bfa517c731be@i-love.sakura.ne.jp ?

On 2021/12/03 20:01, Tetsuo Handa wrote:
> On 2021/12/03 15:50, Christoph Hellwig wrote:
>> task_work_add sounds nice, but it is currently not exported which might
>> be for a reason (I don't really have any experience with it).
> 
> I didn't find a reason not to export. But generally task_work_add() users
> seem to implement a fallback which uses a WQ in case task_work_add() failed
> (i.e. exit_task_work() was already called from do_exit()) or task_work_add()
> cannot be used (e.g. the caller is a kernel thread).
> 
> I don't know if there is possibility that a kernel thread calls blkdev_put(),
> but implementing the fallback path after all requires WQ. Thus, I think that
> starting from WQ only and see if something breaks is fine.
>
diff mbox series

Patch

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ba76319b5544..7f4ea06534c2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1082,7 +1082,7 @@  static int loop_configure(struct loop_device *lo, fmode_t mode,
 	return error;
 }
 
-static void __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
@@ -1144,8 +1144,6 @@  static void __loop_clr_fd(struct loop_device *lo, bool release)
 	/* let user-space know about this change */
 	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
-	/* This is safe: open() is still holding a reference. */
-	module_put(THIS_MODULE);
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
@@ -1153,44 +1151,52 @@  static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
 		int err;
 
-		/*
-		 * open_mutex has been held already in release path, so don't
-		 * acquire it if this function is called in such case.
-		 *
-		 * If the reread partition isn't from release path, lo_refcnt
-		 * must be at least one and it can only become zero when the
-		 * current holder is released.
-		 */
-		if (!release)
-			mutex_lock(&lo->lo_disk->open_mutex);
+		mutex_lock(&lo->lo_disk->open_mutex);
 		err = bdev_disk_changed(lo->lo_disk, false);
-		if (!release)
-			mutex_unlock(&lo->lo_disk->open_mutex);
+		mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo->lo_number, err);
 		/* Device is gone, no point in returning error */
 	}
 
-	/*
-	 * lo->lo_state is set to Lo_unbound here after above partscan has
-	 * finished. There cannot be anybody else entering __loop_clr_fd() as
-	 * Lo_rundown state protects us from all the other places trying to
-	 * change the 'lo' device.
-	 */
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
+
+	fput(filp);
+}
+
+static void loop_rundown_completed(struct loop_device *lo)
+{
 	mutex_lock(&lo->lo_mutex);
 	lo->lo_state = Lo_unbound;
 	mutex_unlock(&lo->lo_mutex);
+	module_put(THIS_MODULE);
+}
 
-	/*
-	 * Need not hold lo_mutex to fput backing file. Calling fput holding
-	 * lo_mutex triggers a circular lock dependency possibility warning as
-	 * fput can take open_mutex which is usually taken before lo_mutex.
-	 */
-	fput(filp);
+static void loop_rundown_workfn(struct work_struct *work)
+{
+	struct loop_device *lo = container_of(work, struct loop_device,
+					      rundown_work);
+	struct block_device *bdev = lo->lo_device;
+	struct gendisk *disk = lo->lo_disk;
+
+	__loop_clr_fd(lo);
+	kobject_put(&bdev->bd_device.kobj);
+	module_put(disk->fops->owner);
+	loop_rundown_completed(lo);
+}
+
+static void loop_schedule_rundown(struct loop_device *lo)
+{
+	struct block_device *bdev = lo->lo_device;
+	struct gendisk *disk = lo->lo_disk;
+
+	__module_get(disk->fops->owner);
+	kobject_get(&bdev->bd_device.kobj);
+	INIT_WORK(&lo->rundown_work, loop_rundown_workfn);
+	queue_work(system_long_wq, &lo->rundown_work);
 }
 
 static int loop_clr_fd(struct loop_device *lo)
@@ -1222,7 +1228,8 @@  static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
 
-	__loop_clr_fd(lo, false);
+	__loop_clr_fd(lo);
+	loop_rundown_completed(lo);
 	return 0;
 }
 
@@ -1747,7 +1754,7 @@  static void lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		__loop_clr_fd(lo, true);
+		loop_schedule_rundown(lo);
 		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 082d4b6bfc6a..918a7a2dc025 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,6 +56,7 @@  struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
+	struct work_struct      rundown_work;
 };
 
 struct loop_cmd {