diff mbox

[RFC] loop: make partition scanning reliable

Message ID 1422267319-8428-1-git-send-email-dh.herrmann@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

David Herrmann Jan. 26, 2015, 10:15 a.m. UTC
So far, loop-device partition-scanning is skipped with EBUSY if the
bd_mutex cannot be locked. This might happen, if someone runs open() /
close() / ioctl() in parallel to LOOP_SET_FD/LOOP_CHANGE_FD and friends.

__fput() on open files might get delayed arbitrarily, which means,
blkdev_put() might get called at any point in time, locking bd_mutex and
thus preventing any possible partition-rescanning in parallel. This
basically requires user-space to *always* run BLKRRSCAN after
LOOP_SET_FD/LOOP_CHANGE_FD as we cannot know whether someone else is about
to close their FD at the same time.

Fix this by running BLKRRPART without holding lo_ctl_mutex, thus, we will
no longer deadlock if we lock bd_mutex.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 block/ioctl.c        |  9 ++++++---
 drivers/block/loop.c | 49 ++++++++++++++++++++++++++++++-------------------
 include/linux/fs.h   |  1 +
 3 files changed, 37 insertions(+), 22 deletions(-)

Comments

Tejun Heo Jan. 26, 2015, 4:13 p.m. UTC | #1
Hello, David.

On Mon, Jan 26, 2015 at 11:15:19AM +0100, David Herrmann wrote:
> -static int blkdev_reread_part(struct block_device *bdev)
> +int blkdev_reread_part(struct block_device *bdev, int skipbusy)
>  {
>  	struct gendisk *disk = bdev->bd_disk;
>  	int res;
> @@ -159,12 +159,15 @@ static int blkdev_reread_part(struct block_device *bdev)
>  		return -EINVAL;
>  	if (!capable(CAP_SYS_ADMIN))
>  		return -EACCES;
> -	if (!mutex_trylock(&bdev->bd_mutex))
> +	if (!skipbusy)
> +		mutex_lock(&bdev->bd_mutex);
> +	else if (!mutex_trylock(&bdev->bd_mutex))
>  		return -EBUSY;

Do we actually need the mutex_trylock() path?  Why can't we just
always grab the mutex?

...
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 6cb1beb..4047985 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -637,7 +637,7 @@ out:
>   * new backing store is the same size and type as the old backing store.
>   */
>  static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
> -			  unsigned int arg)
> +			  unsigned int arg, int *rrpart)

bool *rrpart would be better but can't we communicate this through the
return value?  Wouldn't that be prettier?

Thanks.
Christoph Hellwig Jan. 26, 2015, 5:27 p.m. UTC | #2
On Mon, Jan 26, 2015 at 11:13:28AM -0500, Tejun Heo wrote:
> > @@ -159,12 +159,15 @@ static int blkdev_reread_part(struct block_device *bdev)
> >  		return -EINVAL;
> >  	if (!capable(CAP_SYS_ADMIN))
> >  		return -EACCES;
> > -	if (!mutex_trylock(&bdev->bd_mutex))
> > +	if (!skipbusy)
> > +		mutex_lock(&bdev->bd_mutex);
> > +	else if (!mutex_trylock(&bdev->bd_mutex))
> >  		return -EBUSY;
> 
> Do we actually need the mutex_trylock() path?  Why can't we just
> always grab the mutex?

I wondered about this, too.  The trylock goes all the way back
to when Al first factored out the partition re-reading into common
code ("[PATCH] partition table flush/read cleanup" in the historic tree
converted from bk), and even before that most drivers used a weird
dev_lock_part() consruct operating on a kdev_t, wich looked like a
trylock.

Given that blkdev_reread_part is invoked directly from the ioctl method
without any additional locking it seems fairly pointless for the
case where it's issue from userspace.  In addition to that the loop
driver, nbd and dasd issue it from a driver, as does the root mounting
code for md.

I would be surprised if any of these callers needs it, but a careful
audit would be useful.

I'd also really prefer a patch that makes loop, nbd and dasd call
blkdev_reread_part directly instead of by ioctl_by_bdev as a first
stage preparation, followed by the locking changes  and the changes
to loop.c in separate patch so each does one thing and can be properly
documented.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/block/ioctl.c b/block/ioctl.c
index 6c7bf90..01c6550 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,7 +150,7 @@  static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	}
 }
 
-static int blkdev_reread_part(struct block_device *bdev)
+int blkdev_reread_part(struct block_device *bdev, int skipbusy)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	int res;
@@ -159,12 +159,15 @@  static int blkdev_reread_part(struct block_device *bdev)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (!mutex_trylock(&bdev->bd_mutex))
+	if (!skipbusy)
+		mutex_lock(&bdev->bd_mutex);
+	else if (!mutex_trylock(&bdev->bd_mutex))
 		return -EBUSY;
 	res = rescan_partitions(disk, bdev);
 	mutex_unlock(&bdev->bd_mutex);
 	return res;
 }
+EXPORT_SYMBOL(blkdev_reread_part);
 
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
@@ -407,7 +410,7 @@  int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
 		break;
 	case BLKRRPART:
-		ret = blkdev_reread_part(bdev);
+		ret = blkdev_reread_part(bdev, 1);
 		break;
 	case BLKGETSIZE:
 		size = i_size_read(bdev->bd_inode);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb..4047985 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -637,7 +637,7 @@  out:
  * new backing store is the same size and type as the old backing store.
  */
 static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
-			  unsigned int arg)
+			  unsigned int arg, int *rrpart)
 {
 	struct file	*file, *old_file;
 	struct inode	*inode;
@@ -676,7 +676,7 @@  static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 
 	fput(old_file);
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-		ioctl_by_bdev(bdev, BLKRRPART, 0);
+		*rrpart = 1;
 	return 0;
 
  out_putf:
@@ -821,7 +821,7 @@  static void loop_config_discard(struct loop_device *lo)
 }
 
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
-		       struct block_device *bdev, unsigned int arg)
+		       struct block_device *bdev, unsigned int arg, int *rrpart)
 {
 	struct file	*file, *f;
 	struct inode	*inode;
@@ -917,7 +917,7 @@  static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	if (part_shift)
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-		ioctl_by_bdev(bdev, BLKRRPART, 0);
+		*rrpart = 1;
 
 	/* Grab the block_device to prevent its destruction after we
 	 * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
@@ -1064,7 +1064,8 @@  static int loop_clr_fd(struct loop_device *lo)
 }
 
 static int
-loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
+loop_set_status(struct loop_device *lo, const struct loop_info64 *info,
+		int *rrpart)
 {
 	int err;
 	struct loop_func_table *xfer;
@@ -1123,7 +1124,7 @@  loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 		lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
-		ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+		*rrpart = 1;
 	}
 
 	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
@@ -1223,7 +1224,8 @@  loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
 }
 
 static int
-loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
+loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg,
+		    int *rrpart)
 {
 	struct loop_info info;
 	struct loop_info64 info64;
@@ -1231,17 +1233,18 @@  loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
 	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
 		return -EFAULT;
 	loop_info64_from_old(&info, &info64);
-	return loop_set_status(lo, &info64);
+	return loop_set_status(lo, &info64, rrpart);
 }
 
 static int
-loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
+loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg,
+		  int *rrpart)
 {
 	struct loop_info64 info64;
 
 	if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
 		return -EFAULT;
-	return loop_set_status(lo, &info64);
+	return loop_set_status(lo, &info64, rrpart);
 }
 
 static int
@@ -1289,15 +1292,15 @@  static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
 	struct loop_device *lo = bdev->bd_disk->private_data;
-	int err;
+	int err, rrpart = 0;
 
 	mutex_lock_nested(&lo->lo_ctl_mutex, 1);
 	switch (cmd) {
 	case LOOP_SET_FD:
-		err = loop_set_fd(lo, mode, bdev, arg);
+		err = loop_set_fd(lo, mode, bdev, arg, &rrpart);
 		break;
 	case LOOP_CHANGE_FD:
-		err = loop_change_fd(lo, bdev, arg);
+		err = loop_change_fd(lo, bdev, arg, &rrpart);
 		break;
 	case LOOP_CLR_FD:
 		/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
@@ -1309,7 +1312,8 @@  static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
 			err = loop_set_status_old(lo,
-					(struct loop_info __user *)arg);
+					(struct loop_info __user *)arg,
+					&rrpart);
 		break;
 	case LOOP_GET_STATUS:
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
@@ -1318,7 +1322,8 @@  static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
 			err = loop_set_status64(lo,
-					(struct loop_info64 __user *) arg);
+					(struct loop_info64 __user *) arg,
+					&rrpart);
 		break;
 	case LOOP_GET_STATUS64:
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
@@ -1334,6 +1339,9 @@  static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	mutex_unlock(&lo->lo_ctl_mutex);
 
 out_unlocked:
+	if (rrpart)
+		blkdev_reread_part(bdev, 0);
+
 	return err;
 }
 
@@ -1429,7 +1437,7 @@  loop_info64_to_compat(const struct loop_info64 *info64,
 
 static int
 loop_set_status_compat(struct loop_device *lo,
-		       const struct compat_loop_info __user *arg)
+		       const struct compat_loop_info __user *arg, int *rrpart)
 {
 	struct loop_info64 info64;
 	int ret;
@@ -1437,7 +1445,7 @@  loop_set_status_compat(struct loop_device *lo,
 	ret = loop_info64_from_compat(arg, &info64);
 	if (ret < 0)
 		return ret;
-	return loop_set_status(lo, &info64);
+	return loop_set_status(lo, &info64, rrpart);
 }
 
 static int
@@ -1460,14 +1468,17 @@  static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 			   unsigned int cmd, unsigned long arg)
 {
 	struct loop_device *lo = bdev->bd_disk->private_data;
-	int err;
+	int err, rrpart = 0;
 
 	switch(cmd) {
 	case LOOP_SET_STATUS:
 		mutex_lock(&lo->lo_ctl_mutex);
 		err = loop_set_status_compat(
-			lo, (const struct compat_loop_info __user *) arg);
+			lo, (const struct compat_loop_info __user *) arg,
+			&rrpart);
 		mutex_unlock(&lo->lo_ctl_mutex);
+		if (rrpart)
+			blkdev_reread_part(bdev, 0);
 		break;
 	case LOOP_GET_STATUS:
 		mutex_lock(&lo->lo_ctl_mutex);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e..755a056 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2153,6 +2153,7 @@  extern const struct file_operations bad_sock_fops;
 #ifdef CONFIG_BLOCK
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
+extern int blkdev_reread_part(struct block_device *bdev, int skipbusy);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
 extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,