Message ID | 20220325063929.1773899-14-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/14] nbd: use the correct block_device in nbd_bdev_reset | expand |
Since __loop_clr_fd() is currently holding loop_validate_mutex and lo->lo_mutex, "avoid lo_mutex in ->release" part is incomplete. The intent of holding loop_validate_mutex (which causes disk->open_mutex => loop_validate_mutex => lo->lo_mutex => blk_mq_freeze_queue()/blk_mq_unfreeze_queue() chain) is to make loop_validate_file() traversal safe. Since ->release is called with disk->open_mutex held, and __loop_clr_fd() from lo_release() is called via ->release when disk_openers() == 0, we are guaranteed that "struct file" which will be passed to loop_validate_file() via fget() cannot be the loop device __loop_clr_fd(lo, true) will clear. Thus, there is no need to hold loop_validate_mutex and lo->lo_mutex from __loop_clr_fd() if release == true. diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2506193a4fd1..d47f3d86dd55 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1136,25 +1136,26 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) gfp_t gfp = lo->old_gfp_mask; /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. + * Make sure that Lo_rundown state becomes visible to loop_configure() + * and loop_change_fd(). When called from ->release, we are guaranteed + * that the "struct file" which loop_configure()/loop_change_fd() found + * via fget() is not this loop device. */ + if (!release) { + mutex_lock(&loop_validate_mutex); + mutex_unlock(&loop_validate_mutex); + } /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. + * It is a sign of something going wrong if lo->lo_state has changed + * while waiting for lo->lo_mutex. When called from ->release, we are + * guaranteed that the nobody is using this loop device. */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); + if (!release) { + mutex_lock(&lo->lo_mutex); + BUG_ON(lo->lo_state != Lo_rundown); + mutex_unlock(&lo->lo_mutex); + } if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); But thinking again about release == false case, which I wrote "It is acceptable for loop_validate_file() to succeed, for actual clear operation has not started yet.", I came to feel why it is acceptable to succeed. Even if loop_validate_file() was safely traversed due to serialization via loop_validate_mutex, I/O requests after loop_configure()/loop_change_fd() completed will fail. Is this behavior what we want? If we don't want I/O requests after loop_configure()/loop_change_fd() completed fail due to __loop_clr_fd(), it is not acceptable for loop_validate_file() to succeed. We should hold loop_validate_mutex before setting Lo_rundown in order to make sure that loop_validate_file() will see Lo_rundown state. That is, something like below will be expected? diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2506193a4fd1..a4ff94ca654f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1135,27 +1135,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) struct file *filp; gfp_t gfp = lo->old_gfp_mask; - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. - */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); @@ -1238,11 +1217,18 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Use global lock when setting Lo_rundown state in order to make sure + * that loop_validate_file() will fail if the "struct file" which + * loop_configure()/loop_change_fd() found via fget() was this loop + * device. The disk_openers(lo->lo_disk) > 1 test below guarantees that + * fget() did not return this loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* @@ -1257,11 +1243,11 @@ static int loop_clr_fd(struct loop_device *lo) */ if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); __loop_clr_fd(lo, false); return 0;
Hi Tetsuo, I'm a bit confused. Partially due to the two patches in one mail, but also because I can't actually find a try that they cleanly apply to. But let me add my thoughts here: - I think the change On Sat, Mar 26, 2022 at 11:52:36AM +0900, Tetsuo Handa wrote: > Since __loop_clr_fd() is currently holding loop_validate_mutex and lo->lo_mutex, > "avoid lo_mutex in ->release" part is incomplete. > > The intent of holding loop_validate_mutex (which causes disk->open_mutex => > loop_validate_mutex => lo->lo_mutex => blk_mq_freeze_queue()/blk_mq_unfreeze_queue() > chain) is to make loop_validate_file() traversal safe. > > Since ->release is called with disk->open_mutex held, and __loop_clr_fd() from > lo_release() is called via ->release when disk_openers() == 0, we are guaranteed > that "struct file" which will be passed to loop_validate_file() via fget() cannot > be the loop device __loop_clr_fd(lo, true) will clear. Thus, there is no need to > hold loop_validate_mutex and lo->lo_mutex from __loop_clr_fd() if release == true. This part looks reasonable. Can you give a signoff and proper commit log and I'll slot it in before the lo_refcount removal.
Thinking a bit more, I really don't think the existing any refcount check protects us against a different tread modififying the backing file. When a process has a file descriptor to a loop device open and is multithreaded (or forks) we can still have multiple threads manipulating the loop state. That being said I do not think we really need that refcount check at all - once loop_clr_fd set lo->lo_state to Lo_rundown under the global lock we know that loop_validate_file will error out on it due to the lo_state != Lo_bound check.
On 2022/03/29 22:25, Christoph Hellwig wrote: > Thinking a bit more, I really don't think the existing any refcount > check protects us against a different tread modifying the backing > file. When a process has a file descriptor to a loop device open > and is multithreaded (or forks) we can still have multiple threads > manipulating the loop state. Yes, I came to that answer. But > > That being said I do not think we really need that refcount check > at all - once loop_clr_fd set lo->lo_state to Lo_rundown under the > global lock we know that loop_validate_file will error out on it > due to the lo_state != Lo_bound check. if I think about non "a file descriptor to a loop device" case, I came to the opposite answer. Rather, shouldn't we check the refcount strictly like below? [PATCH] loop: avoid loop_validate_mutex/lo_mutex in ->release Since ->release is called with disk->open_mutex held, and __loop_clr_fd() from lo_release() is called via ->release when disk_openers() == 0, we are guaranteed that "struct file" which will be passed to loop_validate_file() via fget() cannot be the loop device __loop_clr_fd(lo, true) will clear. Thus, there is no need to hold loop_validate_mutex from __loop_clr_fd() if release == true. When I made commit 3ce6e1f662a91097 ("loop: reintroduce global lock for safe loop_validate_file() traversal"), I wrote "It is acceptable for loop_validate_file() to succeed, for actual clear operation has not started yet.". But now I came to feel why it is acceptable to succeed. It seems that the loop driver was added in Linux 1.3.68, and if (lo->lo_refcnt > 1) return -EBUSY; check in loop_clr_fd() was there from the beginning. The intent of this check was unclear. But now I think that current disk_openers(lo->lo_disk) > 1 form is there for three reasons. (1) Avoid I/O errors when some process which opens and reads from this loop device in response to uevent notification (e.g. systemd-udevd), as described in commit a1ecac3b0656a682 ("loop: Make explicit loop device destruction lazy"). This opener is short-lived because it is likely that the file descriptor used by that process is closed soon. (2) Avoid I/O errors caused by underlying layer of stacked loop devices (i.e. ioctl(some_loop_fd, LOOP_SET_FD, other_loop_fd)) being suddenly disappeared. This opener is long-lived because this reference is associated with not a file descriptor but lo->lo_backing_file. (3) Avoid I/O errors caused by underlying layer of mounted loop device (i.e. mount(some_loop_device, some_mount_point)) being suddenly disappeared. This opener is long-lived because this reference is associated with not a file descriptor but mount. While race in (1) might be acceptable, (2) and (3) should be checked racelessly. That is, make sure that __loop_clr_fd() will not run if loop_validate_file() succeeds, by doing refcount check with global lock held when explicit loop device destruction is requested. As a result of no longer waiting for lo->lo_mutex after setting Lo_rundown, we can remove pointless BUG_ON(lo->lo_state != Lo_rundown) check. Not-yet-signed-off. ;-) --- drivers/block/loop.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2506193a4fd1..6b813c592159 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1135,27 +1135,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) struct file *filp; gfp_t gfp = lo->old_gfp_mask; - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. - */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); @@ -1238,11 +1217,20 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* @@ -1257,11 +1245,11 @@ static int loop_clr_fd(struct loop_device *lo) */ if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); __loop_clr_fd(lo, false); return 0;
On Tue, Mar 29, 2022 at 11:02:15PM +0900, Tetsuo Handa wrote: > It seems that the loop driver was added in Linux 1.3.68, and > > if (lo->lo_refcnt > 1) > return -EBUSY; > > check in loop_clr_fd() was there from the beginning. The intent of this > check was unclear. Yes. > But now I think that current > > disk_openers(lo->lo_disk) > 1 > > form is there for three reasons. > > (1) Avoid I/O errors when some process which opens and reads from this > loop device in response to uevent notification (e.g. systemd-udevd), > as described in commit a1ecac3b0656a682 ("loop: Make explicit loop > device destruction lazy"). This opener is short-lived because it is > likely that the file descriptor used by that process is closed soon. Well. With the the uevent supression in the current series there won't be uevents until the capacity has been set to 0. More importantly anything that listens to theses kinds of uevents needs to be able to deal with I/O errors like this. > (2) Avoid I/O errors caused by underlying layer of stacked loop devices > (i.e. ioctl(some_loop_fd, LOOP_SET_FD, other_loop_fd)) being suddenly > disappeared. This opener is long-lived because this reference is > associated with not a file descriptor but lo->lo_backing_file. Again, if you clear the FD expecting I/O errors is the logical consequence. This is like saying we should work around seeing I/O errors when hot removing a physical device. > (3) Avoid I/O errors caused by underlying layer of mounted loop device > (i.e. mount(some_loop_device, some_mount_point)) being suddenly > disappeared. This opener is long-lived because this reference is > associated with not a file descriptor but mount. Same I/O error story. If you hot remove a nvme SSD you do expect error in the file system. This is a pretty clear action -> consequence relation. > While race in (1) might be acceptable, (2) and (3) should be checked > racelessly. That is, make sure that __loop_clr_fd() will not run if > loop_validate_file() succeeds, by doing refcount check with global lock > held when explicit loop device destruction is requested. > > As a result of no longer waiting for lo->lo_mutex after setting Lo_rundown, > we can remove pointless BUG_ON(lo->lo_state != Lo_rundown) check. I really do like this patch. And I think based on your description that we both agree that the disk_openers check is not needed for functional correctness as a malicious userspace can do concurrent operations even without the openers check. You want a protection against "I/O errors" when the FD is cleared on a live device, and with your patch we get that with the disk_openers check. I'm perfectly fine with that state for this series as it keeps the status quo. I just think this check that goes all the way back is actually a really bad idea that just provides some false security. But that isn't something we need to discuss here and now.
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bfd21af7aa38b..8ad8cfffdcbdc 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1261,7 +1261,7 @@ static int loop_clr_fd(struct loop_device *lo) * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ - if (atomic_read(&lo->lo_refcnt) > 1) { + if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; mutex_unlock(&lo->lo_mutex); return 0; @@ -1741,33 +1741,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) -{ - struct loop_device *lo = bdev->bd_disk->private_data; - int err; - - err = mutex_lock_killable(&lo->lo_mutex); - if (err) - return err; - if (lo->lo_state == Lo_deleting) - err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); - mutex_unlock(&lo->lo_mutex); - return err; -} - static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); /* @@ -1777,8 +1759,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode) __loop_clr_fd(lo, true); return; } - -out_unlock: mutex_unlock(&lo->lo_mutex); } @@ -1792,7 +1772,6 @@ static void lo_free_disk(struct gendisk *disk) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT @@ -2046,7 +2025,6 @@ static int loop_add(int i) */ if (!part_shift) disk->flags |= GENHD_FL_NO_PART; - atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); @@ -2136,13 +2114,12 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } - /* Mark this loop device no longer open()-able. */ + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 082d4b6bfc6a6..449d562738c52 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -28,7 +28,6 @@ struct loop_func_table; struct loop_device { int lo_number; - atomic_t lo_refcnt; loff_t lo_offset; loff_t lo_sizelimit; int lo_flags;