diff mbox series

[3/7] block: Add config option to not allow writing to mounted devices

Message ID 20231101174325.10596-3-jack@suse.cz (mailing list archive)
State New, archived
Headers show
Series block: Add config option to not allow writing to mounted devices | expand

Commit Message

Jan Kara Nov. 1, 2023, 5:43 p.m. UTC
Writing to mounted devices is dangerous and can lead to filesystem
corruption as well as crashes. Furthermore syzbot comes with more and
more involved examples how to corrupt block device under a mounted
filesystem leading to kernel crashes and reports we can do nothing
about. Add tracking of writers to each block device and a kernel cmdline
argument which controls whether other writeable opens to block devices
open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
filesystems use this flag for used devices.

Note that this effectively only prevents modification of the particular
block device's page cache by other writers. The actual device content
can still be modified by other means - e.g. by issuing direct scsi
commands, by doing writes through devices lower in the storage stack
(e.g. in case loop devices, DM, or MD are involved) etc. But blocking
direct modifications of the block device page cache is enough to give
filesystems a chance to perform data validation when loading data from
the underlying storage and thus prevent kernel crashes.

Syzbot can use this cmdline argument option to avoid uninteresting
crashes. Also users whose userspace setup does not need writing to
mounted block devices can set this option for hardening.

Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 block/Kconfig             | 20 +++++++++++++
 block/bdev.c              | 62 ++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |  1 +
 include/linux/blkdev.h    |  2 ++
 4 files changed, 84 insertions(+), 1 deletion(-)

Comments

Christian Brauner Nov. 6, 2023, 2:47 p.m. UTC | #1
On Wed, Nov 01, 2023 at 06:43:08PM +0100, Jan Kara wrote:
> Writing to mounted devices is dangerous and can lead to filesystem
> corruption as well as crashes. Furthermore syzbot comes with more and
> more involved examples how to corrupt block device under a mounted
> filesystem leading to kernel crashes and reports we can do nothing
> about. Add tracking of writers to each block device and a kernel cmdline
> argument which controls whether other writeable opens to block devices
> open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
> filesystems use this flag for used devices.
> 
> Note that this effectively only prevents modification of the particular
> block device's page cache by other writers. The actual device content
> can still be modified by other means - e.g. by issuing direct scsi
> commands, by doing writes through devices lower in the storage stack
> (e.g. in case loop devices, DM, or MD are involved) etc. But blocking
> direct modifications of the block device page cache is enough to give
> filesystems a chance to perform data validation when loading data from
> the underlying storage and thus prevent kernel crashes.
> 
> Syzbot can use this cmdline argument option to avoid uninteresting
> crashes. Also users whose userspace setup does not need writing to
> mounted block devices can set this option for hardening.
> 
> Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---

A few minor tweaks I would do in-tree. Please see below.
I know it's mostly stylistic that's why I would do it so there's no
resend dance for non-technical reasons.

>  block/Kconfig             | 20 +++++++++++++
>  block/bdev.c              | 62 ++++++++++++++++++++++++++++++++++++++-
>  include/linux/blk_types.h |  1 +
>  include/linux/blkdev.h    |  2 ++
>  4 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/block/Kconfig b/block/Kconfig
> index f1364d1c0d93..ca04b657e058 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
>  	select CRC_T10DIF
>  	select CRC64_ROCKSOFT
>  
> +config BLK_DEV_WRITE_MOUNTED
> +	bool "Allow writing to mounted block devices"
> +	default y

Let's hope that this can become the default one day.

> +static void bdev_unblock_writes(struct block_device *bdev)
> +{
> +	bdev->bd_writers = 0;
> +}
> +
> +static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)

I would like to mirror our may_{open,create}() routines here and call
this:

    bdev_may_open()

This is a well-known vfs pattern and also easy to understand for block
devs as well.

> @@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
>  		goto abort_claiming;
>  	if (!try_module_get(disk->fops->owner))
>  		goto abort_claiming;
> +	ret = -EBUSY;
> +	if (!blkdev_open_compatible(bdev, mode))
> +		goto abort_claiming;
>  	if (bdev_is_partition(bdev))
>  		ret = blkdev_get_part(bdev, mode);
>  	else
>  		ret = blkdev_get_whole(bdev, mode);
>  	if (ret)
>  		goto put_module;
> +	if (!bdev_allow_write_mounted) {
> +		if (mode & BLK_OPEN_RESTRICT_WRITES)
> +			bdev_block_writes(bdev);
> +		else if (mode & BLK_OPEN_WRITE)
> +			bdev->bd_writers++;
> +	}

I would like to move this to a tiny helper for clarity:

static void bdev_claim_write_access(struct block_device *bdev)
{
        if (!bdev_allow_write_mounted)
                return;

        /* Claim exclusive or shared write access to the block device. */
        if (mode & BLK_OPEN_RESTRICT_WRITES)
                bdev_block_writes(bdev);
        else if (mode & BLK_OPEN_WRITE)
                bdev->bd_writers++;
}

>  	if (holder) {
>  		bd_finish_claiming(bdev, holder, hops);
>  
> @@ -901,6 +944,14 @@ void bdev_release(struct bdev_handle *handle)
>  		sync_blockdev(bdev);
>  
>  	mutex_lock(&disk->open_mutex);
> +	if (!bdev_allow_write_mounted) {
> +		/* The exclusive opener was blocking writes? Unblock them. */
> +		if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
> +			bdev_unblock_writes(bdev);
> +		else if (handle->mode & BLK_OPEN_WRITE)
> +			bdev->bd_writers--;
> +	}

static void bdev_yield_write_access(struct block_device *bdev)
{
        if (!bdev_allow_write_mounted)
                return;

        /* Yield exclusive or shared write access. */
        if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
                bdev_unblock_writes(bdev);
        else if (handle->mode & BLK_OPEN_WRITE)
                bdev->bd_writers--;
}
Jan Kara Nov. 6, 2023, 3:18 p.m. UTC | #2
On Mon 06-11-23 15:47:54, Christian Brauner wrote:
> On Wed, Nov 01, 2023 at 06:43:08PM +0100, Jan Kara wrote:
> > Writing to mounted devices is dangerous and can lead to filesystem
> > corruption as well as crashes. Furthermore syzbot comes with more and
> > more involved examples how to corrupt block device under a mounted
> > filesystem leading to kernel crashes and reports we can do nothing
> > about. Add tracking of writers to each block device and a kernel cmdline
> > argument which controls whether other writeable opens to block devices
> > open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
> > filesystems use this flag for used devices.
> > 
> > Note that this effectively only prevents modification of the particular
> > block device's page cache by other writers. The actual device content
> > can still be modified by other means - e.g. by issuing direct scsi
> > commands, by doing writes through devices lower in the storage stack
> > (e.g. in case loop devices, DM, or MD are involved) etc. But blocking
> > direct modifications of the block device page cache is enough to give
> > filesystems a chance to perform data validation when loading data from
> > the underlying storage and thus prevent kernel crashes.
> > 
> > Syzbot can use this cmdline argument option to avoid uninteresting
> > crashes. Also users whose userspace setup does not need writing to
> > mounted block devices can set this option for hardening.
> > 
> > Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> 
> A few minor tweaks I would do in-tree. Please see below.
> I know it's mostly stylistic that's why I would do it so there's no
> resend dance for non-technical reasons.

Whatever works best for you. I agree with the changes but please see my
comments below for some fixes.

> >  block/Kconfig             | 20 +++++++++++++
> >  block/bdev.c              | 62 ++++++++++++++++++++++++++++++++++++++-
> >  include/linux/blk_types.h |  1 +
> >  include/linux/blkdev.h    |  2 ++
> >  4 files changed, 84 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/Kconfig b/block/Kconfig
> > index f1364d1c0d93..ca04b657e058 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
> >  	select CRC_T10DIF
> >  	select CRC64_ROCKSOFT
> >  
> > +config BLK_DEV_WRITE_MOUNTED
> > +	bool "Allow writing to mounted block devices"
> > +	default y
> 
> Let's hope that this can become the default one day.

Yes, I'd hope as well but we need some tooling work (util-linux, e2fsprogs)
before that can happen.

> > +static void bdev_unblock_writes(struct block_device *bdev)
> > +{
> > +	bdev->bd_writers = 0;
> > +}
> > +
> > +static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)
> 
> I would like to mirror our may_{open,create}() routines here and call
> this:
> 
>     bdev_may_open()
> 
> This is a well-known vfs pattern and also easy to understand for block
> devs as well.

Sure.

> > @@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> >  		goto abort_claiming;
> >  	if (!try_module_get(disk->fops->owner))
> >  		goto abort_claiming;
> > +	ret = -EBUSY;
> > +	if (!blkdev_open_compatible(bdev, mode))
> > +		goto abort_claiming;
> >  	if (bdev_is_partition(bdev))
> >  		ret = blkdev_get_part(bdev, mode);
> >  	else
> >  		ret = blkdev_get_whole(bdev, mode);
> >  	if (ret)
> >  		goto put_module;
> > +	if (!bdev_allow_write_mounted) {
> > +		if (mode & BLK_OPEN_RESTRICT_WRITES)
> > +			bdev_block_writes(bdev);
> > +		else if (mode & BLK_OPEN_WRITE)
> > +			bdev->bd_writers++;
> > +	}
> 
> I would like to move this to a tiny helper for clarity:
> 
> static void bdev_claim_write_access(struct block_device *bdev)
> {
>         if (!bdev_allow_write_mounted)

This should be the other way around.

>                 return;
> 
>         /* Claim exclusive or shared write access to the block device. */
>         if (mode & BLK_OPEN_RESTRICT_WRITES)
>                 bdev_block_writes(bdev);
>         else if (mode & BLK_OPEN_WRITE)
>                 bdev->bd_writers++;
> }
> 
> >  	if (holder) {
> >  		bd_finish_claiming(bdev, holder, hops);
> >  
> > @@ -901,6 +944,14 @@ void bdev_release(struct bdev_handle *handle)
> >  		sync_blockdev(bdev);
> >  
> >  	mutex_lock(&disk->open_mutex);
> > +	if (!bdev_allow_write_mounted) {
> > +		/* The exclusive opener was blocking writes? Unblock them. */
> > +		if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
> > +			bdev_unblock_writes(bdev);
> > +		else if (handle->mode & BLK_OPEN_WRITE)
> > +			bdev->bd_writers--;
> > +	}
> 
> static void bdev_yield_write_access(struct block_device *bdev)
> {
>         if (!bdev_allow_write_mounted)

And this as well.

>                 return;
> 
>         /* Yield exclusive or shared write access. */
>         if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
>                 bdev_unblock_writes(bdev);
>         else if (handle->mode & BLK_OPEN_WRITE)
>                 bdev->bd_writers--;
> }

								Honza
Christian Brauner Nov. 6, 2023, 3:57 p.m. UTC | #3
> > Let's hope that this can become the default one day.
> 
> Yes, I'd hope as well but we need some tooling work (util-linux, e2fsprogs)
> before that can happen.

Yeah, absolutely. I'm moderately confident we'll have fairly quick
adoption of this though.
Li Lingfeng Dec. 20, 2023, 3:26 a.m. UTC | #4
在 2023/11/2 1:43, Jan Kara 写道:
> Writing to mounted devices is dangerous and can lead to filesystem
> corruption as well as crashes. Furthermore syzbot comes with more and
> more involved examples how to corrupt block device under a mounted
> filesystem leading to kernel crashes and reports we can do nothing
> about. Add tracking of writers to each block device and a kernel cmdline
> argument which controls whether other writeable opens to block devices
> open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
> filesystems use this flag for used devices.
>
> Note that this effectively only prevents modification of the particular
> block device's page cache by other writers. The actual device content
> can still be modified by other means - e.g. by issuing direct scsi
> commands, by doing writes through devices lower in the storage stack
> (e.g. in case loop devices, DM, or MD are involved) etc. But blocking
> direct modifications of the block device page cache is enough to give
> filesystems a chance to perform data validation when loading data from
> the underlying storage and thus prevent kernel crashes.
>
> Syzbot can use this cmdline argument option to avoid uninteresting
> crashes. Also users whose userspace setup does not need writing to
> mounted block devices can set this option for hardening.
>
> Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>   block/Kconfig             | 20 +++++++++++++
>   block/bdev.c              | 62 ++++++++++++++++++++++++++++++++++++++-
>   include/linux/blk_types.h |  1 +
>   include/linux/blkdev.h    |  2 ++
>   4 files changed, 84 insertions(+), 1 deletion(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index f1364d1c0d93..ca04b657e058 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
>   	select CRC_T10DIF
>   	select CRC64_ROCKSOFT
>   
> +config BLK_DEV_WRITE_MOUNTED
> +	bool "Allow writing to mounted block devices"
> +	default y
> +	help
> +	When a block device is mounted, writing to its buffer cache is very
> +	likely going to cause filesystem corruption. It is also rather easy to
> +	crash the kernel in this way since the filesystem has no practical way
> +	of detecting these writes to buffer cache and verifying its metadata
> +	integrity. However there are some setups that need this capability
> +	like running fsck on read-only mounted root device, modifying some
> +	features on mounted ext4 filesystem, and similar. If you say N, the
> +	kernel will prevent processes from writing to block devices that are
> +	mounted by filesystems which provides some more protection from runaway
> +	privileged processes and generally makes it much harder to crash
> +	filesystem drivers. Note however that this does not prevent
> +	underlying device(s) from being modified by other means, e.g. by
> +	directly submitting SCSI commands or through access to lower layers of
> +	storage stack. If in doubt, say Y. The configuration can be overridden
> +	with the bdev_allow_write_mounted boot option.
> +
>   config BLK_DEV_ZONED
>   	bool "Zoned block device support"
>   	select MQ_IOSCHED_DEADLINE
> diff --git a/block/bdev.c b/block/bdev.c
> index 3f27939e02c6..d75dd7dd2b31 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -30,6 +30,9 @@
>   #include "../fs/internal.h"
>   #include "blk.h"
>   
> +/* Should we allow writing to mounted block devices? */
> +static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
> +
>   struct bdev_inode {
>   	struct block_device bdev;
>   	struct inode vfs_inode;
> @@ -730,7 +733,34 @@ void blkdev_put_no_open(struct block_device *bdev)
>   {
>   	put_device(&bdev->bd_device);
>   }
> -	
> +
> +static bool bdev_writes_blocked(struct block_device *bdev)
> +{
> +	return bdev->bd_writers == -1;
> +}
> +
> +static void bdev_block_writes(struct block_device *bdev)
> +{
> +	bdev->bd_writers = -1;
> +}
> +
> +static void bdev_unblock_writes(struct block_device *bdev)
> +{
> +	bdev->bd_writers = 0;
> +}
> +
> +static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)
> +{
> +	if (!bdev_allow_write_mounted) {
> +		/* Writes blocked? */
> +		if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
> +			return false;
> +		if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
> +			return false;
> +	}
> +	return true;
> +}
> +
>   /**
>    * bdev_open_by_dev - open a block device by device number
>    * @dev: device number of block device to open
> @@ -773,6 +803,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
>   	if (ret)
>   		goto free_handle;
>   
> +	/* Blocking writes requires exclusive opener */
> +	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
> +		return ERR_PTR(-EINVAL);
> +
>   	bdev = blkdev_get_no_open(dev);
>   	if (!bdev) {
>   		ret = -ENXIO;
> @@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
>   		goto abort_claiming;
>   	if (!try_module_get(disk->fops->owner))
>   		goto abort_claiming;
> +	ret = -EBUSY;
> +	if (!blkdev_open_compatible(bdev, mode))
> +		goto abort_claiming;
>   	if (bdev_is_partition(bdev))
>   		ret = blkdev_get_part(bdev, mode);
>   	else
>   		ret = blkdev_get_whole(bdev, mode);
>   	if (ret)
>   		goto put_module;
> +	if (!bdev_allow_write_mounted) {
> +		if (mode & BLK_OPEN_RESTRICT_WRITES)
> +			bdev_block_writes(bdev);

Hi, Jan

When a partition device is mounted, I think maybe it's better to block 
writes on the whole device at same time.

Allowing the whole device to be opened for writing when mounting a 
partition device, did you have any special considerations before?

Thanks.

> +		else if (mode & BLK_OPEN_WRITE)
> +			bdev->bd_writers++;
> +	}
>   	if (holder) {
>   		bd_finish_claiming(bdev, holder, hops);
>   
> @@ -901,6 +944,14 @@ void bdev_release(struct bdev_handle *handle)
>   		sync_blockdev(bdev);
>   
>   	mutex_lock(&disk->open_mutex);
> +	if (!bdev_allow_write_mounted) {
> +		/* The exclusive opener was blocking writes? Unblock them. */
> +		if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
> +			bdev_unblock_writes(bdev);
> +		else if (handle->mode & BLK_OPEN_WRITE)
> +			bdev->bd_writers--;
> +	}
> +
>   	if (handle->holder)
>   		bd_end_claim(bdev, handle->holder);
>   
> @@ -1069,3 +1120,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>   
>   	blkdev_put_no_open(bdev);
>   }
> +
> +static int __init setup_bdev_allow_write_mounted(char *str)
> +{
> +	if (kstrtobool(str, &bdev_allow_write_mounted))
> +		pr_warn("Invalid option string for bdev_allow_write_mounted:"
> +			" '%s'\n", str);
> +	return 1;
> +}
> +__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 749203277fee..52e264d5a830 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -66,6 +66,7 @@ struct block_device {
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   	bool			bd_make_it_fail;
>   #endif
> +	int			bd_writers;
>   	/*
>   	 * keep this out-of-line as it's both big and not needed in the fast
>   	 * path
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 7afc10315dd5..0e0c0186aa32 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t;
>   #define BLK_OPEN_NDELAY		((__force blk_mode_t)(1 << 3))
>   /* open for "writes" only for ioctls (specialy hack for floppy.c) */
>   #define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
> +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
> +#define BLK_OPEN_RESTRICT_WRITES	((__force blk_mode_t)(1 << 5))
>   
>   struct gendisk {
>   	/*
Jan Kara Dec. 21, 2023, 12:11 p.m. UTC | #5
On Wed 20-12-23 11:26:38, Li Lingfeng wrote:
> > @@ -773,6 +803,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> >   	if (ret)
> >   		goto free_handle;
> > +	/* Blocking writes requires exclusive opener */
> > +	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
> > +		return ERR_PTR(-EINVAL);
> > +
> >   	bdev = blkdev_get_no_open(dev);
> >   	if (!bdev) {
> >   		ret = -ENXIO;
> > @@ -800,12 +834,21 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
> >   		goto abort_claiming;
> >   	if (!try_module_get(disk->fops->owner))
> >   		goto abort_claiming;
> > +	ret = -EBUSY;
> > +	if (!blkdev_open_compatible(bdev, mode))
> > +		goto abort_claiming;
> >   	if (bdev_is_partition(bdev))
> >   		ret = blkdev_get_part(bdev, mode);
> >   	else
> >   		ret = blkdev_get_whole(bdev, mode);
> >   	if (ret)
> >   		goto put_module;
> > +	if (!bdev_allow_write_mounted) {
> > +		if (mode & BLK_OPEN_RESTRICT_WRITES)
> > +			bdev_block_writes(bdev);
> 
> When a partition device is mounted, I think maybe it's better to block
> writes on the whole device at same time.
> 
> Allowing the whole device to be opened for writing when mounting a partition
> device, did you have any special considerations before?

Yes, we were considering this. But the truth is that:

a) It is *very* hard to stop all the possibilities of corrupting data on
the device - e.g. with device mapper / loop device / malicious partition
table you can construct many block devices pointing to the same storage,
you can use e.g. SG_IO to corrupt on disk data etc. So special-casing
partitions is providing little additional benefit.

b) It is difficult to then correctly handle valid cases of multiple
writeably mounted partitions on the same device - you'd need to track used
block numbers for each device which gets difficult in presence of device
mapper etc.

c) To stop filesystem crashes, it is enough to stop modifications of buffer
cache of that one block device. Because filesystems have to validate data
they are loading into buffer cache anyway to handle faulty device, fs
corruption etc.

								Honza
diff mbox series

Patch

diff --git a/block/Kconfig b/block/Kconfig
index f1364d1c0d93..ca04b657e058 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@  config BLK_DEV_INTEGRITY_T10
 	select CRC_T10DIF
 	select CRC64_ROCKSOFT
 
+config BLK_DEV_WRITE_MOUNTED
+	bool "Allow writing to mounted block devices"
+	default y
+	help
+	When a block device is mounted, writing to its buffer cache is very
+	likely going to cause filesystem corruption. It is also rather easy to
+	crash the kernel in this way since the filesystem has no practical way
+	of detecting these writes to buffer cache and verifying its metadata
+	integrity. However there are some setups that need this capability
+	like running fsck on read-only mounted root device, modifying some
+	features on mounted ext4 filesystem, and similar. If you say N, the
+	kernel will prevent processes from writing to block devices that are
+	mounted by filesystems which provides some more protection from runaway
+	privileged processes and generally makes it much harder to crash
+	filesystem drivers. Note however that this does not prevent
+	underlying device(s) from being modified by other means, e.g. by
+	directly submitting SCSI commands or through access to lower layers of
+	storage stack. If in doubt, say Y. The configuration can be overridden
+	with the bdev_allow_write_mounted boot option.
+
 config BLK_DEV_ZONED
 	bool "Zoned block device support"
 	select MQ_IOSCHED_DEADLINE
diff --git a/block/bdev.c b/block/bdev.c
index 3f27939e02c6..d75dd7dd2b31 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@ 
 #include "../fs/internal.h"
 #include "blk.h"
 
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
 struct bdev_inode {
 	struct block_device bdev;
 	struct inode vfs_inode;
@@ -730,7 +733,34 @@  void blkdev_put_no_open(struct block_device *bdev)
 {
 	put_device(&bdev->bd_device);
 }
-	
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+	return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+	bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+	bdev->bd_writers = 0;
+}
+
+static bool blkdev_open_compatible(struct block_device *bdev, blk_mode_t mode)
+{
+	if (!bdev_allow_write_mounted) {
+		/* Writes blocked? */
+		if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+			return false;
+		if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+			return false;
+	}
+	return true;
+}
+
 /**
  * bdev_open_by_dev - open a block device by device number
  * @dev: device number of block device to open
@@ -773,6 +803,10 @@  struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 	if (ret)
 		goto free_handle;
 
+	/* Blocking writes requires exclusive opener */
+	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
+		return ERR_PTR(-EINVAL);
+
 	bdev = blkdev_get_no_open(dev);
 	if (!bdev) {
 		ret = -ENXIO;
@@ -800,12 +834,21 @@  struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		goto abort_claiming;
 	if (!try_module_get(disk->fops->owner))
 		goto abort_claiming;
+	ret = -EBUSY;
+	if (!blkdev_open_compatible(bdev, mode))
+		goto abort_claiming;
 	if (bdev_is_partition(bdev))
 		ret = blkdev_get_part(bdev, mode);
 	else
 		ret = blkdev_get_whole(bdev, mode);
 	if (ret)
 		goto put_module;
+	if (!bdev_allow_write_mounted) {
+		if (mode & BLK_OPEN_RESTRICT_WRITES)
+			bdev_block_writes(bdev);
+		else if (mode & BLK_OPEN_WRITE)
+			bdev->bd_writers++;
+	}
 	if (holder) {
 		bd_finish_claiming(bdev, holder, hops);
 
@@ -901,6 +944,14 @@  void bdev_release(struct bdev_handle *handle)
 		sync_blockdev(bdev);
 
 	mutex_lock(&disk->open_mutex);
+	if (!bdev_allow_write_mounted) {
+		/* The exclusive opener was blocking writes? Unblock them. */
+		if (handle->mode & BLK_OPEN_RESTRICT_WRITES)
+			bdev_unblock_writes(bdev);
+		else if (handle->mode & BLK_OPEN_WRITE)
+			bdev->bd_writers--;
+	}
+
 	if (handle->holder)
 		bd_end_claim(bdev, handle->holder);
 
@@ -1069,3 +1120,12 @@  void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 
 	blkdev_put_no_open(bdev);
 }
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+	if (kstrtobool(str, &bdev_allow_write_mounted))
+		pr_warn("Invalid option string for bdev_allow_write_mounted:"
+			" '%s'\n", str);
+	return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 749203277fee..52e264d5a830 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -66,6 +66,7 @@  struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+	int			bd_writers;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7afc10315dd5..0e0c0186aa32 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,8 @@  typedef unsigned int __bitwise blk_mode_t;
 #define BLK_OPEN_NDELAY		((__force blk_mode_t)(1 << 3))
 /* open for "writes" only for ioctls (specialy hack for floppy.c) */
 #define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
+/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
+#define BLK_OPEN_RESTRICT_WRITES	((__force blk_mode_t)(1 << 5))
 
 struct gendisk {
 	/*