diff mbox series

[RFC,v3,3/3] devguard: added device guard for mknod in non-initial userns

Message ID 20231213143813.6818-4-michael.weiss@aisec.fraunhofer.de (mailing list archive)
State RFC
Delegated to: Paul Moore
Headers show
Series devguard: guard mknod for non-initial user namespace | expand

Commit Message

Michael Weiß Dec. 13, 2023, 2:38 p.m. UTC
devguard is a simple LSM to allow CAP_MKNOD in non-initial user
namespace in cooperation of an attached cgroup device program. We
just need to implement the security_inode_mknod() hook for this.
In the hook, we check if the current task is guarded by a device
cgroup using the lately introduced cgroup_bpf_current_enabled()
helper. If so, we strip out SB_I_NODEV from the super block.

Access decisions to those device nodes are then guarded by existing
device cgroups mechanism.

Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
---
 security/Kconfig             | 11 +++++----
 security/Makefile            |  1 +
 security/devguard/Kconfig    | 12 ++++++++++
 security/devguard/Makefile   |  2 ++
 security/devguard/devguard.c | 44 ++++++++++++++++++++++++++++++++++++
 5 files changed, 65 insertions(+), 5 deletions(-)
 create mode 100644 security/devguard/Kconfig
 create mode 100644 security/devguard/Makefile
 create mode 100644 security/devguard/devguard.c

Comments

Casey Schaufler Dec. 13, 2023, 6:35 p.m. UTC | #1
On 12/13/2023 6:38 AM, Michael Weiß wrote:
> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> namespace in cooperation of an attached cgroup device program. We
> just need to implement the security_inode_mknod() hook for this.
> In the hook, we check if the current task is guarded by a device
> cgroup using the lately introduced cgroup_bpf_current_enabled()
> helper. If so, we strip out SB_I_NODEV from the super block.
>
> Access decisions to those device nodes are then guarded by existing
> device cgroups mechanism.
>
> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> ---
>  security/Kconfig             | 11 +++++----
>  security/Makefile            |  1 +
>  security/devguard/Kconfig    | 12 ++++++++++
>  security/devguard/Makefile   |  2 ++
>  security/devguard/devguard.c | 44 ++++++++++++++++++++++++++++++++++++
>  5 files changed, 65 insertions(+), 5 deletions(-)
>  create mode 100644 security/devguard/Kconfig
>  create mode 100644 security/devguard/Makefile
>  create mode 100644 security/devguard/devguard.c
>
> diff --git a/security/Kconfig b/security/Kconfig
> index 52c9af08ad35..7ec4017745d4 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -194,6 +194,7 @@ source "security/yama/Kconfig"
>  source "security/safesetid/Kconfig"
>  source "security/lockdown/Kconfig"
>  source "security/landlock/Kconfig"
> +source "security/devguard/Kconfig"
>  
>  source "security/integrity/Kconfig"
>  
> @@ -233,11 +234,11 @@ endchoice
>  
>  config LSM
>  	string "Ordered list of enabled LSMs"
> -	default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK
> -	default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR
> -	default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO
> -	default "landlock,lockdown,yama,loadpin,safesetid,bpf" if DEFAULT_SECURITY_DAC
> -	default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf"
> +	default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf,devguard" if DEFAULT_SECURITY_SMACK
> +	default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf,devguard" if DEFAULT_SECURITY_APPARMOR
> +	default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf,devguard" if DEFAULT_SECURITY_TOMOYO
> +	default "landlock,lockdown,yama,loadpin,safesetid,bpf,devguard" if DEFAULT_SECURITY_DAC
> +	default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf,devguard"
>  	help
>  	  A comma-separated list of LSMs, in initialization order.
>  	  Any LSMs left off this list, except for those with order
> diff --git a/security/Makefile b/security/Makefile
> index 18121f8f85cd..82a0d8cab3c3 100644
> --- a/security/Makefile
> +++ b/security/Makefile
> @@ -24,6 +24,7 @@ obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown/
>  obj-$(CONFIG_CGROUPS)			+= device_cgroup.o
>  obj-$(CONFIG_BPF_LSM)			+= bpf/
>  obj-$(CONFIG_SECURITY_LANDLOCK)		+= landlock/
> +obj-$(CONFIG_SECURITY_DEVGUARD)		+= devguard/
>  
>  # Object integrity file lists
>  obj-$(CONFIG_INTEGRITY)			+= integrity/
> diff --git a/security/devguard/Kconfig b/security/devguard/Kconfig
> new file mode 100644
> index 000000000000..592684615a8f
> --- /dev/null
> +++ b/security/devguard/Kconfig
> @@ -0,0 +1,12 @@
> +config SECURITY_DEVGUARD
> +	bool "Devguard for device node creation"
> +	depends on SECURITY
> +	depends on CGROUP_BPF
> +	default n
> +	help
> +	  This enables devguard, an LSM that allows to guard device node
> +	  creation in non-initial user namespace. It may allow mknod
> +	  in cooperation of an attached cgroup device program.
> +	  This security module stacks with other LSMs.
> +
> +	  If you are unsure how to answer this question, answer N.
> diff --git a/security/devguard/Makefile b/security/devguard/Makefile
> new file mode 100644
> index 000000000000..fdaff8dc2fea
> --- /dev/null
> +++ b/security/devguard/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_SECURITY_DEVGUARD) += devguard.o
> diff --git a/security/devguard/devguard.c b/security/devguard/devguard.c
> new file mode 100644
> index 000000000000..3a0c9c27a691
> --- /dev/null
> +++ b/security/devguard/devguard.c
> @@ -0,0 +1,44 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Device guard security module
> + *
> + * Simple in-kernel LSM to allow cap_mknod in non-initial
> + * user namespace if current task is guarded by device cgroup.
> + *
> + * Copyright (C) 2023 Fraunhofer AISEC. All rights reserved.
> + *
> + * Authors: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> + */
> +
> +#include <linux/bpf-cgroup.h>
> +#include <linux/lsm_hooks.h>
> +
> +static int devguard_inode_mknod(struct inode *dir, struct dentry *dentry,
> +				umode_t mode, dev_t dev)
> +{
> +	if (dentry->d_sb->s_iflags & ~SB_I_NODEV)
> +		return 0;
> +
> +	// strip SB_I_NODEV on super block if device cgroup is active

Please use block style comments. We don't use // comments here.

	/*
	 * Strip SB_I_NODEV on super block if device cgroup is active
	 */

> +	if (cgroup_bpf_current_enabled(CGROUP_DEVICE))
> +		dentry->d_sb->s_iflags &= ~SB_I_NODEV;
> +
> +	return 0;
> +}
> +
> +static struct security_hook_list devguard_hooks[] __ro_after_init = {
> +	LSM_HOOK_INIT(inode_mknod, devguard_inode_mknod),
> +};
> +
> +static int __init devguard_init(void)
> +{
> +	security_add_hooks(devguard_hooks, ARRAY_SIZE(devguard_hooks),
> +			   "devguard");
> +	pr_info("devguard: initialized\n");
> +	return 0;
> +}
> +
> +DEFINE_LSM(devguard) = {
> +	.name = "devguard",
> +	.init = devguard_init,
> +};
Christian Brauner Dec. 15, 2023, 12:31 p.m. UTC | #2
On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> namespace in cooperation of an attached cgroup device program. We
> just need to implement the security_inode_mknod() hook for this.
> In the hook, we check if the current task is guarded by a device
> cgroup using the lately introduced cgroup_bpf_current_enabled()
> helper. If so, we strip out SB_I_NODEV from the super block.
> 
> Access decisions to those device nodes are then guarded by existing
> device cgroups mechanism.
> 
> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> ---

I think you misunderstood me... My point was that I believe you don't
need an additional LSM at all and no additional LSM hook. But I might be
wrong. Only a POC would show.

Just write a bpf lsm program that strips SB_I_NODEV in the existing
security_sb_set_mnt_opts() call which is guranteed to be called when a
new superblock is created.

Store your device access rules in a bpf map or in the sb->s_security
blob (This is where I'm fuzzy and could use a bpf LSM expert's input.).

Then make that bpf lsm program kick in everytime a
security_inode_mknod() and security_file_open() is called and do device
access management in there. Actually, you might need to add one hook
when the actual device that's about to be opened is know. 
This should be where today the device access hooks are called.

And then you should already be done with this. The only thing that you
need is the capable check patch.

You don't need that cgroup_bpf_current_enabled() per se. Device
management could now be done per superblock, and not per task. IOW, you
allowlist a bunch of devices that can be created and opened. Any task
that passes basic permission checks and that passes the bpf lsm program
may create device nodes.

That's a way more natural device management model than making this a per
cgroup thing. Though that could be implemented as well with this.

I would try to write a bpf lsm program that does device access
management with your capable() sysctl patch applied and see how far I
get.

I don't have the time otherwise I'd do it.
Michael Weiß Dec. 15, 2023, 1:26 p.m. UTC | #3
On 15.12.23 13:31, Christian Brauner wrote:
> On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
>> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
>> namespace in cooperation of an attached cgroup device program. We
>> just need to implement the security_inode_mknod() hook for this.
>> In the hook, we check if the current task is guarded by a device
>> cgroup using the lately introduced cgroup_bpf_current_enabled()
>> helper. If so, we strip out SB_I_NODEV from the super block.
>>
>> Access decisions to those device nodes are then guarded by existing
>> device cgroups mechanism.
>>
>> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
>> ---
> 
> I think you misunderstood me... My point was that I believe you don't
> need an additional LSM at all and no additional LSM hook. But I might be
> wrong. Only a POC would show.

Yeah sorry, I got your point now.

> 
> Just write a bpf lsm program that strips SB_I_NODEV in the existing
> security_sb_set_mnt_opts() call which is guranteed to be called when a
> new superblock is created.

This does not work since SB_I_NODEV is a required_iflag in
mount_too_revealing(). This I have already tested when writing the
simple LSM here. So maybe we need to drop SB_I_NODEV from required_flags
there, too. Would that be safe?

> 
> Store your device access rules in a bpf map or in the sb->s_security
> blob (This is where I'm fuzzy and could use a bpf LSM expert's input.).
> 
> Then make that bpf lsm program kick in everytime a
> security_inode_mknod() and security_file_open() is called and do device
> access management in there. Actually, you might need to add one hook
> when the actual device that's about to be opened is know. 
> This should be where today the device access hooks are called.
> 
> And then you should already be done with this. The only thing that you
> need is the capable check patch.
> 
> You don't need that cgroup_bpf_current_enabled() per se. Device
> management could now be done per superblock, and not per task. IOW, you
> allowlist a bunch of devices that can be created and opened. Any task
> that passes basic permission checks and that passes the bpf lsm program
> may create device nodes.
> 
> That's a way more natural device management model than making this a per
> cgroup thing. Though that could be implemented as well with this.
> 
> I would try to write a bpf lsm program that does device access
> management with your capable() sysctl patch applied and see how far I
> get.
> 
> I don't have the time otherwise I'd do it.
I'll give it a try but no promises how fast this will go.
Christian Brauner Dec. 15, 2023, 2:15 p.m. UTC | #4
On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> On 15.12.23 13:31, Christian Brauner wrote:
> > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> >> namespace in cooperation of an attached cgroup device program. We
> >> just need to implement the security_inode_mknod() hook for this.
> >> In the hook, we check if the current task is guarded by a device
> >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> >> helper. If so, we strip out SB_I_NODEV from the super block.
> >>
> >> Access decisions to those device nodes are then guarded by existing
> >> device cgroups mechanism.
> >>
> >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> >> ---
> > 
> > I think you misunderstood me... My point was that I believe you don't
> > need an additional LSM at all and no additional LSM hook. But I might be
> > wrong. Only a POC would show.
> 
> Yeah sorry, I got your point now.

I think I might have had a misconception about how this works.
A bpf LSM program can't easily alter a kernel object such as struct
super_block I've been told.

> 
> > 
> > Just write a bpf lsm program that strips SB_I_NODEV in the existing
> > security_sb_set_mnt_opts() call which is guranteed to be called when a
> > new superblock is created.
> 
> This does not work since SB_I_NODEV is a required_iflag in
> mount_too_revealing(). This I have already tested when writing the
> simple LSM here. So maybe we need to drop SB_I_NODEV from required_flags
> there, too. Would that be safe?

Right. I think we might be able to add a new SB_I_MANAGED_DEVICES flag.
__UNTESTED, UNCOMPILED_

diff --git a/fs/namespace.c b/fs/namespace.c
index fbf0e596fcd3..e87cc0320091 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4887,7 +4887,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,

 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
 {
-       const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        unsigned long s_iflags;

@@ -4899,9 +4898,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
        if (!(s_iflags & SB_I_USERNS_VISIBLE))
                return false;

-       if ((s_iflags & required_iflags) != required_iflags) {
-               WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
-                         required_iflags);
+       if (!(s_iflags & SB_I_NOEXEC)) {
+               WARN_ONCE(1, "Expected s_iflags to contain SB_I_NOEXEC\n");
+               return true;
+       }
+
+       if (!(s_iflags & (SB_I_NODEV | SB_I_MANAGED_DEVICES))) {
+               WARN_ONCE(1, "Expected s_iflags to contain device access mask\n");
                return true;
        }

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..6ca0fe922478 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_USERNS_VISIBLE            0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
 #define SB_I_UNTRUSTED_MOUNTER         0x00000040
+#define SB_I_MANAGED_DEVICES           0x00000080

 #define SB_I_SKIP_SYNC 0x00000100      /* Skip superblock at global sync */
 #define SB_I_PERSB_BDI 0x00000200      /* has a per-sb bdi */

> 
> > 
> > Store your device access rules in a bpf map or in the sb->s_security
> > blob (This is where I'm fuzzy and could use a bpf LSM expert's input.).
> > 
> > Then make that bpf lsm program kick in everytime a
> > security_inode_mknod() and security_file_open() is called and do device
> > access management in there. Actually, you might need to add one hook
> > when the actual device that's about to be opened is know. 
> > This should be where today the device access hooks are called.
> > 
> > And then you should already be done with this. The only thing that you
> > need is the capable check patch.
> > 
> > You don't need that cgroup_bpf_current_enabled() per se. Device
> > management could now be done per superblock, and not per task. IOW, you
> > allowlist a bunch of devices that can be created and opened. Any task
> > that passes basic permission checks and that passes the bpf lsm program
> > may create device nodes.
> > 
> > That's a way more natural device management model than making this a per
> > cgroup thing. Though that could be implemented as well with this.
> > 
> > I would try to write a bpf lsm program that does device access
> > management with your capable() sysctl patch applied and see how far I
> > get.
> > 
> > I don't have the time otherwise I'd do it.
> I'll give it a try but no promises how fast this will go.

No worries. We're entering the holiday season anyway.
Christian Brauner Dec. 15, 2023, 4:36 p.m. UTC | #5
On Fri, Dec 15, 2023 at 03:15:33PM +0100, Christian Brauner wrote:
> On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> > On 15.12.23 13:31, Christian Brauner wrote:
> > > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> > >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> > >> namespace in cooperation of an attached cgroup device program. We
> > >> just need to implement the security_inode_mknod() hook for this.
> > >> In the hook, we check if the current task is guarded by a device
> > >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> > >> helper. If so, we strip out SB_I_NODEV from the super block.
> > >>
> > >> Access decisions to those device nodes are then guarded by existing
> > >> device cgroups mechanism.
> > >>
> > >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> > >> ---
> > > 
> > > I think you misunderstood me... My point was that I believe you don't
> > > need an additional LSM at all and no additional LSM hook. But I might be
> > > wrong. Only a POC would show.
> > 
> > Yeah sorry, I got your point now.
> 
> I think I might have had a misconception about how this works.
> A bpf LSM program can't easily alter a kernel object such as struct
> super_block I've been told.

Which is why you need that new hook in there. I get it now. In any case,
I think we can do this slightly nicer (for some definition of nice)...

So the thing below moves the capability check for mknod into the
security_inode_mknod() hook (This should be a separate patch.).

It moves raising SB_I_NODEV into security_sb_device_access() and the old
semantics are retained if no LSM claims device management. If an LSM
claims device management we raise the new flag and don't even raise
SB_I_NODEV. The capability check is namespace aware if device management
is claimed by an LSM. That's backward compatible. And we don't need any
sysctl.

What's missing is that all devcgroup_*() calls should be moved into a
new, unified security_device_access() hook that's called consistently in
all places where that matters such as blkdev_get_by_dev() and so on. Let
the bpf lsm implement that new hook.

Then write a sample BPF LSM as POC that this works. This would also
all other LSMs to do device management if they wanted to.

Thoughts?

From 7f4177e4f87e0b0182022f114c0287a0f0985752 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 15 Dec 2023 17:22:26 +0100
Subject: [PATCH] UNTESTED, UNCOMPILED, PROBABLY BUGGY

Signed-off-and-definitely-neither-compiled-nor-tested-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c                    |  5 -----
 fs/namespace.c                | 11 +++++++----
 fs/super.c                    |  6 ++++--
 include/linux/fs.h            |  1 +
 include/linux/lsm_hook_defs.h |  1 +
 include/linux/security.h      | 15 ++++++++++++---
 security/commoncap.c          | 19 +++++++++++++++++++
 security/security.c           | 22 ++++++++++++++++++++++
 8 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 71c13b2990b4..da481e6a696c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3959,16 +3959,11 @@ EXPORT_SYMBOL(user_path_create);
 int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	      struct dentry *dentry, umode_t mode, dev_t dev)
 {
-	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
 	int error = may_create(idmap, dir, dentry);
 
 	if (error)
 		return error;
 
-	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
-	    !capable(CAP_MKNOD))
-		return -EPERM;
-
 	if (!dir->i_op->mknod)
 		return -EPERM;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index fbf0e596fcd3..e87cc0320091 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4887,7 +4887,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 
 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
 {
-	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	unsigned long s_iflags;
 
@@ -4899,9 +4898,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
 	if (!(s_iflags & SB_I_USERNS_VISIBLE))
 		return false;
 
-	if ((s_iflags & required_iflags) != required_iflags) {
-		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
-			  required_iflags);
+	if (!(s_iflags & SB_I_NOEXEC)) {
+		WARN_ONCE(1, "Expected s_iflags to contain SB_I_NOEXEC\n");
+		return true;
+	}
+
+	if (!(s_iflags & (SB_I_NODEV | SB_I_MANAGED_DEVICES))) {
+		WARN_ONCE(1, "Expected s_iflags to contain device access mask\n");
 		return true;
 	}
 
diff --git a/fs/super.c b/fs/super.c
index 076392396e72..7b8098db17c9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -362,8 +362,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	}
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	if (s->s_user_ns != &init_user_ns)
-		s->s_iflags |= SB_I_NODEV;
+
+	if (security_sb_device_access(s))
+		goto fail;
+
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_roots);
 	mutex_init(&s->s_sync_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..6ca0fe922478 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
+#define SB_I_MANAGED_DEVICES		0x00000080
 
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 3fdd00b452ac..8c8a0d8aa71d 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -60,6 +60,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
 	 struct fs_parameter *param)
 LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
+LSM_HOOK(int, 0, sb_device_access, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
diff --git a/include/linux/security.h b/include/linux/security.h
index 00809d2d5c38..a174f8c09594 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -155,6 +155,8 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags);
+int cap_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		    dev_t dev);
 int cap_inode_removexattr(struct mnt_idmap *idmap,
 			  struct dentry *dentry, const char *name);
 int cap_inode_need_killpriv(struct dentry *dentry);
@@ -348,6 +350,7 @@ int security_inode_symlink(struct inode *dir, struct dentry *dentry,
 int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 int security_inode_rmdir(struct inode *dir, struct dentry *dentry);
 int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev);
+int security_sb_device_access(struct super_block *sb);
 int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags);
@@ -823,10 +826,16 @@ static inline int security_inode_rmdir(struct inode *dir,
 	return 0;
 }
 
-static inline int security_inode_mknod(struct inode *dir,
-					struct dentry *dentry,
-					int mode, dev_t dev)
+static inline int security_inode_mknod(struct inode *dir, struct dentry *dentry,
+				       int mode, dev_t dev)
+{
+	return cap_inode_mknod(dir, dentry, mode, dev);
+}
+
+static inline int security_sb_device_access(struct super_block *sb)
 {
+	if (s->s_user_ns != &init_user_ns)
+		sb->s_iflags |= SB_I_NODEV;
 	return 0;
 }
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 8e8c630ce204..f4a208fdf939 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1438,6 +1438,24 @@ int cap_mmap_file(struct file *file, unsigned long reqprot,
 	return 0;
 }
 
+int cap_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		    dev_t dev)
+{
+	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
+	struct super_block *sb = dir->i_sb;
+	struct user_namespace *userns;
+
+	if (dir->i_sb->s_iflags & SB_I_MANAGED_DEVICES)
+		userns = sb->s_user_ns;
+	else
+		userns = &init_user_ns;
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
+	    !ns_capable(userns, CAP_MKNOD))
+		return -EPERM;
+
+	return 0;
+}
+
 #ifdef CONFIG_SECURITY
 
 static struct security_hook_list capability_hooks[] __ro_after_init = {
@@ -1448,6 +1466,7 @@ static struct security_hook_list capability_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(capget, cap_capget),
 	LSM_HOOK_INIT(capset, cap_capset),
 	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
+	LSM_HOOK_INIT(inode_mknod, cap_inode_mknod),
 	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
 	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
 	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
diff --git a/security/security.c b/security/security.c
index 088a79c35c26..192b992f1a34 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1221,6 +1221,28 @@ int security_sb_alloc(struct super_block *sb)
 	return rc;
 }
 
+int security_sb_device_access(struct super_block *sb)
+{
+	int rc;
+
+	rc = call_int_hook(sb_device_access, 0, sb);
+	switch (rc) {
+	case 0:
+		/*
+		 * LSM doesn't do device access management and this is an
+		 * untrusted mount so block all device access.
+		 */
+		if (sb->s_user_ns != &init_user_ns)
+			sb->s_iflags |= SB_I_NODEV;
+		return 0;
+	case 1:
+		sb->s_iflags |= SB_I_MANAGED_DEVICES;
+		return 0;
+	}
+
+	return rc;
+}
+
 /**
  * security_sb_delete() - Release super_block LSM associated objects
  * @sb: filesystem superblock
Alexei Starovoitov Dec. 15, 2023, 6:08 p.m. UTC | #6
On Fri, Dec 15, 2023 at 6:15 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> > On 15.12.23 13:31, Christian Brauner wrote:
> > > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> > >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> > >> namespace in cooperation of an attached cgroup device program. We
> > >> just need to implement the security_inode_mknod() hook for this.
> > >> In the hook, we check if the current task is guarded by a device
> > >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> > >> helper. If so, we strip out SB_I_NODEV from the super block.
> > >>
> > >> Access decisions to those device nodes are then guarded by existing
> > >> device cgroups mechanism.
> > >>
> > >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> > >> ---
> > >
> > > I think you misunderstood me... My point was that I believe you don't
> > > need an additional LSM at all and no additional LSM hook. But I might be
> > > wrong. Only a POC would show.
> >
> > Yeah sorry, I got your point now.
>
> I think I might have had a misconception about how this works.
> A bpf LSM program can't easily alter a kernel object such as struct
> super_block I've been told.

Right. bpf cannot change arbitrary kernel objects,
but we can add a kfunc that will change a specific bit in a specific
data structure.
Adding a new lsm hook that does:
    rc = call_int_hook(sb_device_access, 0, sb);
    switch (rc) {
    case 0: do X
    case 1: do Y

is the same thing, but uglier, since return code will be used
to do this action.
The 'do X' can be one kfunc
and 'do Y' can be another.
If later we find out that 'do X' is not a good idea we can remove
that kfunc.
Christian Brauner Dec. 16, 2023, 10:38 a.m. UTC | #7
On Fri, Dec 15, 2023 at 10:08:08AM -0800, Alexei Starovoitov wrote:
> On Fri, Dec 15, 2023 at 6:15 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> > > On 15.12.23 13:31, Christian Brauner wrote:
> > > > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> > > >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> > > >> namespace in cooperation of an attached cgroup device program. We
> > > >> just need to implement the security_inode_mknod() hook for this.
> > > >> In the hook, we check if the current task is guarded by a device
> > > >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> > > >> helper. If so, we strip out SB_I_NODEV from the super block.
> > > >>
> > > >> Access decisions to those device nodes are then guarded by existing
> > > >> device cgroups mechanism.
> > > >>
> > > >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> > > >> ---
> > > >
> > > > I think you misunderstood me... My point was that I believe you don't
> > > > need an additional LSM at all and no additional LSM hook. But I might be
> > > > wrong. Only a POC would show.
> > >
> > > Yeah sorry, I got your point now.
> >
> > I think I might have had a misconception about how this works.
> > A bpf LSM program can't easily alter a kernel object such as struct
> > super_block I've been told.
> 
> Right. bpf cannot change arbitrary kernel objects,
> but we can add a kfunc that will change a specific bit in a specific
> data structure.
> Adding a new lsm hook that does:
>     rc = call_int_hook(sb_device_access, 0, sb);
>     switch (rc) {
>     case 0: do X
>     case 1: do Y
> 
> is the same thing, but uglier, since return code will be used
> to do this action.
> The 'do X' can be one kfunc
> and 'do Y' can be another.
> If later we find out that 'do X' is not a good idea we can remove
> that kfunc.

The reason I moved the SB_I_MANAGED_DEVICES here is that I want a single
central place where that is done for any possible LSM that wants to
implement device management. So we don't have to go chasing where that
bit is set for each LSM. I also don't want to have LSMs raise bits in
sb->s_iflags directly as that's VFS property.
Alexei Starovoitov Dec. 16, 2023, 5:41 p.m. UTC | #8
On Sat, Dec 16, 2023 at 2:38 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Dec 15, 2023 at 10:08:08AM -0800, Alexei Starovoitov wrote:
> > On Fri, Dec 15, 2023 at 6:15 AM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> > > > On 15.12.23 13:31, Christian Brauner wrote:
> > > > > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> > > > >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> > > > >> namespace in cooperation of an attached cgroup device program. We
> > > > >> just need to implement the security_inode_mknod() hook for this.
> > > > >> In the hook, we check if the current task is guarded by a device
> > > > >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> > > > >> helper. If so, we strip out SB_I_NODEV from the super block.
> > > > >>
> > > > >> Access decisions to those device nodes are then guarded by existing
> > > > >> device cgroups mechanism.
> > > > >>
> > > > >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> > > > >> ---
> > > > >
> > > > > I think you misunderstood me... My point was that I believe you don't
> > > > > need an additional LSM at all and no additional LSM hook. But I might be
> > > > > wrong. Only a POC would show.
> > > >
> > > > Yeah sorry, I got your point now.
> > >
> > > I think I might have had a misconception about how this works.
> > > A bpf LSM program can't easily alter a kernel object such as struct
> > > super_block I've been told.
> >
> > Right. bpf cannot change arbitrary kernel objects,
> > but we can add a kfunc that will change a specific bit in a specific
> > data structure.
> > Adding a new lsm hook that does:
> >     rc = call_int_hook(sb_device_access, 0, sb);
> >     switch (rc) {
> >     case 0: do X
> >     case 1: do Y
> >
> > is the same thing, but uglier, since return code will be used
> > to do this action.
> > The 'do X' can be one kfunc
> > and 'do Y' can be another.
> > If later we find out that 'do X' is not a good idea we can remove
> > that kfunc.
>
> The reason I moved the SB_I_MANAGED_DEVICES here is that I want a single
> central place where that is done for any possible LSM that wants to
> implement device management. So we don't have to go chasing where that
> bit is set for each LSM. I also don't want to have LSMs raise bits in
> sb->s_iflags directly as that's VFS property.

a kfunc that sets a bit in sb->s_iflags will be the same central place.
It will be somewhere in the fs/ directory and vfs maintainers can do what they
wish with it, including removal.
For traditional LSM one would need to do an accurate code review to make
sure that they don't mess with sb->s_iflags while for bpf_lsm it
will be done automatically. That kfunc will be that only one central place.
Christian Brauner Dec. 18, 2023, 12:30 p.m. UTC | #9
On Sat, Dec 16, 2023 at 09:41:10AM -0800, Alexei Starovoitov wrote:
> On Sat, Dec 16, 2023 at 2:38 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Fri, Dec 15, 2023 at 10:08:08AM -0800, Alexei Starovoitov wrote:
> > > On Fri, Dec 15, 2023 at 6:15 AM Christian Brauner <brauner@kernel.org> wrote:
> > > >
> > > > On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> > > > > On 15.12.23 13:31, Christian Brauner wrote:
> > > > > > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> > > > > >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> > > > > >> namespace in cooperation of an attached cgroup device program. We
> > > > > >> just need to implement the security_inode_mknod() hook for this.
> > > > > >> In the hook, we check if the current task is guarded by a device
> > > > > >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> > > > > >> helper. If so, we strip out SB_I_NODEV from the super block.
> > > > > >>
> > > > > >> Access decisions to those device nodes are then guarded by existing
> > > > > >> device cgroups mechanism.
> > > > > >>
> > > > > >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> > > > > >> ---
> > > > > >
> > > > > > I think you misunderstood me... My point was that I believe you don't
> > > > > > need an additional LSM at all and no additional LSM hook. But I might be
> > > > > > wrong. Only a POC would show.
> > > > >
> > > > > Yeah sorry, I got your point now.
> > > >
> > > > I think I might have had a misconception about how this works.
> > > > A bpf LSM program can't easily alter a kernel object such as struct
> > > > super_block I've been told.
> > >
> > > Right. bpf cannot change arbitrary kernel objects,
> > > but we can add a kfunc that will change a specific bit in a specific
> > > data structure.
> > > Adding a new lsm hook that does:
> > >     rc = call_int_hook(sb_device_access, 0, sb);
> > >     switch (rc) {
> > >     case 0: do X
> > >     case 1: do Y
> > >
> > > is the same thing, but uglier, since return code will be used
> > > to do this action.
> > > The 'do X' can be one kfunc
> > > and 'do Y' can be another.
> > > If later we find out that 'do X' is not a good idea we can remove
> > > that kfunc.
> >
> > The reason I moved the SB_I_MANAGED_DEVICES here is that I want a single
> > central place where that is done for any possible LSM that wants to
> > implement device management. So we don't have to go chasing where that
> > bit is set for each LSM. I also don't want to have LSMs raise bits in
> > sb->s_iflags directly as that's VFS property.
> 
> a kfunc that sets a bit in sb->s_iflags will be the same central place.

For the BPF LSM. I'm talking the same place for al LSMs.

> It will be somewhere in the fs/ directory and vfs maintainers can do what they
> wish with it, including removal.
> For traditional LSM one would need to do an accurate code review to make
> sure that they don't mess with sb->s_iflags while for bpf_lsm it
> will be done automatically. That kfunc will be that only one central place.

I'm not generally opposed to kfuncs ofc but here it just seems a bit
pointless. What we want is to keep SB_I_{NODEV,MANAGED_DEVICES} confined
to alloc_super(). The only central place it's raised where we control
all locking and logic. So it doesn't even have to appear in any
security_*() hooks.

diff --git a/security/security.c b/security/security.c
index 088a79c35c26..bf440d15615d 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1221,6 +1221,33 @@ int security_sb_alloc(struct super_block *sb)
 	return rc;
 }
 
+/*
+ * security_sb_device_access() - Let LSMs handle device access
+ * @sb: filesystem superblock
+ *
+ * Let an LSM take over device access management for this superblock.
+ *
+ * Return: Returns 1 if LSMs handle device access, 0 if none does and -ERRNO on
+ *         failure.
+ */
+int security_sb_device_access(struct super_block *sb)
+{
+	int thisrc;
+	int rc = LSM_RET_DEFAULT(sb_device_access);
+	struct security_hook_list *hp;
+
+	hlist_for_each_entry(hp, &security_hook_heads.sb_device_access, list) {
+		thisrc = hp->hook.sb_device_access(sb);
+		if (thisrc < 0)
+			return thisrc;
+		/* At least one LSM claimed device access management. */
+		if (thisrc == 1)
+			rc = 1;
+	}
+
+	return rc;
+}
+
 /**
  * security_sb_delete() - Release super_block LSM associated objects
  * @sb: filesystem superblock

diff --git a/fs/super.c b/fs/super.c
index 076392396e72..2295c0f76e56 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static const struct super_operations default_op;
-	int i;
+	int err, i;
 
 	if (!s)
 		return NULL;
@@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	}
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	if (s->s_user_ns != &init_user_ns)
+
+	err = security_sb_device_access(s);
+	if (err < 0)
+		goto fail;
+
+	if (err)
+		s->s_iflags |= SB_I_MANAGED_DEVICES;
+	else if (s->s_user_ns != &init_user_ns)
 		s->s_iflags |= SB_I_NODEV;
+
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_roots);
 	mutex_init(&s->s_sync_lock);
Alexander Mikhalitsyn Dec. 18, 2023, 4:09 p.m. UTC | #10
On Fri, 15 Dec 2023 17:36:12 +0100
Christian Brauner <brauner@kernel.org> wrote:

> On Fri, Dec 15, 2023 at 03:15:33PM +0100, Christian Brauner wrote:
> > On Fri, Dec 15, 2023 at 02:26:53PM +0100, Michael Weiß wrote:
> > > On 15.12.23 13:31, Christian Brauner wrote:
> > > > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> > > >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> > > >> namespace in cooperation of an attached cgroup device program. We
> > > >> just need to implement the security_inode_mknod() hook for this.
> > > >> In the hook, we check if the current task is guarded by a device
> > > >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> > > >> helper. If so, we strip out SB_I_NODEV from the super block.
> > > >>
> > > >> Access decisions to those device nodes are then guarded by existing
> > > >> device cgroups mechanism.
> > > >>
> > > >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> > > >> ---
> > > > 
> > > > I think you misunderstood me... My point was that I believe you don't
> > > > need an additional LSM at all and no additional LSM hook. But I might be
> > > > wrong. Only a POC would show.
> > > 
> > > Yeah sorry, I got your point now.
> > 
> > I think I might have had a misconception about how this works.
> > A bpf LSM program can't easily alter a kernel object such as struct
> > super_block I've been told.
> 
> Which is why you need that new hook in there. I get it now. In any case,
> I think we can do this slightly nicer (for some definition of nice)...
> 
> So the thing below moves the capability check for mknod into the
> security_inode_mknod() hook (This should be a separate patch.).
> 
> It moves raising SB_I_NODEV into security_sb_device_access() and the old
> semantics are retained if no LSM claims device management. If an LSM
> claims device management we raise the new flag and don't even raise
> SB_I_NODEV. The capability check is namespace aware if device management
> is claimed by an LSM. That's backward compatible. And we don't need any
> sysctl.
> 
> What's missing is that all devcgroup_*() calls should be moved into a
> new, unified security_device_access() hook that's called consistently in
> all places where that matters such as blkdev_get_by_dev() and so on. Let
> the bpf lsm implement that new hook.
> 
> Then write a sample BPF LSM as POC that this works. This would also
> all other LSMs to do device management if they wanted to.
> 
> Thoughts?

Dear colleagues,
Dear Christian,

As far as I understand Christian's idea is to remain mknod capability checks decoupled from the device cgroups checks.
It looks sane and less error prone.

So we want to:
- use BPF LSM sb_device_access hook to disable SB_I_NODEV raising for non-root-userns superblocks
- use BPF LSM inode_permission hook to actually filter if we want this device to be permitted or not (alternatively we can use device cgroup).

The only thing that is not clear to me about the sb_device_access hook is, what we can check inside it practically?
Yes, we have an access to struct super_block, but at this point this structure is not filled with anything useful. We only
can determine a filesystem type and that's all. It means that we can use this hook as a flag that says "ok, we do care about device permissions,
kernel, please do not set SB_I_NODEV for us". Am I correct?

Kind regards,
Alex

> 
> From 7f4177e4f87e0b0182022f114c0287a0f0985752 Mon Sep 17 00:00:00 2001
> From: Christian Brauner <brauner@kernel.org>
> Date: Fri, 15 Dec 2023 17:22:26 +0100
> Subject: [PATCH] UNTESTED, UNCOMPILED, PROBABLY BUGGY
> 
> Signed-off-and-definitely-neither-compiled-nor-tested-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/namei.c                    |  5 -----
>  fs/namespace.c                | 11 +++++++----
>  fs/super.c                    |  6 ++++--
>  include/linux/fs.h            |  1 +
>  include/linux/lsm_hook_defs.h |  1 +
>  include/linux/security.h      | 15 ++++++++++++---
>  security/commoncap.c          | 19 +++++++++++++++++++
>  security/security.c           | 22 ++++++++++++++++++++++
>  8 files changed, 66 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 71c13b2990b4..da481e6a696c 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3959,16 +3959,11 @@ EXPORT_SYMBOL(user_path_create);
>  int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
>  	      struct dentry *dentry, umode_t mode, dev_t dev)
>  {
> -	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
>  	int error = may_create(idmap, dir, dentry);
>  
>  	if (error)
>  		return error;
>  
> -	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
> -	    !capable(CAP_MKNOD))
> -		return -EPERM;
> -
>  	if (!dir->i_op->mknod)
>  		return -EPERM;
>  
> diff --git a/fs/namespace.c b/fs/namespace.c
> index fbf0e596fcd3..e87cc0320091 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4887,7 +4887,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
>  
>  static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
>  {
> -	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
>  	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
>  	unsigned long s_iflags;
>  
> @@ -4899,9 +4898,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
>  	if (!(s_iflags & SB_I_USERNS_VISIBLE))
>  		return false;
>  
> -	if ((s_iflags & required_iflags) != required_iflags) {
> -		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
> -			  required_iflags);
> +	if (!(s_iflags & SB_I_NOEXEC)) {
> +		WARN_ONCE(1, "Expected s_iflags to contain SB_I_NOEXEC\n");
> +		return true;
> +	}
> +
> +	if (!(s_iflags & (SB_I_NODEV | SB_I_MANAGED_DEVICES))) {
> +		WARN_ONCE(1, "Expected s_iflags to contain device access mask\n");
>  		return true;
>  	}
>  
> diff --git a/fs/super.c b/fs/super.c
> index 076392396e72..7b8098db17c9 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -362,8 +362,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  	}
>  	s->s_bdi = &noop_backing_dev_info;
>  	s->s_flags = flags;
> -	if (s->s_user_ns != &init_user_ns)
> -		s->s_iflags |= SB_I_NODEV;
> +
> +	if (security_sb_device_access(s))
> +		goto fail;
> +
>  	INIT_HLIST_NODE(&s->s_instances);
>  	INIT_HLIST_BL_HEAD(&s->s_roots);
>  	mutex_init(&s->s_sync_lock);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 98b7a7a8c42e..6ca0fe922478 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown);
>  #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
>  #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
>  #define SB_I_UNTRUSTED_MOUNTER		0x00000040
> +#define SB_I_MANAGED_DEVICES		0x00000080
>  
>  #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
>  #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
> index 3fdd00b452ac..8c8a0d8aa71d 100644
> --- a/include/linux/lsm_hook_defs.h
> +++ b/include/linux/lsm_hook_defs.h
> @@ -60,6 +60,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
>  LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
>  	 struct fs_parameter *param)
>  LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
> +LSM_HOOK(int, 0, sb_device_access, struct super_block *sb)
>  LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
>  LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
>  LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 00809d2d5c38..a174f8c09594 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -155,6 +155,8 @@ extern int cap_capset(struct cred *new, const struct cred *old,
>  extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
>  int cap_inode_setxattr(struct dentry *dentry, const char *name,
>  		       const void *value, size_t size, int flags);
> +int cap_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
> +		    dev_t dev);
>  int cap_inode_removexattr(struct mnt_idmap *idmap,
>  			  struct dentry *dentry, const char *name);
>  int cap_inode_need_killpriv(struct dentry *dentry);
> @@ -348,6 +350,7 @@ int security_inode_symlink(struct inode *dir, struct dentry *dentry,
>  int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
>  int security_inode_rmdir(struct inode *dir, struct dentry *dentry);
>  int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev);
> +int security_sb_device_access(struct super_block *sb);
>  int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
>  			  struct inode *new_dir, struct dentry *new_dentry,
>  			  unsigned int flags);
> @@ -823,10 +826,16 @@ static inline int security_inode_rmdir(struct inode *dir,
>  	return 0;
>  }
>  
> -static inline int security_inode_mknod(struct inode *dir,
> -					struct dentry *dentry,
> -					int mode, dev_t dev)
> +static inline int security_inode_mknod(struct inode *dir, struct dentry *dentry,
> +				       int mode, dev_t dev)
> +{
> +	return cap_inode_mknod(dir, dentry, mode, dev);
> +}
> +
> +static inline int security_sb_device_access(struct super_block *sb)
>  {
> +	if (s->s_user_ns != &init_user_ns)
> +		sb->s_iflags |= SB_I_NODEV;
>  	return 0;
>  }
>  
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 8e8c630ce204..f4a208fdf939 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -1438,6 +1438,24 @@ int cap_mmap_file(struct file *file, unsigned long reqprot,
>  	return 0;
>  }
>  
> +int cap_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
> +		    dev_t dev)
> +{
> +	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
> +	struct super_block *sb = dir->i_sb;
> +	struct user_namespace *userns;
> +
> +	if (dir->i_sb->s_iflags & SB_I_MANAGED_DEVICES)
> +		userns = sb->s_user_ns;
> +	else
> +		userns = &init_user_ns;
> +	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
> +	    !ns_capable(userns, CAP_MKNOD))
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
>  #ifdef CONFIG_SECURITY
>  
>  static struct security_hook_list capability_hooks[] __ro_after_init = {
> @@ -1448,6 +1466,7 @@ static struct security_hook_list capability_hooks[] __ro_after_init = {
>  	LSM_HOOK_INIT(capget, cap_capget),
>  	LSM_HOOK_INIT(capset, cap_capset),
>  	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
> +	LSM_HOOK_INIT(inode_mknod, cap_inode_mknod),
>  	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
>  	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
>  	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
> diff --git a/security/security.c b/security/security.c
> index 088a79c35c26..192b992f1a34 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -1221,6 +1221,28 @@ int security_sb_alloc(struct super_block *sb)
>  	return rc;
>  }
>  
> +int security_sb_device_access(struct super_block *sb)
> +{
> +	int rc;
> +
> +	rc = call_int_hook(sb_device_access, 0, sb);
> +	switch (rc) {
> +	case 0:
> +		/*
> +		 * LSM doesn't do device access management and this is an
> +		 * untrusted mount so block all device access.
> +		 */
> +		if (sb->s_user_ns != &init_user_ns)
> +			sb->s_iflags |= SB_I_NODEV;
> +		return 0;
> +	case 1:
> +		sb->s_iflags |= SB_I_MANAGED_DEVICES;
> +		return 0;
> +	}
> +
> +	return rc;
> +}
> +
>  /**
>   * security_sb_delete() - Release super_block LSM associated objects
>   * @sb: filesystem superblock
> -- 
> 2.42.0
> 
>
Alexander Mikhalitsyn Dec. 18, 2023, 4:18 p.m. UTC | #11
On Fri, 15 Dec 2023 14:26:53 +0100
Michael Weiß <michael.weiss@aisec.fraunhofer.de> wrote:

> On 15.12.23 13:31, Christian Brauner wrote:
> > On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
> >> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
> >> namespace in cooperation of an attached cgroup device program. We
> >> just need to implement the security_inode_mknod() hook for this.
> >> In the hook, we check if the current task is guarded by a device
> >> cgroup using the lately introduced cgroup_bpf_current_enabled()
> >> helper. If so, we strip out SB_I_NODEV from the super block.
> >>
> >> Access decisions to those device nodes are then guarded by existing
> >> device cgroups mechanism.
> >>
> >> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
> >> ---
> > 
> > I think you misunderstood me... My point was that I believe you don't
> > need an additional LSM at all and no additional LSM hook. But I might be
> > wrong. Only a POC would show.
> 
> Yeah sorry, I got your point now.
> 
> > 
> > Just write a bpf lsm program that strips SB_I_NODEV in the existing
> > security_sb_set_mnt_opts() call which is guranteed to be called when a
> > new superblock is created.
> 
> This does not work since SB_I_NODEV is a required_iflag in
> mount_too_revealing(). This I have already tested when writing the
> simple LSM here. So maybe we need to drop SB_I_NODEV from required_flags
> there, too. Would that be safe?
> 
> > 
> > Store your device access rules in a bpf map or in the sb->s_security
> > blob (This is where I'm fuzzy and could use a bpf LSM expert's input.).
> > 
> > Then make that bpf lsm program kick in everytime a
> > security_inode_mknod() and security_file_open() is called and do device
> > access management in there. Actually, you might need to add one hook
> > when the actual device that's about to be opened is know. 
> > This should be where today the device access hooks are called.
> > 
> > And then you should already be done with this. The only thing that you
> > need is the capable check patch.
> > 
> > You don't need that cgroup_bpf_current_enabled() per se. Device
> > management could now be done per superblock, and not per task. IOW, you
> > allowlist a bunch of devices that can be created and opened. Any task
> > that passes basic permission checks and that passes the bpf lsm program
> > may create device nodes.
> > 
> > That's a way more natural device management model than making this a per
> > cgroup thing. Though that could be implemented as well with this.
> > 
> > I would try to write a bpf lsm program that does device access
> > management with your capable() sysctl patch applied and see how far I
> > get.
> > 
> > I don't have the time otherwise I'd do it.
> I'll give it a try but no promises how fast this will go.

Hi Michael,

thanks for your work on this!

If you don't mind I'm ready to help you with writing the PoC for this bpf-based approach,
as I have touched eBPF earlier I guess I can save some your time. (I'll post it here and you will incude it
in your patch series.)

Kind regards,
Alex

>
Christian Brauner Dec. 19, 2023, 1:43 p.m. UTC | #12
> The only thing that is not clear to me about the sb_device_access hook is, what we can check inside it practically?
> Yes, we have an access to struct super_block, but at this point this structure is not filled with anything useful. We only
> can determine a filesystem type and that's all. It means that we can use this hook as a flag that says "ok, we do care about device permissions,
> kernel, please do not set SB_I_NODEV for us". Am I correct?

What the the LSM needs to definitely know is what filesystem type and
what user namespace are relevant. Because this whole thing is mostly
interesting for the != init_user_ns case here.

And both things are already present at that point in time (Technically,
kernfs stuff can be a bit different but kernfs stuff does have
SB_I_NODEV unconditionally so it really doesn't matter.).The thing is
though that you want device access settled as soon as possible when the
superblock isn't yet exposed anywhere. And for that alloc_super() is
pretty convenient. Then you don't have to put much thought into it.

But we can always move the hook to another place. It's also feasible to
do this in vfs_get_tree() for example and provide the fs_context but
again. I don't see why we need to do this now.
Michael Weiß Dec. 20, 2023, 7:44 p.m. UTC | #13
On 18.12.23 17:18, Alexander Mikhalitsyn wrote:
> On Fri, 15 Dec 2023 14:26:53 +0100
> Michael Weiß <michael.weiss@aisec.fraunhofer.de> wrote:
> 
>> On 15.12.23 13:31, Christian Brauner wrote:
>>> On Wed, Dec 13, 2023 at 03:38:13PM +0100, Michael Weiß wrote:
>>>> devguard is a simple LSM to allow CAP_MKNOD in non-initial user
>>>> namespace in cooperation of an attached cgroup device program. We
>>>> just need to implement the security_inode_mknod() hook for this.
>>>> In the hook, we check if the current task is guarded by a device
>>>> cgroup using the lately introduced cgroup_bpf_current_enabled()
>>>> helper. If so, we strip out SB_I_NODEV from the super block.
>>>>
>>>> Access decisions to those device nodes are then guarded by existing
>>>> device cgroups mechanism.
>>>>
>>>> Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
>>>> ---
>>>
>>> I think you misunderstood me... My point was that I believe you don't
>>> need an additional LSM at all and no additional LSM hook. But I might be
>>> wrong. Only a POC would show.
>>
>> Yeah sorry, I got your point now.
>>
>>>
>>> Just write a bpf lsm program that strips SB_I_NODEV in the existing
>>> security_sb_set_mnt_opts() call which is guranteed to be called when a
>>> new superblock is created.
>>
>> This does not work since SB_I_NODEV is a required_iflag in
>> mount_too_revealing(). This I have already tested when writing the
>> simple LSM here. So maybe we need to drop SB_I_NODEV from required_flags
>> there, too. Would that be safe?
>>
>>>
>>> Store your device access rules in a bpf map or in the sb->s_security
>>> blob (This is where I'm fuzzy and could use a bpf LSM expert's input.).
>>>
>>> Then make that bpf lsm program kick in everytime a
>>> security_inode_mknod() and security_file_open() is called and do device
>>> access management in there. Actually, you might need to add one hook
>>> when the actual device that's about to be opened is know. 
>>> This should be where today the device access hooks are called.
>>>
>>> And then you should already be done with this. The only thing that you
>>> need is the capable check patch.
>>>
>>> You don't need that cgroup_bpf_current_enabled() per se. Device
>>> management could now be done per superblock, and not per task. IOW, you
>>> allowlist a bunch of devices that can be created and opened. Any task
>>> that passes basic permission checks and that passes the bpf lsm program
>>> may create device nodes.
>>>
>>> That's a way more natural device management model than making this a per
>>> cgroup thing. Though that could be implemented as well with this.
>>>
>>> I would try to write a bpf lsm program that does device access
>>> management with your capable() sysctl patch applied and see how far I
>>> get.
>>>
>>> I don't have the time otherwise I'd do it.
>> I'll give it a try but no promises how fast this will go.
> 
> Hi Michael,
> 
> thanks for your work on this!
> 
> If you don't mind I'm ready to help you with writing the PoC for this bpf-based approach,
> as I have touched eBPF earlier I guess I can save some your time. (I'll post it here and you will incude it
> in your patch series.)

Yeah for sure. This would be very helpful thanks.
I'll start to sort Christians patches of this thread and get the missing security
hook for the remaining checks lined up from v2 then.

> 
> Kind regards,
> Alex
> 
>>
> 
>
Paul Moore Dec. 22, 2023, 11:39 p.m. UTC | #14
On Mon, Dec 18, 2023 at 7:30 AM Christian Brauner <brauner@kernel.org> wrote:
> I'm not generally opposed to kfuncs ofc but here it just seems a bit
> pointless. What we want is to keep SB_I_{NODEV,MANAGED_DEVICES} confined
> to alloc_super(). The only central place it's raised where we control
> all locking and logic. So it doesn't even have to appear in any
> security_*() hooks.
>
> diff --git a/security/security.c b/security/security.c
> index 088a79c35c26..bf440d15615d 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -1221,6 +1221,33 @@ int security_sb_alloc(struct super_block *sb)
>         return rc;
>  }
>
> +/*
> + * security_sb_device_access() - Let LSMs handle device access
> + * @sb: filesystem superblock
> + *
> + * Let an LSM take over device access management for this superblock.
> + *
> + * Return: Returns 1 if LSMs handle device access, 0 if none does and -ERRNO on
> + *         failure.
> + */
> +int security_sb_device_access(struct super_block *sb)
> +{
> +       int thisrc;
> +       int rc = LSM_RET_DEFAULT(sb_device_access);
> +       struct security_hook_list *hp;
> +
> +       hlist_for_each_entry(hp, &security_hook_heads.sb_device_access, list) {
> +               thisrc = hp->hook.sb_device_access(sb);
> +               if (thisrc < 0)
> +                       return thisrc;
> +               /* At least one LSM claimed device access management. */
> +               if (thisrc == 1)
> +                       rc = 1;
> +       }
> +
> +       return rc;
> +}

I worry that this hook, and the way it is plumbed into alloc_super()
below, brings us back to the problem of authoritative LSM hooks which
is something I can't support at this point in time.  The same can be
said for a LSM directly flipping bits in the superblock struct.

The LSM should not grant any additional privilege, either directly in
the LSM code, or indirectly via the caller; the LSM should only
restrict operations which would have otherwise been allowed.

The LSM should also refrain from modifying any kernel data structures
that do not belong directly to the LSM.  A LSM caller may modify
kernel data structures that it owns based on the result of the LSM
hook, so long as those modifications do not grant additional privilege
as described above.

> diff --git a/fs/super.c b/fs/super.c
> index 076392396e72..2295c0f76e56 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  {
>         struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
>         static const struct super_operations default_op;
> -       int i;
> +       int err, i;
>
>         if (!s)
>                 return NULL;
> @@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>         }
>         s->s_bdi = &noop_backing_dev_info;
>         s->s_flags = flags;
> -       if (s->s_user_ns != &init_user_ns)
> +
> +       err = security_sb_device_access(s);
> +       if (err < 0)
> +               goto fail;
> +
> +       if (err)
> +               s->s_iflags |= SB_I_MANAGED_DEVICES;
> +       else if (s->s_user_ns != &init_user_ns)
>                 s->s_iflags |= SB_I_NODEV;
> +
>         INIT_HLIST_NODE(&s->s_instances);
>         INIT_HLIST_BL_HEAD(&s->s_roots);
>         mutex_init(&s->s_sync_lock);
Michael Weiß Dec. 27, 2023, 2:31 p.m. UTC | #15
On 23.12.23 00:39, Paul Moore wrote:
> On Mon, Dec 18, 2023 at 7:30 AM Christian Brauner <brauner@kernel.org> wrote:
>> I'm not generally opposed to kfuncs ofc but here it just seems a bit
>> pointless. What we want is to keep SB_I_{NODEV,MANAGED_DEVICES} confined
>> to alloc_super(). The only central place it's raised where we control
>> all locking and logic. So it doesn't even have to appear in any
>> security_*() hooks.
>>
>> diff --git a/security/security.c b/security/security.c
>> index 088a79c35c26..bf440d15615d 100644
>> --- a/security/security.c
>> +++ b/security/security.c
>> @@ -1221,6 +1221,33 @@ int security_sb_alloc(struct super_block *sb)
>>         return rc;
>>  }
>>
>> +/*
>> + * security_sb_device_access() - Let LSMs handle device access
>> + * @sb: filesystem superblock
>> + *
>> + * Let an LSM take over device access management for this superblock.
>> + *
>> + * Return: Returns 1 if LSMs handle device access, 0 if none does and -ERRNO on
>> + *         failure.
>> + */
>> +int security_sb_device_access(struct super_block *sb)
>> +{
>> +       int thisrc;
>> +       int rc = LSM_RET_DEFAULT(sb_device_access);
>> +       struct security_hook_list *hp;
>> +
>> +       hlist_for_each_entry(hp, &security_hook_heads.sb_device_access, list) {
>> +               thisrc = hp->hook.sb_device_access(sb);
>> +               if (thisrc < 0)
>> +                       return thisrc;
>> +               /* At least one LSM claimed device access management. */
>> +               if (thisrc == 1)
>> +                       rc = 1;
>> +       }
>> +
>> +       return rc;
>> +}
> 
> I worry that this hook, and the way it is plumbed into alloc_super()
> below, brings us back to the problem of authoritative LSM hooks which
> is something I can't support at this point in time.  The same can be
> said for a LSM directly flipping bits in the superblock struct.
> 
> The LSM should not grant any additional privilege, either directly in
> the LSM code, or indirectly via the caller; the LSM should only
> restrict operations which would have otherwise been allowed.
> 
> The LSM should also refrain from modifying any kernel data structures
> that do not belong directly to the LSM.  A LSM caller may modify
> kernel data structures that it owns based on the result of the LSM
> hook, so long as those modifications do not grant additional privilege
> as described above.

Hi Paul, what would you think about if we do it as shown in the
patch below (untested)?

I have adapted Christians patch slightly in a way that we do let
all LSMs agree on if device access management should be done or not.
Similar to the security_task_prctl() hook.

The new default behavior in alloc_super() is that the
SB_I_MANANGED_DEVICES flag is set if no error is returned by the
security hook, otherwise the old semantic which raises SB_I_NODEV
is used if an LSM does not agree on device management for the
superblock.

So a LSM can only be used to restrict access to the superblock
but not to do more privileged operations, since the
MANAGED_DEVICES would be allowed by default.

The LSM default value is set to -EOPNOSUPP to decide if an LSM has
implemented the hook inside of fs/super.c for backward compatibility.

> 
>> diff --git a/fs/super.c b/fs/super.c
>> index 076392396e72..2295c0f76e56 100644
>> --- a/fs/super.c
>> +++ b/fs/super.c
>> @@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>>  {
>>         struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
>>         static const struct super_operations default_op;
>> -       int i;
>> +       int err, i;
>>
>>         if (!s)
>>                 return NULL;
>> @@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>>         }
>>         s->s_bdi = &noop_backing_dev_info;
>>         s->s_flags = flags;
>> -       if (s->s_user_ns != &init_user_ns)
>> +
>> +       err = security_sb_device_access(s);
>> +       if (err < 0)
>> +               goto fail;
>> +
>> +       if (err)
>> +               s->s_iflags |= SB_I_MANAGED_DEVICES;
>> +       else if (s->s_user_ns != &init_user_ns)
>>                 s->s_iflags |= SB_I_NODEV;
>> +
>>         INIT_HLIST_NODE(&s->s_instances);
>>         INIT_HLIST_BL_HEAD(&s->s_roots);
>>         mutex_init(&s->s_sync_lock);
> 
---
 fs/namespace.c                | 11 +++++++----
 fs/super.c                    | 12 ++++++++++--
 include/linux/fs.h            |  1 +
 include/linux/lsm_hook_defs.h |  1 +
 include/linux/security.h      |  6 ++++++
 security/security.c           | 26 ++++++++++++++++++++++++++
 6 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index fbf0e596fcd3..e87cc0320091 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4887,7 +4887,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 
 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
 {
-	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	unsigned long s_iflags;
 
@@ -4899,9 +4898,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
 	if (!(s_iflags & SB_I_USERNS_VISIBLE))
 		return false;
 
-	if ((s_iflags & required_iflags) != required_iflags) {
-		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
-			  required_iflags);
+	if (!(s_iflags & SB_I_NOEXEC)) {
+		WARN_ONCE(1, "Expected s_iflags to contain SB_I_NOEXEC\n");
+		return true;
+	}
+
+	if (!(s_iflags & (SB_I_NODEV | SB_I_MANAGED_DEVICES))) {
+		WARN_ONCE(1, "Expected s_iflags to contain device access mask\n");
 		return true;
 	}
 
diff --git a/fs/super.c b/fs/super.c
index 076392396e72..6510168d51ce 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static const struct super_operations default_op;
-	int i;
+	int i, err;
 
 	if (!s)
 		return NULL;
@@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	}
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	if (s->s_user_ns != &init_user_ns)
+
+	err = security_sb_device_access(s);
+	if (err < 0 && err != -EOPNOTSUPP)
+		goto fail;
+
+	if (err && s->s_user_ns != &init_user_ns)
 		s->s_iflags |= SB_I_NODEV;
+	else
+		s->s_iflags |= SB_I_MANAGED_DEVICES;
+
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_roots);
 	mutex_init(&s->s_sync_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..6ca0fe922478 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
+#define SB_I_MANAGED_DEVICES		0x00000080
 
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..1dc13b7e9a4a 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -60,6 +60,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
 	 struct fs_parameter *param)
 LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
+LSM_HOOK(int, -EOPNOTSUPP, sb_device_access, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1d1df326c881..79f2b201c7bd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -298,6 +298,7 @@ int security_fs_context_submount(struct fs_context *fc, struct super_block *refe
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
 int security_sb_alloc(struct super_block *sb);
+int security_sb_device_access(struct super_block *sb);
 void security_sb_delete(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
 void security_free_mnt_opts(void **mnt_opts);
@@ -652,6 +653,11 @@ static inline int security_sb_alloc(struct super_block *sb)
 	return 0;
 }
 
+static inline int security_sb_device_access(struct super_block *sb)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void security_sb_delete(struct super_block *sb)
 { }
 
diff --git a/security/security.c b/security/security.c
index dcb3e7014f9b..054ee19edef0 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1221,6 +1221,32 @@ int security_sb_alloc(struct super_block *sb)
 	return rc;
 }
 
+/*
+ * security_sb_device_access() - handle device access on this sb
+ * @sb: filesystem superblock
+ *
+ * Let LSMs decide if device access management is allowed for this superblock.
+ *
+ * Return: Returns 0 if LSMs handle device access, -EOPNOTSUPP if none does and
+ * 	   -ERRNO on any other failure.
+ */
+int security_sb_device_access(struct super_block *sb)
+{
+	int thisrc;
+	int rc = LSM_RET_DEFAULT(sb_device_access);
+	struct security_hook_list *hp;
+
+	hlist_for_each_entry(hp, &security_hook_heads.sb_device_access, list) {
+		thisrc = hp->hook.sb_device_access(sb);
+		if (thisrc != LSM_RET_DEFAULT(sb_device_access)) {
+			rc = thisrc;
+			if (thisrc != 0)
+				break;
+		}
+	}
+	return rc;
+}
+
 /**
  * security_sb_delete() - Release super_block LSM associated objects
  * @sb: filesystem superblock
--
Paul Moore Dec. 29, 2023, 10:31 p.m. UTC | #16
On Wed, Dec 27, 2023 at 9:31 AM Michael Weiß
<michael.weiss@aisec.fraunhofer.de> wrote:
> Hi Paul, what would you think about if we do it as shown in the
> patch below (untested)?
>
> I have adapted Christians patch slightly in a way that we do let
> all LSMs agree on if device access management should be done or not.
> Similar to the security_task_prctl() hook.

I think it's worth taking a minute to talk about this proposed change
and the existing security_task_prctl() hook, as there is an important
difference between the two which is the source of my concern.

If you look at the prctl() syscall implementation, right at the top of
the function you see the LSM hook:

  SYSCALL_DEFINE(prctl, ...)
  {
    ...

    error = security_task_prctl(...);
    if (error != -ENOSYS)
      return error;

    error = 0;

    ....
  }

While it is true that the LSM hook returns a "special" value, -ENOSYS,
from a practical perspective this is not significantly different from
the much more common zero value used to indicate no restriction from
the LSM layer.  However, the more important thing to note is that the
return value from security_task_prctl() does not influence any other
access controls in the caller outside of those implemented inside the
LSM; in fact the error code is reset to zero immediately after the LSM
hook.

More on this below ...

> diff --git a/fs/super.c b/fs/super.c
> index 076392396e72..6510168d51ce 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  {
>         struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
>         static const struct super_operations default_op;
> -       int i;
> +       int i, err;
>
>         if (!s)
>                 return NULL;
> @@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>         }
>         s->s_bdi = &noop_backing_dev_info;
>         s->s_flags = flags;
> -       if (s->s_user_ns != &init_user_ns)
> +
> +       err = security_sb_device_access(s);
> +       if (err < 0 && err != -EOPNOTSUPP)
> +               goto fail;
> +
> +       if (err && s->s_user_ns != &init_user_ns)
>                 s->s_iflags |= SB_I_NODEV;
> +       else
> +               s->s_iflags |= SB_I_MANAGED_DEVICES;

This is my concern, depending on what the LSM hook returns, the
superblock's flags are set differently, affecting much more than just
a LSM-based security mechanism.

LSMs should not be able to undermine, shortcut, or otherwise bypass
access controls built into other parts of the kernel.  In other words,
a LSM should only ever be able to deny an operation, it should not be
able to permit an operation that otherwise would have been denied.

>         INIT_HLIST_NODE(&s->s_instances);
>         INIT_HLIST_BL_HEAD(&s->s_roots);
>         mutex_init(&s->s_sync_lock);
Michael Weiß Jan. 8, 2024, 1:44 p.m. UTC | #17
On 29.12.23 23:31, Paul Moore wrote:
> On Wed, Dec 27, 2023 at 9:31 AM Michael Weiß
> <michael.weiss@aisec.fraunhofer.de> wrote:
>> Hi Paul, what would you think about if we do it as shown in the
>> patch below (untested)?
>>
>> I have adapted Christians patch slightly in a way that we do let
>> all LSMs agree on if device access management should be done or not.
>> Similar to the security_task_prctl() hook.
> 
> I think it's worth taking a minute to talk about this proposed change
> and the existing security_task_prctl() hook, as there is an important
> difference between the two which is the source of my concern.
> 
> If you look at the prctl() syscall implementation, right at the top of
> the function you see the LSM hook:
> 
>   SYSCALL_DEFINE(prctl, ...)
>   {
>     ...
> 
>     error = security_task_prctl(...);
>     if (error != -ENOSYS)
>       return error;
> 
>     error = 0;
> 
>     ....
>   }
> 
> While it is true that the LSM hook returns a "special" value, -ENOSYS,
> from a practical perspective this is not significantly different from
> the much more common zero value used to indicate no restriction from
> the LSM layer.  However, the more important thing to note is that the
> return value from security_task_prctl() does not influence any other
> access controls in the caller outside of those implemented inside the
> LSM; in fact the error code is reset to zero immediately after the LSM
> hook.
> 
> More on this below ...
> 
>> diff --git a/fs/super.c b/fs/super.c
>> index 076392396e72..6510168d51ce 100644
>> --- a/fs/super.c
>> +++ b/fs/super.c
>> @@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>>  {
>>         struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
>>         static const struct super_operations default_op;
>> -       int i;
>> +       int i, err;
>>
>>         if (!s)
>>                 return NULL;
>> @@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>>         }
>>         s->s_bdi = &noop_backing_dev_info;
>>         s->s_flags = flags;
>> -       if (s->s_user_ns != &init_user_ns)
>> +
>> +       err = security_sb_device_access(s);
>> +       if (err < 0 && err != -EOPNOTSUPP)
>> +               goto fail;
>> +
>> +       if (err && s->s_user_ns != &init_user_ns)
>>                 s->s_iflags |= SB_I_NODEV;
>> +       else
>> +               s->s_iflags |= SB_I_MANAGED_DEVICES;
> 
> This is my concern, depending on what the LSM hook returns, the
> superblock's flags are set differently, affecting much more than just
> a LSM-based security mechanism.
> 
> LSMs should not be able to undermine, shortcut, or otherwise bypass
> access controls built into other parts of the kernel.  In other words,
> a LSM should only ever be able to deny an operation, it should not be
> able to permit an operation that otherwise would have been denied.

Hmm, OK. Then I can't see to come here any further as we would directly
or indirectly set the superblock flags based on if a security hook is
implemented or not, which I understand now is against LSM architecture.
Thanks Paul for clarification.

Christian, what do you think? 
Maybe we just set the SB_I_NODEV and SB_I_MANGED_DEVICES flag based on
a sysctl at the same place for backward compatibility,
drop the additional security hook and keep the rest as is from your
proposal:

	if (sysctl_managed_devices)
		s->s_iflags |= SB_I_MANAGED_DEVICES;
	else if (s->s_user_ns != &init_user_ns)
		s->s_iflags |= SB_I_NODEV;

A device access managing LSM can then just implement the current
security_sb_alloc() hook to deny creating the super block at all.

> 
>>         INIT_HLIST_NODE(&s->s_instances);
>>         INIT_HLIST_BL_HEAD(&s->s_roots);
>>         mutex_init(&s->s_sync_lock);
>
Paul Moore Jan. 8, 2024, 4:34 p.m. UTC | #18
On Mon, Jan 8, 2024 at 8:45 AM Michael Weiß
<michael.weiss@aisec.fraunhofer.de> wrote:
> On 29.12.23 23:31, Paul Moore wrote:
> > On Wed, Dec 27, 2023 at 9:31 AM Michael Weiß
> > <michael.weiss@aisec.fraunhofer.de> wrote:
> >> Hi Paul, what would you think about if we do it as shown in the
> >> patch below (untested)?
> >>
> >> I have adapted Christians patch slightly in a way that we do let
> >> all LSMs agree on if device access management should be done or not.
> >> Similar to the security_task_prctl() hook.
> >
> > I think it's worth taking a minute to talk about this proposed change
> > and the existing security_task_prctl() hook, as there is an important
> > difference between the two which is the source of my concern.
> >
> > If you look at the prctl() syscall implementation, right at the top of
> > the function you see the LSM hook:
> >
> >   SYSCALL_DEFINE(prctl, ...)
> >   {
> >     ...
> >
> >     error = security_task_prctl(...);
> >     if (error != -ENOSYS)
> >       return error;
> >
> >     error = 0;
> >
> >     ....
> >   }
> >
> > While it is true that the LSM hook returns a "special" value, -ENOSYS,
> > from a practical perspective this is not significantly different from
> > the much more common zero value used to indicate no restriction from
> > the LSM layer.  However, the more important thing to note is that the
> > return value from security_task_prctl() does not influence any other
> > access controls in the caller outside of those implemented inside the
> > LSM; in fact the error code is reset to zero immediately after the LSM
> > hook.
> >
> > More on this below ...
> >
> >> diff --git a/fs/super.c b/fs/super.c
> >> index 076392396e72..6510168d51ce 100644
> >> --- a/fs/super.c
> >> +++ b/fs/super.c
> >> @@ -325,7 +325,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
> >>  {
> >>         struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
> >>         static const struct super_operations default_op;
> >> -       int i;
> >> +       int i, err;
> >>
> >>         if (!s)
> >>                 return NULL;
> >> @@ -362,8 +362,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
> >>         }
> >>         s->s_bdi = &noop_backing_dev_info;
> >>         s->s_flags = flags;
> >> -       if (s->s_user_ns != &init_user_ns)
> >> +
> >> +       err = security_sb_device_access(s);
> >> +       if (err < 0 && err != -EOPNOTSUPP)
> >> +               goto fail;
> >> +
> >> +       if (err && s->s_user_ns != &init_user_ns)
> >>                 s->s_iflags |= SB_I_NODEV;
> >> +       else
> >> +               s->s_iflags |= SB_I_MANAGED_DEVICES;
> >
> > This is my concern, depending on what the LSM hook returns, the
> > superblock's flags are set differently, affecting much more than just
> > a LSM-based security mechanism.
> >
> > LSMs should not be able to undermine, shortcut, or otherwise bypass
> > access controls built into other parts of the kernel.  In other words,
> > a LSM should only ever be able to deny an operation, it should not be
> > able to permit an operation that otherwise would have been denied.
>
> Hmm, OK. Then I can't see to come here any further as we would directly
> or indirectly set the superblock flags based on if a security hook is
> implemented or not, which I understand now is against LSM architecture.
> Thanks Paul for clarification.

No worries, thank you for posting to the LSM list for review and
consideration.  While it may take me a while to review something
(there always appears to be a backlog), I'm always happy to review
patches in this area and work with folks to find a solution.
diff mbox series

Patch

diff --git a/security/Kconfig b/security/Kconfig
index 52c9af08ad35..7ec4017745d4 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -194,6 +194,7 @@  source "security/yama/Kconfig"
 source "security/safesetid/Kconfig"
 source "security/lockdown/Kconfig"
 source "security/landlock/Kconfig"
+source "security/devguard/Kconfig"
 
 source "security/integrity/Kconfig"
 
@@ -233,11 +234,11 @@  endchoice
 
 config LSM
 	string "Ordered list of enabled LSMs"
-	default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK
-	default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR
-	default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO
-	default "landlock,lockdown,yama,loadpin,safesetid,bpf" if DEFAULT_SECURITY_DAC
-	default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf"
+	default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf,devguard" if DEFAULT_SECURITY_SMACK
+	default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf,devguard" if DEFAULT_SECURITY_APPARMOR
+	default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf,devguard" if DEFAULT_SECURITY_TOMOYO
+	default "landlock,lockdown,yama,loadpin,safesetid,bpf,devguard" if DEFAULT_SECURITY_DAC
+	default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf,devguard"
 	help
 	  A comma-separated list of LSMs, in initialization order.
 	  Any LSMs left off this list, except for those with order
diff --git a/security/Makefile b/security/Makefile
index 18121f8f85cd..82a0d8cab3c3 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -24,6 +24,7 @@  obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown/
 obj-$(CONFIG_CGROUPS)			+= device_cgroup.o
 obj-$(CONFIG_BPF_LSM)			+= bpf/
 obj-$(CONFIG_SECURITY_LANDLOCK)		+= landlock/
+obj-$(CONFIG_SECURITY_DEVGUARD)		+= devguard/
 
 # Object integrity file lists
 obj-$(CONFIG_INTEGRITY)			+= integrity/
diff --git a/security/devguard/Kconfig b/security/devguard/Kconfig
new file mode 100644
index 000000000000..592684615a8f
--- /dev/null
+++ b/security/devguard/Kconfig
@@ -0,0 +1,12 @@ 
+config SECURITY_DEVGUARD
+	bool "Devguard for device node creation"
+	depends on SECURITY
+	depends on CGROUP_BPF
+	default n
+	help
+	  This enables devguard, an LSM that allows to guard device node
+	  creation in non-initial user namespace. It may allow mknod
+	  in cooperation of an attached cgroup device program.
+	  This security module stacks with other LSMs.
+
+	  If you are unsure how to answer this question, answer N.
diff --git a/security/devguard/Makefile b/security/devguard/Makefile
new file mode 100644
index 000000000000..fdaff8dc2fea
--- /dev/null
+++ b/security/devguard/Makefile
@@ -0,0 +1,2 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SECURITY_DEVGUARD) += devguard.o
diff --git a/security/devguard/devguard.c b/security/devguard/devguard.c
new file mode 100644
index 000000000000..3a0c9c27a691
--- /dev/null
+++ b/security/devguard/devguard.c
@@ -0,0 +1,44 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device guard security module
+ *
+ * Simple in-kernel LSM to allow cap_mknod in non-initial
+ * user namespace if current task is guarded by device cgroup.
+ *
+ * Copyright (C) 2023 Fraunhofer AISEC. All rights reserved.
+ *
+ * Authors: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
+ */
+
+#include <linux/bpf-cgroup.h>
+#include <linux/lsm_hooks.h>
+
+static int devguard_inode_mknod(struct inode *dir, struct dentry *dentry,
+				umode_t mode, dev_t dev)
+{
+	if (dentry->d_sb->s_iflags & ~SB_I_NODEV)
+		return 0;
+
+	// strip SB_I_NODEV on super block if device cgroup is active
+	if (cgroup_bpf_current_enabled(CGROUP_DEVICE))
+		dentry->d_sb->s_iflags &= ~SB_I_NODEV;
+
+	return 0;
+}
+
+static struct security_hook_list devguard_hooks[] __ro_after_init = {
+	LSM_HOOK_INIT(inode_mknod, devguard_inode_mknod),
+};
+
+static int __init devguard_init(void)
+{
+	security_add_hooks(devguard_hooks, ARRAY_SIZE(devguard_hooks),
+			   "devguard");
+	pr_info("devguard: initialized\n");
+	return 0;
+}
+
+DEFINE_LSM(devguard) = {
+	.name = "devguard",
+	.init = devguard_init,
+};