diff mbox series

[3/3] capabilities: add cap userns sysctl mask

Message ID 20240516092213.6799-4-jcalmels@3xx0.net (mailing list archive)
State New
Delegated to: Paul Moore
Headers show
Series Introduce user namespace capabilities | expand

Commit Message

Jonathan Calmels May 16, 2024, 9:22 a.m. UTC
This patch adds a new system-wide userns capability mask designed to mask
off capabilities in user namespaces.

This mask is controlled through a sysctl and can be set early in the boot
process or on the kernel command line to exclude known capabilities from
ever being gained in namespaces. Once set, it can be further restricted to
exert dynamic policies on the system (e.g. ward off a potential exploit).

Changing this mask requires privileges over CAP_SYS_ADMIN and CAP_SETPCAP
in the initial user namespace.

Example:

    # sysctl -qw kernel.cap_userns_mask=0x1fffffdffff && \
      unshare -r grep Cap /proc/self/status
    CapInh: 0000000000000000
    CapPrm: 000001fffffdffff
    CapEff: 000001fffffdffff
    CapBnd: 000001fffffdffff
    CapAmb: 0000000000000000
    CapUNs: 000001fffffdffff

Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>
---
 include/linux/user_namespace.h |  7 ++++
 kernel/sysctl.c                | 10 ++++++
 kernel/user_namespace.c        | 66 ++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)

Comments

Jarkko Sakkinen May 16, 2024, 12:44 p.m. UTC | #1
On Thu May 16, 2024 at 12:22 PM EEST, Jonathan Calmels wrote:
> This patch adds a new system-wide userns capability mask designed to mask
> off capabilities in user namespaces.
>
> This mask is controlled through a sysctl and can be set early in the boot
> process or on the kernel command line to exclude known capabilities from
> ever being gained in namespaces. Once set, it can be further restricted to
> exert dynamic policies on the system (e.g. ward off a potential exploit).
>
> Changing this mask requires privileges over CAP_SYS_ADMIN and CAP_SETPCAP
> in the initial user namespace.
>
> Example:
>
>     # sysctl -qw kernel.cap_userns_mask=0x1fffffdffff && \
>       unshare -r grep Cap /proc/self/status
>     CapInh: 0000000000000000
>     CapPrm: 000001fffffdffff
>     CapEff: 000001fffffdffff
>     CapBnd: 000001fffffdffff
>     CapAmb: 0000000000000000
>     CapUNs: 000001fffffdffff
>
> Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>
> ---
>  include/linux/user_namespace.h |  7 ++++
>  kernel/sysctl.c                | 10 ++++++
>  kernel/user_namespace.c        | 66 ++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
>
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 6030a8235617..e3478bd54ee5 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_USER_NAMESPACE_H
>  #define _LINUX_USER_NAMESPACE_H
>  
> +#include <linux/capability.h>
>  #include <linux/kref.h>
>  #include <linux/nsproxy.h>
>  #include <linux/ns_common.h>
> @@ -14,6 +15,12 @@
>  #define UID_GID_MAP_MAX_BASE_EXTENTS 5
>  #define UID_GID_MAP_MAX_EXTENTS 340
>  
> +#ifdef CONFIG_SYSCTL
> +extern kernel_cap_t cap_userns_mask;
> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos);
> +#endif
> +
>  struct uid_gid_extent {
>  	u32 first;
>  	u32 lower_first;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 81cc974913bb..1546eebd6aea 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -62,6 +62,7 @@
>  #include <linux/sched/sysctl.h>
>  #include <linux/mount.h>
>  #include <linux/userfaultfd_k.h>
> +#include <linux/user_namespace.h>
>  #include <linux/pid.h>
>  
>  #include "../lib/kstrtox.h"
> @@ -1846,6 +1847,15 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0444,
>  		.proc_handler	= proc_dointvec,
>  	},
> +#ifdef CONFIG_USER_NS
> +	{
> +		.procname	= "cap_userns_mask",
> +		.data		= &cap_userns_mask,
> +		.maxlen		= sizeof(kernel_cap_t),
> +		.mode		= 0644,
> +		.proc_handler	= proc_cap_userns_handler,
> +	},
> +#endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
>  	{
>  		.procname       = "unknown_nmi_panic",
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 53848e2b68cd..e0cf606e9140 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -26,6 +26,66 @@
>  static struct kmem_cache *user_ns_cachep __ro_after_init;
>  static DEFINE_MUTEX(userns_state_mutex);
>  
> +#ifdef CONFIG_SYSCTL
> +static DEFINE_SPINLOCK(cap_userns_lock);

Generally new global or file-local locks are better to have a comment
that describes their use.

> +kernel_cap_t cap_userns_mask = CAP_FULL_SET;
> +

Non-static symbol should have appropriate kdoc with alll arguments
and return values documented.

> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	unsigned long mask_array[2];
> +	kernel_cap_t new_mask, *mask;
> +	int err;
> +
> +	if (write && (!capable(CAP_SETPCAP) ||
> +		      !capable(CAP_SYS_ADMIN)))
> +		return -EPERM;
> +
> +	/*
> +	 * convert from the global kernel_cap_t to the ulong array to print to
> +	 * userspace if this is a read.
> +	 *
> +	 * capabilities are exposed as one 64-bit value or two 32-bit values
> +	 * depending on the architecture
> +	 */
> +	mask = table->data;
> +	spin_lock(&cap_userns_lock);
> +	mask_array[0] = (unsigned long) mask->val;
> +#if BITS_PER_LONG != 64
> +	mask_array[1] = mask->val >> BITS_PER_LONG;
> +#endif

Why not just "if (BITS_PER_LONG != 64)"?

Compiler will do its job here.

> +	spin_unlock(&cap_userns_lock);
> +
> +	t = *table;
> +	t.data = &mask_array;
> +
> +	/*
> +	 * actually read or write and array of ulongs from userspace.  Remember
> +	 * these are least significant bits first
> +	 */
> +	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0)
> +		return err;
> +
> +	new_mask.val = mask_array[0];
> +#if BITS_PER_LONG != 64
> +	new_mask.val += (u64)mask_array[1] << BITS_PER_LONG;
> +#endif

Ditto.

> +
> +	/*
> +	 * Drop everything not in the new_mask (but don't add things)
> +	 */
> +	if (write) {
> +		spin_lock(&cap_userns_lock);
> +		*mask = cap_intersect(*mask, new_mask);
> +		spin_unlock(&cap_userns_lock);
> +	}
> +
> +	return 0;
> +}
> +#endif
> +
>  static bool new_idmap_permitted(const struct file *file,
>  				struct user_namespace *ns, int cap_setid,
>  				struct uid_gid_map *map);
> @@ -46,6 +106,12 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
>  	/* Limit userns capabilities to our parent's bounding set. */
>  	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
>  		cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
> +#ifdef CONFIG_SYSCTL
> +	/* Mask off userns capabilities that are not permitted by the system-wide mask. */
> +	spin_lock(&cap_userns_lock);
> +	cred->cap_userns = cap_intersect(cred->cap_userns, cap_userns_mask);
> +	spin_unlock(&cap_userns_lock);
> +#endif
>  
>  	/* Start with the capabilities defined in the userns set. */
>  	cred->cap_bset = cred->cap_userns;

BR, Jarkko
Serge E. Hallyn May 20, 2024, 3:38 a.m. UTC | #2
On Thu, May 16, 2024 at 02:22:05AM -0700, Jonathan Calmels wrote:
> This patch adds a new system-wide userns capability mask designed to mask
> off capabilities in user namespaces.
> 
> This mask is controlled through a sysctl and can be set early in the boot
> process or on the kernel command line to exclude known capabilities from
> ever being gained in namespaces. Once set, it can be further restricted to
> exert dynamic policies on the system (e.g. ward off a potential exploit).
> 
> Changing this mask requires privileges over CAP_SYS_ADMIN and CAP_SETPCAP
> in the initial user namespace.
> 
> Example:
> 
>     # sysctl -qw kernel.cap_userns_mask=0x1fffffdffff && \
>       unshare -r grep Cap /proc/self/status
>     CapInh: 0000000000000000
>     CapPrm: 000001fffffdffff
>     CapEff: 000001fffffdffff
>     CapBnd: 000001fffffdffff
>     CapAmb: 0000000000000000
>     CapUNs: 000001fffffdffff
> 
> Signed-off-by: Jonathan Calmels <jcalmels@3xx0.net>

Reviewed-by: Serge Hallyn <serge@hallyn.com>

> ---
>  include/linux/user_namespace.h |  7 ++++
>  kernel/sysctl.c                | 10 ++++++
>  kernel/user_namespace.c        | 66 ++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
> 
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 6030a8235617..e3478bd54ee5 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_USER_NAMESPACE_H
>  #define _LINUX_USER_NAMESPACE_H
>  
> +#include <linux/capability.h>
>  #include <linux/kref.h>
>  #include <linux/nsproxy.h>
>  #include <linux/ns_common.h>
> @@ -14,6 +15,12 @@
>  #define UID_GID_MAP_MAX_BASE_EXTENTS 5
>  #define UID_GID_MAP_MAX_EXTENTS 340
>  
> +#ifdef CONFIG_SYSCTL
> +extern kernel_cap_t cap_userns_mask;
> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos);
> +#endif
> +
>  struct uid_gid_extent {
>  	u32 first;
>  	u32 lower_first;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 81cc974913bb..1546eebd6aea 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -62,6 +62,7 @@
>  #include <linux/sched/sysctl.h>
>  #include <linux/mount.h>
>  #include <linux/userfaultfd_k.h>
> +#include <linux/user_namespace.h>
>  #include <linux/pid.h>
>  
>  #include "../lib/kstrtox.h"
> @@ -1846,6 +1847,15 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0444,
>  		.proc_handler	= proc_dointvec,
>  	},
> +#ifdef CONFIG_USER_NS
> +	{
> +		.procname	= "cap_userns_mask",
> +		.data		= &cap_userns_mask,
> +		.maxlen		= sizeof(kernel_cap_t),
> +		.mode		= 0644,
> +		.proc_handler	= proc_cap_userns_handler,
> +	},
> +#endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
>  	{
>  		.procname       = "unknown_nmi_panic",
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 53848e2b68cd..e0cf606e9140 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -26,6 +26,66 @@
>  static struct kmem_cache *user_ns_cachep __ro_after_init;
>  static DEFINE_MUTEX(userns_state_mutex);
>  
> +#ifdef CONFIG_SYSCTL
> +static DEFINE_SPINLOCK(cap_userns_lock);
> +kernel_cap_t cap_userns_mask = CAP_FULL_SET;
> +
> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	unsigned long mask_array[2];
> +	kernel_cap_t new_mask, *mask;
> +	int err;
> +
> +	if (write && (!capable(CAP_SETPCAP) ||
> +		      !capable(CAP_SYS_ADMIN)))
> +		return -EPERM;
> +
> +	/*
> +	 * convert from the global kernel_cap_t to the ulong array to print to
> +	 * userspace if this is a read.
> +	 *
> +	 * capabilities are exposed as one 64-bit value or two 32-bit values
> +	 * depending on the architecture
> +	 */
> +	mask = table->data;
> +	spin_lock(&cap_userns_lock);
> +	mask_array[0] = (unsigned long) mask->val;
> +#if BITS_PER_LONG != 64
> +	mask_array[1] = mask->val >> BITS_PER_LONG;
> +#endif
> +	spin_unlock(&cap_userns_lock);
> +
> +	t = *table;
> +	t.data = &mask_array;
> +
> +	/*
> +	 * actually read or write and array of ulongs from userspace.  Remember
> +	 * these are least significant bits first
> +	 */
> +	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0)
> +		return err;
> +
> +	new_mask.val = mask_array[0];
> +#if BITS_PER_LONG != 64
> +	new_mask.val += (u64)mask_array[1] << BITS_PER_LONG;
> +#endif
> +
> +	/*
> +	 * Drop everything not in the new_mask (but don't add things)
> +	 */
> +	if (write) {
> +		spin_lock(&cap_userns_lock);
> +		*mask = cap_intersect(*mask, new_mask);
> +		spin_unlock(&cap_userns_lock);
> +	}
> +
> +	return 0;
> +}
> +#endif
> +
>  static bool new_idmap_permitted(const struct file *file,
>  				struct user_namespace *ns, int cap_setid,
>  				struct uid_gid_map *map);
> @@ -46,6 +106,12 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
>  	/* Limit userns capabilities to our parent's bounding set. */
>  	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
>  		cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
> +#ifdef CONFIG_SYSCTL
> +	/* Mask off userns capabilities that are not permitted by the system-wide mask. */
> +	spin_lock(&cap_userns_lock);
> +	cred->cap_userns = cap_intersect(cred->cap_userns, cap_userns_mask);
> +	spin_unlock(&cap_userns_lock);
> +#endif
>  
>  	/* Start with the capabilities defined in the userns set. */
>  	cred->cap_bset = cred->cap_userns;
> -- 
> 2.45.0
>
Tycho Andersen May 20, 2024, 1:30 p.m. UTC | #3
Hi Jonathan,

On Thu, May 16, 2024 at 02:22:05AM -0700, Jonathan Calmels wrote:
> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos)
> +{

there is an ongoing effort (started at [0]) to constify the first arg
here, since you're not supposed to write to it. Your usage looks
correct to me, so I think all it needs is a literal "const" here.

[0]: https://lore.kernel.org/lkml/20240423-sysctl-const-handler-v3-0-e0beccb836e2@weissschuh.net/

> +	struct ctl_table t;
> +	unsigned long mask_array[2];
> +	kernel_cap_t new_mask, *mask;
> +	int err;
> +
> +	if (write && (!capable(CAP_SETPCAP) ||
> +		      !capable(CAP_SYS_ADMIN)))
> +		return -EPERM;

...why CAP_SYS_ADMIN? You mention it in the changelog, but don't
explain why.

Tycho
Jonathan Calmels May 20, 2024, 7:25 p.m. UTC | #4
On Mon, May 20, 2024 at 07:30:14AM GMT, Tycho Andersen wrote:
> there is an ongoing effort (started at [0]) to constify the first arg
> here, since you're not supposed to write to it. Your usage looks
> correct to me, so I think all it needs is a literal "const" here.

Will do, along with the suggestions from Jarkko

> > +	struct ctl_table t;
> > +	unsigned long mask_array[2];
> > +	kernel_cap_t new_mask, *mask;
> > +	int err;
> > +
> > +	if (write && (!capable(CAP_SETPCAP) ||
> > +		      !capable(CAP_SYS_ADMIN)))
> > +		return -EPERM;
> 
> ...why CAP_SYS_ADMIN? You mention it in the changelog, but don't
> explain why.

No reason really, I was hoping we could decide what we want here.
UMH uses CAP_SYS_MODULE, Serge mentioned adding a new cap maybe.
Tycho Andersen May 20, 2024, 9:13 p.m. UTC | #5
On Mon, May 20, 2024 at 12:25:27PM -0700, Jonathan Calmels wrote:
> On Mon, May 20, 2024 at 07:30:14AM GMT, Tycho Andersen wrote:
> > there is an ongoing effort (started at [0]) to constify the first arg
> > here, since you're not supposed to write to it. Your usage looks
> > correct to me, so I think all it needs is a literal "const" here.
> 
> Will do, along with the suggestions from Jarkko
> 
> > > +	struct ctl_table t;
> > > +	unsigned long mask_array[2];
> > > +	kernel_cap_t new_mask, *mask;
> > > +	int err;
> > > +
> > > +	if (write && (!capable(CAP_SETPCAP) ||
> > > +		      !capable(CAP_SYS_ADMIN)))
> > > +		return -EPERM;
> > 
> > ...why CAP_SYS_ADMIN? You mention it in the changelog, but don't
> > explain why.
> 
> No reason really, I was hoping we could decide what we want here.
> UMH uses CAP_SYS_MODULE, Serge mentioned adding a new cap maybe.

I don't have a strong preference between SETPCAP and a new capability,
but I do think it should be just one. SYS_ADMIN is already god mode
enough, IMO.

Tycho
Jarkko Sakkinen May 20, 2024, 10:12 p.m. UTC | #6
On Tue May 21, 2024 at 12:13 AM EEST, Tycho Andersen wrote:
> On Mon, May 20, 2024 at 12:25:27PM -0700, Jonathan Calmels wrote:
> > On Mon, May 20, 2024 at 07:30:14AM GMT, Tycho Andersen wrote:
> > > there is an ongoing effort (started at [0]) to constify the first arg
> > > here, since you're not supposed to write to it. Your usage looks
> > > correct to me, so I think all it needs is a literal "const" here.
> > 
> > Will do, along with the suggestions from Jarkko
> > 
> > > > +	struct ctl_table t;
> > > > +	unsigned long mask_array[2];
> > > > +	kernel_cap_t new_mask, *mask;
> > > > +	int err;
> > > > +
> > > > +	if (write && (!capable(CAP_SETPCAP) ||
> > > > +		      !capable(CAP_SYS_ADMIN)))
> > > > +		return -EPERM;
> > > 
> > > ...why CAP_SYS_ADMIN? You mention it in the changelog, but don't
> > > explain why.
> > 
> > No reason really, I was hoping we could decide what we want here.
> > UMH uses CAP_SYS_MODULE, Serge mentioned adding a new cap maybe.
>
> I don't have a strong preference between SETPCAP and a new capability,
> but I do think it should be just one. SYS_ADMIN is already god mode
> enough, IMO.

Sometimes I think would it make more sense to invent something
completely new like capabilities but more modern and robust, instead of
increasing complexity of a broken mechanism (especially thanks to
CAP_MAC_ADMIN).

I kind of liked the idea of privilege tokens both in Symbian and Maemo
(have been involved professionally in both). Emphasis on the idea not
necessarily on implementation.

Not an LSM but like something that you could use in the place of POSIX
caps. Probably quite tedious effort tho because you would need to pull
the whole industry with the new thing...

BR, Jarkko
Tycho Andersen May 21, 2024, 2:29 p.m. UTC | #7
On Tue, May 21, 2024 at 01:12:57AM +0300, Jarkko Sakkinen wrote:
> On Tue May 21, 2024 at 12:13 AM EEST, Tycho Andersen wrote:
> > On Mon, May 20, 2024 at 12:25:27PM -0700, Jonathan Calmels wrote:
> > > On Mon, May 20, 2024 at 07:30:14AM GMT, Tycho Andersen wrote:
> > > > there is an ongoing effort (started at [0]) to constify the first arg
> > > > here, since you're not supposed to write to it. Your usage looks
> > > > correct to me, so I think all it needs is a literal "const" here.
> > > 
> > > Will do, along with the suggestions from Jarkko
> > > 
> > > > > +	struct ctl_table t;
> > > > > +	unsigned long mask_array[2];
> > > > > +	kernel_cap_t new_mask, *mask;
> > > > > +	int err;
> > > > > +
> > > > > +	if (write && (!capable(CAP_SETPCAP) ||
> > > > > +		      !capable(CAP_SYS_ADMIN)))
> > > > > +		return -EPERM;
> > > > 
> > > > ...why CAP_SYS_ADMIN? You mention it in the changelog, but don't
> > > > explain why.
> > > 
> > > No reason really, I was hoping we could decide what we want here.
> > > UMH uses CAP_SYS_MODULE, Serge mentioned adding a new cap maybe.
> >
> > I don't have a strong preference between SETPCAP and a new capability,
> > but I do think it should be just one. SYS_ADMIN is already god mode
> > enough, IMO.
> 
> Sometimes I think would it make more sense to invent something
> completely new like capabilities but more modern and robust, instead of
> increasing complexity of a broken mechanism (especially thanks to
> CAP_MAC_ADMIN).
> 
> I kind of liked the idea of privilege tokens both in Symbian and Maemo
> (have been involved professionally in both). Emphasis on the idea not
> necessarily on implementation.
> 
> Not an LSM but like something that you could use in the place of POSIX
> caps. Probably quite tedious effort tho because you would need to pull
> the whole industry with the new thing...

And then we have LSM hooks, (ns_)capable(), __secure_computing() plus
a new set of hooks for this new thing sprinkled around. I guess
kernel developers wouldn't be excited about it, let alone the rest of
the industry :)

Thinking out loud: I wonder if fixing the seccomp TOCTOU against
pointers would help here. I guess you'd still have issues where your
policy engine resolves a path arg to open() and that inode changes
between the decision and the actual vfs access, you have just changed
the TOCTOU.

Or even scarier: what if you could change the return value at any
kprobe? :)

Tycho
Jarkko Sakkinen May 21, 2024, 2:45 p.m. UTC | #8
On Tue May 21, 2024 at 5:29 PM EEST, Tycho Andersen wrote:
> On Tue, May 21, 2024 at 01:12:57AM +0300, Jarkko Sakkinen wrote:
> > On Tue May 21, 2024 at 12:13 AM EEST, Tycho Andersen wrote:
> > > On Mon, May 20, 2024 at 12:25:27PM -0700, Jonathan Calmels wrote:
> > > > On Mon, May 20, 2024 at 07:30:14AM GMT, Tycho Andersen wrote:
> > > > > there is an ongoing effort (started at [0]) to constify the first arg
> > > > > here, since you're not supposed to write to it. Your usage looks
> > > > > correct to me, so I think all it needs is a literal "const" here.
> > > > 
> > > > Will do, along with the suggestions from Jarkko
> > > > 
> > > > > > +	struct ctl_table t;
> > > > > > +	unsigned long mask_array[2];
> > > > > > +	kernel_cap_t new_mask, *mask;
> > > > > > +	int err;
> > > > > > +
> > > > > > +	if (write && (!capable(CAP_SETPCAP) ||
> > > > > > +		      !capable(CAP_SYS_ADMIN)))
> > > > > > +		return -EPERM;
> > > > > 
> > > > > ...why CAP_SYS_ADMIN? You mention it in the changelog, but don't
> > > > > explain why.
> > > > 
> > > > No reason really, I was hoping we could decide what we want here.
> > > > UMH uses CAP_SYS_MODULE, Serge mentioned adding a new cap maybe.
> > >
> > > I don't have a strong preference between SETPCAP and a new capability,
> > > but I do think it should be just one. SYS_ADMIN is already god mode
> > > enough, IMO.
> > 
> > Sometimes I think would it make more sense to invent something
> > completely new like capabilities but more modern and robust, instead of
> > increasing complexity of a broken mechanism (especially thanks to
> > CAP_MAC_ADMIN).
> > 
> > I kind of liked the idea of privilege tokens both in Symbian and Maemo
> > (have been involved professionally in both). Emphasis on the idea not
> > necessarily on implementation.
> > 
> > Not an LSM but like something that you could use in the place of POSIX
> > caps. Probably quite tedious effort tho because you would need to pull
> > the whole industry with the new thing...
>
> And then we have LSM hooks, (ns_)capable(), __secure_computing() plus
> a new set of hooks for this new thing sprinkled around. I guess
> kernel developers wouldn't be excited about it, let alone the rest of
> the industry :)
>
> Thinking out loud: I wonder if fixing the seccomp TOCTOU against
> pointers would help here. I guess you'd still have issues where your
> policy engine resolves a path arg to open() and that inode changes
> between the decision and the actual vfs access, you have just changed
> the TOCTOU.
>
> Or even scarier: what if you could change the return value at any
> kprobe? :)

I had one crazy idea related to seccomp filters once.

What if there was way to compose tokens that would be just a seccomp
filter like the one that you pass to PR_SET_SECCOMP but presented with a
file descriptor?

Then you could send these with SCM_RIGHTS to other processes and they
could upgrade their existing filter with them. So it would be a kind of
extension mechanism for a seccomp filter.

Not something I'm seriously suggesting but though to flush this out now
that we are on these topics anyhow ;-)

> Tycho

PS. Sorry if my language was a bit harsh earlier but I think I had also
a point related to at least to the patch set presentation. I.e. you
are very precise describing the mechanism but motivation and bringing
topic somehow to a context is equally important :-)

BR, Jarkko
diff mbox series

Patch

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6030a8235617..e3478bd54ee5 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -2,6 +2,7 @@ 
 #ifndef _LINUX_USER_NAMESPACE_H
 #define _LINUX_USER_NAMESPACE_H
 
+#include <linux/capability.h>
 #include <linux/kref.h>
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
@@ -14,6 +15,12 @@ 
 #define UID_GID_MAP_MAX_BASE_EXTENTS 5
 #define UID_GID_MAP_MAX_EXTENTS 340
 
+#ifdef CONFIG_SYSCTL
+extern kernel_cap_t cap_userns_mask;
+int proc_cap_userns_handler(struct ctl_table *table, int write,
+			    void *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 struct uid_gid_extent {
 	u32 first;
 	u32 lower_first;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 81cc974913bb..1546eebd6aea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -62,6 +62,7 @@ 
 #include <linux/sched/sysctl.h>
 #include <linux/mount.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/user_namespace.h>
 #include <linux/pid.h>
 
 #include "../lib/kstrtox.h"
@@ -1846,6 +1847,15 @@  static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_USER_NS
+	{
+		.procname	= "cap_userns_mask",
+		.data		= &cap_userns_mask,
+		.maxlen		= sizeof(kernel_cap_t),
+		.mode		= 0644,
+		.proc_handler	= proc_cap_userns_handler,
+	},
+#endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.procname       = "unknown_nmi_panic",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 53848e2b68cd..e0cf606e9140 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -26,6 +26,66 @@ 
 static struct kmem_cache *user_ns_cachep __ro_after_init;
 static DEFINE_MUTEX(userns_state_mutex);
 
+#ifdef CONFIG_SYSCTL
+static DEFINE_SPINLOCK(cap_userns_lock);
+kernel_cap_t cap_userns_mask = CAP_FULL_SET;
+
+int proc_cap_userns_handler(struct ctl_table *table, int write,
+			    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	unsigned long mask_array[2];
+	kernel_cap_t new_mask, *mask;
+	int err;
+
+	if (write && (!capable(CAP_SETPCAP) ||
+		      !capable(CAP_SYS_ADMIN)))
+		return -EPERM;
+
+	/*
+	 * convert from the global kernel_cap_t to the ulong array to print to
+	 * userspace if this is a read.
+	 *
+	 * capabilities are exposed as one 64-bit value or two 32-bit values
+	 * depending on the architecture
+	 */
+	mask = table->data;
+	spin_lock(&cap_userns_lock);
+	mask_array[0] = (unsigned long) mask->val;
+#if BITS_PER_LONG != 64
+	mask_array[1] = mask->val >> BITS_PER_LONG;
+#endif
+	spin_unlock(&cap_userns_lock);
+
+	t = *table;
+	t.data = &mask_array;
+
+	/*
+	 * actually read or write and array of ulongs from userspace.  Remember
+	 * these are least significant bits first
+	 */
+	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+
+	new_mask.val = mask_array[0];
+#if BITS_PER_LONG != 64
+	new_mask.val += (u64)mask_array[1] << BITS_PER_LONG;
+#endif
+
+	/*
+	 * Drop everything not in the new_mask (but don't add things)
+	 */
+	if (write) {
+		spin_lock(&cap_userns_lock);
+		*mask = cap_intersect(*mask, new_mask);
+		spin_unlock(&cap_userns_lock);
+	}
+
+	return 0;
+}
+#endif
+
 static bool new_idmap_permitted(const struct file *file,
 				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);
@@ -46,6 +106,12 @@  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 	/* Limit userns capabilities to our parent's bounding set. */
 	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
 		cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
+#ifdef CONFIG_SYSCTL
+	/* Mask off userns capabilities that are not permitted by the system-wide mask. */
+	spin_lock(&cap_userns_lock);
+	cred->cap_userns = cap_intersect(cred->cap_userns, cap_userns_mask);
+	spin_unlock(&cap_userns_lock);
+#endif
 
 	/* Start with the capabilities defined in the userns set. */
 	cred->cap_bset = cred->cap_userns;