diff mbox series

[v6,3/3] openat2: add OA2_CRED_INHERIT flag

Message ID 20240427112451.1609471-4-stsp2@yandex.ru (mailing list archive)
State New, archived
Headers show
Series implement OA2_CRED_INHERIT flag for openat2() | expand

Commit Message

stsp April 27, 2024, 11:24 a.m. UTC
This flag performs the open operation with the fs credentials
(fsuid, fsgid, group_info) that were in effect when dir_fd was opened.
dir_fd must be opened with O_CRED_ALLOW, or EPERM is returned.

Selftests are added to check for these properties as well as for
the invalid flag combinations.

This allows the process to pre-open some directories and then
change eUID (and all other UIDs/GIDs) to a less-privileged user,
retaining the ability to open/create files within these directories.

Design goal:
The idea is to provide a very light-weight sandboxing, where the
process, without the use of any heavy-weight techniques like chroot
within namespaces, can restrict the access to the set of pre-opened
directories.
This patch is just a first step to such sandboxing. If things go
well, in the future the same extension can be added to more syscalls.
These should include at least unlinkat(), renameat2() and the
not-yet-upstreamed setxattrat().

Security considerations:
- Only the bare minimal set of credentials is overridden:
  fsuid, fsgid and group_info. The rest, for example capabilities,
  are not overridden to avoid unneeded security risks.
- To avoid sandboxing escape, this patch makes sure the restricted
  lookup modes are used. Namely, RESOLVE_BENEATH or RESOLVE_IN_ROOT.
- Magic /proc symlinks are discarded, as suggested by
  Andy Lutomirski <luto@kernel.org>
- O_CRED_ALLOW fds cannot be passed via unix socket and are always
  closed on exec() to prevent "unsuspecting userspace" from not being
  able to fully drop privs.

Use cases:
Virtual machines that deal with untrusted code, can use that
instead of a more heavy-weighted approaches.
Currently the approach is being tested on a dosemu2 VM.

Signed-off-by: Stas Sergeev <stsp2@yandex.ru>

CC: Stefan Metzmacher <metze@samba.org>
CC: Eric Biederman <ebiederm@xmission.com>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
CC: Andy Lutomirski <luto@kernel.org>
CC: Christian Brauner <brauner@kernel.org>
CC: Jan Kara <jack@suse.cz>
CC: Jeff Layton <jlayton@kernel.org>
CC: Chuck Lever <chuck.lever@oracle.com>
CC: Alexander Aring <alex.aring@gmail.com>
CC: linux-fsdevel@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Christian Göttsche <cgzones@googlemail.com>
---
 fs/fcntl.c                                    |   2 +
 fs/namei.c                                    |  56 +++++++++-
 fs/open.c                                     |  10 +-
 include/linux/fcntl.h                         |   2 +
 include/uapi/linux/openat2.h                  |   2 +
 tools/testing/selftests/openat2/Makefile      |   2 +-
 .../testing/selftests/openat2/cred_inherit.c  | 105 ++++++++++++++++++
 .../testing/selftests/openat2/openat2_test.c  |  12 +-
 8 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/selftests/openat2/cred_inherit.c

Comments

Donald Buczek May 4, 2024, 8:38 p.m. UTC | #1
On 4/27/24 13:24, Stas Sergeev wrote:
> This flag performs the open operation with the fs credentials
> (fsuid, fsgid, group_info) that were in effect when dir_fd was opened.
> dir_fd must be opened with O_CRED_ALLOW, or EPERM is returned.
> 
> Selftests are added to check for these properties as well as for
> the invalid flag combinations.
> 
> This allows the process to pre-open some directories and then
> change eUID (and all other UIDs/GIDs) to a less-privileged user,
> retaining the ability to open/create files within these directories.
> 
> Design goal:
> The idea is to provide a very light-weight sandboxing, where the
> process, without the use of any heavy-weight techniques like chroot
> within namespaces, can restrict the access to the set of pre-opened
> directories.
> This patch is just a first step to such sandboxing. If things go
> well, in the future the same extension can be added to more syscalls.
> These should include at least unlinkat(), renameat2() and the
> not-yet-upstreamed setxattrat().
> 
> Security considerations:
> - Only the bare minimal set of credentials is overridden:
>    fsuid, fsgid and group_info. The rest, for example capabilities,
>    are not overridden to avoid unneeded security risks.
> - To avoid sandboxing escape, this patch makes sure the restricted
>    lookup modes are used. Namely, RESOLVE_BENEATH or RESOLVE_IN_ROOT.
> - Magic /proc symlinks are discarded, as suggested by
>    Andy Lutomirski <luto@kernel.org>> - O_CRED_ALLOW fds cannot be passed via unix socket and are always
>    closed on exec() to prevent "unsuspecting userspace" from not being
>    able to fully drop privs.

What about hard links?

== snip ==
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <stdarg.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/openat2.h>

#define O_CRED_ALLOW 0x2000000
#define OA2_CRED_INHERIT (1UL << 28)

#define SYS_openat2 437
long openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) {
     return syscall(SYS_openat2, dirfd, pathname, how, size);
}


__attribute__ ((noreturn, format(printf, 1, 2)))
static void die(const char *restrict fmt, ...) {
     va_list ap;
     va_start(ap, fmt);
     vfprintf(stderr, fmt, ap);
     va_end(ap);
     _exit(1);
}

int main() {

     unlink("/tmp/d/test.dat");
     unlink("/tmp/d/hostname");
     if (rmdir("/tmp/d") != 0 && errno != ENOENT)
         die("/tmp/d: %m\n");
     
     umask(0);
     if (mkdir("/tmp/d", 0777) != 0)
         die("/tmp/d: %m\n");

     int dirfd = open("/tmp/d", O_RDONLY + O_CRED_ALLOW);
     if (dirfd == -1)
         die("/tmp/d: %m\n");

     if (setuid(1000) != 0)
         die("setuid: %m\n");

     if (link("/etc/hostname", "/tmp/d/hostname") == -1)
         die ("/etc/hostname: %m\n");

     if(openat(dirfd, "hostname", O_RDWR) != -1)
         die("/tmp/d/hostname could be opened by uid 1000");

     {   struct open_how how = { .flags = O_RDWR + OA2_CRED_INHERIT, .resolve = RESOLVE_BENEATH };
         if (openat2(dirfd, "hostname", &how, sizeof(how)) == -1)
             die("hostname: %m\n");
         printf("able to open /etc/hostname RDWR \n");
     }
}

== snip ==


buczek@dose:~$ gcc -O0 -Wall -Wextra -Werror -g -o test test.c
buczek@dose:~$ sudo ./test
able to open /etc/hostname RDWR
buczek@dose:~$
stsp May 4, 2024, 9:11 p.m. UTC | #2
04.05.2024 23:38, Donald Buczek пишет:
> On 4/27/24 13:24, Stas Sergeev wrote:
>> This flag performs the open operation with the fs credentials
>> (fsuid, fsgid, group_info) that were in effect when dir_fd was opened.
>> dir_fd must be opened with O_CRED_ALLOW, or EPERM is returned.
>>
>> Selftests are added to check for these properties as well as for
>> the invalid flag combinations.
>>
>> This allows the process to pre-open some directories and then
>> change eUID (and all other UIDs/GIDs) to a less-privileged user,
>> retaining the ability to open/create files within these directories.
>>
>> Design goal:
>> The idea is to provide a very light-weight sandboxing, where the
>> process, without the use of any heavy-weight techniques like chroot
>> within namespaces, can restrict the access to the set of pre-opened
>> directories.
>> This patch is just a first step to such sandboxing. If things go
>> well, in the future the same extension can be added to more syscalls.
>> These should include at least unlinkat(), renameat2() and the
>> not-yet-upstreamed setxattrat().
>>
>> Security considerations:
>> - Only the bare minimal set of credentials is overridden:
>>    fsuid, fsgid and group_info. The rest, for example capabilities,
>>    are not overridden to avoid unneeded security risks.
>> - To avoid sandboxing escape, this patch makes sure the restricted
>>    lookup modes are used. Namely, RESOLVE_BENEATH or RESOLVE_IN_ROOT.
>> - Magic /proc symlinks are discarded, as suggested by
>>    Andy Lutomirski <luto@kernel.org>> - O_CRED_ALLOW fds cannot be 
>> passed via unix socket and are always
>>    closed on exec() to prevent "unsuspecting userspace" from not being
>>    able to fully drop privs.
>
> What about hard links?
Well, you set umask to 0 in your example.
If you didn't do that, the dir wouldn't have
0777 perms, and the hard link would not
be created.
But yes, that demonstrates the unsafe
usage scenario, i.e. unsafe directory perms
immediately lead to a security hole.
Maybe O_CRED_ALLOW should check for
safe perms, or maybe it shouldn't... So far
there are no signs of this patch to ever be
accepted, so I am not sure if more complexity
needs to be added to it.
diff mbox series

Patch

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 78c96b1293c2..283c2e65fc2c 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1043,6 +1043,8 @@  static int __init fcntl_init(void)
 		HWEIGHT32(
 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
 			__FMODE_EXEC | __FMODE_NONOTIFY));
+	BUILD_BUG_ON(HWEIGHT32(VALID_OPENAT2_FLAGS) !=
+			HWEIGHT32(VALID_OPEN_FLAGS) + 1);
 
 	fasync_cache = kmem_cache_create("fasync_cache",
 					 sizeof(struct fasync_struct), 0,
diff --git a/fs/namei.c b/fs/namei.c
index dd50345f7260..aa5dcf57851b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3776,6 +3776,43 @@  static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
 	return error;
 }
 
+static const struct cred *openat2_init_creds(int dfd)
+{
+	struct cred *cred;
+	struct fd f;
+
+	if (dfd == AT_FDCWD)
+		return ERR_PTR(-EINVAL);
+
+	f = fdget_raw(dfd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	cred = ERR_PTR(-EPERM);
+	if (!(f.file->f_flags & O_CRED_ALLOW))
+		goto done;
+
+	cred = prepare_creds();
+	if (!cred) {
+		cred = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	cred->fsuid = f.file->f_cred->fsuid;
+	cred->fsgid = f.file->f_cred->fsgid;
+	cred->group_info = get_group_info(f.file->f_cred->group_info);
+
+done:
+	fdput(f);
+	return cred;
+}
+
+static void openat2_done_creds(const struct cred *cred)
+{
+	put_group_info(cred->group_info);
+	put_cred(cred);
+}
+
 static struct file *path_openat(struct nameidata *nd,
 			const struct open_flags *op, unsigned flags)
 {
@@ -3793,18 +3830,33 @@  static struct file *path_openat(struct nameidata *nd,
 			error = do_o_path(nd, flags, file);
 	} else {
 		const char *s;
+		const struct cred *old_cred = NULL, *cred = NULL;
 
-		file = alloc_empty_file(open_flags, current_cred());
-		if (IS_ERR(file))
+		if (open_flags & OA2_CRED_INHERIT) {
+			cred = openat2_init_creds(nd->dfd);
+			if (IS_ERR(cred))
+				return ERR_CAST(cred);
+		}
+		file = alloc_empty_file(open_flags, cred ?: current_cred());
+		if (IS_ERR(file)) {
+			if (cred)
+				openat2_done_creds(cred);
 			return file;
+		}
 
 		s = path_init(nd, flags);
+		if (cred)
+			old_cred = override_creds(cred);
 		while (!(error = link_path_walk(s, nd)) &&
 		       (s = open_last_lookups(nd, file, op)) != NULL)
 			;
 		if (!error)
 			error = do_open(nd, file, op);
+		if (old_cred)
+			revert_creds(old_cred);
 		terminate_walk(nd);
+		if (cred)
+			openat2_done_creds(cred);
 	}
 	if (likely(!error)) {
 		if (likely(file->f_mode & FMODE_OPENED))
diff --git a/fs/open.c b/fs/open.c
index ee8460c83c77..dd4fab536135 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1225,7 +1225,7 @@  inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 	 * values before calling build_open_flags(), but openat2(2) checks all
 	 * of its arguments.
 	 */
-	if (flags & ~VALID_OPEN_FLAGS)
+	if (flags & ~VALID_OPENAT2_FLAGS)
 		return -EINVAL;
 	if (how->resolve & ~VALID_RESOLVE_FLAGS)
 		return -EINVAL;
@@ -1326,6 +1326,14 @@  inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		lookup_flags |= LOOKUP_CACHED;
 	}
 
+	if (flags & OA2_CRED_INHERIT) {
+		/* Inherit creds only with scoped look-up modes. */
+		if (!(lookup_flags & LOOKUP_IS_SCOPED))
+			return -EPERM;
+		/* Reject /proc "magic" links if inheriting creds. */
+		lookup_flags |= LOOKUP_NO_MAGICLINKS;
+	}
+
 	op->lookup_flags = lookup_flags;
 	return 0;
 }
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index e074ee9c1e36..33b9c7ad056b 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -12,6 +12,8 @@ 
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
 	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_CRED_ALLOW)
 
+#define VALID_OPENAT2_FLAGS (VALID_OPEN_FLAGS | OA2_CRED_INHERIT)
+
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
index a5feb7604948..f803558ad62f 100644
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -40,4 +40,6 @@  struct open_how {
 					return -EAGAIN if that's not
 					possible. */
 
+#define OA2_CRED_INHERIT		(1UL << 28)
+
 #endif /* _UAPI_LINUX_OPENAT2_H */
diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile
index 254d676a2689..a1f4b5395f82 100644
--- a/tools/testing/selftests/openat2/Makefile
+++ b/tools/testing/selftests/openat2/Makefile
@@ -1,7 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-or-later
 
 CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan
-TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test
+TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test cred_inherit
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/openat2/cred_inherit.c b/tools/testing/selftests/openat2/cred_inherit.c
new file mode 100644
index 000000000000..550a06763ac7
--- /dev/null
+++ b/tools/testing/selftests/openat2/cred_inherit.c
@@ -0,0 +1,105 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <time.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+#include "helpers.h"
+
+#ifndef O_CRED_ALLOW
+#define O_CRED_ALLOW 0x2000000
+#endif
+
+#ifndef OA2_CRED_INHERIT
+#define OA2_CRED_INHERIT (1UL << 28)
+#endif
+
+enum { FD_NORM, FD_NCA, FD_DIR, FD_DCA, FD_MAX };
+
+int main(int argc, char *argv[], char *env[])
+{
+	struct open_how how1 = {
+				.flags = O_RDONLY,
+				.resolve = RESOLVE_BENEATH,
+			       };
+	struct open_how how2 = {
+				.flags = O_RDONLY | OA2_CRED_INHERIT,
+				.resolve = RESOLVE_BENEATH,
+			       };
+	int size = sizeof(struct open_how);
+	int i;
+	int fd;
+	int err;
+	int fds[FD_MAX];
+#define NFD(n) ((n) + 3)
+#define FD_OK(n) (NFD(n) == fds[n])
+
+	if (!openat2_supported) {
+		ksft_print_msg("openat2(2) unsupported\n");
+		return 0;
+	}
+
+	ksft_set_plan(14);
+
+	fds[FD_NORM] = open("/proc/self/maps", O_RDONLY);
+	fds[FD_NCA] = open("/proc/self/maps", O_RDONLY | O_CRED_ALLOW);
+	fds[FD_DIR] = open("/proc/self", O_RDONLY | O_DIRECTORY);
+	fds[FD_DCA] = open("/proc/self", O_RDONLY | O_DIRECTORY | O_CRED_ALLOW);
+	ksft_test_result(FD_OK(FD_NORM), "file open\n");
+	ksft_test_result(FD_OK(FD_NCA), "file open with O_CRED_ALLOW\n");
+	ksft_test_result(FD_OK(FD_DIR), "directory open\n");
+	ksft_test_result(FD_OK(FD_DCA), "directory open with O_CRED_ALLOW\n");
+
+	err = fchdir(fds[FD_DIR]);
+	if (err) {
+		ksft_perror("fchdir() failed");
+		ksft_exit_fail_msg("fchdir\n");
+		return 1;
+	}
+	fd = syscall(SYS_openat2, AT_FDCWD, "maps", &how1, size);
+	ksft_test_result(fd != -1, "AT_FDCWD success\n");
+	close(fd);
+	/* OA2_CRED_INHERIT fails with AT_FDCWD */
+	fd = syscall(SYS_openat2, AT_FDCWD, "maps", &how2, size);
+	ksft_test_result(fd == -1 && errno == EINVAL, "AT_FDCWD EINVAL\n");
+
+	fd = syscall(SYS_openat2, fds[FD_NORM], "maps", &how1, size);
+	ksft_test_result(fd == -1 && errno == ENOTDIR, "regilar file ENOTDIR\n");
+	/* No O_CRED_ALLOW -> EPERM */
+	fd = syscall(SYS_openat2, fds[FD_NORM], "maps", &how2, size);
+	ksft_test_result(fd == -1 && errno == EPERM, "regilar file EPERM\n");
+
+	fd = syscall(SYS_openat2, fds[FD_NCA], "maps", &how1, size);
+	ksft_test_result(fd == -1 && errno == ENOTDIR, "regilar file ENOTDIR\n");
+	fd = syscall(SYS_openat2, fds[FD_NCA], "maps", &how2, size);
+	ksft_test_result(fd == -1 && errno == ENOTDIR, "regilar file ENOTDIR\n");
+
+	fd = syscall(SYS_openat2, fds[FD_DIR], "maps", &how1, size);
+	ksft_test_result(fd != -1, "dir fd success\n");
+	close(fd);
+	/* No O_CRED_ALLOW -> EPERM */
+	fd = syscall(SYS_openat2, fds[FD_DIR], "maps", &how2, size);
+	ksft_test_result(fd == -1 && errno == EPERM, "dir fd EPERM\n");
+
+	fd = syscall(SYS_openat2, fds[FD_DCA], "maps", &how1, size);
+	ksft_test_result(fd != -1, "dir O_CRED_ALLOW fd success\n");
+	close(fd);
+	fd = syscall(SYS_openat2, fds[FD_DCA], "maps", &how2, size);
+	ksft_test_result(fd != -1, "dir O_CRED_ALLOW fd O_CRED_INHERIT success\n");
+	close(fd);
+
+	for (i = 0; i < FD_MAX; i++)
+		close(fds[i]);
+	ksft_finished();
+	return 0;
+}
diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c
index 9024754530b2..5095288fe1ac 100644
--- a/tools/testing/selftests/openat2/openat2_test.c
+++ b/tools/testing/selftests/openat2/openat2_test.c
@@ -28,6 +28,10 @@ 
 #define	O_LARGEFILE 0x8000
 #endif
 
+#ifndef OA2_CRED_INHERIT
+#define OA2_CRED_INHERIT (1UL << 28)
+#endif
+
 struct open_how_ext {
 	struct open_how inner;
 	uint32_t extra1;
@@ -159,7 +163,7 @@  struct flag_test {
 	int err;
 };
 
-#define NUM_OPENAT2_FLAG_TESTS 25
+#define NUM_OPENAT2_FLAG_TESTS 27
 
 void test_openat2_flags(void)
 {
@@ -233,6 +237,12 @@  void test_openat2_flags(void)
 		{ .name = "invalid how.resolve and O_PATH",
 		  .how.flags = O_PATH,
 		  .how.resolve = 0x1337, .err = -EINVAL },
+		{ .name = "invalid how.resolve and OA2_CRED_INHERIT",
+		  .how.flags = OA2_CRED_INHERIT,
+		  .how.resolve = 0, .err = -EPERM },
+		{ .name = "invalid AT_FDCWD and OA2_CRED_INHERIT",
+		  .how.flags = OA2_CRED_INHERIT,
+		  .how.resolve = 0x08, .err = -EINVAL },
 
 		/* currently unknown upper 32 bit rejected. */
 		{ .name = "currently unknown bit (1 << 63)",