diff mbox series

[v3,3/4] selftests/ovl: add second selftest for "override_creds"

Message ID 20250219-work-overlayfs-v3-3-46af55e4ceda@kernel.org (mailing list archive)
State New
Headers show
Series ovl: add override_creds mount option | expand

Commit Message

Christian Brauner Feb. 19, 2025, 10:01 a.m. UTC
Add a simple test to verify that the new "override_creds" option works.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/overlayfs/Makefile       |  11 +-
 .../filesystems/overlayfs/set_layers_via_fds.c     | 149 ++++++-
 tools/testing/selftests/filesystems/utils.c        | 474 +++++++++++++++++++++
 tools/testing/selftests/filesystems/utils.h        |  44 ++
 4 files changed, 665 insertions(+), 13 deletions(-)

Comments

Amir Goldstein Feb. 19, 2025, 12:36 p.m. UTC | #1
On Wed, Feb 19, 2025 at 11:02 AM Christian Brauner <brauner@kernel.org> wrote:
>
> Add a simple test to verify that the new "override_creds" option works.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>

For the added test you may add:

Reviewed-by: Amir Goldstein <amir73il@gmail.com>

But you may want to consider splitting the large infrastructure
and the churn to the previous test to a separate patch, to make this
patch cleaner.

Thanks,
Amir.

> ---
>  .../selftests/filesystems/overlayfs/Makefile       |  11 +-
>  .../filesystems/overlayfs/set_layers_via_fds.c     | 149 ++++++-
>  tools/testing/selftests/filesystems/utils.c        | 474 +++++++++++++++++++++
>  tools/testing/selftests/filesystems/utils.h        |  44 ++
>  4 files changed, 665 insertions(+), 13 deletions(-)
>
> diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile
> index e8d1adb021af..6c661232b3b5 100644
> --- a/tools/testing/selftests/filesystems/overlayfs/Makefile
> +++ b/tools/testing/selftests/filesystems/overlayfs/Makefile
> @@ -1,7 +1,14 @@
>  # SPDX-License-Identifier: GPL-2.0
>
> -TEST_GEN_PROGS := dev_in_maps set_layers_via_fds
> +CFLAGS += -Wall
> +CFLAGS += $(KHDR_INCLUDES)
> +LDLIBS += -lcap
>
> -CFLAGS := -Wall -Werror
> +LOCAL_HDRS += wrappers.h log.h
> +
> +TEST_GEN_PROGS := dev_in_maps
> +TEST_GEN_PROGS += set_layers_via_fds
>
>  include ../../lib.mk
> +
> +$(OUTPUT)/set_layers_via_fds: ../utils.c
> diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
> index 70acd833581d..6b65e3610578 100644
> --- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
> +++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
> @@ -6,6 +6,7 @@
>  #include <sched.h>
>  #include <stdio.h>
>  #include <string.h>
> +#include <sys/socket.h>
>  #include <sys/stat.h>
>  #include <sys/mount.h>
>  #include <unistd.h>
> @@ -13,20 +14,27 @@
>  #include "../../kselftest_harness.h"
>  #include "../../pidfd/pidfd.h"
>  #include "log.h"
> +#include "../utils.h"
>  #include "wrappers.h"
>
>  FIXTURE(set_layers_via_fds) {
> +       int pidfd;
>  };
>
>  FIXTURE_SETUP(set_layers_via_fds)
>  {
> -       ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
> +       self->pidfd = -EBADF;
> +       EXPECT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
>  }
>
>  FIXTURE_TEARDOWN(set_layers_via_fds)
>  {
> +       if (self->pidfd >= 0) {
> +               EXPECT_EQ(sys_pidfd_send_signal(self->pidfd, SIGKILL, NULL, 0), 0);
> +               EXPECT_EQ(close(self->pidfd), 0);
> +       }
>         umount2("/set_layers_via_fds", 0);
> -       ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);
> +       EXPECT_EQ(rmdir("/set_layers_via_fds"), 0);
>  }
>
>  TEST_F(set_layers_via_fds, set_layers_via_fds)
> @@ -266,7 +274,7 @@ TEST_F(set_layers_via_fds, set_override_creds)
>         ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
>
>         pid = create_child(&pidfd, 0);
> -       EXPECT_GE(pid, 0);
> +       ASSERT_GE(pid, 0);
>         if (pid == 0) {
>                 if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
>                         TH_LOG("sys_fsconfig should have succeeded");
> @@ -275,11 +283,11 @@ TEST_F(set_layers_via_fds, set_override_creds)
>
>                 _exit(EXIT_SUCCESS);
>         }
> -       EXPECT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
> -       EXPECT_EQ(close(pidfd), 0);
> +       ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
> +       ASSERT_GE(close(pidfd), 0);
>
>         pid = create_child(&pidfd, 0);
> -       EXPECT_GE(pid, 0);
> +       ASSERT_GE(pid, 0);
>         if (pid == 0) {
>                 if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0)) {
>                         TH_LOG("sys_fsconfig should have succeeded");
> @@ -288,11 +296,11 @@ TEST_F(set_layers_via_fds, set_override_creds)
>
>                 _exit(EXIT_SUCCESS);
>         }
> -       EXPECT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
> -       EXPECT_EQ(close(pidfd), 0);
> +       ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
> +       ASSERT_GE(close(pidfd), 0);
>
>         pid = create_child(&pidfd, 0);
> -       EXPECT_GE(pid, 0);
> +       ASSERT_GE(pid, 0);
>         if (pid == 0) {
>                 if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
>                         TH_LOG("sys_fsconfig should have succeeded");
> @@ -301,8 +309,125 @@ TEST_F(set_layers_via_fds, set_override_creds)
>
>                 _exit(EXIT_SUCCESS);
>         }
> -       EXPECT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
> -       EXPECT_EQ(close(pidfd), 0);
> +       ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
> +       ASSERT_GE(close(pidfd), 0);
> +
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
> +
> +       fd_overlay = sys_fsmount(fd_context, 0, 0);
> +       ASSERT_GE(fd_overlay, 0);
> +
> +       ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
> +
> +       ASSERT_EQ(close(fd_context), 0);
> +       ASSERT_EQ(close(fd_overlay), 0);
> +}
> +
> +TEST_F(set_layers_via_fds, set_override_creds_invalid)
> +{
> +       int fd_context, fd_tmpfs, fd_overlay, ret;
> +       int layer_fds[] = { [0 ... 3] = -EBADF };
> +       pid_t pid;
> +       int fd_userns1, fd_userns2;
> +       int ipc_sockets[2];
> +       char c;
> +       const unsigned int predictable_fd_context_nr = 123;
> +
> +       fd_userns1 = get_userns_fd(0, 0, 10000);
> +       ASSERT_GE(fd_userns1, 0);
> +
> +       fd_userns2 = get_userns_fd(0, 1234, 10000);
> +       ASSERT_GE(fd_userns2, 0);
> +
> +       ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
> +       ASSERT_GE(ret, 0);
> +
> +       pid = create_child(&self->pidfd, 0);
> +       ASSERT_GE(pid, 0);
> +       if (pid == 0) {
> +               if (close(ipc_sockets[0])) {
> +                       TH_LOG("close should have succeeded");
> +                       _exit(EXIT_FAILURE);
> +               }
> +
> +               if (!switch_userns(fd_userns2, 0, 0, false)) {
> +                       TH_LOG("switch_userns should have succeeded");
> +                       _exit(EXIT_FAILURE);
> +               }
> +
> +               if (read_nointr(ipc_sockets[1], &c, 1) != 1) {
> +                       TH_LOG("read_nointr should have succeeded");
> +                       _exit(EXIT_FAILURE);
> +               }
> +
> +               if (close(ipc_sockets[1])) {
> +                       TH_LOG("close should have succeeded");
> +                       _exit(EXIT_FAILURE);
> +               }
> +
> +               if (!sys_fsconfig(predictable_fd_context_nr, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
> +                       TH_LOG("sys_fsconfig should have failed");
> +                       _exit(EXIT_FAILURE);
> +               }
> +
> +               _exit(EXIT_SUCCESS);
> +       }
> +
> +       ASSERT_EQ(close(ipc_sockets[1]), 0);
> +       ASSERT_EQ(switch_userns(fd_userns1, 0, 0, false), true);
> +       ASSERT_EQ(unshare(CLONE_NEWNS), 0);
> +       ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
> +
> +       fd_context = sys_fsopen("tmpfs", 0);
> +       ASSERT_GE(fd_context, 0);
> +
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
> +       fd_tmpfs = sys_fsmount(fd_context, 0, 0);
> +       ASSERT_GE(fd_tmpfs, 0);
> +       ASSERT_EQ(close(fd_context), 0);
> +
> +       ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
> +       ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
> +       ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
> +       ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
> +
> +       layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
> +       ASSERT_GE(layer_fds[0], 0);
> +
> +       layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
> +       ASSERT_GE(layer_fds[1], 0);
> +
> +       layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
> +       ASSERT_GE(layer_fds[2], 0);
> +
> +       layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
> +       ASSERT_GE(layer_fds[3], 0);
> +
> +       ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
> +       ASSERT_EQ(close(fd_tmpfs), 0);
> +
> +       fd_context = sys_fsopen("overlay", 0);
> +       ASSERT_GE(fd_context, 0);
> +       ASSERT_EQ(dup3(fd_context, predictable_fd_context_nr, 0), predictable_fd_context_nr);
> +       ASSERT_EQ(close(fd_context), 0);
> +       fd_context = predictable_fd_context_nr;
> +       ASSERT_EQ(write_nointr(ipc_sockets[0], "1", 1), 1);
> +       ASSERT_EQ(close(ipc_sockets[0]), 0);
> +
> +       ASSERT_EQ(wait_for_pid(pid), 0);
> +       ASSERT_EQ(close(self->pidfd), 0);
> +       self->pidfd = -EBADF;
> +
> +       ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, layer_fds[0]), 0);
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, layer_fds[1]), 0);
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
> +
> +       for (int i = 0; i < ARRAY_SIZE(layer_fds); i++)
> +               ASSERT_EQ(close(layer_fds[i]), 0);
> +
> +       ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0);
>
>         ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
>
> @@ -313,6 +438,8 @@ TEST_F(set_layers_via_fds, set_override_creds)
>
>         ASSERT_EQ(close(fd_context), 0);
>         ASSERT_EQ(close(fd_overlay), 0);
> +       ASSERT_EQ(close(fd_userns1), 0);
> +       ASSERT_EQ(close(fd_userns2), 0);
>  }
>
>  TEST_HARNESS_MAIN
> diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
> new file mode 100644
> index 000000000000..0e8080bd0aea
> --- /dev/null
> +++ b/tools/testing/selftests/filesystems/utils.c
> @@ -0,0 +1,474 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#ifndef _GNU_SOURCE
> +#define _GNU_SOURCE
> +#endif
> +#include <fcntl.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <grp.h>
> +#include <linux/limits.h>
> +#include <sched.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/eventfd.h>
> +#include <sys/fsuid.h>
> +#include <sys/prctl.h>
> +#include <sys/socket.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +#include <sys/xattr.h>
> +
> +#include "utils.h"
> +
> +#define MAX_USERNS_LEVEL 32
> +
> +#define syserror(format, ...)                           \
> +       ({                                              \
> +               fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \
> +               (-errno);                               \
> +       })
> +
> +#define syserror_set(__ret__, format, ...)                    \
> +       ({                                                    \
> +               typeof(__ret__) __internal_ret__ = (__ret__); \
> +               errno = labs(__ret__);                        \
> +               fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__);       \
> +               __internal_ret__;                             \
> +       })
> +
> +#define STRLITERALLEN(x) (sizeof(""x"") - 1)
> +
> +#define INTTYPE_TO_STRLEN(type)             \
> +       (2 + (sizeof(type) <= 1             \
> +                 ? 3                       \
> +                 : sizeof(type) <= 2       \
> +                       ? 5                 \
> +                       : sizeof(type) <= 4 \
> +                             ? 10          \
> +                             : sizeof(type) <= 8 ? 20 : sizeof(int[-2 * (sizeof(type) > 8)])))
> +
> +#define list_for_each(__iterator, __list) \
> +       for (__iterator = (__list)->next; __iterator != __list; __iterator = __iterator->next)
> +
> +typedef enum idmap_type_t {
> +       ID_TYPE_UID,
> +       ID_TYPE_GID
> +} idmap_type_t;
> +
> +struct id_map {
> +       idmap_type_t map_type;
> +       __u32 nsid;
> +       __u32 hostid;
> +       __u32 range;
> +};
> +
> +struct list {
> +       void *elem;
> +       struct list *next;
> +       struct list *prev;
> +};
> +
> +struct userns_hierarchy {
> +       int fd_userns;
> +       int fd_event;
> +       unsigned int level;
> +       struct list id_map;
> +};
> +
> +static inline void list_init(struct list *list)
> +{
> +       list->elem = NULL;
> +       list->next = list->prev = list;
> +}
> +
> +static inline int list_empty(const struct list *list)
> +{
> +       return list == list->next;
> +}
> +
> +static inline void __list_add(struct list *new, struct list *prev, struct list *next)
> +{
> +       next->prev = new;
> +       new->next = next;
> +       new->prev = prev;
> +       prev->next = new;
> +}
> +
> +static inline void list_add_tail(struct list *head, struct list *list)
> +{
> +       __list_add(list, head->prev, head);
> +}
> +
> +static inline void list_del(struct list *list)
> +{
> +       struct list *next, *prev;
> +
> +       next = list->next;
> +       prev = list->prev;
> +       next->prev = prev;
> +       prev->next = next;
> +}
> +
> +static ssize_t read_nointr(int fd, void *buf, size_t count)
> +{
> +       ssize_t ret;
> +
> +       do {
> +               ret = read(fd, buf, count);
> +       } while (ret < 0 && errno == EINTR);
> +
> +       return ret;
> +}
> +
> +static ssize_t write_nointr(int fd, const void *buf, size_t count)
> +{
> +       ssize_t ret;
> +
> +       do {
> +               ret = write(fd, buf, count);
> +       } while (ret < 0 && errno == EINTR);
> +
> +       return ret;
> +}
> +
> +#define __STACK_SIZE (8 * 1024 * 1024)
> +static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
> +{
> +       void *stack;
> +
> +       stack = malloc(__STACK_SIZE);
> +       if (!stack)
> +               return -ENOMEM;
> +
> +#ifdef __ia64__
> +       return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL);
> +#else
> +       return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL);
> +#endif
> +}
> +
> +static int get_userns_fd_cb(void *data)
> +{
> +       for (;;)
> +               pause();
> +       _exit(0);
> +}
> +
> +static int wait_for_pid(pid_t pid)
> +{
> +       int status, ret;
> +
> +again:
> +       ret = waitpid(pid, &status, 0);
> +       if (ret == -1) {
> +               if (errno == EINTR)
> +                       goto again;
> +
> +               return -1;
> +       }
> +
> +       if (!WIFEXITED(status))
> +               return -1;
> +
> +       return WEXITSTATUS(status);
> +}
> +
> +static int write_id_mapping(idmap_type_t map_type, pid_t pid, const char *buf, size_t buf_size)
> +{
> +       int fd = -EBADF, setgroups_fd = -EBADF;
> +       int fret = -1;
> +       int ret;
> +       char path[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) +
> +                 STRLITERALLEN("/setgroups") + 1];
> +
> +       if (geteuid() != 0 && map_type == ID_TYPE_GID) {
> +               ret = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
> +               if (ret < 0 || ret >= sizeof(path))
> +                       goto out;
> +
> +               setgroups_fd = open(path, O_WRONLY | O_CLOEXEC);
> +               if (setgroups_fd < 0 && errno != ENOENT) {
> +                       syserror("Failed to open \"%s\"", path);
> +                       goto out;
> +               }
> +
> +               if (setgroups_fd >= 0) {
> +                       ret = write_nointr(setgroups_fd, "deny\n", STRLITERALLEN("deny\n"));
> +                       if (ret != STRLITERALLEN("deny\n")) {
> +                               syserror("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
> +                               goto out;
> +                       }
> +               }
> +       }
> +
> +       ret = snprintf(path, sizeof(path), "/proc/%d/%cid_map", pid, map_type == ID_TYPE_UID ? 'u' : 'g');
> +       if (ret < 0 || ret >= sizeof(path))
> +               goto out;
> +
> +       fd = open(path, O_WRONLY | O_CLOEXEC);
> +       if (fd < 0) {
> +               syserror("Failed to open \"%s\"", path);
> +               goto out;
> +       }
> +
> +       ret = write_nointr(fd, buf, buf_size);
> +       if (ret != buf_size) {
> +               syserror("Failed to write %cid mapping to \"%s\"",
> +                        map_type == ID_TYPE_UID ? 'u' : 'g', path);
> +               goto out;
> +       }
> +
> +       fret = 0;
> +out:
> +       close(fd);
> +       close(setgroups_fd);
> +
> +       return fret;
> +}
> +
> +static int map_ids_from_idmap(struct list *idmap, pid_t pid)
> +{
> +       int fill, left;
> +       char mapbuf[4096] = {};
> +       bool had_entry = false;
> +       idmap_type_t map_type, u_or_g;
> +
> +       if (list_empty(idmap))
> +               return 0;
> +
> +       for (map_type = ID_TYPE_UID, u_or_g = 'u';
> +            map_type <= ID_TYPE_GID; map_type++, u_or_g = 'g') {
> +               char *pos = mapbuf;
> +               int ret;
> +               struct list *iterator;
> +
> +
> +               list_for_each(iterator, idmap) {
> +                       struct id_map *map = iterator->elem;
> +                       if (map->map_type != map_type)
> +                               continue;
> +
> +                       had_entry = true;
> +
> +                       left = 4096 - (pos - mapbuf);
> +                       fill = snprintf(pos, left, "%u %u %u\n", map->nsid, map->hostid, map->range);
> +                       /*
> +                        * The kernel only takes <= 4k for writes to
> +                        * /proc/<pid>/{g,u}id_map
> +                        */
> +                       if (fill <= 0 || fill >= left)
> +                               return syserror_set(-E2BIG, "Too many %cid mappings defined", u_or_g);
> +
> +                       pos += fill;
> +               }
> +               if (!had_entry)
> +                       continue;
> +
> +               ret = write_id_mapping(map_type, pid, mapbuf, pos - mapbuf);
> +               if (ret < 0)
> +                       return syserror("Failed to write mapping: %s", mapbuf);
> +
> +               memset(mapbuf, 0, sizeof(mapbuf));
> +       }
> +
> +       return 0;
> +}
> +
> +static int get_userns_fd_from_idmap(struct list *idmap)
> +{
> +       int ret;
> +       pid_t pid;
> +       char path_ns[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) +
> +                    STRLITERALLEN("/ns/user") + 1];
> +
> +       pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER | CLONE_NEWNS);
> +       if (pid < 0)
> +               return -errno;
> +
> +       ret = map_ids_from_idmap(idmap, pid);
> +       if (ret < 0)
> +               return ret;
> +
> +       ret = snprintf(path_ns, sizeof(path_ns), "/proc/%d/ns/user", pid);
> +       if (ret < 0 || (size_t)ret >= sizeof(path_ns))
> +               ret = -EIO;
> +       else
> +               ret = open(path_ns, O_RDONLY | O_CLOEXEC | O_NOCTTY);
> +
> +       (void)kill(pid, SIGKILL);
> +       (void)wait_for_pid(pid);
> +       return ret;
> +}
> +
> +int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range)
> +{
> +       struct list head, uid_mapl, gid_mapl;
> +       struct id_map uid_map = {
> +               .map_type       = ID_TYPE_UID,
> +               .nsid           = nsid,
> +               .hostid         = hostid,
> +               .range          = range,
> +       };
> +       struct id_map gid_map = {
> +               .map_type       = ID_TYPE_GID,
> +               .nsid           = nsid,
> +               .hostid         = hostid,
> +               .range          = range,
> +       };
> +
> +       list_init(&head);
> +       uid_mapl.elem = &uid_map;
> +       gid_mapl.elem = &gid_map;
> +       list_add_tail(&head, &uid_mapl);
> +       list_add_tail(&head, &gid_mapl);
> +
> +       return get_userns_fd_from_idmap(&head);
> +}
> +
> +bool switch_ids(uid_t uid, gid_t gid)
> +{
> +       if (setgroups(0, NULL))
> +               return syserror("failure: setgroups");
> +
> +       if (setresgid(gid, gid, gid))
> +               return syserror("failure: setresgid");
> +
> +       if (setresuid(uid, uid, uid))
> +               return syserror("failure: setresuid");
> +
> +       /* Ensure we can access proc files from processes we can ptrace. */
> +       if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0))
> +               return syserror("failure: make dumpable");
> +
> +       return true;
> +}
> +
> +static int create_userns_hierarchy(struct userns_hierarchy *h);
> +
> +static int userns_fd_cb(void *data)
> +{
> +       struct userns_hierarchy *h = data;
> +       char c;
> +       int ret;
> +
> +       ret = read_nointr(h->fd_event, &c, 1);
> +       if (ret < 0)
> +               return syserror("failure: read from socketpair");
> +
> +       /* Only switch ids if someone actually wrote a mapping for us. */
> +       if (c == '1') {
> +               if (!switch_ids(0, 0))
> +                       return syserror("failure: switch ids to 0");
> +       }
> +
> +       ret = write_nointr(h->fd_event, "1", 1);
> +       if (ret < 0)
> +               return syserror("failure: write to socketpair");
> +
> +       ret = create_userns_hierarchy(++h);
> +       if (ret < 0)
> +               return syserror("failure: userns level %d", h->level);
> +
> +       return 0;
> +}
> +
> +static int create_userns_hierarchy(struct userns_hierarchy *h)
> +{
> +       int fret = -1;
> +       char c;
> +       int fd_socket[2];
> +       int fd_userns = -EBADF, ret = -1;
> +       ssize_t bytes;
> +       pid_t pid;
> +       char path[256];
> +
> +       if (h->level == MAX_USERNS_LEVEL)
> +               return 0;
> +
> +       ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fd_socket);
> +       if (ret < 0)
> +               return syserror("failure: create socketpair");
> +
> +       /* Note the CLONE_FILES | CLONE_VM when mucking with fds and memory. */
> +       h->fd_event = fd_socket[1];
> +       pid = do_clone(userns_fd_cb, h, CLONE_NEWUSER | CLONE_FILES | CLONE_VM);
> +       if (pid < 0) {
> +               syserror("failure: userns level %d", h->level);
> +               goto out_close;
> +       }
> +
> +       ret = map_ids_from_idmap(&h->id_map, pid);
> +       if (ret < 0) {
> +               kill(pid, SIGKILL);
> +               syserror("failure: writing id mapping for userns level %d for %d", h->level, pid);
> +               goto out_wait;
> +       }
> +
> +       if (!list_empty(&h->id_map))
> +               bytes = write_nointr(fd_socket[0], "1", 1); /* Inform the child we wrote a mapping. */
> +       else
> +               bytes = write_nointr(fd_socket[0], "0", 1); /* Inform the child we didn't write a mapping. */
> +       if (bytes < 0) {
> +               kill(pid, SIGKILL);
> +               syserror("failure: write to socketpair");
> +               goto out_wait;
> +       }
> +
> +       /* Wait for child to set*id() and become dumpable. */
> +       bytes = read_nointr(fd_socket[0], &c, 1);
> +       if (bytes < 0) {
> +               kill(pid, SIGKILL);
> +               syserror("failure: read from socketpair");
> +               goto out_wait;
> +       }
> +
> +       snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
> +       fd_userns = open(path, O_RDONLY | O_CLOEXEC);
> +       if (fd_userns < 0) {
> +               kill(pid, SIGKILL);
> +               syserror("failure: open userns level %d for %d", h->level, pid);
> +               goto out_wait;
> +       }
> +
> +       fret = 0;
> +
> +out_wait:
> +       if (!wait_for_pid(pid) && !fret) {
> +               h->fd_userns = fd_userns;
> +               fd_userns = -EBADF;
> +       }
> +
> +out_close:
> +       if (fd_userns >= 0)
> +               close(fd_userns);
> +       close(fd_socket[0]);
> +       close(fd_socket[1]);
> +       return fret;
> +}
> +
> +/* caps_down - lower all effective caps */
> +int caps_down(void)
> +{
> +       bool fret = false;
> +       cap_t caps = NULL;
> +       int ret = -1;
> +
> +       caps = cap_get_proc();
> +       if (!caps)
> +               goto out;
> +
> +       ret = cap_clear_flag(caps, CAP_EFFECTIVE);
> +       if (ret)
> +               goto out;
> +
> +       ret = cap_set_proc(caps);
> +       if (ret)
> +               goto out;
> +
> +       fret = true;
> +
> +out:
> +       cap_free(caps);
> +       return fret;
> +}
> diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
> new file mode 100644
> index 000000000000..f35001a75f99
> --- /dev/null
> +++ b/tools/testing/selftests/filesystems/utils.h
> @@ -0,0 +1,44 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __IDMAP_UTILS_H
> +#define __IDMAP_UTILS_H
> +
> +#ifndef _GNU_SOURCE
> +#define _GNU_SOURCE
> +#endif
> +#include <errno.h>
> +#include <linux/types.h>
> +#include <sched.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <syscall.h>
> +#include <sys/capability.h>
> +#include <sys/fsuid.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +
> +extern int get_userns_fd(unsigned long nsid, unsigned long hostid,
> +                        unsigned long range);
> +
> +extern int caps_down(void);
> +
> +extern bool switch_ids(uid_t uid, gid_t gid);
> +
> +static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
> +{
> +       if (setns(fd, CLONE_NEWUSER))
> +               return false;
> +
> +       if (!switch_ids(uid, gid))
> +               return false;
> +
> +       if (drop_caps && !caps_down())
> +               return false;
> +
> +       return true;
> +}
> +
> +#endif /* __IDMAP_UTILS_H */
>
> --
> 2.47.2
>
Christian Brauner Feb. 19, 2025, 1:25 p.m. UTC | #2
On Wed, Feb 19, 2025 at 01:36:17PM +0100, Amir Goldstein wrote:
> On Wed, Feb 19, 2025 at 11:02 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > Add a simple test to verify that the new "override_creds" option works.
> >
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> 
> For the added test you may add:
> 
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> 
> But you may want to consider splitting the large infrastructure
> and the churn to the previous test to a separate patch, to make this
> patch cleaner.

Done.
diff mbox series

Patch

diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile
index e8d1adb021af..6c661232b3b5 100644
--- a/tools/testing/selftests/filesystems/overlayfs/Makefile
+++ b/tools/testing/selftests/filesystems/overlayfs/Makefile
@@ -1,7 +1,14 @@ 
 # SPDX-License-Identifier: GPL-2.0
 
-TEST_GEN_PROGS := dev_in_maps set_layers_via_fds
+CFLAGS += -Wall
+CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lcap
 
-CFLAGS := -Wall -Werror
+LOCAL_HDRS += wrappers.h log.h
+
+TEST_GEN_PROGS := dev_in_maps
+TEST_GEN_PROGS += set_layers_via_fds
 
 include ../../lib.mk
+
+$(OUTPUT)/set_layers_via_fds: ../utils.c
diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
index 70acd833581d..6b65e3610578 100644
--- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
+++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
@@ -6,6 +6,7 @@ 
 #include <sched.h>
 #include <stdio.h>
 #include <string.h>
+#include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/mount.h>
 #include <unistd.h>
@@ -13,20 +14,27 @@ 
 #include "../../kselftest_harness.h"
 #include "../../pidfd/pidfd.h"
 #include "log.h"
+#include "../utils.h"
 #include "wrappers.h"
 
 FIXTURE(set_layers_via_fds) {
+	int pidfd;
 };
 
 FIXTURE_SETUP(set_layers_via_fds)
 {
-	ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
+	self->pidfd = -EBADF;
+	EXPECT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
 }
 
 FIXTURE_TEARDOWN(set_layers_via_fds)
 {
+	if (self->pidfd >= 0) {
+		EXPECT_EQ(sys_pidfd_send_signal(self->pidfd, SIGKILL, NULL, 0), 0);
+		EXPECT_EQ(close(self->pidfd), 0);
+	}
 	umount2("/set_layers_via_fds", 0);
-	ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);
+	EXPECT_EQ(rmdir("/set_layers_via_fds"), 0);
 }
 
 TEST_F(set_layers_via_fds, set_layers_via_fds)
@@ -266,7 +274,7 @@  TEST_F(set_layers_via_fds, set_override_creds)
 	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
 
 	pid = create_child(&pidfd, 0);
-	EXPECT_GE(pid, 0);
+	ASSERT_GE(pid, 0);
 	if (pid == 0) {
 		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
 			TH_LOG("sys_fsconfig should have succeeded");
@@ -275,11 +283,11 @@  TEST_F(set_layers_via_fds, set_override_creds)
 
 		_exit(EXIT_SUCCESS);
 	}
-	EXPECT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
-	EXPECT_EQ(close(pidfd), 0);
+	ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+	ASSERT_GE(close(pidfd), 0);
 
 	pid = create_child(&pidfd, 0);
-	EXPECT_GE(pid, 0);
+	ASSERT_GE(pid, 0);
 	if (pid == 0) {
 		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0)) {
 			TH_LOG("sys_fsconfig should have succeeded");
@@ -288,11 +296,11 @@  TEST_F(set_layers_via_fds, set_override_creds)
 
 		_exit(EXIT_SUCCESS);
 	}
-	EXPECT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
-	EXPECT_EQ(close(pidfd), 0);
+	ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+	ASSERT_GE(close(pidfd), 0);
 
 	pid = create_child(&pidfd, 0);
-	EXPECT_GE(pid, 0);
+	ASSERT_GE(pid, 0);
 	if (pid == 0) {
 		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
 			TH_LOG("sys_fsconfig should have succeeded");
@@ -301,8 +309,125 @@  TEST_F(set_layers_via_fds, set_override_creds)
 
 		_exit(EXIT_SUCCESS);
 	}
-	EXPECT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
-	EXPECT_EQ(close(pidfd), 0);
+	ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0);
+	ASSERT_GE(close(pidfd), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+	fd_overlay = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(fd_overlay, 0);
+
+	ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+	ASSERT_EQ(close(fd_context), 0);
+	ASSERT_EQ(close(fd_overlay), 0);
+}
+
+TEST_F(set_layers_via_fds, set_override_creds_invalid)
+{
+	int fd_context, fd_tmpfs, fd_overlay, ret;
+	int layer_fds[] = { [0 ... 3] = -EBADF };
+	pid_t pid;
+	int fd_userns1, fd_userns2;
+	int ipc_sockets[2];
+	char c;
+	const unsigned int predictable_fd_context_nr = 123;
+
+	fd_userns1 = get_userns_fd(0, 0, 10000);
+	ASSERT_GE(fd_userns1, 0);
+
+	fd_userns2 = get_userns_fd(0, 1234, 10000);
+	ASSERT_GE(fd_userns2, 0);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_GE(ret, 0);
+
+	pid = create_child(&self->pidfd, 0);
+	ASSERT_GE(pid, 0);
+	if (pid == 0) {
+		if (close(ipc_sockets[0])) {
+			TH_LOG("close should have succeeded");
+			_exit(EXIT_FAILURE);
+		}
+
+		if (!switch_userns(fd_userns2, 0, 0, false)) {
+			TH_LOG("switch_userns should have succeeded");
+			_exit(EXIT_FAILURE);
+		}
+
+		if (read_nointr(ipc_sockets[1], &c, 1) != 1) {
+			TH_LOG("read_nointr should have succeeded");
+			_exit(EXIT_FAILURE);
+		}
+
+		if (close(ipc_sockets[1])) {
+			TH_LOG("close should have succeeded");
+			_exit(EXIT_FAILURE);
+		}
+
+		if (!sys_fsconfig(predictable_fd_context_nr, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
+			TH_LOG("sys_fsconfig should have failed");
+			_exit(EXIT_FAILURE);
+		}
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	ASSERT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(switch_userns(fd_userns1, 0, 0, false), true);
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+	fd_context = sys_fsopen("tmpfs", 0);
+	ASSERT_GE(fd_context, 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+	ASSERT_GE(fd_tmpfs, 0);
+	ASSERT_EQ(close(fd_context), 0);
+
+	ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+	ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+
+	layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+	ASSERT_GE(layer_fds[0], 0);
+
+	layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+	ASSERT_GE(layer_fds[1], 0);
+
+	layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+	ASSERT_GE(layer_fds[2], 0);
+
+	layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+	ASSERT_GE(layer_fds[3], 0);
+
+	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+	ASSERT_EQ(close(fd_tmpfs), 0);
+
+	fd_context = sys_fsopen("overlay", 0);
+	ASSERT_GE(fd_context, 0);
+	ASSERT_EQ(dup3(fd_context, predictable_fd_context_nr, 0), predictable_fd_context_nr);
+	ASSERT_EQ(close(fd_context), 0);
+	fd_context = predictable_fd_context_nr;
+	ASSERT_EQ(write_nointr(ipc_sockets[0], "1", 1), 1);
+	ASSERT_EQ(close(ipc_sockets[0]), 0);
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+	ASSERT_EQ(close(self->pidfd), 0);
+	self->pidfd = -EBADF;
+
+	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, layer_fds[0]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, layer_fds[1]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+
+	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++)
+		ASSERT_EQ(close(layer_fds[i]), 0);
+
+	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0);
 
 	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
 
@@ -313,6 +438,8 @@  TEST_F(set_layers_via_fds, set_override_creds)
 
 	ASSERT_EQ(close(fd_context), 0);
 	ASSERT_EQ(close(fd_overlay), 0);
+	ASSERT_EQ(close(fd_userns1), 0);
+	ASSERT_EQ(close(fd_userns2), 0);
 }
 
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
new file mode 100644
index 000000000000..0e8080bd0aea
--- /dev/null
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -0,0 +1,474 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fcntl.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <grp.h>
+#include <linux/limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <sys/fsuid.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/xattr.h>
+
+#include "utils.h"
+
+#define MAX_USERNS_LEVEL 32
+
+#define syserror(format, ...)                           \
+	({                                              \
+		fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \
+		(-errno);                               \
+	})
+
+#define syserror_set(__ret__, format, ...)                    \
+	({                                                    \
+		typeof(__ret__) __internal_ret__ = (__ret__); \
+		errno = labs(__ret__);                        \
+		fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__);       \
+		__internal_ret__;                             \
+	})
+
+#define STRLITERALLEN(x) (sizeof(""x"") - 1)
+
+#define INTTYPE_TO_STRLEN(type)             \
+	(2 + (sizeof(type) <= 1             \
+		  ? 3                       \
+		  : sizeof(type) <= 2       \
+			? 5                 \
+			: sizeof(type) <= 4 \
+			      ? 10          \
+			      : sizeof(type) <= 8 ? 20 : sizeof(int[-2 * (sizeof(type) > 8)])))
+
+#define list_for_each(__iterator, __list) \
+	for (__iterator = (__list)->next; __iterator != __list; __iterator = __iterator->next)
+
+typedef enum idmap_type_t {
+	ID_TYPE_UID,
+	ID_TYPE_GID
+} idmap_type_t;
+
+struct id_map {
+	idmap_type_t map_type;
+	__u32 nsid;
+	__u32 hostid;
+	__u32 range;
+};
+
+struct list {
+	void *elem;
+	struct list *next;
+	struct list *prev;
+};
+
+struct userns_hierarchy {
+	int fd_userns;
+	int fd_event;
+	unsigned int level;
+	struct list id_map;
+};
+
+static inline void list_init(struct list *list)
+{
+	list->elem = NULL;
+	list->next = list->prev = list;
+}
+
+static inline int list_empty(const struct list *list)
+{
+	return list == list->next;
+}
+
+static inline void __list_add(struct list *new, struct list *prev, struct list *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+static inline void list_add_tail(struct list *head, struct list *list)
+{
+	__list_add(list, head->prev, head);
+}
+
+static inline void list_del(struct list *list)
+{
+	struct list *next, *prev;
+
+	next = list->next;
+	prev = list->prev;
+	next->prev = prev;
+	prev->next = next;
+}
+
+static ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	ssize_t ret;
+
+	do {
+		ret = read(fd, buf, count);
+	} while (ret < 0 && errno == EINTR);
+
+	return ret;
+}
+
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	ssize_t ret;
+
+	do {
+		ret = write(fd, buf, count);
+	} while (ret < 0 && errno == EINTR);
+
+	return ret;
+}
+
+#define __STACK_SIZE (8 * 1024 * 1024)
+static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
+{
+	void *stack;
+
+	stack = malloc(__STACK_SIZE);
+	if (!stack)
+		return -ENOMEM;
+
+#ifdef __ia64__
+	return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#else
+	return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#endif
+}
+
+static int get_userns_fd_cb(void *data)
+{
+	for (;;)
+		pause();
+	_exit(0);
+}
+
+static int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+
+		return -1;
+	}
+
+	if (!WIFEXITED(status))
+		return -1;
+
+	return WEXITSTATUS(status);
+}
+
+static int write_id_mapping(idmap_type_t map_type, pid_t pid, const char *buf, size_t buf_size)
+{
+	int fd = -EBADF, setgroups_fd = -EBADF;
+	int fret = -1;
+	int ret;
+	char path[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) +
+		  STRLITERALLEN("/setgroups") + 1];
+
+	if (geteuid() != 0 && map_type == ID_TYPE_GID) {
+		ret = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
+		if (ret < 0 || ret >= sizeof(path))
+			goto out;
+
+		setgroups_fd = open(path, O_WRONLY | O_CLOEXEC);
+		if (setgroups_fd < 0 && errno != ENOENT) {
+			syserror("Failed to open \"%s\"", path);
+			goto out;
+		}
+
+		if (setgroups_fd >= 0) {
+			ret = write_nointr(setgroups_fd, "deny\n", STRLITERALLEN("deny\n"));
+			if (ret != STRLITERALLEN("deny\n")) {
+				syserror("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
+				goto out;
+			}
+		}
+	}
+
+	ret = snprintf(path, sizeof(path), "/proc/%d/%cid_map", pid, map_type == ID_TYPE_UID ? 'u' : 'g');
+	if (ret < 0 || ret >= sizeof(path))
+		goto out;
+
+	fd = open(path, O_WRONLY | O_CLOEXEC);
+	if (fd < 0) {
+		syserror("Failed to open \"%s\"", path);
+		goto out;
+	}
+
+	ret = write_nointr(fd, buf, buf_size);
+	if (ret != buf_size) {
+		syserror("Failed to write %cid mapping to \"%s\"",
+			 map_type == ID_TYPE_UID ? 'u' : 'g', path);
+		goto out;
+	}
+
+	fret = 0;
+out:
+	close(fd);
+	close(setgroups_fd);
+
+	return fret;
+}
+
+static int map_ids_from_idmap(struct list *idmap, pid_t pid)
+{
+	int fill, left;
+	char mapbuf[4096] = {};
+	bool had_entry = false;
+	idmap_type_t map_type, u_or_g;
+
+	if (list_empty(idmap))
+		return 0;
+
+	for (map_type = ID_TYPE_UID, u_or_g = 'u';
+	     map_type <= ID_TYPE_GID; map_type++, u_or_g = 'g') {
+		char *pos = mapbuf;
+		int ret;
+		struct list *iterator;
+
+
+		list_for_each(iterator, idmap) {
+			struct id_map *map = iterator->elem;
+			if (map->map_type != map_type)
+				continue;
+
+			had_entry = true;
+
+			left = 4096 - (pos - mapbuf);
+			fill = snprintf(pos, left, "%u %u %u\n", map->nsid, map->hostid, map->range);
+			/*
+			 * The kernel only takes <= 4k for writes to
+			 * /proc/<pid>/{g,u}id_map
+			 */
+			if (fill <= 0 || fill >= left)
+				return syserror_set(-E2BIG, "Too many %cid mappings defined", u_or_g);
+
+			pos += fill;
+		}
+		if (!had_entry)
+			continue;
+
+		ret = write_id_mapping(map_type, pid, mapbuf, pos - mapbuf);
+		if (ret < 0)
+			return syserror("Failed to write mapping: %s", mapbuf);
+
+		memset(mapbuf, 0, sizeof(mapbuf));
+	}
+
+	return 0;
+}
+
+static int get_userns_fd_from_idmap(struct list *idmap)
+{
+	int ret;
+	pid_t pid;
+	char path_ns[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) +
+		     STRLITERALLEN("/ns/user") + 1];
+
+	pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER | CLONE_NEWNS);
+	if (pid < 0)
+		return -errno;
+
+	ret = map_ids_from_idmap(idmap, pid);
+	if (ret < 0)
+		return ret;
+
+	ret = snprintf(path_ns, sizeof(path_ns), "/proc/%d/ns/user", pid);
+	if (ret < 0 || (size_t)ret >= sizeof(path_ns))
+		ret = -EIO;
+	else
+		ret = open(path_ns, O_RDONLY | O_CLOEXEC | O_NOCTTY);
+
+	(void)kill(pid, SIGKILL);
+	(void)wait_for_pid(pid);
+	return ret;
+}
+
+int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range)
+{
+	struct list head, uid_mapl, gid_mapl;
+	struct id_map uid_map = {
+		.map_type	= ID_TYPE_UID,
+		.nsid		= nsid,
+		.hostid		= hostid,
+		.range		= range,
+	};
+	struct id_map gid_map = {
+		.map_type	= ID_TYPE_GID,
+		.nsid		= nsid,
+		.hostid		= hostid,
+		.range		= range,
+	};
+
+	list_init(&head);
+	uid_mapl.elem = &uid_map;
+	gid_mapl.elem = &gid_map;
+	list_add_tail(&head, &uid_mapl);
+	list_add_tail(&head, &gid_mapl);
+
+	return get_userns_fd_from_idmap(&head);
+}
+
+bool switch_ids(uid_t uid, gid_t gid)
+{
+	if (setgroups(0, NULL))
+		return syserror("failure: setgroups");
+
+	if (setresgid(gid, gid, gid))
+		return syserror("failure: setresgid");
+
+	if (setresuid(uid, uid, uid))
+		return syserror("failure: setresuid");
+
+	/* Ensure we can access proc files from processes we can ptrace. */
+	if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0))
+		return syserror("failure: make dumpable");
+
+	return true;
+}
+
+static int create_userns_hierarchy(struct userns_hierarchy *h);
+
+static int userns_fd_cb(void *data)
+{
+	struct userns_hierarchy *h = data;
+	char c;
+	int ret;
+
+	ret = read_nointr(h->fd_event, &c, 1);
+	if (ret < 0)
+		return syserror("failure: read from socketpair");
+
+	/* Only switch ids if someone actually wrote a mapping for us. */
+	if (c == '1') {
+		if (!switch_ids(0, 0))
+			return syserror("failure: switch ids to 0");
+	}
+
+	ret = write_nointr(h->fd_event, "1", 1);
+	if (ret < 0)
+		return syserror("failure: write to socketpair");
+
+	ret = create_userns_hierarchy(++h);
+	if (ret < 0)
+		return syserror("failure: userns level %d", h->level);
+
+	return 0;
+}
+
+static int create_userns_hierarchy(struct userns_hierarchy *h)
+{
+	int fret = -1;
+	char c;
+	int fd_socket[2];
+	int fd_userns = -EBADF, ret = -1;
+	ssize_t bytes;
+	pid_t pid;
+	char path[256];
+
+	if (h->level == MAX_USERNS_LEVEL)
+		return 0;
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fd_socket);
+	if (ret < 0)
+		return syserror("failure: create socketpair");
+
+	/* Note the CLONE_FILES | CLONE_VM when mucking with fds and memory. */
+	h->fd_event = fd_socket[1];
+	pid = do_clone(userns_fd_cb, h, CLONE_NEWUSER | CLONE_FILES | CLONE_VM);
+	if (pid < 0) {
+		syserror("failure: userns level %d", h->level);
+		goto out_close;
+	}
+
+	ret = map_ids_from_idmap(&h->id_map, pid);
+	if (ret < 0) {
+		kill(pid, SIGKILL);
+		syserror("failure: writing id mapping for userns level %d for %d", h->level, pid);
+		goto out_wait;
+	}
+
+	if (!list_empty(&h->id_map))
+		bytes = write_nointr(fd_socket[0], "1", 1); /* Inform the child we wrote a mapping. */
+	else
+		bytes = write_nointr(fd_socket[0], "0", 1); /* Inform the child we didn't write a mapping. */
+	if (bytes < 0) {
+		kill(pid, SIGKILL);
+		syserror("failure: write to socketpair");
+		goto out_wait;
+	}
+
+	/* Wait for child to set*id() and become dumpable. */
+	bytes = read_nointr(fd_socket[0], &c, 1);
+	if (bytes < 0) {
+		kill(pid, SIGKILL);
+		syserror("failure: read from socketpair");
+		goto out_wait;
+	}
+
+	snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+	fd_userns = open(path, O_RDONLY | O_CLOEXEC);
+	if (fd_userns < 0) {
+		kill(pid, SIGKILL);
+		syserror("failure: open userns level %d for %d", h->level, pid);
+		goto out_wait;
+	}
+
+	fret = 0;
+
+out_wait:
+	if (!wait_for_pid(pid) && !fret) {
+		h->fd_userns = fd_userns;
+		fd_userns = -EBADF;
+	}
+
+out_close:
+	if (fd_userns >= 0)
+		close(fd_userns);
+	close(fd_socket[0]);
+	close(fd_socket[1]);
+	return fret;
+}
+
+/* caps_down - lower all effective caps */
+int caps_down(void)
+{
+	bool fret = false;
+	cap_t caps = NULL;
+	int ret = -1;
+
+	caps = cap_get_proc();
+	if (!caps)
+		goto out;
+
+	ret = cap_clear_flag(caps, CAP_EFFECTIVE);
+	if (ret)
+		goto out;
+
+	ret = cap_set_proc(caps);
+	if (ret)
+		goto out;
+
+	fret = true;
+
+out:
+	cap_free(caps);
+	return fret;
+}
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
new file mode 100644
index 000000000000..f35001a75f99
--- /dev/null
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -0,0 +1,44 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __IDMAP_UTILS_H
+#define __IDMAP_UTILS_H
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <errno.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/fsuid.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+extern int get_userns_fd(unsigned long nsid, unsigned long hostid,
+			 unsigned long range);
+
+extern int caps_down(void);
+
+extern bool switch_ids(uid_t uid, gid_t gid);
+
+static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
+{
+	if (setns(fd, CLONE_NEWUSER))
+		return false;
+
+	if (!switch_ids(uid, gid))
+		return false;
+
+	if (drop_caps && !caps_down())
+		return false;
+
+	return true;
+}
+
+#endif /* __IDMAP_UTILS_H */