@@ -169,7 +169,7 @@ static inline void sb_end_ro_state_change(struct super_block *sb)
* open.c
*/
struct open_flags {
- int open_flag;
+ u64 open_flag;
umode_t mode;
int acc_mode;
int intent;
@@ -586,6 +586,7 @@ struct nameidata {
int dfd;
vfsuid_t dir_vfsuid;
umode_t dir_mode;
+ const struct cred *dir_open_cred;
} __randomize_layout;
#define ND_ROOT_PRESET 1
@@ -695,6 +696,7 @@ static void terminate_walk(struct nameidata *nd)
nd->depth = 0;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
+ put_cred(nd->dir_open_cred);
}
/* path_put is needed afterwards regardless of success or failure */
@@ -2414,6 +2416,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}
+ nd->dir_open_cred = get_current_cred();
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
@@ -2437,6 +2440,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
+ nd->dir_open_cred = get_cred(f.file->f_cred);
fdput(f);
}
@@ -3794,8 +3798,28 @@ static struct file *path_openat(struct nameidata *nd,
error = do_o_path(nd, flags, file);
} else {
const char *s = path_init(nd, flags);
- file = alloc_empty_file(op->open_flag, current_cred());
- error = PTR_ERR_OR_ZERO(file);
+ const struct cred *old_cred = NULL;
+
+ error = 0;
+ if (op->open_flag & OA2_INHERIT_CRED) {
+ /* Make sure to work only with restricted
+ * look-up modes.
+ */
+ if (!(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
+ error = -EPERM;
+ /* Only work with O_CLOEXEC dirs. */
+ if (!get_close_on_exec(nd->dfd))
+ error = -EPERM;
+
+ if (!error)
+ old_cred = override_creds(nd->dir_open_cred);
+ }
+ if (!error) {
+ file = alloc_empty_file(op->open_flag, current_cred());
+ error = PTR_ERR_OR_ZERO(file);
+ } else {
+ file = ERR_PTR(error);
+ }
if (!error) {
while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
@@ -3803,6 +3827,8 @@ static struct file *path_openat(struct nameidata *nd,
}
if (!error)
error = do_open(nd, file, op);
+ if (old_cred)
+ revert_creds(old_cred);
terminate_walk(nd);
if (IS_ERR(file))
return file;
@@ -1225,7 +1225,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
* values before calling build_open_flags(), but openat2(2) checks all
* of its arguments.
*/
- if (flags & ~VALID_OPEN_FLAGS)
+ if (flags & ~VALID_OPENAT2_FLAGS)
return -EINVAL;
if (how->resolve & ~VALID_RESOLVE_FLAGS)
return -EINVAL;
@@ -12,6 +12,8 @@
FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+#define VALID_OPENAT2_FLAGS (VALID_OPEN_FLAGS | OA2_INHERIT_CRED)
+
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
@@ -40,4 +40,7 @@ struct open_how {
return -EAGAIN if that's not
possible. */
+/* openat2-specific flags are placed in the upper 32 bits. */
+#define OA2_INHERIT_CRED (1ULL << 32)
+
#endif /* _UAPI_LINUX_OPENAT2_H */
The OA2_INHERIT_CRED flag performs the open operation with the credentials that were in effect when dir_fd was opened. This allows a process to pre-open some directories and then change its eUID (and all other UIDs/GIDs) to a less-privileged user while retaining the ability to open/create files within these directories. Design goal: The idea is to provide very lightweight sandboxing, where the process, without using any heavyweight techniques such as chroot within namespaces, can restrict its access to the set of pre-opened directories. This patch is just a first step toward such sandboxing. If things go well, the same extension can be added to more syscalls in the future. These should include at least unlinkat(), renameat2() and the not-yet-upstreamed setxattrat(). Security considerations: To avoid sandbox escapes, this patch makes sure that one of the restricted lookup modes is used, namely RESOLVE_BENEATH or RESOLVE_IN_ROOT. To avoid leaking creds across exec, this patch requires the O_CLOEXEC flag on the directory fd. Use cases: Virtual machines that deal with untrusted code can use this instead of more heavyweight approaches. Currently the approach is being tested on a dosemu2 VM. Signed-off-by: Stas Sergeev <stsp2@yandex.ru> CC: Stefan Metzmacher <metze@samba.org> CC: Eric Biederman <ebiederm@xmission.com> CC: Alexander Viro <viro@zeniv.linux.org.uk> CC: Andy Lutomirski <luto@kernel.org> CC: Christian Brauner <brauner@kernel.org> CC: Jan Kara <jack@suse.cz> CC: Jeff Layton <jlayton@kernel.org> CC: Chuck Lever <chuck.lever@oracle.com> CC: Alexander Aring <alex.aring@gmail.com> CC: linux-fsdevel@vger.kernel.org CC: linux-kernel@vger.kernel.org CC: Paolo Bonzini <pbonzini@redhat.com> CC: Christian Göttsche <cgzones@googlemail.com> --- fs/internal.h | 2 +- fs/namei.c | 30 ++++++++++++++++++++++++++++-- fs/open.c | 2 +- include/linux/fcntl.h | 2 ++ include/uapi/linux/openat2.h | 3 +++ 5 files changed, 35 insertions(+), 4 deletions(-)