@@ -36,6 +36,7 @@ Currently, these files are in /proc/sys/fs:
- inode-max
- inode-nr
- inode-state
+- introspection_policy
- nr_open
- overflowuid
- overflowgid
@@ -165,6 +166,55 @@ system needs to prune the inode list instead of allocating
more.
+introspection_policy
+--------------------
+
+An interpreter can call :manpage:`introspect_access(2)` with an ``X_OK`` mode
+to check that opened regular files are expected to be executable. If the file
+is not identified as executable, then the syscall returns -EACCES. This may
+allow a script interpreter to check executable permission before reading
+commands from a file, or a dynamic linker to only load executable shared
+objects. One interesting use case is to enforce a "write xor execute" policy
+through interpreters.
+
+The ability to restrict code execution must be thought of as a system-wide policy,
+which first starts by restricting mount points with the ``noexec`` option.
+This option is also automatically applied to special filesystems such as /proc .
+This prevents files on such mount points from being directly executed by the kernel
+or mapped as executable memory (e.g. libraries). With script interpreters
+using :manpage:`introspect_access(2)`, the executable permission can then be
+checked before reading commands from files. This makes it possible to enforce
+the ``noexec`` option at the interpreter level, and thus to propagate this
+security policy to scripts. To be fully effective, these interpreters also need to
+handle the other ways to execute code: command line parameters (e.g., option
+``-e`` for Perl), module loading (e.g., option ``-m`` for Python), stdin, file
+sourcing, environment variables, configuration files, etc. Depending on the
+threat model, it may be acceptable to allow some script interpreters (e.g.
+Bash) to interpret commands from stdin, be it a TTY or a pipe, because this
+may not be enough to (directly) perform syscalls.
+
+There are two complementary security policies: enforce the ``noexec`` mount
+option, and enforce executable file permission. These policies are handled by
+the ``fs.introspection_policy`` sysctl (writable only with ``CAP_SYS_ADMIN``)
+as a bitmask:
+
+1 - Mount restriction: checks that the mount options for the underlying VFS
+ mount do not prevent execution.
+
+2 - File permission restriction: checks that the file is marked as
+ executable for the current process (e.g., POSIX permissions, ACLs).
+
+Note that as long as a policy is enforced, checking any non-regular file with
+:manpage:`introspect_access(2)` returns -EACCES (e.g. TTYs, pipes), even when
+such a file is marked as executable or is on an executable mount point.
+
+Code samples can be found in
+tools/testing/selftests/interpreter/introspection_policy_test.c and interpreter
+patches (for the original O_MAYEXEC) are available at
+https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC .
+See also an overview article: https://lwn.net/Articles/820000/ .
+
+
overflowgid & overflowuid
-------------------------
@@ -32,6 +32,7 @@
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
+#include <linux/sysctl.h>
#include "internal.h"
@@ -482,6 +483,84 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
return do_faccessat(AT_FDCWD, filename, mode, 0);
}
+/*
+ * Bits of the fs.introspection_policy sysctl, a bitmask selecting which
+ * restrictions introspect_access(2) enforces for execute requests.
+ */
+/* The mount of the checked file must not forbid execution (noexec). */
+#define INTROSPECTION_EXEC_MOUNT	BIT(0)
+/* The checked file must carry the execute permission for the caller. */
+#define INTROSPECTION_EXEC_FILE		BIT(1)
+
+/* Zero-initialized: no restriction is enforced until the sysctl is set. */
+int sysctl_introspection_policy __read_mostly;
+
+/*
+ * introspect_access(2): let user space (e.g. a script interpreter or a dynamic
+ * linker) ask whether the file behind @fd may be executed according to the
+ * system-wide policy set through the fs.introspection_policy sysctl.
+ *
+ * @fd: descriptor of the already opened file to check.
+ * @mode: requested access; only the execute bit (X_OK, checked here as
+ *        S_IXOTH) is accepted for now.
+ * @flags: must be zero.
+ *
+ * Returns -EINVAL for a non-zero @flags or an unsupported @mode, -EBADF when
+ * fdget() cannot resolve @fd, -EACCES when the enforced policy denies the
+ * access, and otherwise the result of inode_permission().
+ */
+SYSCALL_DEFINE3(introspect_access, const int, fd, const int, mode, const int, flags)
+{
+	int mask, policy, err = -EACCES;
+	struct fd f;
+	struct inode *inode;
+
+	if (flags)
+		return -EINVAL;
+
+	/* Only allows X_OK for now. */
+	if (mode != S_IXOTH)
+		return -EINVAL;
+	mask = MAY_EXEC;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	inode = d_backing_inode(f.file->f_path.dentry);
+
+	/*
+	 * Snapshot the policy once: it can be rewritten through sysctl at any
+	 * time, and all the checks below must be evaluated against the same
+	 * value to give a coherent answer.
+	 */
+	policy = READ_ONCE(sysctl_introspection_policy);
+
+	/*
+	 * For compatibility reasons, without a defined security policy (via
+	 * sysctl or LSM), we must map the execute permission to the read
+	 * permission.  Indeed, from the user space point of view, being able
+	 * to execute data (e.g. scripts) implies being able to read this data.
+	 *
+	 * The MAY_INTROSPECTION_EXEC bit is set to enable LSMs to add custom
+	 * checks, while being compatible with current policies.
+	 */
+	if (mask & MAY_EXEC) {
+		mask |= MAY_INTROSPECTION_EXEC;
+		/*
+		 * If there is a system-wide execute policy enforced, then
+		 * forbid access to non-regular files and special superblocks.
+		 */
+		if (policy & (INTROSPECTION_EXEC_MOUNT |
+			      INTROSPECTION_EXEC_FILE)) {
+			if (!S_ISREG(inode->i_mode))
+				goto out_fd;
+			/*
+			 * Denies access to pseudo filesystems that will never
+			 * be mountable (e.g. sockfs, pipefs) but can still be
+			 * reachable through /proc/self/fd, or memfd-like file
+			 * descriptors, or nsfs-like files.
+			 *
+			 * According to the tests, SB_NOEXEC seems to be only
+			 * used by proc and nsfs filesystems.  Is it correct?
+			 */
+			if (f.file->f_path.dentry->d_sb->s_flags &
+			    (SB_NOUSER | SB_KERNMOUNT | SB_NOEXEC))
+				goto out_fd;
+		}
+
+		if ((policy & INTROSPECTION_EXEC_MOUNT) &&
+		    path_noexec(&f.file->f_path))
+			goto out_fd;
+		/*
+		 * For compatibility reasons, if the system-wide policy doesn't
+		 * enforce file permission checks, then replace the execute
+		 * permission request with a read permission request.
+		 */
+		if (!(policy & INTROSPECTION_EXEC_FILE))
+			mask &= ~MAY_EXEC;
+		/* To be executed *by* user space, files must be readable. */
+		mask |= MAY_READ;
+	}
+
+	err = inode_permission(inode, mask | MAY_ACCESS);
+
+out_fd:
+	fdput(f);
+	return err;
+}
+
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
struct path path;
@@ -83,6 +83,7 @@ extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
extern int sysctl_protected_fifos;
extern int sysctl_protected_regular;
+extern int sysctl_introspection_policy;
typedef __kernel_rwf_t rwf_t;
@@ -101,6 +102,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define MAY_CHDIR 0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK 0x00000080
+/* introspection accesses, cf. introspect_access(2) */
+#define MAY_INTROSPECTION_EXEC 0x00000100
/*
* flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond
@@ -429,6 +429,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
asmlinkage long sys_faccessat2(int dfd, const char __user *filename, int mode,
int flags);
+asmlinkage long sys_introspect_access(int fd, int mode, int flags);
asmlinkage long sys_chdir(const char __user *filename);
asmlinkage long sys_fchdir(unsigned int fd);
asmlinkage long sys_chroot(const char __user *filename);
@@ -113,6 +113,7 @@ static int sixty = 60;
static int __maybe_unused neg_one = -1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
static unsigned long zero_ul;
static unsigned long one_ul = 1;
@@ -887,7 +888,6 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
-#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -896,7 +896,6 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
-#endif
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
@@ -3293,6 +3292,15 @@ static struct ctl_table fs_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
+ {
+ .procname = "introspection_policy",
+ .data = &sysctl_introspection_policy,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &three,
+ },
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
.procname = "binfmt_misc",