@@ -36,6 +36,7 @@ Currently, these files are in /proc/sys/fs:
- inode-max
- inode-nr
- inode-state
+- interpreted_access
- nr_open
- overflowuid
- overflowgid
@@ -165,6 +166,59 @@ system needs to prune the inode list instead of allocating
more.
+interpreted_access
+------------------
+
+The ``AT_INTERPRETED`` flag with an ``X_OK`` mode can be passed to
+:manpage:`faccessat2(2)` by an interpreter to check that regular files are
+expected to be executable. If the file is not identified as executable, then
+the syscall returns -EACCES. This may allow a script interpreter to check
+executable permission before reading commands from a file, or a dynamic linker
+to only load executable shared objects. One interesting use case is to enforce
+a "write xor execute" policy through interpreters.
+
+To avoid race conditions, it is highly recommended to first open the file and
+then do the check on the new file descriptor thanks to the ``AT_EMPTY_PATH``
+flag.
+
+The ability to restrict code execution must be thought of as a system-wide
+policy, which first starts by restricting mount points with the ``noexec``
+option. This option is also automatically applied to special filesystems such
+as /proc. This prevents files on such mount points from being directly executed
+by the kernel or mapped as executable memory (e.g. libraries). With script
+interpreters using :manpage:`faccessat2(2)` and ``AT_INTERPRETED``, the
+executable permission can then be checked before reading commands from files.
+This makes it possible to enforce ``noexec`` at the interpreter level, and thus
+propagate this security policy to scripts. To be fully effective, these
+interpreters also need to handle the other ways to execute code: command line
+parameters (e.g., option ``-e`` for Perl), module loading (e.g., option ``-m``
+for Python), stdin, file sourcing, environment variables, configuration files,
+etc. According to the threat model, it may be acceptable to allow some script
+interpreters (e.g. Bash) to interpret commands from stdin, whether it is a TTY
+or a pipe, because it may not be enough to (directly) perform syscalls.
+
+There are two complementary security policies: enforce the ``noexec`` mount
+option, and enforce executable file permission. These policies are handled by
+the ``fs.interpreted_access`` sysctl (writable only with ``CAP_SYS_ADMIN``)
+as a bitmask:
+
+1 - Mount restriction: checks that the mount options for the underlying VFS
+ mount do not prevent execution.
+
+2 - File permission restriction: checks that the file is marked as
+ executable for the current process (e.g., POSIX permissions, ACLs).
+
+Note that as long as a policy is enforced, checking any non-regular file with
+``AT_INTERPRETED`` returns -EACCES (e.g. TTYs, pipes), even when such a file is
+marked as executable or is on an executable mount point.
+
+Code samples can be found in
+tools/testing/selftests/interpreter/interpreted_access_test.c and interpreter
+patches (for the original O_MAYEXEC) are available at
+https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC .
+See also an overview article: https://lwn.net/Articles/820000/ .
+
+
overflowgid & overflowuid
-------------------------
@@ -32,6 +32,7 @@
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
+#include <linux/sysctl.h>
#include "internal.h"
@@ -394,6 +395,11 @@ static const struct cred *access_override_creds(void)
return old_cred;
}
+#define INTERPRETED_EXEC_MOUNT BIT(0) /* policy bit: deny if path_noexec() */
+#define INTERPRETED_EXEC_FILE BIT(1) /* policy bit: keep the MAY_EXEC check */
+
+int sysctl_interpreted_access __read_mostly; /* INTERPRETED_EXEC_* bitmask */
+
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
struct path path;
@@ -443,13 +449,43 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla
*/
if ((mode & MAY_EXEC)) {
mode |= MAY_INTERPRETED_EXEC;
+ res = -EACCES;
+ /*
+ * If there is a system-wide execute policy enforced,
+ * then forbids access to non-regular files and special
+ * superblocks.
+ */
+ if ((sysctl_interpreted_access & (INTERPRETED_EXEC_MOUNT |
+ INTERPRETED_EXEC_FILE))) {
+ if (!S_ISREG(inode->i_mode))
+ goto out_path_release;
+ /*
+ * Denies access to pseudo filesystems that
+ * will never be mountable (e.g. sockfs,
+ * pipefs) but can still be reachable through
+ * /proc/self/fd, or memfd-like file
+ * descriptors, or nsfs-like files.
+ *
+ * According to the tests, SB_NOEXEC seems to
+ * be only used by proc and nsfs filesystems.
+ * Is it correct?
+ */
+ if ((path.dentry->d_sb->s_flags &
+ (SB_NOUSER | SB_KERNMOUNT | SB_NOEXEC)))
+ goto out_path_release;
+ }
+
+ if ((sysctl_interpreted_access & INTERPRETED_EXEC_MOUNT) &&
+ path_noexec(&path))
+ goto out_path_release;
/*
* For compatibility reasons, if the system-wide policy
* doesn't enforce file permission checks, then
* replaces the execute permission request with a read
* permission request.
*/
- mode &= ~MAY_EXEC;
+ if (!(sysctl_interpreted_access & INTERPRETED_EXEC_FILE))
+ mode &= ~MAY_EXEC;
/* To be executed *by* user space, files must be readable. */
mode |= MAY_READ;
}
@@ -83,6 +83,7 @@ extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
extern int sysctl_protected_fifos;
extern int sysctl_protected_regular;
+extern int sysctl_interpreted_access;
typedef __kernel_rwf_t rwf_t;
@@ -113,6 +113,7 @@ static int sixty = 60;
static int __maybe_unused neg_one = -1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
static unsigned long zero_ul;
static unsigned long one_ul = 1;
@@ -887,7 +888,6 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
-#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -896,7 +896,6 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
-#endif
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
@@ -3293,6 +3292,15 @@ static struct ctl_table fs_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
+ {
+ .procname = "interpreted_access",
+ .data = &sysctl_interpreted_access,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &three,
+ },
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
.procname = "binfmt_misc",