diff mbox series

[v5,seccomp,5/5] seccomp/cache: Report cache data through /proc/pid/seccomp_cache

Message ID 4706b0ff81f28b498c9012fd3517fe88319e7c42.1602431034.git.yifeifz2@illinois.edu (mailing list archive)
State Not Applicable
Headers show
Series seccomp: Add bitmap cache of constant allow filter results | expand

Commit Message

YiFei Zhu Oct. 11, 2020, 3:47 p.m. UTC
From: YiFei Zhu <yifeifz2@illinois.edu>

Currently the kernel does not provide an infrastructure to translate
architecture numbers to a human-readable name. Translating syscall
numbers to syscall names is possible through FTRACE_SYSCALL
infrastructure but it does not provide support for compat syscalls.

This will create a file for each PID as /proc/pid/seccomp_cache.
The file will be empty when no seccomp filters are loaded, or be
in the format of:
<arch name> <decimal syscall number> <ALLOW | FILTER>
where ALLOW means the cache is guaranteed to allow the syscall,
and filter means the cache will pass the syscall to the BPF filter.

For the docker default profile on x86_64 it looks like:
x86_64 0 ALLOW
x86_64 1 ALLOW
x86_64 2 ALLOW
x86_64 3 ALLOW
[...]
x86_64 132 ALLOW
x86_64 133 ALLOW
x86_64 134 FILTER
x86_64 135 FILTER
x86_64 136 FILTER
x86_64 137 ALLOW
x86_64 138 ALLOW
x86_64 139 FILTER
x86_64 140 ALLOW
x86_64 141 ALLOW
[...]

This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
of N because I think certain users of seccomp might not want the
application to know which syscalls are definitely usable. For
the same reason, it is also guarded by CAP_SYS_ADMIN.

Suggested-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
---
 arch/Kconfig                   | 24 ++++++++++++++
 arch/x86/Kconfig               |  1 +
 arch/x86/include/asm/seccomp.h |  3 ++
 fs/proc/base.c                 |  6 ++++
 include/linux/seccomp.h        |  7 ++++
 kernel/seccomp.c               | 59 ++++++++++++++++++++++++++++++++++
 6 files changed, 100 insertions(+)

Comments

Jann Horn Oct. 12, 2020, 6:49 a.m. UTC | #1
On Sun, Oct 11, 2020 at 5:48 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> Currently the kernel does not provide an infrastructure to translate
> architecture numbers to a human-readable name. Translating syscall
> numbers to syscall names is possible through FTRACE_SYSCALL
> infrastructure but it does not provide support for compat syscalls.
>
> This will create a file for each PID as /proc/pid/seccomp_cache.
> The file will be empty when no seccomp filters are loaded, or be
> in the format of:
> <arch name> <decimal syscall number> <ALLOW | FILTER>
> where ALLOW means the cache is guaranteed to allow the syscall,
> and filter means the cache will pass the syscall to the BPF filter.
>
> For the docker default profile on x86_64 it looks like:
> x86_64 0 ALLOW
> x86_64 1 ALLOW
> x86_64 2 ALLOW
> x86_64 3 ALLOW
> [...]
> x86_64 132 ALLOW
> x86_64 133 ALLOW
> x86_64 134 FILTER
> x86_64 135 FILTER
> x86_64 136 FILTER
> x86_64 137 ALLOW
> x86_64 138 ALLOW
> x86_64 139 FILTER
> x86_64 140 ALLOW
> x86_64 141 ALLOW
> [...]
>
> This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
> of N because I think certain users of seccomp might not want the
> application to know which syscalls are definitely usable. For
> the same reason, it is also guarded by CAP_SYS_ADMIN.
>
> Suggested-by: Jann Horn <jannh@google.com>
> Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>

Reviewed-by: Jann Horn <jannh@google.com>
Geert Uytterhoeven Dec. 17, 2020, 12:14 p.m. UTC | #2
Hi Yifei,

On Sun, Oct 11, 2020 at 8:08 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> From: YiFei Zhu <yifeifz2@illinois.edu>
>
> Currently the kernel does not provide an infrastructure to translate
> architecture numbers to a human-readable name. Translating syscall
> numbers to syscall names is possible through FTRACE_SYSCALL
> infrastructure but it does not provide support for compat syscalls.
>
> This will create a file for each PID as /proc/pid/seccomp_cache.
> The file will be empty when no seccomp filters are loaded, or be
> in the format of:
> <arch name> <decimal syscall number> <ALLOW | FILTER>
> where ALLOW means the cache is guaranteed to allow the syscall,
> and filter means the cache will pass the syscall to the BPF filter.
>
> For the docker default profile on x86_64 it looks like:
> x86_64 0 ALLOW
> x86_64 1 ALLOW
> x86_64 2 ALLOW
> x86_64 3 ALLOW
> [...]
> x86_64 132 ALLOW
> x86_64 133 ALLOW
> x86_64 134 FILTER
> x86_64 135 FILTER
> x86_64 136 FILTER
> x86_64 137 ALLOW
> x86_64 138 ALLOW
> x86_64 139 FILTER
> x86_64 140 ALLOW
> x86_64 141 ALLOW
> [...]
>
> This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
> of N because I think certain users of seccomp might not want the
> application to know which syscalls are definitely usable. For
> the same reason, it is also guarded by CAP_SYS_ADMIN.
>
> Suggested-by: Jann Horn <jannh@google.com>
> Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>

> @@ -2311,3 +2314,59 @@ static int __init seccomp_sysctl_init(void)
>  device_initcall(seccomp_sysctl_init)
>
>  #endif /* CONFIG_SYSCTL */
> +
> +#ifdef CONFIG_SECCOMP_CACHE_DEBUG
> +/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */

Should there be a dependency on SECCOMP_ARCH_NATIVE?
Should all architectures that implement seccomp have this?

E.g. mips does select HAVE_ARCH_SECCOMP_FILTER, but doesn't
have SECCOMP_ARCH_NATIVE?

(noticed with preliminary out-of-tree seccomp implementation for m68k,
 which doesn't have SECCOMP_ARCH_NATIVE

> +static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
> +                                       const void *bitmap, size_t bitmap_size)
> +{
> +       int nr;
> +
> +       for (nr = 0; nr < bitmap_size; nr++) {
> +               bool cached = test_bit(nr, bitmap);
> +               char *status = cached ? "ALLOW" : "FILTER";
> +
> +               seq_printf(m, "%s %d %s\n", name, nr, status);
> +       }
> +}
> +
> +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
> +                          struct pid *pid, struct task_struct *task)
> +{
> +       struct seccomp_filter *f;
> +       unsigned long flags;
> +
> +       /*
> +        * We don't want some sandboxed process to know what their seccomp
> +        * filters consist of.
> +        */
> +       if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
> +               return -EACCES;
> +
> +       if (!lock_task_sighand(task, &flags))
> +               return -ESRCH;
> +
> +       f = READ_ONCE(task->seccomp.filter);
> +       if (!f) {
> +               unlock_task_sighand(task, &flags);
> +               return 0;
> +       }
> +
> +       /* prevent filter from being freed while we are printing it */
> +       __get_seccomp_filter(f);
> +       unlock_task_sighand(task, &flags);
> +
> +       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
> +                                   f->cache.allow_native,

error: ‘struct action_cache’ has no member named ‘allow_native’

struct action_cache is empty if SECCOMP_ARCH_NATIVE is not
defined (so there are checks for it).

> +                                   SECCOMP_ARCH_NATIVE_NR);
> +
> +#ifdef SECCOMP_ARCH_COMPAT
> +       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
> +                                   f->cache.allow_compat,
> +                                   SECCOMP_ARCH_COMPAT_NR);
> +#endif /* SECCOMP_ARCH_COMPAT */
> +
> +       __put_seccomp_filter(f);
> +       return 0;
> +}
> +#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
> --
> 2.28.0
>
YiFei Zhu Dec. 17, 2020, 6:34 p.m. UTC | #3
On Thu, Dec 17, 2020 at 6:14 AM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> Should there be a dependency on SECCOMP_ARCH_NATIVE?
> Should all architectures that implement seccomp have this?
>
> E.g. mips does select HAVE_ARCH_SECCOMP_FILTER, but doesn't
> have SECCOMP_ARCH_NATIVE?
>
> (noticed with preliminary out-of-tree seccomp implementation for m68k,
>  which doesn't have SECCOMP_ARCH_NATIVE

Hi Geert

You are correct. This specific patch in this series was not applied,
and this was addressed in a follow up patch series [1]. MIPS does not
define SECCOMP_ARCH_NATIVE because the bitmap expects syscall numbers
to start from 0, whereas MIPS does not (defines
CONFIG_HAVE_SPARSE_SYSCALL_NR). The follow up patch makes it so that
any arch with HAVE_SPARSE_SYSCALL_NR (currently just MIPS) cannot have
CONFIG_SECCOMP_CACHE_DEBUG on, by the depend on clause.

I see that you are doing an out of tree seccomp implementation for
m68k. Assuming unchanged arch/xtensa/include/asm/syscall.h, something
like this to arch/m68k/include/asm/seccomp.h should make it work:

#define SECCOMP_ARCH_NATIVE        AUDIT_ARCH_M68K
#define SECCOMP_ARCH_NATIVE_NR        NR_syscalls
#define SECCOMP_ARCH_NATIVE_NAME    "m68k"

If the file does not exist already, arch/xtensa/include/asm/seccomp.h
is a good example of how the file should look like, and remember to
remove `generic-y += seccomp.h` from arch/m68k/include/asm/Kbuild.

[1] https://lore.kernel.org/lkml/cover.1605101222.git.yifeifz2@illinois.edu/T/

YiFei Zhu
Geert Uytterhoeven Dec. 18, 2020, 12:35 p.m. UTC | #4
Hi YiFei,

On Thu, Dec 17, 2020 at 7:34 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> On Thu, Dec 17, 2020 at 6:14 AM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > Should there be a dependency on SECCOMP_ARCH_NATIVE?
> > Should all architectures that implement seccomp have this?
> >
> > E.g. mips does select HAVE_ARCH_SECCOMP_FILTER, but doesn't
> > have SECCOMP_ARCH_NATIVE?
> >
> > (noticed with preliminary out-of-tree seccomp implementation for m68k,
> >  which doesn't have SECCOMP_ARCH_NATIVE
>
> You are correct. This specific patch in this series was not applied,
> and this was addressed in a follow up patch series [1]. MIPS does not
> define SECCOMP_ARCH_NATIVE because the bitmap expects syscall numbers
> to start from 0, whereas MIPS does not (defines
> CONFIG_HAVE_SPARSE_SYSCALL_NR). The follow up patch makes it so that
> any arch with HAVE_SPARSE_SYSCALL_NR (currently just MIPS) cannot have
> CONFIG_SECCOMP_CACHE_DEBUG on, by the depend on clause.
>
> I see that you are doing an out of tree seccomp implementation for
> m68k. Assuming unchanged arch/xtensa/include/asm/syscall.h, something
> like this to arch/m68k/include/asm/seccomp.h should make it work:
>
> #define SECCOMP_ARCH_NATIVE        AUDIT_ARCH_M68K
> #define SECCOMP_ARCH_NATIVE_NR        NR_syscalls
> #define SECCOMP_ARCH_NATIVE_NAME    "m68k"
>
> If the file does not exist already, arch/xtensa/include/asm/seccomp.h
> is a good example of how the file should look like, and remember to
> remove `generic-y += seccomp.h` from arch/m68k/include/asm/Kbuild.
>
> [1] https://lore.kernel.org/lkml/cover.1605101222.git.yifeifz2@illinois.edu/T/

Thank you for your extensive explanation.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
diff mbox series

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 21a3675a7a3a..6157c3ce0662 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -471,6 +471,15 @@  config HAVE_ARCH_SECCOMP_FILTER
 	    results in the system call being skipped immediately.
 	  - seccomp syscall wired up
 
+config HAVE_ARCH_SECCOMP_CACHE
+	bool
+	help
+	  An arch should select this symbol if it provides all of these things:
+	  - all the requirements for HAVE_ARCH_SECCOMP_FILTER
+	  - SECCOMP_ARCH_NATIVE
+	  - SECCOMP_ARCH_NATIVE_NR
+	  - SECCOMP_ARCH_NATIVE_NAME
+
 config SECCOMP
 	prompt "Enable seccomp to safely execute untrusted bytecode"
 	def_bool y
@@ -498,6 +507,21 @@  config SECCOMP_FILTER
 
 	  See Documentation/userspace-api/seccomp_filter.rst for details.
 
+config SECCOMP_CACHE_DEBUG
+	bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
+	depends on SECCOMP
+	depends on SECCOMP_FILTER && HAVE_ARCH_SECCOMP_CACHE
+	depends on PROC_FS
+	help
+	  This enables the /proc/pid/seccomp_cache interface to monitor
+	  seccomp cache data. The file format is subject to change. Reading
+	  the file requires CAP_SYS_ADMIN.
+
+	  This option is for debugging only. Enabling presents the risk that
+	  an adversary may be able to infer the seccomp filter logic.
+
+	  If unsure, say N.
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1ab22869a765..1a807f89ac77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,6 +150,7 @@  config X86
 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SECCOMP_CACHE
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index b17d037c72ce..fef16e398161 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -19,9 +19,11 @@ 
 #ifdef CONFIG_X86_64
 # define SECCOMP_ARCH_NATIVE		AUDIT_ARCH_X86_64
 # define SECCOMP_ARCH_NATIVE_NR		NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME	"x86_64"
 # ifdef CONFIG_COMPAT
 #  define SECCOMP_ARCH_COMPAT		AUDIT_ARCH_I386
 #  define SECCOMP_ARCH_COMPAT_NR	IA32_NR_syscalls
+#  define SECCOMP_ARCH_COMPAT_NAME	"ia32"
 # endif
 /*
  * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
@@ -31,6 +33,7 @@ 
 #else /* !CONFIG_X86_64 */
 # define SECCOMP_ARCH_NATIVE		AUDIT_ARCH_I386
 # define SECCOMP_ARCH_NATIVE_NR	        NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME	"ia32"
 #endif
 
 #include <asm-generic/seccomp.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 617db4e0faa0..a4990410ff05 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3258,6 +3258,9 @@  static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3587,6 +3590,9 @@  static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 02aef2844c38..76963ec4641a 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -121,4 +121,11 @@  static inline long seccomp_get_metadata(struct task_struct *task,
 	return -EINVAL;
 }
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+struct seq_file;
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task);
+#endif
 #endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 236e7b367d4e..1df2fac281da 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -553,6 +553,9 @@  void seccomp_filter_release(struct task_struct *tsk)
 {
 	struct seccomp_filter *orig = tsk->seccomp.filter;
 
+	/* We are effectively holding the siglock by not having any sighand. */
+	WARN_ON(tsk->sighand != NULL);
+
 	/* Detach task from its filter tree. */
 	tsk->seccomp.filter = NULL;
 	__seccomp_filter_release(orig);
@@ -2311,3 +2314,59 @@  static int __init seccomp_sysctl_init(void)
 device_initcall(seccomp_sysctl_init)
 
 #endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+					const void *bitmap, size_t bitmap_size)
+{
+	int nr;
+
+	for (nr = 0; nr < bitmap_size; nr++) {
+		bool cached = test_bit(nr, bitmap);
+		char *status = cached ? "ALLOW" : "FILTER";
+
+		seq_printf(m, "%s %d %s\n", name, nr, status);
+	}
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_filter *f;
+	unsigned long flags;
+
+	/*
+	 * We don't want some sandboxed process to know what their seccomp
+	 * filters consist of.
+	 */
+	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!lock_task_sighand(task, &flags))
+		return -ESRCH;
+
+	f = READ_ONCE(task->seccomp.filter);
+	if (!f) {
+		unlock_task_sighand(task, &flags);
+		return 0;
+	}
+
+	/* prevent filter from being freed while we are printing it */
+	__get_seccomp_filter(f);
+	unlock_task_sighand(task, &flags);
+
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+				    f->cache.allow_native,
+				    SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+				    f->cache.allow_compat,
+				    SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+	__put_seccomp_filter(f);
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */