diff mbox series

[v4,seccomp,5/5] seccomp/cache: Report cache data through /proc/pid/seccomp_cache

Message ID c2077b8a86c6d82d611007d81ce81d32f718ec59.1602263422.git.yifeifz2@illinois.edu (mailing list archive)
State Superseded
Headers show
Series seccomp: Add bitmap cache of constant allow filter results | expand

Commit Message

YiFei Zhu Oct. 9, 2020, 5:14 p.m. UTC
From: YiFei Zhu <yifeifz2@illinois.edu>

Currently the kernel does not provide an infrastructure to translate
architecture numbers to a human-readable name. Translating syscall
numbers to syscall names is possible through FTRACE_SYSCALL
infrastructure but it does not provide support for compat syscalls.

This will create a file for each PID as /proc/pid/seccomp_cache.
The file will be empty when no seccomp filters are loaded, or be
in the format of:
<arch name> <decimal syscall number> <ALLOW | FILTER>
where ALLOW means the cache is guaranteed to allow the syscall,
and FILTER means the cache will pass the syscall to the BPF filter.

For the docker default profile on x86_64 it looks like:
x86_64 0 ALLOW
x86_64 1 ALLOW
x86_64 2 ALLOW
x86_64 3 ALLOW
[...]
x86_64 132 ALLOW
x86_64 133 ALLOW
x86_64 134 FILTER
x86_64 135 FILTER
x86_64 136 FILTER
x86_64 137 ALLOW
x86_64 138 ALLOW
x86_64 139 FILTER
x86_64 140 ALLOW
x86_64 141 ALLOW
[...]

This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
of N because I think certain users of seccomp might not want the
application to know which syscalls are definitely usable. For
the same reason, it is also guarded by CAP_SYS_ADMIN.

Suggested-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
---
 arch/Kconfig                   | 24 ++++++++++++++
 arch/x86/Kconfig               |  1 +
 arch/x86/include/asm/seccomp.h |  3 ++
 fs/proc/base.c                 |  6 ++++
 include/linux/seccomp.h        |  5 +++
 kernel/seccomp.c               | 59 ++++++++++++++++++++++++++++++++++
 6 files changed, 98 insertions(+)

Comments

Jann Horn Oct. 9, 2020, 9:45 p.m. UTC | #1
On Fri, Oct 9, 2020 at 7:15 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> Currently the kernel does not provide an infrastructure to translate
> architecture numbers to a human-readable name. Translating syscall
> numbers to syscall names is possible through FTRACE_SYSCALL
> infrastructure but it does not provide support for compat syscalls.
>
> This will create a file for each PID as /proc/pid/seccomp_cache.
> The file will be empty when no seccomp filters are loaded, or be
> in the format of:
> <arch name> <decimal syscall number> <ALLOW | FILTER>
> where ALLOW means the cache is guaranteed to allow the syscall,
> and filter means the cache will pass the syscall to the BPF filter.
>
> For the docker default profile on x86_64 it looks like:
> x86_64 0 ALLOW
> x86_64 1 ALLOW
> x86_64 2 ALLOW
> x86_64 3 ALLOW
> [...]
> x86_64 132 ALLOW
> x86_64 133 ALLOW
> x86_64 134 FILTER
> x86_64 135 FILTER
> x86_64 136 FILTER
> x86_64 137 ALLOW
> x86_64 138 ALLOW
> x86_64 139 FILTER
> x86_64 140 ALLOW
> x86_64 141 ALLOW
> [...]
>
> This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
> of N because I think certain users of seccomp might not want the
> application to know which syscalls are definitely usable. For
> the same reason, it is also guarded by CAP_SYS_ADMIN.
>
> Suggested-by: Jann Horn <jannh@google.com>
> Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
[...]
> diff --git a/arch/Kconfig b/arch/Kconfig
[...]
> +config SECCOMP_CACHE_DEBUG
> +       bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
> +       depends on SECCOMP
> +       depends on SECCOMP_FILTER
> +       depends on PROC_FS
> +       help
> +         This is enables /proc/pid/seccomp_cache interface to monitor

nit: s/This is enables/This enables the/

> +         seccomp cache data. The file format is subject to change. Reading
> +         the file requires CAP_SYS_ADMIN.
> +
> +         This option is for debugging only. Enabling present the risk that

nit: *presents

> +         an adversary may be able to infer the seccomp filter logic.

[...]
> +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
> +                          struct pid *pid, struct task_struct *task)
> +{
> +       struct seccomp_filter *f;
> +       unsigned long flags;
> +
> +       /*
> +        * We don't want some sandboxed process know what their seccomp

s/know/to know/

> +        * filters consist of.
> +        */
> +       if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
> +               return -EACCES;
> +
> +       if (!lock_task_sighand(task, &flags))
> +               return 0;

maybe return -ESRCH here so that userspace can distinguish between an
exiting process and a process with no filters?

> +       f = READ_ONCE(task->seccomp.filter);
> +       if (!f) {
> +               unlock_task_sighand(task, &flags);
> +               return 0;
> +       }
[...]
Kees Cook Oct. 9, 2020, 11:14 p.m. UTC | #2
On Fri, Oct 09, 2020 at 12:14:33PM -0500, YiFei Zhu wrote:
> From: YiFei Zhu <yifeifz2@illinois.edu>
> 
> Currently the kernel does not provide an infrastructure to translate
> architecture numbers to a human-readable name. Translating syscall
> numbers to syscall names is possible through FTRACE_SYSCALL
> infrastructure but it does not provide support for compat syscalls.
> 
> This will create a file for each PID as /proc/pid/seccomp_cache.
> The file will be empty when no seccomp filters are loaded, or be
> in the format of:
> <arch name> <decimal syscall number> <ALLOW | FILTER>
> where ALLOW means the cache is guaranteed to allow the syscall,
> and filter means the cache will pass the syscall to the BPF filter.
> 
> For the docker default profile on x86_64 it looks like:
> x86_64 0 ALLOW
> x86_64 1 ALLOW
> x86_64 2 ALLOW
> x86_64 3 ALLOW
> [...]
> x86_64 132 ALLOW
> x86_64 133 ALLOW
> x86_64 134 FILTER
> x86_64 135 FILTER
> x86_64 136 FILTER
> x86_64 137 ALLOW
> x86_64 138 ALLOW
> x86_64 139 FILTER
> x86_64 140 ALLOW
> x86_64 141 ALLOW
> [...]
> 
> This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
> of N because I think certain users of seccomp might not want the
> application to know which syscalls are definitely usable. For
> the same reason, it is also guarded by CAP_SYS_ADMIN.
> 
> Suggested-by: Jann Horn <jannh@google.com>
> Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
> Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
> ---
>  arch/Kconfig                   | 24 ++++++++++++++
>  arch/x86/Kconfig               |  1 +
>  arch/x86/include/asm/seccomp.h |  3 ++
>  fs/proc/base.c                 |  6 ++++
>  include/linux/seccomp.h        |  5 +++
>  kernel/seccomp.c               | 59 ++++++++++++++++++++++++++++++++++
>  6 files changed, 98 insertions(+)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 21a3675a7a3a..85239a974f04 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -471,6 +471,15 @@ config HAVE_ARCH_SECCOMP_FILTER
>  	    results in the system call being skipped immediately.
>  	  - seccomp syscall wired up
>  
> +config HAVE_ARCH_SECCOMP_CACHE
> +	bool
> +	help
> +	  An arch should select this symbol if it provides all of these things:
> +	  - all the requirements for HAVE_ARCH_SECCOMP_FILTER
> +	  - SECCOMP_ARCH_NATIVE
> +	  - SECCOMP_ARCH_NATIVE_NR
> +	  - SECCOMP_ARCH_NATIVE_NAME
> +
> [...]
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 1ab22869a765..1a807f89ac77 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -150,6 +150,7 @@ config X86
>  	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
>  	select HAVE_ARCH_PREL32_RELOCATIONS
>  	select HAVE_ARCH_SECCOMP_FILTER
> +	select HAVE_ARCH_SECCOMP_CACHE
>  	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>  	select HAVE_ARCH_STACKLEAK
>  	select HAVE_ARCH_TRACEHOOK

HAVE_ARCH_SECCOMP_CACHE isn't used any more. I think this was left over
from before.
YiFei Zhu Oct. 10, 2020, 1:26 p.m. UTC | #3
On Fri, Oct 9, 2020 at 6:14 PM Kees Cook <keescook@chromium.org> wrote:
> HAVE_ARCH_SECCOMP_CACHE isn't used any more. I think this was left over
> from before.

Oh, I meant to add this to the dependencies of
SECCOMP_CACHE_DEBUG. Is this something that would make sense?

YiFei Zhu
Kees Cook Oct. 12, 2020, 10:57 p.m. UTC | #4
On Sat, Oct 10, 2020 at 08:26:16AM -0500, YiFei Zhu wrote:
> On Fri, Oct 9, 2020 at 6:14 PM Kees Cook <keescook@chromium.org> wrote:
> > HAVE_ARCH_SECCOMP_CACHE isn't used any more. I think this was left over
> > from before.
> 
> Oh, I was meant to add this to the dependencies of
> SECCOMP_CACHE_DEBUG. Is this something that would make sense?

I think it's fine to just have this "dangle" with a help text update of
"if seccomp action caching is supported by the architecture, provide the
/proc/$pid ..."
YiFei Zhu Oct. 13, 2020, 12:31 a.m. UTC | #5
On Mon, Oct 12, 2020 at 5:57 PM Kees Cook <keescook@chromium.org> wrote:
> I think it's fine to just have this "dangle" with a help text update of
> "if seccomp action caching is supported by the architecture, provide the
> /proc/$pid ..."

I think it would be weird if someone sees this help text and wonders...
"hmm does my architecture support seccomp action caching" and without
a clear pointer to how seccomp action cache works, goes and compiles
the kernel with this config option on for the purpose of knowing if
their arch supports it... Or, is it a common practice in the kernel to
leave dangling configs?

YiFei Zhu
YiFei Zhu Oct. 22, 2020, 8:52 p.m. UTC | #6
On Mon, Oct 12, 2020 at 7:31 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
>
> On Mon, Oct 12, 2020 at 5:57 PM Kees Cook <keescook@chromium.org> wrote:
> > I think it's fine to just have this "dangle" with a help text update of
> > "if seccomp action caching is supported by the architecture, provide the
> > /proc/$pid ..."
>
> I think it would be weird if someone sees this help text and wonder...
> "hmm does my architecture support seccomp action caching" and without
> a clear pointer to how seccomp action cache works, goes and compiles
> the kernel with this config option on for the purpose of knowing if
> their arch supports it... Or, is it a common practice in the kernel to
> leave dangling configs?

Bump, in case this question was missed. I don't really want to miss
the 5.10 merge window...

YiFei Zhu
Kees Cook Oct. 22, 2020, 10:32 p.m. UTC | #7
On Thu, Oct 22, 2020 at 03:52:20PM -0500, YiFei Zhu wrote:
> On Mon, Oct 12, 2020 at 7:31 PM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> >
> > On Mon, Oct 12, 2020 at 5:57 PM Kees Cook <keescook@chromium.org> wrote:
> > > I think it's fine to just have this "dangle" with a help text update of
> > > "if seccomp action caching is supported by the architecture, provide the
> > > /proc/$pid ..."
> >
> > I think it would be weird if someone sees this help text and wonder...
> > "hmm does my architecture support seccomp action caching" and without
> > a clear pointer to how seccomp action cache works, goes and compiles
> > the kernel with this config option on for the purpose of knowing if
> > their arch supports it... Or, is it a common practice in the kernel to
> > leave dangling configs?
> 
> Bump, in case this question was missed.

I've been going back and forth on this, and I think what I've settled
on is I'd like to avoid new CONFIG dependencies just for this feature.
Instead, how about we just fill in SECCOMP_NATIVE and SECCOMP_COMPAT
for all the HAVE_ARCH_SECCOMP_FILTER architectures, and then the
cache reporting can be cleanly tied to CONFIG_SECCOMP_FILTER? It
should be relatively simple to extract those details and make
SECCOMP_ARCH_{NATIVE,COMPAT}_NAME part of the per-arch enabling patches?

> I don't really want to miss the 5.10 merge window...

Sorry, the 5.10 merge window is already closed for stuff that hasn't
already been in -next. Most subsystem maintainers (myself included)
don't take new features into their trees between roughly N-rc6 and
(N+1)-rc1. My plan is to put this in my -next tree after -rc1 is released
(expected to be Oct 25th).

I'd still like to get more specific workload performance numbers too.
The microbenchmark is nice, but getting things like build times under
docker's default seccomp filter, etc would be lovely. I've almost gotten
there, but my benchmarks are still really noisy and CPU isolation
continues to frustrate me. :)
YiFei Zhu Oct. 22, 2020, 11:40 p.m. UTC | #8
On Thu, Oct 22, 2020 at 5:32 PM Kees Cook <keescook@chromium.org> wrote:
> I've been going back and forth on this, and I think what I've settled
> on is I'd like to avoid new CONFIG dependencies just for this feature.
> Instead, how about we just fill in SECCOMP_NATIVE and SECCOMP_COMPAT
> for all the HAVE_ARCH_SECCOMP_FILTER architectures, and then the
> cache reporting can be cleanly tied to CONFIG_SECCOMP_FILTER? It
> should be relatively simple to extract those details and make
> SECCOMP_ARCH_{NATIVE,COMPAT}_NAME part of the per-arch enabling patches?

Hmm. So I could enable the cache logic for every architecture (one
patch per arch) that does not have the sparse syscall numbers, and
then have the proc reporting after the arch patches? I could do that.
I don't have test machines to run anything other than x86_64 or ia32,
so they will need a closer look by people more familiar with those
arches.

> I'd still like to get more specific workload performance numbers too.
> The microbenchmark is nice, but getting things like build times under
> docker's default seccomp filter, etc would be lovely. I've almost gotten
> there, but my benchmarks are still really noisy and CPU isolation
> continues to frustrate me. :)

Ok, let me know if I can help.

YiFei Zhu
Kees Cook Oct. 24, 2020, 2:51 a.m. UTC | #9
On Thu, Oct 22, 2020 at 06:40:08PM -0500, YiFei Zhu wrote:
> On Thu, Oct 22, 2020 at 5:32 PM Kees Cook <keescook@chromium.org> wrote:
> > I've been going back and forth on this, and I think what I've settled
> > on is I'd like to avoid new CONFIG dependencies just for this feature.
> > Instead, how about we just fill in SECCOMP_NATIVE and SECCOMP_COMPAT
> > for all the HAVE_ARCH_SECCOMP_FILTER architectures, and then the
> > cache reporting can be cleanly tied to CONFIG_SECCOMP_FILTER? It
> > should be relatively simple to extract those details and make
> > SECCOMP_ARCH_{NATIVE,COMPAT}_NAME part of the per-arch enabling patches?
> 
> Hmm. So I could enable the cache logic to every architecture (one
> patch per arch) that does not have the sparse syscall numbers, and
> then have the proc reporting after the arch patches? I could do that.
> I don't have test machines to run anything other than x86_64 or ia32,
> so they will need a closer look by people more familiar with those
> arches.

Cool, yes please. It looks like MIPS will need to be skipped for now. I
would have the debug cache reporting patch then depend on
!CONFIG_HAVE_SPARSE_SYSCALL_NR.

> > I'd still like to get more specific workload performance numbers too.
> > The microbenchmark is nice, but getting things like build times under
> > docker's default seccomp filter, etc would be lovely. I've almost gotten
> > there, but my benchmarks are still really noisy and CPU isolation
> > continues to frustrate me. :)
> 
> Ok, let me know if I can help.

Do you have a test environment where you can compare the before/after
of repeated kernel build times (or some other sufficiently
complex/interesting) workload under these conditions:

bare metal
docker w/ seccomp policy disabled
docker w/ default seccomp policy

This is what I've been trying to construct, but it's really noisy, so
I've been trying to pin CPUs and NUMA memory nodes, but it's not really
helping yet. :P
YiFei Zhu Oct. 30, 2020, 12:18 p.m. UTC | #10
On Fri, Oct 23, 2020 at 9:51 PM Kees Cook <keescook@chromium.org> wrote:
> Do you have a test environment where you can compare the before/after
> of repeated kernel build times (or some other sufficiently
> complex/interesting) workload under these conditions:
>
> bare metal
> docker w/ seccomp policy disabled
> docker w/ default seccomp policy
>
> This is what I've been trying to construct, but it's really noisy, so
> I've been trying to pin CPUs and NUMA memory nodes, but it's not really
> helping yet. :P

Hi, sorry for the delay. The benchmarks took a while to get.

I got a bare metal test machine with Intel(R) Xeon(R) CPU E5-2660 v3 @
2.60GHz, running Ubuntu 18.04. Test kernels are compiled at
57a339117e52 ("selftests/seccomp: Compare bitmap vs filter overhead")
and 3650b228f83a ("Linux 5.10-rc1"), built with Ubuntu's
5.3.0-64-generic's config, then `make olddefconfig`. "Mitigations off"
indicates the kernel was booted with "nospectre_v2 nospectre_v1
no_stf_barrier tsx=off tsx_async_abort=off".

The benchmark was single-job make on x86_64 defconfig of 5.9.1, with
CPU affinity to set only processor #0. Raw results are appended below.
Each boot is tested by running the build directly and inside docker,
with and without seccomp. The commands used are attached below. Each
test is 4 trials, with the middle two (non-minimum, non-maximum) wall
clock time averaged. Results summary:

                Mitigations On                  Mitigations Off
                With Cache      Without Cache   With Cache      Without Cache
Native          18:17.38        18:13.78        18:16.08        18:15.67
D. no seccomp   18:15.54        18:17.71        18:17.58        18:16.75
D. + seccomp    20:42.47        20:45.04        18:47.67        18:49.01

To be honest, I'm somewhat surprised that it didn't produce as much of
a dent in the seccomp overhead in this macro benchmark as I had
expected.

Below are commands used and outputs from time command.

Commands used to start the docker containers:
docker run -w /srv/yifeifz2/linux-buildtest \
  --tmpfs /srv/yifeifz2/linux-buildtest:exec --rm -it ubuntu:18.04

docker run -w /srv/yifeifz2/linux-buildtest \
  --tmpfs /srv/yifeifz2/linux-buildtest:exec --rm -it \
  --security-opt seccomp=unconfined ubuntu:18.04

Commands used to install the toolchain inside docker:
apt -y update
apt -y dist-upgrade
apt -y install build-essential wget flex bison time libssl-dev bc libelf-dev

Commands to benchmark on native:
for i in {1..4}; do
    mkdir -p /srv/yifeifz2/linux-buildtest
    mount -t tmpfs tmpfs /srv/yifeifz2/linux-buildtest
    pushd /srv/yifeifz2/linux-buildtest
    wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.9.1.tar.xz
    tar xf linux-5.9.1.tar.xz
    cd linux-5.9.1
    make mrproper
    make defconfig
    taskset 0x1 time make -j1 > /dev/null
    popd
    umount /srv/yifeifz2/linux-buildtest
done

Commands to benchmark inside docker:
for i in {1..4}; do
    wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.9.1.tar.xz
    tar xf linux-5.9.1.tar.xz
    pushd linux-5.9.1
    make mrproper
    make defconfig
    taskset 0x1 time make -j1 > /dev/null
    popd
    rm -rf linux-5.9.1 linux-5.9.1.tar.xz
done

==== with cache, mitigations on ====

973.52user 113.98system 18:16.51elapsed 99%CPU (0avgtext+0avgdata
239784maxresident)k
0inputs+217152outputs (0major+51937662minor)pagefaults 0swaps

973.74user 115.35system 18:18.41elapsed 99%CPU (0avgtext+0avgdata
239640maxresident)k
0inputs+217152outputs (0major+51933865minor)pagefaults 0swaps

973.31user 114.41system 18:17.37elapsed 99%CPU (0avgtext+0avgdata
239660maxresident)k
72inputs+217152outputs (0major+51936343minor)pagefaults 0swaps

971.76user 116.04system 18:17.39elapsed 99%CPU (0avgtext+0avgdata
239588maxresident)k
0inputs+217152outputs (0major+51936222minor)pagefaults 0swaps


961.44user 121.30system 18:15.30elapsed 98%CPU (0avgtext+0avgdata
239580maxresident)k
0inputs+217152outputs (0major+51555371minor)pagefaults 0swaps

961.86user 119.48system 18:13.96elapsed 98%CPU (0avgtext+0avgdata
239480maxresident)k
0inputs+217152outputs (0major+51552153minor)pagefaults 0swaps

961.68user 121.75system 18:15.78elapsed 98%CPU (0avgtext+0avgdata
239504maxresident)k
0inputs+217152outputs (0major+51559201minor)pagefaults 0swaps

960.80user 122.04system 18:18.99elapsed 98%CPU (0avgtext+0avgdata
239644maxresident)k
0inputs+217152outputs (0major+51557386minor)pagefaults 0swaps


1104.08user 124.48system 20:42.13elapsed 98%CPU (0avgtext+0avgdata
239544maxresident)k
984inputs+217152outputs (21major+51552022minor)pagefaults 0swaps

1101.78user 125.66system 20:40.80elapsed 98%CPU (0avgtext+0avgdata
239692maxresident)k
0inputs+217152outputs (0major+51546446minor)pagefaults 0swaps

1102.98user 126.03system 20:43.09elapsed 98%CPU (0avgtext+0avgdata
239592maxresident)k
0inputs+217152outputs (0major+51551238minor)pagefaults 0swaps

1103.34user 125.32system 20:42.82elapsed 98%CPU (0avgtext+0avgdata
239620maxresident)k
0inputs+217152outputs (0major+51554493minor)pagefaults 0swaps


==== without cache, mitigations on ====

967.19user 115.77system 18:17.20elapsed 98%CPU (0avgtext+0avgdata
239536maxresident)k
25112inputs+217152outputs (166major+51935958minor)pagefaults 0swaps

969.05user 114.18system 18:12.92elapsed 99%CPU (0avgtext+0avgdata
239544maxresident)k
0inputs+217152outputs (0major+51938961minor)pagefaults 0swaps

968.51user 116.50system 18:14.64elapsed 99%CPU (0avgtext+0avgdata
239716maxresident)k
0inputs+217152outputs (0major+51937686minor)pagefaults 0swaps

968.53user 115.13system 18:10.33elapsed 99%CPU (0avgtext+0avgdata
239628maxresident)k
0inputs+217152outputs (0major+51938033minor)pagefaults 0swaps


962.85user 121.56system 18:17.73elapsed 98%CPU (0avgtext+0avgdata
239736maxresident)k
0inputs+217152outputs (0major+51549715minor)pagefaults 0swaps

962.51user 121.74system 18:17.42elapsed 98%CPU (0avgtext+0avgdata
239480maxresident)k
0inputs+217152outputs (0major+51558249minor)pagefaults 0swaps

963.37user 121.24system 18:18.59elapsed 98%CPU (0avgtext+0avgdata
239224maxresident)k
0inputs+217152outputs (0major+51551031minor)pagefaults 0swaps

963.71user 120.75system 18:17.70elapsed 98%CPU (0avgtext+0avgdata
239460maxresident)k
0inputs+217152outputs (0major+51555583minor)pagefaults 0swaps


1103.35user 126.49system 20:45.59elapsed 98%CPU (0avgtext+0avgdata
239600maxresident)k
984inputs+217152outputs (21major+51557916minor)pagefaults 0swaps

1103.01user 126.69system 20:45.36elapsed 98%CPU (0avgtext+0avgdata
239708maxresident)k
232inputs+217152outputs (0major+51560311minor)pagefaults 0swaps

1102.97user 127.13system 20:44.73elapsed 98%CPU (0avgtext+0avgdata
239440maxresident)k
0inputs+217152outputs (0major+51552998minor)pagefaults 0swaps

1103.09user 127.01system 20:44.48elapsed 98%CPU (0avgtext+0avgdata
239448maxresident)k
0inputs+217152outputs (0major+51559328minor)pagefaults 0swaps


==== with cache, mitigations off ====

971.35user 114.45system 18:16.36elapsed 99%CPU (0avgtext+0avgdata
239740maxresident)k
1584inputs+217152outputs (10major+51937572minor)pagefaults 0swaps

971.75user 115.18system 18:16.04elapsed 99%CPU (0avgtext+0avgdata
239648maxresident)k
0inputs+217152outputs (0major+51944016minor)pagefaults 0swaps

972.03user 114.47system 18:16.12elapsed 99%CPU (0avgtext+0avgdata
239368maxresident)k
744inputs+217152outputs (0major+51946745minor)pagefaults 0swaps

970.59user 115.13system 18:15.21elapsed 99%CPU (0avgtext+0avgdata
239736maxresident)k
0inputs+217152outputs (1major+51936971minor)pagefaults 0swaps


964.13user 121.15system 18:17.44elapsed 98%CPU (0avgtext+0avgdata
239496maxresident)k
0inputs+217152outputs (0major+51554855minor)pagefaults 0swaps

964.46user 120.73system 18:16.89elapsed 98%CPU (0avgtext+0avgdata
239492maxresident)k
0inputs+217152outputs (0major+51563668minor)pagefaults 0swaps

964.00user 121.71system 18:18.42elapsed 98%CPU (0avgtext+0avgdata
239504maxresident)k
0inputs+217152outputs (0major+51549101minor)pagefaults 0swaps

963.99user 121.46system 18:17.72elapsed 98%CPU (0avgtext+0avgdata
239644maxresident)k
0inputs+217152outputs (0major+51561705minor)pagefaults 0swaps


993.01user 123.83system 18:47.73elapsed 99%CPU (0avgtext+0avgdata
239648maxresident)k
984inputs+217152outputs (21major+51554203minor)pagefaults 0swaps

991.53user 125.49system 18:47.28elapsed 99%CPU (0avgtext+0avgdata
239380maxresident)k
0inputs+217152outputs (0major+51557014minor)pagefaults 0swaps

992.52user 124.53system 18:47.61elapsed 99%CPU (0avgtext+0avgdata
239344maxresident)k
0inputs+217152outputs (0major+51555681minor)pagefaults 0swaps

993.47user 125.01system 18:48.98elapsed 99%CPU (0avgtext+0avgdata
239448maxresident)k
0inputs+217152outputs (0major+51558830minor)pagefaults 0swaps


==== without cache, mitigations off ====

969.87user 118.18system 18:16.82elapsed 99%CPU (0avgtext+0avgdata
239640maxresident)k
0inputs+217152outputs (0major+51937042minor)pagefaults 0swaps

971.42user 114.62system 18:14.93elapsed 99%CPU (0avgtext+0avgdata
239840maxresident)k
0inputs+217152outputs (0major+51937617minor)pagefaults 0swaps

971.73user 114.40system 18:15.39elapsed 99%CPU (0avgtext+0avgdata
239724maxresident)k
0inputs+217152outputs (0major+51937768minor)pagefaults 0swaps

969.71user 117.13system 18:15.95elapsed 99%CPU (0avgtext+0avgdata
239680maxresident)k
0inputs+217152outputs (0major+51940505minor)pagefaults 0swaps


963.51user 121.32system 18:16.91elapsed 98%CPU (0avgtext+0avgdata
239516maxresident)k
0inputs+217152outputs (0major+51561337minor)pagefaults 0swaps

963.10user 120.75system 18:17.34elapsed 98%CPU (0avgtext+0avgdata
239464maxresident)k
0inputs+217152outputs (0major+51547338minor)pagefaults 0swaps

962.27user 122.48system 18:16.59elapsed 98%CPU (0avgtext+0avgdata
239544maxresident)k
0inputs+217152outputs (0major+51552060minor)pagefaults 0swaps

962.83user 120.21system 18:15.37elapsed 98%CPU (0avgtext+0avgdata
239496maxresident)k
0inputs+217152outputs (0major+51553345minor)pagefaults 0swaps


990.69user 125.78system 18:48.93elapsed 98%CPU (0avgtext+0avgdata
239440maxresident)k
984inputs+217152outputs (21major+51558142minor)pagefaults 0swaps

990.76user 126.01system 18:48.88elapsed 98%CPU (0avgtext+0avgdata
239800maxresident)k
0inputs+217152outputs (0major+51558483minor)pagefaults 0swaps

991.06user 125.99system 18:49.30elapsed 98%CPU (0avgtext+0avgdata
239412maxresident)k
0inputs+217152outputs (0major+51556462minor)pagefaults 0swaps

992.33user 124.77system 18:49.09elapsed 98%CPU (0avgtext+0avgdata
239684maxresident)k
0inputs+217152outputs (0major+51549745minor)pagefaults 0swaps

YiFei Zhu
YiFei Zhu Nov. 3, 2020, 1 p.m. UTC | #11
On Fri, Oct 30, 2020 at 7:18 AM YiFei Zhu <zhuyifei1999@gmail.com> wrote:
> I got a bare metal test machine with Intel(R) Xeon(R) CPU E5-2660 v3 @
> 2.60GHz, running Ubuntu 18.04. Test kernels are compiled at
> 57a339117e52 ("selftests/seccomp: Compare bitmap vs filter overhead")
> and 3650b228f83a ("Linux 5.10-rc1"), built with Ubuntu's
> 5.3.0-64-generic's config, then `make olddefconfig`. "Mitigations off"
> indicate the kernel was booted with "nospectre_v2 nospectre_v1
> no_stf_barrier tsx=off tsx_async_abort=off".
>
> The benchmark was single-job make on x86_64 defconfig of 5.9.1, with
> CPU affinity to set only processor #0. Raw results are appended below.
> Each boot is tested by running the build directly and inside docker,
> with and without seccomp. The commands used are attached below. Each
> test is 4 trials, with the middle two (non-minimum, non-maximum) wall
> clock time averaged. Results summary:
>
>                 Mitigations On                  Mitigations Off
>                 With Cache      Without Cache   With Cache      Without Cache
> Native          18:17.38        18:13.78        18:16.08        18:15.67
> D. no seccomp   18:15.54        18:17.71        18:17.58        18:16.75
> D. + seccomp    20:42.47        20:45.04        18:47.67        18:49.01
>
> To be honest, I'm somewhat surprised that it didn't produce as much of
> a dent in the seccomp overhead in this macro benchmark as I had
> expected.

My peers pointed out that in my previous benchmark there are still a
few mitigations left on, and suggested to use "noibrs noibpb nopti
nospectre_v2 nospectre_v1 l1tf=off nospec_store_bypass_disable
no_stf_barrier mds=off tsx=on tsx_async_abort=off mitigations=off".
Results with "Mitigations Off" updated:

                        Mitigations On            Mitigations Off
                With Cache      Without Cache   With Cache      Without Cache
Native          18:17.38        18:13.78        17:43.42        17:47.68
D. no seccomp   18:15.54        18:17.71        17:34.59        17:37.54
D. + seccomp    20:42.47        20:45.04        17:35.70        17:37.16

Whether seccomp is on or off seems not to make much of a difference
for this benchmark. Bitmap being enabled does seem to decrease the
overall compilation time, but it also affects the runs where seccomp is off, so
the speedup is probably from other factors. We are thinking about
using more syscall-intensive workloads, such as httpd.

Though, this does make me wonder, where does the 3-minute overhead with
seccomp with mitigations come from? Is it data cache misses? If that
is the case, can we somehow preload the seccomp bitmap cache maybe? I
mean, mitigations only cause around half a minute slowdown without
seccomp but seccomp somehow amplify the slowdown with an additional
2.5 minutes, so something must be off here.

This is the raw output for the time commands:

==== with cache, mitigations off ====

947.02user 108.62system 17:47.65elapsed 98%CPU (0avgtext+0avgdata
239804maxresident)k
25112inputs+217152outputs (166major+51934447minor)pagefaults 0swaps

947.91user 108.20system 17:46.53elapsed 99%CPU (0avgtext+0avgdata
239576maxresident)k
0inputs+217152outputs (0major+51941524minor)pagefaults 0swaps

948.33user 108.70system 17:47.72elapsed 98%CPU (0avgtext+0avgdata
239604maxresident)k
0inputs+217152outputs (0major+51938566minor)pagefaults 0swaps

948.65user 108.81system 17:48.41elapsed 98%CPU (0avgtext+0avgdata
239692maxresident)k
0inputs+217152outputs (0major+51935349minor)pagefaults 0swaps


932.12user 113.68system 17:37.24elapsed 98%CPU (0avgtext+0avgdata
239660maxresident)k
0inputs+217152outputs (0major+51547571minor)pagefaults 0swap

931.69user 114.12system 17:37.84elapsed 98%CPU (0avgtext+0avgdata
239448maxresident)k
0inputs+217152outputs (0major+51539964minor)pagefaults 0swaps

932.25user 113.39system 17:37.75elapsed 98%CPU (0avgtext+0avgdata
239372maxresident)k
0inputs+217152outputs (0major+51538018minor)pagefaults 0swaps

931.09user 114.25system 17:37.34elapsed 98%CPU (0avgtext+0avgdata
239508maxresident)k
0inputs+217152outputs (0major+51537700minor)pagefaults 0swaps


929.96user 113.42system 17:36.23elapsed 98%CPU (0avgtext+0avgdata
239448maxresident)k
984inputs+217152outputs (22major+51544059minor)pagefaults 0swaps

929.73user 115.13system 17:38.09elapsed 98%CPU (0avgtext+0avgdata
239464maxresident)k
0inputs+217152outputs (0major+51540259minor)pagefaults 0swaps

930.13user 112.71system 17:36.17elapsed 98%CPU (0avgtext+0avgdata
239620maxresident)k
0inputs+217152outputs (0major+51540623minor)pagefaults 0swaps

930.57user 113.02system 17:49.70elapsed 97%CPU (0avgtext+0avgdata
239432maxresident)k
0inputs+217152outputs (0major+51537776minor)pagefaults 0swaps

==== without cache, mitigations off ====

947.59user 108.06system 17:44.56elapsed 99%CPU (0avgtext+0avgdata
239484maxresident)k
25112inputs+217152outputs (167major+51938723minor)pagefaults 0swaps

947.95user 108.58system 17:43.40elapsed 99%CPU (0avgtext+0avgdata
239580maxresident)k
0inputs+217152outputs (0major+51943434minor)pagefaults 0swaps

948.54user 106.62system 17:42.39elapsed 99%CPU (0avgtext+0avgdata
239608maxresident)k
0inputs+217152outputs (0major+51936408minor)pagefaults 0swaps

947.85user 107.92system 17:43.44elapsed 99%CPU (0avgtext+0avgdata
239656maxresident)k
0inputs+217152outputs (0major+51931633minor)pagefaults 0swaps


931.28user 111.16system 17:33.59elapsed 98%CPU (0avgtext+0avgdata
239440maxresident)k
0inputs+217152outputs (0major+51543540minor)pagefaults 0swaps

930.21user 112.56system 17:34.20elapsed 98%CPU (0avgtext+0avgdata
239400maxresident)k
0inputs+217152outputs (0major+51539699minor)pagefaults 0swaps

930.16user 113.74system 17:35.06elapsed 98%CPU (0avgtext+0avgdata
239344maxresident)k
0inputs+217152outputs (0major+51543072minor)pagefaults 0swaps

930.17user 112.77system 17:34.98elapsed 98%CPU (0avgtext+0avgdata
239176maxresident)k
0inputs+217152outputs (0major+51540777minor)pagefaults 0swaps


931.92user 113.31system 17:36.05elapsed 98%CPU (0avgtext+0avgdata
239520maxresident)k
984inputs+217152outputs (22major+51534636minor)pagefaults 0swaps

931.14user 112.81system 17:35.35elapsed 98%CPU (0avgtext+0avgdata
239524maxresident)k
0inputs+217152outputs (0major+51549007minor)pagefaults 0swaps

930.93user 114.56system 17:37.72elapsed 98%CPU (0avgtext+0avgdata
239360maxresident)k
0inputs+217152outputs (0major+51542191minor)pagefaults 0swaps

932.26user 111.54system 17:35.36elapsed 98%CPU (0avgtext+0avgdata
239572maxresident)k
0inputs+217152outputs (0major+51537921minor)pagefaults 0swaps

YiFei Zhu
Kees Cook Nov. 4, 2020, 12:29 a.m. UTC | #12
On Tue, Nov 03, 2020 at 07:00:22AM -0600, YiFei Zhu wrote:
> My peers pointed out that in my previous benchmark there are still a
> few mitigations left on, and suggested to use "noibrs noibpb nopti
> nospectre_v2 nospectre_v1 l1tf=off nospec_store_bypass_disable
> no_stf_barrier mds=off tsx=on tsx_async_abort=off mitigations=off".
> Results with "Mitigations Off" updated:
> 
>                         Mitigations On            Mitigations Off
>                 With Cache      Without Cache   With Cache      Without Cache
> Native          18:17.38        18:13.78        17:43.42        17:47.68
> D. no seccomp   18:15.54        18:17.71        17:34.59        17:37.54
> D. + seccomp    20:42.47        20:45.04        17:35.70        17:37.16
> 
> Whether seccomp is on or off seems not to make much of a difference
> for this benchmark. Bitmap being enabled does seem to decrease the
> overall compilation time but it also affects where seccomp is off, so
> the speedup is probably from other factors. We are thinking about
> using more syscall-intensive workloads, such as httpd.

Yeah, this is very interesting. That there is anything measurably _slower_
with the cache is surprising. Though with only 4 runs, I wonder if it's
still noisy? What happens at 10 runs -- more importantly what is the
standard deviation?

> Though, this does make me wonder, where does the 3-minute overhead with
> seccomp with mitigations come from? Is it data cache misses? If that
> is the case, can we somehow preload the seccomp bitmap cache maybe? I
> mean, mitigations only cause around half a minute slowdown without
> seccomp but seccomp somehow amplify the slowdown with an additional
> 2.5 minutes, so something must be off here.

I assume this is from Indirect Branch Prediction Barrier (IBPB) and
Single Threaded Indirect Branch Prediction (STIBP) (which get enabled
for threads under seccomp by default).

Try booting with "spectre_v2_user=prctl"

https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/spectre.html#spectre-mitigation-control-command-line
YiFei Zhu Nov. 4, 2020, 11:40 a.m. UTC | #13
On Tue, Nov 3, 2020 at 6:29 PM Kees Cook <keescook@chromium.org> wrote:
> Yeah, this is very interesting. That there is anything measurably _slower_
> with the cache is surprising. Though with only 4 runs, I wonder if it's
> still noisy? What happens at 10 runs -- more importantly what is the
> standard deviation?

I could do that. it just takes such a long time. Each run takes about
20 minutes so with 10 runs per environment, 3 environments (native + 2
docker) per boot, and 4 boots (2 bootparam * 2 compile config), it's
27 hours of compilation. I should probably script it at that point.

> I assume this is from Indirect Branch Prediction Barrier (IBPB) and
> Single Threaded Indirect Branch Prediction (STIBP) (which get enabled
> for threads under seccomp by default).
>
> Try booting with "spectre_v2_user=prctl"

Hmm, to make sure, boot with just "spectre_v2_user=prctl" on the
command line and test the performance of that?

YiFei Zhu
Kees Cook Nov. 4, 2020, 6:57 p.m. UTC | #14
On Wed, Nov 04, 2020 at 05:40:51AM -0600, YiFei Zhu wrote:
> On Tue, Nov 3, 2020 at 6:29 PM Kees Cook <keescook@chromium.org> wrote:
> > Yeah, this is very interesting. That there is anything measurably _slower_
> > with the cache is surprising. Though with only 4 runs, I wonder if it's
> > still noisy? What happens at 10 runs -- more importantly what is the
> > standard deviation?
> 
> I could do that. it just takes such a long time. Each run takes about
> 20 minutes so with 10 runs per environment, 3 environments (native + 2
> docker) per boot, and 4 boots (2 bootparam * 2 compile config), it's
> 27 hours of compilation. I should probably script it at that point.

Yeah, I was facing the same issues. Though perhaps hackbench (with
multiple CPUs) would be a better test (and it's much faster):

https://lore.kernel.org/lkml/7723ae8d-8333-ba17-6983-a45ec8b11c54@redhat.com/

(I usually run this with a CNT of 20 to get quick results.)

> > I assume this is from Indirect Branch Prediction Barrier (IBPB) and
> > Single Threaded Indirect Branch Prediction (STIBP) (which get enabled
> > for threads under seccomp by default).
> >
> > Try booting with "spectre_v2_user=prctl"
> 
> Hmm, to make sure, boot with just "spectre_v2_user=prctl" on the
> command line and test the performance of that?

Right, see if that eliminates the 3 minute jump seen for seccomp.
diff mbox series

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 21a3675a7a3a..85239a974f04 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -471,6 +471,15 @@  config HAVE_ARCH_SECCOMP_FILTER
 	    results in the system call being skipped immediately.
 	  - seccomp syscall wired up
 
+config HAVE_ARCH_SECCOMP_CACHE
+	bool
+	help
+	  An arch should select this symbol if it provides all of these things:
+	  - all the requirements for HAVE_ARCH_SECCOMP_FILTER
+	  - SECCOMP_ARCH_NATIVE
+	  - SECCOMP_ARCH_NATIVE_NR
+	  - SECCOMP_ARCH_NATIVE_NAME
+
 config SECCOMP
 	prompt "Enable seccomp to safely execute untrusted bytecode"
 	def_bool y
@@ -498,6 +507,21 @@  config SECCOMP_FILTER
 
 	  See Documentation/userspace-api/seccomp_filter.rst for details.
 
+config SECCOMP_CACHE_DEBUG
+	bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
+	depends on SECCOMP
+	depends on SECCOMP_FILTER
+	depends on PROC_FS
+	help
+	  This enables the /proc/pid/seccomp_cache interface to monitor
+	  seccomp cache data. The file format is subject to change. Reading
+	  the file requires CAP_SYS_ADMIN.
+
+	  This option is for debugging only. Enabling it presents the risk
+	  that an adversary may be able to infer the seccomp filter logic.
+
+	  If unsure, say N.
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1ab22869a765..1a807f89ac77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -150,6 +150,7 @@  config X86
 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SECCOMP_CACHE
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 03365af6165d..cd57c3eabab5 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -19,13 +19,16 @@ 
 #ifdef CONFIG_X86_64
 # define SECCOMP_ARCH_NATIVE		AUDIT_ARCH_X86_64
 # define SECCOMP_ARCH_NATIVE_NR		NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME	"x86_64"
 # ifdef CONFIG_COMPAT
 #  define SECCOMP_ARCH_COMPAT		AUDIT_ARCH_I386
 #  define SECCOMP_ARCH_COMPAT_NR	IA32_NR_syscalls
+#  define SECCOMP_ARCH_COMPAT_NAME	"ia32"
 # endif
 #else /* !CONFIG_X86_64 */
 # define SECCOMP_ARCH_NATIVE		AUDIT_ARCH_I386
 # define SECCOMP_ARCH_NATIVE_NR	        NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME	"ia32"
 #endif
 
 #include <asm-generic/seccomp.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 617db4e0faa0..a4990410ff05 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3258,6 +3258,9 @@  static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3587,6 +3590,9 @@  static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+	ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 02aef2844c38..1f028d55142a 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -121,4 +121,9 @@  static inline long seccomp_get_metadata(struct task_struct *task,
 	return -EINVAL;
 }
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task);
+#endif
 #endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 51032b41fe59..a75746d259a5 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -548,6 +548,9 @@  void seccomp_filter_release(struct task_struct *tsk)
 {
 	struct seccomp_filter *orig = tsk->seccomp.filter;
 
+	/* We are effectively holding the siglock by not having any sighand. */
+	WARN_ON(tsk->sighand != NULL);
+
 	/* Detach task from its filter tree. */
 	tsk->seccomp.filter = NULL;
 	__seccomp_filter_release(orig);
@@ -2308,3 +2311,59 @@  static int __init seccomp_sysctl_init(void)
 device_initcall(seccomp_sysctl_init)
 
 #endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+					const void *bitmap, size_t bitmap_size)
+{
+	int nr;
+
+	for (nr = 0; nr < bitmap_size; nr++) {
+		bool cached = test_bit(nr, bitmap);
+		char *status = cached ? "ALLOW" : "FILTER";
+
+		seq_printf(m, "%s %d %s\n", name, nr, status);
+	}
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+			   struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_filter *f;
+	unsigned long flags;
+
+	/*
+	 * We don't want some sandboxed process to know what its seccomp
+	 * filters consist of.
+	 */
+	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!lock_task_sighand(task, &flags))
+		return 0;
+
+	f = READ_ONCE(task->seccomp.filter);
+	if (!f) {
+		unlock_task_sighand(task, &flags);
+		return 0;
+	}
+
+	/* prevent filter from being freed while we are printing it */
+	__get_seccomp_filter(f);
+	unlock_task_sighand(task, &flags);
+
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+				    f->cache.allow_native,
+				    SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+				    f->cache.allow_compat,
+				    SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+	__put_seccomp_filter(f);
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */