Message ID | 2df320a600020fda055fccf2b668145729dd0c04.1409954077.git.luto@amacapital.net (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi, On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: > This splits syscall_trace_enter into syscall_trace_enter_phase1 and > syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only > phase 2 is permitted to modify any of pt_regs except for orig_ax. This breaks ptrace, see below. > The intent is that phase 1 can be called from the syscall fast path. > > In this implementation, phase1 can handle any combination of > TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT, > unless seccomp requests a ptrace event, in which case phase2 is > forced. > > In principle, this could yield a big speedup for TIF_NOHZ as well as > for TIF_SECCOMP if syscall exit work were similarly split up. > > Signed-off-by: Andy Lutomirski <luto@amacapital.net> > --- > arch/x86/include/asm/ptrace.h | 5 ++ > arch/x86/kernel/ptrace.c | 157 +++++++++++++++++++++++++++++++++++------- > 2 files changed, 138 insertions(+), 24 deletions(-) > > diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h > index 6205f0c434db..86fc2bb82287 100644 > --- a/arch/x86/include/asm/ptrace.h > +++ b/arch/x86/include/asm/ptrace.h > @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); > extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, > int error_code, int si_code); > > + > +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); > +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, > + unsigned long phase1_result); > + > extern long syscall_trace_enter(struct pt_regs *); > extern void syscall_trace_leave(struct pt_regs *); > > diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c > index bbf338a04a5d..29576c244699 100644 > --- a/arch/x86/kernel/ptrace.c > +++ b/arch/x86/kernel/ptrace.c > @@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, > force_sig_info(SIGTRAP, &info, tsk); > } > > +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) > +{ > +#ifdef CONFIG_X86_64 > + if (arch == AUDIT_ARCH_X86_64) { > + audit_syscall_entry(arch, regs->orig_ax, regs->di, > + regs->si, regs->dx, regs->r10); > + } else > +#endif > + { > + audit_syscall_entry(arch, regs->orig_ax, regs->bx, > + regs->cx, regs->dx, regs->si); > + } > +} > + > /* > - * We must return the syscall number to actually look up in the table. > - * This can be -1L to skip running any syscall at all. > + * We can return 0 to resume the syscall or anything else to go to phase > + * 2. If we resume the syscall, we need to put something appropriate in > + * regs->orig_ax. > + * > + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax > + * are fully functional. > + * > + * For phase 2's benefit, our return value is: > + * 0: resume the syscall > + * 1: go to phase 2; no seccomp phase 2 needed > + * anything else: go to phase 2; pass return value to seccomp > */ > -long syscall_trace_enter(struct pt_regs *regs) > +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) > { > - long ret = 0; > + unsigned long ret = 0; > + u32 work; > + > + BUG_ON(regs != task_pt_regs(current)); > + > + work = ACCESS_ONCE(current_thread_info()->flags) & > + _TIF_WORK_SYSCALL_ENTRY; > > /* > * If TIF_NOHZ is set, we are required to call user_exit() before > * doing anything that could touch RCU. > */ > - if (test_thread_flag(TIF_NOHZ)) > + if (work & _TIF_NOHZ) { > user_exit(); > + work &= ~TIF_NOHZ; > + } > + > +#ifdef CONFIG_SECCOMP > + /* > + * Do seccomp first -- it should minimize exposure of other > + * code, and keeping seccomp fast is probably more valuable > + * than the rest of this. > + */ > + if (work & _TIF_SECCOMP) { > + struct seccomp_data sd; > + > + sd.arch = arch; > + sd.nr = regs->orig_ax; > + sd.instruction_pointer = regs->ip; > +#ifdef CONFIG_X86_64 > + if (arch == AUDIT_ARCH_X86_64) { > + sd.args[0] = regs->di; > + sd.args[1] = regs->si; > + sd.args[2] = regs->dx; > + sd.args[3] = regs->r10; > + sd.args[4] = regs->r8; > + sd.args[5] = regs->r9; > + } else > +#endif > + { > + sd.args[0] = regs->bx; > + sd.args[1] = regs->cx; > + sd.args[2] = regs->dx; > + sd.args[3] = regs->si; > + sd.args[4] = regs->di; > + sd.args[5] = regs->bp; > + } > + > + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); > + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); > + > + ret = seccomp_phase1(&sd); > + if (ret == SECCOMP_PHASE1_SKIP) { > + regs->orig_ax = -1; How the tracer is expected to get the correct syscall number after that?
On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > Hi, > > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only >> phase 2 is permitted to modify any of pt_regs except for orig_ax. > > This breaks ptrace, see below. > >> The intent is that phase 1 can be called from the syscall fast path. >> >> In this implementation, phase1 can handle any combination of >> TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT, >> unless seccomp requests a ptrace event, in which case phase2 is >> forced. >> >> In principle, this could yield a big speedup for TIF_NOHZ as well as >> for TIF_SECCOMP if syscall exit work were similarly split up. >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> >> --- >> arch/x86/include/asm/ptrace.h | 5 ++ >> arch/x86/kernel/ptrace.c | 157 +++++++++++++++++++++++++++++++++++------- >> 2 files changed, 138 insertions(+), 24 deletions(-) >> >> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h >> index 6205f0c434db..86fc2bb82287 100644 >> --- a/arch/x86/include/asm/ptrace.h >> +++ b/arch/x86/include/asm/ptrace.h >> @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); >> extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, >> int error_code, int si_code); >> >> + >> +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); >> +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, >> + unsigned long phase1_result); >> + >> extern long syscall_trace_enter(struct pt_regs *); >> extern void syscall_trace_leave(struct pt_regs *); >> >> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c >> index bbf338a04a5d..29576c244699 100644 >> --- a/arch/x86/kernel/ptrace.c >> +++ b/arch/x86/kernel/ptrace.c >> @@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, >> force_sig_info(SIGTRAP, &info, tsk); >> } >> >> +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) >> +{ >> +#ifdef CONFIG_X86_64 >> + if (arch == AUDIT_ARCH_X86_64) { >> + audit_syscall_entry(arch, regs->orig_ax, regs->di, >> + regs->si, regs->dx, regs->r10); >> + } else >> +#endif >> + { >> + audit_syscall_entry(arch, regs->orig_ax, regs->bx, >> + regs->cx, regs->dx, regs->si); >> + } >> +} >> + >> /* >> - * We must return the syscall number to actually look up in the table. >> - * This can be -1L to skip running any syscall at all. >> + * We can return 0 to resume the syscall or anything else to go to phase >> + * 2. If we resume the syscall, we need to put something appropriate in >> + * regs->orig_ax. >> + * >> + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax >> + * are fully functional. >> + * >> + * For phase 2's benefit, our return value is: >> + * 0: resume the syscall >> + * 1: go to phase 2; no seccomp phase 2 needed >> + * anything else: go to phase 2; pass return value to seccomp >> */ >> -long syscall_trace_enter(struct pt_regs *regs) >> +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) >> { >> - long ret = 0; >> + unsigned long ret = 0; >> + u32 work; >> + >> + BUG_ON(regs != task_pt_regs(current)); >> + >> + work = ACCESS_ONCE(current_thread_info()->flags) & >> + _TIF_WORK_SYSCALL_ENTRY; >> >> /* >> * If TIF_NOHZ is set, we are required to call user_exit() before >> * doing anything that could touch RCU. >> */ >> - if (test_thread_flag(TIF_NOHZ)) >> + if (work & _TIF_NOHZ) { >> user_exit(); >> + work &= ~TIF_NOHZ; >> + } >> + >> +#ifdef CONFIG_SECCOMP >> + /* >> + * Do seccomp first -- it should minimize exposure of other >> + * code, and keeping seccomp fast is probably more valuable >> + * than the rest of this. >> + */ >> + if (work & _TIF_SECCOMP) { >> + struct seccomp_data sd; >> + >> + sd.arch = arch; >> + sd.nr = regs->orig_ax; >> + sd.instruction_pointer = regs->ip; >> +#ifdef CONFIG_X86_64 >> + if (arch == AUDIT_ARCH_X86_64) { >> + sd.args[0] = regs->di; >> + sd.args[1] = regs->si; >> + sd.args[2] = regs->dx; >> + sd.args[3] = regs->r10; >> + sd.args[4] = regs->r8; >> + sd.args[5] = regs->r9; >> + } else >> +#endif >> + { >> + sd.args[0] = regs->bx; >> + sd.args[1] = regs->cx; >> + sd.args[2] = regs->dx; >> + sd.args[3] = regs->si; >> + sd.args[4] = regs->di; >> + sd.args[5] = regs->bp; >> + } >> + >> + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); >> + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); >> + >> + ret = seccomp_phase1(&sd); >> + if (ret == SECCOMP_PHASE1_SKIP) { >> + regs->orig_ax = -1; > > How the tracer is expected to get the correct syscall number after that? There shouldn't be a tracer if a skip is encountered. (A seccomp skip would skip ptrace.) This behavior hasn't changed, but maybe I don't see what you mean? (I haven't encountered any problems with syscall tracing as a result of these changes.) -Kees
On Thu, Feb 05, 2015 at 01:27:16PM -0800, Kees Cook wrote: > On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > > Hi, > > > > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: > >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and > >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only > >> phase 2 is permitted to modify any of pt_regs except for orig_ax. > > > > This breaks ptrace, see below. > > > >> The intent is that phase 1 can be called from the syscall fast path. > >> > >> In this implementation, phase1 can handle any combination of > >> TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT, > >> unless seccomp requests a ptrace event, in which case phase2 is > >> forced. > >> > >> In principle, this could yield a big speedup for TIF_NOHZ as well as > >> for TIF_SECCOMP if syscall exit work were similarly split up. > >> > >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> > >> --- > >> arch/x86/include/asm/ptrace.h | 5 ++ > >> arch/x86/kernel/ptrace.c | 157 +++++++++++++++++++++++++++++++++++------- > >> 2 files changed, 138 insertions(+), 24 deletions(-) > >> > >> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h > >> index 6205f0c434db..86fc2bb82287 100644 > >> --- a/arch/x86/include/asm/ptrace.h > >> +++ b/arch/x86/include/asm/ptrace.h > >> @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); > >> extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, > >> int error_code, int si_code); > >> > >> + > >> +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); > >> +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, > >> + unsigned long phase1_result); > >> + > >> extern long syscall_trace_enter(struct pt_regs *); > >> extern void syscall_trace_leave(struct pt_regs *); > >> > >> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c > >> index bbf338a04a5d..29576c244699 100644 > >> --- a/arch/x86/kernel/ptrace.c > >> +++ b/arch/x86/kernel/ptrace.c > >> @@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, > >> force_sig_info(SIGTRAP, &info, tsk); > >> } > >> > >> +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) > >> +{ > >> +#ifdef CONFIG_X86_64 > >> + if (arch == AUDIT_ARCH_X86_64) { > >> + audit_syscall_entry(arch, regs->orig_ax, regs->di, > >> + regs->si, regs->dx, regs->r10); > >> + } else > >> +#endif > >> + { > >> + audit_syscall_entry(arch, regs->orig_ax, regs->bx, > >> + regs->cx, regs->dx, regs->si); > >> + } > >> +} > >> + > >> /* > >> - * We must return the syscall number to actually look up in the table. > >> - * This can be -1L to skip running any syscall at all. > >> + * We can return 0 to resume the syscall or anything else to go to phase > >> + * 2. If we resume the syscall, we need to put something appropriate in > >> + * regs->orig_ax. > >> + * > >> + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax > >> + * are fully functional. > >> + * > >> + * For phase 2's benefit, our return value is: > >> + * 0: resume the syscall > >> + * 1: go to phase 2; no seccomp phase 2 needed > >> + * anything else: go to phase 2; pass return value to seccomp > >> */ > >> -long syscall_trace_enter(struct pt_regs *regs) > >> +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) > >> { > >> - long ret = 0; > >> + unsigned long ret = 0; > >> + u32 work; > >> + > >> + BUG_ON(regs != task_pt_regs(current)); > >> + > >> + work = ACCESS_ONCE(current_thread_info()->flags) & > >> + _TIF_WORK_SYSCALL_ENTRY; > >> > >> /* > >> * If TIF_NOHZ is set, we are required to call user_exit() before > >> * doing anything that could touch RCU. > >> */ > >> - if (test_thread_flag(TIF_NOHZ)) > >> + if (work & _TIF_NOHZ) { > >> user_exit(); > >> + work &= ~TIF_NOHZ; > >> + } > >> + > >> +#ifdef CONFIG_SECCOMP > >> + /* > >> + * Do seccomp first -- it should minimize exposure of other > >> + * code, and keeping seccomp fast is probably more valuable > >> + * than the rest of this. > >> + */ > >> + if (work & _TIF_SECCOMP) { > >> + struct seccomp_data sd; > >> + > >> + sd.arch = arch; > >> + sd.nr = regs->orig_ax; > >> + sd.instruction_pointer = regs->ip; > >> +#ifdef CONFIG_X86_64 > >> + if (arch == AUDIT_ARCH_X86_64) { > >> + sd.args[0] = regs->di; > >> + sd.args[1] = regs->si; > >> + sd.args[2] = regs->dx; > >> + sd.args[3] = regs->r10; > >> + sd.args[4] = regs->r8; > >> + sd.args[5] = regs->r9; > >> + } else > >> +#endif > >> + { > >> + sd.args[0] = regs->bx; > >> + sd.args[1] = regs->cx; > >> + sd.args[2] = regs->dx; > >> + sd.args[3] = regs->si; > >> + sd.args[4] = regs->di; > >> + sd.args[5] = regs->bp; > >> + } > >> + > >> + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); > >> + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); > >> + > >> + ret = seccomp_phase1(&sd); > >> + if (ret == SECCOMP_PHASE1_SKIP) { > >> + regs->orig_ax = -1; > > > > How the tracer is expected to get the correct syscall number after that? > > There shouldn't be a tracer if a skip is encountered. (A seccomp skip > would skip ptrace.) This behavior hasn't changed, but maybe I don't > see what you mean? (I haven't encountered any problems with syscall > tracing as a result of these changes.) SECCOMP_RET_ERRNO leads to SECCOMP_PHASE1_SKIP, and if there is a tracer, it will get -1 as a syscall number. I've found this while testing a strace parser for SECCOMP_MODE_FILTER/SECCOMP_SET_MODE_FILTER, so the problem is quite real.
On Thu, Feb 5, 2015 at 1:40 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > On Thu, Feb 05, 2015 at 01:27:16PM -0800, Kees Cook wrote: >> On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> > Hi, >> > >> > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: >> >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and >> >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only >> >> phase 2 is permitted to modify any of pt_regs except for orig_ax. >> > >> > This breaks ptrace, see below. >> > >> >> The intent is that phase 1 can be called from the syscall fast path. >> >> >> >> In this implementation, phase1 can handle any combination of >> >> TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT, >> >> unless seccomp requests a ptrace event, in which case phase2 is >> >> forced. >> >> >> >> In principle, this could yield a big speedup for TIF_NOHZ as well as >> >> for TIF_SECCOMP if syscall exit work were similarly split up. >> >> >> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> >> >> --- >> >> arch/x86/include/asm/ptrace.h | 5 ++ >> >> arch/x86/kernel/ptrace.c | 157 +++++++++++++++++++++++++++++++++++------- >> >> 2 files changed, 138 insertions(+), 24 deletions(-) >> >> >> >> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h >> >> index 6205f0c434db..86fc2bb82287 100644 >> >> --- a/arch/x86/include/asm/ptrace.h >> >> +++ b/arch/x86/include/asm/ptrace.h >> >> @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); >> >> extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, >> >> int error_code, int si_code); >> >> >> >> + >> >> +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); >> >> +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, >> >> + unsigned long phase1_result); >> >> + >> >> extern long syscall_trace_enter(struct pt_regs *); >> >> extern void syscall_trace_leave(struct pt_regs *); >> >> >> >> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c >> >> index bbf338a04a5d..29576c244699 100644 >> >> --- a/arch/x86/kernel/ptrace.c >> >> +++ b/arch/x86/kernel/ptrace.c >> >> @@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, >> >> force_sig_info(SIGTRAP, &info, tsk); >> >> } >> >> >> >> +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) >> >> +{ >> >> +#ifdef CONFIG_X86_64 >> >> + if (arch == AUDIT_ARCH_X86_64) { >> >> + audit_syscall_entry(arch, regs->orig_ax, regs->di, >> >> + regs->si, regs->dx, regs->r10); >> >> + } else >> >> +#endif >> >> + { >> >> + audit_syscall_entry(arch, regs->orig_ax, regs->bx, >> >> + regs->cx, regs->dx, regs->si); >> >> + } >> >> +} >> >> + >> >> /* >> >> - * We must return the syscall number to actually look up in the table. >> >> - * This can be -1L to skip running any syscall at all. >> >> + * We can return 0 to resume the syscall or anything else to go to phase >> >> + * 2. If we resume the syscall, we need to put something appropriate in >> >> + * regs->orig_ax. >> >> + * >> >> + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax >> >> + * are fully functional. >> >> + * >> >> + * For phase 2's benefit, our return value is: >> >> + * 0: resume the syscall >> >> + * 1: go to phase 2; no seccomp phase 2 needed >> >> + * anything else: go to phase 2; pass return value to seccomp >> >> */ >> >> -long syscall_trace_enter(struct pt_regs *regs) >> >> +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) >> >> { >> >> - long ret = 0; >> >> + unsigned long ret = 0; >> >> + u32 work; >> >> + >> >> + BUG_ON(regs != task_pt_regs(current)); >> >> + >> >> + work = ACCESS_ONCE(current_thread_info()->flags) & >> >> + _TIF_WORK_SYSCALL_ENTRY; >> >> >> >> /* >> >> * If TIF_NOHZ is set, we are required to call user_exit() before >> >> * doing anything that could touch RCU. >> >> */ >> >> - if (test_thread_flag(TIF_NOHZ)) >> >> + if (work & _TIF_NOHZ) { >> >> user_exit(); >> >> + work &= ~TIF_NOHZ; >> >> + } >> >> + >> >> +#ifdef CONFIG_SECCOMP >> >> + /* >> >> + * Do seccomp first -- it should minimize exposure of other >> >> + * code, and keeping seccomp fast is probably more valuable >> >> + * than the rest of this. >> >> + */ >> >> + if (work & _TIF_SECCOMP) { >> >> + struct seccomp_data sd; >> >> + >> >> + sd.arch = arch; >> >> + sd.nr = regs->orig_ax; >> >> + sd.instruction_pointer = regs->ip; >> >> +#ifdef CONFIG_X86_64 >> >> + if (arch == AUDIT_ARCH_X86_64) { >> >> + sd.args[0] = regs->di; >> >> + sd.args[1] = regs->si; >> >> + sd.args[2] = regs->dx; >> >> + sd.args[3] = regs->r10; >> >> + sd.args[4] = regs->r8; >> >> + sd.args[5] = regs->r9; >> >> + } else >> >> +#endif >> >> + { >> >> + sd.args[0] = regs->bx; >> >> + sd.args[1] = regs->cx; >> >> + sd.args[2] = regs->dx; >> >> + sd.args[3] = regs->si; >> >> + sd.args[4] = regs->di; >> >> + sd.args[5] = regs->bp; >> >> + } >> >> + >> >> + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); >> >> + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); >> >> + >> >> + ret = seccomp_phase1(&sd); >> >> + if (ret == SECCOMP_PHASE1_SKIP) { >> >> + regs->orig_ax = -1; >> > >> > How the tracer is expected to get the correct syscall number after that? >> >> There shouldn't be a tracer if a skip is encountered. (A seccomp skip >> would skip ptrace.) This behavior hasn't changed, but maybe I don't >> see what you mean? (I haven't encountered any problems with syscall >> tracing as a result of these changes.) > > SECCOMP_RET_ERRNO leads to SECCOMP_PHASE1_SKIP, and if there is a tracer, > it will get -1 as a syscall number. > > I've found this while testing a strace parser for > SECCOMP_MODE_FILTER/SECCOMP_SET_MODE_FILTER, so the problem is quite real. > > Hasn't it always been this way? I admit that I kind of wish this worked the other way -- that is, I think it would be nice to have a mode in which ptrace runs before seccomp, which would close the ptrace hole (where ptrace can do things that seccomp would disallow) and maybe have more comprehensible results. --Andy > -- > ldv
On Thu, Feb 5, 2015 at 1:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Thu, Feb 5, 2015 at 1:40 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> On Thu, Feb 05, 2015 at 01:27:16PM -0800, Kees Cook wrote: >>> On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>> > Hi, >>> > >>> > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: >>> >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and >>> >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only >>> >> phase 2 is permitted to modify any of pt_regs except for orig_ax. >>> > >>> > This breaks ptrace, see below. >>> > >>> >> The intent is that phase 1 can be called from the syscall fast path. >>> >> >>> >> In this implementation, phase1 can handle any combination of >>> >> TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT, >>> >> unless seccomp requests a ptrace event, in which case phase2 is >>> >> forced. >>> >> >>> >> In principle, this could yield a big speedup for TIF_NOHZ as well as >>> >> for TIF_SECCOMP if syscall exit work were similarly split up. >>> >> >>> >> Signed-off-by: Andy Lutomirski <luto@amacapital.net> >>> >> --- >>> >> arch/x86/include/asm/ptrace.h | 5 ++ >>> >> arch/x86/kernel/ptrace.c | 157 +++++++++++++++++++++++++++++++++++------- >>> >> 2 files changed, 138 insertions(+), 24 deletions(-) >>> >> >>> >> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h >>> >> index 6205f0c434db..86fc2bb82287 100644 >>> >> --- a/arch/x86/include/asm/ptrace.h >>> >> +++ b/arch/x86/include/asm/ptrace.h >>> >> @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); >>> >> extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, >>> >> int error_code, int si_code); >>> >> >>> >> + >>> >> +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); >>> >> +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, >>> >> + unsigned long phase1_result); >>> >> + >>> >> extern long syscall_trace_enter(struct pt_regs *); >>> >> extern void syscall_trace_leave(struct pt_regs *); >>> >> >>> >> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c >>> >> index bbf338a04a5d..29576c244699 100644 >>> >> --- a/arch/x86/kernel/ptrace.c >>> >> +++ b/arch/x86/kernel/ptrace.c >>> >> @@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, >>> >> force_sig_info(SIGTRAP, &info, tsk); >>> >> } >>> >> >>> >> +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) >>> >> +{ >>> >> +#ifdef CONFIG_X86_64 >>> >> + if (arch == AUDIT_ARCH_X86_64) { >>> >> + audit_syscall_entry(arch, regs->orig_ax, regs->di, >>> >> + regs->si, regs->dx, regs->r10); >>> >> + } else >>> >> +#endif >>> >> + { >>> >> + audit_syscall_entry(arch, regs->orig_ax, regs->bx, >>> >> + regs->cx, regs->dx, regs->si); >>> >> + } >>> >> +} >>> >> + >>> >> /* >>> >> - * We must return the syscall number to actually look up in the table. >>> >> - * This can be -1L to skip running any syscall at all. >>> >> + * We can return 0 to resume the syscall or anything else to go to phase >>> >> + * 2. If we resume the syscall, we need to put something appropriate in >>> >> + * regs->orig_ax. >>> >> + * >>> >> + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax >>> >> + * are fully functional. >>> >> + * >>> >> + * For phase 2's benefit, our return value is: >>> >> + * 0: resume the syscall >>> >> + * 1: go to phase 2; no seccomp phase 2 needed >>> >> + * anything else: go to phase 2; pass return value to seccomp >>> >> */ >>> >> -long syscall_trace_enter(struct pt_regs *regs) >>> >> +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) >>> >> { >>> >> - long ret = 0; >>> >> + unsigned long ret = 0; >>> >> + u32 work; >>> >> + >>> >> + BUG_ON(regs != task_pt_regs(current)); >>> >> + >>> >> + work = ACCESS_ONCE(current_thread_info()->flags) & >>> >> + _TIF_WORK_SYSCALL_ENTRY; >>> >> >>> >> /* >>> >> * If TIF_NOHZ is set, we are required to call user_exit() before >>> >> * doing anything that could touch RCU. >>> >> */ >>> >> - if (test_thread_flag(TIF_NOHZ)) >>> >> + if (work & _TIF_NOHZ) { >>> >> user_exit(); >>> >> + work &= ~TIF_NOHZ; >>> >> + } >>> >> + >>> >> +#ifdef CONFIG_SECCOMP >>> >> + /* >>> >> + * Do seccomp first -- it should minimize exposure of other >>> >> + * code, and keeping seccomp fast is probably more valuable >>> >> + * than the rest of this. >>> >> + */ >>> >> + if (work & _TIF_SECCOMP) { >>> >> + struct seccomp_data sd; >>> >> + >>> >> + sd.arch = arch; >>> >> + sd.nr = regs->orig_ax; >>> >> + sd.instruction_pointer = regs->ip; >>> >> +#ifdef CONFIG_X86_64 >>> >> + if (arch == AUDIT_ARCH_X86_64) { >>> >> + sd.args[0] = regs->di; >>> >> + sd.args[1] = regs->si; >>> >> + sd.args[2] = regs->dx; >>> >> + sd.args[3] = regs->r10; >>> >> + sd.args[4] = regs->r8; >>> >> + sd.args[5] = regs->r9; >>> >> + } else >>> >> +#endif >>> >> + { >>> >> + sd.args[0] = regs->bx; >>> >> + sd.args[1] = regs->cx; >>> >> + sd.args[2] = regs->dx; >>> >> + sd.args[3] = regs->si; >>> >> + sd.args[4] = regs->di; >>> >> + sd.args[5] = regs->bp; >>> >> + } >>> >> + >>> >> + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); >>> >> + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); >>> >> + >>> >> + ret = seccomp_phase1(&sd); >>> >> + if (ret == SECCOMP_PHASE1_SKIP) { >>> >> + regs->orig_ax = -1; >>> > >>> > How the tracer is expected to get the correct syscall number after that? >>> >>> There shouldn't be a tracer if a skip is encountered. (A seccomp skip >>> would skip ptrace.) This behavior hasn't changed, but maybe I don't >>> see what you mean? (I haven't encountered any problems with syscall >>> tracing as a result of these changes.) >> >> SECCOMP_RET_ERRNO leads to SECCOMP_PHASE1_SKIP, and if there is a tracer, >> it will get -1 as a syscall number. >> >> I've found this while testing a strace parser for >> SECCOMP_MODE_FILTER/SECCOMP_SET_MODE_FILTER, so the problem is quite real. >> >> > > Hasn't it always been this way? As far as I know, yes, it's always been this way. The point is to the skip the syscall, which is what the -1 signals. Userspace then reads back the errno. > I admit that I kind of wish this worked the other way -- that is, I > think it would be nice to have a mode in which ptrace runs before > seccomp, which would close the ptrace hole (where ptrace can do things > that seccomp would disallow) and maybe have more comprehensible > results. I prefer keeping the seccomp attack surface as tiny as possible. I would not like to ptrace happening before seccomp. -Kees
On Thu, Feb 05, 2015 at 03:12:39PM -0800, Kees Cook wrote: > On Thu, Feb 5, 2015 at 1:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: > > On Thu, Feb 5, 2015 at 1:40 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > >> On Thu, Feb 05, 2015 at 01:27:16PM -0800, Kees Cook wrote: > >>> On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > >>> > Hi, > >>> > > >>> > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: > >>> >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and > >>> >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only > >>> >> phase 2 is permitted to modify any of pt_regs except for orig_ax. > >>> > > >>> > This breaks ptrace, see below. > >>> > [...] > >>> >> + ret = seccomp_phase1(&sd); > >>> >> + if (ret == SECCOMP_PHASE1_SKIP) { > >>> >> + regs->orig_ax = -1; > >>> > > >>> > How the tracer is expected to get the correct syscall number after that? > >>> > >>> There shouldn't be a tracer if a skip is encountered. (A seccomp skip > >>> would skip ptrace.) This behavior hasn't changed, but maybe I don't > >>> see what you mean? (I haven't encountered any problems with syscall > >>> tracing as a result of these changes.) > >> > >> SECCOMP_RET_ERRNO leads to SECCOMP_PHASE1_SKIP, and if there is a tracer, > >> it will get -1 as a syscall number. > >> > >> I've found this while testing a strace parser for > >> SECCOMP_MODE_FILTER/SECCOMP_SET_MODE_FILTER, so the problem is quite real. > > > > Hasn't it always been this way? > > As far as I know, yes, it's always been this way. The point is to the > skip the syscall, which is what the -1 signals. Userspace then reads > back the errno. There is a clear difference: before these changes, SECCOMP_RET_ERRNO used to keep the syscall number unchanged and suppress syscall-exit-stop event, which was awful because userspace cannot distinguish syscall-enter-stop from syscall-exit-stop and therefore relies on the kernel that syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop events to be suppressed, but now the syscall number is lost.
On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > On Thu, Feb 05, 2015 at 03:12:39PM -0800, Kees Cook wrote: >> On Thu, Feb 5, 2015 at 1:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> > On Thu, Feb 5, 2015 at 1:40 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> >> On Thu, Feb 05, 2015 at 01:27:16PM -0800, Kees Cook wrote: >> >>> On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> >>> > Hi, >> >>> > >> >>> > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: >> >>> >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and >> >>> >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only >> >>> >> phase 2 is permitted to modify any of pt_regs except for orig_ax. >> >>> > >> >>> > This breaks ptrace, see below. >> >>> > > [...] >> >>> >> + ret = seccomp_phase1(&sd); >> >>> >> + if (ret == SECCOMP_PHASE1_SKIP) { >> >>> >> + regs->orig_ax = -1; >> >>> > >> >>> > How the tracer is expected to get the correct syscall number after that? >> >>> >> >>> There shouldn't be a tracer if a skip is encountered. (A seccomp skip >> >>> would skip ptrace.) This behavior hasn't changed, but maybe I don't >> >>> see what you mean? (I haven't encountered any problems with syscall >> >>> tracing as a result of these changes.) >> >> >> >> SECCOMP_RET_ERRNO leads to SECCOMP_PHASE1_SKIP, and if there is a tracer, >> >> it will get -1 as a syscall number. >> >> >> >> I've found this while testing a strace parser for >> >> SECCOMP_MODE_FILTER/SECCOMP_SET_MODE_FILTER, so the problem is quite real. >> > >> > Hasn't it always been this way? >> >> As far as I know, yes, it's always been this way. The point is to the >> skip the syscall, which is what the -1 signals. Userspace then reads >> back the errno. > > There is a clear difference: before these changes, SECCOMP_RET_ERRNO used > to keep the syscall number unchanged and suppress syscall-exit-stop event, > which was awful because userspace cannot distinguish syscall-enter-stop > from syscall-exit-stop and therefore relies on the kernel that > syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). > > After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop > events to be suppressed, but now the syscall number is lost. Ah-ha! Okay, thanks, I understand now. I think this means seccomp phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you think here? -Kees
On Thu, Feb 5, 2015 at 3:49 PM, Kees Cook <keescook@chromium.org> wrote: > On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> On Thu, Feb 05, 2015 at 03:12:39PM -0800, Kees Cook wrote: >>> On Thu, Feb 5, 2015 at 1:52 PM, Andy Lutomirski <luto@amacapital.net> wrote: >>> > On Thu, Feb 5, 2015 at 1:40 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>> >> On Thu, Feb 05, 2015 at 01:27:16PM -0800, Kees Cook wrote: >>> >>> On Thu, Feb 5, 2015 at 1:19 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>> >>> > Hi, >>> >>> > >>> >>> > On Fri, Sep 05, 2014 at 03:13:54PM -0700, Andy Lutomirski wrote: >>> >>> >> This splits syscall_trace_enter into syscall_trace_enter_phase1 and >>> >>> >> syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only >>> >>> >> phase 2 is permitted to modify any of pt_regs except for orig_ax. >>> >>> > >>> >>> > This breaks ptrace, see below. >>> >>> > >> [...] >>> >>> >> + ret = seccomp_phase1(&sd); >>> >>> >> + if (ret == SECCOMP_PHASE1_SKIP) { >>> >>> >> + regs->orig_ax = -1; >>> >>> > >>> >>> > How the tracer is expected to get the correct syscall number after that? >>> >>> >>> >>> There shouldn't be a tracer if a skip is encountered. (A seccomp skip >>> >>> would skip ptrace.) This behavior hasn't changed, but maybe I don't >>> >>> see what you mean? (I haven't encountered any problems with syscall >>> >>> tracing as a result of these changes.) >>> >> >>> >> SECCOMP_RET_ERRNO leads to SECCOMP_PHASE1_SKIP, and if there is a tracer, >>> >> it will get -1 as a syscall number. >>> >> >>> >> I've found this while testing a strace parser for >>> >> SECCOMP_MODE_FILTER/SECCOMP_SET_MODE_FILTER, so the problem is quite real. >>> > >>> > Hasn't it always been this way? >>> >>> As far as I know, yes, it's always been this way. The point is to the >>> skip the syscall, which is what the -1 signals. Userspace then reads >>> back the errno. >> >> There is a clear difference: before these changes, SECCOMP_RET_ERRNO used >> to keep the syscall number unchanged and suppress syscall-exit-stop event, >> which was awful because userspace cannot distinguish syscall-enter-stop >> from syscall-exit-stop and therefore relies on the kernel that >> syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). >> >> After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop >> events to be suppressed, but now the syscall number is lost. > > Ah-ha! Okay, thanks, I understand now. I think this means seccomp > phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you > think here? > I still don't quite see how this change caused this. I can play with it a bit more. But RET_ERRNO *has* to be some kind of skip event, because it needs to skip the syscall. We could change this by treating RET_ERRNO as an instruction to enter phase 2 and then asking for a skip in phase 2 without changing orig_ax, but IMO this is pretty ugly. I think this all kind of sucks. We're trying to run ptrace after seccomp, so ptrace is seeing the syscalls as transformed by seccomp. That means that if we use RET_TRAP, then ptrace will see the possibly-modified syscall, if we use RET_ERRNO, then ptrace is (IMO correctly given the current design) showing syscall -1, and if we use RET_KILL, then ptrace just sees the process mysteriously die. I think it would be more useful and easier to understand if ptrace saw syscalls as the traced process saw them, i.e. before seccomp modification. How would this meaningfully increase the attack surface? As far as ptrace is concerned, a syscall is just seven numbers, and as long as a process can issue *any* syscall that seccomp allows, then it can invoke the ptrace hooks. I don't think the entry/exit hooks care *at all* about the syscall nr or args. Audit is a different story. I think we should absolutely continue to audit syscalls that actually happen, not syscalls that were requested. Given this bug, I doubt we'd break anything if we changed it, since it appears that it's already rather broken. Also, changing it would make me happy, because I want to add a SECCOMP_RET_MONITOR that freezes the process, sends an event to a seccompfd, and then executes syscalls as requested by the holder of the seccompfd. Those syscalls would, in turn, be filtered again through inner layers of seccomp. One sticking point would be that the current ptrace behavior is very hard to reconcile with this type of feature. --Andy
On Thu, Feb 05, 2015 at 04:09:06PM -0800, Andy Lutomirski wrote: > On Thu, Feb 5, 2015 at 3:49 PM, Kees Cook <keescook@chromium.org> wrote: > > On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: [...] > >> There is a clear difference: before these changes, SECCOMP_RET_ERRNO used > >> to keep the syscall number unchanged and suppress syscall-exit-stop event, > >> which was awful because userspace cannot distinguish syscall-enter-stop > >> from syscall-exit-stop and therefore relies on the kernel that > >> syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). > >> > >> After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop > >> events to be suppressed, but now the syscall number is lost. > > > > Ah-ha! Okay, thanks, I understand now. I think this means seccomp > > phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you > > think here? > > I still don't quite see how this change caused this. I have a test for this at http://sourceforge.net/p/strace/code/ci/HEAD/~/tree/test/seccomp.c > I can play with > it a bit more. But RET_ERRNO *has* to be some kind of skip event, > because it needs to skip the syscall. > > We could change this by treating RET_ERRNO as an instruction to enter > phase 2 and then asking for a skip in phase 2 without changing > orig_ax, but IMO this is pretty ugly. > > I think this all kind of sucks. We're trying to run ptrace after > seccomp, so ptrace is seeing the syscalls as transformed by seccomp. > That means that if we use RET_TRAP, then ptrace will see the > possibly-modified syscall, if we use RET_ERRNO, then ptrace is (IMO > correctly given the current design) showing syscall -1, and if we use > RET_KILL, then ptrace just sees the process mysteriously die. Userspace is usually not prepared to see syscall -1. For example, strace had to be patched, otherwise it just skipped such syscalls as "not a syscall" events or did other improper things: http://sourceforge.net/p/strace/code/ci/c3948327717c29b10b5e00a436dc138b4ab1a486 http://sourceforge.net/p/strace/code/ci/8e398b6c4020fb2d33a5b3e40271ebf63199b891 A slightly different but related story: userspace is also not prepared to handle large errno values produced by seccomp filters like this: BPF_STMT(BPF_RET, SECCOMP_RET_ERRNO | SECCOMP_RET_DATA) For example, glibc assumes that syscalls do not return errno values greater than 0xfff: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sysdep.h#l55 https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/syscall.S#l20 If it isn't too late, I'd recommend changing SECCOMP_RET_DATA mask applied in SECCOMP_RET_ERRNO case from current 0xffff to 0xfff.
On Thu, Feb 5, 2015 at 6:32 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > On Thu, Feb 05, 2015 at 04:09:06PM -0800, Andy Lutomirski wrote: >> On Thu, Feb 5, 2015 at 3:49 PM, Kees Cook <keescook@chromium.org> wrote: >> > On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > [...] >> >> There is a clear difference: before these changes, SECCOMP_RET_ERRNO used >> >> to keep the syscall number unchanged and suppress syscall-exit-stop event, >> >> which was awful because userspace cannot distinguish syscall-enter-stop >> >> from syscall-exit-stop and therefore relies on the kernel that >> >> syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). >> >> >> >> After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop >> >> events to be suppressed, but now the syscall number is lost. >> > >> > Ah-ha! Okay, thanks, I understand now. I think this means seccomp >> > phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you >> > think here? >> >> I still don't quite see how this change caused this. > > I have a test for this at > http://sourceforge.net/p/strace/code/ci/HEAD/~/tree/test/seccomp.c > >> I can play with >> it a bit more. But RET_ERRNO *has* to be some kind of skip event, >> because it needs to skip the syscall. >> >> We could change this by treating RET_ERRNO as an instruction to enter >> phase 2 and then asking for a skip in phase 2 without changing >> orig_ax, but IMO this is pretty ugly. >> >> I think this all kind of sucks. We're trying to run ptrace after >> seccomp, so ptrace is seeing the syscalls as transformed by seccomp. >> That means that if we use RET_TRAP, then ptrace will see the >> possibly-modified syscall, if we use RET_ERRNO, then ptrace is (IMO >> correctly given the current design) showing syscall -1, and if we use >> RET_KILL, then ptrace just sees the process mysteriously die. > > Userspace is usually not prepared to see syscall -1. > For example, strace had to be patched, otherwise it just skipped such > syscalls as "not a syscall" events or did other improper things: > http://sourceforge.net/p/strace/code/ci/c3948327717c29b10b5e00a436dc138b4ab1a486 > http://sourceforge.net/p/strace/code/ci/8e398b6c4020fb2d33a5b3e40271ebf63199b891 > The x32 thing is a legit ABI bug, I'd argue. I'd be happy to submit a patch to fix that (clear the x32 bit if we're not x32). > A slightly different but related story: userspace is also not prepared > to handle large errno values produced by seccomp filters like this: > BPF_STMT(BPF_RET, SECCOMP_RET_ERRNO | SECCOMP_RET_DATA) > > For example, glibc assumes that syscalls do not return errno values greater than 0xfff: > https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sysdep.h#l55 > https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/syscall.S#l20 > > If it isn't too late, I'd recommend changing SECCOMP_RET_DATA mask > applied in SECCOMP_RET_ERRNO case from current 0xffff to 0xfff. I think this is solidly in the "don't do that" category. Seccomp lets you tamper with syscalls. If you tamper wrong, then you lose. Kees, what do you think about reversing the whole thing so that ptracers always see the original syscall? --Andy
On Thu, Feb 5, 2015 at 6:38 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Thu, Feb 5, 2015 at 6:32 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> On Thu, Feb 05, 2015 at 04:09:06PM -0800, Andy Lutomirski wrote: >>> On Thu, Feb 5, 2015 at 3:49 PM, Kees Cook <keescook@chromium.org> wrote: >>> > On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >> [...] >>> >> There is a clear difference: before these changes, SECCOMP_RET_ERRNO used >>> >> to keep the syscall number unchanged and suppress syscall-exit-stop event, >>> >> which was awful because userspace cannot distinguish syscall-enter-stop >>> >> from syscall-exit-stop and therefore relies on the kernel that >>> >> syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). >>> >> >>> >> After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop >>> >> events to be suppressed, but now the syscall number is lost. >>> > >>> > Ah-ha! Okay, thanks, I understand now. I think this means seccomp >>> > phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you >>> > think here? >>> >>> I still don't quite see how this change caused this. >> >> I have a test for this at >> http://sourceforge.net/p/strace/code/ci/HEAD/~/tree/test/seccomp.c >> >>> I can play with >>> it a bit more. But RET_ERRNO *has* to be some kind of skip event, >>> because it needs to skip the syscall. >>> >>> We could change this by treating RET_ERRNO as an instruction to enter >>> phase 2 and then asking for a skip in phase 2 without changing >>> orig_ax, but IMO this is pretty ugly. >>> >>> I think this all kind of sucks. We're trying to run ptrace after >>> seccomp, so ptrace is seeing the syscalls as transformed by seccomp. >>> That means that if we use RET_TRAP, then ptrace will see the >>> possibly-modified syscall, if we use RET_ERRNO, then ptrace is (IMO >>> correctly given the current design) showing syscall -1, and if we use >>> RET_KILL, then ptrace just sees the process mysteriously die. >> >> Userspace is usually not prepared to see syscall -1. >> For example, strace had to be patched, otherwise it just skipped such >> syscalls as "not a syscall" events or did other improper things: >> http://sourceforge.net/p/strace/code/ci/c3948327717c29b10b5e00a436dc138b4ab1a486 >> http://sourceforge.net/p/strace/code/ci/8e398b6c4020fb2d33a5b3e40271ebf63199b891 >> > > The x32 thing is a legit ABI bug, I'd argue. I'd be happy to submit a > patch to fix that (clear the x32 bit if we're not x32). > >> A slightly different but related story: userspace is also not prepared >> to handle large errno values produced by seccomp filters like this: >> BPF_STMT(BPF_RET, SECCOMP_RET_ERRNO | SECCOMP_RET_DATA) >> >> For example, glibc assumes that syscalls do not return errno values greater than 0xfff: >> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sysdep.h#l55 >> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/syscall.S#l20 To save others the link reading: "Linus said he will make sure the no syscall returns a value in -1 .. -4095 as a valid result so we can savely test with -4095." Strictly speaking (ISO C, "man 3 errno"), errno is supposed to be a full int, though digging around I find this in include/linux/err.h: /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) But no architecture overrides this. >> If it isn't too late, I'd recommend changing SECCOMP_RET_DATA mask >> applied in SECCOMP_RET_ERRNO case from current 0xffff to 0xfff. I'm not opposed to this. I would want to more explicitly document the 4095 max value in man pages, though. > I think this is solidly in the "don't do that" category. Seccomp lets > you tamper with syscalls. If you tamper wrong, then you lose. > > Kees, what do you think about reversing the whole thing so that > ptracers always see the original syscall? What do you mean by "reversing"? The interactions I see here are: PTRACE_SYSCALL SECCOMP_RET_ERRNO SECCOMP_RET_TRACE SECCOMP_RET_TRAP Both ptrace and seccomp will trigger via _TIF_WORK_SYSCALL_ENTRY. Only ptrace will trigger via _TIF_WORK_SYSCALL_EXIT. For SECCOMP_RET_ERRNO to work, we must skip the syscall, as mentioned earlier: arch/x86/kernel/entry_32.S ... syscall_trace_entry: movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax call syscall_trace_enter /* What it returned is what we'll actually use. */ cmpl $(NR_syscalls), %eax jnae syscall_call jmp syscall_exit END(syscall_trace_entry) Both before and after the 2-phase change, syscall_trace_enter would return -1 if it hit SECCOMP_RET_ERRNO, before calling tracehook_report_syscall_entry. On exit, if PTRACE_SYSCALL, we'd hit tracehook_report_syscall_exit during syscall_trace_leave, which means a ptracer will see a syscall-exit-stop without a matching syscall-enter-stop. Using SECCOMP_RET_TRACE with PTRACE_SYSCALL in place seems totally crazy, as the ptracer would need to be the same program, and if it chose to skip a syscall, it would be in the same place: it would see PTRACE_EVENT_SECCOMP, then no syscall-enter-stop, then a syscall-exit-stop. I think we can ignore this pathological case. Using SECCOMP_RET_TRAP with PTRACE_SYSCALL also results in a skip, which produces the same "only syscall-exit-stop seen" problem. In the SECCOMP_RET_ERRNO case, the syscall nr doesn't change (and isn't executed). In the SECCOMP_RET_TRAP case, the syscall nr doesn't change (and isn't executed). In the SECCOMP_RET_TRACE, the syscall nr _could_ change, but the ptracer would be doing it, so the crazy situation around PTRACE_SYSCALL is probably safe to ignore (as long as we document what is expected to happen). So, the question is: should PTRACE_SYSCALL see a syscall that is _not_ being executed (due to seccomp)? Audit doesn't see it currently, and neither does ptrace. I would argue that it should continue to not see the syscall. That said, if it shouldn't be shown, we also shouldn't trigger syscall-exit-stop. If you can convince me it should see syscall-enter-stop, then I have two questions: 1) Do we accept that a ptracer can interfere with SECCOMP_RET_ERRNO? I think we probably must, since it can already interfere via syscall-exit-stop and change the errno. And especially since a ptracer can change syscalls during syscall-enter-stop to any syscall it wants, bypassing seccomp. This condition is already documented. 2) What do we do with audit? Suddenly we have ptrace seeing a syscall that audit doesn't? And an unrelated thought: 3) Can't we find some way to fix the inability of a ptracer to distinguish between syscall-enter-stop and syscall-exit-stop? -Kees
On Fri, Feb 6, 2015 at 11:23 AM, Kees Cook <keescook@chromium.org> wrote: > On Thu, Feb 5, 2015 at 6:38 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Thu, Feb 5, 2015 at 6:32 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>> On Thu, Feb 05, 2015 at 04:09:06PM -0800, Andy Lutomirski wrote: >>>> On Thu, Feb 5, 2015 at 3:49 PM, Kees Cook <keescook@chromium.org> wrote: >>>> > On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>> [...] >>>> >> There is a clear difference: before these changes, SECCOMP_RET_ERRNO used >>>> >> to keep the syscall number unchanged and suppress syscall-exit-stop event, >>>> >> which was awful because userspace cannot distinguish syscall-enter-stop >>>> >> from syscall-exit-stop and therefore relies on the kernel that >>>> >> syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). >>>> >> >>>> >> After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop >>>> >> events to be suppressed, but now the syscall number is lost. >>>> > >>>> > Ah-ha! Okay, thanks, I understand now. I think this means seccomp >>>> > phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you >>>> > think here? >>>> >>>> I still don't quite see how this change caused this. >>> >>> I have a test for this at >>> http://sourceforge.net/p/strace/code/ci/HEAD/~/tree/test/seccomp.c >>> >>>> I can play with >>>> it a bit more. But RET_ERRNO *has* to be some kind of skip event, >>>> because it needs to skip the syscall. >>>> >>>> We could change this by treating RET_ERRNO as an instruction to enter >>>> phase 2 and then asking for a skip in phase 2 without changing >>>> orig_ax, but IMO this is pretty ugly. >>>> >>>> I think this all kind of sucks. We're trying to run ptrace after >>>> seccomp, so ptrace is seeing the syscalls as transformed by seccomp. >>>> That means that if we use RET_TRAP, then ptrace will see the >>>> possibly-modified syscall, if we use RET_ERRNO, then ptrace is (IMO >>>> correctly given the current design) showing syscall -1, and if we use >>>> RET_KILL, then ptrace just sees the process mysteriously die. >>> >>> Userspace is usually not prepared to see syscall -1. >>> For example, strace had to be patched, otherwise it just skipped such >>> syscalls as "not a syscall" events or did other improper things: >>> http://sourceforge.net/p/strace/code/ci/c3948327717c29b10b5e00a436dc138b4ab1a486 >>> http://sourceforge.net/p/strace/code/ci/8e398b6c4020fb2d33a5b3e40271ebf63199b891 >>> >> >> The x32 thing is a legit ABI bug, I'd argue. I'd be happy to submit a >> patch to fix that (clear the x32 bit if we're not x32). >> >>> A slightly different but related story: userspace is also not prepared >>> to handle large errno values produced by seccomp filters like this: >>> BPF_STMT(BPF_RET, SECCOMP_RET_ERRNO | SECCOMP_RET_DATA) >>> >>> For example, glibc assumes that syscalls do not return errno values greater than 0xfff: >>> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sysdep.h#l55 >>> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/syscall.S#l20 > > To save others the link reading: "Linus said he will make sure the no > syscall returns a value in -1 .. -4095 as a valid result so we can > savely test with -4095." > > Strictly speaking (ISO C, "man 3 errno"), errno is supposed to be a > full int, though digging around I find this in include/linux/err.h: > > /* > * Kernel pointers have redundant information, so we can use a > * scheme where we can return either an error code or a normal > * pointer with the same return value. > * > * This should be a per-architecture thing, to allow different > * error and pointer decisions. > */ > #define MAX_ERRNO 4095 > > #ifndef __ASSEMBLY__ > > #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) > > But no architecture overrides this. > >>> If it isn't too late, I'd recommend changing SECCOMP_RET_DATA mask >>> applied in SECCOMP_RET_ERRNO case from current 0xffff to 0xfff. > > I'm not opposed to this. I would want to more explicitly document the > 4095 max value in man pages, though. > >> I think this is solidly in the "don't do that" category. Seccomp lets >> you tamper with syscalls. If you tamper wrong, then you lose. >> >> Kees, what do you think about reversing the whole thing so that >> ptracers always see the original syscall? > > What do you mean by "reversing"? The interactions I see here are: > > PTRACE_SYSCALL > SECCOMP_RET_ERRNO > SECCOMP_RET_TRACE > SECCOMP_RET_TRAP > > Both ptrace and seccomp will trigger via _TIF_WORK_SYSCALL_ENTRY. Only > ptrace will trigger via _TIF_WORK_SYSCALL_EXIT. > > For SECCOMP_RET_ERRNO to work, we must skip the syscall, as mentioned earlier: > > arch/x86/kernel/entry_32.S ... > syscall_trace_entry: > movl $-ENOSYS,PT_EAX(%esp) > movl %esp, %eax > call syscall_trace_enter > /* What it returned is what we'll actually use. */ > cmpl $(NR_syscalls), %eax > jnae syscall_call > jmp syscall_exit > END(syscall_trace_entry) > > Both before and after the 2-phase change, syscall_trace_enter would > return -1 if it hit SECCOMP_RET_ERRNO, before calling > tracehook_report_syscall_entry. On exit, if PTRACE_SYSCALL, we'd hit > tracehook_report_syscall_exit during syscall_trace_leave, which means > a ptracer will see a syscall-exit-stop without a matching > syscall-enter-stop. > > Using SECCOMP_RET_TRACE with PTRACE_SYSCALL in place seems totally > crazy, as the ptracer would need to be the same program, and if it > chose to skip a syscall, it would be in the same place: it would see > PTRACE_EVENT_SECCOMP, then no syscall-enter-stop, then a > syscall-exit-stop. I think we can ignore this pathological case. > > Using SECCOMP_RET_TRAP with PTRACE_SYSCALL also results in a skip, > which produces the same "only syscall-exit-stop seen" problem. > > In the SECCOMP_RET_ERRNO case, the syscall nr doesn't change (and > isn't executed). In the SECCOMP_RET_TRAP case, the syscall nr doesn't > change (and isn't executed). In the SECCOMP_RET_TRACE, the syscall nr > _could_ change, but the ptracer would be doing it, so the crazy > situation around PTRACE_SYSCALL is probably safe to ignore (as long as > we document what is expected to happen). > > So, the question is: should PTRACE_SYSCALL see a syscall that is _not_ > being executed (due to seccomp)? Audit doesn't see it currently, and > neither does ptrace. I would argue that it should continue to not see > the syscall. That said, if it shouldn't be shown, we also shouldn't > trigger syscall-exit-stop. If you can convince me it should see > syscall-enter-stop, then I have two questions: I think PTRACE_SYSCALL should see syscalls that are skipped due to seccomp. I think that the exit event should see the modified errno, if any, so that strace will show whatever the traced process thinks is happening. > > 1) Do we accept that a ptracer can interfere with SECCOMP_RET_ERRNO? I > think we probably must, since it can already interfere via > syscall-exit-stop and change the errno. I think this is fine. > And especially since a ptracer > can change syscalls during syscall-enter-stop to any syscall it wants, > bypassing seccomp. This condition is already documented. If a ptracer (using PTRACE_SYSCALL) were to get the entry callback before seccomp, then this oddity would go away, which might be a good thing. A ptracer could change the syscall, but seccomp would based on what the ptracer changed the syscall to. > > 2) What do we do with audit? Suddenly we have ptrace seeing a syscall > that audit doesn't? Is this a problem? I'd be amazed if program uses both ptrace and audit -- after all, audit is a global thing, and it only has one implementation (AFAIK): auditd. auditd doesn't ptrace the world. > > And an unrelated thought: > > 3) Can't we find some way to fix the inability of a ptracer to > distinguish between syscall-enter-stop and syscall-exit-stop? > Couldn't we add PTRACE_O_TRACESYSENTRY and PTRACE_O_TRACESYSEXIT along the lines of PTRACE_O_TRACESYSGOOD? --Andy
On Fri, Feb 6, 2015 at 11:32 AM, Andy Lutomirski <luto@amacapital.net> wrote: > On Fri, Feb 6, 2015 at 11:23 AM, Kees Cook <keescook@chromium.org> wrote: >> On Thu, Feb 5, 2015 at 6:38 PM, Andy Lutomirski <luto@amacapital.net> wrote: >>> On Thu, Feb 5, 2015 at 6:32 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>>> On Thu, Feb 05, 2015 at 04:09:06PM -0800, Andy Lutomirski wrote: >>>>> On Thu, Feb 5, 2015 at 3:49 PM, Kees Cook <keescook@chromium.org> wrote: >>>>> > On Thu, Feb 5, 2015 at 3:39 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: >>>> [...] >>>>> >> There is a clear difference: before these changes, SECCOMP_RET_ERRNO used >>>>> >> to keep the syscall number unchanged and suppress syscall-exit-stop event, >>>>> >> which was awful because userspace cannot distinguish syscall-enter-stop >>>>> >> from syscall-exit-stop and therefore relies on the kernel that >>>>> >> syscall-enter-stop is followed by syscall-exit-stop (or tracee's death, etc.). >>>>> >> >>>>> >> After these changes, SECCOMP_RET_ERRNO no longer causes syscall-exit-stop >>>>> >> events to be suppressed, but now the syscall number is lost. >>>>> > >>>>> > Ah-ha! Okay, thanks, I understand now. I think this means seccomp >>>>> > phase1 should not treat RET_ERRNO as a "skip" event. Andy, what do you >>>>> > think here? >>>>> >>>>> I still don't quite see how this change caused this. >>>> >>>> I have a test for this at >>>> http://sourceforge.net/p/strace/code/ci/HEAD/~/tree/test/seccomp.c >>>> >>>>> I can play with >>>>> it a bit more. But RET_ERRNO *has* to be some kind of skip event, >>>>> because it needs to skip the syscall. >>>>> >>>>> We could change this by treating RET_ERRNO as an instruction to enter >>>>> phase 2 and then asking for a skip in phase 2 without changing >>>>> orig_ax, but IMO this is pretty ugly. >>>>> >>>>> I think this all kind of sucks. We're trying to run ptrace after >>>>> seccomp, so ptrace is seeing the syscalls as transformed by seccomp. >>>>> That means that if we use RET_TRAP, then ptrace will see the >>>>> possibly-modified syscall, if we use RET_ERRNO, then ptrace is (IMO >>>>> correctly given the current design) showing syscall -1, and if we use >>>>> RET_KILL, then ptrace just sees the process mysteriously die. >>>> >>>> Userspace is usually not prepared to see syscall -1. >>>> For example, strace had to be patched, otherwise it just skipped such >>>> syscalls as "not a syscall" events or did other improper things: >>>> http://sourceforge.net/p/strace/code/ci/c3948327717c29b10b5e00a436dc138b4ab1a486 >>>> http://sourceforge.net/p/strace/code/ci/8e398b6c4020fb2d33a5b3e40271ebf63199b891 >>>> >>> >>> The x32 thing is a legit ABI bug, I'd argue. I'd be happy to submit a >>> patch to fix that (clear the x32 bit if we're not x32). >>> >>>> A slightly different but related story: userspace is also not prepared >>>> to handle large errno values produced by seccomp filters like this: >>>> BPF_STMT(BPF_RET, SECCOMP_RET_ERRNO | SECCOMP_RET_DATA) >>>> >>>> For example, glibc assumes that syscalls do not return errno values greater than 0xfff: >>>> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sysdep.h#l55 >>>> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/syscall.S#l20 >> >> To save others the link reading: "Linus said he will make sure the no >> syscall returns a value in -1 .. -4095 as a valid result so we can >> savely test with -4095." >> >> Strictly speaking (ISO C, "man 3 errno"), errno is supposed to be a >> full int, though digging around I find this in include/linux/err.h: >> >> /* >> * Kernel pointers have redundant information, so we can use a >> * scheme where we can return either an error code or a normal >> * pointer with the same return value. >> * >> * This should be a per-architecture thing, to allow different >> * error and pointer decisions. >> */ >> #define MAX_ERRNO 4095 >> >> #ifndef __ASSEMBLY__ >> >> #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) >> >> But no architecture overrides this. >> >>>> If it isn't too late, I'd recommend changing SECCOMP_RET_DATA mask >>>> applied in SECCOMP_RET_ERRNO case from current 0xffff to 0xfff. >> >> I'm not opposed to this. I would want to more explicitly document the >> 4095 max value in man pages, though. >> >>> I think this is solidly in the "don't do that" category. Seccomp lets >>> you tamper with syscalls. If you tamper wrong, then you lose. >>> >>> Kees, what do you think about reversing the whole thing so that >>> ptracers always see the original syscall? >> >> What do you mean by "reversing"? The interactions I see here are: >> >> PTRACE_SYSCALL >> SECCOMP_RET_ERRNO >> SECCOMP_RET_TRACE >> SECCOMP_RET_TRAP >> >> Both ptrace and seccomp will trigger via _TIF_WORK_SYSCALL_ENTRY. Only >> ptrace will trigger via _TIF_WORK_SYSCALL_EXIT. >> >> For SECCOMP_RET_ERRNO to work, we must skip the syscall, as mentioned earlier: >> >> arch/x86/kernel/entry_32.S ... >> syscall_trace_entry: >> movl $-ENOSYS,PT_EAX(%esp) >> movl %esp, %eax >> call syscall_trace_enter >> /* What it returned is what we'll actually use. */ >> cmpl $(NR_syscalls), %eax >> jnae syscall_call >> jmp syscall_exit >> END(syscall_trace_entry) >> >> Both before and after the 2-phase change, syscall_trace_enter would >> return -1 if it hit SECCOMP_RET_ERRNO, before calling >> tracehook_report_syscall_entry. On exit, if PTRACE_SYSCALL, we'd hit >> tracehook_report_syscall_exit during syscall_trace_leave, which means >> a ptracer will see a syscall-exit-stop without a matching >> syscall-enter-stop. >> >> Using SECCOMP_RET_TRACE with PTRACE_SYSCALL in place seems totally >> crazy, as the ptracer would need to be the same program, and if it >> chose to skip a syscall, it would be in the same place: it would see >> PTRACE_EVENT_SECCOMP, then no syscall-enter-stop, then a >> syscall-exit-stop. I think we can ignore this pathological case. >> >> Using SECCOMP_RET_TRAP with PTRACE_SYSCALL also results in a skip, >> which produces the same "only syscall-exit-stop seen" problem. >> >> In the SECCOMP_RET_ERRNO case, the syscall nr doesn't change (and >> isn't executed). In the SECCOMP_RET_TRAP case, the syscall nr doesn't >> change (and isn't executed). In the SECCOMP_RET_TRACE, the syscall nr >> _could_ change, but the ptracer would be doing it, so the crazy >> situation around PTRACE_SYSCALL is probably safe to ignore (as long as >> we document what is expected to happen). >> >> So, the question is: should PTRACE_SYSCALL see a syscall that is _not_ >> being executed (due to seccomp)? Audit doesn't see it currently, and >> neither does ptrace. I would argue that it should continue to not see >> the syscall. That said, if it shouldn't be shown, we also shouldn't >> trigger syscall-exit-stop. If you can convince me it should see >> syscall-enter-stop, then I have two questions: > > I think PTRACE_SYSCALL should see syscalls that are skipped due to > seccomp. I think that the exit event should see the modified errno, > if any, so that strace will show whatever the traced process thinks is > happening. > >> >> 1) Do we accept that a ptracer can interfere with SECCOMP_RET_ERRNO? I >> think we probably must, since it can already interfere via >> syscall-exit-stop and change the errno. > > I think this is fine. > >> And especially since a ptracer >> can change syscalls during syscall-enter-stop to any syscall it wants, >> bypassing seccomp. This condition is already documented. > > If a ptracer (using PTRACE_SYSCALL) were to get the entry callback > before seccomp, then this oddity would go away, which might be a good > thing. A ptracer could change the syscall, but seccomp would based on > what the ptracer changed the syscall to. I want kill events to trigger immediately. I don't want to leave the ptrace surface available on a SECCOMP_RET_KILL. So maybe it can be seccomp phase 1, then ptrace, then seccomp phase 2? And pass more information between phases to determine how things should behave beyond just "skip"? >> 2) What do we do with audit? Suddenly we have ptrace seeing a syscall >> that audit doesn't? > > Is this a problem? I'd be amazed if program uses both ptrace and > audit -- after all, audit is a global thing, and it only has one > implementation (AFAIK): auditd. auditd doesn't ptrace the world. > >> >> And an unrelated thought: >> >> 3) Can't we find some way to fix the inability of a ptracer to >> distinguish between syscall-enter-stop and syscall-exit-stop? >> > > Couldn't we add PTRACE_O_TRACESYSENTRY and PTRACE_O_TRACESYSEXIT along > the lines of PTRACE_O_TRACESYSGOOD? That might be a nice idea. I haven't written a test to see, but what does PTRACE_GETEVENTMSG return on syscall-enter/exit-stop? If we can't add something there, then yeah, adding PTRACE_O_TRACESYSENTRY and PTRACE_O_TRACESYSEXIT with their own event msgs would be nice. Could even add the syscall nr to the event msg so ptracers don't have to dig around in per-arch registers, too. -Kees
On 02/06/2015 11:23 AM, Kees Cook wrote: > > Strictly speaking (ISO C, "man 3 errno"), errno is supposed to be a > full int, though digging around I find this in include/linux/err.h: > That doesn't mean the kernel has to support them. > /* > * Kernel pointers have redundant information, so we can use a > * scheme where we can return either an error code or a normal > * pointer with the same return value. > * > * This should be a per-architecture thing, to allow different > * error and pointer decisions. > */ > #define MAX_ERRNO 4095 > > #ifndef __ASSEMBLY__ > > #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) > > But no architecture overrides this. > We used to have a much lower value, that was per-architecture, in order to optimize the resulting assembly (e.g. 8-bit immediates on x86). This didn't work as the number of errnos increased. The other motivation was probably binary compatibility with other Unices, which was an idea for a while. -hpa
On Fri, Feb 6, 2015 at 12:07 PM, Kees Cook <keescook@chromium.org> wrote: > On Fri, Feb 6, 2015 at 11:32 AM, Andy Lutomirski <luto@amacapital.net> wrote: >> On Fri, Feb 6, 2015 at 11:23 AM, Kees Cook <keescook@chromium.org> wrote: >>> And especially since a ptracer >>> can change syscalls during syscall-enter-stop to any syscall it wants, >>> bypassing seccomp. This condition is already documented. >> >> If a ptracer (using PTRACE_SYSCALL) were to get the entry callback >> before seccomp, then this oddity would go away, which might be a good >> thing. A ptracer could change the syscall, but seccomp would based on >> what the ptracer changed the syscall to. > > I want kill events to trigger immediately. I don't want to leave the > ptrace surface available on a SECCOMP_RET_KILL. So maybe it can be > seccomp phase 1, then ptrace, then seccomp phase 2? And pass more > information between phases to determine how things should behave > beyond just "skip"? I thought so too, originally, but I'm far less convinced now, for two reasons: 1. I think that a lot of filters these days use RET_ERRNO heavily, so this won't benefit them. 2. I'm not convinced it really reduces the attack surface for anyone. Unless your filter is literally "return SECCOMP_RET_KILL", then the seccomp-filtered task can always cause the ptracer to get a pair of syscall notifications. Also, the task can send itself signals (using page faults, breakpoints, etc) and cause ptrace events via other paths. --Andy
On Fri, Feb 06, 2015 at 12:07:03PM -0800, Kees Cook wrote: > On Fri, Feb 6, 2015 at 11:32 AM, Andy Lutomirski <luto@amacapital.net> wrote: > > On Fri, Feb 6, 2015 at 11:23 AM, Kees Cook <keescook@chromium.org> wrote: [...] > >> And an unrelated thought: > >> > >> 3) Can't we find some way to fix the inability of a ptracer to > >> distinguish between syscall-enter-stop and syscall-exit-stop? > > > > Couldn't we add PTRACE_O_TRACESYSENTRY and PTRACE_O_TRACESYSEXIT along > > the lines of PTRACE_O_TRACESYSGOOD? > > That might be a nice idea. I haven't written a test to see, but what > does PTRACE_GETEVENTMSG return on syscall-enter/exit-stop? The value returned by PTRACE_GETEVENTMSG is the value set along with the latest PTRACE_EVENT_*. In case of syscall-enter/exit-stop (which is not a PTRACE_EVENT_*), there is no particular value set for PTRACE_GETEVENTMSG.
On Fri, Feb 6, 2015 at 3:17 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > On Fri, Feb 06, 2015 at 12:07:03PM -0800, Kees Cook wrote: >> On Fri, Feb 6, 2015 at 11:32 AM, Andy Lutomirski <luto@amacapital.net> wrote: >> > On Fri, Feb 6, 2015 at 11:23 AM, Kees Cook <keescook@chromium.org> wrote: > [...] >> >> And an unrelated thought: >> >> >> >> 3) Can't we find some way to fix the inability of a ptracer to >> >> distinguish between syscall-enter-stop and syscall-exit-stop? >> > >> > Couldn't we add PTRACE_O_TRACESYSENTRY and PTRACE_O_TRACESYSEXIT along >> > the lines of PTRACE_O_TRACESYSGOOD? >> >> That might be a nice idea. I haven't written a test to see, but what >> does PTRACE_GETEVENTMSG return on syscall-enter/exit-stop? > > The value returned by PTRACE_GETEVENTMSG is the value set along with the > latest PTRACE_EVENT_*. > In case of syscall-enter/exit-stop (which is not a PTRACE_EVENT_*), > there is no particular value set for PTRACE_GETEVENTMSG. Could we define one to help distinguish? -Kees
On Fri, Feb 06, 2015 at 05:07:41PM -0800, Kees Cook wrote: > On Fri, Feb 6, 2015 at 3:17 PM, Dmitry V. Levin <ldv@altlinux.org> wrote: > > On Fri, Feb 06, 2015 at 12:07:03PM -0800, Kees Cook wrote: > >> On Fri, Feb 6, 2015 at 11:32 AM, Andy Lutomirski <luto@amacapital.net> wrote: > >> > On Fri, Feb 6, 2015 at 11:23 AM, Kees Cook <keescook@chromium.org> wrote: > > [...] > >> >> And an unrelated thought: > >> >> > >> >> 3) Can't we find some way to fix the inability of a ptracer to > >> >> distinguish between syscall-enter-stop and syscall-exit-stop? > >> > > >> > Couldn't we add PTRACE_O_TRACESYSENTRY and PTRACE_O_TRACESYSEXIT along > >> > the lines of PTRACE_O_TRACESYSGOOD? > >> > >> That might be a nice idea. I haven't written a test to see, but what > >> does PTRACE_GETEVENTMSG return on syscall-enter/exit-stop? > > > > The value returned by PTRACE_GETEVENTMSG is the value set along with the > > latest PTRACE_EVENT_*. > > In case of syscall-enter/exit-stop (which is not a PTRACE_EVENT_*), > > there is no particular value set for PTRACE_GETEVENTMSG. > > Could we define one to help distinguish? I suppose we could define one, but performing extra PTRACE_GETEVENTMSG for every syscall-stop may be too expensive. For example, strace makes about 4.5 syscalls per syscall-stop. The minimum is 4 syscalls: wait4, PTRACE_GETREGSET, write, and PTRACE_SYSCALL; processing some syscall-stops may require additional process_vm_readv calls. That is, forcing strace to make extra PTRACE_GETEVENTMSG per syscall-stop would result to about 20% more syscalls per syscall-stop, that is a noticeable cost. A better alternative is to define an event that wouldn't require this extra PTRACE_GETEVENTMSG per syscall-stop. For example, it could be a PTRACE_EVENT_SYSCALL_ENTRY and/or PTRACE_EVENT_SYSCALL_EXIT. In practice, adding just one of these two events would be enough to distinguish two kinds of syscall-stops. Adding two events would look less surprising, though. If the decision would be to add both events, I'd recommend adding just one new option to cover both events - there is a room only for 32 different PTRACE_O_* options.
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6205f0c434db..86fc2bb82287 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); + +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, + unsigned long phase1_result); + extern long syscall_trace_enter(struct pt_regs *); extern void syscall_trace_leave(struct pt_regs *); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index bbf338a04a5d..29576c244699 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, force_sig_info(SIGTRAP, &info, tsk); } +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(arch, regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(arch, regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + /* - * We must return the syscall number to actually look up in the table. - * This can be -1L to skip running any syscall at all. + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp */ -long syscall_trace_enter(struct pt_regs *regs) +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) { - long ret = 0; + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; /* * If TIF_NOHZ is set, we are required to call user_exit() before * doing anything that could touch RCU. */ - if (test_thread_flag(TIF_NOHZ)) + if (work & _TIF_NOHZ) { user_exit(); + work &= ~TIF_NOHZ; + } + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) +{ + long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + BUG_ON(regs != task_pt_regs(current)); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1463,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs) * do_debug() and we need to set it again to restore the user * state. If we entered on the slow path, TF was already set. */ - if (test_thread_flag(TIF_SINGLESTEP)) + if (work & _TIF_SINGLESTEP) regs->flags |= X86_EFLAGS_TF; - /* do the secure computing check first */ - if (secure_computing()) { +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { /* seccomp failures shouldn't expose any additional code. */ - ret = -1L; - goto out; + return -1; } +#endif - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + if (unlikely(work & _TIF_SYSCALL_EMU)) ret = -1L; if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && @@ -1483,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (is_ia32_task()) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); -#ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); -#endif + do_audit_syscall_entry(regs, arch); -out: return ret ?: regs->orig_ax; } +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + void syscall_trace_leave(struct pt_regs *regs) { bool step;
This splits syscall_trace_enter into syscall_trace_enter_phase1 and syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only phase 2 is permitted to modify any of pt_regs except for orig_ax. The intent is that phase 1 can be called from the syscall fast path. In this implementation, phase1 can handle any combination of TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT, unless seccomp requests a ptrace event, in which case phase2 is forced. In principle, this could yield a big speedup for TIF_NOHZ as well as for TIF_SECCOMP if syscall exit work were similarly split up. Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- arch/x86/include/asm/ptrace.h | 5 ++ arch/x86/kernel/ptrace.c | 157 +++++++++++++++++++++++++++++++++++------- 2 files changed, 138 insertions(+), 24 deletions(-)