Message ID | bd4e2efb7cd97f2bf9d4f1e2065f16c9091d799a.1405452484.git.luto@amacapital.net (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Jul 15, 2014 at 12:32 PM, Andy Lutomirski <luto@amacapital.net> wrote: > This splits syscall_trace_enter into syscall_trace_enter_phase1 and > syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only > phase 2 is permitted to modify any of pt_regs except for orig_ax. > > The intent is that phase 1 can be called from the syscall fast path. > > Signed-off-by: Andy Lutomirski <luto@amacapital.net> > --- > arch/x86/include/asm/ptrace.h | 5 ++ > arch/x86/kernel/ptrace.c | 139 ++++++++++++++++++++++++++++++++++++------ > 2 files changed, 125 insertions(+), 19 deletions(-) > > diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h > index 14fd6fd..dcbfb49 100644 > --- a/arch/x86/include/asm/ptrace.h > +++ b/arch/x86/include/asm/ptrace.h > @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); > extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, > int error_code, int si_code); > > + > +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); > +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, > + unsigned long phase1_result); > + > extern long syscall_trace_enter(struct pt_regs *); > extern void syscall_trace_leave(struct pt_regs *); > > diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c > index 39296d2..8e05418 100644 > --- a/arch/x86/kernel/ptrace.c > +++ b/arch/x86/kernel/ptrace.c > @@ -1441,13 +1441,111 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, > force_sig_info(SIGTRAP, &info, tsk); > } > > +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) > +{ > + if (arch == AUDIT_ARCH_X86_64) { > + audit_syscall_entry(arch, regs->orig_ax, regs->di, > + regs->si, regs->dx, regs->r10); > + } else { > + audit_syscall_entry(arch, regs->orig_ax, regs->bx, > + regs->cx, regs->dx, regs->si); > + } > +} > + > /* > - * We must return the syscall number to actually look up in the table. > - * This can be -1L to skip running any syscall at all. > + * We can return 0 to resume the syscall or anything else to go to phase > + * 2. If we resume the syscall, we need to put something appropriate in > + * regs->orig_ax. > + * > + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax > + * are fully functional. > + * > + * For phase 2's benefit, our return value is: > + * 0: resume the syscall > + * 1: go to phase 2; no seccomp phase 2 needed > + * 2: go to phase 2; pass return value to seccomp > */ > -long syscall_trace_enter(struct pt_regs *regs) > +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) > +{ > + unsigned long ret = 0; > + u32 work; > + > + BUG_ON(regs != task_pt_regs(current)); > + > + work = ACCESS_ONCE(current_thread_info()->flags) & > + _TIF_WORK_SYSCALL_ENTRY; > + > +#ifdef CONFIG_SECCOMP > + /* > + * Do seccomp first -- it should minimize exposure of other > + * code, and keeping seccomp fast is probably more valuable > + * than the rest of this. > + */ > + if (work & _TIF_SECCOMP) { > + struct seccomp_data sd; > + > + sd.arch = arch; > + sd.nr = regs->orig_ax; > + sd.instruction_pointer = regs->ip; > + if (arch == AUDIT_ARCH_X86_64) { > + sd.args[0] = regs->di; > + sd.args[1] = regs->si; > + sd.args[2] = regs->dx; > + sd.args[3] = regs->r10; > + sd.args[4] = regs->r8; > + sd.args[5] = regs->r9; > + } else { > + sd.args[0] = regs->bx; > + sd.args[1] = regs->cx; > + sd.args[2] = regs->dx; > + sd.args[3] = regs->si; > + sd.args[4] = regs->di; > + sd.args[5] = regs->bp; > + } > + > + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); > + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); > + > + ret = seccomp_phase1(&sd); > + if (ret == SECCOMP_PHASE1_SKIP) { > + regs->orig_ax = -ENOSYS; Before, seccomp didn't touch orig_ax on a skip. I don't see any problem with this, and it's probably more clear this way, but are you sure there aren't unexpected side-effects from this? -Kees > + ret = 0; > + } else if (ret != SECCOMP_PHASE1_OK) { > + return ret; /* Go directly to phase 2 */ > + } > + > + work &= ~_TIF_SECCOMP; > + } > +#endif > + > + /* Do our best to finish without phase 2. */ > + if (work == 0) > + return ret; /* seccomp only (ret == 0 here) */ > + > +#ifdef CONFIG_AUDITSYSCALL > + if (work == _TIF_SYSCALL_AUDIT) { > + /* > + * If there is no more work to be done except auditing, > + * then audit in phase 1. Phase 2 always audits, so, if > + * we audit here, then we can't go on to phase 2. > + */ > + do_audit_syscall_entry(regs, arch); > + return 0; > + } > +#endif > + > + return 1; /* Something is enabled that we can't handle in phase 1 */ > +} > + > +/* Returns the syscall nr to run (which should match regs->orig_ax). */ > +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, > + unsigned long phase1_result) > { > long ret = 0; > + u32 work = ACCESS_ONCE(current_thread_info()->flags) & > + _TIF_WORK_SYSCALL_ENTRY; > + > + BUG_ON(regs != task_pt_regs(current)); > > user_exit(); > > @@ -1458,17 +1556,20 @@ long syscall_trace_enter(struct pt_regs *regs) > * do_debug() and we need to set it again to restore the user > * state. If we entered on the slow path, TF was already set. > */ > - if (test_thread_flag(TIF_SINGLESTEP)) > + if (work & _TIF_SINGLESTEP) > regs->flags |= X86_EFLAGS_TF; > > - /* do the secure computing check first */ > - if (secure_computing()) { > + /* > + * Call seccomp_phase2 before running the other hooks so that > + * they can see any changes made by a seccomp tracer. > + */ > + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { > /* seccomp failures shouldn't expose any additional code. */ > ret = -1L; > goto out; > } > > - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) > + if (unlikely(work & _TIF_SYSCALL_EMU)) > ret = -1L; > > if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && > @@ -1478,23 +1579,23 @@ long syscall_trace_enter(struct pt_regs *regs) > if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) > trace_sys_enter(regs, regs->orig_ax); > > - if (is_ia32_task()) > - audit_syscall_entry(AUDIT_ARCH_I386, > - regs->orig_ax, > - regs->bx, regs->cx, > - regs->dx, regs->si); > -#ifdef CONFIG_X86_64 > - else > - audit_syscall_entry(AUDIT_ARCH_X86_64, > - regs->orig_ax, > - regs->di, regs->si, > - regs->dx, regs->r10); > -#endif > + do_audit_syscall_entry(regs, arch); > > out: > return ret ?: regs->orig_ax; > } > > +long syscall_trace_enter(struct pt_regs *regs) > +{ > + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; > + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); > + > + if (phase1_result == 0) > + return regs->orig_ax; > + else > + return syscall_trace_enter_phase2(regs, arch, phase1_result); > +} > + > void syscall_trace_leave(struct pt_regs *regs) > { > bool step; > -- > 1.9.3 >
On Wed, Jul 16, 2014 at 1:21 PM, Kees Cook <keescook@chromium.org> wrote: > On Tue, Jul 15, 2014 at 12:32 PM, Andy Lutomirski <luto@amacapital.net> wrote: >> >> + >> + ret = seccomp_phase1(&sd); >> + if (ret == SECCOMP_PHASE1_SKIP) { >> + regs->orig_ax = -ENOSYS; > > Before, seccomp didn't touch orig_ax on a skip. I don't see any > problem with this, and it's probably more clear this way, but are you > sure there aren't unexpected side-effects from this? It's necessary to cause the syscall to be skipped -- see syscall_trace_enter. That being said, setting it to -ENOSYS is nonsense and probably confused you at least as much as it confused me. It should be regs->orig_ax = -1. --Andy
On Wed, Jul 16, 2014 at 2:15 PM, Andy Lutomirski <luto@amacapital.net> wrote: > On Wed, Jul 16, 2014 at 1:21 PM, Kees Cook <keescook@chromium.org> wrote: >> On Tue, Jul 15, 2014 at 12:32 PM, Andy Lutomirski <luto@amacapital.net> wrote: >>> >>> + >>> + ret = seccomp_phase1(&sd); >>> + if (ret == SECCOMP_PHASE1_SKIP) { >>> + regs->orig_ax = -ENOSYS; >> >> Before, seccomp didn't touch orig_ax on a skip. I don't see any >> problem with this, and it's probably more clear this way, but are you >> sure there aren't unexpected side-effects from this? > > It's necessary to cause the syscall to be skipped -- see syscall_trace_enter. > > That being said, setting it to -ENOSYS is nonsense and probably > confused you at least as much as it confused me. It should be > regs->orig_ax = -1. Yes, I think that would be better. -Kees
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14fd6fd..dcbfb49 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); + +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, + unsigned long phase1_result); + extern long syscall_trace_enter(struct pt_regs *); extern void syscall_trace_leave(struct pt_regs *); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 39296d2..8e05418 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1441,13 +1441,111 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, force_sig_info(SIGTRAP, &info, tsk); } +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(arch, regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else { + audit_syscall_entry(arch, regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + /* - * We must return the syscall number to actually look up in the table. - * This can be -1L to skip running any syscall at all. + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * 2: go to phase 2; pass return value to seccomp */ -long syscall_trace_enter(struct pt_regs *regs) +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -ENOSYS; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) { long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + BUG_ON(regs != task_pt_regs(current)); user_exit(); @@ -1458,17 +1556,20 @@ long syscall_trace_enter(struct pt_regs *regs) * do_debug() and we need to set it again to restore the user * state. If we entered on the slow path, TF was already set. */ - if (test_thread_flag(TIF_SINGLESTEP)) + if (work & _TIF_SINGLESTEP) regs->flags |= X86_EFLAGS_TF; - /* do the secure computing check first */ - if (secure_computing()) { + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { /* seccomp failures shouldn't expose any additional code. */ ret = -1L; goto out; } - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + if (unlikely(work & _TIF_SYSCALL_EMU)) ret = -1L; if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && @@ -1478,23 +1579,23 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (is_ia32_task()) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); -#ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); -#endif + do_audit_syscall_entry(regs, arch); out: return ret ?: regs->orig_ax; } +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + void syscall_trace_leave(struct pt_regs *regs) { bool step;
This splits syscall_trace_enter into syscall_trace_enter_phase1 and syscall_trace_enter_phase2. Only phase 2 has full pt_regs, and only phase 2 is permitted to modify any of pt_regs except for orig_ax. The intent is that phase 1 can be called from the syscall fast path. Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- arch/x86/include/asm/ptrace.h | 5 ++ arch/x86/kernel/ptrace.c | 139 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 125 insertions(+), 19 deletions(-)