Message ID | 20180514094640.27569-9-mark.rutland@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, May 14, 2018 at 10:46:30AM +0100, Mark Rutland wrote: > As a first step towards invoking syscalls with a pt_regs argument, > convert the raw syscall invocation logic to C. We end up with a bit more > register shuffling, but the unified invocation logic means we can unify > the tracing paths, too. > > This only converts the invocation of the syscall. The rest of the > syscall triage and tracing is left in assembly for now, and will be > converted in subsequent patches. > > Signed-off-by: Mark Rutland <mark.rutland@arm.com> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: Will Deacon <will.deacon@arm.com> > --- > arch/arm64/kernel/Makefile | 3 ++- > arch/arm64/kernel/entry.S | 36 ++++++++++-------------------------- > arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++ > 3 files changed, 41 insertions(+), 27 deletions(-) > create mode 100644 arch/arm64/kernel/syscall.c > > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile > index bf825f38d206..c22e8ace5ea3 100644 > --- a/arch/arm64/kernel/Makefile > +++ b/arch/arm64/kernel/Makefile > @@ -18,7 +18,8 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ > hyp-stub.o psci.o cpu_ops.o insn.o \ > return_address.o cpuinfo.o cpu_errata.o \ > cpufeature.o alternative.o cacheinfo.o \ > - smp.o smp_spin_table.o topology.o smccc-call.o > + smp.o smp_spin_table.o topology.o smccc-call.o \ > + syscall.o > > extra-$(CONFIG_EFI) := efi-entry.o > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > index 08ea3cbfb08f..d6e057500eaf 100644 > --- a/arch/arm64/kernel/entry.S > +++ b/arch/arm64/kernel/entry.S > @@ -873,7 +873,6 @@ ENDPROC(el0_error) > */ > ret_fast_syscall: > disable_daif > - str x0, [sp, #S_X0] // returned x0 > ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing > and x2, x1, #_TIF_SYSCALL_WORK > cbnz x2, ret_fast_syscall_trace > @@ -946,15 +945,11 @@ el0_svc_naked: // compat entry point > > tst x16, #_TIF_SYSCALL_WORK // check for syscall hooks > b.ne __sys_trace > - cmp wscno, wsc_nr // check upper syscall limit > - b.hs ni_sys > - mask_nospec64 xscno, xsc_nr, x19 // enforce bounds for syscall number > - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table > - blr x16 // call sys_* routine > - b ret_fast_syscall > -ni_sys: > mov x0, sp > - bl do_ni_syscall > + mov w1, wscno > + mov w2, wsc_nr > + mov x3, stbl > + bl invoke_syscall > b ret_fast_syscall > ENDPROC(el0_svc) > > @@ -971,29 +966,18 @@ __sys_trace: > bl syscall_trace_enter > cmp w0, #NO_SYSCALL // skip the syscall? > b.eq __sys_trace_return_skipped > - mov wscno, w0 // syscall number (possibly new) > - mov x1, sp // pointer to regs > - cmp wscno, wsc_nr // check upper syscall limit > - b.hs __ni_sys_trace > - ldp x0, x1, [sp] // restore the syscall args > - ldp x2, x3, [sp, #S_X2] > - ldp x4, x5, [sp, #S_X4] > - ldp x6, x7, [sp, #S_X6] > - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table > - blr x16 // call sys_* routine > > -__sys_trace_return: > - str x0, [sp, #S_X0] // save returned x0 > + mov x0, sp > + mov w1, wscno > + mov w2, wsc_nr > + mov x3, stbl > + bl invoke_syscall > + > __sys_trace_return_skipped: > mov x0, sp > bl syscall_trace_exit > b ret_to_user > > -__ni_sys_trace: > - mov x0, sp > - bl do_ni_syscall > - b __sys_trace_return > - Can you explain why ni_syscall is special here, why __sys_trace_return existed, and why its disappearance doesn't break anything? Not saying there's a bug, just that I'm a little confuse -- I see no real reason for ni_syscall being special, and this may be a good opportunity to decruft it. (See also comments below.) > .popsection // .entry.text > > #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 > diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c > new file mode 100644 > index 000000000000..58d7569f47df > --- /dev/null > +++ b/arch/arm64/kernel/syscall.c > @@ -0,0 +1,29 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include <linux/nospec.h> > +#include <linux/ptrace.h> > + > +long do_ni_syscall(struct pt_regs *regs); > + > +typedef long (*syscall_fn_t)(unsigned long, unsigned long, > + unsigned long, unsigned long, > + unsigned long, unsigned long); > + > +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn) > +{ > + regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1], > + regs->regs[2], regs->regs[3], > + regs->regs[4], regs->regs[5]); > +} > + > +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr, > + syscall_fn_t syscall_table[]) > +{ > + if (scno < sc_nr) { What if (int)scno < 0? Should those args both by unsigned ints? "sc_nr" sounds too much like "syscall number" to me. Might "syscall_table_size" might be clearer? Similarly, we could have "stbl_size" or similar in the asm. This is purely cosmetic, though. > + syscall_fn_t syscall_fn; > + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; > + __invoke_syscall(regs, syscall_fn); > + } else { > + regs->regs[0] = do_ni_syscall(regs); Can we make __invoke_syscall() the universal syscall wrapper, and give do_ni_syscall() the same interface as any other syscall body? Then you could factor this as static syscall_fn_t syscall_fn(syscall_fn_t const syscall_table[], (unsigned) int scno, (unsigned) int sc_nr) { if (sc_no >= sc_nr) return sys_ni_syscall; return syscall_table[array_index_nospec(scno, sc_nr)]; } ... __invoke_syscall(regs, syscall_fn(syscall_table, scno, sc_nr); This is cosmetic too, of course. do_ni_syscall() should be given a pt_regs-based wrapper like all the rest. This is still cosmetic, but reduces unnecessary special-case-ness for ni_syscall. Cheers ---Dave
On Mon, May 14, 2018 at 12:07:18PM +0100, Dave Martin wrote: > On Mon, May 14, 2018 at 10:46:30AM +0100, Mark Rutland wrote: > > As a first step towards invoking syscalls with a pt_regs argument, > > convert the raw syscall invocation logic to C. We end up with a bit more > > register shuffling, but the unified invocation logic means we can unify > > the tracing paths, too. > > > > This only converts the invocation of the syscall. The rest of the > > syscall triage and tracing is left in assembly for now, and will be > > converted in subsequent patches. > > > > Signed-off-by: Mark Rutland <mark.rutland@arm.com> > > Cc: Catalin Marinas <catalin.marinas@arm.com> > > Cc: Will Deacon <will.deacon@arm.com> > > --- > > arch/arm64/kernel/Makefile | 3 ++- > > arch/arm64/kernel/entry.S | 36 ++++++++++-------------------------- > > arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++ > > 3 files changed, 41 insertions(+), 27 deletions(-) > > create mode 100644 arch/arm64/kernel/syscall.c > > > > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile > > index bf825f38d206..c22e8ace5ea3 100644 > > --- a/arch/arm64/kernel/Makefile > > +++ b/arch/arm64/kernel/Makefile > > @@ -18,7 +18,8 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ > > hyp-stub.o psci.o cpu_ops.o insn.o \ > > return_address.o cpuinfo.o cpu_errata.o \ > > cpufeature.o alternative.o cacheinfo.o \ > > - smp.o smp_spin_table.o topology.o smccc-call.o > > + smp.o smp_spin_table.o topology.o smccc-call.o \ > > + syscall.o > > > > extra-$(CONFIG_EFI) := efi-entry.o > > > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > > index 08ea3cbfb08f..d6e057500eaf 100644 > > --- a/arch/arm64/kernel/entry.S > > +++ b/arch/arm64/kernel/entry.S > > @@ -873,7 +873,6 @@ ENDPROC(el0_error) > > */ > > ret_fast_syscall: > > disable_daif > > - str x0, [sp, #S_X0] // returned x0 > > ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing > > and x2, x1, #_TIF_SYSCALL_WORK > > cbnz x2, ret_fast_syscall_trace > > @@ -946,15 +945,11 @@ el0_svc_naked: // compat entry point > > > > tst x16, #_TIF_SYSCALL_WORK // check for syscall hooks > > b.ne __sys_trace > > - cmp wscno, wsc_nr // check upper syscall limit > > - b.hs ni_sys > > - mask_nospec64 xscno, xsc_nr, x19 // enforce bounds for syscall number > > - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table > > - blr x16 // call sys_* routine > > - b ret_fast_syscall > > -ni_sys: > > mov x0, sp > > - bl do_ni_syscall > > + mov w1, wscno > > + mov w2, wsc_nr > > + mov x3, stbl > > + bl invoke_syscall > > b ret_fast_syscall > > ENDPROC(el0_svc) > > > > @@ -971,29 +966,18 @@ __sys_trace: > > bl syscall_trace_enter > > cmp w0, #NO_SYSCALL // skip the syscall? > > b.eq __sys_trace_return_skipped > > - mov wscno, w0 // syscall number (possibly new) > > - mov x1, sp // pointer to regs > > - cmp wscno, wsc_nr // check upper syscall limit > > - b.hs __ni_sys_trace > > - ldp x0, x1, [sp] // restore the syscall args > > - ldp x2, x3, [sp, #S_X2] > > - ldp x4, x5, [sp, #S_X4] > > - ldp x6, x7, [sp, #S_X6] > > - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table > > - blr x16 // call sys_* routine > > > > -__sys_trace_return: > > - str x0, [sp, #S_X0] // save returned x0 > > + mov x0, sp > > + mov w1, wscno > > + mov w2, wsc_nr > > + mov x3, stbl > > + bl invoke_syscall > > + > > __sys_trace_return_skipped: > > mov x0, sp > > bl syscall_trace_exit > > b ret_to_user > > > > -__ni_sys_trace: > > - mov x0, sp > > - bl do_ni_syscall > > - b __sys_trace_return > > - > > Can you explain why ni_syscall is special here, This is for out-of-range syscall numbers, instances of ni_syscall in the syscall table are handled by the regular path. When the syscall number is out-of-range, we can't index the syscall table, and have to call ni_sys directly. The c invoke_syscall() wrapper handles that case internally so that we don't have to open-code it everywhere. > why __sys_trace_return existed, The __sys_trace_return label existed so that the special __ni_sys_trace path could return into a common tracing return path. > and why its disappearance doesn't break anything? Now that invoke_syscall() handles out-of-range syscall numbers, and we can remove the __ni_sys_trace path, nothing branches to __sys_trace_return. Only the label has been removed, not the usual return path. > Not saying there's a bug, just that I'm a little confuse -- I see no > real reason for ni_syscall being special, and this may be a good > opportunity to decruft it. (See also comments below.) Hopefully the above clarifies things? I've updated the commit message with a description. [...] > > +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr, > > + syscall_fn_t syscall_table[]) > > +{ > > + if (scno < sc_nr) { > > What if (int)scno < 0? Should those args both by unsigned ints? Yes, they should -- I've fixed that up locally. That is a *very* good point, thanks! > "sc_nr" sounds too much like "syscall number" to me. Might > "syscall_table_size" might be clearer? Similarly, we could have > "stbl_size" or similar in the asm. This is purely cosmetic, > though. I'd tried to stick to the naming used in assembly to keep the conversion clearer for those familiar with the asm. I agree the names aren't great. > > + syscall_fn_t syscall_fn; > > + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; > > + __invoke_syscall(regs, syscall_fn); > > + } else { > > + regs->regs[0] = do_ni_syscall(regs); > > Can we make __invoke_syscall() the universal syscall wrapper, and give > do_ni_syscall() the same interface as any other syscall body? Not at this point in time, since the prototype (in core code) differs. I agree that would be nicer, but there are a number of complications; more details below. > Then you could factor this as > > static syscall_fn_t syscall_fn(syscall_fn_t const syscall_table[], > (unsigned) int scno, (unsigned) int sc_nr) > { > if (sc_no >= sc_nr) > return sys_ni_syscall; > > return syscall_table[array_index_nospec(scno, sc_nr)]; > } > > ... > __invoke_syscall(regs, syscall_fn(syscall_table, scno, sc_nr); > > > > This is cosmetic too, of course. > > do_ni_syscall() should be given a pt_regs-based wrapper like all the > rest. I agree it would be nicer if it had a wrapper that took a pt_regs, even if it does nothing with it. We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd need a ksys_ni_syscall() for our traps.c logic, and adding this uniformly would involve some arch-specific rework for x86, too, so I decided it was not worth the effort. Thanks, Mark.
On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote: > On Mon, May 14, 2018 at 12:07:18PM +0100, Dave Martin wrote: > > On Mon, May 14, 2018 at 10:46:30AM +0100, Mark Rutland wrote: > > > As a first step towards invoking syscalls with a pt_regs argument, > > > convert the raw syscall invocation logic to C. We end up with a bit more > > > register shuffling, but the unified invocation logic means we can unify > > > the tracing paths, too. > > > > > > This only converts the invocation of the syscall. The rest of the > > > syscall triage and tracing is left in assembly for now, and will be > > > converted in subsequent patches. > > > > > > Signed-off-by: Mark Rutland <mark.rutland@arm.com> > > > Cc: Catalin Marinas <catalin.marinas@arm.com> > > > Cc: Will Deacon <will.deacon@arm.com> > > > --- > > > arch/arm64/kernel/Makefile | 3 ++- > > > arch/arm64/kernel/entry.S | 36 ++++++++++-------------------------- > > > arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++ > > > 3 files changed, 41 insertions(+), 27 deletions(-) > > > create mode 100644 arch/arm64/kernel/syscall.c > > > > > > diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile > > > index bf825f38d206..c22e8ace5ea3 100644 > > > --- a/arch/arm64/kernel/Makefile > > > +++ b/arch/arm64/kernel/Makefile > > > @@ -18,7 +18,8 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ > > > hyp-stub.o psci.o cpu_ops.o insn.o \ > > > return_address.o cpuinfo.o cpu_errata.o \ > > > cpufeature.o alternative.o cacheinfo.o \ > > > - smp.o smp_spin_table.o topology.o smccc-call.o > > > + smp.o smp_spin_table.o topology.o smccc-call.o \ > > > + syscall.o > > > > > > extra-$(CONFIG_EFI) := efi-entry.o > > > > > > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S > > > index 08ea3cbfb08f..d6e057500eaf 100644 > > > --- a/arch/arm64/kernel/entry.S > > > +++ b/arch/arm64/kernel/entry.S > > > @@ -873,7 +873,6 @@ ENDPROC(el0_error) > > > */ > > > ret_fast_syscall: > > > disable_daif > > > - str x0, [sp, #S_X0] // returned x0 > > > ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing > > > and x2, x1, #_TIF_SYSCALL_WORK > > > cbnz x2, ret_fast_syscall_trace > > > @@ -946,15 +945,11 @@ el0_svc_naked: // compat entry point > > > > > > tst x16, #_TIF_SYSCALL_WORK // check for syscall hooks > > > b.ne __sys_trace > > > - cmp wscno, wsc_nr // check upper syscall limit > > > - b.hs ni_sys > > > - mask_nospec64 xscno, xsc_nr, x19 // enforce bounds for syscall number > > > - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table > > > - blr x16 // call sys_* routine > > > - b ret_fast_syscall > > > -ni_sys: > > > mov x0, sp > > > - bl do_ni_syscall > > > + mov w1, wscno > > > + mov w2, wsc_nr > > > + mov x3, stbl > > > + bl invoke_syscall > > > b ret_fast_syscall > > > ENDPROC(el0_svc) > > > > > > @@ -971,29 +966,18 @@ __sys_trace: > > > bl syscall_trace_enter > > > cmp w0, #NO_SYSCALL // skip the syscall? > > > b.eq __sys_trace_return_skipped > > > - mov wscno, w0 // syscall number (possibly new) > > > - mov x1, sp // pointer to regs > > > - cmp wscno, wsc_nr // check upper syscall limit > > > - b.hs __ni_sys_trace > > > - ldp x0, x1, [sp] // restore the syscall args > > > - ldp x2, x3, [sp, #S_X2] > > > - ldp x4, x5, [sp, #S_X4] > > > - ldp x6, x7, [sp, #S_X6] > > > - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table > > > - blr x16 // call sys_* routine > > > > > > -__sys_trace_return: > > > - str x0, [sp, #S_X0] // save returned x0 > > > + mov x0, sp > > > + mov w1, wscno > > > + mov w2, wsc_nr > > > + mov x3, stbl > > > + bl invoke_syscall > > > + > > > __sys_trace_return_skipped: > > > mov x0, sp > > > bl syscall_trace_exit > > > b ret_to_user > > > > > > -__ni_sys_trace: > > > - mov x0, sp > > > - bl do_ni_syscall > > > - b __sys_trace_return > > > - > > > > Can you explain why ni_syscall is special here, > > This is for out-of-range syscall numbers, instances of ni_syscall in the > syscall table are handled by the regular path. When the syscall number > is out-of-range, we can't index the syscall table, and have to call > ni_sys directly. > > The c invoke_syscall() wrapper handles that case internally so that we > don't have to open-code it everywhere. > > > why __sys_trace_return existed, > > The __sys_trace_return label existed so that the special __ni_sys_trace > path could return into a common tracing return path. > > > and why its disappearance doesn't break anything? > > Now that invoke_syscall() handles out-of-range syscall numbers, and we > can remove the __ni_sys_trace path, nothing branches to > __sys_trace_return. > > Only the label has been removed, not the usual return path. OK, I think I understand. I think the name "__sys_trace_return" was confusing me, as if this was something special that only relates to the ni_syscall case. If it was only ever intended as the merge point for those two paths, then I can see that merging the paths for real enables us to get rid of it. > > Not saying there's a bug, just that I'm a little confuse -- I see no > > real reason for ni_syscall being special, and this may be a good > > opportunity to decruft it. (See also comments below.) > > Hopefully the above clarifies things? Yes, I think so. > I've updated the commit message with a description. > > [...] > > > > +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr, > > > + syscall_fn_t syscall_table[]) > > > +{ > > > + if (scno < sc_nr) { > > > > What if (int)scno < 0? Should those args both by unsigned ints? > > Yes, they should -- I've fixed that up locally. > > That is a *very* good point, thanks! > > > "sc_nr" sounds too much like "syscall number" to me. Might > > "syscall_table_size" might be clearer? Similarly, we could have > > "stbl_size" or similar in the asm. This is purely cosmetic, > > though. > > I'd tried to stick to the naming used in assembly to keep the conversion > clearer for those familiar with the asm. > > I agree the names aren't great. Not a big deal. If you feel you would like to rename them though, I won't argue with it ;) > > > + syscall_fn_t syscall_fn; > > > + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; > > > + __invoke_syscall(regs, syscall_fn); > > > + } else { > > > + regs->regs[0] = do_ni_syscall(regs); > > > > Can we make __invoke_syscall() the universal syscall wrapper, and give > > do_ni_syscall() the same interface as any other syscall body? > > Not at this point in time, since the prototype (in core code) differs. > > I agree that would be nicer, but there are a number of complications; > more details below. > > > Then you could factor this as > > > > static syscall_fn_t syscall_fn(syscall_fn_t const syscall_table[], > > (unsigned) int scno, (unsigned) int sc_nr) > > { > > if (sc_no >= sc_nr) > > return sys_ni_syscall; > > > > return syscall_table[array_index_nospec(scno, sc_nr)]; > > } > > > > ... > > __invoke_syscall(regs, syscall_fn(syscall_table, scno, sc_nr); > > > > > > > > This is cosmetic too, of course. > > > > do_ni_syscall() should be given a pt_regs-based wrapper like all the > > rest. > > I agree it would be nicer if it had a wrapper that took a pt_regs, even > if it does nothing with it. > > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd > need a ksys_ni_syscall() for our traps.c logic, and adding this > uniformly would involve some arch-specific rework for x86, too, so I > decided it was not worth the effort. Does allowing error injection for ni_syscall actually matter? Error injection is an ABI break by itself. It would arguably be a bit strange though. Cheers ---Dave
> +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn) > +{ > + regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1], > + regs->regs[2], regs->regs[3], > + regs->regs[4], regs->regs[5]); > +} Any specific reason to have this in a separate function? This seems to be called only from one instance, namely > +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr, > + syscall_fn_t syscall_table[]) > +{ > + if (scno < sc_nr) { > + syscall_fn_t syscall_fn; > + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; > + __invoke_syscall(regs, syscall_fn); > + } else { > + regs->regs[0] = do_ni_syscall(regs); > + } > +} and result in a one-liner in the last patch of the series. Thanks, Dominik
On Mon, May 14, 2018 at 08:00:29PM +0200, Dominik Brodowski wrote: > > +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn) > > +{ > > + regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1], > > + regs->regs[2], regs->regs[3], > > + regs->regs[4], regs->regs[5]); > > +} > > Any specific reason to have this in a separate function? This seems to be > called only from one instance, namely I wanted to keep the raw syscall invocation logically separate from the syscall table lookup and ni_syscall fallback, so that it was easier to verify in isolation. I don't think it's a big deal either way, though. Thanks, Mark.
On Mon, May 14, 2018 at 10:24:45PM +0200, Dominik Brodowski wrote: > On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote: > > I agree it would be nicer if it had a wrapper that took a pt_regs, even > > if it does nothing with it. > > > > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd > > need a ksys_ni_syscall() for our traps.c logic, and adding this > > uniformly would involve some arch-specific rework for x86, too, so I > > decided it was not worth the effort. > > Couldn't you just open-code the "return -ENOSYS;" in traps.c? I guess so. I was just worried that debug logic might be added to the generic ni_syscall() in future, and wanted to avoid potential divergence. > Error injection has no reasonable stable ABI/API expectations, so that's not > a show-stopper either. If people are happy with using SYSCALL_DEFINE0() for ni_syscall, I'm happy to do that -- it's just that we'll need a fixup for x86 as that will change the symbol name. Thanks, Mark.
On Tue, May 15, 2018 at 09:22:23AM +0100, Mark Rutland wrote: > On Mon, May 14, 2018 at 10:24:45PM +0200, Dominik Brodowski wrote: > > On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote: > > > I agree it would be nicer if it had a wrapper that took a pt_regs, even > > > if it does nothing with it. > > > > > > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd > > > need a ksys_ni_syscall() for our traps.c logic, and adding this > > > uniformly would involve some arch-specific rework for x86, too, so I > > > decided it was not worth the effort. > > > > Couldn't you just open-code the "return -ENOSYS;" in traps.c? > > I guess so. I was just worried that debug logic might be added to the generic > ni_syscall() in future, and wanted to avoid potential divergence. > > > Error injection has no reasonable stable ABI/API expectations, so that's not > > a show-stopper either. > > If people are happy with using SYSCALL_DEFINE0() for ni_syscall, I'm happy to > do that -- it's just that we'll need a fixup for x86 as that will change the > symbol name. For me, it's less about using SYSCALL_DEFINE0() for ni_syscall, but more about keeping the syscall invokation easy. Therefore, we do pass a pointer struct pt_regs to sys_ni_syscall() on x86, even though it does not expect it. /* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ extern asmlinkage long sys_ni_syscall(const struct pt_regs *); Thanks, Dominik
On Tue, May 15, 2018 at 12:01:58PM +0200, Dominik Brodowski wrote: > On Tue, May 15, 2018 at 09:22:23AM +0100, Mark Rutland wrote: > > On Mon, May 14, 2018 at 10:24:45PM +0200, Dominik Brodowski wrote: > > > On Mon, May 14, 2018 at 12:41:10PM +0100, Mark Rutland wrote: > > > > I agree it would be nicer if it had a wrapper that took a pt_regs, even > > > > if it does nothing with it. > > > > > > > > We can't use SYSCALL_DEFINE0() due to the fault injection muck, we'd > > > > need a ksys_ni_syscall() for our traps.c logic, and adding this > > > > uniformly would involve some arch-specific rework for x86, too, so I > > > > decided it was not worth the effort. > > > > > > Couldn't you just open-code the "return -ENOSYS;" in traps.c? > > > > I guess so. I was just worried that debug logic might be added to the generic > > ni_syscall() in future, and wanted to avoid potential divergence. > > > > > Error injection has no reasonable stable ABI/API expectations, so that's not > > > a show-stopper either. > > > > If people are happy with using SYSCALL_DEFINE0() for ni_syscall, I'm happy to > > do that -- it's just that we'll need a fixup for x86 as that will change the > > symbol name. > > For me, it's less about using SYSCALL_DEFINE0() for ni_syscall, but more > about keeping the syscall invokation easy. Therefore, we do pass a pointer > struct pt_regs to sys_ni_syscall() on x86, even though it does not expect > it. > > /* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ > extern asmlinkage long sys_ni_syscall(const struct pt_regs *); Oh, sure, we do the same on arm64 in this series. Having a pt_regs wrapper for it (e.g. using SYSCALL_DEFINE0()) would allow us to avoid that lie (which might be best for CFI stuff), would allow us to avoid some name mangling on arm64, and would seemingly confuse people less. Thanks, Mark.
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index bf825f38d206..c22e8ace5ea3 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -18,7 +18,8 @@ arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ hyp-stub.o psci.o cpu_ops.o insn.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ - smp.o smp_spin_table.o topology.o smccc-call.o + smp.o smp_spin_table.o topology.o smccc-call.o \ + syscall.o extra-$(CONFIG_EFI) := efi-entry.o diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 08ea3cbfb08f..d6e057500eaf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -873,7 +873,6 @@ ENDPROC(el0_error) */ ret_fast_syscall: disable_daif - str x0, [sp, #S_X0] // returned x0 ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace @@ -946,15 +945,11 @@ el0_svc_naked: // compat entry point tst x16, #_TIF_SYSCALL_WORK // check for syscall hooks b.ne __sys_trace - cmp wscno, wsc_nr // check upper syscall limit - b.hs ni_sys - mask_nospec64 xscno, xsc_nr, x19 // enforce bounds for syscall number - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table - blr x16 // call sys_* routine - b ret_fast_syscall -ni_sys: mov x0, sp - bl do_ni_syscall + mov w1, wscno + mov w2, wsc_nr + mov x3, stbl + bl invoke_syscall b ret_fast_syscall ENDPROC(el0_svc) @@ -971,29 +966,18 @@ __sys_trace: bl syscall_trace_enter cmp w0, #NO_SYSCALL // skip the syscall? b.eq __sys_trace_return_skipped - mov wscno, w0 // syscall number (possibly new) - mov x1, sp // pointer to regs - cmp wscno, wsc_nr // check upper syscall limit - b.hs __ni_sys_trace - ldp x0, x1, [sp] // restore the syscall args - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, xscno, lsl #3] // address in the syscall table - blr x16 // call sys_* routine -__sys_trace_return: - str x0, [sp, #S_X0] // save returned x0 + mov x0, sp + mov w1, wscno + mov w2, wsc_nr + mov x3, stbl + bl invoke_syscall + __sys_trace_return_skipped: mov x0, sp bl syscall_trace_exit b ret_to_user -__ni_sys_trace: - mov x0, sp - bl do_ni_syscall - b __sys_trace_return - .popsection // .entry.text #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c new file mode 100644 index 000000000000..58d7569f47df --- /dev/null +++ b/arch/arm64/kernel/syscall.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/nospec.h> +#include <linux/ptrace.h> + +long do_ni_syscall(struct pt_regs *regs); + +typedef long (*syscall_fn_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + +static void __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn) +{ + regs->regs[0] = syscall_fn(regs->regs[0], regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); +} + +asmlinkage void invoke_syscall(struct pt_regs *regs, int scno, int sc_nr, + syscall_fn_t syscall_table[]) +{ + if (scno < sc_nr) { + syscall_fn_t syscall_fn; + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; + __invoke_syscall(regs, syscall_fn); + } else { + regs->regs[0] = do_ni_syscall(regs); + } +}
As a first step towards invoking syscalls with a pt_regs argument, convert the raw syscall invocation logic to C. We end up with a bit more register shuffling, but the unified invocation logic means we can unify the tracing paths, too. This only converts the invocation of the syscall. The rest of the syscall triage and tracing is left in assembly for now, and will be converted in subsequent patches. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will.deacon@arm.com> --- arch/arm64/kernel/Makefile | 3 ++- arch/arm64/kernel/entry.S | 36 ++++++++++-------------------------- arch/arm64/kernel/syscall.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 27 deletions(-) create mode 100644 arch/arm64/kernel/syscall.c