
[V3,08/13] x86/entry: Use generic syscall entry function

Message ID: 20200716185424.765294277@linutronix.de
State: New, archived
Series: entry, x86, kvm: Generic entry/exit functionality for host and guest

Commit Message

Thomas Gleixner July 16, 2020, 6:22 p.m. UTC
From: Thomas Gleixner <tglx@linutronix.de>

Replace the syscall entry work handling with the generic version. Provide
the necessary helper inlines to handle the real architecture specific
parts, e.g. audit and seccomp invocations.

Use a temporary define for idtentry_enter_user, which will be cleaned up
separately.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V3: Adapt to the noinstr changes
---
 arch/x86/Kconfig                    |    1 
 arch/x86/entry/common.c             |  181 +-----------------------------------
 arch/x86/include/asm/entry-common.h |   86 +++++++++++++++++
 arch/x86/include/asm/idtentry.h     |    5 
 arch/x86/include/asm/thread_info.h  |    5 
 5 files changed, 99 insertions(+), 179 deletions(-)
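
For context, syscall_enter_from_user_mode() (used below in do_syscall_64())
mirrors the removed x86 syscall_enter() almost one to one. A simplified
sketch, based on the generic kernel/entry/common.c from this series; the
work-mask name and the exact syscall_trace_enter() signature are assumptions:

	noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
	{
		unsigned long ti_work;

		/* Establish lockdep/RCU/tracing state; runs arch_check_user_regs() */
		enter_from_user_mode(regs);

		instrumentation_begin();
		local_irq_enable();
		ti_work = READ_ONCE(current_thread_info()->flags);
		if (ti_work & SYSCALL_ENTER_WORK)	/* assumed mask name */
			syscall = syscall_trace_enter(regs, syscall, ti_work);
		instrumentation_end();

		return syscall;
	}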

Comments

Kees Cook July 16, 2020, 9:13 p.m. UTC | #1
On Thu, Jul 16, 2020 at 08:22:16PM +0200, Thomas Gleixner wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> Replace the syscall entry work handling with the generic version. Provide
> the necessary helper inlines to handle the real architecture specific
> parts, e.g. audit and seccomp invocations.
> 
> Use a temporary define for idtentry_enter_user, which will be cleaned up
> separately.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> [...]
> --- /dev/null
> +++ b/arch/x86/include/asm/entry-common.h
> @@ -0,0 +1,86 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef _ASM_X86_ENTRY_COMMON_H
> +#define _ASM_X86_ENTRY_COMMON_H
> +
> +#include <linux/seccomp.h>
> +#include <linux/audit.h>
> +
> +/* Check that the stack and regs on entry from user mode are sane. */
> +static __always_inline void arch_check_user_regs(struct pt_regs *regs)
> +{
> +	if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
> +		/*
> +		 * Make sure that the entry code gave us a sensible EFLAGS
> +		 * register.  Native because we want to check the actual CPU
> +		 * state, not the interrupt state as imagined by Xen.
> +		 */
> +		unsigned long flags = native_save_fl();
> +		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
> +				      X86_EFLAGS_NT));
> +
> +		/* We think we came from user mode. Make sure pt_regs agrees. */
> +		WARN_ON_ONCE(!user_mode(regs));
> +
> +		/*
> +		 * All entries from user mode (except #DF) should be on the
> +		 * normal thread stack and should have user pt_regs in the
> +		 * correct location.
> +		 */
> +		WARN_ON_ONCE(!on_thread_stack());
> +		WARN_ON_ONCE(regs != task_pt_regs(current));
> +	}
> +}
> +#define arch_check_user_regs arch_check_user_regs

Will architectures implement subsets of these functions? (i.e. instead
of each of the defines, is CONFIG_ENTRY_GENERIC sufficient for the
no-op inlines?)

> +
> +static inline long arch_syscall_enter_seccomp(struct pt_regs *regs)
> +{
> +#ifdef CONFIG_SECCOMP
> +	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
> +	struct seccomp_data sd;
> +
> +	sd.arch = arch;
> +	sd.nr = regs->orig_ax;
> +	sd.instruction_pointer = regs->ip;
> +
> +#ifdef CONFIG_X86_64
> +	if (arch == AUDIT_ARCH_X86_64) {
> +		sd.args[0] = regs->di;
> +		sd.args[1] = regs->si;
> +		sd.args[2] = regs->dx;
> +		sd.args[3] = regs->r10;
> +		sd.args[4] = regs->r8;
> +		sd.args[5] = regs->r9;
> +	} else
> +#endif
> +	{
> +		sd.args[0] = regs->bx;
> +		sd.args[1] = regs->cx;
> +		sd.args[2] = regs->dx;
> +		sd.args[3] = regs->si;
> +		sd.args[4] = regs->di;
> +		sd.args[5] = regs->bp;
> +	}
> +
> +	return __secure_computing(&sd);
> +#else
> +	return 0;
> +#endif
> +}
> +#define arch_syscall_enter_seccomp arch_syscall_enter_seccomp

Actually, I've been meaning to clean this up. It's not needed at all.
This was left over from the seccomp fast-path code that got ripped out a
while ago. seccomp already has everything it needs to do this work, so
just:

	__secure_computing(NULL);

is sufficient for every architecture that supports seccomp. (See kernel/seccomp.c
populate_seccomp_data().)

And if you want more generalization work, note that the secure_computing()
macro performs a TIF test before calling __secure_computing(NULL). But
my point is, I think arch_syscall_enter_seccomp() is not needed.
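
Concretely, the x86 hook would then collapse to something like this
(sketch only; syscall_enter_seccomp is an illustrative name, and the TIF
test could equally live in the caller's work-flag dispatch):

	static inline long syscall_enter_seccomp(struct pt_regs *regs)
	{
		/* seccomp builds seccomp_data itself via populate_seccomp_data() */
		if (!test_thread_flag(TIF_SECCOMP))
			return 0;
		return __secure_computing(NULL);
	}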

> +static inline void arch_syscall_enter_audit(struct pt_regs *regs)
> +{
> +#ifdef CONFIG_X86_64
> +	if (in_ia32_syscall()) {
> +		audit_syscall_entry(regs->orig_ax, regs->di,
> +				    regs->si, regs->dx, regs->r10);
> +	} else
> +#endif
> +	{
> +		audit_syscall_entry(regs->orig_ax, regs->bx,
> +				    regs->cx, regs->dx, regs->si);
> +	}
> +}
> +#define arch_syscall_enter_audit arch_syscall_enter_audit

Similarly, I think these can be redefined in the generic case
using the existing accessors for syscall arguments, etc. e.g.
arch_syscall_enter_audit() is not needed for any architecture, and the
generic is:

	unsigned long args[6];

	syscall_get_arguments(current, regs, args);
	audit_syscall_entry(syscall_get_nr(current, regs),
			    args[0], args[1], args[2], args[3]);
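
Wrapped up as a self-contained helper, that would look roughly like the
following (illustrative name; audit_syscall_entry() only consumes the
syscall number and the first four arguments):

	static void syscall_enter_audit(struct pt_regs *regs)
	{
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall_get_nr(current, regs),
				    args[0], args[1], args[2], args[3]);
	}
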
Thomas Gleixner July 16, 2020, 9:33 p.m. UTC | #2
Kees Cook <keescook@chromium.org> writes:
> On Thu, Jul 16, 2020 at 08:22:16PM +0200, Thomas Gleixner wrote:
>> +}
>> +#define arch_check_user_regs arch_check_user_regs
>
> Will architectures implement subsets of these functions? (i.e. instead
> of each of the defines, is CONFIG_ENTRY_GENERIC sufficient for the
> no-op inlines?)

Yes, some of these are optional as far as my analysis of the
architecture code went.
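
The generic header handles that with the usual override pattern: each
arch_*() hook has a matching #define, and <linux/entry-common.h> falls
back to a no-op stub when the define is absent, roughly:

	#ifndef arch_check_user_regs
	static __always_inline void arch_check_user_regs(struct pt_regs *regs) {}
	#endif

so an architecture only implements the subset it needs.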

>> +}
>> +#define arch_syscall_enter_seccomp arch_syscall_enter_seccomp
>
> Actually, I've been meaning to clean this up. It's not needed at all.
> This was left over from the seccomp fast-path code that got ripped out a
> while ago. seccomp already has everything it needs to do this work, so
> just:
>
> 	__secure_computing(NULL);
>
> is sufficient for every architecture that supports seccomp. (See kernel/seccomp.c
> populate_seccomp_data().)

Nice. Was not aware of these details. Trivial enough to fix :)

> And if you want more generalization work, note that the secure_computing()
> macro performs a TIF test before calling __secure_computing(NULL). But
> my point is, I think arch_syscall_enter_seccomp() is not needed.

Cute. One horror gone.

>> +static inline void arch_syscall_enter_audit(struct pt_regs *regs)
>> +{
>> +#ifdef CONFIG_X86_64
>> +	if (in_ia32_syscall()) {
>> +		audit_syscall_entry(regs->orig_ax, regs->di,
>> +				    regs->si, regs->dx, regs->r10);
>> +	} else
>> +#endif
>> +	{
>> +		audit_syscall_entry(regs->orig_ax, regs->bx,
>> +				    regs->cx, regs->dx, regs->si);
>> +	}
>> +}
>> +#define arch_syscall_enter_audit arch_syscall_enter_audit
>
> Similarly, I think these can be redefined in the generic case
> using the existing accessors for syscall arguments, etc. e.g.
> arch_syscall_enter_audit() is not needed for any architecture, and the
> generic is:
>
> 	unsigned long args[6];
>
> 	syscall_get_arguments(current, regs, args);
> 	audit_syscall_entry(syscall_get_nr(current, regs),
> 			    args[0], args[1], args[2], args[3]);

Nice. Another arch specific mess gone.

Thanks,

        tglx

Patch

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -115,6 +115,7 @@  config X86
 	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_CPU_VULNERABILITIES
 	select GENERIC_EARLY_IOREMAP
+	select GENERIC_ENTRY
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IOMAP
 	select GENERIC_IRQ_EFFECTIVE_AFF_MASK	if SMP
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -10,13 +10,13 @@ 
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
+#include <linux/entry-common.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/audit.h>
-#include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/export.h>
 #include <linux/context_tracking.h>
@@ -42,70 +42,8 @@ 
 #include <asm/syscall.h>
 #include <asm/irq_stack.h>
 
-#define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
-/* Check that the stack and regs on entry from user mode are sane. */
-static noinstr void check_user_regs(struct pt_regs *regs)
-{
-	if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
-		/*
-		 * Make sure that the entry code gave us a sensible EFLAGS
-		 * register.  Native because we want to check the actual CPU
-		 * state, not the interrupt state as imagined by Xen.
-		 */
-		unsigned long flags = native_save_fl();
-		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
-				      X86_EFLAGS_NT));
-
-		/* We think we came from user mode. Make sure pt_regs agrees. */
-		WARN_ON_ONCE(!user_mode(regs));
-
-		/*
-		 * All entries from user mode (except #DF) should be on the
-		 * normal thread stack and should have user pt_regs in the
-		 * correct location.
-		 */
-		WARN_ON_ONCE(!on_thread_stack());
-		WARN_ON_ONCE(regs != task_pt_regs(current));
-	}
-}
-
-#ifdef CONFIG_CONTEXT_TRACKING
-/**
- * enter_from_user_mode - Establish state when coming from user mode
- *
- * Syscall entry disables interrupts, but user mode is traced as interrupts
- * enabled. Also with NO_HZ_FULL RCU might be idle.
- *
- * 1) Tell lockdep that interrupts are disabled
- * 2) Invoke context tracking if enabled to reactivate RCU
- * 3) Trace interrupts off state
- */
-static noinstr void enter_from_user_mode(struct pt_regs *regs)
-{
-	enum ctx_state state = ct_state();
-
-	check_user_regs(regs);
-	lockdep_hardirqs_off(CALLER_ADDR0);
-	user_exit_irqoff();
-
-	instrumentation_begin();
-	CT_WARN_ON(state != CONTEXT_USER);
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-#else
-static __always_inline void enter_from_user_mode(struct pt_regs *regs)
-{
-	check_user_regs(regs);
-	lockdep_hardirqs_off(CALLER_ADDR0);
-	instrumentation_begin();
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-#endif
-
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *
@@ -129,83 +67,6 @@  static __always_inline void exit_to_user
 	lockdep_hardirqs_on(CALLER_ADDR0);
 }
 
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * Returns the syscall nr to run (which should match regs->orig_ax) or -1
- * to skip the syscall.
- */
-static long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-
-	struct thread_info *ti = current_thread_info();
-	unsigned long ret = 0;
-	u32 work;
-
-	work = READ_ONCE(ti->flags);
-
-	if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
-		ret = tracehook_report_syscall_entry(regs);
-		if (ret || (work & _TIF_SYSCALL_EMU))
-			return -1L;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp after ptrace, to catch any tracer changes.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		ret = __secure_computing(&sd);
-		if (ret == -1)
-			return ret;
-	}
-#endif
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
 #define EXIT_TO_USERMODE_LOOP_FLAGS				\
 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
 	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
@@ -366,26 +227,10 @@  static void __syscall_return_slowpath(st
 	exit_to_user_mode();
 }
 
-static noinstr long syscall_enter(struct pt_regs *regs, unsigned long nr)
-{
-	struct thread_info *ti;
-
-	enter_from_user_mode(regs);
-	instrumentation_begin();
-
-	local_irq_enable();
-	ti = current_thread_info();
-	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
-		nr = syscall_trace_enter(regs);
-
-	instrumentation_end();
-	return nr;
-}
-
 #ifdef CONFIG_X86_64
 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 {
-	nr = syscall_enter(regs, nr);
+	nr = syscall_enter_from_user_mode(regs, nr);
 
 	instrumentation_begin();
 	if (likely(nr < NR_syscalls)) {
@@ -407,6 +252,8 @@  static noinstr long syscall_enter(struct
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
 {
+	unsigned int nr = (unsigned int)regs->orig_ax;
+
 	if (IS_ENABLED(CONFIG_IA32_EMULATION))
 		current_thread_info()->status |= TS_COMPAT;
 	/*
@@ -414,7 +261,7 @@  static __always_inline unsigned int sysc
 	 * orig_ax, the unsigned int return value truncates it.  This may
 	 * or may not be necessary, but it matches the old asm behavior.
 	 */
-	return syscall_enter(regs, (unsigned int)regs->orig_ax);
+	return (unsigned int)syscall_enter_from_user_mode(regs, nr);
 }
 
 /*
@@ -568,7 +415,7 @@  SYSCALL_DEFINE0(ni_syscall)
  * solves the problem of kernel mode pagefaults which can schedule, which
  * is not possible after invoking rcu_irq_enter() without undoing it.
  *
- * For user mode entries enter_from_user_mode() must be invoked to
+ * For user mode entries irqentry_enter_from_user_mode() must be invoked to
  * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
  * would not be possible.
  *
@@ -584,7 +431,7 @@  idtentry_state_t noinstr idtentry_enter(
 	};
 
 	if (user_mode(regs)) {
-		enter_from_user_mode(regs);
+		irqentry_enter_from_user_mode(regs);
 		return ret;
 	}
 
@@ -615,7 +462,7 @@  idtentry_state_t noinstr idtentry_enter(
 		/*
 		 * If RCU is not watching then the same careful
 		 * sequence vs. lockdep and tracing is required
-		 * as in enter_from_user_mode().
+		 * as in irqentry_enter_from_user_mode().
 		 */
 		lockdep_hardirqs_off(CALLER_ADDR0);
 		rcu_irq_enter();
@@ -709,18 +556,6 @@  void noinstr idtentry_exit(struct pt_reg
 }
 
 /**
- * idtentry_enter_user - Handle state tracking on idtentry from user mode
- * @regs:	Pointer to pt_regs of interrupted context
- *
- * Invokes enter_from_user_mode() to establish the proper context for
- * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
- */
-void noinstr idtentry_enter_user(struct pt_regs *regs)
-{
-	enter_from_user_mode(regs);
-}
-
-/**
  * idtentry_exit_user - Handle return from exception to user mode
  * @regs:	Pointer to pt_regs (exception entry regs)
  *
--- /dev/null
+++ b/arch/x86/include/asm/entry-common.h
@@ -0,0 +1,86 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_ENTRY_COMMON_H
+#define _ASM_X86_ENTRY_COMMON_H
+
+#include <linux/seccomp.h>
+#include <linux/audit.h>
+
+/* Check that the stack and regs on entry from user mode are sane. */
+static __always_inline void arch_check_user_regs(struct pt_regs *regs)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
+		/*
+		 * Make sure that the entry code gave us a sensible EFLAGS
+		 * register.  Native because we want to check the actual CPU
+		 * state, not the interrupt state as imagined by Xen.
+		 */
+		unsigned long flags = native_save_fl();
+		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
+				      X86_EFLAGS_NT));
+
+		/* We think we came from user mode. Make sure pt_regs agrees. */
+		WARN_ON_ONCE(!user_mode(regs));
+
+		/*
+		 * All entries from user mode (except #DF) should be on the
+		 * normal thread stack and should have user pt_regs in the
+		 * correct location.
+		 */
+		WARN_ON_ONCE(!on_thread_stack());
+		WARN_ON_ONCE(regs != task_pt_regs(current));
+	}
+}
+#define arch_check_user_regs arch_check_user_regs
+
+static inline long arch_syscall_enter_seccomp(struct pt_regs *regs)
+{
+#ifdef CONFIG_SECCOMP
+	u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	struct seccomp_data sd;
+
+	sd.arch = arch;
+	sd.nr = regs->orig_ax;
+	sd.instruction_pointer = regs->ip;
+
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		sd.args[0] = regs->di;
+		sd.args[1] = regs->si;
+		sd.args[2] = regs->dx;
+		sd.args[3] = regs->r10;
+		sd.args[4] = regs->r8;
+		sd.args[5] = regs->r9;
+	} else
+#endif
+	{
+		sd.args[0] = regs->bx;
+		sd.args[1] = regs->cx;
+		sd.args[2] = regs->dx;
+		sd.args[3] = regs->si;
+		sd.args[4] = regs->di;
+		sd.args[5] = regs->bp;
+	}
+
+	return __secure_computing(&sd);
+#else
+	return 0;
+#endif
+}
+#define arch_syscall_enter_seccomp arch_syscall_enter_seccomp
+
+static inline void arch_syscall_enter_audit(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+	if (in_ia32_syscall()) {
+		audit_syscall_entry(regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
+#define arch_syscall_enter_audit arch_syscall_enter_audit
+
+#endif
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -6,11 +6,14 @@ 
 #include <asm/trapnr.h>
 
 #ifndef __ASSEMBLY__
+#include <linux/entry-common.h>
 #include <linux/hardirq.h>
 
 #include <asm/irq_stack.h>
 
-void idtentry_enter_user(struct pt_regs *regs);
+/* Temporary define */
+#define idtentry_enter_user	irqentry_enter_from_user_mode
+
 void idtentry_exit_user(struct pt_regs *regs);
 
 typedef struct idtentry_state {
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -133,11 +133,6 @@  struct thread_info {
 #define _TIF_X32		(1 << TIF_X32)
 #define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 
-/* Work to do before invoking the actual syscall. */
-#define _TIF_WORK_SYSCALL_ENTRY	\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
-
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW_BASE					\
 	(_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP |		\