diff mbox series

[RFC] exec: x86: Ensure SIGBUS delivered on MCE

Message ID 20240501015340.3014724-1-andrew.zaborowski@intel.com (mailing list archive)
State New
Headers show
Series [RFC] exec: x86: Ensure SIGBUS delivered on MCE | expand

Commit Message

Andrew Zaborowski May 1, 2024, 1:53 a.m. UTC
Uncorrected memory errors are signaled to processes using SIGBUS or an
error retval from a syscall.  But there's a corner cases in execve where
a SIGSEGV will be delivered.  Specifically this will happen if the binary
loader triggers a memory error reading user pages.  The architecture's
handler (MCE handler on x86) may queue a call to memory_failure but that
won't run until the execve() returns.  The binary loader is called after
bprm->point_of_no_return is set meaning that any error is handled by
bprm_execve() with a SIGSEGV to the process.

To ensure it is terminated with a SIGBUS we 1. let pending work run in
the bprm_execve error case.

And 2. ensure memory_failure() is passed MF_ACTION_REQUIRED so that the
SIGBUS is queued.  Normally when the MCE is in a syscall, a fixup of
return IP and a call to kill_me_never are enough.  But in this case
it's necessary to queue kill_me_maybe() which will set
MF_ACTION_REQUIRED.

Reuse current->in_execve to make the decision.

Signed-off-by: Andrew Zaborowski <andrew.zaborowski@intel.com>
---
 arch/x86/kernel/cpu/mce/core.c | 14 ++++++++++++++
 fs/exec.c                      | 12 +++++++++---
 include/linux/sched.h          |  2 +-
 3 files changed, 24 insertions(+), 4 deletions(-)

Comments

Kees Cook May 1, 2024, 4:19 p.m. UTC | #1
On Wed, May 01, 2024 at 03:53:40AM +0200, Andrew Zaborowski wrote:
> Uncorrected memory errors are signaled to processes using SIGBUS or an
> error retval from a syscall.  But there's a corner cases in execve where
> a SIGSEGV will be delivered.  Specifically this will happen if the binary
> loader triggers a memory error reading user pages.  The architecture's
> handler (MCE handler on x86) may queue a call to memory_failure but that
> won't run until the execve() returns.  The binary loader is called after
> bprm->point_of_no_return is set meaning that any error is handled by
> bprm_execve() with a SIGSEGV to the process.

Why is it needed to have a distinction between SIGBUS and SIGSEGV in
this case?

> To ensure it is terminated with a SIGBUS we 1. let pending work run in
> the bprm_execve error case.
> 
> And 2. ensure memory_failure() is passed MF_ACTION_REQUIRED so that the
> SIGBUS is queued.  Normally when the MCE is in a syscall, a fixup of
> return IP and a call to kill_me_never are enough.  But in this case
> it's necessary to queue kill_me_maybe() which will set
> MF_ACTION_REQUIRED.
> 
> Reuse current->in_execve to make the decision.

We're actually in the process of trying to remove[1] this flag, so I'd
like to avoid adding new users of it. It sounds like it's desirable here
because a choice is needed about kill_me_never() vs kill_me_maybe()?

-Kees

[1] https://lore.kernel.org/lkml/8fafb8e1-b6be-4d08-945f-b464e3a396c8@I-love.SAKURA.ne.jp/

> 
> Signed-off-by: Andrew Zaborowski <andrew.zaborowski@intel.com>
> ---
>  arch/x86/kernel/cpu/mce/core.c | 14 ++++++++++++++
>  fs/exec.c                      | 12 +++++++++---
>  include/linux/sched.h          |  2 +-
>  3 files changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 84d41be6d06b..11effdff942c 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -1593,6 +1593,20 @@ noinstr void do_machine_check(struct pt_regs *regs)
>  		else
>  			queue_task_work(&m, msg, kill_me_maybe);
>  
> +	} else if (current->in_execve) {
> +		/*
> +		 * Cannot recover a task in execve() beyond point of no
> +		 * return but stop further user memory accesses.
> +		 */
> +		if (m.kflags & MCE_IN_KERNEL_RECOV) {
> +			if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
> +				mce_panic("Failed kernel mode recovery", &m, msg);
> +		}
> +
> +		if (!mce_usable_address(&m))
> +			queue_task_work(&m, msg, kill_me_now);
> +		else
> +			queue_task_work(&m, msg, kill_me_maybe);
>  	} else {
>  		/*
>  		 * Handle an MCE which has happened in kernel space but from
> diff --git a/fs/exec.c b/fs/exec.c
> index cf1df7f16e55..1bea9c252a11 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -67,6 +67,7 @@
>  #include <linux/time_namespace.h>
>  #include <linux/user_events.h>
>  #include <linux/rseq.h>
> +#include <linux/task_work.h>
>  
>  #include <linux/uaccess.h>
>  #include <asm/mmu_context.h>
> @@ -1888,10 +1889,15 @@ static int bprm_execve(struct linux_binprm *bprm)
>  	 * If past the point of no return ensure the code never
>  	 * returns to the userspace process.  Use an existing fatal
>  	 * signal if present otherwise terminate the process with
> -	 * SIGSEGV.
> +	 * SIGSEGV.  Run pending work before that in case it is
> +	 * terminating the process with a different signal.
>  	 */
> -	if (bprm->point_of_no_return && !fatal_signal_pending(current))
> -		force_fatal_sig(SIGSEGV);
> +	if (bprm->point_of_no_return) {
> +		task_work_run();
> +
> +		if (!fatal_signal_pending(current))
> +			force_fatal_sig(SIGSEGV);
> +	}
>  
>  	sched_mm_cid_after_execve(current);
>  	current->fs->in_exec = 0;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 3c2abbc587b4..8970a191d8fe 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -922,7 +922,7 @@ struct task_struct {
>  	unsigned			sched_rt_mutex:1;
>  #endif
>  
> -	/* Bit to tell TOMOYO we're in execve(): */
> +	/* Bit to tell TOMOYO and x86 MCE code we're in execve(): */
>  	unsigned			in_execve:1;
>  	unsigned			in_iowait:1;
>  #ifndef TIF_RESTORE_SIGMASK
> -- 
> 2.39.3
>
Andrew Zaborowski May 1, 2024, 6:52 p.m. UTC | #2
On Wed, 1 May 2024 at 18:19, Kees Cook <keescook@chromium.org> wrote:
> Why is it needed to have a distinction between SIGBUS and SIGSEGV in
> this case?

So this is mostly to comply with
Documentation/mm/hwpoison.rst#failure-recovery-modes.  No doc probably
mentions the execve case but users might expect consistency with the
cases where user memory is accessed from userspace.

In our case it was the parent process that was confused by the SIGSEGV
but it was a validation scenario, not a real use case.

>
> > To ensure it is terminated with a SIGBUS we 1. let pending work run in
> > the bprm_execve error case.
> >
> > And 2. ensure memory_failure() is passed MF_ACTION_REQUIRED so that the
> > SIGBUS is queued.  Normally when the MCE is in a syscall, a fixup of
> > return IP and a call to kill_me_never are enough.  But in this case
> > it's necessary to queue kill_me_maybe() which will set
> > MF_ACTION_REQUIRED.
> >
> > Reuse current->in_execve to make the decision.
>
> We're actually in the process of trying to remove[1] this flag, so I'd
> like to avoid adding new users of it.

Oh, didn't see that.

> It sounds like it's desirable here
> because a choice is needed about kill_me_never() vs kill_me_maybe()?

Ideally it should be based on bprm->point_of_no_return and
current->in_execve matches that closely enough.

Instead bprm_execve could directly send the SIGBUS based on the return
value from the binary loader (which would have to be changed) or a
flag set by the MCE handler but I couldn't see a good way to do that.

Best regards

[I can't set the In-reply-to header correctly for this message, sorry]
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 84d41be6d06b..11effdff942c 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1593,6 +1593,20 @@  noinstr void do_machine_check(struct pt_regs *regs)
 		else
 			queue_task_work(&m, msg, kill_me_maybe);
 
+	} else if (current->in_execve) {
+		/*
+		 * Cannot recover a task in execve() beyond point of no
+		 * return but stop further user memory accesses.
+		 */
+		if (m.kflags & MCE_IN_KERNEL_RECOV) {
+			if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+				mce_panic("Failed kernel mode recovery", &m, msg);
+		}
+
+		if (!mce_usable_address(&m))
+			queue_task_work(&m, msg, kill_me_now);
+		else
+			queue_task_work(&m, msg, kill_me_maybe);
 	} else {
 		/*
 		 * Handle an MCE which has happened in kernel space but from
diff --git a/fs/exec.c b/fs/exec.c
index cf1df7f16e55..1bea9c252a11 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -67,6 +67,7 @@ 
 #include <linux/time_namespace.h>
 #include <linux/user_events.h>
 #include <linux/rseq.h>
+#include <linux/task_work.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1888,10 +1889,15 @@  static int bprm_execve(struct linux_binprm *bprm)
 	 * If past the point of no return ensure the code never
 	 * returns to the userspace process.  Use an existing fatal
 	 * signal if present otherwise terminate the process with
-	 * SIGSEGV.
+	 * SIGSEGV.  Run pending work before that in case it is
+	 * terminating the process with a different signal.
 	 */
-	if (bprm->point_of_no_return && !fatal_signal_pending(current))
-		force_fatal_sig(SIGSEGV);
+	if (bprm->point_of_no_return) {
+		task_work_run();
+
+		if (!fatal_signal_pending(current))
+			force_fatal_sig(SIGSEGV);
+	}
 
 	sched_mm_cid_after_execve(current);
 	current->fs->in_exec = 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c2abbc587b4..8970a191d8fe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -922,7 +922,7 @@  struct task_struct {
 	unsigned			sched_rt_mutex:1;
 #endif
 
-	/* Bit to tell TOMOYO we're in execve(): */
+	/* Bit to tell TOMOYO and x86 MCE code we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
 #ifndef TIF_RESTORE_SIGMASK