diff mbox series

[-tip,v4,10/12] x86/kprobes: Push a fake return address at kretprobe_trampoline

Message ID 161639530062.895304.16962383429668412873.stgit@devnote2 (mailing list archive)
State Superseded
Headers show
Series kprobes: Fix stacktrace with kretprobes on x86 | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Masami Hiramatsu (Google) March 22, 2021, 6:41 a.m. UTC
This changes x86/kretprobe stack frame on kretprobe_trampoline
a bit, which now push the kretprobe_trampoline as a fake return
address at the bottom of the stack frame. With this fix, the ORC
unwinder will see the kretprobe_trampoline as a return address.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kernel/kprobes/core.c |   31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

Comments

Peter Zijlstra March 23, 2021, 10:30 p.m. UTC | #1
On Mon, Mar 22, 2021 at 03:41:40PM +0900, Masami Hiramatsu wrote:
>  	".global kretprobe_trampoline\n"
>  	".type kretprobe_trampoline, @function\n"
>  	"kretprobe_trampoline:\n"
>  #ifdef CONFIG_X86_64

So what happens if we get an NMI here? That is, after the RET but before
the push? Then our IP points into the trampoline but we've not done that
push yet.

> +	/* Push fake return address to tell the unwinder it's a kretprobe */
> +	"	pushq $kretprobe_trampoline\n"
>  	UNWIND_HINT_FUNC
> +	/* Save the sp-8, this will be fixed later */
> +	"	pushq %rsp\n"
>  	"	pushfq\n"
>  	SAVE_REGS_STRING
>  	"	movq %rsp, %rdi\n"
>  	"	call trampoline_handler\n"
>  	RESTORE_REGS_STRING
> +	"	addq $8, %rsp\n"
>  	"	popfq\n"
Masami Hiramatsu (Google) March 24, 2021, 1:40 a.m. UTC | #2
On Tue, 23 Mar 2021 23:30:07 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 22, 2021 at 03:41:40PM +0900, Masami Hiramatsu wrote:
> >  	".global kretprobe_trampoline\n"
> >  	".type kretprobe_trampoline, @function\n"
> >  	"kretprobe_trampoline:\n"
> >  #ifdef CONFIG_X86_64
> 
> So what happens if we get an NMI here? That is, after the RET but before
> the push? Then our IP points into the trampoline but we've not done that
> push yet.

Not only NMI, but also interrupts can happen. There is no cli/sti here.

Anyway, thanks for pointing!
I think in UNWIND_HINT_TYPE_REGS and UNWIND_HINT_TYPE_REGS_PARTIAL cases
ORC unwinder also has to check the state->ip and if it is kretprobe_trampoline,
it should be recovered.
What about this?

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 332aa6174b10..36d3971c0a2c 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -101,6 +101,15 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
 			void *orc, size_t orc_size) {}
 #endif
 
+static inline
+unsigned long unwind_recover_kretprobe(struct unwind_state *state,
+				       unsigned long addr, unsigned long *addr_p)
+{
+	return is_kretprobe_trampoline(addr) ?
+		kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) :
+		addr;
+}
+
 /* Recover the return address modified by instrumentation (e.g. kretprobe) */
 static inline
 unsigned long unwind_recover_ret_addr(struct unwind_state *state,
@@ -110,10 +119,7 @@ unsigned long unwind_recover_ret_addr(struct unwind_state *state,
 
 	ret = ftrace_graph_ret_addr(state->task, &state->graph_idx,
 				    addr, addr_p);
-	if (is_kretprobe_trampoline(ret))
-		ret = kretprobe_find_ret_addr(state->task, addr_p,
-					      &state->kr_cur);
-	return ret;
+	return unwind_recover_kretprobe(state, ret, addr_p);
 }
 
 /*
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 839a0698342a..cb59aeca6a4a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -549,7 +549,15 @@ bool unwind_next_frame(struct unwind_state *state)
 					 (void *)orig_ip);
 			goto err;
 		}
-
+		/*
+		 * There is a small chance to interrupt at the entry of
+		 * kretprobe_trampoline where the ORC info doesn't exist.
+		 * That point is right after the RET to kretprobe_trampoline
+		 * which was modified return address. So the @addr_p must
+		 * be right before the regs->sp.
+		 */
+		state->ip = unwind_recover_kretprobe(state, state->ip,
+					state->sp - sizeof(unsigned long));
 		state->regs = (struct pt_regs *)sp;
 		state->prev_regs = NULL;
 		state->full_regs = true;
@@ -562,6 +570,9 @@ bool unwind_next_frame(struct unwind_state *state)
 					 (void *)orig_ip);
 			goto err;
 		}
+		/* See UNWIND_HINT_TYPE_REGS case comment. */
+		state->ip = unwind_recover_kretprobe(state, state->ip,
+					state->sp - sizeof(unsigned long));
 
 		if (state->full_regs)
 			state->prev_regs = state->regs;
Josh Poimboeuf March 24, 2021, 4:01 p.m. UTC | #3
On Wed, Mar 24, 2021 at 10:40:58AM +0900, Masami Hiramatsu wrote:
> On Tue, 23 Mar 2021 23:30:07 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Mon, Mar 22, 2021 at 03:41:40PM +0900, Masami Hiramatsu wrote:
> > >  	".global kretprobe_trampoline\n"
> > >  	".type kretprobe_trampoline, @function\n"
> > >  	"kretprobe_trampoline:\n"
> > >  #ifdef CONFIG_X86_64
> > 
> > So what happens if we get an NMI here? That is, after the RET but before
> > the push? Then our IP points into the trampoline but we've not done that
> > push yet.
> 
> Not only NMI, but also interrupts can happen. There is no cli/sti here.
> 
> Anyway, thanks for pointing!
> I think in UNWIND_HINT_TYPE_REGS and UNWIND_HINT_TYPE_REGS_PARTIAL cases
> ORC unwinder also has to check the state->ip and if it is kretprobe_trampoline,
> it should be recovered.
> What about this?

I think the REGS and REGS_PARTIAL cases can also be affected by function
graph tracing.  So should they use the generic unwind_recover_ret_addr()
instead of unwind_recover_kretprobe()?
Masami Hiramatsu (Google) March 24, 2021, 11:47 p.m. UTC | #4
On Wed, 24 Mar 2021 11:01:43 -0500
Josh Poimboeuf <jpoimboe@redhat.com> wrote:

> On Wed, Mar 24, 2021 at 10:40:58AM +0900, Masami Hiramatsu wrote:
> > On Tue, 23 Mar 2021 23:30:07 +0100
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Mon, Mar 22, 2021 at 03:41:40PM +0900, Masami Hiramatsu wrote:
> > > >  	".global kretprobe_trampoline\n"
> > > >  	".type kretprobe_trampoline, @function\n"
> > > >  	"kretprobe_trampoline:\n"
> > > >  #ifdef CONFIG_X86_64
> > > 
> > > So what happens if we get an NMI here? That is, after the RET but before
> > > the push? Then our IP points into the trampoline but we've not done that
> > > push yet.
> > 
> > Not only NMI, but also interrupts can happen. There is no cli/sti here.
> > 
> > Anyway, thanks for pointing!
> > I think in UNWIND_HINT_TYPE_REGS and UNWIND_HINT_TYPE_REGS_PARTIAL cases
> > ORC unwinder also has to check the state->ip and if it is kretprobe_trampoline,
> > it should be recovered.
> > What about this?
> 
> I think the REGS and REGS_PARTIAL cases can also be affected by function
> graph tracing.  So should they use the generic unwind_recover_ret_addr()
> instead of unwind_recover_kretprobe()?

Yes, but I'm not sure this parameter can be applied.
For example, it passed "state->sp - sizeof(unsigned long)" as where the
return address stored address. Is that same on ftrace graph too?

Thank you,
Steven Rostedt March 25, 2021, 12:26 a.m. UTC | #5
On Thu, 25 Mar 2021 08:47:41 +0900
Masami Hiramatsu <mhiramat@kernel.org> wrote:

> > I think the REGS and REGS_PARTIAL cases can also be affected by function
> > graph tracing.  So should they use the generic unwind_recover_ret_addr()
> > instead of unwind_recover_kretprobe()?  
> 
> Yes, but I'm not sure this parameter can be applied.
> For example, it passed "state->sp - sizeof(unsigned long)" as where the
> return address stored address. Is that same on ftrace graph too?

Stack traces on the return side of function graph tracer has never
worked. It's on my todo list, because that's one of the requirements to
get right if we every manage to combine kretprobe and function graph
tracers together.

-- Steve
Masami Hiramatsu (Google) March 25, 2021, 3:25 a.m. UTC | #6
On Wed, 24 Mar 2021 20:26:13 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 25 Mar 2021 08:47:41 +0900
> Masami Hiramatsu <mhiramat@kernel.org> wrote:
> 
> > > I think the REGS and REGS_PARTIAL cases can also be affected by function
> > > graph tracing.  So should they use the generic unwind_recover_ret_addr()
> > > instead of unwind_recover_kretprobe()?  
> > 
> > Yes, but I'm not sure this parameter can be applied.
> > For example, it passed "state->sp - sizeof(unsigned long)" as where the
> > return address stored address. Is that same on ftrace graph too?
> 
> Stack traces on the return side of function graph tracer has never
> worked. It's on my todo list, because that's one of the requirements to
> get right if we every manage to combine kretprobe and function graph
> tracers together.

OK, then at this point let's just fix the kretprobe side.

Thanks,

> 
> -- Steve
Masami Hiramatsu (Google) March 25, 2021, 6:05 p.m. UTC | #7
On Wed, 24 Mar 2021 10:40:58 +0900
Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Tue, 23 Mar 2021 23:30:07 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Mon, Mar 22, 2021 at 03:41:40PM +0900, Masami Hiramatsu wrote:
> > >  	".global kretprobe_trampoline\n"
> > >  	".type kretprobe_trampoline, @function\n"
> > >  	"kretprobe_trampoline:\n"
> > >  #ifdef CONFIG_X86_64
> > 
> > So what happens if we get an NMI here? That is, after the RET but before
> > the push? Then our IP points into the trampoline but we've not done that
> > push yet.
> 
> Not only NMI, but also interrupts can happen. There is no cli/sti here.
> 
> Anyway, thanks for pointing!
> I think in UNWIND_HINT_TYPE_REGS and UNWIND_HINT_TYPE_REGS_PARTIAL cases
> ORC unwinder also has to check the state->ip and if it is kretprobe_trampoline,
> it should be recovered.
> What about this?

Hmm, this seems to intoduce another issue on stacktrace from kprobes.

           <...>-137     [003] d.Z.    17.250714: p_full_proxy_read_5: (full_proxy_read+0x5/0x80)
           <...>-137     [003] d.Z.    17.250737: <stack trace>
 => kprobe_trace_func+0x1d0/0x2c0
 => kprobe_dispatcher+0x39/0x60
 => aggr_pre_handler+0x4f/0x90
 => kprobe_int3_handler+0x152/0x1a0
 => exc_int3+0x47/0x140
 => asm_exc_int3+0x31/0x40
 => 0
 => 0
 => 0
 => 0
 => 0
 => 0
 => 0

Let me check...

Thanks,

> 
> diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
> index 332aa6174b10..36d3971c0a2c 100644
> --- a/arch/x86/include/asm/unwind.h
> +++ b/arch/x86/include/asm/unwind.h
> @@ -101,6 +101,15 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
>  			void *orc, size_t orc_size) {}
>  #endif
>  
> +static inline
> +unsigned long unwind_recover_kretprobe(struct unwind_state *state,
> +				       unsigned long addr, unsigned long *addr_p)
> +{
> +	return is_kretprobe_trampoline(addr) ?
> +		kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) :
> +		addr;
> +}
> +
>  /* Recover the return address modified by instrumentation (e.g. kretprobe) */
>  static inline
>  unsigned long unwind_recover_ret_addr(struct unwind_state *state,
> @@ -110,10 +119,7 @@ unsigned long unwind_recover_ret_addr(struct unwind_state *state,
>  
>  	ret = ftrace_graph_ret_addr(state->task, &state->graph_idx,
>  				    addr, addr_p);
> -	if (is_kretprobe_trampoline(ret))
> -		ret = kretprobe_find_ret_addr(state->task, addr_p,
> -					      &state->kr_cur);
> -	return ret;
> +	return unwind_recover_kretprobe(state, ret, addr_p);
>  }
>  
>  /*
> diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
> index 839a0698342a..cb59aeca6a4a 100644
> --- a/arch/x86/kernel/unwind_orc.c
> +++ b/arch/x86/kernel/unwind_orc.c
> @@ -549,7 +549,15 @@ bool unwind_next_frame(struct unwind_state *state)
>  					 (void *)orig_ip);
>  			goto err;
>  		}
> -
> +		/*
> +		 * There is a small chance to interrupt at the entry of
> +		 * kretprobe_trampoline where the ORC info doesn't exist.
> +		 * That point is right after the RET to kretprobe_trampoline
> +		 * which was modified return address. So the @addr_p must
> +		 * be right before the regs->sp.
> +		 */
> +		state->ip = unwind_recover_kretprobe(state, state->ip,
> +					state->sp - sizeof(unsigned long));
>  		state->regs = (struct pt_regs *)sp;
>  		state->prev_regs = NULL;
>  		state->full_regs = true;
> @@ -562,6 +570,9 @@ bool unwind_next_frame(struct unwind_state *state)
>  					 (void *)orig_ip);
>  			goto err;
>  		}
> +		/* See UNWIND_HINT_TYPE_REGS case comment. */
> +		state->ip = unwind_recover_kretprobe(state, state->ip,
> +					state->sp - sizeof(unsigned long));
>  
>  		if (state->full_regs)
>  			state->prev_regs = state->regs;
> 
> 
> -- 
> Masami Hiramatsu <mhiramat@kernel.org>
Masami Hiramatsu (Google) March 26, 2021, 12:03 p.m. UTC | #8
On Fri, 26 Mar 2021 03:05:03 +0900
Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Wed, 24 Mar 2021 10:40:58 +0900
> Masami Hiramatsu <mhiramat@kernel.org> wrote:
> 
> > On Tue, 23 Mar 2021 23:30:07 +0100
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Mon, Mar 22, 2021 at 03:41:40PM +0900, Masami Hiramatsu wrote:
> > > >  	".global kretprobe_trampoline\n"
> > > >  	".type kretprobe_trampoline, @function\n"
> > > >  	"kretprobe_trampoline:\n"
> > > >  #ifdef CONFIG_X86_64
> > > 
> > > So what happens if we get an NMI here? That is, after the RET but before
> > > the push? Then our IP points into the trampoline but we've not done that
> > > push yet.
> > 
> > Not only NMI, but also interrupts can happen. There is no cli/sti here.
> > 
> > Anyway, thanks for pointing!
> > I think in UNWIND_HINT_TYPE_REGS and UNWIND_HINT_TYPE_REGS_PARTIAL cases
> > ORC unwinder also has to check the state->ip and if it is kretprobe_trampoline,
> > it should be recovered.
> > What about this?
> 
> Hmm, this seems to intoduce another issue on stacktrace from kprobes.
> 
>            <...>-137     [003] d.Z.    17.250714: p_full_proxy_read_5: (full_proxy_read+0x5/0x80)
>            <...>-137     [003] d.Z.    17.250737: <stack trace>
>  => kprobe_trace_func+0x1d0/0x2c0
>  => kprobe_dispatcher+0x39/0x60
>  => aggr_pre_handler+0x4f/0x90
>  => kprobe_int3_handler+0x152/0x1a0
>  => exc_int3+0x47/0x140
>  => asm_exc_int3+0x31/0x40
>  => 0
>  => 0
>  => 0
>  => 0
>  => 0
>  => 0
>  => 0
> 
> Let me check...

I confirmed this is not related to this series, but occurs when I build kernels with different
configs without cleanup.

Once I build kernel with CONFIG_UNWIND_GUESS=y (for testing), and after that,
I build kernel again with CONFIG_UNWIND_ORC=y (but without make clean), this
happened. In this case, I guess ORC data might be corrupted?
When I cleanup and rebuild, the stacktrace seems correct.

Thank you,
Steven Rostedt March 26, 2021, 2:20 p.m. UTC | #9
On Fri, 26 Mar 2021 21:03:49 +0900
Masami Hiramatsu <mhiramat@kernel.org> wrote:

> I confirmed this is not related to this series, but occurs when I build kernels with different
> configs without cleanup.
> 
> Once I build kernel with CONFIG_UNWIND_GUESS=y (for testing), and after that,
> I build kernel again with CONFIG_UNWIND_ORC=y (but without make clean), this
> happened. In this case, I guess ORC data might be corrupted?
> When I cleanup and rebuild, the stacktrace seems correct.

Hmm, that should be fixed. Seems like we are missing a dependency somewhere.

-- Steve
Josh Poimboeuf March 30, 2021, 4:58 a.m. UTC | #10
On Fri, Mar 26, 2021 at 10:20:09AM -0400, Steven Rostedt wrote:
> On Fri, 26 Mar 2021 21:03:49 +0900
> Masami Hiramatsu <mhiramat@kernel.org> wrote:
> 
> > I confirmed this is not related to this series, but occurs when I build kernels with different
> > configs without cleanup.
> > 
> > Once I build kernel with CONFIG_UNWIND_GUESS=y (for testing), and after that,
> > I build kernel again with CONFIG_UNWIND_ORC=y (but without make clean), this
> > happened. In this case, I guess ORC data might be corrupted?
> > When I cleanup and rebuild, the stacktrace seems correct.
> 
> Hmm, that should be fixed. Seems like we are missing a dependency somewhere.

Thomas reported something similar: for example arch/x86/kernel/head_64.o
doesn't get rebuilt when changing unwinders.

  https://lkml.kernel.org/r/87tuqublrb.fsf@nanos.tec.linutronix.de

Masahiro, any idea how we can force a full tree rebuild when changing
the unwinder?
diff mbox series

Patch

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 23255663c434..d7b90541eda1 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -782,28 +782,31 @@  asm(
 	".global kretprobe_trampoline\n"
 	".type kretprobe_trampoline, @function\n"
 	"kretprobe_trampoline:\n"
-	/* We don't bother saving the ss register */
 #ifdef CONFIG_X86_64
-	"	pushq %rsp\n"
+	/* Push fake return address to tell the unwinder it's a kretprobe */
+	"	pushq $kretprobe_trampoline\n"
 	UNWIND_HINT_FUNC
+	/* Save the sp-8, this will be fixed later */
+	"	pushq %rsp\n"
 	"	pushfq\n"
 	SAVE_REGS_STRING
 	"	movq %rsp, %rdi\n"
 	"	call trampoline_handler\n"
-	/* Replace saved sp with true return address. */
-	"	movq %rax, 19*8(%rsp)\n"
 	RESTORE_REGS_STRING
+	"	addq $8, %rsp\n"
 	"	popfq\n"
 #else
-	"	pushl %esp\n"
+	/* Push fake return address to tell the unwinder it's a kretprobe */
+	"	pushl $kretprobe_trampoline\n"
 	UNWIND_HINT_FUNC
+	/* Save the sp-4, this will be fixed later */
+	"	pushl %esp\n"
 	"	pushfl\n"
 	SAVE_REGS_STRING
 	"	movl %esp, %eax\n"
 	"	call trampoline_handler\n"
-	/* Replace saved sp with true return address. */
-	"	movl %eax, 15*4(%esp)\n"
 	RESTORE_REGS_STRING
+	"	addl $4, %esp\n"
 	"	popfl\n"
 #endif
 	"	ret\n"
@@ -814,8 +817,10 @@  NOKPROBE_SYMBOL(kretprobe_trampoline);
 /*
  * Called from kretprobe_trampoline
  */
-__used __visible void *trampoline_handler(struct pt_regs *regs)
+__used __visible void trampoline_handler(struct pt_regs *regs)
 {
+	unsigned long *frame_pointer;
+
 	/* fixup registers */
 	regs->cs = __KERNEL_CS;
 #ifdef CONFIG_X86_32
@@ -823,8 +828,16 @@  __used __visible void *trampoline_handler(struct pt_regs *regs)
 #endif
 	regs->ip = (unsigned long)&kretprobe_trampoline;
 	regs->orig_ax = ~0UL;
+	regs->sp += sizeof(long);
+	frame_pointer = ((unsigned long *)&regs->sp) + 1;
 
-	return (void *)kretprobe_trampoline_handler(regs, &regs->sp);
+	/* Replace fake return address with real one. */
+	*frame_pointer = kretprobe_trampoline_handler(regs, frame_pointer);
+	/*
+	 * Move flags to sp so that kretprobe_trapmoline can return
+	 * right after popf.
+	 */
+	regs->sp = regs->flags;
 }
 NOKPROBE_SYMBOL(trampoline_handler);