diff mbox series

x86: kprobes: orc: Fix ORC walks in kretprobes

Message ID d72c62498ea0514e7b81a3eab5e8c1671137b9a0.1614902828.git.dxu@dxuuu.xyz (mailing list archive)
State Not Applicable
Delegated to: Netdev Maintainers
Headers show
Series x86: kprobes: orc: Fix ORC walks in kretprobes | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Daniel Xu March 5, 2021, 12:07 a.m. UTC
Getting a stack trace from inside a kretprobe used to work with frame
pointer stack walks. After the default unwinder was switched to ORC,
stack traces broke because ORC did not know how to skip the
`kretprobe_trampoline` "frame".

Frame based stack walks used to work with kretprobes because
`kretprobe_trampoline` does not set up a new call frame. Thus, the frame
pointer based unwinder could walk directly to the kretprobe'd caller.

For example, this stack is walked incorrectly with ORC + kretprobe:

    # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
    Attaching 1 probe...
    ^C

    @[
        kretprobe_trampoline+0
    ]: 1

After this patch, the stack is walked correctly:

    # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
    Attaching 1 probe...
    ^C

    @[
        kretprobe_trampoline+0
        __x64_sys_nanosleep+150
        do_syscall_64+51
        entry_SYSCALL_64_after_hwframe+68
    ]: 12

Fixes: fc72ae40e303 ("x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit")
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 arch/x86/kernel/unwind_orc.c | 53 +++++++++++++++++++++++++++++++++++-
 kernel/kprobes.c             |  8 +++---
 2 files changed, 56 insertions(+), 5 deletions(-)

Comments

Masami Hiramatsu (Google) March 5, 2021, 9:28 a.m. UTC | #1
Hi Daniel,

On Thu,  4 Mar 2021 16:07:52 -0800
Daniel Xu <dxu@dxuuu.xyz> wrote:

> Getting a stack trace from inside a kretprobe used to work with frame
> pointer stack walks. After the default unwinder was switched to ORC,
> stack traces broke because ORC did not know how to skip the
> `kretprobe_trampoline` "frame".
> 
> Frame based stack walks used to work with kretprobes because
> `kretprobe_trampoline` does not set up a new call frame. Thus, the frame
> pointer based unwinder could walk directly to the kretprobe'd caller.
> 
> For example, this stack is walked incorrectly with ORC + kretprobe:
> 
>     # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
>     Attaching 1 probe...
>     ^C
> 
>     @[
>         kretprobe_trampoline+0
>     ]: 1
> 
> After this patch, the stack is walked correctly:
> 
>     # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
>     Attaching 1 probe...
>     ^C
> 
>     @[
>         kretprobe_trampoline+0
>         __x64_sys_nanosleep+150
>         do_syscall_64+51
>         entry_SYSCALL_64_after_hwframe+68
>     ]: 12
> 
> Fixes: fc72ae40e303 ("x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit")
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>

OK, basically good, but this is messy, and doing much more than fixing issue.

> ---
>  arch/x86/kernel/unwind_orc.c | 53 +++++++++++++++++++++++++++++++++++-
>  kernel/kprobes.c             |  8 +++---
>  2 files changed, 56 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
> index 2a1d47f47eee..1b88d75e2e9e 100644
> --- a/arch/x86/kernel/unwind_orc.c
> +++ b/arch/x86/kernel/unwind_orc.c
> @@ -1,7 +1,9 @@
>  // SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kprobes.h>
>  #include <linux/objtool.h>
>  #include <linux/module.h>
>  #include <linux/sort.h>
> +#include <asm/kprobes.h>
>  #include <asm/ptrace.h>
>  #include <asm/stacktrace.h>
>  #include <asm/unwind.h>
> @@ -77,9 +79,11 @@ static struct orc_entry *orc_module_find(unsigned long ip)
>  }
>  #endif
>  
> -#ifdef CONFIG_DYNAMIC_FTRACE
> +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_KRETPROBES)
>  static struct orc_entry *orc_find(unsigned long ip);
> +#endif
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE
>  /*
>   * Ftrace dynamic trampolines do not have orc entries of their own.
>   * But they are copies of the ftrace entries that are static and
> @@ -117,6 +121,43 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip)
>  }
>  #endif
>  
> +#ifdef CONFIG_KRETPROBES
> +static struct orc_entry *orc_kretprobe_find(void)
> +{
> +	kprobe_opcode_t *correct_ret_addr = NULL;
> +	struct kretprobe_instance *ri = NULL;
> +	struct llist_node *node;
> +
> +	node = current->kretprobe_instances.first;
> +	while (node) {
> +		ri = container_of(node, struct kretprobe_instance, llist);
> +
> +		if ((void *)ri->ret_addr != &kretprobe_trampoline) {
> +			/*
> +			 * This is the real return address. Any other
> +			 * instances associated with this task are for
> +			 * other calls deeper on the call stack
> +			 */
> +			correct_ret_addr = ri->ret_addr;
> +			break;
> +		}
> +
> +
> +		node = node->next;
> +	}
> +
> +	if (!correct_ret_addr)
> +		return NULL;
> +
> +	return orc_find((unsigned long)correct_ret_addr);
> +}
> +#else
> +static struct orc_entry *orc_kretprobe_find(void)
> +{
> +	return NULL;
> +}
> +#endif

This code is too much depending on kretprobe internal implementation.
This should should be provided by kretprobe.

>  /*
>   * If we crash with IP==0, the last successfully executed instruction
>   * was probably an indirect function call with a NULL function pointer,
> @@ -148,6 +189,16 @@ static struct orc_entry *orc_find(unsigned long ip)
>  	if (ip == 0)
>  		return &null_orc_entry;
>  
> +	/*
> +	 * Kretprobe lookup -- must occur before vmlinux addresses as
> +	 * kretprobe_trampoline is in the symbol table.
> +	 */
> +	if (ip == (unsigned long) &kretprobe_trampoline) {
> +		orc = orc_kretprobe_find();
> +		if (orc)
> +			return orc;
> +	}

Here too. at least "ip == (unsigned long) &kretprobe_trampoline" should
be hidden by an inline function...

> +
>  	/* For non-init vmlinux addresses, use the fast lookup table: */
>  	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
>  		unsigned int idx, start, stop;
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 745f08fdd7a6..334c23d33451 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -1895,10 +1895,6 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
>  	BUG_ON(1);
>  
>  found:
> -	/* Unlink all nodes for this frame. */
> -	current->kretprobe_instances.first = node->next;
> -	node->next = NULL;
> -
>  	/* Run them..  */
>  	while (first) {
>  		ri = container_of(first, struct kretprobe_instance, llist);
> @@ -1917,6 +1913,10 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
>  		recycle_rp_inst(ri);
>  	}
>  
> +	/* Unlink all nodes for this frame. */
> +	current->kretprobe_instances.first = node->next;
> +	node->next = NULL;

Nack, this is a bit dangerous. We should unlink the chunk of kretprobe instances and
recycle it as I did in my patch, see below;

https://lore.kernel.org/bpf/20210304221947.5a177ce2e1e94314e57c38a4@kernel.org/

I would like to fix this issue in the generic part, not for x86 only.
Let me refresh my series for fixing it.

Thank you,
Masami Hiramatsu (Google) March 5, 2021, 10:58 a.m. UTC | #2
On Fri, 5 Mar 2021 18:28:06 +0900
Masami Hiramatsu <mhiramat@kernel.org> wrote:

> Hi Daniel,
> 
> On Thu,  4 Mar 2021 16:07:52 -0800
> Daniel Xu <dxu@dxuuu.xyz> wrote:
> 
> > Getting a stack trace from inside a kretprobe used to work with frame
> > pointer stack walks. After the default unwinder was switched to ORC,
> > stack traces broke because ORC did not know how to skip the
> > `kretprobe_trampoline` "frame".
> > 
> > Frame based stack walks used to work with kretprobes because
> > `kretprobe_trampoline` does not set up a new call frame. Thus, the frame
> > pointer based unwinder could walk directly to the kretprobe'd caller.
> > 
> > For example, this stack is walked incorrectly with ORC + kretprobe:
> > 
> >     # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
> >     Attaching 1 probe...
> >     ^C
> > 
> >     @[
> >         kretprobe_trampoline+0
> >     ]: 1
> > 
> > After this patch, the stack is walked correctly:
> > 
> >     # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
> >     Attaching 1 probe...
> >     ^C
> > 
> >     @[
> >         kretprobe_trampoline+0
> >         __x64_sys_nanosleep+150
> >         do_syscall_64+51
> >         entry_SYSCALL_64_after_hwframe+68
> >     ]: 12
> > 
> > Fixes: fc72ae40e303 ("x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit")
> > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> 
> OK, basically good, but this is messy, and doing much more than fixing issue.

BTW, is this a regression? or CONFIG_UNWINDER_ORC has this issue before?
It seems that the above commit just changed the default unwinder. This means
OCR stack unwinder has this bug before that commit. If you choose the 
CONFIG_UNWINDER_FRAME_POINTER, it might work again.

If so, it is not a regression. this need to be fixed, but ORC has this issue
from the original code.

Thank you,
Daniel Xu March 5, 2021, 7:25 p.m. UTC | #3
On Fri, Mar 05, 2021 at 07:58:09PM +0900, Masami Hiramatsu wrote:
> On Fri, 5 Mar 2021 18:28:06 +0900
> Masami Hiramatsu <mhiramat@kernel.org> wrote:
> 
> > Hi Daniel,
> > 
> > On Thu,  4 Mar 2021 16:07:52 -0800
> > Daniel Xu <dxu@dxuuu.xyz> wrote:
> > 
> > > Getting a stack trace from inside a kretprobe used to work with frame
> > > pointer stack walks. After the default unwinder was switched to ORC,
> > > stack traces broke because ORC did not know how to skip the
> > > `kretprobe_trampoline` "frame".
> > > 
> > > Frame based stack walks used to work with kretprobes because
> > > `kretprobe_trampoline` does not set up a new call frame. Thus, the frame
> > > pointer based unwinder could walk directly to the kretprobe'd caller.
> > > 
> > > For example, this stack is walked incorrectly with ORC + kretprobe:
> > > 
> > >     # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
> > >     Attaching 1 probe...
> > >     ^C
> > > 
> > >     @[
> > >         kretprobe_trampoline+0
> > >     ]: 1
> > > 
> > > After this patch, the stack is walked correctly:
> > > 
> > >     # bpftrace -e 'kretprobe:do_nanosleep { @[kstack] = count() }'
> > >     Attaching 1 probe...
> > >     ^C
> > > 
> > >     @[
> > >         kretprobe_trampoline+0
> > >         __x64_sys_nanosleep+150
> > >         do_syscall_64+51
> > >         entry_SYSCALL_64_after_hwframe+68
> > >     ]: 12
> > > 
> > > Fixes: fc72ae40e303 ("x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit")
> > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > 
> > OK, basically good, but this is messy, and doing much more than fixing issue.

Thanks for taking a look!

> BTW, is this a regression? or CONFIG_UNWINDER_ORC has this issue before?
> It seems that the above commit just changed the default unwinder. This means
> OCR stack unwinder has this bug before that commit.

I see your point -- I suppose it depends on point of view. Viewed from
userspace, a change in kernel defaults means that one kernel worked and
the next one didn't -- all without the user doing anything. Consider it
from the POV of a typical linux user who just takes whatever the distro
gives them and doesn't compile their own kernels.

From the kernel point of view, you're also right. ORC didn't regress, it
was always broken for this particular use case. But as a primarily
userspace developer, I would consider this a kernel regression.


> If you choose the CONFIG_UNWINDER_FRAME_POINTER, it might work again.

Yes, I've confirmed that switching to CONFIG_UNWINDER_FRAME_POINTER does
fix the issue. But it's a non-starter for production machines b/c the
perf regression is too significant.

Thanks,
Daniel
Josh Poimboeuf March 5, 2021, 7:32 p.m. UTC | #4
On Fri, Mar 05, 2021 at 11:25:15AM -0800, Daniel Xu wrote:
> > BTW, is this a regression? or CONFIG_UNWINDER_ORC has this issue before?
> > It seems that the above commit just changed the default unwinder. This means
> > OCR stack unwinder has this bug before that commit.
> 
> I see your point -- I suppose it depends on point of view. Viewed from
> userspace, a change in kernel defaults means that one kernel worked and
> the next one didn't -- all without the user doing anything. Consider it
> from the POV of a typical linux user who just takes whatever the distro
> gives them and doesn't compile their own kernels.
> 
> From the kernel point of view, you're also right. ORC didn't regress, it
> was always broken for this particular use case. But as a primarily
> userspace developer, I would consider this a kernel regression.

Either way, if the bug has always existed in the ORC unwinder, the Fixes
tag needs to reference the original ORC commit:

  Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder")

That makes it possible for stable kernels to identify the source of the
bug and corresponding fix.  Many people used the ORC unwinder before it
became the default.
Daniel Xu March 5, 2021, 8:45 p.m. UTC | #5
On Fri, Mar 05, 2021 at 01:32:44PM -0600, Josh Poimboeuf wrote:
> On Fri, Mar 05, 2021 at 11:25:15AM -0800, Daniel Xu wrote:
> > > BTW, is this a regression? or CONFIG_UNWINDER_ORC has this issue before?
> > > It seems that the above commit just changed the default unwinder. This means
> > > OCR stack unwinder has this bug before that commit.
> > 
> > I see your point -- I suppose it depends on point of view. Viewed from
> > userspace, a change in kernel defaults means that one kernel worked and
> > the next one didn't -- all without the user doing anything. Consider it
> > from the POV of a typical linux user who just takes whatever the distro
> > gives them and doesn't compile their own kernels.
> > 
> > From the kernel point of view, you're also right. ORC didn't regress, it
> > was always broken for this particular use case. But as a primarily
> > userspace developer, I would consider this a kernel regression.
> 
> Either way, if the bug has always existed in the ORC unwinder, the Fixes
> tag needs to reference the original ORC commit:
> 
>   Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder")
> 
> That makes it possible for stable kernels to identify the source of the
> bug and corresponding fix.  Many people used the ORC unwinder before it
> became the default.

Got it. I'll change it in the next version if we get to V2 (another
ongoing discussion in Masami's patchset).

Thanks,
Daniel
diff mbox series

Patch

diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 2a1d47f47eee..1b88d75e2e9e 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -1,7 +1,9 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kprobes.h>
 #include <linux/objtool.h>
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <asm/kprobes.h>
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
@@ -77,9 +79,11 @@  static struct orc_entry *orc_module_find(unsigned long ip)
 }
 #endif
 
-#ifdef CONFIG_DYNAMIC_FTRACE
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_KRETPROBES)
 static struct orc_entry *orc_find(unsigned long ip);
+#endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE
 /*
  * Ftrace dynamic trampolines do not have orc entries of their own.
  * But they are copies of the ftrace entries that are static and
@@ -117,6 +121,43 @@  static struct orc_entry *orc_ftrace_find(unsigned long ip)
 }
 #endif
 
+#ifdef CONFIG_KRETPROBES
+static struct orc_entry *orc_kretprobe_find(void)
+{
+	kprobe_opcode_t *correct_ret_addr = NULL;
+	struct kretprobe_instance *ri = NULL;
+	struct llist_node *node;
+
+	node = current->kretprobe_instances.first;
+	while (node) {
+		ri = container_of(node, struct kretprobe_instance, llist);
+
+		if ((void *)ri->ret_addr != &kretprobe_trampoline) {
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			correct_ret_addr = ri->ret_addr;
+			break;
+		}
+
+
+		node = node->next;
+	}
+
+	if (!correct_ret_addr)
+		return NULL;
+
+	return orc_find((unsigned long)correct_ret_addr);
+}
+#else
+static struct orc_entry *orc_kretprobe_find(void)
+{
+	return NULL;
+}
+#endif
+
 /*
  * If we crash with IP==0, the last successfully executed instruction
  * was probably an indirect function call with a NULL function pointer,
@@ -148,6 +189,16 @@  static struct orc_entry *orc_find(unsigned long ip)
 	if (ip == 0)
 		return &null_orc_entry;
 
+	/*
+	 * Kretprobe lookup -- must occur before vmlinux addresses as
+	 * kretprobe_trampoline is in the symbol table.
+	 */
+	if (ip == (unsigned long) &kretprobe_trampoline) {
+		orc = orc_kretprobe_find();
+		if (orc)
+			return orc;
+	}
+
 	/* For non-init vmlinux addresses, use the fast lookup table: */
 	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
 		unsigned int idx, start, stop;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 745f08fdd7a6..334c23d33451 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1895,10 +1895,6 @@  unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
 	BUG_ON(1);
 
 found:
-	/* Unlink all nodes for this frame. */
-	current->kretprobe_instances.first = node->next;
-	node->next = NULL;
-
 	/* Run them..  */
 	while (first) {
 		ri = container_of(first, struct kretprobe_instance, llist);
@@ -1917,6 +1913,10 @@  unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
 		recycle_rp_inst(ri);
 	}
 
+	/* Unlink all nodes for this frame. */
+	current->kretprobe_instances.first = node->next;
+	node->next = NULL;
+
 	return (unsigned long)correct_ret_addr;
 }
 NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)