diff mbox series

[v5,11/34] function_graph: Have the instances use their own ftrace_ops for filtering

Message ID 170290522555.220107.1435543481968270637.stgit@devnote2 (mailing list archive)
State Not Applicable
Series tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat

Commit Message

Masami Hiramatsu (Google) Dec. 18, 2023, 1:13 p.m. UTC
From: Steven Rostedt (VMware) <rostedt@goodmis.org>

Allow instances to have their own ftrace_ops as part of the fgraph_ops, so
that the function_graph tracer filters on the set_ftrace_filter file of the
instance and not on the top instance.

This also changes how function_graph handles multiple instances on the
shadow stack. Previously, ARRAY type entries were used to record which
instance is enabled; this patch turns that entry into a bitmap of the
fgraph_array's indexes. Previously, function_graph_enter() expected a
callback from prepare_ftrace_return(), which is invoked only once when
enabled. This patch introduces a separate ftrace_ops for each fgraph
instance, and those are called from ftrace_graph_func() one by one. Thus we
can no longer loop over fgraph_array(), and instead need to reuse the
ret_stack pushed by the previous instance. Finding that ret_stack is easy
because we can check ret_stack->func, but that is not enough for the
self-recursive tail-call case. For that, fgraph checks whether the bit for
this instance is already set in the bitmap entry (if it is, the entry
belongs to the previous tail call).
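
For reference, the bitmap entry layout described above can be sketched as
follows. The constants and the encoding are taken from the
kernel/trace/fgraph.c hunk quoted later in this thread; the snippet itself
is only an illustration, not part of the change:

	/*
	 * Illustration only: the bitmap-type word pushed on the shadow stack
	 * when the fgraph_array instances at index 0 and 3 traced this
	 * function entry (see set_fgraph_index_bitmap() in the fgraph.c hunk).
	 */
	unsigned long word = ((BIT(3) | BIT(0)) << FGRAPH_INDEX_SHIFT) |
			     (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) |
			     FGRAPH_RET_INDEX;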

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v4:
  - Simplify get_ret_stack() sanity check and use WARN_ON_ONCE() for
    obviously wrong value.
  - Do not check ret == return_to_handler but always read the previous
    ret_stack in ftrace_push_return_trace() to check it is reusable.
  - Set the bit 0 of the bitmap entry always in function_graph_enter()
    because it uses bit 0 to check re-usability.
  - Fix to ensure the ret_stack entry is bitmap type when checking the
    bitmap.
 Changes in v3:
  - Pass current fgraph_ops to the new entry handler
   (function_graph_enter_ops) if fgraph uses ftrace.
  - Add fgraph_ops::idx in this patch.
  - Replace the array type with the bitmap type so that it can record
    which fgraph is called.
  - Fix some helper functions to use the passed task_struct instead of current.
  - Reduce the ret-index size to 1024 words.
  - Make the ret-index directly point to the ret_stack.
  - Fix ftrace_graph_ret_addr() to handle tail-call case correctly.
 Changes in v2:
  - Use ftrace_graph_func and FTRACE_OPS_GRAPH_STUB instead of
    ftrace_stub and FTRACE_OPS_FL_STUB for new ftrace based fgraph.
---
 arch/arm64/kernel/ftrace.c           |   19 ++
 arch/x86/kernel/ftrace.c             |   19 ++
 include/linux/ftrace.h               |    7 +
 kernel/trace/fgraph.c                |  369 ++++++++++++++++++++--------------
 kernel/trace/ftrace.c                |    6 -
 kernel/trace/trace.h                 |   16 +
 kernel/trace/trace_functions.c       |    2 
 kernel/trace/trace_functions_graph.c |    8 +
 8 files changed, 277 insertions(+), 169 deletions(-)

Comments

Masami Hiramatsu (Google) Dec. 26, 2023, 12:20 a.m. UTC | #1
Hi,

On Mon, 18 Dec 2023 22:13:46 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:

> @@ -408,15 +395,51 @@ int function_graph_enter(unsigned long ret, unsigned long func,
>  	return -EBUSY;
>  }
>  
> +/* This is called from ftrace_graph_func() via ftrace */
> +int function_graph_enter_ops(unsigned long ret, unsigned long func,
> +			     unsigned long frame_pointer, unsigned long *retp,
> +			     struct fgraph_ops *gops)
> +{
> +	struct ftrace_graph_ent trace;
> +	int index;
> +	int type;
> +

Here, I found that this needs to check whether fgraph_array[gops->idx] is
still valid. When unregistering the fgraph, fgraph_array[idx] is cleared
(set to fgraph_stub) and ftrace is disabled, so there is still a chance to
hit this path, and it will mess up the shadow stack because gops->idx is
already invalid.
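
A minimal sketch of such a check (hypothetical, not in the posted patch; it
would sit at the top of function_graph_enter_ops() and only uses names that
already appear in the quoted code):

	/*
	 * Hypothetical guard: bail out if this gops has already been
	 * unregistered and its fgraph_array slot no longer points back
	 * to it (the slot is set to fgraph_stub on unregister).
	 */
	if (gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
	    fgraph_array[gops->idx] != gops)
		return -EBUSY;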

Thank you,
Mark Rutland Jan. 5, 2024, 5:09 p.m. UTC | #2
On Mon, Dec 18, 2023 at 10:13:46PM +0900, Masami Hiramatsu (Google) wrote:
> From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> 
> Allow for instances to have their own ftrace_ops part of the fgraph_ops
> that makes the funtion_graph tracer filter on the set_ftrace_filter file
> of the instance and not the top instance.
> 
> This also change how the function_graph handles multiple instances on the
> shadow stack. Previously we use ARRAY type entries to record which one
> is enabled, and this makes it a bitmap of the fgraph_array's indexes.
> Previous function_graph_enter() expects calling back from
> prepare_ftrace_return() function which is called back only once if it is
> enabled. But this introduces different ftrace_ops for each fgraph
> instance and those are called from ftrace_graph_func() one by one. Thus
> we can not loop on the fgraph_array(), and need to reuse the ret_stack
> pushed by the previous instance. Finding the ret_stack is easy because
> we can check the ret_stack->func. But that is not enough for the self-
> recursive tail-call case. Thus fgraph uses the bitmap entry to find it
> is already set (this means that entry is for previous tail call).
> 
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

As a heads-up, while testing the topic/fprobe-on-fgraph branch on arm64, I get
a warning which bisects down to this commit:

| Testing tracer function_graph: 
| ------------[ cut here ]------------
| WARNING: CPU: 2 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x3c0/0x3d8
| Modules linked in:
| CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc2-00026-gea1e68a341c2 #12
| Hardware name: linux,dummy-virt (DT)
| pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : arch_stack_walk+0x3c0/0x3d8
| lr : arch_stack_walk+0x260/0x3d8
| sp : ffff80008318be00
| x29: ffff80008318be00 x28: ffff000003c0ae80 x27: 0000000000000000
| x26: 0000000000000000 x25: ffff000003c0ae80 x24: 0000000000000000
| x23: ffff8000800234c8 x22: ffff80008002dc30 x21: ffff800080035d10
| x20: ffff80008318bee8 x19: ffff800080023460 x18: ffff800083453c68
| x17: 0000000000000000 x16: ffff800083188000 x15: 000000008ccc5058
| x14: 0000000000000004 x13: ffff800082b8c4f0 x12: 0000000000000000
| x11: ffff800081fba9b0 x10: ffff80008318bff0 x9 : ffff800080010798
| x8 : ffff80008002dc30 x7 : ffff000003c0ae80 x6 : 00000000ffffffff
| x5 : 0000000000000000 x4 : ffff8000832a3c18 x3 : ffff80008318bff0
| x2 : ffff80008002dc30 x1 : ffff80008002dc30 x0 : ffff80008002dc30
| Call trace:
|  arch_stack_walk+0x3c0/0x3d8
|  return_address+0x40/0x80
|  trace_hardirqs_on+0x8c/0x198
|  __do_softirq+0xe8/0x440
| ---[ end trace 0000000000000000 ]---

That's a warning in arm64's unwind_recover_return_address() function, which
fires when ftrace_graph_ret_addr() finds return_to_handler:

	if (state->task->ret_stack &&
	    (state->pc == (unsigned long)return_to_handler)) {
		unsigned long orig_pc;
		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
						(void *)state->fp);
		if (WARN_ON_ONCE(state->pc == orig_pc))
			return -EINVAL;
		state->pc = orig_pc;
	}

The rationale there is that since tail calls are (currently) disabled on arm64,
the only reason for ftrace_graph_ret_addr() to return return_to_handler is when
it fails to find the original return address.

Does this change make it legitimate for ftrace_graph_ret_addr() to return
return_to_handler in other cases, or is that a bug?

Either way, we'll need *some* way to recover the original return address...

Mark.

> ---
>  Changes in v4:
>   - Simplify get_ret_stack() sanity check and use WARN_ON_ONCE() for
>     obviously wrong value.
>   - Do not check ret == return_to_handler but always read the previous
>     ret_stack in ftrace_push_return_trace() to check it is reusable.
>   - Set the bit 0 of the bitmap entry always in function_graph_enter()
>     because it uses bit 0 to check re-usability.
>   - Fix to ensure the ret_stack entry is bitmap type when checking the
>     bitmap.
>  Changes in v3:
>   - Pass current fgraph_ops to the new entry handler
>    (function_graph_enter_ops) if fgraph use ftrace.
>   - Add fgraph_ops::idx in this patch.
>   - Replace the array type with the bitmap type so that it can record
>     which fgraph is called.
>   - Fix some helper function to use passed task_struct instead of current.
>   - Reduce the ret-index size to 1024 words.
>   - Make the ret-index directly points the ret_stack.
>   - Fix ftrace_graph_ret_addr() to handle tail-call case correctly.
>  Changes in v2:
>   - Use ftrace_graph_func and FTRACE_OPS_GRAPH_STUB instead of
>     ftrace_stub and FTRACE_OPS_FL_STUB for new ftrace based fgraph.
> ---
>  arch/arm64/kernel/ftrace.c           |   19 ++
>  arch/x86/kernel/ftrace.c             |   19 ++
>  include/linux/ftrace.h               |    7 +
>  kernel/trace/fgraph.c                |  369 ++++++++++++++++++++--------------
>  kernel/trace/ftrace.c                |    6 -
>  kernel/trace/trace.h                 |   16 +
>  kernel/trace/trace_functions.c       |    2 
>  kernel/trace/trace_functions_graph.c |    8 +
>  8 files changed, 277 insertions(+), 169 deletions(-)
> 
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index a650f5e11fc5..205937e04ece 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -481,7 +481,24 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
>  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>  		       struct ftrace_ops *op, struct ftrace_regs *fregs)
>  {
> -	prepare_ftrace_return(ip, &fregs->lr, fregs->fp);
> +	unsigned long *parent = &fregs->lr;
> +	struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
> +	int bit;
> +
> +	if (unlikely(ftrace_graph_is_dead()))
> +		return;
> +
> +	if (unlikely(atomic_read(&current->tracing_graph_pause)))
> +		return;
> +
> +	bit = ftrace_test_recursion_trylock(ip, *parent);
> +	if (bit < 0)
> +		return;
> +
> +	if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
> +		*parent = (unsigned long)&return_to_handler;
> +
> +	ftrace_test_recursion_unlock(bit);
>  }
>  #else
>  /*
> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index 12df54ff0e81..845e29b4254f 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -657,9 +657,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>  		       struct ftrace_ops *op, struct ftrace_regs *fregs)
>  {
>  	struct pt_regs *regs = &fregs->regs;
> -	unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
> +	unsigned long *parent = (unsigned long *)kernel_stack_pointer(regs);
> +	struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
> +	int bit;
> +
> +	if (unlikely(ftrace_graph_is_dead()))
> +		return;
> +
> +	if (unlikely(atomic_read(&current->tracing_graph_pause)))
> +		return;
>  
> -	prepare_ftrace_return(ip, (unsigned long *)stack, 0);
> +	bit = ftrace_test_recursion_trylock(ip, *parent);
> +	if (bit < 0)
> +		return;
> +
> +	if (!function_graph_enter_ops(*parent, ip, 0, parent, gops))
> +		*parent = (unsigned long)&return_to_handler;
> +
> +	ftrace_test_recursion_unlock(bit);
>  }
>  #endif
>  
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 7b08169aa51d..c431a33fe789 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -1070,7 +1070,9 @@ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph
>  struct fgraph_ops {
>  	trace_func_graph_ent_t		entryfunc;
>  	trace_func_graph_ret_t		retfunc;
> +	struct ftrace_ops		ops; /* for the hash lists */
>  	void				*private;
> +	int				idx;
>  };
>  
>  /*
> @@ -1104,6 +1106,11 @@ extern int
>  function_graph_enter(unsigned long ret, unsigned long func,
>  		     unsigned long frame_pointer, unsigned long *retp);
>  
> +extern int
> +function_graph_enter_ops(unsigned long ret, unsigned long func,
> +			 unsigned long frame_pointer, unsigned long *retp,
> +			 struct fgraph_ops *gops);
> +
>  struct ftrace_ret_stack *
>  ftrace_graph_get_ret_stack(struct task_struct *task, int idx);
>  
> diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
> index 62c35d6d95f9..6f537ebd3ed7 100644
> --- a/kernel/trace/fgraph.c
> +++ b/kernel/trace/fgraph.c
> @@ -7,6 +7,7 @@
>   *
>   * Highly modified by Steven Rostedt (VMware).
>   */
> +#include <linux/bits.h>
>  #include <linux/jump_label.h>
>  #include <linux/suspend.h>
>  #include <linux/ftrace.h>
> @@ -17,22 +18,15 @@
>  #include "ftrace_internal.h"
>  #include "trace.h"
>  
> -#ifdef CONFIG_DYNAMIC_FTRACE
> -#define ASSIGN_OPS_HASH(opsname, val) \
> -	.func_hash		= val, \
> -	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
> -#else
> -#define ASSIGN_OPS_HASH(opsname, val)
> -#endif
> -
>  #define FGRAPH_RET_SIZE sizeof(struct ftrace_ret_stack)
>  #define FGRAPH_RET_INDEX (FGRAPH_RET_SIZE / sizeof(long))
>  
>  /*
>   * On entry to a function (via function_graph_enter()), a new ftrace_ret_stack
> - * is allocated on the task's ret_stack, then each fgraph_ops on the
> - * fgraph_array[]'s entryfunc is called and if that returns non-zero, the
> - * index into the fgraph_array[] for that fgraph_ops is added to the ret_stack.
> + * is allocated on the task's ret_stack with indexes entry, then each
> + * fgraph_ops on the fgraph_array[]'s entryfunc is called and if that returns
> + * non-zero, the index into the fgraph_array[] for that fgraph_ops is recorded
> + * on the indexes entry as a bit flag.
>   * As the associated ftrace_ret_stack saved for those fgraph_ops needs to
>   * be found, the index to it is also added to the ret_stack along with the
>   * index of the fgraph_array[] to each fgraph_ops that needs their retfunc
> @@ -42,61 +36,59 @@
>   * to the last ftrace_ret_stack saved. All references to the
>   * ftrace_ret_stack has the format of:
>   *
> - * bits:  0 - 13	Index in words from the previous ftrace_ret_stack
> - * bits: 14 - 15	Type of storage
> + * bits:  0 -  9	offset in words from the previous ftrace_ret_stack
> + *			(bitmap type should have FGRAPH_RET_INDEX always)
> + * bits: 10 - 11	Type of storage
>   *			  0 - reserved
> - *			  1 - fgraph_array index
> - * For fgraph_array_index:
> - *  bits: 16 - 23	The fgraph_ops fgraph_array index
> + *			  1 - bitmap of fgraph_array index
> + *
> + * For bitmap of fgraph_array index
> + *  bits: 12 - 27	The bitmap of fgraph_ops fgraph_array index
>   *
>   * That is, at the end of function_graph_enter, if the first and forth
>   * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
>   * on the return of the function being traced, this is what will be on the
>   * task's shadow ret_stack: (the stack grows upward)
>   *
> - * |                                  | <- task->curr_ret_stack
> - * +----------------------------------+
> - * | (3 << FGRAPH_ARRAY_SHIFT)|(2)    | ( 3 for index of fourth fgraph_ops)
> - * +----------------------------------+
> - * | (0 << FGRAPH_ARRAY_SHIFT)|(1)    | ( 0 for index of first fgraph_ops)
> - * +----------------------------------+
> - * | struct ftrace_ret_stack          |
> - * |   (stores the saved ret pointer) |
> - * +----------------------------------+
> - * |             (X) | (N)            | ( N words away from previous ret_stack)
> - * |                                  |
> + * |                                            | <- task->curr_ret_stack
> + * +--------------------------------------------+
> + * | bitmap_type(bitmap:(BIT(3)|BIT(0)),        |
> + * |             offset:FGRAPH_RET_INDEX)       | <- the offset is from here
> + * +--------------------------------------------+
> + * | struct ftrace_ret_stack                    |
> + * |   (stores the saved ret pointer)           | <- the offset points here
> + * +--------------------------------------------+
> + * |                 (X) | (N)                  | ( N words away from
> + * |                                            |   previous ret_stack)
>   *
>   * If a backtrace is required, and the real return pointer needs to be
>   * fetched, then it looks at the task's curr_ret_stack index, if it
> - * is greater than zero, it would subtact one, and then mask the value
> - * on the ret_stack by FGRAPH_RET_INDEX_MASK and subtract FGRAPH_RET_INDEX
> - * from that, to get the index of the ftrace_ret_stack structure stored
> - * on the shadow stack.
> + * is greater than zero (reserved, or right before poped), it would mask
> + * the value by FGRAPH_RET_INDEX_MASK to get the offset index of the
> + * ftrace_ret_stack structure stored on the shadow stack.
>   */
>  
> -#define FGRAPH_RET_INDEX_SIZE	14
> -#define FGRAPH_RET_INDEX_MASK	((1 << FGRAPH_RET_INDEX_SIZE) - 1)
> -
> +#define FGRAPH_RET_INDEX_SIZE	10
> +#define FGRAPH_RET_INDEX_MASK	GENMASK(FGRAPH_RET_INDEX_SIZE - 1, 0)
>  
>  #define FGRAPH_TYPE_SIZE	2
> -#define FGRAPH_TYPE_MASK	((1 << FGRAPH_TYPE_SIZE) - 1)
> +#define FGRAPH_TYPE_MASK	GENMASK(FGRAPH_TYPE_SIZE - 1, 0)
>  #define FGRAPH_TYPE_SHIFT	FGRAPH_RET_INDEX_SIZE
>  
>  enum {
>  	FGRAPH_TYPE_RESERVED	= 0,
> -	FGRAPH_TYPE_ARRAY	= 1,
> +	FGRAPH_TYPE_BITMAP	= 1,
>  };
>  
> -#define FGRAPH_ARRAY_SIZE	16
> -#define FGRAPH_ARRAY_MASK	((1 << FGRAPH_ARRAY_SIZE) - 1)
> -#define FGRAPH_ARRAY_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)
> +#define FGRAPH_INDEX_SIZE	16
> +#define FGRAPH_INDEX_MASK	GENMASK(FGRAPH_INDEX_SIZE - 1, 0)
> +#define FGRAPH_INDEX_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)
>  
>  /* Currently the max stack index can't be more than register callers */
> -#define FGRAPH_MAX_INDEX	FGRAPH_ARRAY_SIZE
> +#define FGRAPH_MAX_INDEX	(FGRAPH_INDEX_SIZE + FGRAPH_RET_INDEX)
> +
> +#define FGRAPH_ARRAY_SIZE	FGRAPH_INDEX_SIZE
>  
> -#define FGRAPH_FRAME_SIZE (FGRAPH_RET_SIZE + FGRAPH_ARRAY_SIZE * (sizeof(long)))
> -#define FGRAPH_FRAME_INDEX (ALIGN(FGRAPH_FRAME_SIZE,		\
> -				  sizeof(long)) / sizeof(long))
>  #define SHADOW_STACK_SIZE (PAGE_SIZE)
>  #define SHADOW_STACK_INDEX (SHADOW_STACK_SIZE / sizeof(long))
>  /* Leave on a buffer at the end */
> @@ -113,19 +105,36 @@ static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
>  
>  static inline int get_ret_stack_index(struct task_struct *t, int offset)
>  {
> -	return current->ret_stack[offset] & FGRAPH_RET_INDEX_MASK;
> +	return t->ret_stack[offset] & FGRAPH_RET_INDEX_MASK;
>  }
>  
>  static inline int get_fgraph_type(struct task_struct *t, int offset)
>  {
> -	return (current->ret_stack[offset] >> FGRAPH_TYPE_SHIFT) &
> -		FGRAPH_TYPE_MASK;
> +	return (t->ret_stack[offset] >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
> +}
> +
> +static inline unsigned long
> +get_fgraph_index_bitmap(struct task_struct *t, int offset)
> +{
> +	return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
>  }
>  
> -static inline int get_fgraph_array(struct task_struct *t, int offset)
> +static inline void
> +set_fgraph_index_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
>  {
> -	return (current->ret_stack[offset] >> FGRAPH_ARRAY_SHIFT) &
> -		FGRAPH_ARRAY_MASK;
> +	t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
> +		(FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
> +}
> +
> +static inline bool is_fgraph_index_set(struct task_struct *t, int offset, int idx)
> +{
> +	return !!(get_fgraph_index_bitmap(t, offset) & BIT(idx));
> +}
> +
> +static inline void
> +add_fgraph_index_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
> +{
> +	t->ret_stack[offset] |= (bitmap << FGRAPH_INDEX_SHIFT);
>  }
>  
>  /* ftrace_graph_entry set to this to tell some archs to run function graph */
> @@ -160,17 +169,14 @@ get_ret_stack(struct task_struct *t, int offset, int *index)
>  
>  	BUILD_BUG_ON(FGRAPH_RET_SIZE % sizeof(long));
>  
> -	if (offset <= 0)
> +	if (unlikely(offset <= 0))
>  		return NULL;
>  
> -	idx = get_ret_stack_index(t, offset - 1);
> -
> -	if (idx <= 0 || idx > FGRAPH_MAX_INDEX)
> +	idx = get_ret_stack_index(t, --offset);
> +	if (WARN_ON_ONCE(idx <= 0 || idx > offset))
>  		return NULL;
>  
> -	offset -= idx + FGRAPH_RET_INDEX;
> -	if (offset < 0)
> -		return NULL;
> +	offset -= idx;
>  
>  	*index = offset;
>  	return RET_STACK(t, offset);
> @@ -231,10 +237,12 @@ void ftrace_graph_stop(void)
>  /* Add a function return address to the trace stack on thread info.*/
>  static int
>  ftrace_push_return_trace(unsigned long ret, unsigned long func,
> -			 unsigned long frame_pointer, unsigned long *retp)
> +			 unsigned long frame_pointer, unsigned long *retp,
> +			 int fgraph_idx)
>  {
>  	struct ftrace_ret_stack *ret_stack;
>  	unsigned long long calltime;
> +	unsigned long val;
>  	int index;
>  
>  	if (unlikely(ftrace_graph_is_dead()))
> @@ -243,6 +251,21 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  	if (!current->ret_stack)
>  		return -EBUSY;
>  
> +	/*
> +	 * At first, check whether the previous fgraph callback is pushed by
> +	 * the fgraph on the same function entry.
> +	 * But if @func is the self tail-call function, we also need to ensure
> +	 * the ret_stack is not for the previous call by checking whether the
> +	 * bit of @fgraph_idx is set or not.
> +	 */
> +	ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
> +	if (ret_stack && ret_stack->func == func &&
> +	    get_fgraph_type(current, index + FGRAPH_RET_INDEX) == FGRAPH_TYPE_BITMAP &&
> +	    !is_fgraph_index_set(current, index + FGRAPH_RET_INDEX, fgraph_idx))
> +		return index + FGRAPH_RET_INDEX;
> +
> +	val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
> +
>  	BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
>  
>  	/*
> @@ -252,17 +275,19 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  	smp_rmb();
>  
>  	/* The return trace stack is full */
> -	if (current->curr_ret_stack >= SHADOW_STACK_MAX_INDEX) {
> +	if (current->curr_ret_stack + FGRAPH_RET_INDEX >= SHADOW_STACK_MAX_INDEX) {
>  		atomic_inc(&current->trace_overrun);
>  		return -EBUSY;
>  	}
>  
>  	calltime = trace_clock_local();
>  
> -	index = current->curr_ret_stack;
> -	/* ret offset = 1 ; type = reserved */
> -	current->ret_stack[index + FGRAPH_RET_INDEX] = 1;
> +	index = READ_ONCE(current->curr_ret_stack);
>  	ret_stack = RET_STACK(current, index);
> +	index += FGRAPH_RET_INDEX;
> +
> +	/* ret offset = FGRAPH_RET_INDEX ; type = reserved */
> +	current->ret_stack[index] = val;
>  	ret_stack->ret = ret;
>  	/*
>  	 * The unwinders expect curr_ret_stack to point to either zero
> @@ -278,7 +303,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  	 * at least a correct index!
>  	 */
>  	barrier();
> -	current->curr_ret_stack += FGRAPH_RET_INDEX + 1;
> +	current->curr_ret_stack = index + 1;
>  	/*
>  	 * This next barrier is to ensure that an interrupt coming in
>  	 * will not corrupt what we are about to write.
> @@ -286,7 +311,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  	barrier();
>  
>  	/* Still keep it reserved even if an interrupt came in */
> -	current->ret_stack[index + FGRAPH_RET_INDEX] = 1;
> +	current->ret_stack[index] = val;
>  
>  	ret_stack->ret = ret;
>  	ret_stack->func = func;
> @@ -297,7 +322,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
>  	ret_stack->retp = retp;
>  #endif
> -	return 0;
> +	return index;
>  }
>  
>  /*
> @@ -314,15 +339,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
>  # define MCOUNT_INSN_SIZE 0
>  #endif
>  
> +/* If the caller does not use ftrace, call this function. */
>  int function_graph_enter(unsigned long ret, unsigned long func,
>  			 unsigned long frame_pointer, unsigned long *retp)
>  {
>  	struct ftrace_graph_ent trace;
> -	int offset;
> -	int start;
> -	int type;
> -	int val;
> -	int cnt = 0;
> +	unsigned long bitmap = 0;
> +	int index;
>  	int i;
>  
>  #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> @@ -337,69 +360,33 @@ int function_graph_enter(unsigned long ret, unsigned long func,
>  		return -EBUSY;
>  #endif
>  
> -	if (!ftrace_ops_test(&global_ops, func, NULL))
> -		return -EBUSY;
> -
>  	trace.func = func;
>  	trace.depth = ++current->curr_ret_depth;
>  
> -	if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
> +	index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
> +	if (index < 0)
>  		goto out;
>  
> -	/* Use start for the distance to ret_stack (skipping over reserve) */
> -	start = offset = current->curr_ret_stack - 2;
> -
>  	for (i = 0; i < fgraph_array_cnt; i++) {
>  		struct fgraph_ops *gops = fgraph_array[i];
>  
>  		if (gops == &fgraph_stub)
>  			continue;
>  
> -		if ((offset == start) &&
> -		    (current->curr_ret_stack >= SHADOW_STACK_INDEX - 1)) {
> -			atomic_inc(&current->trace_overrun);
> -			break;
> -		}
> -		if (fgraph_array[i]->entryfunc(&trace, fgraph_array[i])) {
> -			offset = current->curr_ret_stack;
> -			/* Check the top level stored word */
> -			type = get_fgraph_type(current, offset - 1);
> -
> -			val = (i << FGRAPH_ARRAY_SHIFT) |
> -				(FGRAPH_TYPE_ARRAY << FGRAPH_TYPE_SHIFT) |
> -				((offset - start) - 1);
> -
> -			/* We can reuse the top word if it is reserved */
> -			if (type == FGRAPH_TYPE_RESERVED) {
> -				current->ret_stack[offset - 1] = val;
> -				cnt++;
> -				continue;
> -			}
> -			val++;
> -
> -			current->ret_stack[offset] = val;
> -			/*
> -			 * Write the value before we increment, so that
> -			 * if an interrupt comes in after we increment
> -			 * it will still see the value and skip over
> -			 * this.
> -			 */
> -			barrier();
> -			current->curr_ret_stack++;
> -			/*
> -			 * Have to write again, in case an interrupt
> -			 * came in before the increment and after we
> -			 * wrote the value.
> -			 */
> -			barrier();
> -			current->ret_stack[offset] = val;
> -			cnt++;
> -		}
> +		if (ftrace_ops_test(&gops->ops, func, NULL) &&
> +		    gops->entryfunc(&trace, gops))
> +			bitmap |= BIT(i);
>  	}
>  
> -	if (!cnt)
> +	if (!bitmap)
>  		goto out_ret;
>  
> +	/*
> +	 * Since this function uses fgraph_idx = 0 as a tail-call checking
> +	 * flag, set that bit always.
> +	 */
> +	set_fgraph_index_bitmap(current, index, bitmap | BIT(0));
> +
>  	return 0;
>   out_ret:
>  	current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
> @@ -408,15 +395,51 @@ int function_graph_enter(unsigned long ret, unsigned long func,
>  	return -EBUSY;
>  }
>  
> +/* This is called from ftrace_graph_func() via ftrace */
> +int function_graph_enter_ops(unsigned long ret, unsigned long func,
> +			     unsigned long frame_pointer, unsigned long *retp,
> +			     struct fgraph_ops *gops)
> +{
> +	struct ftrace_graph_ent trace;
> +	int index;
> +	int type;
> +
> +
> +	/* Use start for the distance to ret_stack (skipping over reserve) */
> +	index = ftrace_push_return_trace(ret, func, frame_pointer, retp, gops->idx);
> +	if (index < 0)
> +		return index;
> +	type = get_fgraph_type(current, index);
> +
> +	/* This is the first ret_stack for this fentry */
> +	if (type == FGRAPH_TYPE_RESERVED)
> +		++current->curr_ret_depth;
> +
> +	trace.func = func;
> +	trace.depth = current->curr_ret_depth;
> +	if (gops->entryfunc(&trace, gops)) {
> +		if (type == FGRAPH_TYPE_RESERVED)
> +			set_fgraph_index_bitmap(current, index, BIT(gops->idx));
> +		else
> +			add_fgraph_index_bitmap(current, index, BIT(gops->idx));
> +		return 0;
> +	}
> +
> +	if (type == FGRAPH_TYPE_RESERVED) {
> +		current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
> +		current->curr_ret_depth--;
> +	}
> +	return -EBUSY;
> +}
> +
>  /* Retrieve a function return address to the trace stack on thread info.*/
>  static struct ftrace_ret_stack *
>  ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
> -			unsigned long frame_pointer)
> +			unsigned long frame_pointer, int *index)
>  {
>  	struct ftrace_ret_stack *ret_stack;
> -	int index;
>  
> -	ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
> +	ret_stack = get_ret_stack(current, current->curr_ret_stack, index);
>  
>  	if (unlikely(!ret_stack)) {
>  		ftrace_graph_stop();
> @@ -455,6 +478,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
>  	}
>  #endif
>  
> +	*index += FGRAPH_RET_INDEX;
>  	*ret = ret_stack->ret;
>  	trace->func = ret_stack->func;
>  	trace->calltime = ret_stack->calltime;
> @@ -507,13 +531,12 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
>  {
>  	struct ftrace_ret_stack *ret_stack;
>  	struct ftrace_graph_ret trace;
> +	unsigned long bitmap;
>  	unsigned long ret;
> -	int offset;
>  	int index;
> -	int idx;
>  	int i;
>  
> -	ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer);
> +	ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &index);
>  
>  	if (unlikely(!ret_stack)) {
>  		ftrace_graph_stop();
> @@ -527,16 +550,17 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
>  	trace.retval = fgraph_ret_regs_return_value(ret_regs);
>  #endif
>  
> -	offset = current->curr_ret_stack - 1;
> -	index = get_ret_stack_index(current, offset);
> +	bitmap = get_fgraph_index_bitmap(current, index);
> +	for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
> +		struct fgraph_ops *gops = fgraph_array[i];
>  
> -	/* index has to be at least one! Optimize for it */
> -	i = 0;
> -	do {
> -		idx = get_fgraph_array(current, offset - i);
> -		fgraph_array[idx]->retfunc(&trace, fgraph_array[idx]);
> -		i++;
> -	} while (i < index);
> +		if (!(bitmap & BIT(i)))
> +			continue;
> +		if (gops == &fgraph_stub)
> +			continue;
> +
> +		gops->retfunc(&trace, gops);
> +	}
>  
>  	/*
>  	 * The ftrace_graph_return() may still access the current
> @@ -544,7 +568,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
>  	 * curr_ret_stack is after that.
>  	 */
>  	barrier();
> -	current->curr_ret_stack -= index + FGRAPH_RET_INDEX;
> +	current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
>  	current->curr_ret_depth--;
>  	return ret;
>  }
> @@ -622,7 +646,17 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
>  		ret_stack = get_ret_stack(current, i, &i);
>  		if (!ret_stack)
>  			break;
> -		if (ret_stack->retp == retp)
> +		/*
> +		 * For the tail-call, there would be 2 or more ftrace_ret_stacks on
> +		 * the ret_stack, which records "return_to_handler" as the return
> +		 * address excpt for the last one.
> +		 * But on the real stack, there should be 1 entry because tail-call
> +		 * reuses the return address on the stack and jump to the next function.
> +		 * Thus we will continue to find real return address.
> +		 */
> +		if (ret_stack->retp == retp &&
> +		    ret_stack->ret !=
> +		    (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
>  			return ret_stack->ret;
>  	}
>  
> @@ -645,6 +679,9 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
>  	i = *idx;
>  	do {
>  		ret_stack = get_ret_stack(task, task_idx, &task_idx);
> +		if (ret_stack && ret_stack->ret ==
> +		    (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
> +			continue;
>  		i--;
>  	} while (i >= 0 && ret_stack);
>  
> @@ -655,17 +692,25 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
>  }
>  #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
>  
> -static struct ftrace_ops graph_ops = {
> -	.func			= ftrace_graph_func,
> -	.flags			= FTRACE_OPS_FL_INITIALIZED |
> -				   FTRACE_OPS_FL_PID |
> -				   FTRACE_OPS_GRAPH_STUB,
> +void fgraph_init_ops(struct ftrace_ops *dst_ops,
> +		     struct ftrace_ops *src_ops)
> +{
> +	dst_ops->func = ftrace_graph_func;
> +	dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
> +
>  #ifdef FTRACE_GRAPH_TRAMP_ADDR
> -	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
> +	dst_ops->trampoline = FTRACE_GRAPH_TRAMP_ADDR;
>  	/* trampoline_size is only needed for dynamically allocated tramps */
>  #endif
> -	ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
> -};
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE
> +	if (src_ops) {
> +		dst_ops->func_hash = &src_ops->local_hash;
> +		mutex_init(&dst_ops->local_hash.regex_lock);
> +		dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
> +	}
> +#endif
> +}
>  
>  void ftrace_graph_sleep_time_control(bool enable)
>  {
> @@ -869,11 +914,20 @@ static int start_graph_tracing(void)
>  
>  int register_ftrace_graph(struct fgraph_ops *gops)
>  {
> +	int command = 0;
>  	int ret = 0;
>  	int i;
>  
>  	mutex_lock(&ftrace_lock);
>  
> +	if (!gops->ops.func) {
> +		gops->ops.flags |= FTRACE_OPS_GRAPH_STUB;
> +		gops->ops.func = ftrace_graph_func;
> +#ifdef FTRACE_GRAPH_TRAMP_ADDR
> +		gops->ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
> +#endif
> +	}
> +
>  	if (!fgraph_array[0]) {
>  		/* The array must always have real data on it */
>  		for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
> @@ -893,6 +947,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
>  	fgraph_array[i] = gops;
>  	if (i + 1 > fgraph_array_cnt)
>  		fgraph_array_cnt = i + 1;
> +	gops->idx = i;
>  
>  	ftrace_graph_active++;
>  
> @@ -909,9 +964,10 @@ int register_ftrace_graph(struct fgraph_ops *gops)
>  		 */
>  		ftrace_graph_return = return_run;
>  		ftrace_graph_entry = entry_run;
> -
> -		ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
> +		command = FTRACE_START_FUNC_RET;
>  	}
> +
> +	ret = ftrace_startup(&gops->ops, command);
>  out:
>  	mutex_unlock(&ftrace_lock);
>  	return ret;
> @@ -919,6 +975,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
>  
>  void unregister_ftrace_graph(struct fgraph_ops *gops)
>  {
> +	int command = 0;
>  	int i;
>  
>  	mutex_lock(&ftrace_lock);
> @@ -926,25 +983,29 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
>  	if (unlikely(!ftrace_graph_active))
>  		goto out;
>  
> -	for (i = 0; i < fgraph_array_cnt; i++)
> -		if (gops == fgraph_array[i])
> -			break;
> -	if (i >= fgraph_array_cnt)
> +	if (unlikely(gops->idx < 0 || gops->idx >= fgraph_array_cnt))
>  		goto out;
>  
> -	fgraph_array[i] = &fgraph_stub;
> -	if (i + 1 == fgraph_array_cnt) {
> -		for (; i >= 0; i--)
> -			if (fgraph_array[i] != &fgraph_stub)
> -				break;
> +	WARN_ON_ONCE(fgraph_array[gops->idx] != gops);
> +
> +	fgraph_array[gops->idx] = &fgraph_stub;
> +	if (gops->idx + 1 == fgraph_array_cnt) {
> +		i = gops->idx;
> +		while (i >= 0 && fgraph_array[i] == &fgraph_stub)
> +			i--;
>  		fgraph_array_cnt = i + 1;
>  	}
>  
>  	ftrace_graph_active--;
> +
> +	if (!ftrace_graph_active)
> +		command = FTRACE_STOP_FUNC_RET;
> +
> +	ftrace_shutdown(&gops->ops, command);
> +
>  	if (!ftrace_graph_active) {
>  		ftrace_graph_return = ftrace_stub_graph;
>  		ftrace_graph_entry = ftrace_graph_entry_stub;
> -		ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
>  		unregister_pm_notifier(&ftrace_suspend_notifier);
>  		unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
>  	}
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 83fbfb7b48f8..c4cc2a9d0047 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -3050,6 +3050,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
>  	if (unlikely(ftrace_disabled))
>  		return -ENODEV;
>  
> +	ftrace_ops_init(ops);
> +
>  	ret = __register_ftrace_function(ops);
>  	if (ret)
>  		return ret;
> @@ -7319,7 +7321,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
>  	tr->ops = &global_ops;
>  	tr->ops->private = tr;
>  	ftrace_init_trace_array(tr);
> -	init_array_fgraph_ops(tr);
> +	init_array_fgraph_ops(tr, tr->ops);
>  }
>  
>  void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
> @@ -8051,7 +8053,7 @@ static int register_ftrace_function_nolock(struct ftrace_ops *ops)
>   */
>  int register_ftrace_function(struct ftrace_ops *ops)
>  {
> -	int ret;
> +	int ret = -1;
>  
>  	lock_direct_mutex();
>  	ret = prepare_direct_functions_for_ipmodify(ops);
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 16948c0ed00a..02edfdb68933 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -890,8 +890,8 @@ extern int __trace_graph_entry(struct trace_array *tr,
>  extern void __trace_graph_return(struct trace_array *tr,
>  				 struct ftrace_graph_ret *trace,
>  				 unsigned int trace_ctx);
> -extern void init_array_fgraph_ops(struct trace_array *tr);
> -extern int allocate_fgraph_ops(struct trace_array *tr);
> +extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
> +extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
>  extern void free_fgraph_ops(struct trace_array *tr);
>  
>  #ifdef CONFIG_DYNAMIC_FTRACE
> @@ -974,6 +974,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
>  	preempt_enable_notrace();
>  	return ret;
>  }
> +
>  #else
>  static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
>  {
> @@ -999,18 +1000,19 @@ static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
>  		(fgraph_max_depth && trace->depth >= fgraph_max_depth);
>  }
>  
> +void fgraph_init_ops(struct ftrace_ops *dst_ops,
> +		     struct ftrace_ops *src_ops);
> +
>  #else /* CONFIG_FUNCTION_GRAPH_TRACER */
>  static inline enum print_line_t
>  print_graph_function_flags(struct trace_iterator *iter, u32 flags)
>  {
>  	return TRACE_TYPE_UNHANDLED;
>  }
> -static inline void init_array_fgraph_ops(struct trace_array *tr) { }
> -static inline int allocate_fgraph_ops(struct trace_array *tr)
> -{
> -	return 0;
> -}
>  static inline void free_fgraph_ops(struct trace_array *tr) { }
> +/* ftrace_ops may not be defined */
> +#define init_array_fgraph_ops(tr, ops) do { } while (0)
> +#define allocate_fgraph_ops(tr, ops) ({ 0; })
>  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
>  
>  extern struct list_head ftrace_pids;
> diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
> index 8e8da0d0ee52..13bf2415245d 100644
> --- a/kernel/trace/trace_functions.c
> +++ b/kernel/trace/trace_functions.c
> @@ -91,7 +91,7 @@ int ftrace_create_function_files(struct trace_array *tr,
>  	if (!tr->ops)
>  		return -EINVAL;
>  
> -	ret = allocate_fgraph_ops(tr);
> +	ret = allocate_fgraph_ops(tr, tr->ops);
>  	if (ret) {
>  		kfree(tr->ops);
>  		return ret;
> diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
> index 9ccc904a7703..7f30652f0e97 100644
> --- a/kernel/trace/trace_functions_graph.c
> +++ b/kernel/trace/trace_functions_graph.c
> @@ -288,7 +288,7 @@ static struct fgraph_ops funcgraph_ops = {
>  	.retfunc = &trace_graph_return,
>  };
>  
> -int allocate_fgraph_ops(struct trace_array *tr)
> +int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
>  {
>  	struct fgraph_ops *gops;
>  
> @@ -301,6 +301,9 @@ int allocate_fgraph_ops(struct trace_array *tr)
>  
>  	tr->gops = gops;
>  	gops->private = tr;
> +
> +	fgraph_init_ops(&gops->ops, ops);
> +
>  	return 0;
>  }
>  
> @@ -309,10 +312,11 @@ void free_fgraph_ops(struct trace_array *tr)
>  	kfree(tr->gops);
>  }
>  
> -__init void init_array_fgraph_ops(struct trace_array *tr)
> +__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
>  {
>  	tr->gops = &funcgraph_ops;
>  	funcgraph_ops.private = tr;
> +	fgraph_init_ops(&tr->gops->ops, ops);
>  }
>  
>  static int graph_trace_init(struct trace_array *tr)
>
Masami Hiramatsu (Google) Jan. 8, 2024, 1:14 a.m. UTC | #3
On Fri, 5 Jan 2024 17:09:10 +0000
Mark Rutland <mark.rutland@arm.com> wrote:

> On Mon, Dec 18, 2023 at 10:13:46PM +0900, Masami Hiramatsu (Google) wrote:
> > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > 
> > Allow for instances to have their own ftrace_ops part of the fgraph_ops
> > that makes the funtion_graph tracer filter on the set_ftrace_filter file
> > of the instance and not the top instance.
> > 
> > This also change how the function_graph handles multiple instances on the
> > shadow stack. Previously we use ARRAY type entries to record which one
> > is enabled, and this makes it a bitmap of the fgraph_array's indexes.
> > Previous function_graph_enter() expects calling back from
> > prepare_ftrace_return() function which is called back only once if it is
> > enabled. But this introduces different ftrace_ops for each fgraph
> > instance and those are called from ftrace_graph_func() one by one. Thus
> > we can not loop on the fgraph_array(), and need to reuse the ret_stack
> > pushed by the previous instance. Finding the ret_stack is easy because
> > we can check the ret_stack->func. But that is not enough for the self-
> > recursive tail-call case. Thus fgraph uses the bitmap entry to find it
> > is already set (this means that entry is for previous tail call).
> > 
> > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> As a heads-up, while testing the topic/fprobe-on-fgraph branch on arm64, I get
> a warning which bisets down to this commit:

Hmm, so does this happen when enabling the function_graph tracer?

> 
> | Testing tracer function_graph: 
> | ------------[ cut here ]------------
> | WARNING: CPU: 2 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x3c0/0x3d8
> | Modules linked in:
> | CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc2-00026-gea1e68a341c2 #12
> | Hardware name: linux,dummy-virt (DT)
> | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> | pc : arch_stack_walk+0x3c0/0x3d8
> | lr : arch_stack_walk+0x260/0x3d8
> | sp : ffff80008318be00
> | x29: ffff80008318be00 x28: ffff000003c0ae80 x27: 0000000000000000
> | x26: 0000000000000000 x25: ffff000003c0ae80 x24: 0000000000000000
> | x23: ffff8000800234c8 x22: ffff80008002dc30 x21: ffff800080035d10
> | x20: ffff80008318bee8 x19: ffff800080023460 x18: ffff800083453c68
> | x17: 0000000000000000 x16: ffff800083188000 x15: 000000008ccc5058
> | x14: 0000000000000004 x13: ffff800082b8c4f0 x12: 0000000000000000
> | x11: ffff800081fba9b0 x10: ffff80008318bff0 x9 : ffff800080010798
> | x8 : ffff80008002dc30 x7 : ffff000003c0ae80 x6 : 00000000ffffffff
> | x5 : 0000000000000000 x4 : ffff8000832a3c18 x3 : ffff80008318bff0
> | x2 : ffff80008002dc30 x1 : ffff80008002dc30 x0 : ffff80008002dc30
> | Call trace:
> |  arch_stack_walk+0x3c0/0x3d8
> |  return_address+0x40/0x80
> |  trace_hardirqs_on+0x8c/0x198
> |  __do_softirq+0xe8/0x440
> | ---[ end trace 0000000000000000 ]---
> 
> That's a warning in arm64's unwind_recover_return_address() function, which
> fires when ftrace_graph_ret_addr() finds return_to_handler:
> 
> 	if (state->task->ret_stack &&
> 	    (state->pc == (unsigned long)return_to_handler)) {
> 		unsigned long orig_pc;
> 		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
> 						(void *)state->fp);
> 		if (WARN_ON_ONCE(state->pc == orig_pc))
> 			return -EINVAL;
> 		state->pc = orig_pc;
> 	}
> 
> The rationale there is that since tail calls are (currently) disabled on arm64,
> the only reason for ftrace_graph_ret_addr() to return return_to_handler is when
> it fails to find the original return address.

Yes. What about an FP check?
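
(For illustration, a hypothetical fp-based variant of the lookup done in
ftrace_graph_ret_addr(); it assumes HAVE_FUNCTION_GRAPH_FP_TEST so that
ret_stack->fp is recorded at entry, it changes the interface to take the
unwinder's frame pointer, and it is not part of this series:)

	static unsigned long ftrace_graph_ret_addr_fp(struct task_struct *task,
						      unsigned long ret,
						      unsigned long fp)
	{
		struct ftrace_ret_stack *ret_stack;
		int i = task->curr_ret_stack;

		if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
			return ret;

		while (i > 0) {
			ret_stack = get_ret_stack(task, i, &i);
			if (!ret_stack)
				break;
			/* Match on the frame pointer saved at function entry. */
			if (ret_stack->fp == fp &&
			    ret_stack->ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
				return ret_stack->ret;
		}
		return ret;
	}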

> 
> Does this change make it legitimate for ftrace_graph_ret_addr() to return
> return_to_handler in other cases, or is that a bug?

That would be a bug that needs to be fixed.

> 
> Either way, we'll need *some* way to recover the original return addresss...

At the least, we need a way to dump the shadow stack so that we can analyze
what happened.
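
(As an illustration of what such a dump could look like, here is a
hypothetical debug helper that decodes entries with the masks from the
quoted fgraph.c; it is not part of the series:)

	static void fgraph_dump_shadow_stack(struct task_struct *t)
	{
		int i;

		/*
		 * Raw dump: only the index words carry a meaningful
		 * offset/type/bitmap encoding; the words belonging to a
		 * struct ftrace_ret_stack are simply printed as-is.
		 */
		for (i = t->curr_ret_stack - 1; i >= 0; i--) {
			unsigned long val = t->ret_stack[i];

			pr_info("ret_stack[%3d] = %016lx (offset=%lu type=%lu bitmap=%#lx)\n",
				i, val,
				val & FGRAPH_RET_INDEX_MASK,
				(val >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK,
				(val >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK);
		}
	}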

Thank you!

> 
> Mark.
> 
> > ---
> >  Changes in v4:
> >   - Simplify get_ret_stack() sanity check and use WARN_ON_ONCE() for
> >     obviously wrong value.
> >   - Do not check ret == return_to_handler but always read the previous
> >     ret_stack in ftrace_push_return_trace() to check it is reusable.
> >   - Set the bit 0 of the bitmap entry always in function_graph_enter()
> >     because it uses bit 0 to check re-usability.
> >   - Fix to ensure the ret_stack entry is bitmap type when checking the
> >     bitmap.
> >  Changes in v3:
> >   - Pass current fgraph_ops to the new entry handler
> >    (function_graph_enter_ops) if fgraph use ftrace.
> >   - Add fgraph_ops::idx in this patch.
> >   - Replace the array type with the bitmap type so that it can record
> >     which fgraph is called.
> >   - Fix some helper function to use passed task_struct instead of current.
> >   - Reduce the ret-index size to 1024 words.
> >   - Make the ret-index directly points the ret_stack.
> >   - Fix ftrace_graph_ret_addr() to handle tail-call case correctly.
> >  Changes in v2:
> >   - Use ftrace_graph_func and FTRACE_OPS_GRAPH_STUB instead of
> >     ftrace_stub and FTRACE_OPS_FL_STUB for new ftrace based fgraph.
> > ---
> >  arch/arm64/kernel/ftrace.c           |   19 ++
> >  arch/x86/kernel/ftrace.c             |   19 ++
> >  include/linux/ftrace.h               |    7 +
> >  kernel/trace/fgraph.c                |  369 ++++++++++++++++++++--------------
> >  kernel/trace/ftrace.c                |    6 -
> >  kernel/trace/trace.h                 |   16 +
> >  kernel/trace/trace_functions.c       |    2 
> >  kernel/trace/trace_functions_graph.c |    8 +
> >  8 files changed, 277 insertions(+), 169 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> > index a650f5e11fc5..205937e04ece 100644
> > --- a/arch/arm64/kernel/ftrace.c
> > +++ b/arch/arm64/kernel/ftrace.c
> > @@ -481,7 +481,24 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
> >  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
> >  		       struct ftrace_ops *op, struct ftrace_regs *fregs)
> >  {
> > -	prepare_ftrace_return(ip, &fregs->lr, fregs->fp);
> > +	unsigned long *parent = &fregs->lr;
> > +	struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
> > +	int bit;
> > +
> > +	if (unlikely(ftrace_graph_is_dead()))
> > +		return;
> > +
> > +	if (unlikely(atomic_read(&current->tracing_graph_pause)))
> > +		return;
> > +
> > +	bit = ftrace_test_recursion_trylock(ip, *parent);
> > +	if (bit < 0)
> > +		return;
> > +
> > +	if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
> > +		*parent = (unsigned long)&return_to_handler;
> > +
> > +	ftrace_test_recursion_unlock(bit);
> >  }
> >  #else
> >  /*
> > diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> > index 12df54ff0e81..845e29b4254f 100644
> > --- a/arch/x86/kernel/ftrace.c
> > +++ b/arch/x86/kernel/ftrace.c
> > @@ -657,9 +657,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
> >  		       struct ftrace_ops *op, struct ftrace_regs *fregs)
> >  {
> >  	struct pt_regs *regs = &fregs->regs;
> > -	unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
> > +	unsigned long *parent = (unsigned long *)kernel_stack_pointer(regs);
> > +	struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
> > +	int bit;
> > +
> > +	if (unlikely(ftrace_graph_is_dead()))
> > +		return;
> > +
> > +	if (unlikely(atomic_read(&current->tracing_graph_pause)))
> > +		return;
> >  
> > -	prepare_ftrace_return(ip, (unsigned long *)stack, 0);
> > +	bit = ftrace_test_recursion_trylock(ip, *parent);
> > +	if (bit < 0)
> > +		return;
> > +
> > +	if (!function_graph_enter_ops(*parent, ip, 0, parent, gops))
> > +		*parent = (unsigned long)&return_to_handler;
> > +
> > +	ftrace_test_recursion_unlock(bit);
> >  }
> >  #endif
> >  
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > index 7b08169aa51d..c431a33fe789 100644
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -1070,7 +1070,9 @@ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph
> >  struct fgraph_ops {
> >  	trace_func_graph_ent_t		entryfunc;
> >  	trace_func_graph_ret_t		retfunc;
> > +	struct ftrace_ops		ops; /* for the hash lists */
> >  	void				*private;
> > +	int				idx;
> >  };
> >  
> >  /*
> > @@ -1104,6 +1106,11 @@ extern int
> >  function_graph_enter(unsigned long ret, unsigned long func,
> >  		     unsigned long frame_pointer, unsigned long *retp);
> >  
> > +extern int
> > +function_graph_enter_ops(unsigned long ret, unsigned long func,
> > +			 unsigned long frame_pointer, unsigned long *retp,
> > +			 struct fgraph_ops *gops);
> > +
> >  struct ftrace_ret_stack *
> >  ftrace_graph_get_ret_stack(struct task_struct *task, int idx);
> >  
> > diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
> > index 62c35d6d95f9..6f537ebd3ed7 100644
> > --- a/kernel/trace/fgraph.c
> > +++ b/kernel/trace/fgraph.c
> > @@ -7,6 +7,7 @@
> >   *
> >   * Highly modified by Steven Rostedt (VMware).
> >   */
> > +#include <linux/bits.h>
> >  #include <linux/jump_label.h>
> >  #include <linux/suspend.h>
> >  #include <linux/ftrace.h>
> > @@ -17,22 +18,15 @@
> >  #include "ftrace_internal.h"
> >  #include "trace.h"
> >  
> > -#ifdef CONFIG_DYNAMIC_FTRACE
> > -#define ASSIGN_OPS_HASH(opsname, val) \
> > -	.func_hash		= val, \
> > -	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
> > -#else
> > -#define ASSIGN_OPS_HASH(opsname, val)
> > -#endif
> > -
> >  #define FGRAPH_RET_SIZE sizeof(struct ftrace_ret_stack)
> >  #define FGRAPH_RET_INDEX (FGRAPH_RET_SIZE / sizeof(long))
> >  
> >  /*
> >   * On entry to a function (via function_graph_enter()), a new ftrace_ret_stack
> > - * is allocated on the task's ret_stack, then each fgraph_ops on the
> > - * fgraph_array[]'s entryfunc is called and if that returns non-zero, the
> > - * index into the fgraph_array[] for that fgraph_ops is added to the ret_stack.
> > + * is allocated on the task's ret_stack with indexes entry, then each
> > + * fgraph_ops on the fgraph_array[]'s entryfunc is called and if that returns
> > + * non-zero, the index into the fgraph_array[] for that fgraph_ops is recorded
> > + * on the indexes entry as a bit flag.
> >   * As the associated ftrace_ret_stack saved for those fgraph_ops needs to
> >   * be found, the index to it is also added to the ret_stack along with the
> >   * index of the fgraph_array[] to each fgraph_ops that needs their retfunc
> > @@ -42,61 +36,59 @@
> >   * to the last ftrace_ret_stack saved. All references to the
> >   * ftrace_ret_stack has the format of:
> >   *
> > - * bits:  0 - 13	Index in words from the previous ftrace_ret_stack
> > - * bits: 14 - 15	Type of storage
> > + * bits:  0 -  9	offset in words from the previous ftrace_ret_stack
> > + *			(bitmap type should have FGRAPH_RET_INDEX always)
> > + * bits: 10 - 11	Type of storage
> >   *			  0 - reserved
> > - *			  1 - fgraph_array index
> > - * For fgraph_array_index:
> > - *  bits: 16 - 23	The fgraph_ops fgraph_array index
> > + *			  1 - bitmap of fgraph_array index
> > + *
> > + * For bitmap of fgraph_array index
> > + *  bits: 12 - 27	The bitmap of fgraph_ops fgraph_array index
> >   *
> >   * That is, at the end of function_graph_enter, if the first and forth
> >   * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
> >   * on the return of the function being traced, this is what will be on the
> >   * task's shadow ret_stack: (the stack grows upward)
> >   *
> > - * |                                  | <- task->curr_ret_stack
> > - * +----------------------------------+
> > - * | (3 << FGRAPH_ARRAY_SHIFT)|(2)    | ( 3 for index of fourth fgraph_ops)
> > - * +----------------------------------+
> > - * | (0 << FGRAPH_ARRAY_SHIFT)|(1)    | ( 0 for index of first fgraph_ops)
> > - * +----------------------------------+
> > - * | struct ftrace_ret_stack          |
> > - * |   (stores the saved ret pointer) |
> > - * +----------------------------------+
> > - * |             (X) | (N)            | ( N words away from previous ret_stack)
> > - * |                                  |
> > + * |                                            | <- task->curr_ret_stack
> > + * +--------------------------------------------+
> > + * | bitmap_type(bitmap:(BIT(3)|BIT(0)),        |
> > + * |             offset:FGRAPH_RET_INDEX)       | <- the offset is from here
> > + * +--------------------------------------------+
> > + * | struct ftrace_ret_stack                    |
> > + * |   (stores the saved ret pointer)           | <- the offset points here
> > + * +--------------------------------------------+
> > + * |                 (X) | (N)                  | ( N words away from
> > + * |                                            |   previous ret_stack)
> >   *
> >   * If a backtrace is required, and the real return pointer needs to be
> >   * fetched, then it looks at the task's curr_ret_stack index, if it
> > - * is greater than zero, it would subtact one, and then mask the value
> > - * on the ret_stack by FGRAPH_RET_INDEX_MASK and subtract FGRAPH_RET_INDEX
> > - * from that, to get the index of the ftrace_ret_stack structure stored
> > - * on the shadow stack.
> > + * is greater than zero (reserved, or right before popped), it would mask
> > + * the value by FGRAPH_RET_INDEX_MASK to get the offset index of the
> > + * ftrace_ret_stack structure stored on the shadow stack.
> >   */
> >  
> > -#define FGRAPH_RET_INDEX_SIZE	14
> > -#define FGRAPH_RET_INDEX_MASK	((1 << FGRAPH_RET_INDEX_SIZE) - 1)
> > -
> > +#define FGRAPH_RET_INDEX_SIZE	10
> > +#define FGRAPH_RET_INDEX_MASK	GENMASK(FGRAPH_RET_INDEX_SIZE - 1, 0)
> >  
> >  #define FGRAPH_TYPE_SIZE	2
> > -#define FGRAPH_TYPE_MASK	((1 << FGRAPH_TYPE_SIZE) - 1)
> > +#define FGRAPH_TYPE_MASK	GENMASK(FGRAPH_TYPE_SIZE - 1, 0)
> >  #define FGRAPH_TYPE_SHIFT	FGRAPH_RET_INDEX_SIZE
> >  
> >  enum {
> >  	FGRAPH_TYPE_RESERVED	= 0,
> > -	FGRAPH_TYPE_ARRAY	= 1,
> > +	FGRAPH_TYPE_BITMAP	= 1,
> >  };
> >  
> > -#define FGRAPH_ARRAY_SIZE	16
> > -#define FGRAPH_ARRAY_MASK	((1 << FGRAPH_ARRAY_SIZE) - 1)
> > -#define FGRAPH_ARRAY_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)
> > +#define FGRAPH_INDEX_SIZE	16
> > +#define FGRAPH_INDEX_MASK	GENMASK(FGRAPH_INDEX_SIZE - 1, 0)
> > +#define FGRAPH_INDEX_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)
> >  
> >  /* Currently the max stack index can't be more than register callers */
> > -#define FGRAPH_MAX_INDEX	FGRAPH_ARRAY_SIZE
> > +#define FGRAPH_MAX_INDEX	(FGRAPH_INDEX_SIZE + FGRAPH_RET_INDEX)
> > +
> > +#define FGRAPH_ARRAY_SIZE	FGRAPH_INDEX_SIZE
> >  
> > -#define FGRAPH_FRAME_SIZE (FGRAPH_RET_SIZE + FGRAPH_ARRAY_SIZE * (sizeof(long)))
> > -#define FGRAPH_FRAME_INDEX (ALIGN(FGRAPH_FRAME_SIZE,		\
> > -				  sizeof(long)) / sizeof(long))
> >  #define SHADOW_STACK_SIZE (PAGE_SIZE)
> >  #define SHADOW_STACK_INDEX (SHADOW_STACK_SIZE / sizeof(long))
> >  /* Leave on a buffer at the end */
> > @@ -113,19 +105,36 @@ static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
> >  
> >  static inline int get_ret_stack_index(struct task_struct *t, int offset)
> >  {
> > -	return current->ret_stack[offset] & FGRAPH_RET_INDEX_MASK;
> > +	return t->ret_stack[offset] & FGRAPH_RET_INDEX_MASK;
> >  }
> >  
> >  static inline int get_fgraph_type(struct task_struct *t, int offset)
> >  {
> > -	return (current->ret_stack[offset] >> FGRAPH_TYPE_SHIFT) &
> > -		FGRAPH_TYPE_MASK;
> > +	return (t->ret_stack[offset] >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
> > +}
> > +
> > +static inline unsigned long
> > +get_fgraph_index_bitmap(struct task_struct *t, int offset)
> > +{
> > +	return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
> >  }
> >  
> > -static inline int get_fgraph_array(struct task_struct *t, int offset)
> > +static inline void
> > +set_fgraph_index_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
> >  {
> > -	return (current->ret_stack[offset] >> FGRAPH_ARRAY_SHIFT) &
> > -		FGRAPH_ARRAY_MASK;
> > +	t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
> > +		(FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
> > +}
> > +
> > +static inline bool is_fgraph_index_set(struct task_struct *t, int offset, int idx)
> > +{
> > +	return !!(get_fgraph_index_bitmap(t, offset) & BIT(idx));
> > +}
> > +
> > +static inline void
> > +add_fgraph_index_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
> > +{
> > +	t->ret_stack[offset] |= (bitmap << FGRAPH_INDEX_SHIFT);
> >  }
> >  
> >  /* ftrace_graph_entry set to this to tell some archs to run function graph */
> > @@ -160,17 +169,14 @@ get_ret_stack(struct task_struct *t, int offset, int *index)
> >  
> >  	BUILD_BUG_ON(FGRAPH_RET_SIZE % sizeof(long));
> >  
> > -	if (offset <= 0)
> > +	if (unlikely(offset <= 0))
> >  		return NULL;
> >  
> > -	idx = get_ret_stack_index(t, offset - 1);
> > -
> > -	if (idx <= 0 || idx > FGRAPH_MAX_INDEX)
> > +	idx = get_ret_stack_index(t, --offset);
> > +	if (WARN_ON_ONCE(idx <= 0 || idx > offset))
> >  		return NULL;
> >  
> > -	offset -= idx + FGRAPH_RET_INDEX;
> > -	if (offset < 0)
> > -		return NULL;
> > +	offset -= idx;
> >  
> >  	*index = offset;
> >  	return RET_STACK(t, offset);
> > @@ -231,10 +237,12 @@ void ftrace_graph_stop(void)
> >  /* Add a function return address to the trace stack on thread info.*/
> >  static int
> >  ftrace_push_return_trace(unsigned long ret, unsigned long func,
> > -			 unsigned long frame_pointer, unsigned long *retp)
> > +			 unsigned long frame_pointer, unsigned long *retp,
> > +			 int fgraph_idx)
> >  {
> >  	struct ftrace_ret_stack *ret_stack;
> >  	unsigned long long calltime;
> > +	unsigned long val;
> >  	int index;
> >  
> >  	if (unlikely(ftrace_graph_is_dead()))
> > @@ -243,6 +251,21 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
> >  	if (!current->ret_stack)
> >  		return -EBUSY;
> >  
> > +	/*
> > +	 * At first, check whether the previous fgraph callback is pushed by
> > +	 * the fgraph on the same function entry.
> > +	 * But if @func is the self tail-call function, we also need to ensure
> > +	 * the ret_stack is not for the previous call by checking whether the
> > +	 * bit of @fgraph_idx is set or not.
> > +	 */
> > +	ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
> > +	if (ret_stack && ret_stack->func == func &&
> > +	    get_fgraph_type(current, index + FGRAPH_RET_INDEX) == FGRAPH_TYPE_BITMAP &&
> > +	    !is_fgraph_index_set(current, index + FGRAPH_RET_INDEX, fgraph_idx))
> > +		return index + FGRAPH_RET_INDEX;
> > +
> > +	val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
> > +
> >  	BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
> >  
> >  	/*
> > @@ -252,17 +275,19 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
> >  	smp_rmb();
> >  
> >  	/* The return trace stack is full */
> > -	if (current->curr_ret_stack >= SHADOW_STACK_MAX_INDEX) {
> > +	if (current->curr_ret_stack + FGRAPH_RET_INDEX >= SHADOW_STACK_MAX_INDEX) {
> >  		atomic_inc(&current->trace_overrun);
> >  		return -EBUSY;
> >  	}
> >  
> >  	calltime = trace_clock_local();
> >  
> > -	index = current->curr_ret_stack;
> > -	/* ret offset = 1 ; type = reserved */
> > -	current->ret_stack[index + FGRAPH_RET_INDEX] = 1;
> > +	index = READ_ONCE(current->curr_ret_stack);
> >  	ret_stack = RET_STACK(current, index);
> > +	index += FGRAPH_RET_INDEX;
> > +
> > +	/* ret offset = FGRAPH_RET_INDEX ; type = reserved */
> > +	current->ret_stack[index] = val;
> >  	ret_stack->ret = ret;
> >  	/*
> >  	 * The unwinders expect curr_ret_stack to point to either zero
> > @@ -278,7 +303,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
> >  	 * at least a correct index!
> >  	 */
> >  	barrier();
> > -	current->curr_ret_stack += FGRAPH_RET_INDEX + 1;
> > +	current->curr_ret_stack = index + 1;
> >  	/*
> >  	 * This next barrier is to ensure that an interrupt coming in
> >  	 * will not corrupt what we are about to write.
> > @@ -286,7 +311,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
> >  	barrier();
> >  
> >  	/* Still keep it reserved even if an interrupt came in */
> > -	current->ret_stack[index + FGRAPH_RET_INDEX] = 1;
> > +	current->ret_stack[index] = val;
> >  
> >  	ret_stack->ret = ret;
> >  	ret_stack->func = func;
> > @@ -297,7 +322,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
> >  #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
> >  	ret_stack->retp = retp;
> >  #endif
> > -	return 0;
> > +	return index;
> >  }
> >  
> >  /*
> > @@ -314,15 +339,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
> >  # define MCOUNT_INSN_SIZE 0
> >  #endif
> >  
> > +/* If the caller does not use ftrace, call this function. */
> >  int function_graph_enter(unsigned long ret, unsigned long func,
> >  			 unsigned long frame_pointer, unsigned long *retp)
> >  {
> >  	struct ftrace_graph_ent trace;
> > -	int offset;
> > -	int start;
> > -	int type;
> > -	int val;
> > -	int cnt = 0;
> > +	unsigned long bitmap = 0;
> > +	int index;
> >  	int i;
> >  
> >  #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> > @@ -337,69 +360,33 @@ int function_graph_enter(unsigned long ret, unsigned long func,
> >  		return -EBUSY;
> >  #endif
> >  
> > -	if (!ftrace_ops_test(&global_ops, func, NULL))
> > -		return -EBUSY;
> > -
> >  	trace.func = func;
> >  	trace.depth = ++current->curr_ret_depth;
> >  
> > -	if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
> > +	index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
> > +	if (index < 0)
> >  		goto out;
> >  
> > -	/* Use start for the distance to ret_stack (skipping over reserve) */
> > -	start = offset = current->curr_ret_stack - 2;
> > -
> >  	for (i = 0; i < fgraph_array_cnt; i++) {
> >  		struct fgraph_ops *gops = fgraph_array[i];
> >  
> >  		if (gops == &fgraph_stub)
> >  			continue;
> >  
> > -		if ((offset == start) &&
> > -		    (current->curr_ret_stack >= SHADOW_STACK_INDEX - 1)) {
> > -			atomic_inc(&current->trace_overrun);
> > -			break;
> > -		}
> > -		if (fgraph_array[i]->entryfunc(&trace, fgraph_array[i])) {
> > -			offset = current->curr_ret_stack;
> > -			/* Check the top level stored word */
> > -			type = get_fgraph_type(current, offset - 1);
> > -
> > -			val = (i << FGRAPH_ARRAY_SHIFT) |
> > -				(FGRAPH_TYPE_ARRAY << FGRAPH_TYPE_SHIFT) |
> > -				((offset - start) - 1);
> > -
> > -			/* We can reuse the top word if it is reserved */
> > -			if (type == FGRAPH_TYPE_RESERVED) {
> > -				current->ret_stack[offset - 1] = val;
> > -				cnt++;
> > -				continue;
> > -			}
> > -			val++;
> > -
> > -			current->ret_stack[offset] = val;
> > -			/*
> > -			 * Write the value before we increment, so that
> > -			 * if an interrupt comes in after we increment
> > -			 * it will still see the value and skip over
> > -			 * this.
> > -			 */
> > -			barrier();
> > -			current->curr_ret_stack++;
> > -			/*
> > -			 * Have to write again, in case an interrupt
> > -			 * came in before the increment and after we
> > -			 * wrote the value.
> > -			 */
> > -			barrier();
> > -			current->ret_stack[offset] = val;
> > -			cnt++;
> > -		}
> > +		if (ftrace_ops_test(&gops->ops, func, NULL) &&
> > +		    gops->entryfunc(&trace, gops))
> > +			bitmap |= BIT(i);
> >  	}
> >  
> > -	if (!cnt)
> > +	if (!bitmap)
> >  		goto out_ret;
> >  
> > +	/*
> > +	 * Since this function uses fgraph_idx = 0 as a tail-call checking
> > +	 * flag, set that bit always.
> > +	 */
> > +	set_fgraph_index_bitmap(current, index, bitmap | BIT(0));
> > +
> >  	return 0;
> >   out_ret:
> >  	current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
> > @@ -408,15 +395,51 @@ int function_graph_enter(unsigned long ret, unsigned long func,
> >  	return -EBUSY;
> >  }
> >  
> > +/* This is called from ftrace_graph_func() via ftrace */
> > +int function_graph_enter_ops(unsigned long ret, unsigned long func,
> > +			     unsigned long frame_pointer, unsigned long *retp,
> > +			     struct fgraph_ops *gops)
> > +{
> > +	struct ftrace_graph_ent trace;
> > +	int index;
> > +	int type;
> > +
> > +
> > +	/* Use start for the distance to ret_stack (skipping over reserve) */
> > +	index = ftrace_push_return_trace(ret, func, frame_pointer, retp, gops->idx);
> > +	if (index < 0)
> > +		return index;
> > +	type = get_fgraph_type(current, index);
> > +
> > +	/* This is the first ret_stack for this fentry */
> > +	if (type == FGRAPH_TYPE_RESERVED)
> > +		++current->curr_ret_depth;
> > +
> > +	trace.func = func;
> > +	trace.depth = current->curr_ret_depth;
> > +	if (gops->entryfunc(&trace, gops)) {
> > +		if (type == FGRAPH_TYPE_RESERVED)
> > +			set_fgraph_index_bitmap(current, index, BIT(gops->idx));
> > +		else
> > +			add_fgraph_index_bitmap(current, index, BIT(gops->idx));
> > +		return 0;
> > +	}
> > +
> > +	if (type == FGRAPH_TYPE_RESERVED) {
> > +		current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
> > +		current->curr_ret_depth--;
> > +	}
> > +	return -EBUSY;
> > +}
> > +
> >  /* Retrieve a function return address to the trace stack on thread info.*/
> >  static struct ftrace_ret_stack *
> >  ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
> > -			unsigned long frame_pointer)
> > +			unsigned long frame_pointer, int *index)
> >  {
> >  	struct ftrace_ret_stack *ret_stack;
> > -	int index;
> >  
> > -	ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
> > +	ret_stack = get_ret_stack(current, current->curr_ret_stack, index);
> >  
> >  	if (unlikely(!ret_stack)) {
> >  		ftrace_graph_stop();
> > @@ -455,6 +478,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
> >  	}
> >  #endif
> >  
> > +	*index += FGRAPH_RET_INDEX;
> >  	*ret = ret_stack->ret;
> >  	trace->func = ret_stack->func;
> >  	trace->calltime = ret_stack->calltime;
> > @@ -507,13 +531,12 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
> >  {
> >  	struct ftrace_ret_stack *ret_stack;
> >  	struct ftrace_graph_ret trace;
> > +	unsigned long bitmap;
> >  	unsigned long ret;
> > -	int offset;
> >  	int index;
> > -	int idx;
> >  	int i;
> >  
> > -	ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer);
> > +	ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &index);
> >  
> >  	if (unlikely(!ret_stack)) {
> >  		ftrace_graph_stop();
> > @@ -527,16 +550,17 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
> >  	trace.retval = fgraph_ret_regs_return_value(ret_regs);
> >  #endif
> >  
> > -	offset = current->curr_ret_stack - 1;
> > -	index = get_ret_stack_index(current, offset);
> > +	bitmap = get_fgraph_index_bitmap(current, index);
> > +	for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
> > +		struct fgraph_ops *gops = fgraph_array[i];
> >  
> > -	/* index has to be at least one! Optimize for it */
> > -	i = 0;
> > -	do {
> > -		idx = get_fgraph_array(current, offset - i);
> > -		fgraph_array[idx]->retfunc(&trace, fgraph_array[idx]);
> > -		i++;
> > -	} while (i < index);
> > +		if (!(bitmap & BIT(i)))
> > +			continue;
> > +		if (gops == &fgraph_stub)
> > +			continue;
> > +
> > +		gops->retfunc(&trace, gops);
> > +	}
> >  
> >  	/*
> >  	 * The ftrace_graph_return() may still access the current
> > @@ -544,7 +568,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
> >  	 * curr_ret_stack is after that.
> >  	 */
> >  	barrier();
> > -	current->curr_ret_stack -= index + FGRAPH_RET_INDEX;
> > +	current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
> >  	current->curr_ret_depth--;
> >  	return ret;
> >  }
> > @@ -622,7 +646,17 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
> >  		ret_stack = get_ret_stack(current, i, &i);
> >  		if (!ret_stack)
> >  			break;
> > -		if (ret_stack->retp == retp)
> > +		/*
> > +		 * For the tail-call, there would be 2 or more ftrace_ret_stacks on
> > +		 * the ret_stack, which records "return_to_handler" as the return
> > +		 * address except for the last one.
> > +		 * But on the real stack, there should be 1 entry because tail-call
> > +		 * reuses the return address on the stack and jump to the next function.
> > +		 * Thus we will continue to find real return address.
> > +		 */
> > +		if (ret_stack->retp == retp &&
> > +		    ret_stack->ret !=
> > +		    (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
> >  			return ret_stack->ret;
> >  	}
> >  
> > @@ -645,6 +679,9 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
> >  	i = *idx;
> >  	do {
> >  		ret_stack = get_ret_stack(task, task_idx, &task_idx);
> > +		if (ret_stack && ret_stack->ret ==
> > +		    (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
> > +			continue;
> >  		i--;
> >  	} while (i >= 0 && ret_stack);
> >  
> > @@ -655,17 +692,25 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
> >  }
> >  #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
> >  
> > -static struct ftrace_ops graph_ops = {
> > -	.func			= ftrace_graph_func,
> > -	.flags			= FTRACE_OPS_FL_INITIALIZED |
> > -				   FTRACE_OPS_FL_PID |
> > -				   FTRACE_OPS_GRAPH_STUB,
> > +void fgraph_init_ops(struct ftrace_ops *dst_ops,
> > +		     struct ftrace_ops *src_ops)
> > +{
> > +	dst_ops->func = ftrace_graph_func;
> > +	dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
> > +
> >  #ifdef FTRACE_GRAPH_TRAMP_ADDR
> > -	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
> > +	dst_ops->trampoline = FTRACE_GRAPH_TRAMP_ADDR;
> >  	/* trampoline_size is only needed for dynamically allocated tramps */
> >  #endif
> > -	ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
> > -};
> > +
> > +#ifdef CONFIG_DYNAMIC_FTRACE
> > +	if (src_ops) {
> > +		dst_ops->func_hash = &src_ops->local_hash;
> > +		mutex_init(&dst_ops->local_hash.regex_lock);
> > +		dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
> > +	}
> > +#endif
> > +}
> >  
> >  void ftrace_graph_sleep_time_control(bool enable)
> >  {
> > @@ -869,11 +914,20 @@ static int start_graph_tracing(void)
> >  
> >  int register_ftrace_graph(struct fgraph_ops *gops)
> >  {
> > +	int command = 0;
> >  	int ret = 0;
> >  	int i;
> >  
> >  	mutex_lock(&ftrace_lock);
> >  
> > +	if (!gops->ops.func) {
> > +		gops->ops.flags |= FTRACE_OPS_GRAPH_STUB;
> > +		gops->ops.func = ftrace_graph_func;
> > +#ifdef FTRACE_GRAPH_TRAMP_ADDR
> > +		gops->ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
> > +#endif
> > +	}
> > +
> >  	if (!fgraph_array[0]) {
> >  		/* The array must always have real data on it */
> >  		for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
> > @@ -893,6 +947,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
> >  	fgraph_array[i] = gops;
> >  	if (i + 1 > fgraph_array_cnt)
> >  		fgraph_array_cnt = i + 1;
> > +	gops->idx = i;
> >  
> >  	ftrace_graph_active++;
> >  
> > @@ -909,9 +964,10 @@ int register_ftrace_graph(struct fgraph_ops *gops)
> >  		 */
> >  		ftrace_graph_return = return_run;
> >  		ftrace_graph_entry = entry_run;
> > -
> > -		ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
> > +		command = FTRACE_START_FUNC_RET;
> >  	}
> > +
> > +	ret = ftrace_startup(&gops->ops, command);
> >  out:
> >  	mutex_unlock(&ftrace_lock);
> >  	return ret;
> > @@ -919,6 +975,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
> >  
> >  void unregister_ftrace_graph(struct fgraph_ops *gops)
> >  {
> > +	int command = 0;
> >  	int i;
> >  
> >  	mutex_lock(&ftrace_lock);
> > @@ -926,25 +983,29 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
> >  	if (unlikely(!ftrace_graph_active))
> >  		goto out;
> >  
> > -	for (i = 0; i < fgraph_array_cnt; i++)
> > -		if (gops == fgraph_array[i])
> > -			break;
> > -	if (i >= fgraph_array_cnt)
> > +	if (unlikely(gops->idx < 0 || gops->idx >= fgraph_array_cnt))
> >  		goto out;
> >  
> > -	fgraph_array[i] = &fgraph_stub;
> > -	if (i + 1 == fgraph_array_cnt) {
> > -		for (; i >= 0; i--)
> > -			if (fgraph_array[i] != &fgraph_stub)
> > -				break;
> > +	WARN_ON_ONCE(fgraph_array[gops->idx] != gops);
> > +
> > +	fgraph_array[gops->idx] = &fgraph_stub;
> > +	if (gops->idx + 1 == fgraph_array_cnt) {
> > +		i = gops->idx;
> > +		while (i >= 0 && fgraph_array[i] == &fgraph_stub)
> > +			i--;
> >  		fgraph_array_cnt = i + 1;
> >  	}
> >  
> >  	ftrace_graph_active--;
> > +
> > +	if (!ftrace_graph_active)
> > +		command = FTRACE_STOP_FUNC_RET;
> > +
> > +	ftrace_shutdown(&gops->ops, command);
> > +
> >  	if (!ftrace_graph_active) {
> >  		ftrace_graph_return = ftrace_stub_graph;
> >  		ftrace_graph_entry = ftrace_graph_entry_stub;
> > -		ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
> >  		unregister_pm_notifier(&ftrace_suspend_notifier);
> >  		unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
> >  	}
> > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> > index 83fbfb7b48f8..c4cc2a9d0047 100644
> > --- a/kernel/trace/ftrace.c
> > +++ b/kernel/trace/ftrace.c
> > @@ -3050,6 +3050,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
> >  	if (unlikely(ftrace_disabled))
> >  		return -ENODEV;
> >  
> > +	ftrace_ops_init(ops);
> > +
> >  	ret = __register_ftrace_function(ops);
> >  	if (ret)
> >  		return ret;
> > @@ -7319,7 +7321,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr)
> >  	tr->ops = &global_ops;
> >  	tr->ops->private = tr;
> >  	ftrace_init_trace_array(tr);
> > -	init_array_fgraph_ops(tr);
> > +	init_array_fgraph_ops(tr, tr->ops);
> >  }
> >  
> >  void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
> > @@ -8051,7 +8053,7 @@ static int register_ftrace_function_nolock(struct ftrace_ops *ops)
> >   */
> >  int register_ftrace_function(struct ftrace_ops *ops)
> >  {
> > -	int ret;
> > +	int ret = -1;
> >  
> >  	lock_direct_mutex();
> >  	ret = prepare_direct_functions_for_ipmodify(ops);
> > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> > index 16948c0ed00a..02edfdb68933 100644
> > --- a/kernel/trace/trace.h
> > +++ b/kernel/trace/trace.h
> > @@ -890,8 +890,8 @@ extern int __trace_graph_entry(struct trace_array *tr,
> >  extern void __trace_graph_return(struct trace_array *tr,
> >  				 struct ftrace_graph_ret *trace,
> >  				 unsigned int trace_ctx);
> > -extern void init_array_fgraph_ops(struct trace_array *tr);
> > -extern int allocate_fgraph_ops(struct trace_array *tr);
> > +extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
> > +extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
> >  extern void free_fgraph_ops(struct trace_array *tr);
> >  
> >  #ifdef CONFIG_DYNAMIC_FTRACE
> > @@ -974,6 +974,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
> >  	preempt_enable_notrace();
> >  	return ret;
> >  }
> > +
> >  #else
> >  static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
> >  {
> > @@ -999,18 +1000,19 @@ static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
> >  		(fgraph_max_depth && trace->depth >= fgraph_max_depth);
> >  }
> >  
> > +void fgraph_init_ops(struct ftrace_ops *dst_ops,
> > +		     struct ftrace_ops *src_ops);
> > +
> >  #else /* CONFIG_FUNCTION_GRAPH_TRACER */
> >  static inline enum print_line_t
> >  print_graph_function_flags(struct trace_iterator *iter, u32 flags)
> >  {
> >  	return TRACE_TYPE_UNHANDLED;
> >  }
> > -static inline void init_array_fgraph_ops(struct trace_array *tr) { }
> > -static inline int allocate_fgraph_ops(struct trace_array *tr)
> > -{
> > -	return 0;
> > -}
> >  static inline void free_fgraph_ops(struct trace_array *tr) { }
> > +/* ftrace_ops may not be defined */
> > +#define init_array_fgraph_ops(tr, ops) do { } while (0)
> > +#define allocate_fgraph_ops(tr, ops) ({ 0; })
> >  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
> >  
> >  extern struct list_head ftrace_pids;
> > diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
> > index 8e8da0d0ee52..13bf2415245d 100644
> > --- a/kernel/trace/trace_functions.c
> > +++ b/kernel/trace/trace_functions.c
> > @@ -91,7 +91,7 @@ int ftrace_create_function_files(struct trace_array *tr,
> >  	if (!tr->ops)
> >  		return -EINVAL;
> >  
> > -	ret = allocate_fgraph_ops(tr);
> > +	ret = allocate_fgraph_ops(tr, tr->ops);
> >  	if (ret) {
> >  		kfree(tr->ops);
> >  		return ret;
> > diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
> > index 9ccc904a7703..7f30652f0e97 100644
> > --- a/kernel/trace/trace_functions_graph.c
> > +++ b/kernel/trace/trace_functions_graph.c
> > @@ -288,7 +288,7 @@ static struct fgraph_ops funcgraph_ops = {
> >  	.retfunc = &trace_graph_return,
> >  };
> >  
> > -int allocate_fgraph_ops(struct trace_array *tr)
> > +int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
> >  {
> >  	struct fgraph_ops *gops;
> >  
> > @@ -301,6 +301,9 @@ int allocate_fgraph_ops(struct trace_array *tr)
> >  
> >  	tr->gops = gops;
> >  	gops->private = tr;
> > +
> > +	fgraph_init_ops(&gops->ops, ops);
> > +
> >  	return 0;
> >  }
> >  
> > @@ -309,10 +312,11 @@ void free_fgraph_ops(struct trace_array *tr)
> >  	kfree(tr->gops);
> >  }
> >  
> > -__init void init_array_fgraph_ops(struct trace_array *tr)
> > +__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
> >  {
> >  	tr->gops = &funcgraph_ops;
> >  	funcgraph_ops.private = tr;
> > +	fgraph_init_ops(&tr->gops->ops, ops);
> >  }
> >  
> >  static int graph_trace_init(struct trace_array *tr)
> >
Mark Rutland Jan. 8, 2024, 12:25 p.m. UTC | #4
Hi,

There's a bit more of an info-dump below; I'll go try to dump the fgraph shadow
stack so that we can analyse this in more detail.

On Mon, Jan 08, 2024 at 10:14:36AM +0900, Masami Hiramatsu wrote:
> On Fri, 5 Jan 2024 17:09:10 +0000
> Mark Rutland <mark.rutland@arm.com> wrote:
> 
> > On Mon, Dec 18, 2023 at 10:13:46PM +0900, Masami Hiramatsu (Google) wrote:
> > > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > 
> > > Allow for instances to have their own ftrace_ops part of the fgraph_ops
> > > that makes the function_graph tracer filter on the set_ftrace_filter file
> > > of the instance and not the top instance.
> > > 
> > > This also changes how the function_graph handles multiple instances on the
> > > shadow stack. Previously we use ARRAY type entries to record which one
> > > is enabled, and this makes it a bitmap of the fgraph_array's indexes.
> > > Previous function_graph_enter() expects calling back from
> > > prepare_ftrace_return() function which is called back only once if it is
> > > enabled. But this introduces different ftrace_ops for each fgraph
> > > instance and those are called from ftrace_graph_func() one by one. Thus
> > > we can not loop on the fgraph_array(), and need to reuse the ret_stack
> > > pushed by the previous instance. Finding the ret_stack is easy because
> > > we can check the ret_stack->func. But that is not enough for the self-
> > > recursive tail-call case. Thus fgraph uses the bitmap entry to find it
> > > is already set (this means that entry is for previous tail call).
> > > 
> > > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > 
> > As a heads-up, while testing the topic/fprobe-on-fgraph branch on arm64, I get
> > a warning which bisets down to this commit:
> 
> Hmm, so does this happen when enabling function graph tracer?

Yes; I see it during the function_graph boot-time self-test if I also enable
CONFIG_IRQSOFF_TRACER=y. I can also trigger it regardless of
CONFIG_IRQSOFF_TRACER if I cat /proc/self/stack with the function_graph tracer
enabled (note that I hacked the unwinder to continue after failing to recover a
return address):

| # mount -t tracefs none /sys/kernel/tracing/
| # echo function_graph > /sys/kernel/tracing/current_tracer
| # cat /proc/self/stack
| [   37.469980] ------------[ cut here ]------------
| [   37.471503] WARNING: CPU: 2 PID: 174 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
| [   37.474381] Modules linked in:
| [   37.475501] CPU: 2 PID: 174 Comm: cat Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #15
| [   37.478133] Hardware name: linux,dummy-virt (DT)
| [   37.479670] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| [   37.481923] pc : arch_stack_walk+0x2d8/0x338
| [   37.483373] lr : arch_stack_walk+0x1bc/0x338
| [   37.484818] sp : ffff8000835f3a90
| [   37.485974] x29: ffff8000835f3a90 x28: ffff8000835f3b80 x27: ffff8000835f3b38
| [   37.488405] x26: ffff000004341e00 x25: ffff8000835f4000 x24: ffff80008002df18
| [   37.490842] x23: ffff80008002df18 x22: ffff8000835f3b60 x21: ffff80008015d240
| [   37.493269] x20: ffff8000835f3b50 x19: ffff8000835f3b40 x18: 0000000000000000
| [   37.495704] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| [   37.498144] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
| [   37.500579] x11: ffff800082b4d920 x10: ffff8000835f3a70 x9 : ffff8000800e55a0
| [   37.503021] x8 : ffff80008002df18 x7 : ffff000004341e00 x6 : 00000000ffffffff
| [   37.505452] x5 : 0000000000000000 x4 : ffff8000835f3e48 x3 : ffff8000835f3b80
| [   37.507888] x2 : ffff80008002df18 x1 : ffff000007f7b000 x0 : ffff80008002df18
| [   37.510319] Call trace:
| [   37.511202]  arch_stack_walk+0x2d8/0x338
| [   37.512541]  stack_trace_save_tsk+0x90/0x110
| [   37.514012]  return_to_handler+0x0/0x48
| [   37.515336]  return_to_handler+0x0/0x48
| [   37.516657]  return_to_handler+0x0/0x48
| [   37.517985]  return_to_handler+0x0/0x48
| [   37.519305]  return_to_handler+0x0/0x48
| [   37.520623]  return_to_handler+0x0/0x48
| [   37.521957]  return_to_handler+0x0/0x48
| [   37.523272]  return_to_handler+0x0/0x48
| [   37.524595]  return_to_handler+0x0/0x48
| [   37.525931]  return_to_handler+0x0/0x48
| [   37.527254]  return_to_handler+0x0/0x48
| [   37.528564]  el0t_64_sync_handler+0x120/0x130
| [   37.530046]  el0t_64_sync+0x190/0x198
| [   37.531310] ---[ end trace 0000000000000000 ]---
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] ftrace_stub_graph+0x8/0x8
| [<0>] el0t_64_sync_handler+0x120/0x130
| [<0>] el0t_64_sync+0x190/0x198

One interesting thing there is that there are two distinct failure modes: the
unwind for the WARNING gives return_to_handler instead of the original return
address, and the unwind returned from /proc/self/stack gives ftrace_stub_graph
rather than the original return address.

> > 
> > | Testing tracer function_graph: 
> > | ------------[ cut here ]------------
> > | WARNING: CPU: 2 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x3c0/0x3d8
> > | Modules linked in:
> > | CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc2-00026-gea1e68a341c2 #12
> > | Hardware name: linux,dummy-virt (DT)
> > | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > | pc : arch_stack_walk+0x3c0/0x3d8
> > | lr : arch_stack_walk+0x260/0x3d8
> > | sp : ffff80008318be00
> > | x29: ffff80008318be00 x28: ffff000003c0ae80 x27: 0000000000000000
> > | x26: 0000000000000000 x25: ffff000003c0ae80 x24: 0000000000000000
> > | x23: ffff8000800234c8 x22: ffff80008002dc30 x21: ffff800080035d10
> > | x20: ffff80008318bee8 x19: ffff800080023460 x18: ffff800083453c68
> > | x17: 0000000000000000 x16: ffff800083188000 x15: 000000008ccc5058
> > | x14: 0000000000000004 x13: ffff800082b8c4f0 x12: 0000000000000000
> > | x11: ffff800081fba9b0 x10: ffff80008318bff0 x9 : ffff800080010798
> > | x8 : ffff80008002dc30 x7 : ffff000003c0ae80 x6 : 00000000ffffffff
> > | x5 : 0000000000000000 x4 : ffff8000832a3c18 x3 : ffff80008318bff0
> > | x2 : ffff80008002dc30 x1 : ffff80008002dc30 x0 : ffff80008002dc30
> > | Call trace:
> > |  arch_stack_walk+0x3c0/0x3d8
> > |  return_address+0x40/0x80
> > |  trace_hardirqs_on+0x8c/0x198
> > |  __do_softirq+0xe8/0x440
> > | ---[ end trace 0000000000000000 ]---

With the same hack to continue after failing to recover a return address, the
failure in the selftest looks like:

| ------------[ cut here ]------------
| WARNING: CPU: 7 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
| Modules linked in:
| CPU: 7 PID: 0 Comm: swapper/7 Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #14
| Hardware name: linux,dummy-virt (DT)
| pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : arch_stack_walk+0x2d8/0x338
| lr : arch_stack_walk+0x1bc/0x338
| sp : ffff8000830c3e20
| x29: ffff8000830c3e20 x28: ffff8000830c3ff0 x27: ffff8000830c3ec8
| x26: ffff0000037e0000 x25: ffff8000830c4000 x24: ffff80008002e080
| x23: ffff80008002e080 x22: ffff8000830c3ee8 x21: ffff800080023418
| x20: ffff8000830c3f50 x19: ffff8000830c3f40 x18: ffffffffffffffff
| x17: 0000000000000000 x16: ffff8000830c0000 x15: 0000000000000000
| x14: 0000000000000002 x13: ffff8000800360f8 x12: ffff800080028330
| x11: ffff800081f4a978 x10: ffff8000830c3ff0 x9 : ffff800080010798
| x8 : ffff80008002e080 x7 : ffff0000037e0000 x6 : 00000000ffffffff
| x5 : 0000000000000000 x4 : ffff8000831dbc18 x3 : ffff8000830c3ff0
| x2 : ffff80008002e080 x1 : ffff0000040a3000 x0 : ffff80008002e080
| Call trace:
|  arch_stack_walk+0x2d8/0x338
|  return_address+0x40/0x80
|  trace_hardirqs_on+0x8c/0x198
|  __do_softirq+0xe8/0x43c
|  return_to_handler+0x0/0x48
|  return_to_handler+0x0/0x48
|  do_softirq_own_stack+0x24/0x38
|  return_to_handler+0x0/0x48
|  el1_interrupt+0x38/0x68
|  el1h_64_irq_handler+0x18/0x28
|  el1h_64_irq+0x64/0x68
|  default_idle_call+0x70/0x178
|  do_idle+0x228/0x290
|  cpu_startup_entry+0x40/0x50
|  secondary_start_kernel+0x138/0x160
|  __secondary_switched+0xb8/0xc0
| ---[ end trace 0000000000000000 ]---

The portion of the trace with:

	__do_softirq+0xe8/0x43c
	return_to_handler+0x0/0x48
	return_to_handler+0x0/0x48
	do_softirq_own_stack+0x24/0x38

... should be something like:

	__do_softirq
	____do_softirq
	call_on_irq_stack	// asm trampoline, not traceable
	do_softirq_own_stack

The generated assembly for do_softirq_own_stack(), ____do_softirq(), and
__do_softirq() is as I'd expect with no tail calls, so I can't see an obvious
reason the return address cannot be recovered correctly.

> > That's a warning in arm64's unwind_recover_return_address() function, which
> > fires when ftrace_graph_ret_addr() finds return_to_handler:
> > 
> > 	if (state->task->ret_stack &&
> > 	    (state->pc == (unsigned long)return_to_handler)) {
> > 		unsigned long orig_pc;
> > 		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
> > 						(void *)state->fp);
> > 		if (WARN_ON_ONCE(state->pc == orig_pc))
> > 			return -EINVAL;
> > 		state->pc = orig_pc;
> > 	}
> > 
> > The rationale there is that since tail calls are (currently) disabled on arm64,
> > the only reason for ftrace_graph_ret_addr() to return return_to_handler is when
> > it fails to find the original return address.
> 
> Yes. what about FP check?

Do you mean HAVE_FUNCTION_GRAPH_FP_TEST?

That is enabled, and there are no warnings from ftrace_pop_return_trace(), so I
believe push/pop is balanced.
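
For reference, that test is roughly the following check in
ftrace_pop_return_trace(), which fires when the frame pointer recorded at
entry doesn't match the one seen at exit (a sketch -- the exact field names
may differ):

	#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
		if (unlikely(ret_stack->fp != frame_pointer)) {
			ftrace_graph_stop();
			WARN(1, "Bad frame pointer: expected %lx, received %lx\n",
			     ret_stack->fp, frame_pointer);
			return NULL;
		}
	#endif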

We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
not on the stack at the point function-entry is intercepted we use the FP as
the retp value -- in the absence of tail calls this will be different between a
caller and callee.
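
For reference, the pre-series arm64 entry path uses the FP like so (a
simplified sketch of arch/arm64/kernel/ftrace.c, not the exact source):

	void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
				   unsigned long frame_pointer)
	{
		unsigned long return_hooker = (unsigned long)&return_to_handler;
		unsigned long old = *parent;

		/* the FP is both the fp-test value and the retp cookie */
		if (!function_graph_enter(old, self_addr, frame_pointer,
					  (void *)frame_pointer))
			*parent = return_hooker;
	}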

> > Does this change make it legitimate for ftrace_graph_ret_addr() to return
> > return_to_handler in other cases, or is that a bug?
> 
> It should be a bug to be fixed.

Cool; thanks for confirming!

> > Either way, we'll need *some* way to recover the original return addresss...
> 
> At least it needs to dump the shadow stack so that we can analyze what
> happened. 

Sounds like a plan; as above I'll have a go at putting that together and will
dump the results here.

Thanks for the help! :)

Mark.
Mark Rutland Jan. 8, 2024, 2:21 p.m. UTC | #5
On Mon, Jan 08, 2024 at 12:25:55PM +0000, Mark Rutland wrote:
> We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
> not on the stack at the point function-entry is intercepted we use the FP as
> the retp value -- in the absence of tail calls this will be different between a
> caller and callee.

Ah; I just spotted that this patch changed that in ftrace_graph_func(), which
is the source of the bug. 

As of this patch, we use the address of fregs->lr as the retp value, but the
unwinder still uses the FP value, and so when unwind_recover_return_address()
calls ftrace_graph_ret_addr(), the retp value won't match the expected entry on
the fgraph ret_stack, resulting in failing to find the expected entry.

Since the ftrace_regs only exist transiently during function entry/exit, it's
possible for a stackframe to reuse that same address on the stack, which would
result in finding a different entry by mistake.
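
Concretely, the two sides no longer agree on the cookie (an illustrative
sketch, pulling the relevant lines together):

	/* entry, as of this patch: the transient fregs slot is recorded */
	ret_stack->retp = &fregs->lr;	/* lives in the trampoline's ftrace_regs */

	/* unwind: arm64 still looks the entry up with the frame pointer */
	orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
					(void *)state->fp);	/* never matches &fregs->lr */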

The diff below restores the existing behaviour and fixes the issue for me.
Could you please fold that into this patch?

On a separate note, looking at how this patch changed arm64's
ftrace_graph_func(), do we need similar changes to arm64's
prepare_ftrace_return() for the old-style mcount based ftrace?

Mark.

---->8----
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 205937e04ece..329092ce06ba 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -495,7 +495,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
        if (bit < 0)
                return;
 
-       if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
+       if (!function_graph_enter_ops(*parent, ip, fregs->fp, (void *)fregs->fp, gops))
                *parent = (unsigned long)&return_to_handler;
 
        ftrace_test_recursion_unlock(bit);
Mark Rutland Jan. 8, 2024, 3:03 p.m. UTC | #6
On Mon, Jan 08, 2024 at 02:21:03PM +0000, Mark Rutland wrote:
> On Mon, Jan 08, 2024 at 12:25:55PM +0000, Mark Rutland wrote:
> > We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
> > not on the stack at the point function-entry is intercepted we use the FP as
> > the retp value -- in the absence of tail calls this will be different between a
> > caller and callee.
> 
> Ah; I just spotted that this patch changed that in ftrace_graph_func(), which
> is the source of the bug. 
> 
> As of this patch, we use the address of fregs->lr as the retp value, but the
> unwinder still uses the FP value, and so when unwind_recover_return_address()
> calls ftrace_graph_ret_addr(), the retp value won't match the expected entry on
> the fgraph ret_stack, resulting in failing to find the expected entry.
> 
> Since the ftrace_regs only exist transiently during function entry/exit, it's
> possible for a stackframe to reuse that same address on the stack, which would
> result in finding a different entry by mistake.
> 
> The diff below restores the existing behaviour and fixes the issue for me.
> Could you please fold that into this patch?
> 
> On a separate note, looking at how this patch changed arm64's
> ftrace_graph_func(), do we need similar changes to arm64's
> prepare_ftrace_return() for the old-style mcount based ftrace?
> 
> Mark.
> 
> ---->8----
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index 205937e04ece..329092ce06ba 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -495,7 +495,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>         if (bit < 0)
>                 return;
>  
> -       if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
> +       if (!function_graph_enter_ops(*parent, ip, fregs->fp, (void *)fregs->fp, gops))
>                 *parent = (unsigned long)&return_to_handler;
>  
>         ftrace_test_recursion_unlock(bit);

Thinking some more, this line gets excessively long when we pass the fregs too,
so it's probably worth adding a local variable for fp, i.e. the diff below.

Mark.

---->8----
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 205937e04ece..d4e142ef4686 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -481,8 +481,9 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
                       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-       unsigned long *parent = &fregs->lr;
        struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+       unsigned long *parent = &fregs->lr;
+       unsigned long fp = fregs->fp;
        int bit;
 
        if (unlikely(ftrace_graph_is_dead()))
@@ -495,7 +496,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
        if (bit < 0)
                return;
 
-       if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
+       if (!function_graph_enter_ops(*parent, ip, fp, (void *)fp, gops))
                *parent = (unsigned long)&return_to_handler;
 
        ftrace_test_recursion_unlock(bit);
Masami Hiramatsu (Google) Jan. 11, 2024, 2:15 a.m. UTC | #7
Hi Mark,

Thanks for the investigation.

On Mon, 8 Jan 2024 12:25:55 +0000
Mark Rutland <mark.rutland@arm.com> wrote:

> Hi,
> 
> There's a bit more of an info-dump below; I'll go try to dump the fgraph shadow
> stack so that we can analyse this in more detail.
> 
> On Mon, Jan 08, 2024 at 10:14:36AM +0900, Masami Hiramatsu wrote:
> > On Fri, 5 Jan 2024 17:09:10 +0000
> > Mark Rutland <mark.rutland@arm.com> wrote:
> > 
> > > On Mon, Dec 18, 2023 at 10:13:46PM +0900, Masami Hiramatsu (Google) wrote:
> > > > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > 
> > > > Allow for instances to have their own ftrace_ops part of the fgraph_ops
> > > > that makes the function_graph tracer filter on the set_ftrace_filter file
> > > > of the instance and not the top instance.
> > > > 
> > > > This also changes how the function_graph handles multiple instances on the
> > > > shadow stack. Previously we use ARRAY type entries to record which one
> > > > is enabled, and this makes it a bitmap of the fgraph_array's indexes.
> > > > Previous function_graph_enter() expects calling back from
> > > > prepare_ftrace_return() function which is called back only once if it is
> > > > enabled. But this introduces different ftrace_ops for each fgraph
> > > > instance and those are called from ftrace_graph_func() one by one. Thus
> > > > we can not loop on the fgraph_array(), and need to reuse the ret_stack
> > > > pushed by the previous instance. Finding the ret_stack is easy because
> > > > we can check the ret_stack->func. But that is not enough for the self-
> > > > recursive tail-call case. Thus fgraph uses the bitmap entry to find it
> > > > is already set (this means that entry is for previous tail call).
> > > > 
> > > > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > > 
> > > As a heads-up, while testing the topic/fprobe-on-fgraph branch on arm64, I get
> > > a warning which bisets down to this commit:
> > 
> > Hmm, so does this happen when enabling function graph tracer?
> 
> Yes; I see it during the function_graph boot-time self-test if I also enable
> CONFIG_IRQSOFF_TRACER=y. I can also trigger it regardless of
> CONFIG_IRQSOFF_TRACER if I cat /proc/self/stack with the function_graph tracer
> enabled (note that I hacked the unwinder to continue after failing to recover a
> return address):
> 
> | # mount -t tracefs none /sys/kernel/tracing/
> | # echo function_graph > /sys/kernel/tracing/current_tracer
> | # cat /proc/self/stack
> | [   37.469980] ------------[ cut here ]------------
> | [   37.471503] WARNING: CPU: 2 PID: 174 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
> | [   37.474381] Modules linked in:
> | [   37.475501] CPU: 2 PID: 174 Comm: cat Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #15
> | [   37.478133] Hardware name: linux,dummy-virt (DT)
> | [   37.479670] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> | [   37.481923] pc : arch_stack_walk+0x2d8/0x338
> | [   37.483373] lr : arch_stack_walk+0x1bc/0x338
> | [   37.484818] sp : ffff8000835f3a90
> | [   37.485974] x29: ffff8000835f3a90 x28: ffff8000835f3b80 x27: ffff8000835f3b38
> | [   37.488405] x26: ffff000004341e00 x25: ffff8000835f4000 x24: ffff80008002df18
> | [   37.490842] x23: ffff80008002df18 x22: ffff8000835f3b60 x21: ffff80008015d240
> | [   37.493269] x20: ffff8000835f3b50 x19: ffff8000835f3b40 x18: 0000000000000000
> | [   37.495704] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> | [   37.498144] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
> | [   37.500579] x11: ffff800082b4d920 x10: ffff8000835f3a70 x9 : ffff8000800e55a0
> | [   37.503021] x8 : ffff80008002df18 x7 : ffff000004341e00 x6 : 00000000ffffffff
> | [   37.505452] x5 : 0000000000000000 x4 : ffff8000835f3e48 x3 : ffff8000835f3b80
> | [   37.507888] x2 : ffff80008002df18 x1 : ffff000007f7b000 x0 : ffff80008002df18
> | [   37.510319] Call trace:
> | [   37.511202]  arch_stack_walk+0x2d8/0x338
> | [   37.512541]  stack_trace_save_tsk+0x90/0x110
> | [   37.514012]  return_to_handler+0x0/0x48
> | [   37.515336]  return_to_handler+0x0/0x48
> | [   37.516657]  return_to_handler+0x0/0x48
> | [   37.517985]  return_to_handler+0x0/0x48
> | [   37.519305]  return_to_handler+0x0/0x48
> | [   37.520623]  return_to_handler+0x0/0x48
> | [   37.521957]  return_to_handler+0x0/0x48
> | [   37.523272]  return_to_handler+0x0/0x48
> | [   37.524595]  return_to_handler+0x0/0x48
> | [   37.525931]  return_to_handler+0x0/0x48
> | [   37.527254]  return_to_handler+0x0/0x48
> | [   37.528564]  el0t_64_sync_handler+0x120/0x130
> | [   37.530046]  el0t_64_sync+0x190/0x198
> | [   37.531310] ---[ end trace 0000000000000000 ]---
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] ftrace_stub_graph+0x8/0x8
> | [<0>] el0t_64_sync_handler+0x120/0x130
> | [<0>] el0t_64_sync+0x190/0x198

Hmm, I haven't seen this mode.

> 
> One interesting thing there is that there are two distinct failure modes: the
> unwind for the WARNING gives return_to_handler instead of the original return
> address, and the unwind returned from /proc/self/stack gives ftrace_stub_graph
> rather than the original return address.
> 
> > > 
> > > | Testing tracer function_graph: 
> > > | ------------[ cut here ]------------
> > > | WARNING: CPU: 2 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x3c0/0x3d8
> > > | Modules linked in:
> > > | CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc2-00026-gea1e68a341c2 #12
> > > | Hardware name: linux,dummy-virt (DT)
> > > | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > > | pc : arch_stack_walk+0x3c0/0x3d8
> > > | lr : arch_stack_walk+0x260/0x3d8
> > > | sp : ffff80008318be00
> > > | x29: ffff80008318be00 x28: ffff000003c0ae80 x27: 0000000000000000
> > > | x26: 0000000000000000 x25: ffff000003c0ae80 x24: 0000000000000000
> > > | x23: ffff8000800234c8 x22: ffff80008002dc30 x21: ffff800080035d10
> > > | x20: ffff80008318bee8 x19: ffff800080023460 x18: ffff800083453c68
> > > | x17: 0000000000000000 x16: ffff800083188000 x15: 000000008ccc5058
> > > | x14: 0000000000000004 x13: ffff800082b8c4f0 x12: 0000000000000000
> > > | x11: ffff800081fba9b0 x10: ffff80008318bff0 x9 : ffff800080010798
> > > | x8 : ffff80008002dc30 x7 : ffff000003c0ae80 x6 : 00000000ffffffff
> > > | x5 : 0000000000000000 x4 : ffff8000832a3c18 x3 : ffff80008318bff0
> > > | x2 : ffff80008002dc30 x1 : ffff80008002dc30 x0 : ffff80008002dc30
> > > | Call trace:
> > > |  arch_stack_walk+0x3c0/0x3d8
> > > |  return_address+0x40/0x80
> > > |  trace_hardirqs_on+0x8c/0x198
> > > |  __do_softirq+0xe8/0x440
> > > | ---[ end trace 0000000000000000 ]---
> 
> With the same hack to continue after failing to recover a return address, the
> failure in the selftest looks like:
> 
> | ------------[ cut here ]------------
> | WARNING: CPU: 7 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
> | Modules linked in:
> | CPU: 7 PID: 0 Comm: swapper/7 Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #14
> | Hardware name: linux,dummy-virt (DT)
> | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> | pc : arch_stack_walk+0x2d8/0x338
> | lr : arch_stack_walk+0x1bc/0x338
> | sp : ffff8000830c3e20
> | x29: ffff8000830c3e20 x28: ffff8000830c3ff0 x27: ffff8000830c3ec8
> | x26: ffff0000037e0000 x25: ffff8000830c4000 x24: ffff80008002e080
> | x23: ffff80008002e080 x22: ffff8000830c3ee8 x21: ffff800080023418
> | x20: ffff8000830c3f50 x19: ffff8000830c3f40 x18: ffffffffffffffff
> | x17: 0000000000000000 x16: ffff8000830c0000 x15: 0000000000000000
> | x14: 0000000000000002 x13: ffff8000800360f8 x12: ffff800080028330
> | x11: ffff800081f4a978 x10: ffff8000830c3ff0 x9 : ffff800080010798
> | x8 : ffff80008002e080 x7 : ffff0000037e0000 x6 : 00000000ffffffff
> | x5 : 0000000000000000 x4 : ffff8000831dbc18 x3 : ffff8000830c3ff0
> | x2 : ffff80008002e080 x1 : ffff0000040a3000 x0 : ffff80008002e080
> | Call trace:
> |  arch_stack_walk+0x2d8/0x338
> |  return_address+0x40/0x80
> |  trace_hardirqs_on+0x8c/0x198
> |  __do_softirq+0xe8/0x43c
> |  return_to_handler+0x0/0x48
> |  return_to_handler+0x0/0x48
> |  do_softirq_own_stack+0x24/0x38
> |  return_to_handler+0x0/0x48
> |  el1_interrupt+0x38/0x68
> |  el1h_64_irq_handler+0x18/0x28
> |  el1h_64_irq+0x64/0x68
> |  default_idle_call+0x70/0x178
> |  do_idle+0x228/0x290
> |  cpu_startup_entry+0x40/0x50
> |  secondary_start_kernel+0x138/0x160
> |  __secondary_switched+0xb8/0xc0
> | ---[ end trace 0000000000000000 ]---

This is the mode I usually see, and I have reproduced it. Here I also add a dump of the shadow stack.
It seems that the unwinder walks down to the bottom of the shadow stack.

/sys/kernel/tracing # echo function_graph > current_tracer 
[   89.887750] ------------[ cut here ]------------
[   89.889864] Dump: return_to_handler = ffffb45fc6a2f1e8
[   89.891833]  ret_stack[20]: 20406 0x20406 type = 1, index = 6
[   89.896118]  ret_stack[19]: ffff800080003be8 0xffff800080003be8 type = 2, index = 1000
[   89.896233]  ret_stack[18]: ffff800080003c20 0xffff800080003c20 type = 3, index = 32
[   89.896362]  ret_stack[17]: 0 0x0 type = 0, index = 0
[   89.896425]  ret_stack[16]: 14edac7710 0x14edac7710 type = 1, index = 784
[   89.896635]  ret_stack[15]: ffffb45fc6a1610c call_break_hook+0x4/0x108 type = 0, index = 268
[   89.897882]  ret_stack[14]: ffffb45fc6a162fc brk_handler+0x24/0x70 type = 0, index = 764
[   89.898139]  ret_stack[13]: 20406 0x20406 type = 1, index = 6
[   89.898337]  ret_stack[12]: ffff800080003c08 0xffff800080003c08 type = 3, index = 8
[   89.898554]  ret_stack[11]: ffff800080003c40 0xffff800080003c40 type = 3, index = 64
[   89.898645]  ret_stack[10]: 0 0x0 type = 0, index = 0
[   89.898832]  ret_stack[9]: 14eda8f920 0x14eda8f920 type = 2, index = 288
[   89.899069]  ret_stack[8]: ffffb45fc6a162dc brk_handler+0x4/0x70 type = 0, index = 732
[   89.899230]  ret_stack[7]: ffffb45fc6a36c24 do_debug_exception+0x74/0x108 type = 3, index = 36
[   89.899385]  ret_stack[6]: 20406 0x20406 type = 1, index = 6
[   89.899456]  ret_stack[5]: ffff800080003fb8 0xffff800080003fb8 type = 3, index = 952
[   89.899518]  ret_stack[4]: ffff800080003ff0 0xffff800080003ff0 type = 3, index = 1008
[   89.899578]  ret_stack[3]: ffff62a80534d21c 0xffff62a80534d21c type = 0, index = 540
[   89.899637]  ret_stack[2]: 14ed8ed2e0 0x14ed8ed2e0 type = 0, index = 736
[   89.899695]  ret_stack[1]: ffffb45fc6a1069c __do_softirq+0x4/0x4f0 type = 1, index = 668
[   89.899986] ret_stack[15]: ffff62a80534d070
[   89.900221] 	func: call_break_hook, return:brk_handler
[   89.901025] ret_stack[8]: ffff62a80534d038
[   89.901223] 	func: brk_handler, return:do_debug_exception
[   89.901450] ret_stack[1]: ffff62a80534d000
[   89.901501] 	func: __do_softirq, return:____do_softirq
[   89.901693] ret_stack[1]: 0
[   90.015738] WARNING: CPU: 0 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x380
[   90.022314] Modules linked in:
[   90.032375] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G                 N 6.7.0-rc8-00036-g3897e34e8ae2-dirty #79
[   90.038797] Hardware name: linux,dummy-virt (DT)
[   90.044170] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   90.048879] pc : arch_stack_walk+0x2d8/0x380
[   90.052222] lr : arch_stack_walk+0x248/0x380
[   90.055635] sp : ffff800080003e20
[   90.058147] x29: ffff800080003e20 x28: ffffb45fc91993c0 x27: 0000000000000000
[   90.063705] x26: 0000000000000000 x25: 0000000000000000 x24: ffffb45fc918fb40
[   90.068946] x23: ffffb45fc6a247b8 x22: ffffb45fc6a2f1e8 x21: ffffb45fc6a35b30
[   90.074894] x20: ffff800080003ef8 x19: ffffb45fc6a24750 x18: 0000000000000000
[   90.078796] x17: 0000000000000000 x16: ffff800080000000 x15: 0000ffffff477588
[   90.084310] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
[   90.088898] x11: ffffb45fc924ca08 x10: ffff62a8040341c0 x9 : ffffb45fc6a10760
[   90.094430] x8 : ffffb45fc6a2f1e8 x7 : ffffb45fc91993c0 x6 : ffff62a80534d000
[   90.099829] x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff800080003ff0
[   90.104442] x2 : ffffb45fc6a2f1e8 x1 : ffffb45fc6a2f1e8 x0 : ffffb45fc6a2f1e8
[   90.111735] Call trace:
[   90.114923]  arch_stack_walk+0x2d8/0x380
[   90.118820]  return_address+0x40/0x80
[   90.122057]  trace_hardirqs_on+0xa0/0x100
[   90.125001]  __do_softirq+0xec/0x4f0
[   90.130907] irq event stamp: 102709
[   90.134223] hardirqs last  enabled at (102707): [<ffffb45fc7af51d8>] default_idle_call+0xa0/0x160
[   90.140612] hardirqs last disabled at (102708): [<ffffb45fc7af26ec>] el1_interrupt+0x24/0x68
[   90.145877] softirqs last  enabled at (102702): [<ffffb45fc6a10b40>] __do_softirq+0x4a8/0x4f0
[   90.148952] softirqs last disabled at (102709): [<ffffb45fc6a2f1e8>] return_to_handler+0x0/0x50
[   90.152834] ---[ end trace 0000000000000000 ]---


> 
> The portion of the trace with:
> 
> 	__do_softirq+0xe8/0x43c
> 	return_to_handler+0x0/0x48
> 	return_to_handler+0x0/0x48
> 	do_softirq_own_stack+0x24/0x38
> 
> ... should be something like:
> 
> 	__do_softirq
> 	____do_softirq
> 	call_on_irq_stack	// asm trampoline, not traceable
> 	do_softirq_own_stack
> 
> The generated assembly for do_softirq_own_stack(), ____do_softirq(), and
> __do_softirq() is as I'd expect with no tail calls, so I can't see an obvious
> reason the return address cannot be recovered correctly.

My question is: even though the unwinder fails here, the program runs normally.
Isn't it a real stack entry?

> 
> > > That's a warning in arm64's unwind_recover_return_address() function, which
> > > fires when ftrace_graph_ret_addr() finds return_to_handler:
> > > 
> > > 	if (state->task->ret_stack &&
> > > 	    (state->pc == (unsigned long)return_to_handler)) {
> > > 		unsigned long orig_pc;
> > > 		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
> > > 						(void *)state->fp);
> > > 		if (WARN_ON_ONCE(state->pc == orig_pc))
> > > 			return -EINVAL;
> > > 		state->pc = orig_pc;
> > > 	}
> > > 
> > > The rationale there is that since tail calls are (currently) disabled on arm64,
> > > the only reason for ftrace_graph_ret_addr() to return return_to_handler is when
> > > it fails to find the original return address.
> > 
> > Yes. what about FP check?
> 
> Do you mean HAVE_FUNCTION_GRAPH_FP_TEST?
> 
> > That is enabled, and there are no warnings from ftrace_pop_return_trace(), so I
> believe push/pop is balanced.

OK.

> 
> We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
> not on the stack at the point function-entry is intercepted we use the FP as
> the retp value -- in the absence of tail calls this will be different between a
> caller and callee.
> 
> > > Does this change make it legitimate for ftrace_graph_ret_addr() to return
> > > return_to_handler in other cases, or is that a bug?
> > 
> > It should be a bug to be fixed.
> 
> Cool; thanks for confirming!
> 
> > > Either way, we'll need *some* way to recover the original return addresss...
> > 
> > At least it needs to dump the shadow stack so that we can analyze what
> > happened. 
> 
> Sounds like a plan; as above I'll have a go at putting that together and will
> dump the results here.

Yeah, please try the patch below.

Thanks,

---
 arch/arm64/kernel/stacktrace.c | 10 +++++++++-
 include/linux/ftrace.h         |  2 ++
 kernel/trace/fgraph.c          | 24 ++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 17f66a74c745..0eaba1bad599 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -81,8 +81,16 @@ unwind_recover_return_address(struct unwind_state *state)
 		unsigned long orig_pc;
 		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
 						(void *)state->fp);
-		if (WARN_ON_ONCE(state->pc == orig_pc))
+		if (WARN_ON_ONCE(state->pc == orig_pc)) {
+			static bool dumped;
+
+			if (!dumped) {
+				pr_info("Dump: return_to_handler = %lx\n", (unsigned long)return_to_handler);
+				dumped = true;
+				fgraph_dump_ret_stack(state->task);
+			}
 			return -EINVAL;
+		}
 		state->pc = orig_pc;
 	}
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ad28daa507f7..cfb79977fdec 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1258,6 +1258,8 @@ static inline void unpause_graph_tracing(void)
 {
 	atomic_dec(&current->tracing_graph_pause);
 }
+
+void fgraph_dump_ret_stack(struct task_struct *t);
 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */
 
 #define __notrace_funcgraph
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 0f11f80bdd6c..5dd560fbacce 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -437,6 +437,30 @@ get_ret_stack(struct task_struct *t, int offset, int *index)
 	return RET_STACK(t, offset);
 }
 
+void fgraph_dump_ret_stack(struct task_struct *t)
+{
+	struct ftrace_ret_stack *ret_stack;
+	unsigned long val;
+	int i, offset, next;
+
+	for (i = t->curr_ret_stack - 1; i > 0; i--) {
+		val = get_fgraph_entry(t, i);
+		pr_err(" ret_stack[%d]: %lx %pS type = %d, index = %d\n",
+			i, val, (void *)val, __get_type(val), __get_index(val));
+	}
+	offset = t->curr_ret_stack;
+	do {
+		ret_stack = get_ret_stack(t, offset, &next);
+		pr_err("ret_stack[%d]: %lx\n",
+			next + 1, (unsigned long)ret_stack);
+		if (ret_stack) {
+			pr_err("\tfunc: %ps, return:%ps\n",
+				(void *)ret_stack->func, (void *)ret_stack->ret);
+		}
+		offset = next;
+	} while (ret_stack);
+}
+
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
Mark Rutland Jan. 11, 2024, 11:01 a.m. UTC | #8
On Thu, Jan 11, 2024 at 11:15:33AM +0900, Masami Hiramatsu wrote:
> Hi Mark,
> 
> Thanks for the investigation.

Hi!

As a heads-up, I already figured out the problem and sent a fixup at:

  https://lore.kernel.org/lkml/ZZwEz8HsTa2IZE3L@FVFF77S0Q05N/

... and a more refined fix at:

  https://lore.kernel.org/lkml/ZZwOubTSbB_FucVz@FVFF77S0Q05N/

The gist was that before this patch, arm64 used the FP as the 'retp' value, but
this patch changed that to the address of fregs->lr. This meant that the fgraph
ret_stack contained all of the correct return addresses, but when the unwinder
called ftrace_graph_ret_addr() with FP as the 'retp' value, it failed to match
any entry in the ret_stack.

Since the fregs only exist transiently at function entry and exit, I'd prefer
that we still use the FP as the 'retp' value, which is what I proposed in the
fixups above.
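
To illustrate that with a minimal user-space model (this is not the kernel
code; the names and addresses below are made up): the retp cookie recorded at
function entry must be exactly the value the unwinder later passes back,
otherwise the lookup falls through and the unwinder is stuck with
return_to_handler:

	#include <stdio.h>

	/* Toy model of one fgraph ret_stack entry. */
	struct entry {
		unsigned long ret;	/* original return address */
		void *retp;		/* cookie recorded at function entry */
	};

	/* Toy model of ftrace_graph_ret_addr(): match on the retp cookie. */
	static unsigned long lookup(const struct entry *e, int n,
				    unsigned long pc, void *retp)
	{
		for (int i = n - 1; i >= 0; i--)
			if (e[i].retp == retp)
				return e[i].ret;
		return pc;	/* no match: pc stays return_to_handler */
	}

	int main(void)
	{
		unsigned long fp = 0x1000, lr_slot = 0x2000;
		/* Entry code recorded &fregs->lr (lr_slot), not the FP ... */
		struct entry stack[] = { { 0x80001234UL, (void *)lr_slot } };
		unsigned long r2h = 0xdeadUL;	/* stands in for return_to_handler */

		/* ... so a lookup keyed on the FP finds nothing (the WARN case), */
		printf("with FP cookie: %#lx\n", lookup(stack, 1, r2h, (void *)fp));
		/* while a lookup keyed on the same cookie recovers the address.  */
		printf("with lr cookie: %#lx\n", lookup(stack, 1, r2h, (void *)lr_slot));
		return 0;
	}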

Thanks,
Mark.

> On Mon, 8 Jan 2024 12:25:55 +0000
> Mark Rutland <mark.rutland@arm.com> wrote:
> 
> > Hi,
> > 
> > There's a bit more of an info-dump below; I'll go try to dump the fgraph shadow
> > stack so that we can analyse this in more detail.
> > 
> > On Mon, Jan 08, 2024 at 10:14:36AM +0900, Masami Hiramatsu wrote:
> > > On Fri, 5 Jan 2024 17:09:10 +0000
> > > Mark Rutland <mark.rutland@arm.com> wrote:
> > > 
> > > > On Mon, Dec 18, 2023 at 10:13:46PM +0900, Masami Hiramatsu (Google) wrote:
> > > > > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > > 
> > > > > Allow for instances to have their own ftrace_ops part of the fgraph_ops
> > > > > that makes the funtion_graph tracer filter on the set_ftrace_filter file
> > > > > of the instance and not the top instance.
> > > > > 
> > > > > This also change how the function_graph handles multiple instances on the
> > > > > shadow stack. Previously we use ARRAY type entries to record which one
> > > > > is enabled, and this makes it a bitmap of the fgraph_array's indexes.
> > > > > Previous function_graph_enter() expects calling back from
> > > > > prepare_ftrace_return() function which is called back only once if it is
> > > > > enabled. But this introduces different ftrace_ops for each fgraph
> > > > > instance and those are called from ftrace_graph_func() one by one. Thus
> > > > > we can not loop on the fgraph_array(), and need to reuse the ret_stack
> > > > > pushed by the previous instance. Finding the ret_stack is easy because
> > > > > we can check the ret_stack->func. But that is not enough for the self-
> > > > > recursive tail-call case. Thus fgraph uses the bitmap entry to find it
> > > > > is already set (this means that entry is for previous tail call).
> > > > > 
> > > > > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > > > 
> > > > As a heads-up, while testing the topic/fprobe-on-fgraph branch on arm64, I get
> > > > a warning which bisets down to this commit:
> > > 
> > > Hmm, so does this happen when enabling function graph tracer?
> > 
> > Yes; I see it during the function_graph boot-time self-test if I also enable
> > CONFIG_IRQSOFF_TRACER=y. I can also trigger it regardless of
> > CONFIG_IRQSOFF_TRACER if I cat /proc/self/stack with the function_graph tracer
> > enabled (note that I hacked the unwinder to continue after failing to recover a
> > return address):
> > 
> > | # mount -t tracefs none /sys/kernel/tracing/
> > | # echo function_graph > /sys/kernel/tracing/current_tracer
> > | # cat /proc/self/stack
> > | [   37.469980] ------------[ cut here ]------------
> > | [   37.471503] WARNING: CPU: 2 PID: 174 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
> > | [   37.474381] Modules linked in:
> > | [   37.475501] CPU: 2 PID: 174 Comm: cat Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #15
> > | [   37.478133] Hardware name: linux,dummy-virt (DT)
> > | [   37.479670] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > | [   37.481923] pc : arch_stack_walk+0x2d8/0x338
> > | [   37.483373] lr : arch_stack_walk+0x1bc/0x338
> > | [   37.484818] sp : ffff8000835f3a90
> > | [   37.485974] x29: ffff8000835f3a90 x28: ffff8000835f3b80 x27: ffff8000835f3b38
> > | [   37.488405] x26: ffff000004341e00 x25: ffff8000835f4000 x24: ffff80008002df18
> > | [   37.490842] x23: ffff80008002df18 x22: ffff8000835f3b60 x21: ffff80008015d240
> > | [   37.493269] x20: ffff8000835f3b50 x19: ffff8000835f3b40 x18: 0000000000000000
> > | [   37.495704] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> > | [   37.498144] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
> > | [   37.500579] x11: ffff800082b4d920 x10: ffff8000835f3a70 x9 : ffff8000800e55a0
> > | [   37.503021] x8 : ffff80008002df18 x7 : ffff000004341e00 x6 : 00000000ffffffff
> > | [   37.505452] x5 : 0000000000000000 x4 : ffff8000835f3e48 x3 : ffff8000835f3b80
> > | [   37.507888] x2 : ffff80008002df18 x1 : ffff000007f7b000 x0 : ffff80008002df18
> > | [   37.510319] Call trace:
> > | [   37.511202]  arch_stack_walk+0x2d8/0x338
> > | [   37.512541]  stack_trace_save_tsk+0x90/0x110
> > | [   37.514012]  return_to_handler+0x0/0x48
> > | [   37.515336]  return_to_handler+0x0/0x48
> > | [   37.516657]  return_to_handler+0x0/0x48
> > | [   37.517985]  return_to_handler+0x0/0x48
> > | [   37.519305]  return_to_handler+0x0/0x48
> > | [   37.520623]  return_to_handler+0x0/0x48
> > | [   37.521957]  return_to_handler+0x0/0x48
> > | [   37.523272]  return_to_handler+0x0/0x48
> > | [   37.524595]  return_to_handler+0x0/0x48
> > | [   37.525931]  return_to_handler+0x0/0x48
> > | [   37.527254]  return_to_handler+0x0/0x48
> > | [   37.528564]  el0t_64_sync_handler+0x120/0x130
> > | [   37.530046]  el0t_64_sync+0x190/0x198
> > | [   37.531310] ---[ end trace 0000000000000000 ]---
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] ftrace_stub_graph+0x8/0x8
> > | [<0>] el0t_64_sync_handler+0x120/0x130
> > | [<0>] el0t_64_sync+0x190/0x198
> 
> Hmm, I haven't see this mode.
> 
> > 
> > One interesting thing there is that there are two distinct failure modes: the
> > unwind for the WARNING gives return_to_handler instead of the original return
> > address, and the unwind returned from /proc/self/stack gives ftrace_stub_graph
> > rather than the original return address.
> > 
> > > > 
> > > > | Testing tracer function_graph: 
> > > > | ------------[ cut here ]------------
> > > > | WARNING: CPU: 2 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x3c0/0x3d8
> > > > | Modules linked in:
> > > > | CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc2-00026-gea1e68a341c2 #12
> > > > | Hardware name: linux,dummy-virt (DT)
> > > > | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > > > | pc : arch_stack_walk+0x3c0/0x3d8
> > > > | lr : arch_stack_walk+0x260/0x3d8
> > > > | sp : ffff80008318be00
> > > > | x29: ffff80008318be00 x28: ffff000003c0ae80 x27: 0000000000000000
> > > > | x26: 0000000000000000 x25: ffff000003c0ae80 x24: 0000000000000000
> > > > | x23: ffff8000800234c8 x22: ffff80008002dc30 x21: ffff800080035d10
> > > > | x20: ffff80008318bee8 x19: ffff800080023460 x18: ffff800083453c68
> > > > | x17: 0000000000000000 x16: ffff800083188000 x15: 000000008ccc5058
> > > > | x14: 0000000000000004 x13: ffff800082b8c4f0 x12: 0000000000000000
> > > > | x11: ffff800081fba9b0 x10: ffff80008318bff0 x9 : ffff800080010798
> > > > | x8 : ffff80008002dc30 x7 : ffff000003c0ae80 x6 : 00000000ffffffff
> > > > | x5 : 0000000000000000 x4 : ffff8000832a3c18 x3 : ffff80008318bff0
> > > > | x2 : ffff80008002dc30 x1 : ffff80008002dc30 x0 : ffff80008002dc30
> > > > | Call trace:
> > > > |  arch_stack_walk+0x3c0/0x3d8
> > > > |  return_address+0x40/0x80
> > > > |  trace_hardirqs_on+0x8c/0x198
> > > > |  __do_softirq+0xe8/0x440
> > > > | ---[ end trace 0000000000000000 ]---
> > 
> > With the smae hack to continue after failing to recover a return address, the
> > failure in the selftest looks like:
> > 
> > | ------------[ cut here ]------------
> > | WARNING: CPU: 7 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
> > | Modules linked in:
> > | CPU: 7 PID: 0 Comm: swapper/7 Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #14
> > | Hardware name: linux,dummy-virt (DT)
> > | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > | pc : arch_stack_walk+0x2d8/0x338
> > | lr : arch_stack_walk+0x1bc/0x338
> > | sp : ffff8000830c3e20
> > | x29: ffff8000830c3e20 x28: ffff8000830c3ff0 x27: ffff8000830c3ec8
> > | x26: ffff0000037e0000 x25: ffff8000830c4000 x24: ffff80008002e080
> > | x23: ffff80008002e080 x22: ffff8000830c3ee8 x21: ffff800080023418
> > | x20: ffff8000830c3f50 x19: ffff8000830c3f40 x18: ffffffffffffffff
> > | x17: 0000000000000000 x16: ffff8000830c0000 x15: 0000000000000000
> > | x14: 0000000000000002 x13: ffff8000800360f8 x12: ffff800080028330
> > | x11: ffff800081f4a978 x10: ffff8000830c3ff0 x9 : ffff800080010798
> > | x8 : ffff80008002e080 x7 : ffff0000037e0000 x6 : 00000000ffffffff
> > | x5 : 0000000000000000 x4 : ffff8000831dbc18 x3 : ffff8000830c3ff0
> > | x2 : ffff80008002e080 x1 : ffff0000040a3000 x0 : ffff80008002e080
> > | Call trace:
> > |  arch_stack_walk+0x2d8/0x338
> > |  return_address+0x40/0x80
> > |  trace_hardirqs_on+0x8c/0x198
> > |  __do_softirq+0xe8/0x43c
> > |  return_to_handler+0x0/0x48
> > |  return_to_handler+0x0/0x48
> > |  do_softirq_own_stack+0x24/0x38
> > |  return_to_handler+0x0/0x48
> > |  el1_interrupt+0x38/0x68
> > |  el1h_64_irq_handler+0x18/0x28
> > |  el1h_64_irq+0x64/0x68
> > |  default_idle_call+0x70/0x178
> > |  do_idle+0x228/0x290
> > |  cpu_startup_entry+0x40/0x50
> > |  secondary_start_kernel+0x138/0x160
> > |  __secondary_switched+0xb8/0xc0
> > | ---[ end trace 0000000000000000 ]---
> 
> I usually see this and reproduced. Here, I also add a dump of shadow stack.
> It seems that the unwinder goes to the bottome of the shadow stack.
> 
> /sys/kernel/tracing # echo function_graph > current_tracer 
> [   89.887750] ------------[ cut here ]------------
> [   89.889864] Dump: return_to_handler = ffffb45fc6a2f1e8
> [   89.891833]  ret_stack[20]: 20406 0x20406 type = 1, index = 6
> [   89.896118]  ret_stack[19]: ffff800080003be8 0xffff800080003be8 type = 2, index = 1000
> [   89.896233]  ret_stack[18]: ffff800080003c20 0xffff800080003c20 type = 3, index = 32
> [   89.896362]  ret_stack[17]: 0 0x0 type = 0, index = 0
> [   89.896425]  ret_stack[16]: 14edac7710 0x14edac7710 type = 1, index = 784
> [   89.896635]  ret_stack[15]: ffffb45fc6a1610c call_break_hook+0x4/0x108 type = 0, index = 268
> [   89.897882]  ret_stack[14]: ffffb45fc6a162fc brk_handler+0x24/0x70 type = 0, index = 764
> [   89.898139]  ret_stack[13]: 20406 0x20406 type = 1, index = 6
> [   89.898337]  ret_stack[12]: ffff800080003c08 0xffff800080003c08 type = 3, index = 8
> [   89.898554]  ret_stack[11]: ffff800080003c40 0xffff800080003c40 type = 3, index = 64
> [   89.898645]  ret_stack[10]: 0 0x0 type = 0, index = 0
> [   89.898832]  ret_stack[9]: 14eda8f920 0x14eda8f920 type = 2, index = 288
> [   89.899069]  ret_stack[8]: ffffb45fc6a162dc brk_handler+0x4/0x70 type = 0, index = 732
> [   89.899230]  ret_stack[7]: ffffb45fc6a36c24 do_debug_exception+0x74/0x108 type = 3, index = 36
> [   89.899385]  ret_stack[6]: 20406 0x20406 type = 1, index = 6
> [   89.899456]  ret_stack[5]: ffff800080003fb8 0xffff800080003fb8 type = 3, index = 952
> [   89.899518]  ret_stack[4]: ffff800080003ff0 0xffff800080003ff0 type = 3, index = 1008
> [   89.899578]  ret_stack[3]: ffff62a80534d21c 0xffff62a80534d21c type = 0, index = 540
> [   89.899637]  ret_stack[2]: 14ed8ed2e0 0x14ed8ed2e0 type = 0, index = 736
> [   89.899695]  ret_stack[1]: ffffb45fc6a1069c __do_softirq+0x4/0x4f0 type = 1, index = 668
> [   89.899986] ret_stack[15]: ffff62a80534d070
> [   89.900221] 	func: call_break_hook, return:brk_handler
> [   89.901025] ret_stack[8]: ffff62a80534d038
> [   89.901223] 	func: brk_handler, return:do_debug_exception
> [   89.901450] ret_stack[1]: ffff62a80534d000
> [   89.901501] 	func: __do_softirq, return:____do_softirq
> [   89.901693] ret_stack[1]: 0
> [   90.015738] WARNING: CPU: 0 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x380
> [   90.022314] Modules linked in:
> [   90.032375] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G                 N 6.7.0-rc8-00036-g3897e34e8ae2-dirty #79
> [   90.038797] Hardware name: linux,dummy-virt (DT)
> [   90.044170] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   90.048879] pc : arch_stack_walk+0x2d8/0x380
> [   90.052222] lr : arch_stack_walk+0x248/0x380
> [   90.055635] sp : ffff800080003e20
> [   90.058147] x29: ffff800080003e20 x28: ffffb45fc91993c0 x27: 0000000000000000
> [   90.063705] x26: 0000000000000000 x25: 0000000000000000 x24: ffffb45fc918fb40
> [   90.068946] x23: ffffb45fc6a247b8 x22: ffffb45fc6a2f1e8 x21: ffffb45fc6a35b30
> [   90.074894] x20: ffff800080003ef8 x19: ffffb45fc6a24750 x18: 0000000000000000
> [   90.078796] x17: 0000000000000000 x16: ffff800080000000 x15: 0000ffffff477588
> [   90.084310] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
> [   90.088898] x11: ffffb45fc924ca08 x10: ffff62a8040341c0 x9 : ffffb45fc6a10760
> [   90.094430] x8 : ffffb45fc6a2f1e8 x7 : ffffb45fc91993c0 x6 : ffff62a80534d000
> [   90.099829] x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff800080003ff0
> [   90.104442] x2 : ffffb45fc6a2f1e8 x1 : ffffb45fc6a2f1e8 x0 : ffffb45fc6a2f1e8
> [   90.111735] Call trace:
> [   90.114923]  arch_stack_walk+0x2d8/0x380
> [   90.118820]  return_address+0x40/0x80
> [   90.122057]  trace_hardirqs_on+0xa0/0x100
> [   90.125001]  __do_softirq+0xec/0x4f0
> [   90.130907] irq event stamp: 102709
> [   90.134223] hardirqs last  enabled at (102707): [<ffffb45fc7af51d8>] default_idle_call+0xa0/0x160
> [   90.140612] hardirqs last disabled at (102708): [<ffffb45fc7af26ec>] el1_interrupt+0x24/0x68
> [   90.145877] softirqs last  enabled at (102702): [<ffffb45fc6a10b40>] __do_softirq+0x4a8/0x4f0
> [   90.148952] softirqs last disabled at (102709): [<ffffb45fc6a2f1e8>] return_to_handler+0x0/0x50
> [   90.152834] ---[ end trace 0000000000000000 ]---
> 
> 
> > 
> > The portion of the trace with:
> > 
> > 	__do_softirq+0xe8/0x43c
> > 	return_to_handler+0x0/0x48
> > 	return_to_handler+0x0/0x48
> > 	do_softirq_own_stack+0x24/0x38
> > 
> > ... should be something like:
> > 
> > 	__do_softirq
> > 	____do_softirq
> > 	call_on_irq_stack	// asm trampoline, not traceable
> > 	do_softirq_own_stack
> > 
> > The generated assembly for do_softirq_own_stack(), ____do_softirq(), and
> > __do_softirq() is as I'd expect with no tail calls, so I can't see an obvious
> > reason the return address cannot be recovered correctly.
> 
> My question is that even if unwinder fails, the program runs normally.
> Isn't it a real stack entry?
> 
> > 
> > > > That's a warning in arm64's unwind_recover_return_address() function, which
> > > > fires when ftrace_graph_ret_addr() finds return_to_handler:
> > > > 
> > > > 	if (state->task->ret_stack &&
> > > > 	    (state->pc == (unsigned long)return_to_handler)) {
> > > > 		unsigned long orig_pc;
> > > > 		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
> > > > 						(void *)state->fp);
> > > > 		if (WARN_ON_ONCE(state->pc == orig_pc))
> > > > 			return -EINVAL;
> > > > 		state->pc = orig_pc;
> > > > 	}
> > > > 
> > > > The rationale there is that since tail calls are (currently) disabled on arm64,
> > > > the only reason for ftrace_graph_ret_addr() to return return_to_handler is when
> > > > it fails to find the original return address.
> > > 
> > > Yes. what about FP check?
> > 
> > Do you mean HAVE_FUNCTION_GRAPH_FP_TEST?
> > 
> > That is enabled, and there are warnings from ftrace_pop_return_trace(), so I
> > believe push/pop is balanced.
> 
> OK.
> 
> > 
> > We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
> > not on the stack at the point function-entry is intercepted we use the FP as
> > the retp value -- in the absence of tail calls this will be different between a
> > caller and callee.
> > 
> > > > Does this change make it legitimate for ftrace_graph_ret_addr() to return
> > > > return_to_handler in other cases, or is that a bug?
> > > 
> > > It should be a bug to be fixed.
> > 
> > Cool; thanks for confirming!
> > 
> > > > Either way, we'll need *some* way to recover the original return addresss...
> > > 
> > > At least it needs to dump the shadow stack so that we can analyze what
> > > happened. 
> > 
> > Sounds like a plan; as above I'll have a go at putting that together and will
> > dump the results here.
> 
> Yeah, please try below patch.
> 
> Thanks,
> 
> ---
>  arch/arm64/kernel/stacktrace.c | 10 +++++++++-
>  include/linux/ftrace.h         |  2 ++
>  kernel/trace/fgraph.c          | 24 ++++++++++++++++++++++++
>  3 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> index 17f66a74c745..0eaba1bad599 100644
> --- a/arch/arm64/kernel/stacktrace.c
> +++ b/arch/arm64/kernel/stacktrace.c
> @@ -81,8 +81,16 @@ unwind_recover_return_address(struct unwind_state *state)
>  		unsigned long orig_pc;
>  		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
>  						(void *)state->fp);
> -		if (WARN_ON_ONCE(state->pc == orig_pc))
> +		if (WARN_ON_ONCE(state->pc == orig_pc)) {
> +			static bool dumped;
> +
> +			if (!dumped) {
> +				pr_info("Dump: return_to_handler = %lx\n", (unsigned long)return_to_handler);
> +				dumped = true;
> +				fgraph_dump_ret_stack(state->task);
> +			}
>  			return -EINVAL;
> +		}
>  		state->pc = orig_pc;
>  	}
>  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index ad28daa507f7..cfb79977fdec 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -1258,6 +1258,8 @@ static inline void unpause_graph_tracing(void)
>  {
>  	atomic_dec(&current->tracing_graph_pause);
>  }
> +
> +void fgraph_dump_ret_stack(struct task_struct *t);
>  #else /* !CONFIG_FUNCTION_GRAPH_TRACER */
>  
>  #define __notrace_funcgraph
> diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
> index 0f11f80bdd6c..5dd560fbacce 100644
> --- a/kernel/trace/fgraph.c
> +++ b/kernel/trace/fgraph.c
> @@ -437,6 +437,30 @@ get_ret_stack(struct task_struct *t, int offset, int *index)
>  	return RET_STACK(t, offset);
>  }
>  
> +void fgraph_dump_ret_stack(struct task_struct *t)
> +{
> +	struct ftrace_ret_stack *ret_stack;
> +	unsigned long val;
> +	int i, offset, next;
> +
> +	for (i = t->curr_ret_stack - 1; i > 0; i--) {
> +		val = get_fgraph_entry(t, i);
> +		pr_err(" ret_stack[%d]: %lx %pS type = %d, index = %d\n",
> +			i, val, (void *)val, __get_type(val), __get_index(val));
> +	}
> +	offset = t->curr_ret_stack;
> +	do {
> +		ret_stack = get_ret_stack(t, offset, &next);
> +		pr_err("ret_stack[%d]: %lx\n",
> +			next + 1, (unsigned long)ret_stack);
> +		if (ret_stack) {
> +			pr_err("\tfunc: %ps, return:%ps\n",
> +				(void *)ret_stack->func, (void *)ret_stack->ret);
> +		}
> +		offset = next;
> +	} while (ret_stack);
> +}
> +
>  /* Both enabled by default (can be cleared by function_graph tracer flags */
>  static bool fgraph_sleep_time = true;
>  
> -- 
> 2.34.1
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
Masami Hiramatsu (Google) Jan. 11, 2024, 1:45 p.m. UTC | #9
Hi Mark,

On Thu, 11 Jan 2024 11:01:56 +0000
Mark Rutland <mark.rutland@arm.com> wrote:

> On Thu, Jan 11, 2024 at 11:15:33AM +0900, Masami Hiramatsu wrote:
> > Hi Mark,
> > 
> > Thanks for the investigation.
> 
> Hi!
> 
> As a heads-up, I already figured out the problem and sent a fixup at:
> 
>   https://lore.kernel.org/lkml/ZZwEz8HsTa2IZE3L@FVFF77S0Q05N/
> 
> ... and a more refined fix at:
> 
>   https://lore.kernel.org/lkml/ZZwOubTSbB_FucVz@FVFF77S0Q05N/

Oops, I missed those, and I have also confirmed that the fix works.

> 
> The gist was that before this patch, arm64 used the FP as the 'retp' value, but
> this patch changed that to the address of fregs->lr. This meant that the fgraph
> ret_stack contained all of the correct return addresses, but when the unwinder
> called ftrace_graph_ret_addr() with FP as the 'retp' value, it failed to match
> any entry in the ret_stack.

Yeah, this patch introduced the new arm64 ftrace_graph_func(), and I missed
passing the 'parent'... OK, let me fix that.
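
For reference, a one-line sketch of the corrected call, following Mark's
fixup quoted in the next message (with fp taken from fregs->fp), passes the
frame pointer for both the frame-pointer and the retp arguments:

	if (!function_graph_enter_ops(*parent, ip, fp, (void *)fp, gops))
		*parent = (unsigned long)&return_to_handler;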

> 
> Since the fregs only exist transiently at function entry and exit, I'd prefer
> that we still use the FP as the 'retp' value, which is what I proposed in the
> fixups above.

OK. Let me add it.

Thank you!

> 
> Thanks,
> Mark.
> 
> > On Mon, 8 Jan 2024 12:25:55 +0000
> > Mark Rutland <mark.rutland@arm.com> wrote:
> > 
> > > Hi,
> > > 
> > > There's a bit more of an info-dump below; I'll go try to dump the fgraph shadow
> > > stack so that we can analyse this in more detail.
> > > 
> > > On Mon, Jan 08, 2024 at 10:14:36AM +0900, Masami Hiramatsu wrote:
> > > > On Fri, 5 Jan 2024 17:09:10 +0000
> > > > Mark Rutland <mark.rutland@arm.com> wrote:
> > > > 
> > > > > On Mon, Dec 18, 2023 at 10:13:46PM +0900, Masami Hiramatsu (Google) wrote:
> > > > > > From: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > > > 
> > > > > > Allow for instances to have their own ftrace_ops part of the fgraph_ops
> > > > > > that makes the funtion_graph tracer filter on the set_ftrace_filter file
> > > > > > of the instance and not the top instance.
> > > > > > 
> > > > > > This also change how the function_graph handles multiple instances on the
> > > > > > shadow stack. Previously we use ARRAY type entries to record which one
> > > > > > is enabled, and this makes it a bitmap of the fgraph_array's indexes.
> > > > > > Previous function_graph_enter() expects calling back from
> > > > > > prepare_ftrace_return() function which is called back only once if it is
> > > > > > enabled. But this introduces different ftrace_ops for each fgraph
> > > > > > instance and those are called from ftrace_graph_func() one by one. Thus
> > > > > > we can not loop on the fgraph_array(), and need to reuse the ret_stack
> > > > > > pushed by the previous instance. Finding the ret_stack is easy because
> > > > > > we can check the ret_stack->func. But that is not enough for the self-
> > > > > > recursive tail-call case. Thus fgraph uses the bitmap entry to find it
> > > > > > is already set (this means that entry is for previous tail call).
> > > > > > 
> > > > > > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > > > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > > > > 
> > > > > As a heads-up, while testing the topic/fprobe-on-fgraph branch on arm64, I get
> > > > > a warning which bisets down to this commit:
> > > > 
> > > > Hmm, so does this happen when enabling function graph tracer?
> > > 
> > > Yes; I see it during the function_graph boot-time self-test if I also enable
> > > CONFIG_IRQSOFF_TRACER=y. I can also trigger it regardless of
> > > CONFIG_IRQSOFF_TRACER if I cat /proc/self/stack with the function_graph tracer
> > > enabled (note that I hacked the unwinder to continue after failing to recover a
> > > return address):
> > > 
> > > | # mount -t tracefs none /sys/kernel/tracing/
> > > | # echo function_graph > /sys/kernel/tracing/current_tracer
> > > | # cat /proc/self/stack
> > > | [   37.469980] ------------[ cut here ]------------
> > > | [   37.471503] WARNING: CPU: 2 PID: 174 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
> > > | [   37.474381] Modules linked in:
> > > | [   37.475501] CPU: 2 PID: 174 Comm: cat Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #15
> > > | [   37.478133] Hardware name: linux,dummy-virt (DT)
> > > | [   37.479670] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > > | [   37.481923] pc : arch_stack_walk+0x2d8/0x338
> > > | [   37.483373] lr : arch_stack_walk+0x1bc/0x338
> > > | [   37.484818] sp : ffff8000835f3a90
> > > | [   37.485974] x29: ffff8000835f3a90 x28: ffff8000835f3b80 x27: ffff8000835f3b38
> > > | [   37.488405] x26: ffff000004341e00 x25: ffff8000835f4000 x24: ffff80008002df18
> > > | [   37.490842] x23: ffff80008002df18 x22: ffff8000835f3b60 x21: ffff80008015d240
> > > | [   37.493269] x20: ffff8000835f3b50 x19: ffff8000835f3b40 x18: 0000000000000000
> > > | [   37.495704] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> > > | [   37.498144] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
> > > | [   37.500579] x11: ffff800082b4d920 x10: ffff8000835f3a70 x9 : ffff8000800e55a0
> > > | [   37.503021] x8 : ffff80008002df18 x7 : ffff000004341e00 x6 : 00000000ffffffff
> > > | [   37.505452] x5 : 0000000000000000 x4 : ffff8000835f3e48 x3 : ffff8000835f3b80
> > > | [   37.507888] x2 : ffff80008002df18 x1 : ffff000007f7b000 x0 : ffff80008002df18
> > > | [   37.510319] Call trace:
> > > | [   37.511202]  arch_stack_walk+0x2d8/0x338
> > > | [   37.512541]  stack_trace_save_tsk+0x90/0x110
> > > | [   37.514012]  return_to_handler+0x0/0x48
> > > | [   37.515336]  return_to_handler+0x0/0x48
> > > | [   37.516657]  return_to_handler+0x0/0x48
> > > | [   37.517985]  return_to_handler+0x0/0x48
> > > | [   37.519305]  return_to_handler+0x0/0x48
> > > | [   37.520623]  return_to_handler+0x0/0x48
> > > | [   37.521957]  return_to_handler+0x0/0x48
> > > | [   37.523272]  return_to_handler+0x0/0x48
> > > | [   37.524595]  return_to_handler+0x0/0x48
> > > | [   37.525931]  return_to_handler+0x0/0x48
> > > | [   37.527254]  return_to_handler+0x0/0x48
> > > | [   37.528564]  el0t_64_sync_handler+0x120/0x130
> > > | [   37.530046]  el0t_64_sync+0x190/0x198
> > > | [   37.531310] ---[ end trace 0000000000000000 ]---
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] ftrace_stub_graph+0x8/0x8
> > > | [<0>] el0t_64_sync_handler+0x120/0x130
> > > | [<0>] el0t_64_sync+0x190/0x198
> > 
> > Hmm, I haven't see this mode.
> > 
> > > 
> > > One interesting thing there is that there are two distinct failure modes: the
> > > unwind for the WARNING gives return_to_handler instead of the original return
> > > address, and the unwind returned from /proc/self/stack gives ftrace_stub_graph
> > > rather than the original return address.
> > > 
> > > > > 
> > > > > | Testing tracer function_graph: 
> > > > > | ------------[ cut here ]------------
> > > > > | WARNING: CPU: 2 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x3c0/0x3d8
> > > > > | Modules linked in:
> > > > > | CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc2-00026-gea1e68a341c2 #12
> > > > > | Hardware name: linux,dummy-virt (DT)
> > > > > | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > > > > | pc : arch_stack_walk+0x3c0/0x3d8
> > > > > | lr : arch_stack_walk+0x260/0x3d8
> > > > > | sp : ffff80008318be00
> > > > > | x29: ffff80008318be00 x28: ffff000003c0ae80 x27: 0000000000000000
> > > > > | x26: 0000000000000000 x25: ffff000003c0ae80 x24: 0000000000000000
> > > > > | x23: ffff8000800234c8 x22: ffff80008002dc30 x21: ffff800080035d10
> > > > > | x20: ffff80008318bee8 x19: ffff800080023460 x18: ffff800083453c68
> > > > > | x17: 0000000000000000 x16: ffff800083188000 x15: 000000008ccc5058
> > > > > | x14: 0000000000000004 x13: ffff800082b8c4f0 x12: 0000000000000000
> > > > > | x11: ffff800081fba9b0 x10: ffff80008318bff0 x9 : ffff800080010798
> > > > > | x8 : ffff80008002dc30 x7 : ffff000003c0ae80 x6 : 00000000ffffffff
> > > > > | x5 : 0000000000000000 x4 : ffff8000832a3c18 x3 : ffff80008318bff0
> > > > > | x2 : ffff80008002dc30 x1 : ffff80008002dc30 x0 : ffff80008002dc30
> > > > > | Call trace:
> > > > > |  arch_stack_walk+0x3c0/0x3d8
> > > > > |  return_address+0x40/0x80
> > > > > |  trace_hardirqs_on+0x8c/0x198
> > > > > |  __do_softirq+0xe8/0x440
> > > > > | ---[ end trace 0000000000000000 ]---
> > > 
> > > With the smae hack to continue after failing to recover a return address, the
> > > failure in the selftest looks like:
> > > 
> > > | ------------[ cut here ]------------
> > > | WARNING: CPU: 7 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x338
> > > | Modules linked in:
> > > | CPU: 7 PID: 0 Comm: swapper/7 Not tainted 6.7.0-rc2-00026-gea1e68a341c2-dirty #14
> > > | Hardware name: linux,dummy-virt (DT)
> > > | pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > > | pc : arch_stack_walk+0x2d8/0x338
> > > | lr : arch_stack_walk+0x1bc/0x338
> > > | sp : ffff8000830c3e20
> > > | x29: ffff8000830c3e20 x28: ffff8000830c3ff0 x27: ffff8000830c3ec8
> > > | x26: ffff0000037e0000 x25: ffff8000830c4000 x24: ffff80008002e080
> > > | x23: ffff80008002e080 x22: ffff8000830c3ee8 x21: ffff800080023418
> > > | x20: ffff8000830c3f50 x19: ffff8000830c3f40 x18: ffffffffffffffff
> > > | x17: 0000000000000000 x16: ffff8000830c0000 x15: 0000000000000000
> > > | x14: 0000000000000002 x13: ffff8000800360f8 x12: ffff800080028330
> > > | x11: ffff800081f4a978 x10: ffff8000830c3ff0 x9 : ffff800080010798
> > > | x8 : ffff80008002e080 x7 : ffff0000037e0000 x6 : 00000000ffffffff
> > > | x5 : 0000000000000000 x4 : ffff8000831dbc18 x3 : ffff8000830c3ff0
> > > | x2 : ffff80008002e080 x1 : ffff0000040a3000 x0 : ffff80008002e080
> > > | Call trace:
> > > |  arch_stack_walk+0x2d8/0x338
> > > |  return_address+0x40/0x80
> > > |  trace_hardirqs_on+0x8c/0x198
> > > |  __do_softirq+0xe8/0x43c
> > > |  return_to_handler+0x0/0x48
> > > |  return_to_handler+0x0/0x48
> > > |  do_softirq_own_stack+0x24/0x38
> > > |  return_to_handler+0x0/0x48
> > > |  el1_interrupt+0x38/0x68
> > > |  el1h_64_irq_handler+0x18/0x28
> > > |  el1h_64_irq+0x64/0x68
> > > |  default_idle_call+0x70/0x178
> > > |  do_idle+0x228/0x290
> > > |  cpu_startup_entry+0x40/0x50
> > > |  secondary_start_kernel+0x138/0x160
> > > |  __secondary_switched+0xb8/0xc0
> > > | ---[ end trace 0000000000000000 ]---
> > 
> > I usually see this and reproduced. Here, I also add a dump of shadow stack.
> > It seems that the unwinder goes to the bottome of the shadow stack.
> > 
> > /sys/kernel/tracing # echo function_graph > current_tracer 
> > [   89.887750] ------------[ cut here ]------------
> > [   89.889864] Dump: return_to_handler = ffffb45fc6a2f1e8
> > [   89.891833]  ret_stack[20]: 20406 0x20406 type = 1, index = 6
> > [   89.896118]  ret_stack[19]: ffff800080003be8 0xffff800080003be8 type = 2, index = 1000
> > [   89.896233]  ret_stack[18]: ffff800080003c20 0xffff800080003c20 type = 3, index = 32
> > [   89.896362]  ret_stack[17]: 0 0x0 type = 0, index = 0
> > [   89.896425]  ret_stack[16]: 14edac7710 0x14edac7710 type = 1, index = 784
> > [   89.896635]  ret_stack[15]: ffffb45fc6a1610c call_break_hook+0x4/0x108 type = 0, index = 268
> > [   89.897882]  ret_stack[14]: ffffb45fc6a162fc brk_handler+0x24/0x70 type = 0, index = 764
> > [   89.898139]  ret_stack[13]: 20406 0x20406 type = 1, index = 6
> > [   89.898337]  ret_stack[12]: ffff800080003c08 0xffff800080003c08 type = 3, index = 8
> > [   89.898554]  ret_stack[11]: ffff800080003c40 0xffff800080003c40 type = 3, index = 64
> > [   89.898645]  ret_stack[10]: 0 0x0 type = 0, index = 0
> > [   89.898832]  ret_stack[9]: 14eda8f920 0x14eda8f920 type = 2, index = 288
> > [   89.899069]  ret_stack[8]: ffffb45fc6a162dc brk_handler+0x4/0x70 type = 0, index = 732
> > [   89.899230]  ret_stack[7]: ffffb45fc6a36c24 do_debug_exception+0x74/0x108 type = 3, index = 36
> > [   89.899385]  ret_stack[6]: 20406 0x20406 type = 1, index = 6
> > [   89.899456]  ret_stack[5]: ffff800080003fb8 0xffff800080003fb8 type = 3, index = 952
> > [   89.899518]  ret_stack[4]: ffff800080003ff0 0xffff800080003ff0 type = 3, index = 1008
> > [   89.899578]  ret_stack[3]: ffff62a80534d21c 0xffff62a80534d21c type = 0, index = 540
> > [   89.899637]  ret_stack[2]: 14ed8ed2e0 0x14ed8ed2e0 type = 0, index = 736
> > [   89.899695]  ret_stack[1]: ffffb45fc6a1069c __do_softirq+0x4/0x4f0 type = 1, index = 668
> > [   89.899986] ret_stack[15]: ffff62a80534d070
> > [   89.900221] 	func: call_break_hook, return:brk_handler
> > [   89.901025] ret_stack[8]: ffff62a80534d038
> > [   89.901223] 	func: brk_handler, return:do_debug_exception
> > [   89.901450] ret_stack[1]: ffff62a80534d000
> > [   89.901501] 	func: __do_softirq, return:____do_softirq
> > [   89.901693] ret_stack[1]: 0
> > [   90.015738] WARNING: CPU: 0 PID: 0 at arch/arm64/kernel/stacktrace.c:84 arch_stack_walk+0x2d8/0x380
> > [   90.022314] Modules linked in:
> > [   90.032375] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G                 N 6.7.0-rc8-00036-g3897e34e8ae2-dirty #79
> > [   90.038797] Hardware name: linux,dummy-virt (DT)
> > [   90.044170] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > [   90.048879] pc : arch_stack_walk+0x2d8/0x380
> > [   90.052222] lr : arch_stack_walk+0x248/0x380
> > [   90.055635] sp : ffff800080003e20
> > [   90.058147] x29: ffff800080003e20 x28: ffffb45fc91993c0 x27: 0000000000000000
> > [   90.063705] x26: 0000000000000000 x25: 0000000000000000 x24: ffffb45fc918fb40
> > [   90.068946] x23: ffffb45fc6a247b8 x22: ffffb45fc6a2f1e8 x21: ffffb45fc6a35b30
> > [   90.074894] x20: ffff800080003ef8 x19: ffffb45fc6a24750 x18: 0000000000000000
> > [   90.078796] x17: 0000000000000000 x16: ffff800080000000 x15: 0000ffffff477588
> > [   90.084310] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
> > [   90.088898] x11: ffffb45fc924ca08 x10: ffff62a8040341c0 x9 : ffffb45fc6a10760
> > [   90.094430] x8 : ffffb45fc6a2f1e8 x7 : ffffb45fc91993c0 x6 : ffff62a80534d000
> > [   90.099829] x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff800080003ff0
> > [   90.104442] x2 : ffffb45fc6a2f1e8 x1 : ffffb45fc6a2f1e8 x0 : ffffb45fc6a2f1e8
> > [   90.111735] Call trace:
> > [   90.114923]  arch_stack_walk+0x2d8/0x380
> > [   90.118820]  return_address+0x40/0x80
> > [   90.122057]  trace_hardirqs_on+0xa0/0x100
> > [   90.125001]  __do_softirq+0xec/0x4f0
> > [   90.130907] irq event stamp: 102709
> > [   90.134223] hardirqs last  enabled at (102707): [<ffffb45fc7af51d8>] default_idle_call+0xa0/0x160
> > [   90.140612] hardirqs last disabled at (102708): [<ffffb45fc7af26ec>] el1_interrupt+0x24/0x68
> > [   90.145877] softirqs last  enabled at (102702): [<ffffb45fc6a10b40>] __do_softirq+0x4a8/0x4f0
> > [   90.148952] softirqs last disabled at (102709): [<ffffb45fc6a2f1e8>] return_to_handler+0x0/0x50
> > [   90.152834] ---[ end trace 0000000000000000 ]---
> > 
> > 
> > > 
> > > The portion of the trace with:
> > > 
> > > 	__do_softirq+0xe8/0x43c
> > > 	return_to_handler+0x0/0x48
> > > 	return_to_handler+0x0/0x48
> > > 	do_softirq_own_stack+0x24/0x38
> > > 
> > > ... should be something like:
> > > 
> > > 	__do_softirq
> > > 	____do_softirq
> > > 	call_on_irq_stack	// asm trampoline, not traceable
> > > 	do_softirq_own_stack
> > > 
> > > The generated assembly for do_softirq_own_stack(), ____do_softirq(), and
> > > __do_softirq() is as I'd expect with no tail calls, so I can't see an obvious
> > > reason the return address cannot be recovered correctly.
> > 
> > My question is that even if unwinder fails, the program runs normally.
> > Isn't it a real stack entry?
> > 
> > > 
> > > > > That's a warning in arm64's unwind_recover_return_address() function, which
> > > > > fires when ftrace_graph_ret_addr() finds return_to_handler:
> > > > > 
> > > > > 	if (state->task->ret_stack &&
> > > > > 	    (state->pc == (unsigned long)return_to_handler)) {
> > > > > 		unsigned long orig_pc;
> > > > > 		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
> > > > > 						(void *)state->fp);
> > > > > 		if (WARN_ON_ONCE(state->pc == orig_pc))
> > > > > 			return -EINVAL;
> > > > > 		state->pc = orig_pc;
> > > > > 	}
> > > > > 
> > > > > The rationale there is that since tail calls are (currently) disabled on arm64,
> > > > > the only reason for ftrace_graph_ret_addr() to return return_to_handler is when
> > > > > it fails to find the original return address.
> > > > 
> > > > Yes. what about FP check?
> > > 
> > > Do you mean HAVE_FUNCTION_GRAPH_FP_TEST?
> > > 
> > > That is enabled, and there are warnings from ftrace_pop_return_trace(), so I
> > > believe push/pop is balanced.
> > 
> > OK.
> > 
> > > 
> > > We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
> > > not on the stack at the point function-entry is intercepted we use the FP as
> > > the retp value -- in the absence of tail calls this will be different between a
> > > caller and callee.
> > > 
> > > > > Does this change make it legitimate for ftrace_graph_ret_addr() to return
> > > > > return_to_handler in other cases, or is that a bug?
> > > > 
> > > > It should be a bug to be fixed.
> > > 
> > > Cool; thanks for confirming!
> > > 
> > > > > Either way, we'll need *some* way to recover the original return addresss...
> > > > 
> > > > At least it needs to dump the shadow stack so that we can analyze what
> > > > happened. 
> > > 
> > > Sounds like a plan; as above I'll have a go at putting that together and will
> > > dump the results here.
> > 
> > Yeah, please try below patch.
> > 
> > Thanks,
> > 
> > ---
> >  arch/arm64/kernel/stacktrace.c | 10 +++++++++-
> >  include/linux/ftrace.h         |  2 ++
> >  kernel/trace/fgraph.c          | 24 ++++++++++++++++++++++++
> >  3 files changed, 35 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
> > index 17f66a74c745..0eaba1bad599 100644
> > --- a/arch/arm64/kernel/stacktrace.c
> > +++ b/arch/arm64/kernel/stacktrace.c
> > @@ -81,8 +81,16 @@ unwind_recover_return_address(struct unwind_state *state)
> >  		unsigned long orig_pc;
> >  		orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc,
> >  						(void *)state->fp);
> > -		if (WARN_ON_ONCE(state->pc == orig_pc))
> > +		if (WARN_ON_ONCE(state->pc == orig_pc)) {
> > +			static bool dumped;
> > +
> > +			if (!dumped) {
> > +				pr_info("Dump: return_to_handler = %lx\n", (unsigned long)return_to_handler);
> > +				dumped = true;
> > +				fgraph_dump_ret_stack(state->task);
> > +			}
> >  			return -EINVAL;
> > +		}
> >  		state->pc = orig_pc;
> >  	}
> >  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > index ad28daa507f7..cfb79977fdec 100644
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -1258,6 +1258,8 @@ static inline void unpause_graph_tracing(void)
> >  {
> >  	atomic_dec(&current->tracing_graph_pause);
> >  }
> > +
> > +void fgraph_dump_ret_stack(struct task_struct *t);
> >  #else /* !CONFIG_FUNCTION_GRAPH_TRACER */
> >  
> >  #define __notrace_funcgraph
> > diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
> > index 0f11f80bdd6c..5dd560fbacce 100644
> > --- a/kernel/trace/fgraph.c
> > +++ b/kernel/trace/fgraph.c
> > @@ -437,6 +437,30 @@ get_ret_stack(struct task_struct *t, int offset, int *index)
> >  	return RET_STACK(t, offset);
> >  }
> >  
> > +void fgraph_dump_ret_stack(struct task_struct *t)
> > +{
> > +	struct ftrace_ret_stack *ret_stack;
> > +	unsigned long val;
> > +	int i, offset, next;
> > +
> > +	for (i = t->curr_ret_stack - 1; i > 0; i--) {
> > +		val = get_fgraph_entry(t, i);
> > +		pr_err(" ret_stack[%d]: %lx %pS type = %d, index = %d\n",
> > +			i, val, (void *)val, __get_type(val), __get_index(val));
> > +	}
> > +	offset = t->curr_ret_stack;
> > +	do {
> > +		ret_stack = get_ret_stack(t, offset, &next);
> > +		pr_err("ret_stack[%d]: %lx\n",
> > +			next + 1, (unsigned long)ret_stack);
> > +		if (ret_stack) {
> > +			pr_err("\tfunc: %ps, return:%ps\n",
> > +				(void *)ret_stack->func, (void *)ret_stack->ret);
> > +		}
> > +		offset = next;
> > +	} while (ret_stack);
> > +}
> > +
> >  /* Both enabled by default (can be cleared by function_graph tracer flags */
> >  static bool fgraph_sleep_time = true;
> >  
> > -- 
> > 2.34.1
> > 
> > -- 
> > Masami Hiramatsu (Google) <mhiramat@kernel.org>
> >
Masami Hiramatsu (Google) Jan. 11, 2024, 1:47 p.m. UTC | #10
On Mon, 8 Jan 2024 15:03:21 +0000
Mark Rutland <mark.rutland@arm.com> wrote:

> On Mon, Jan 08, 2024 at 02:21:03PM +0000, Mark Rutland wrote:
> > On Mon, Jan 08, 2024 at 12:25:55PM +0000, Mark Rutland wrote:
> > > We also have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, but since the return address is
> > > not on the stack at the point function-entry is intercepted we use the FP as
> > > the retp value -- in the absence of tail calls this will be different between a
> > > caller and callee.
> > 
> > Ah; I just spotted that this patch changed that in ftrace_graph_func(), which
> > is the source of the bug. 
> > 
> > As of this patch, we use the address of fregs->lr as the retp value, but the
> > unwinder still uses the FP value, and so when unwind_recover_return_address()
> > calls ftrace_graph_ret_addr(), the retp value won't match the expected entry on
> > the fgraph ret_stack, resulting in failing to find the expected entry.
> > 
> > Since the ftrace_regs only exist transiently during function entry/exit, it's
> > possible for a stackframe to reuse that same address on the stack, which would
> > result in finding a different entry by mistake.
> > 
> > The diff below restores the existing behaviour and fixes the issue for me.
> > Could you please fold that into this patch?
> > 
> > On a separate note, looking at how this patch changed arm64's
> > ftrace_graph_func(), do we need similar changes to arm64's
> > prepare_ftrace_return() for the old-style mcount based ftrace?
> > 
> > Mark.
> > 
> > ---->8----
> > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> > index 205937e04ece..329092ce06ba 100644
> > --- a/arch/arm64/kernel/ftrace.c
> > +++ b/arch/arm64/kernel/ftrace.c
> > @@ -495,7 +495,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
> >         if (bit < 0)
> >                 return;
> >  
> > -       if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
> > +       if (!function_graph_enter_ops(*parent, ip, fregs->fp, (void *)fregs->fp, gops))
> >                 *parent = (unsigned long)&return_to_handler;
> >  
> >         ftrace_test_recursion_unlock(bit);
> 
> Thinking some more, this line gets excessively long when we pass the fregs too,
> so it's probably worth adding a local variable for fp, i.e. the diff below.

Yeah, that will be better for keeping the line short.

Thank you,

> 
> Mark.
> 
> ---->8----
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index 205937e04ece..d4e142ef4686 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -481,8 +481,9 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
>  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>                        struct ftrace_ops *op, struct ftrace_regs *fregs)
>  {
> -       unsigned long *parent = &fregs->lr;
>         struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
> +       unsigned long *parent = &fregs->lr;
> +       unsigned long fp = fregs->fp;
>         int bit;
>  
>         if (unlikely(ftrace_graph_is_dead()))
> @@ -495,7 +496,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
>         if (bit < 0)
>                 return;
>  
> -       if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
> +       if (!function_graph_enter_ops(*parent, ip, fp, (void *)fp, gops))
>                 *parent = (unsigned long)&return_to_handler;
>  
>         ftrace_test_recursion_unlock(bit);
>
diff mbox series

Patch

diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index a650f5e11fc5..205937e04ece 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -481,7 +481,24 @@  void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	prepare_ftrace_return(ip, &fregs->lr, fregs->fp);
+	unsigned long *parent = &fregs->lr;
+	struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+	int bit;
+
+	if (unlikely(ftrace_graph_is_dead()))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	bit = ftrace_test_recursion_trylock(ip, *parent);
+	if (bit < 0)
+		return;
+
+	if (!function_graph_enter_ops(*parent, ip, fregs->fp, parent, gops))
+		*parent = (unsigned long)&return_to_handler;
+
+	ftrace_test_recursion_unlock(bit);
 }
 #else
 /*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 12df54ff0e81..845e29b4254f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -657,9 +657,24 @@  void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
 	struct pt_regs *regs = &fregs->regs;
-	unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+	unsigned long *parent = (unsigned long *)kernel_stack_pointer(regs);
+	struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+	int bit;
+
+	if (unlikely(ftrace_graph_is_dead()))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
 
-	prepare_ftrace_return(ip, (unsigned long *)stack, 0);
+	bit = ftrace_test_recursion_trylock(ip, *parent);
+	if (bit < 0)
+		return;
+
+	if (!function_graph_enter_ops(*parent, ip, 0, parent, gops))
+		*parent = (unsigned long)&return_to_handler;
+
+	ftrace_test_recursion_unlock(bit);
 }
 #endif
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7b08169aa51d..c431a33fe789 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1070,7 +1070,9 @@  extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph
 struct fgraph_ops {
 	trace_func_graph_ent_t		entryfunc;
 	trace_func_graph_ret_t		retfunc;
+	struct ftrace_ops		ops; /* for the hash lists */
 	void				*private;
+	int				idx;
 };
 
 /*
@@ -1104,6 +1106,11 @@  extern int
 function_graph_enter(unsigned long ret, unsigned long func,
 		     unsigned long frame_pointer, unsigned long *retp);
 
+extern int
+function_graph_enter_ops(unsigned long ret, unsigned long func,
+			 unsigned long frame_pointer, unsigned long *retp,
+			 struct fgraph_ops *gops);
+
 struct ftrace_ret_stack *
 ftrace_graph_get_ret_stack(struct task_struct *task, int idx);
 
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 62c35d6d95f9..6f537ebd3ed7 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@ 
  *
  * Highly modified by Steven Rostedt (VMware).
  */
+#include <linux/bits.h>
 #include <linux/jump_label.h>
 #include <linux/suspend.h>
 #include <linux/ftrace.h>
@@ -17,22 +18,15 @@ 
 #include "ftrace_internal.h"
 #include "trace.h"
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-#define ASSIGN_OPS_HASH(opsname, val) \
-	.func_hash		= val, \
-	.local_hash.regex_lock	= __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
-#else
-#define ASSIGN_OPS_HASH(opsname, val)
-#endif
-
 #define FGRAPH_RET_SIZE sizeof(struct ftrace_ret_stack)
 #define FGRAPH_RET_INDEX (FGRAPH_RET_SIZE / sizeof(long))
 
 /*
  * On entry to a function (via function_graph_enter()), a new ftrace_ret_stack
- * is allocated on the task's ret_stack, then each fgraph_ops on the
- * fgraph_array[]'s entryfunc is called and if that returns non-zero, the
- * index into the fgraph_array[] for that fgraph_ops is added to the ret_stack.
+ * is allocated on the task's ret_stack with indexes entry, then each
+ * fgraph_ops on the fgraph_array[]'s entryfunc is called and if that returns
+ * non-zero, the index into the fgraph_array[] for that fgraph_ops is recorded
+ * on the indexes entry as a bit flag.
  * As the associated ftrace_ret_stack saved for those fgraph_ops needs to
  * be found, the index to it is also added to the ret_stack along with the
  * index of the fgraph_array[] to each fgraph_ops that needs their retfunc
@@ -42,61 +36,59 @@ 
  * to the last ftrace_ret_stack saved. All references to the
  * ftrace_ret_stack has the format of:
  *
- * bits:  0 - 13	Index in words from the previous ftrace_ret_stack
- * bits: 14 - 15	Type of storage
+ * bits:  0 -  9	offset in words from the previous ftrace_ret_stack
+ *			(bitmap type should have FGRAPH_RET_INDEX always)
+ * bits: 10 - 11	Type of storage
  *			  0 - reserved
- *			  1 - fgraph_array index
- * For fgraph_array_index:
- *  bits: 16 - 23	The fgraph_ops fgraph_array index
+ *			  1 - bitmap of fgraph_array index
+ *
+ * For bitmap of fgraph_array index
+ *  bits: 12 - 27	The bitmap of fgraph_ops fgraph_array index
  *
  * That is, at the end of function_graph_enter, if the first and forth
  * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
  * on the return of the function being traced, this is what will be on the
  * task's shadow ret_stack: (the stack grows upward)
  *
- * |                                  | <- task->curr_ret_stack
- * +----------------------------------+
- * | (3 << FGRAPH_ARRAY_SHIFT)|(2)    | ( 3 for index of fourth fgraph_ops)
- * +----------------------------------+
- * | (0 << FGRAPH_ARRAY_SHIFT)|(1)    | ( 0 for index of first fgraph_ops)
- * +----------------------------------+
- * | struct ftrace_ret_stack          |
- * |   (stores the saved ret pointer) |
- * +----------------------------------+
- * |             (X) | (N)            | ( N words away from previous ret_stack)
- * |                                  |
+ * |                                            | <- task->curr_ret_stack
+ * +--------------------------------------------+
+ * | bitmap_type(bitmap:(BIT(3)|BIT(0)),        |
+ * |             offset:FGRAPH_RET_INDEX)       | <- the offset is from here
+ * +--------------------------------------------+
+ * | struct ftrace_ret_stack                    |
+ * |   (stores the saved ret pointer)           | <- the offset points here
+ * +--------------------------------------------+
+ * |                 (X) | (N)                  | ( N words away from
+ * |                                            |   previous ret_stack)
  *
  * If a backtrace is required, and the real return pointer needs to be
  * fetched, then it looks at the task's curr_ret_stack index, if it
- * is greater than zero, it would subtact one, and then mask the value
- * on the ret_stack by FGRAPH_RET_INDEX_MASK and subtract FGRAPH_RET_INDEX
- * from that, to get the index of the ftrace_ret_stack structure stored
- * on the shadow stack.
+ * is greater than zero (reserved, or right before popped), it would mask
+ * the value by FGRAPH_RET_INDEX_MASK to get the offset index of the
+ * ftrace_ret_stack structure stored on the shadow stack.
  */
 
-#define FGRAPH_RET_INDEX_SIZE	14
-#define FGRAPH_RET_INDEX_MASK	((1 << FGRAPH_RET_INDEX_SIZE) - 1)
-
+#define FGRAPH_RET_INDEX_SIZE	10
+#define FGRAPH_RET_INDEX_MASK	GENMASK(FGRAPH_RET_INDEX_SIZE - 1, 0)
 
 #define FGRAPH_TYPE_SIZE	2
-#define FGRAPH_TYPE_MASK	((1 << FGRAPH_TYPE_SIZE) - 1)
+#define FGRAPH_TYPE_MASK	GENMASK(FGRAPH_TYPE_SIZE - 1, 0)
 #define FGRAPH_TYPE_SHIFT	FGRAPH_RET_INDEX_SIZE
 
 enum {
 	FGRAPH_TYPE_RESERVED	= 0,
-	FGRAPH_TYPE_ARRAY	= 1,
+	FGRAPH_TYPE_BITMAP	= 1,
 };
 
-#define FGRAPH_ARRAY_SIZE	16
-#define FGRAPH_ARRAY_MASK	((1 << FGRAPH_ARRAY_SIZE) - 1)
-#define FGRAPH_ARRAY_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)
+#define FGRAPH_INDEX_SIZE	16
+#define FGRAPH_INDEX_MASK	GENMASK(FGRAPH_INDEX_SIZE - 1, 0)
+#define FGRAPH_INDEX_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)
 
 /* Currently the max stack index can't be more than register callers */
-#define FGRAPH_MAX_INDEX	FGRAPH_ARRAY_SIZE
+#define FGRAPH_MAX_INDEX	(FGRAPH_INDEX_SIZE + FGRAPH_RET_INDEX)
+
+#define FGRAPH_ARRAY_SIZE	FGRAPH_INDEX_SIZE
 
-#define FGRAPH_FRAME_SIZE (FGRAPH_RET_SIZE + FGRAPH_ARRAY_SIZE * (sizeof(long)))
-#define FGRAPH_FRAME_INDEX (ALIGN(FGRAPH_FRAME_SIZE,		\
-				  sizeof(long)) / sizeof(long))
 #define SHADOW_STACK_SIZE (PAGE_SIZE)
 #define SHADOW_STACK_INDEX (SHADOW_STACK_SIZE / sizeof(long))
 /* Leave on a buffer at the end */
@@ -113,19 +105,36 @@  static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
 
 static inline int get_ret_stack_index(struct task_struct *t, int offset)
 {
-	return current->ret_stack[offset] & FGRAPH_RET_INDEX_MASK;
+	return t->ret_stack[offset] & FGRAPH_RET_INDEX_MASK;
 }
 
 static inline int get_fgraph_type(struct task_struct *t, int offset)
 {
-	return (current->ret_stack[offset] >> FGRAPH_TYPE_SHIFT) &
-		FGRAPH_TYPE_MASK;
+	return (t->ret_stack[offset] >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
+}
+
+static inline unsigned long
+get_fgraph_index_bitmap(struct task_struct *t, int offset)
+{
+	return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
 }
 
-static inline int get_fgraph_array(struct task_struct *t, int offset)
+static inline void
+set_fgraph_index_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
 {
-	return (current->ret_stack[offset] >> FGRAPH_ARRAY_SHIFT) &
-		FGRAPH_ARRAY_MASK;
+	t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
+		(FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
+}
+
+static inline bool is_fgraph_index_set(struct task_struct *t, int offset, int idx)
+{
+	return !!(get_fgraph_index_bitmap(t, offset) & BIT(idx));
+}
+
+static inline void
+add_fgraph_index_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
+{
+	t->ret_stack[offset] |= (bitmap << FGRAPH_INDEX_SHIFT);
 }
 
 /* ftrace_graph_entry set to this to tell some archs to run function graph */
@@ -160,17 +169,14 @@  get_ret_stack(struct task_struct *t, int offset, int *index)
 
 	BUILD_BUG_ON(FGRAPH_RET_SIZE % sizeof(long));
 
-	if (offset <= 0)
+	if (unlikely(offset <= 0))
 		return NULL;
 
-	idx = get_ret_stack_index(t, offset - 1);
-
-	if (idx <= 0 || idx > FGRAPH_MAX_INDEX)
+	idx = get_ret_stack_index(t, --offset);
+	if (WARN_ON_ONCE(idx <= 0 || idx > offset))
 		return NULL;
 
-	offset -= idx + FGRAPH_RET_INDEX;
-	if (offset < 0)
-		return NULL;
+	offset -= idx;
 
 	*index = offset;
 	return RET_STACK(t, offset);
@@ -231,10 +237,12 @@  void ftrace_graph_stop(void)
 /* Add a function return address to the trace stack on thread info.*/
 static int
 ftrace_push_return_trace(unsigned long ret, unsigned long func,
-			 unsigned long frame_pointer, unsigned long *retp)
+			 unsigned long frame_pointer, unsigned long *retp,
+			 int fgraph_idx)
 {
 	struct ftrace_ret_stack *ret_stack;
 	unsigned long long calltime;
+	unsigned long val;
 	int index;
 
 	if (unlikely(ftrace_graph_is_dead()))
@@ -243,6 +251,21 @@  ftrace_push_return_trace(unsigned long ret, unsigned long func,
 	if (!current->ret_stack)
 		return -EBUSY;
 
+	/*
+	 * First, check whether a previous fgraph callback has already pushed a
+	 * ret_stack for this same function entry.
+	 * However, if @func is a self tail-call function, we also need to make
+	 * sure the ret_stack is not left over from the previous call, by
+	 * checking whether the bit of @fgraph_idx is already set.
+	 */
+	ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
+	if (ret_stack && ret_stack->func == func &&
+	    get_fgraph_type(current, index + FGRAPH_RET_INDEX) == FGRAPH_TYPE_BITMAP &&
+	    !is_fgraph_index_set(current, index + FGRAPH_RET_INDEX, fgraph_idx))
+		return index + FGRAPH_RET_INDEX;
+
+	val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_RET_INDEX;
+
 	BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
 
 	/*
@@ -252,17 +275,19 @@  ftrace_push_return_trace(unsigned long ret, unsigned long func,
 	smp_rmb();
 
 	/* The return trace stack is full */
-	if (current->curr_ret_stack >= SHADOW_STACK_MAX_INDEX) {
+	if (current->curr_ret_stack + FGRAPH_RET_INDEX >= SHADOW_STACK_MAX_INDEX) {
 		atomic_inc(&current->trace_overrun);
 		return -EBUSY;
 	}
 
 	calltime = trace_clock_local();
 
-	index = current->curr_ret_stack;
-	/* ret offset = 1 ; type = reserved */
-	current->ret_stack[index + FGRAPH_RET_INDEX] = 1;
+	index = READ_ONCE(current->curr_ret_stack);
 	ret_stack = RET_STACK(current, index);
+	index += FGRAPH_RET_INDEX;
+
+	/* ret offset = FGRAPH_RET_INDEX ; type = reserved */
+	current->ret_stack[index] = val;
 	ret_stack->ret = ret;
 	/*
 	 * The unwinders expect curr_ret_stack to point to either zero
@@ -278,7 +303,7 @@  ftrace_push_return_trace(unsigned long ret, unsigned long func,
 	 * at least a correct index!
 	 */
 	barrier();
-	current->curr_ret_stack += FGRAPH_RET_INDEX + 1;
+	current->curr_ret_stack = index + 1;
 	/*
 	 * This next barrier is to ensure that an interrupt coming in
 	 * will not corrupt what we are about to write.
@@ -286,7 +311,7 @@  ftrace_push_return_trace(unsigned long ret, unsigned long func,
 	barrier();
 
 	/* Still keep it reserved even if an interrupt came in */
-	current->ret_stack[index + FGRAPH_RET_INDEX] = 1;
+	current->ret_stack[index] = val;
 
 	ret_stack->ret = ret;
 	ret_stack->func = func;
@@ -297,7 +322,7 @@  ftrace_push_return_trace(unsigned long ret, unsigned long func,
 #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 	ret_stack->retp = retp;
 #endif
-	return 0;
+	return index;
 }
 
 /*
@@ -314,15 +339,13 @@  ftrace_push_return_trace(unsigned long ret, unsigned long func,
 # define MCOUNT_INSN_SIZE 0
 #endif
 
+/* If the caller does not use ftrace, call this function. */
 int function_graph_enter(unsigned long ret, unsigned long func,
 			 unsigned long frame_pointer, unsigned long *retp)
 {
 	struct ftrace_graph_ent trace;
-	int offset;
-	int start;
-	int type;
-	int val;
-	int cnt = 0;
+	unsigned long bitmap = 0;
+	int index;
 	int i;
 
 #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
@@ -337,69 +360,33 @@  int function_graph_enter(unsigned long ret, unsigned long func,
 		return -EBUSY;
 #endif
 
-	if (!ftrace_ops_test(&global_ops, func, NULL))
-		return -EBUSY;
-
 	trace.func = func;
 	trace.depth = ++current->curr_ret_depth;
 
-	if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+	index = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
+	if (index < 0)
 		goto out;
 
-	/* Use start for the distance to ret_stack (skipping over reserve) */
-	start = offset = current->curr_ret_stack - 2;
-
 	for (i = 0; i < fgraph_array_cnt; i++) {
 		struct fgraph_ops *gops = fgraph_array[i];
 
 		if (gops == &fgraph_stub)
 			continue;
 
-		if ((offset == start) &&
-		    (current->curr_ret_stack >= SHADOW_STACK_INDEX - 1)) {
-			atomic_inc(&current->trace_overrun);
-			break;
-		}
-		if (fgraph_array[i]->entryfunc(&trace, fgraph_array[i])) {
-			offset = current->curr_ret_stack;
-			/* Check the top level stored word */
-			type = get_fgraph_type(current, offset - 1);
-
-			val = (i << FGRAPH_ARRAY_SHIFT) |
-				(FGRAPH_TYPE_ARRAY << FGRAPH_TYPE_SHIFT) |
-				((offset - start) - 1);
-
-			/* We can reuse the top word if it is reserved */
-			if (type == FGRAPH_TYPE_RESERVED) {
-				current->ret_stack[offset - 1] = val;
-				cnt++;
-				continue;
-			}
-			val++;
-
-			current->ret_stack[offset] = val;
-			/*
-			 * Write the value before we increment, so that
-			 * if an interrupt comes in after we increment
-			 * it will still see the value and skip over
-			 * this.
-			 */
-			barrier();
-			current->curr_ret_stack++;
-			/*
-			 * Have to write again, in case an interrupt
-			 * came in before the increment and after we
-			 * wrote the value.
-			 */
-			barrier();
-			current->ret_stack[offset] = val;
-			cnt++;
-		}
+		if (ftrace_ops_test(&gops->ops, func, NULL) &&
+		    gops->entryfunc(&trace, gops))
+			bitmap |= BIT(i);
 	}
 
-	if (!cnt)
+	if (!bitmap)
 		goto out_ret;
 
+	/*
+	 * Since this function uses fgraph_idx = 0 as a flag for the tail-call
+	 * check, always set that bit.
+	 */
+	set_fgraph_index_bitmap(current, index, bitmap | BIT(0));
+
 	return 0;
  out_ret:
 	current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
@@ -408,15 +395,51 @@  int function_graph_enter(unsigned long ret, unsigned long func,
 	return -EBUSY;
 }
 
+/* This is called from ftrace_graph_func() via ftrace */
+int function_graph_enter_ops(unsigned long ret, unsigned long func,
+			     unsigned long frame_pointer, unsigned long *retp,
+			     struct fgraph_ops *gops)
+{
+	struct ftrace_graph_ent trace;
+	int index;
+	int type;
+
+	/* Push the return trace and get the index of the type/bitmap word */
+	index = ftrace_push_return_trace(ret, func, frame_pointer, retp, gops->idx);
+	if (index < 0)
+		return index;
+	type = get_fgraph_type(current, index);
+
+	/* This is the first ret_stack for this fentry */
+	if (type == FGRAPH_TYPE_RESERVED)
+		++current->curr_ret_depth;
+
+	trace.func = func;
+	trace.depth = current->curr_ret_depth;
+	if (gops->entryfunc(&trace, gops)) {
+		if (type == FGRAPH_TYPE_RESERVED)
+			set_fgraph_index_bitmap(current, index, BIT(gops->idx));
+		else
+			add_fgraph_index_bitmap(current, index, BIT(gops->idx));
+		return 0;
+	}
+
+	if (type == FGRAPH_TYPE_RESERVED) {
+		current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
+		current->curr_ret_depth--;
+	}
+	return -EBUSY;
+}
+
 /* Retrieve a function return address to the trace stack on thread info.*/
 static struct ftrace_ret_stack *
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
-			unsigned long frame_pointer)
+			unsigned long frame_pointer, int *index)
 {
 	struct ftrace_ret_stack *ret_stack;
-	int index;
 
-	ret_stack = get_ret_stack(current, current->curr_ret_stack, &index);
+	ret_stack = get_ret_stack(current, current->curr_ret_stack, index);
 
 	if (unlikely(!ret_stack)) {
 		ftrace_graph_stop();
@@ -455,6 +478,7 @@  ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 	}
 #endif
 
+	*index += FGRAPH_RET_INDEX;
 	*ret = ret_stack->ret;
 	trace->func = ret_stack->func;
 	trace->calltime = ret_stack->calltime;
@@ -507,13 +531,12 @@  static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
 {
 	struct ftrace_ret_stack *ret_stack;
 	struct ftrace_graph_ret trace;
+	unsigned long bitmap;
 	unsigned long ret;
-	int offset;
 	int index;
-	int idx;
 	int i;
 
-	ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+	ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &index);
 
 	if (unlikely(!ret_stack)) {
 		ftrace_graph_stop();
@@ -527,16 +550,17 @@  static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
 	trace.retval = fgraph_ret_regs_return_value(ret_regs);
 #endif
 
-	offset = current->curr_ret_stack - 1;
-	index = get_ret_stack_index(current, offset);
+	bitmap = get_fgraph_index_bitmap(current, index);
+	for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+		struct fgraph_ops *gops = fgraph_array[i];
 
-	/* index has to be at least one! Optimize for it */
-	i = 0;
-	do {
-		idx = get_fgraph_array(current, offset - i);
-		fgraph_array[idx]->retfunc(&trace, fgraph_array[idx]);
-		i++;
-	} while (i < index);
+		if (!(bitmap & BIT(i)))
+			continue;
+		if (gops == &fgraph_stub)
+			continue;
+
+		gops->retfunc(&trace, gops);
+	}
 
 	/*
 	 * The ftrace_graph_return() may still access the current
@@ -544,7 +568,7 @@  static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
 	 * curr_ret_stack is after that.
 	 */
 	barrier();
-	current->curr_ret_stack -= index + FGRAPH_RET_INDEX;
+	current->curr_ret_stack -= FGRAPH_RET_INDEX + 1;
 	current->curr_ret_depth--;
 	return ret;
 }
@@ -622,7 +646,17 @@  unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 		ret_stack = get_ret_stack(current, i, &i);
 		if (!ret_stack)
 			break;
-		if (ret_stack->retp == retp)
+		/*
+		 * For a tail-call, there can be 2 or more ftrace_ret_stacks on
+		 * the ret_stack, all of which record "return_to_handler" as the
+		 * return address except for the last one.
+		 * But on the real stack there is only 1 entry, because a tail-call
+		 * reuses the return address on the stack and jumps to the next
+		 * function.
+		 * Thus, keep looking for the real return address.
+		 */
+		if (ret_stack->retp == retp &&
+		    ret_stack->ret !=
+		    (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
 			return ret_stack->ret;
 	}
 
@@ -645,6 +679,9 @@  unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 	i = *idx;
 	do {
 		ret_stack = get_ret_stack(task, task_idx, &task_idx);
+		if (ret_stack && ret_stack->ret ==
+		    (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+			continue;
 		i--;
 	} while (i >= 0 && ret_stack);
 
@@ -655,17 +692,25 @@  unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 }
 #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
 
-static struct ftrace_ops graph_ops = {
-	.func			= ftrace_graph_func,
-	.flags			= FTRACE_OPS_FL_INITIALIZED |
-				   FTRACE_OPS_FL_PID |
-				   FTRACE_OPS_GRAPH_STUB,
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+		     struct ftrace_ops *src_ops)
+{
+	dst_ops->func = ftrace_graph_func;
+	dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
+
 #ifdef FTRACE_GRAPH_TRAMP_ADDR
-	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
+	dst_ops->trampoline = FTRACE_GRAPH_TRAMP_ADDR;
 	/* trampoline_size is only needed for dynamically allocated tramps */
 #endif
-	ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
-};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	if (src_ops) {
+		dst_ops->func_hash = &src_ops->local_hash;
+		mutex_init(&dst_ops->local_hash.regex_lock);
+		dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+	}
+#endif
+}
 
 void ftrace_graph_sleep_time_control(bool enable)
 {
@@ -869,11 +914,20 @@  static int start_graph_tracing(void)
 
 int register_ftrace_graph(struct fgraph_ops *gops)
 {
+	int command = 0;
 	int ret = 0;
 	int i;
 
 	mutex_lock(&ftrace_lock);
 
+	if (!gops->ops.func) {
+		gops->ops.flags |= FTRACE_OPS_GRAPH_STUB;
+		gops->ops.func = ftrace_graph_func;
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+		gops->ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
+#endif
+	}
+
 	if (!fgraph_array[0]) {
 		/* The array must always have real data on it */
 		for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
@@ -893,6 +947,7 @@  int register_ftrace_graph(struct fgraph_ops *gops)
 	fgraph_array[i] = gops;
 	if (i + 1 > fgraph_array_cnt)
 		fgraph_array_cnt = i + 1;
+	gops->idx = i;
 
 	ftrace_graph_active++;
 
@@ -909,9 +964,10 @@  int register_ftrace_graph(struct fgraph_ops *gops)
 		 */
 		ftrace_graph_return = return_run;
 		ftrace_graph_entry = entry_run;
-
-		ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
+		command = FTRACE_START_FUNC_RET;
 	}
+
+	ret = ftrace_startup(&gops->ops, command);
 out:
 	mutex_unlock(&ftrace_lock);
 	return ret;
@@ -919,6 +975,7 @@  int register_ftrace_graph(struct fgraph_ops *gops)
 
 void unregister_ftrace_graph(struct fgraph_ops *gops)
 {
+	int command = 0;
 	int i;
 
 	mutex_lock(&ftrace_lock);
@@ -926,25 +983,29 @@  void unregister_ftrace_graph(struct fgraph_ops *gops)
 	if (unlikely(!ftrace_graph_active))
 		goto out;
 
-	for (i = 0; i < fgraph_array_cnt; i++)
-		if (gops == fgraph_array[i])
-			break;
-	if (i >= fgraph_array_cnt)
+	if (unlikely(gops->idx < 0 || gops->idx >= fgraph_array_cnt))
 		goto out;
 
-	fgraph_array[i] = &fgraph_stub;
-	if (i + 1 == fgraph_array_cnt) {
-		for (; i >= 0; i--)
-			if (fgraph_array[i] != &fgraph_stub)
-				break;
+	WARN_ON_ONCE(fgraph_array[gops->idx] != gops);
+
+	fgraph_array[gops->idx] = &fgraph_stub;
+	if (gops->idx + 1 == fgraph_array_cnt) {
+		i = gops->idx;
+		while (i >= 0 && fgraph_array[i] == &fgraph_stub)
+			i--;
 		fgraph_array_cnt = i + 1;
 	}
 
 	ftrace_graph_active--;
+
+	if (!ftrace_graph_active)
+		command = FTRACE_STOP_FUNC_RET;
+
+	ftrace_shutdown(&gops->ops, command);
+
 	if (!ftrace_graph_active) {
 		ftrace_graph_return = ftrace_stub_graph;
 		ftrace_graph_entry = ftrace_graph_entry_stub;
-		ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
 		unregister_pm_notifier(&ftrace_suspend_notifier);
 		unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
 	}
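
For reference, the word layout that get_ret_stack_index()/get_fgraph_type()/get_fgraph_index_bitmap() decode above can be exercised in isolation. Below is a minimal userspace sketch (not part of this patch) that mirrors the FGRAPH_* macros from this hunk; the value of FGRAPH_RET_INDEX is an assumption here, since in the kernel it is derived from the size of struct ftrace_ret_stack:

/* word_layout.c - standalone sketch of the shadow-stack word encoding */
#include <stdio.h>

#define BITS_PER_LONG		(8 * sizeof(unsigned long))
#define BIT(nr)			(1UL << (nr))
#define GENMASK(h, l) \
	(((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))

#define FGRAPH_RET_INDEX_SIZE	10
#define FGRAPH_RET_INDEX_MASK	GENMASK(FGRAPH_RET_INDEX_SIZE - 1, 0)
#define FGRAPH_TYPE_SIZE	2
#define FGRAPH_TYPE_MASK	GENMASK(FGRAPH_TYPE_SIZE - 1, 0)
#define FGRAPH_TYPE_SHIFT	FGRAPH_RET_INDEX_SIZE
#define FGRAPH_INDEX_SIZE	16
#define FGRAPH_INDEX_MASK	GENMASK(FGRAPH_INDEX_SIZE - 1, 0)
#define FGRAPH_INDEX_SHIFT	(FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_SIZE)

#define FGRAPH_TYPE_BITMAP	1
#define FGRAPH_RET_INDEX	4	/* assumed: longs per ftrace_ret_stack */

int main(void)
{
	/* fgraph users 0 and 3 accepted this function entry */
	unsigned long bitmap = BIT(0) | BIT(3);
	/* same packing as set_fgraph_index_bitmap() */
	unsigned long word = (bitmap << FGRAPH_INDEX_SHIFT) |
			     (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) |
			     FGRAPH_RET_INDEX;

	printf("ret index = %lu\n", word & FGRAPH_RET_INDEX_MASK);
	printf("type      = %lu\n", (word >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK);
	printf("bitmap    = 0x%lx\n", (word >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK);
	return 0;
}

Each of the 16 bitmap bits maps to one slot of fgraph_array[], which is what lets __ftrace_return_to_handler() call only the fgraph_ops whose entryfunc accepted this function entry.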
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83fbfb7b48f8..c4cc2a9d0047 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3050,6 +3050,8 @@  int ftrace_startup(struct ftrace_ops *ops, int command)
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
+	ftrace_ops_init(ops);
+
 	ret = __register_ftrace_function(ops);
 	if (ret)
 		return ret;
@@ -7319,7 +7321,7 @@  __init void ftrace_init_global_array_ops(struct trace_array *tr)
 	tr->ops = &global_ops;
 	tr->ops->private = tr;
 	ftrace_init_trace_array(tr);
-	init_array_fgraph_ops(tr);
+	init_array_fgraph_ops(tr, tr->ops);
 }
 
 void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -8051,7 +8053,7 @@  static int register_ftrace_function_nolock(struct ftrace_ops *ops)
  */
 int register_ftrace_function(struct ftrace_ops *ops)
 {
-	int ret;
+	int ret = -1;
 
 	lock_direct_mutex();
 	ret = prepare_direct_functions_for_ipmodify(ops);
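
Since registration now goes through ftrace_startup(&gops->ops, ...) (with the ftrace_ops_init() call added above), each fgraph user carries its own filter hash. A hypothetical module-style sketch of how a user might take advantage of that is shown below; it is not taken from this series, and the entry/return callback signatures and the use of ftrace_set_filter() on gops->ops are inferred from the code in this patch:

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/string.h>

/* Callbacks shaped after how function_graph_enter() invokes
 * gops->entryfunc(&trace, gops) and __ftrace_return_to_handler()
 * invokes gops->retfunc(&trace, gops). */
static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
{
	return 1;		/* non-zero: trace this function */
}

static void my_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
{
	/* consume trace->func, trace->rettime, ... */
}

static struct fgraph_ops my_gops = {
	.entryfunc	= my_entry,
	.retfunc	= my_return,
	/* .ops is left zeroed: register_ftrace_graph() fills in
	 * ftrace_graph_func and the GRAPH_STUB flag when .ops.func is NULL. */
};

static int __init my_init(void)
{
	/* Filter applies only to this instance; other fgraph users keep
	 * their own hashes. "kernel_clone" is just an example pattern. */
	ftrace_set_filter(&my_gops.ops, "kernel_clone",
			  strlen("kernel_clone"), 1);
	return register_ftrace_graph(&my_gops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_graph(&my_gops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");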
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 16948c0ed00a..02edfdb68933 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -890,8 +890,8 @@  extern int __trace_graph_entry(struct trace_array *tr,
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
 				 unsigned int trace_ctx);
-extern void init_array_fgraph_ops(struct trace_array *tr);
-extern int allocate_fgraph_ops(struct trace_array *tr);
+extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
 extern void free_fgraph_ops(struct trace_array *tr);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -974,6 +974,7 @@  static inline int ftrace_graph_notrace_addr(unsigned long addr)
 	preempt_enable_notrace();
 	return ret;
 }
+
 #else
 static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
 {
@@ -999,18 +1000,19 @@  static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
 		(fgraph_max_depth && trace->depth >= fgraph_max_depth);
 }
 
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+		     struct ftrace_ops *src_ops);
+
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
 	return TRACE_TYPE_UNHANDLED;
 }
-static inline void init_array_fgraph_ops(struct trace_array *tr) { }
-static inline int allocate_fgraph_ops(struct trace_array *tr)
-{
-	return 0;
-}
 static inline void free_fgraph_ops(struct trace_array *tr) { }
+/* ftrace_ops may not be defined */
+#define init_array_fgraph_ops(tr, ops) do { } while (0)
+#define allocate_fgraph_ops(tr, ops) ({ 0; })
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 extern struct list_head ftrace_pids;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e8da0d0ee52..13bf2415245d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -91,7 +91,7 @@  int ftrace_create_function_files(struct trace_array *tr,
 	if (!tr->ops)
 		return -EINVAL;
 
-	ret = allocate_fgraph_ops(tr);
+	ret = allocate_fgraph_ops(tr, tr->ops);
 	if (ret) {
 		kfree(tr->ops);
 		return ret;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9ccc904a7703..7f30652f0e97 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -288,7 +288,7 @@  static struct fgraph_ops funcgraph_ops = {
 	.retfunc = &trace_graph_return,
 };
 
-int allocate_fgraph_ops(struct trace_array *tr)
+int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
 {
 	struct fgraph_ops *gops;
 
@@ -301,6 +301,9 @@  int allocate_fgraph_ops(struct trace_array *tr)
 
 	tr->gops = gops;
 	gops->private = tr;
+
+	fgraph_init_ops(&gops->ops, ops);
+
 	return 0;
 }
 
@@ -309,10 +312,11 @@  void free_fgraph_ops(struct trace_array *tr)
 	kfree(tr->gops);
 }
 
-__init void init_array_fgraph_ops(struct trace_array *tr)
+__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
 {
 	tr->gops = &funcgraph_ops;
 	funcgraph_ops.private = tr;
+	fgraph_init_ops(&tr->gops->ops, ops);
 }
 
 static int graph_trace_init(struct trace_array *tr)