
[v5,28/34] fprobe: Rewrite fprobe on function-graph tracer

Message ID 170290542972.220107.9135357273431693988.stgit@devnote2 (mailing list archive)
State Not Applicable
Series tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat

Commit Message

Masami Hiramatsu (Google) Dec. 18, 2023, 1:17 p.m. UTC
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Rewrite the fprobe implementation on top of the function-graph tracer.
Major API changes are:
 -  The 'nr_maxactive' field is deprecated.
 -  This depends on CONFIG_DYNAMIC_FTRACE_WITH_ARGS or
    !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS, and
    CONFIG_HAVE_FUNCTION_GRAPH_FREGS. So it currently works only
    on x86_64.
 -  Currently the entry data size is limited to 15 * sizeof(long).
 -  If there are too many fprobe exit handlers set on the same
    function, registering the probe will fail.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v3:
  - Update for new reserve_data/retrieve_data API.
  - Fix internal push/pop on fgraph data logic so that it can
    correctly save/restore the returning fprobes.
 Changes in v2:
  - Add more lockdep_assert_held(fprobe_mutex)
  - Use READ_ONCE() and WRITE_ONCE() for fprobe_hlist_node::fp.
  - Add NOKPROBE_SYMBOL() for the functions which are called from
    the entry/exit callbacks.
---
 include/linux/fprobe.h |   54 +++-
 kernel/trace/Kconfig   |    8 -
 kernel/trace/fprobe.c  |  632 ++++++++++++++++++++++++++++++++++--------------
 lib/test_fprobe.c      |   45 ---
 4 files changed, 494 insertions(+), 245 deletions(-)
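
For a quick picture of the resulting API, here is a minimal usage sketch based
on the handler signatures and the register_fprobe() prototype in this patch.
The probed symbol "vfs_read", the my_* names and the jiffies-based timing are
illustrative assumptions only, not part of the patch:

/*
 * Minimal sketch of the rewritten fprobe API (no rethook, no nr_maxactive).
 * All my_* identifiers and the probed symbol are examples only.
 */
#include <linux/fprobe.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/printk.h>

struct my_data {
	unsigned long entry_jiffies;	/* entry data is capped at 15 * sizeof(long) */
};

static int my_entry(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
		    struct ftrace_regs *fregs, void *data)
{
	struct my_data *d = data;	/* non-NULL: exit_handler and entry_data_size are set */

	d->entry_jiffies = jiffies;
	return 0;			/* returning !0 skips the exit handler */
}

static void my_exit(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
		    struct ftrace_regs *fregs, void *data)
{
	struct my_data *d = data;

	pr_info("entry_ip=%lx took %lu jiffies\n", ip, jiffies - d->entry_jiffies);
}

static struct fprobe my_fprobe = {
	.entry_handler   = my_entry,
	.exit_handler    = my_exit,
	.entry_data_size = sizeof(struct my_data),
};

static int __init my_probe_init(void)
{
	/* glob filter, no notfilter */
	return register_fprobe(&my_fprobe, "vfs_read", NULL);
}
module_init(my_probe_init);

static void __exit my_probe_exit(void)
{
	unregister_fprobe(&my_fprobe);
}
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");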

Comments

Jiri Olsa Dec. 19, 2023, 2:39 p.m. UTC | #1
On Mon, Dec 18, 2023 at 10:17:10PM +0900, Masami Hiramatsu (Google) wrote:

SNIP

> -static void fprobe_exit_handler(struct rethook_node *rh, void *data,
> -				unsigned long ret_ip, struct pt_regs *regs)
> +static int fprobe_entry(unsigned long func, unsigned long ret_ip,
> +			struct ftrace_regs *fregs, struct fgraph_ops *gops)
>  {
> -	struct fprobe *fp = (struct fprobe *)data;
> -	struct fprobe_rethook_node *fpr;
> -	struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
> -	int bit;
> +	struct fprobe_hlist_node *node, *first;
> +	unsigned long *fgraph_data = NULL;
> +	unsigned long header;
> +	int reserved_words;
> +	struct fprobe *fp;
> +	int used, ret;
>  
> -	if (!fp || fprobe_disabled(fp))
> -		return;
> +	if (WARN_ON_ONCE(!fregs))
> +		return 0;
>  
> -	fpr = container_of(rh, struct fprobe_rethook_node, node);
> +	first = node = find_first_fprobe_node(func);
> +	if (unlikely(!first))
> +		return 0;
> +
> +	reserved_words = 0;
> +	hlist_for_each_entry_from_rcu(node, hlist) {
> +		if (node->addr != func)
> +			break;
> +		fp = READ_ONCE(node->fp);
> +		if (!fp || !fp->exit_handler)
> +			continue;
> +		/*
> +		 * Since fprobe can be enabled until the next loop, we ignore the
> +		 * fprobe's disabled flag in this loop.
> +		 */
> +		reserved_words +=
> +			DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
> +	}
> +	node = first;
> +	if (reserved_words) {
> +		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
> +		if (unlikely(!fgraph_data)) {
> +			hlist_for_each_entry_from_rcu(node, hlist) {
> +				if (node->addr != func)
> +					break;
> +				fp = READ_ONCE(node->fp);
> +				if (fp && !fprobe_disabled(fp))
> +					fp->nmissed++;
> +			}
> +			return 0;
> +		}
> +	}

this looks expensive compared to what we do now.. IIUC due to the graph
ops limitation (16 active ops), you have just a single graph ops for fprobe
and each fprobe registration stores ips into a hash which you need to search
in here to get the registered callbacks..?

I wonder, would it make sense to allow an arbitrary number of active graph_ops,
at the price that some callbacks might fail because there's no stack space, so
each fprobe instance would have its own graph_ops.. and we would get rid
of the code above (and below) ?

jirka

>  
>  	/*
> -	 * we need to assure no calls to traceable functions in-between the
> -	 * end of fprobe_handler and the beginning of fprobe_exit_handler.
> +	 * TODO: recursion detection has been done in the fgraph. Thus we need
> +	 * to add a callback to increment missed counter.
>  	 */
> -	bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
> -	if (bit < 0) {
> -		fp->nmissed++;
> +	used = 0;
> +	hlist_for_each_entry_from_rcu(node, hlist) {
> +		void *data;
> +
> +		if (node->addr != func)
> +			break;
> +		fp = READ_ONCE(node->fp);
> +		if (!fp || fprobe_disabled(fp))
> +			continue;
> +
> +		if (fp->entry_data_size && fp->exit_handler)
> +			data = fgraph_data + used + 1;
> +		else
> +			data = NULL;
> +
> +		if (fprobe_shared_with_kprobes(fp))
> +			ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
> +		else
> +			ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
> +		/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
> +		if (!ret && fp->exit_handler) {
> +			int size_words = DIV_ROUND_UP(fp->entry_data_size, sizeof(long));
> +
> +			header = encode_fprobe_header(fp, size_words);
> +			if (likely(header)) {
> +				fgraph_data[used] = header;
> +				used += size_words + 1;
> +			}
> +		}
> +	}
> +	if (used < reserved_words)
> +		memset(fgraph_data + used, 0, reserved_words - used);
> +
> +	/* If any exit_handler is set, data must be used. */
> +	return used != 0;
> +}
> +NOKPROBE_SYMBOL(fprobe_entry);
> +
> +static void fprobe_return(unsigned long func, unsigned long ret_ip,
> +			  struct ftrace_regs *fregs, struct fgraph_ops *gops)
> +{
> +	unsigned long *fgraph_data = NULL;
> +	unsigned long val;
> +	struct fprobe *fp;
> +	int size, curr;
> +	int size_words;
> +
> +	fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
> +	if (!fgraph_data)
> +		return;
> +	size_words = DIV_ROUND_UP(size, sizeof(long));
> +
> +	preempt_disable();
> +
> +	curr = 0;
> +	while (size_words > curr) {
> +		val = fgraph_data[curr++];
> +		if (!val)
> +			break;
> +
> +		size = decode_fprobe_header(val, &fp);
> +		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
> +			if (WARN_ON_ONCE(curr + size > size_words))
> +				break;
> +			fp->exit_handler(fp, func, ret_ip, fregs,
> +					 size ? fgraph_data + curr : NULL);
> +		}
> +		curr += size + 1;
> +	}
> +	preempt_enable();
> +}
> +NOKPROBE_SYMBOL(fprobe_return);
> +
> +static struct fgraph_ops fprobe_graph_ops = {
> +	.entryregfunc	= fprobe_entry,
> +	.retregfunc	= fprobe_return,
> +};
> +static int fprobe_graph_active;
> +
> +/* Add @addrs to the ftrace filter and register fgraph if needed. */
> +static int fprobe_graph_add_ips(unsigned long *addrs, int num)
> +{
> +	int ret;
> +
> +	lockdep_assert_held(&fprobe_mutex);
> +
> +	ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
> +	if (ret)
> +		return ret;
> +
> +	if (!fprobe_graph_active) {
> +		ret = register_ftrace_graph(&fprobe_graph_ops);
> +		if (WARN_ON_ONCE(ret)) {
> +			ftrace_free_filter(&fprobe_graph_ops.ops);
> +			return ret;
> +		}
> +	}
> +	fprobe_graph_active++;
> +	return 0;
> +}
> +
> +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
> +static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
> +{
> +	lockdep_assert_held(&fprobe_mutex);
> +
> +	fprobe_graph_active--;
> +	if (!fprobe_graph_active) {
> +		/* Q: should we unregister it ? */
> +		unregister_ftrace_graph(&fprobe_graph_ops);
>  		return;
>  	}
>  
> -	fp->exit_handler(fp, fpr->entry_ip, ret_ip, fregs,
> -			 fp->entry_data_size ? (void *)fpr->data : NULL);
> -	ftrace_test_recursion_unlock(bit);
> +	ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
>  }
> -NOKPROBE_SYMBOL(fprobe_exit_handler);
>  
>  static int symbols_cmp(const void *a, const void *b)
>  {
> @@ -176,55 +406,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num)
>  	return ERR_PTR(-ENOENT);
>  }
>  
> -static void fprobe_init(struct fprobe *fp)
> -{
> -	fp->nmissed = 0;
> -	if (fprobe_shared_with_kprobes(fp))
> -		fp->ops.func = fprobe_kprobe_handler;
> -	else
> -		fp->ops.func = fprobe_handler;
> -	fp->ops.flags |= FTRACE_OPS_FL_SAVE_ARGS;
> -}
> +struct filter_match_data {
> +	const char *filter;
> +	const char *notfilter;
> +	size_t index;
> +	size_t size;
> +	unsigned long *addrs;
> +};
>  
> -static int fprobe_init_rethook(struct fprobe *fp, int num)
> +static int filter_match_callback(void *data, const char *name, unsigned long addr)
>  {
> -	int size;
> +	struct filter_match_data *match = data;
>  
> -	if (num <= 0)
> -		return -EINVAL;
> +	if (!glob_match(match->filter, name) ||
> +	    (match->notfilter && glob_match(match->notfilter, name)))
> +		return 0;
>  
> -	if (!fp->exit_handler) {
> -		fp->rethook = NULL;
> +	if (!ftrace_location(addr))
>  		return 0;
> -	}
>  
> -	/* Initialize rethook if needed */
> -	if (fp->nr_maxactive)
> -		size = fp->nr_maxactive;
> -	else
> -		size = num * num_possible_cpus() * 2;
> -	if (size <= 0)
> -		return -EINVAL;
> +	if (match->addrs)
> +		match->addrs[match->index] = addr;
>  
> -	/* Initialize rethook */
> -	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
> -				sizeof(struct fprobe_rethook_node), size);
> -	if (IS_ERR(fp->rethook))
> -		return PTR_ERR(fp->rethook);
> +	match->index++;
> +	return match->index == match->size;
> +}
>  
> -	return 0;
> +/*
> + * Make IP list from the filter/no-filter glob patterns.
> + * Return the number of matched symbols, or -ENOENT.
> + */
> +static int ip_list_from_filter(const char *filter, const char *notfilter,
> +			       unsigned long *addrs, size_t size)
> +{
> +	struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
> +		.index = 0, .size = size, .addrs = addrs};
> +	int ret;
> +
> +	ret = kallsyms_on_each_symbol(filter_match_callback, &match);
> +	if (ret < 0)
> +		return ret;
> +	ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
> +	if (ret < 0)
> +		return ret;
> +
> +	return match.index ?: -ENOENT;
>  }
>  
>  static void fprobe_fail_cleanup(struct fprobe *fp)
>  {
> -	if (!IS_ERR_OR_NULL(fp->rethook)) {
> -		/* Don't need to cleanup rethook->handler because this is not used. */
> -		rethook_free(fp->rethook);
> -		fp->rethook = NULL;
> +	kfree(fp->hlist_array);
> +	fp->hlist_array = NULL;
> +}
> +
> +/* Initialize the fprobe data structure. */
> +static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
> +{
> +	struct fprobe_hlist *hlist_array;
> +	unsigned long addr;
> +	int size, i;
> +
> +	if (!fp || !addrs || num <= 0)
> +		return -EINVAL;
> +
> +	size = ALIGN(fp->entry_data_size, sizeof(long));
> +	if (size > MAX_FPROBE_DATA_SIZE)
> +		return -E2BIG;
> +	fp->entry_data_size = size;
> +
> +	hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
> +	if (!hlist_array)
> +		return -ENOMEM;
> +
> +	fp->nmissed = 0;
> +
> +	hlist_array->size = num;
> +	fp->hlist_array = hlist_array;
> +	hlist_array->fp = fp;
> +	for (i = 0; i < num; i++) {
> +		hlist_array->array[i].fp = fp;
> +		addr = ftrace_location(addrs[i]);
> +		if (!addr) {
> +			fprobe_fail_cleanup(fp);
> +			return -ENOENT;
> +		}
> +		hlist_array->array[i].addr = addr;
>  	}
> -	ftrace_free_filter(&fp->ops);
> +	return 0;
>  }
>  
> +#define FPROBE_IPS_MAX	INT_MAX
> +
>  /**
>   * register_fprobe() - Register fprobe to ftrace by pattern.
>   * @fp: A fprobe data structure to be registered.
> @@ -238,46 +510,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp)
>   */
>  int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
>  {
> -	struct ftrace_hash *hash;
> -	unsigned char *str;
> -	int ret, len;
> +	unsigned long *addrs;
> +	int ret;
>  
>  	if (!fp || !filter)
>  		return -EINVAL;
>  
> -	fprobe_init(fp);
> -
> -	len = strlen(filter);
> -	str = kstrdup(filter, GFP_KERNEL);
> -	ret = ftrace_set_filter(&fp->ops, str, len, 0);
> -	kfree(str);
> -	if (ret)
> +	ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
> +	if (ret < 0)
>  		return ret;
>  
> -	if (notfilter) {
> -		len = strlen(notfilter);
> -		str = kstrdup(notfilter, GFP_KERNEL);
> -		ret = ftrace_set_notrace(&fp->ops, str, len, 0);
> -		kfree(str);
> -		if (ret)
> -			goto out;
> -	}
> -
> -	/* TODO:
> -	 * correctly calculate the total number of filtered symbols
> -	 * from both filter and notfilter.
> -	 */
> -	hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
> -	if (WARN_ON_ONCE(!hash))
> -		goto out;
> -
> -	ret = fprobe_init_rethook(fp, (int)hash->count);
> -	if (!ret)
> -		ret = register_ftrace_function(&fp->ops);
> +	addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
> +	if (!addrs)
> +		return -ENOMEM;
> +	ret = ip_list_from_filter(filter, notfilter, addrs, ret);
> +	if (ret > 0)
> +		ret = register_fprobe_ips(fp, addrs, ret);
>  
> -out:
> -	if (ret)
> -		fprobe_fail_cleanup(fp);
> +	kfree(addrs);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(register_fprobe);
> @@ -285,7 +535,7 @@ EXPORT_SYMBOL_GPL(register_fprobe);
>  /**
>   * register_fprobe_ips() - Register fprobe to ftrace by address.
>   * @fp: A fprobe data structure to be registered.
> - * @addrs: An array of target ftrace location addresses.
> + * @addrs: An array of target function address.
>   * @num: The number of entries of @addrs.
>   *
>   * Register @fp to ftrace for enabling the probe on the address given by @addrs.
> @@ -297,23 +547,27 @@ EXPORT_SYMBOL_GPL(register_fprobe);
>   */
>  int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
>  {
> -	int ret;
> -
> -	if (!fp || !addrs || num <= 0)
> -		return -EINVAL;
> -
> -	fprobe_init(fp);
> +	struct fprobe_hlist *hlist_array;
> +	int ret, i;
>  
> -	ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
> +	ret = fprobe_init(fp, addrs, num);
>  	if (ret)
>  		return ret;
>  
> -	ret = fprobe_init_rethook(fp, num);
> -	if (!ret)
> -		ret = register_ftrace_function(&fp->ops);
> +	mutex_lock(&fprobe_mutex);
> +
> +	hlist_array = fp->hlist_array;
> +	ret = fprobe_graph_add_ips(addrs, num);
> +	if (!ret) {
> +		add_fprobe_hash(fp);
> +		for (i = 0; i < hlist_array->size; i++)
> +			insert_fprobe_node(&hlist_array->array[i]);
> +	}
> +	mutex_unlock(&fprobe_mutex);
>  
>  	if (ret)
>  		fprobe_fail_cleanup(fp);
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(register_fprobe_ips);
> @@ -351,14 +605,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms);
>  
>  bool fprobe_is_registered(struct fprobe *fp)
>  {
> -	if (!fp || (fp->ops.saved_func != fprobe_handler &&
> -		    fp->ops.saved_func != fprobe_kprobe_handler))
> +	if (!fp || !fp->hlist_array)
>  		return false;
>  	return true;
>  }
>  
>  /**
> - * unregister_fprobe() - Unregister fprobe from ftrace
> + * unregister_fprobe() - Unregister fprobe.
>   * @fp: A fprobe data structure to be unregistered.
>   *
>   * Unregister fprobe (and remove ftrace hooks from the function entries).
> @@ -367,23 +620,40 @@ bool fprobe_is_registered(struct fprobe *fp)
>   */
>  int unregister_fprobe(struct fprobe *fp)
>  {
> -	int ret;
> +	struct fprobe_hlist *hlist_array;
> +	unsigned long *addrs = NULL;
> +	int ret = 0, i, count;
>  
> -	if (!fprobe_is_registered(fp))
> -		return -EINVAL;
> +	mutex_lock(&fprobe_mutex);
> +	if (!fp || !is_fprobe_still_exist(fp)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
>  
> -	if (!IS_ERR_OR_NULL(fp->rethook))
> -		rethook_stop(fp->rethook);
> +	hlist_array = fp->hlist_array;
> +	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
> +	if (!addrs) {
> +		ret = -ENOMEM;	/* TODO: Fallback to one-by-one loop */
> +		goto out;
> +	}
>  
> -	ret = unregister_ftrace_function(&fp->ops);
> -	if (ret < 0)
> -		return ret;
> +	/* Remove non-synonim ips from table and hash */
> +	count = 0;
> +	for (i = 0; i < hlist_array->size; i++) {
> +		if (!delete_fprobe_node(&hlist_array->array[i]))
> +			addrs[count++] = hlist_array->array[i].addr;
> +	}
> +	del_fprobe_hash(fp);
>  
> -	if (!IS_ERR_OR_NULL(fp->rethook))
> -		rethook_free(fp->rethook);
> +	fprobe_graph_remove_ips(addrs, count);
>  
> -	ftrace_free_filter(&fp->ops);
> +	kfree_rcu(hlist_array, rcu);
> +	fp->hlist_array = NULL;
>  
> +out:
> +	mutex_unlock(&fprobe_mutex);
> +
> +	kfree(addrs);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(unregister_fprobe);
> diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
> index 271ce0caeec0..cf92111b5c79 100644
> --- a/lib/test_fprobe.c
> +++ b/lib/test_fprobe.c
> @@ -17,10 +17,8 @@ static u32 rand1, entry_val, exit_val;
>  /* Use indirect calls to avoid inlining the target functions */
>  static u32 (*target)(u32 value);
>  static u32 (*target2)(u32 value);
> -static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
>  static unsigned long target_ip;
>  static unsigned long target2_ip;
> -static unsigned long target_nest_ip;
>  static int entry_return_value;
>  
>  static noinline u32 fprobe_selftest_target(u32 value)
> @@ -33,11 +31,6 @@ static noinline u32 fprobe_selftest_target2(u32 value)
>  	return (value / div_factor) + 1;
>  }
>  
> -static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
> -{
> -	return nest(value + 2);
> -}
> -
>  static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
>  				    unsigned long ret_ip,
>  				    struct ftrace_regs *fregs, void *data)
> @@ -79,22 +72,6 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
>  		KUNIT_EXPECT_NULL(current_test, data);
>  }
>  
> -static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
> -				      unsigned long ret_ip,
> -				      struct ftrace_regs *fregs, void *data)
> -{
> -	KUNIT_EXPECT_FALSE(current_test, preemptible());
> -	return 0;
> -}
> -
> -static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
> -				      unsigned long ret_ip,
> -				      struct ftrace_regs *fregs, void *data)
> -{
> -	KUNIT_EXPECT_FALSE(current_test, preemptible());
> -	KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
> -}
> -
>  /* Test entry only (no rethook) */
>  static void test_fprobe_entry(struct kunit *test)
>  {
> @@ -191,25 +168,6 @@ static void test_fprobe_data(struct kunit *test)
>  	KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
>  }
>  
> -/* Test nr_maxactive */
> -static void test_fprobe_nest(struct kunit *test)
> -{
> -	static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
> -	struct fprobe fp = {
> -		.entry_handler = nest_entry_handler,
> -		.exit_handler = nest_exit_handler,
> -		.nr_maxactive = 1,
> -	};
> -
> -	current_test = test;
> -	KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
> -
> -	target_nest(rand1, target);
> -	KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
> -
> -	KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
> -}
> -
>  static void test_fprobe_skip(struct kunit *test)
>  {
>  	struct fprobe fp = {
> @@ -247,10 +205,8 @@ static int fprobe_test_init(struct kunit *test)
>  	rand1 = get_random_u32_above(div_factor);
>  	target = fprobe_selftest_target;
>  	target2 = fprobe_selftest_target2;
> -	target_nest = fprobe_selftest_nest_target;
>  	target_ip = get_ftrace_location(target);
>  	target2_ip = get_ftrace_location(target2);
> -	target_nest_ip = get_ftrace_location(target_nest);
>  
>  	return 0;
>  }
> @@ -260,7 +216,6 @@ static struct kunit_case fprobe_testcases[] = {
>  	KUNIT_CASE(test_fprobe),
>  	KUNIT_CASE(test_fprobe_syms),
>  	KUNIT_CASE(test_fprobe_data),
> -	KUNIT_CASE(test_fprobe_nest),
>  	KUNIT_CASE(test_fprobe_skip),
>  	{}
>  };
>
Masami Hiramatsu (Google) Dec. 20, 2023, 1 a.m. UTC | #2
On Tue, 19 Dec 2023 15:39:03 +0100
Jiri Olsa <olsajiri@gmail.com> wrote:

> On Mon, Dec 18, 2023 at 10:17:10PM +0900, Masami Hiramatsu (Google) wrote:
> 
> SNIP
> 
> > -static void fprobe_exit_handler(struct rethook_node *rh, void *data,
> > -				unsigned long ret_ip, struct pt_regs *regs)
> > +static int fprobe_entry(unsigned long func, unsigned long ret_ip,
> > +			struct ftrace_regs *fregs, struct fgraph_ops *gops)
> >  {
> > -	struct fprobe *fp = (struct fprobe *)data;
> > -	struct fprobe_rethook_node *fpr;
> > -	struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
> > -	int bit;
> > +	struct fprobe_hlist_node *node, *first;
> > +	unsigned long *fgraph_data = NULL;
> > +	unsigned long header;
> > +	int reserved_words;
> > +	struct fprobe *fp;
> > +	int used, ret;
> >  
> > -	if (!fp || fprobe_disabled(fp))
> > -		return;
> > +	if (WARN_ON_ONCE(!fregs))
> > +		return 0;
> >  
> > -	fpr = container_of(rh, struct fprobe_rethook_node, node);
> > +	first = node = find_first_fprobe_node(func);
> > +	if (unlikely(!first))
> > +		return 0;
> > +
> > +	reserved_words = 0;
> > +	hlist_for_each_entry_from_rcu(node, hlist) {
> > +		if (node->addr != func)
> > +			break;
> > +		fp = READ_ONCE(node->fp);
> > +		if (!fp || !fp->exit_handler)
> > +			continue;
> > +		/*
> > +		 * Since fprobe can be enabled until the next loop, we ignore the
> > +		 * fprobe's disabled flag in this loop.
> > +		 */
> > +		reserved_words +=
> > +			DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
> > +	}
> > +	node = first;
> > +	if (reserved_words) {
> > +		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
> > +		if (unlikely(!fgraph_data)) {
> > +			hlist_for_each_entry_from_rcu(node, hlist) {
> > +				if (node->addr != func)
> > +					break;
> > +				fp = READ_ONCE(node->fp);
> > +				if (fp && !fprobe_disabled(fp))
> > +					fp->nmissed++;
> > +			}
> > +			return 0;
> > +		}
> > +	}
> 
> this looks expensive compared to what we do now.. IIUC due to the graph
> ops limitations (16 ctive ops), you have just single graph ops for fprobe
> and each fprobe registration stores ips into hash which you need to search
> in here to get registered callbacks..?

I think this is not so expensive. In most cases, it only hits 1 fprobe in the
hash. And if the fprobe is only used to hook the entry, reserved_words == 0.

> I wonder would it make sense to allow arbitrary number of active graph_ops
> with the price some callback might fail because there's no stack space so
> each fprobe instance would have its own graph_ops.. and we would get rid
> of the code above (and below) ?

Yeah, actually my first implementation was exactly that. But I realized it
doesn't work: this requires an intermediate object with a refcounter, because
the "marker" on the shadow stack will be left behind after unregistering it. We
need to identify which is still available and which is not. And for that
purpose, we may need to introduce a similar structure in the fgraph too.

The current multi-fgraph does:

 - if CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=n (called from the dedicated mcount
   asm code), it has to loop over all fgraph_ops and check the hash, which is
   inefficient but it can easily push the return trace entry on the shadow
   stack.

 - if CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS=y (called from the ftrace asm code),
   it does not need to loop (that is done by ftrace) but each handler
   does NOT know who pushed the return trace entry on the shadow stack.
   Thus it has to decode the shadow stack and check whether it needs to push
   a return trace entry or not. And this is hard if the traced function is a
   self-recursive call or a tail call. To detect the recursive call, I
   introduced a bitmap entry on the shadow stack. The bitmap size limits the
   max number of fgraph ops.
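
For illustration, a purely conceptual sketch of such a bitmap entry on the
shadow stack (all identifiers below are hypothetical and do not correspond to
the actual fgraph code):

/* Conceptual sketch only; the bitmap width bounds the number of fgraph_ops. */
#include <linux/types.h>

#define FGRAPH_MAX_OPS	16	/* one bit per registered fgraph_ops */

struct shadow_ret_entry {
	unsigned long	ret_addr;	/* original return address of the traced function */
	unsigned long	bitmap;		/* which fgraph_ops want the return callback */
};

/* Entry side: the idx-th fgraph_ops marks that it wants the return callback. */
static inline void fgraph_mark_return(struct shadow_ret_entry *e, int idx)
{
	e->bitmap |= 1UL << idx;
}

/* Return side: call back only the ops whose bit is set. */
static inline bool fgraph_wants_return(struct shadow_ret_entry *e, int idx)
{
	return e->bitmap & (1UL << idx);
}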

So, to remove the limit on the number of fgraph ops, we may need to push each
fgraph's own number on the shadow stack, and each fgraph callback has to unwind
the shadow stack to check whether its own number is there instead of checking
the bit in the bitmap. That would be a more trusted way, but maybe slow.

Another option is introducing a pair of pre- and post-callbacks which are called
before and after the list/direct call of ftrace_ops. The pre-callback would push
the ret_stack on the shadow stack and the post-callback would commit or cancel
it. (But this one is hard to design... it may become an ugly interface.)

Thank you,

Patch

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 879a30956009..08b37b0d1d05 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -5,32 +5,56 @@ 
 
 #include <linux/compiler.h>
 #include <linux/ftrace.h>
-#include <linux/rethook.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+
+struct fprobe;
+
+/**
+ * struct fprobe_hlist_node - address based hash list node for fprobe.
+ *
+ * @hlist: The hlist node for address search hash table.
+ * @addr: The address represented by this.
+ * @fp: The fprobe which owns this.
+ */
+struct fprobe_hlist_node {
+	struct hlist_node	hlist;
+	unsigned long		addr;
+	struct fprobe		*fp;
+};
+
+/**
+ * struct fprobe_hlist - hash list nodes for fprobe.
+ *
+ * @hlist: The hlist node for existence checking hash table.
+ * @rcu: rcu_head for RCU deferred release.
+ * @fp: The fprobe which owns this fprobe_hlist.
+ * @size: The size of @array.
+ * @array: The fprobe_hlist_node for each address to probe.
+ */
+struct fprobe_hlist {
+	struct hlist_node		hlist;
+	struct rcu_head			rcu;
+	struct fprobe			*fp;
+	int				size;
+	struct fprobe_hlist_node	array[];
+};
 
 /**
  * struct fprobe - ftrace based probe.
- * @ops: The ftrace_ops.
+ *
  * @nmissed: The counter for missing events.
  * @flags: The status flag.
- * @rethook: The rethook data structure. (internal data)
  * @entry_data_size: The private data storage size.
- * @nr_maxactive: The max number of active functions.
+ * @nr_maxactive: The max number of active functions. (*deprecated)
  * @entry_handler: The callback function for function entry.
  * @exit_handler: The callback function for function exit.
+ * @hlist_array: The fprobe_hlist for fprobe search from IP hash table.
  */
 struct fprobe {
-#ifdef CONFIG_FUNCTION_TRACER
-	/*
-	 * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too.
-	 * But user of fprobe may keep embedding the struct fprobe on their own
-	 * code. To avoid build error, this will keep the fprobe data structure
-	 * defined here, but remove ftrace_ops data structure.
-	 */
-	struct ftrace_ops	ops;
-#endif
 	unsigned long		nmissed;
 	unsigned int		flags;
-	struct rethook		*rethook;
 	size_t			entry_data_size;
 	int			nr_maxactive;
 
@@ -40,6 +64,8 @@  struct fprobe {
 	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
 			     unsigned long ret_ip, struct ftrace_regs *fregs,
 			     void *entry_data);
+
+	struct fprobe_hlist	*hlist_array;
 };
 
 /* This fprobe is soft-disabled. */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1a2544712690..11a96275b68c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -296,11 +296,9 @@  config DYNAMIC_FTRACE_WITH_ARGS
 
 config FPROBE
 	bool "Kernel Function Probe (fprobe)"
-	depends on FUNCTION_TRACER
-	depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
-	depends on HAVE_PT_REGS_TO_FTRACE_REGS_CAST || !HAVE_DYNAMIC_FTRACE_WITH_ARGS
-	depends on HAVE_RETHOOK
-	select RETHOOK
+	depends on FUNCTION_GRAPH_TRACER
+	depends on HAVE_FUNCTION_GRAPH_FREGS
+	depends on DYNAMIC_FTRACE_WITH_ARGS || !HAVE_DYNAMIC_FTRACE_WITH_ARGS
 	default n
 	help
 	  This option enables kernel function probe (fprobe) based on ftrace.
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 688b897626b4..53e681c2458b 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -8,98 +8,193 @@ 
 #include <linux/fprobe.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
-#include <linux/rethook.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
 
 #include "trace.h"
 
-struct fprobe_rethook_node {
-	struct rethook_node node;
-	unsigned long entry_ip;
-	unsigned long entry_parent_ip;
-	char data[];
-};
+#define FPROBE_IP_HASH_BITS 8
+#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
 
-static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
-			struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
-	struct fprobe_rethook_node *fpr;
-	struct rethook_node *rh = NULL;
-	struct fprobe *fp;
-	void *entry_data = NULL;
-	int ret = 0;
+#define FPROBE_HASH_BITS 6
+#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
 
-	fp = container_of(ops, struct fprobe, ops);
+/*
+ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
+ *   exists. The key is the address of fprobe instance.
+ * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
+ *   instance related to the function address. The key is the ftrace IP
+ *   address.
+ *
+ * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
+ * are set to NULL and those entries are deleted from both hash tables (by
+ * hlist_del_rcu). After an RCU grace period, the fprobe_hlist itself will be
+ * released.
+ *
+ * fprobe_table and fprobe_ip_table can be accessed from either:
+ *  - Normal hlist traversal and RCU add/del while 'fprobe_mutex' is held.
+ *  - RCU hlist traversal with preemption disabled.
+ */
+static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
+static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static DEFINE_MUTEX(fprobe_mutex);
 
-	if (fp->exit_handler) {
-		rh = rethook_try_get(fp->rethook);
-		if (!rh) {
-			fp->nmissed++;
-			return;
-		}
-		fpr = container_of(rh, struct fprobe_rethook_node, node);
-		fpr->entry_ip = ip;
-		fpr->entry_parent_ip = parent_ip;
-		if (fp->entry_data_size)
-			entry_data = fpr->data;
+/*
+ * Find the first fprobe in the hlist. It will be iterated twice in the entry
+ * probe, once for computing the total required data size, and a second time
+ * for calling back the user handlers.
+ * Thus the hlist in the fprobe_table must be sorted and a new probe needs to
+ * be added *before* the first fprobe.
+ */
+static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+{
+	struct fprobe_hlist_node *node;
+	struct hlist_head *head;
+
+	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+	hlist_for_each_entry_rcu(node, head, hlist,
+				 lockdep_is_held(&fprobe_mutex)) {
+		if (node->addr == ip)
+			return node;
 	}
+	return NULL;
+}
+NOKPROBE_SYMBOL(find_first_fprobe_node);
 
-	if (fp->entry_handler)
-		ret = fp->entry_handler(fp, ip, parent_ip, fregs, entry_data);
+/* Node insertion and deletion requires the fprobe_mutex */
+static void insert_fprobe_node(struct fprobe_hlist_node *node)
+{
+	unsigned long ip = node->addr;
+	struct fprobe_hlist_node *next;
+	struct hlist_head *head;
 
-	/* If entry_handler returns !0, nmissed is not counted. */
-	if (rh) {
-		if (ret)
-			rethook_recycle(rh);
-		else
-			rethook_hook(rh, ftrace_get_regs(fregs), true);
+	lockdep_assert_held(&fprobe_mutex);
+
+	next = find_first_fprobe_node(ip);
+	if (next) {
+		hlist_add_before_rcu(&node->hlist, &next->hlist);
+		return;
 	}
+	head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+	hlist_add_head_rcu(&node->hlist, head);
 }
 
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
-		struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Return true if there are synonyms */
+static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	struct fprobe *fp;
-	int bit;
+	lockdep_assert_held(&fprobe_mutex);
 
-	fp = container_of(ops, struct fprobe, ops);
-	if (fprobe_disabled(fp))
-		return;
+	WRITE_ONCE(node->fp, NULL);
+	hlist_del_rcu(&node->hlist);
+	return !!find_first_fprobe_node(node->addr);
+}
 
-	/* recursion detection has to go before any traceable function and
-	 * all functions before this point should be marked as notrace
-	 */
-	bit = ftrace_test_recursion_trylock(ip, parent_ip);
-	if (bit < 0) {
-		fp->nmissed++;
-		return;
+/* Check existence of the fprobe */
+static bool is_fprobe_still_exist(struct fprobe *fp)
+{
+	struct hlist_head *head;
+	struct fprobe_hlist *fph;
+
+	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+	hlist_for_each_entry_rcu(fph, head, hlist,
+				 lockdep_is_held(&fprobe_mutex)) {
+		if (fph->fp == fp)
+			return true;
 	}
-	__fprobe_handler(ip, parent_ip, ops, fregs);
-	ftrace_test_recursion_unlock(bit);
+	return false;
+}
+NOKPROBE_SYMBOL(is_fprobe_still_exist);
+
+static int add_fprobe_hash(struct fprobe *fp)
+{
+	struct fprobe_hlist *fph = fp->hlist_array;
+	struct hlist_head *head;
+
+	lockdep_assert_held(&fprobe_mutex);
 
+	if (WARN_ON_ONCE(!fph))
+		return -EINVAL;
+
+	if (is_fprobe_still_exist(fp))
+		return -EEXIST;
+
+	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+	hlist_add_head_rcu(&fp->hlist_array->hlist, head);
+	return 0;
 }
-NOKPROBE_SYMBOL(fprobe_handler);
 
-static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
-				  struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static int del_fprobe_hash(struct fprobe *fp)
 {
-	struct fprobe *fp;
-	int bit;
+	struct fprobe_hlist *fph = fp->hlist_array;
 
-	fp = container_of(ops, struct fprobe, ops);
-	if (fprobe_disabled(fp))
-		return;
+	lockdep_assert_held(&fprobe_mutex);
 
-	/* recursion detection has to go before any traceable function and
-	 * all functions called before this point should be marked as notrace
-	 */
-	bit = ftrace_test_recursion_trylock(ip, parent_ip);
-	if (bit < 0) {
-		fp->nmissed++;
-		return;
+	if (WARN_ON_ONCE(!fph))
+		return -EINVAL;
+
+	if (!is_fprobe_still_exist(fp))
+		return -ENOENT;
+
+	fph->fp = NULL;
+	hlist_del_rcu(&fph->hlist);
+	return 0;
+}
+
+/* The entry data size is encoded in 4 bits, so its maximum is 15 * sizeof(long) */
+#define FPROBE_HEADER_SIZE_BITS		4
+#define MAX_FPROBE_DATA_SIZE_WORD	((1L << FPROBE_HEADER_SIZE_BITS) - 1)
+#define MAX_FPROBE_DATA_SIZE		(MAX_FPROBE_DATA_SIZE_WORD * sizeof(long))
+#define FPROBE_HEADER_PTR_BITS		(BITS_PER_LONG - FPROBE_HEADER_SIZE_BITS)
+#define FPROBE_HEADER_PTR_MASK		GENMASK(FPROBE_HEADER_PTR_BITS - 1, 0)
+#define FPROBE_HEADER_SIZE		sizeof(unsigned long)
+
+static inline unsigned long encode_fprobe_header(struct fprobe *fp, int size_words)
+{
+	if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
+	    ((unsigned long)fp & ~FPROBE_HEADER_PTR_MASK) !=
+	    ~FPROBE_HEADER_PTR_MASK)) {
+		return 0;
 	}
+	return ((unsigned long)size_words << FPROBE_HEADER_PTR_BITS) |
+		((unsigned long)fp & FPROBE_HEADER_PTR_MASK);
+}
+
+/* Return reserved data size in words */
+static inline int decode_fprobe_header(unsigned long val, struct fprobe **fp)
+{
+	unsigned long ptr;
+
+	ptr = (val & FPROBE_HEADER_PTR_MASK) | ~FPROBE_HEADER_PTR_MASK;
+	if (fp)
+		*fp = (struct fprobe *)ptr;
+	return val >> FPROBE_HEADER_PTR_BITS;
+}
+
+/*
+ * fprobe shadow stack management:
+ * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
+ * among the probes on the same function exit. Note that since a new probe can
+ * be registered before a target function returns, we can not use the hash
+ * table to find the corresponding probes. Thus the probe address is stored on
+ * the shadow stack with its entry data size.
+ *
+ */
+static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+				   struct fprobe *fp, struct ftrace_regs *fregs,
+				   void *data)
+{
+	if (!fp->entry_handler)
+		return 0;
+
+	return fp->entry_handler(fp, ip, parent_ip, fregs, data);
+}
 
+static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+					  struct fprobe *fp, struct ftrace_regs *fregs,
+					  void *data)
+{
+	int ret;
 	/*
 	 * This user handler is shared with other kprobes and is not expected to be
 	 * called recursively. So if any other kprobe handler is running, this will
@@ -108,45 +203,180 @@  static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
 	 */
 	if (unlikely(kprobe_running())) {
 		fp->nmissed++;
-		goto recursion_unlock;
+		return 0;
 	}
 
 	kprobe_busy_begin();
-	__fprobe_handler(ip, parent_ip, ops, fregs);
+	ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
 	kprobe_busy_end();
-
-recursion_unlock:
-	ftrace_test_recursion_unlock(bit);
+	return ret;
 }
 
-static void fprobe_exit_handler(struct rethook_node *rh, void *data,
-				unsigned long ret_ip, struct pt_regs *regs)
+static int fprobe_entry(unsigned long func, unsigned long ret_ip,
+			struct ftrace_regs *fregs, struct fgraph_ops *gops)
 {
-	struct fprobe *fp = (struct fprobe *)data;
-	struct fprobe_rethook_node *fpr;
-	struct ftrace_regs *fregs = (struct ftrace_regs *)regs;
-	int bit;
+	struct fprobe_hlist_node *node, *first;
+	unsigned long *fgraph_data = NULL;
+	unsigned long header;
+	int reserved_words;
+	struct fprobe *fp;
+	int used, ret;
 
-	if (!fp || fprobe_disabled(fp))
-		return;
+	if (WARN_ON_ONCE(!fregs))
+		return 0;
 
-	fpr = container_of(rh, struct fprobe_rethook_node, node);
+	first = node = find_first_fprobe_node(func);
+	if (unlikely(!first))
+		return 0;
+
+	reserved_words = 0;
+	hlist_for_each_entry_from_rcu(node, hlist) {
+		if (node->addr != func)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (!fp || !fp->exit_handler)
+			continue;
+		/*
+		 * Since fprobe can be enabled until the next loop, we ignore the
+		 * fprobe's disabled flag in this loop.
+		 */
+		reserved_words +=
+			DIV_ROUND_UP(fp->entry_data_size, sizeof(long)) + 1;
+	}
+	node = first;
+	if (reserved_words) {
+		fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
+		if (unlikely(!fgraph_data)) {
+			hlist_for_each_entry_from_rcu(node, hlist) {
+				if (node->addr != func)
+					break;
+				fp = READ_ONCE(node->fp);
+				if (fp && !fprobe_disabled(fp))
+					fp->nmissed++;
+			}
+			return 0;
+		}
+	}
 
 	/*
-	 * we need to assure no calls to traceable functions in-between the
-	 * end of fprobe_handler and the beginning of fprobe_exit_handler.
+	 * TODO: recursion detection has been done in the fgraph. Thus we need
+	 * to add a callback to increment missed counter.
 	 */
-	bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
-	if (bit < 0) {
-		fp->nmissed++;
+	used = 0;
+	hlist_for_each_entry_from_rcu(node, hlist) {
+		void *data;
+
+		if (node->addr != func)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (!fp || fprobe_disabled(fp))
+			continue;
+
+		if (fp->entry_data_size && fp->exit_handler)
+			data = fgraph_data + used + 1;
+		else
+			data = NULL;
+
+		if (fprobe_shared_with_kprobes(fp))
+			ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
+		else
+			ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
+		/* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
+		if (!ret && fp->exit_handler) {
+			int size_words = DIV_ROUND_UP(fp->entry_data_size, sizeof(long));
+
+			header = encode_fprobe_header(fp, size_words);
+			if (likely(header)) {
+				fgraph_data[used] = header;
+				used += size_words + 1;
+			}
+		}
+	}
+	if (used < reserved_words)
+		memset(fgraph_data + used, 0, reserved_words - used);
+
+	/* If any exit_handler is set, data must be used. */
+	return used != 0;
+}
+NOKPROBE_SYMBOL(fprobe_entry);
+
+static void fprobe_return(unsigned long func, unsigned long ret_ip,
+			  struct ftrace_regs *fregs, struct fgraph_ops *gops)
+{
+	unsigned long *fgraph_data = NULL;
+	unsigned long val;
+	struct fprobe *fp;
+	int size, curr;
+	int size_words;
+
+	fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
+	if (!fgraph_data)
+		return;
+	size_words = DIV_ROUND_UP(size, sizeof(long));
+
+	preempt_disable();
+
+	curr = 0;
+	while (size_words > curr) {
+		val = fgraph_data[curr++];
+		if (!val)
+			break;
+
+		size = decode_fprobe_header(val, &fp);
+		if (fp && is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+			if (WARN_ON_ONCE(curr + size > size_words))
+				break;
+			fp->exit_handler(fp, func, ret_ip, fregs,
+					 size ? fgraph_data + curr : NULL);
+		}
+		curr += size + 1;
+	}
+	preempt_enable();
+}
+NOKPROBE_SYMBOL(fprobe_return);
+
+static struct fgraph_ops fprobe_graph_ops = {
+	.entryregfunc	= fprobe_entry,
+	.retregfunc	= fprobe_return,
+};
+static int fprobe_graph_active;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+	int ret;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+	if (ret)
+		return ret;
+
+	if (!fprobe_graph_active) {
+		ret = register_ftrace_graph(&fprobe_graph_ops);
+		if (WARN_ON_ONCE(ret)) {
+			ftrace_free_filter(&fprobe_graph_ops.ops);
+			return ret;
+		}
+	}
+	fprobe_graph_active++;
+	return 0;
+}
+
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+	lockdep_assert_held(&fprobe_mutex);
+
+	fprobe_graph_active--;
+	if (!fprobe_graph_active) {
+		/* Q: should we unregister it ? */
+		unregister_ftrace_graph(&fprobe_graph_ops);
 		return;
 	}
 
-	fp->exit_handler(fp, fpr->entry_ip, ret_ip, fregs,
-			 fp->entry_data_size ? (void *)fpr->data : NULL);
-	ftrace_test_recursion_unlock(bit);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
 }
-NOKPROBE_SYMBOL(fprobe_exit_handler);
 
 static int symbols_cmp(const void *a, const void *b)
 {
@@ -176,55 +406,97 @@  static unsigned long *get_ftrace_locations(const char **syms, int num)
 	return ERR_PTR(-ENOENT);
 }
 
-static void fprobe_init(struct fprobe *fp)
-{
-	fp->nmissed = 0;
-	if (fprobe_shared_with_kprobes(fp))
-		fp->ops.func = fprobe_kprobe_handler;
-	else
-		fp->ops.func = fprobe_handler;
-	fp->ops.flags |= FTRACE_OPS_FL_SAVE_ARGS;
-}
+struct filter_match_data {
+	const char *filter;
+	const char *notfilter;
+	size_t index;
+	size_t size;
+	unsigned long *addrs;
+};
 
-static int fprobe_init_rethook(struct fprobe *fp, int num)
+static int filter_match_callback(void *data, const char *name, unsigned long addr)
 {
-	int size;
+	struct filter_match_data *match = data;
 
-	if (num <= 0)
-		return -EINVAL;
+	if (!glob_match(match->filter, name) ||
+	    (match->notfilter && glob_match(match->notfilter, name)))
+		return 0;
 
-	if (!fp->exit_handler) {
-		fp->rethook = NULL;
+	if (!ftrace_location(addr))
 		return 0;
-	}
 
-	/* Initialize rethook if needed */
-	if (fp->nr_maxactive)
-		size = fp->nr_maxactive;
-	else
-		size = num * num_possible_cpus() * 2;
-	if (size <= 0)
-		return -EINVAL;
+	if (match->addrs)
+		match->addrs[match->index] = addr;
 
-	/* Initialize rethook */
-	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
-				sizeof(struct fprobe_rethook_node), size);
-	if (IS_ERR(fp->rethook))
-		return PTR_ERR(fp->rethook);
+	match->index++;
+	return match->index == match->size;
+}
 
-	return 0;
+/*
+ * Make IP list from the filter/no-filter glob patterns.
+ * Return the number of matched symbols, or -ENOENT.
+ */
+static int ip_list_from_filter(const char *filter, const char *notfilter,
+			       unsigned long *addrs, size_t size)
+{
+	struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
+		.index = 0, .size = size, .addrs = addrs};
+	int ret;
+
+	ret = kallsyms_on_each_symbol(filter_match_callback, &match);
+	if (ret < 0)
+		return ret;
+	ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+	if (ret < 0)
+		return ret;
+
+	return match.index ?: -ENOENT;
 }
 
 static void fprobe_fail_cleanup(struct fprobe *fp)
 {
-	if (!IS_ERR_OR_NULL(fp->rethook)) {
-		/* Don't need to cleanup rethook->handler because this is not used. */
-		rethook_free(fp->rethook);
-		fp->rethook = NULL;
+	kfree(fp->hlist_array);
+	fp->hlist_array = NULL;
+}
+
+/* Initialize the fprobe data structure. */
+static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
+{
+	struct fprobe_hlist *hlist_array;
+	unsigned long addr;
+	int size, i;
+
+	if (!fp || !addrs || num <= 0)
+		return -EINVAL;
+
+	size = ALIGN(fp->entry_data_size, sizeof(long));
+	if (size > MAX_FPROBE_DATA_SIZE)
+		return -E2BIG;
+	fp->entry_data_size = size;
+
+	hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
+	if (!hlist_array)
+		return -ENOMEM;
+
+	fp->nmissed = 0;
+
+	hlist_array->size = num;
+	fp->hlist_array = hlist_array;
+	hlist_array->fp = fp;
+	for (i = 0; i < num; i++) {
+		hlist_array->array[i].fp = fp;
+		addr = ftrace_location(addrs[i]);
+		if (!addr) {
+			fprobe_fail_cleanup(fp);
+			return -ENOENT;
+		}
+		hlist_array->array[i].addr = addr;
 	}
-	ftrace_free_filter(&fp->ops);
+	return 0;
 }
 
+#define FPROBE_IPS_MAX	INT_MAX
+
 /**
  * register_fprobe() - Register fprobe to ftrace by pattern.
  * @fp: A fprobe data structure to be registered.
@@ -238,46 +510,24 @@  static void fprobe_fail_cleanup(struct fprobe *fp)
  */
 int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
 {
-	struct ftrace_hash *hash;
-	unsigned char *str;
-	int ret, len;
+	unsigned long *addrs;
+	int ret;
 
 	if (!fp || !filter)
 		return -EINVAL;
 
-	fprobe_init(fp);
-
-	len = strlen(filter);
-	str = kstrdup(filter, GFP_KERNEL);
-	ret = ftrace_set_filter(&fp->ops, str, len, 0);
-	kfree(str);
-	if (ret)
+	ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
+	if (ret < 0)
 		return ret;
 
-	if (notfilter) {
-		len = strlen(notfilter);
-		str = kstrdup(notfilter, GFP_KERNEL);
-		ret = ftrace_set_notrace(&fp->ops, str, len, 0);
-		kfree(str);
-		if (ret)
-			goto out;
-	}
-
-	/* TODO:
-	 * correctly calculate the total number of filtered symbols
-	 * from both filter and notfilter.
-	 */
-	hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
-	if (WARN_ON_ONCE(!hash))
-		goto out;
-
-	ret = fprobe_init_rethook(fp, (int)hash->count);
-	if (!ret)
-		ret = register_ftrace_function(&fp->ops);
+	addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
+	if (!addrs)
+		return -ENOMEM;
+	ret = ip_list_from_filter(filter, notfilter, addrs, ret);
+	if (ret > 0)
+		ret = register_fprobe_ips(fp, addrs, ret);
 
-out:
-	if (ret)
-		fprobe_fail_cleanup(fp);
+	kfree(addrs);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_fprobe);
@@ -285,7 +535,7 @@  EXPORT_SYMBOL_GPL(register_fprobe);
 /**
  * register_fprobe_ips() - Register fprobe to ftrace by address.
  * @fp: A fprobe data structure to be registered.
- * @addrs: An array of target ftrace location addresses.
+ * @addrs: An array of target function addresses.
  * @num: The number of entries of @addrs.
  *
  * Register @fp to ftrace for enabling the probe on the address given by @addrs.
@@ -297,23 +547,27 @@  EXPORT_SYMBOL_GPL(register_fprobe);
  */
 int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 {
-	int ret;
-
-	if (!fp || !addrs || num <= 0)
-		return -EINVAL;
-
-	fprobe_init(fp);
+	struct fprobe_hlist *hlist_array;
+	int ret, i;
 
-	ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+	ret = fprobe_init(fp, addrs, num);
 	if (ret)
 		return ret;
 
-	ret = fprobe_init_rethook(fp, num);
-	if (!ret)
-		ret = register_ftrace_function(&fp->ops);
+	mutex_lock(&fprobe_mutex);
+
+	hlist_array = fp->hlist_array;
+	ret = fprobe_graph_add_ips(addrs, num);
+	if (!ret) {
+		add_fprobe_hash(fp);
+		for (i = 0; i < hlist_array->size; i++)
+			insert_fprobe_node(&hlist_array->array[i]);
+	}
+	mutex_unlock(&fprobe_mutex);
 
 	if (ret)
 		fprobe_fail_cleanup(fp);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -351,14 +605,13 @@  EXPORT_SYMBOL_GPL(register_fprobe_syms);
 
 bool fprobe_is_registered(struct fprobe *fp)
 {
-	if (!fp || (fp->ops.saved_func != fprobe_handler &&
-		    fp->ops.saved_func != fprobe_kprobe_handler))
+	if (!fp || !fp->hlist_array)
 		return false;
 	return true;
 }
 
 /**
- * unregister_fprobe() - Unregister fprobe from ftrace
+ * unregister_fprobe() - Unregister fprobe.
  * @fp: A fprobe data structure to be unregistered.
  *
  * Unregister fprobe (and remove ftrace hooks from the function entries).
@@ -367,23 +620,40 @@  bool fprobe_is_registered(struct fprobe *fp)
  */
 int unregister_fprobe(struct fprobe *fp)
 {
-	int ret;
+	struct fprobe_hlist *hlist_array;
+	unsigned long *addrs = NULL;
+	int ret = 0, i, count;
 
-	if (!fprobe_is_registered(fp))
-		return -EINVAL;
+	mutex_lock(&fprobe_mutex);
+	if (!fp || !is_fprobe_still_exist(fp)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
-	if (!IS_ERR_OR_NULL(fp->rethook))
-		rethook_stop(fp->rethook);
+	hlist_array = fp->hlist_array;
+	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
+	if (!addrs) {
+		ret = -ENOMEM;	/* TODO: Fallback to one-by-one loop */
+		goto out;
+	}
 
-	ret = unregister_ftrace_function(&fp->ops);
-	if (ret < 0)
-		return ret;
+	/* Remove non-synonym ips from table and hash */
+	count = 0;
+	for (i = 0; i < hlist_array->size; i++) {
+		if (!delete_fprobe_node(&hlist_array->array[i]))
+			addrs[count++] = hlist_array->array[i].addr;
+	}
+	del_fprobe_hash(fp);
 
-	if (!IS_ERR_OR_NULL(fp->rethook))
-		rethook_free(fp->rethook);
+	fprobe_graph_remove_ips(addrs, count);
 
-	ftrace_free_filter(&fp->ops);
+	kfree_rcu(hlist_array, rcu);
+	fp->hlist_array = NULL;
 
+out:
+	mutex_unlock(&fprobe_mutex);
+
+	kfree(addrs);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c
index 271ce0caeec0..cf92111b5c79 100644
--- a/lib/test_fprobe.c
+++ b/lib/test_fprobe.c
@@ -17,10 +17,8 @@  static u32 rand1, entry_val, exit_val;
 /* Use indirect calls to avoid inlining the target functions */
 static u32 (*target)(u32 value);
 static u32 (*target2)(u32 value);
-static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
 static unsigned long target_ip;
 static unsigned long target2_ip;
-static unsigned long target_nest_ip;
 static int entry_return_value;
 
 static noinline u32 fprobe_selftest_target(u32 value)
@@ -33,11 +31,6 @@  static noinline u32 fprobe_selftest_target2(u32 value)
 	return (value / div_factor) + 1;
 }
 
-static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
-{
-	return nest(value + 2);
-}
-
 static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
 				    unsigned long ret_ip,
 				    struct ftrace_regs *fregs, void *data)
@@ -79,22 +72,6 @@  static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
 		KUNIT_EXPECT_NULL(current_test, data);
 }
 
-static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
-				      unsigned long ret_ip,
-				      struct ftrace_regs *fregs, void *data)
-{
-	KUNIT_EXPECT_FALSE(current_test, preemptible());
-	return 0;
-}
-
-static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
-				      unsigned long ret_ip,
-				      struct ftrace_regs *fregs, void *data)
-{
-	KUNIT_EXPECT_FALSE(current_test, preemptible());
-	KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
-}
-
 /* Test entry only (no rethook) */
 static void test_fprobe_entry(struct kunit *test)
 {
@@ -191,25 +168,6 @@  static void test_fprobe_data(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
 }
 
-/* Test nr_maxactive */
-static void test_fprobe_nest(struct kunit *test)
-{
-	static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
-	struct fprobe fp = {
-		.entry_handler = nest_entry_handler,
-		.exit_handler = nest_exit_handler,
-		.nr_maxactive = 1,
-	};
-
-	current_test = test;
-	KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
-
-	target_nest(rand1, target);
-	KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
-
-	KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
-}
-
 static void test_fprobe_skip(struct kunit *test)
 {
 	struct fprobe fp = {
@@ -247,10 +205,8 @@  static int fprobe_test_init(struct kunit *test)
 	rand1 = get_random_u32_above(div_factor);
 	target = fprobe_selftest_target;
 	target2 = fprobe_selftest_target2;
-	target_nest = fprobe_selftest_nest_target;
 	target_ip = get_ftrace_location(target);
 	target2_ip = get_ftrace_location(target2);
-	target_nest_ip = get_ftrace_location(target_nest);
 
 	return 0;
 }
@@ -260,7 +216,6 @@  static struct kunit_case fprobe_testcases[] = {
 	KUNIT_CASE(test_fprobe),
 	KUNIT_CASE(test_fprobe_syms),
 	KUNIT_CASE(test_fprobe_data),
-	KUNIT_CASE(test_fprobe_nest),
 	KUNIT_CASE(test_fprobe_skip),
 	{}
 };