
[v3,bpf-next,4/4] bpf: support bpf_trampoline on functions with IPMODIFY (e.g. livepatch)

Message ID 20220718001405.2236811-5-song@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series ftrace: host klp and bpf trampoline together

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1424 this patch: 1424
netdev/cc_maintainers warning 8 maintainers not CCed: haoluo@google.com ast@kernel.org martin.lau@linux.dev yhs@fb.com john.fastabend@gmail.com andrii@kernel.org sdf@google.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 168 this patch: 168
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1433 this patch: 1433
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.rst
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc

Commit Message

Song Liu July 18, 2022, 12:14 a.m. UTC
When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf
trampoline must follow the instruction pointer saved on stack. This needs
extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag.

Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used
by BPF trampoline. This enables tracing functions with livepatch.

This also requires moving the bpf trampoline to the *_ftrace_direct_multi APIs.

Link: https://lore.kernel.org/all/20220602193706.2607681-2-song@kernel.org/
Signed-off-by: Song Liu <song@kernel.org>
---
 include/linux/bpf.h     |   8 ++
 kernel/bpf/trampoline.c | 158 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 149 insertions(+), 17 deletions(-)

Comments

Petr Mladek July 18, 2022, 1:07 p.m. UTC | #1
On Sun 2022-07-17 17:14:05, Song Liu wrote:
> When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf
> trampoline must follow the instruction pointer saved on stack. This needs
> extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag.
> 
> Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used
> by BPF trampoline. This enables tracing functions with livepatch.
> 
> This also requires moving bpf trampoline to *_ftrace_direct_mult APIs.
> 
> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -13,6 +13,7 @@
>  #include <linux/static_call.h>
>  #include <linux/bpf_verifier.h>
>  #include <linux/bpf_lsm.h>
> +#include <linux/delay.h>
>  
>  /* dummy _ops. The verifier will operate on target program's ops. */
>  const struct bpf_verifier_ops bpf_extension_verifier_ops = {
> @@ -29,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
>  /* serializes access to trampoline_table */
>  static DEFINE_MUTEX(trampoline_mutex);
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
> +
> +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
> +{
> +	struct bpf_trampoline *tr = ops->private;
> +	int ret = 0;
> +
> +	if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
> +		/* This is called inside register_ftrace_direct_multi(), so
> +		 * tr->mutex is already locked.
> +		 */
> +		WARN_ON_ONCE(!mutex_is_locked(&tr->mutex));

Again, better is:

		lockdep_assert_held_once(&tr->mutex);

> +
> +		/* Instead of updating the trampoline here, we propagate
> +		 * -EAGAIN to register_ftrace_direct_multi(). Then we can
> +		 * retry register_ftrace_direct_multi() after updating the
> +		 * trampoline.
> +		 */
> +		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
> +		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
> +			if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
> +				return -EBUSY;
> +
> +			tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
> +			return -EAGAIN;
> +		}
> +
> +		return 0;
> +	}
> +
> +	/* The normal locking order is
> +	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
> +	 *
> +	 * The following two commands are called from
> +	 *
> +	 *   prepare_direct_functions_for_ipmodify
> +	 *   cleanup_direct_functions_after_ipmodify
> +	 *
> +	 * In both cases, direct_mutex is already locked. Use
> +	 * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
> +	 * (something else is making changes to this same trampoline).
> +	 */
> +	if (!mutex_trylock(&tr->mutex)) {
> +		/* sleep 1 ms to make sure whatever holding tr->mutex makes
> +		 * some progress.
> +		 */
> +		msleep(1);
> +		return -EAGAIN;
> +	}

Huh, this looks horrible. And I do not get it. The above block prints
a warning when the mutex is not taken. Why is it already taken
when cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF,
and why does it have to be explicitly taken otherwise?

Would it be possible to call prepare_direct_functions_for_ipmodify() and
cleanup_direct_functions_after_ipmodify() with tr->mutex already taken,
so that the ordering is correct even in this case?

That said, this is the first version where I am in Cc. I am not sure
if it has already been discussed.


> +	switch (cmd) {
> +	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
> +		tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
> +
> +		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
> +		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
> +			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
> +		break;
> +	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
> +		tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
> +
> +		if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
> +			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	};
> +
> +	mutex_unlock(&tr->mutex);
> +	return ret;
> +}
> +#endif
> +
>  bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
>  {
>  	enum bpf_attach_type eatype = prog->expected_attach_type;

Note that I did not do a proper review. I am not very familiar with the
ftrace code. I just wanted to check how much this patchset affects
livepatching and commented on the things I noticed.

Best Regards,
Petr
Song Liu July 18, 2022, 4:55 p.m. UTC | #2
> On Jul 18, 2022, at 6:07 AM, Petr Mladek <pmladek@suse.com> wrote:
> 
> On Sun 2022-07-17 17:14:05, Song Liu wrote:
>> When tracing a function with IPMODIFY ftrace_ops (livepatch), the bpf
>> trampoline must follow the instruction pointer saved on stack. This needs
>> extra handling for bpf trampolines with BPF_TRAMP_F_CALL_ORIG flag.
>> 
>> Implement bpf_tramp_ftrace_ops_func and use it for the ftrace_ops used
>> by BPF trampoline. This enables tracing functions with livepatch.
>> 
>> This also requires moving bpf trampoline to *_ftrace_direct_mult APIs.
>> 
>> --- a/kernel/bpf/trampoline.c
>> +++ b/kernel/bpf/trampoline.c
>> @@ -13,6 +13,7 @@
>> #include <linux/static_call.h>
>> #include <linux/bpf_verifier.h>
>> #include <linux/bpf_lsm.h>
>> +#include <linux/delay.h>
>> 
>> /* dummy _ops. The verifier will operate on target program's ops. */
>> const struct bpf_verifier_ops bpf_extension_verifier_ops = {
>> @@ -29,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
>> /* serializes access to trampoline_table */
>> static DEFINE_MUTEX(trampoline_mutex);
>> 
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
>> +
>> +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
>> +{
>> +	struct bpf_trampoline *tr = ops->private;
>> +	int ret = 0;
>> +
>> +	if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
>> +		/* This is called inside register_ftrace_direct_multi(), so
>> +		 * tr->mutex is already locked.
>> +		 */
>> +		WARN_ON_ONCE(!mutex_is_locked(&tr->mutex));
> 
> Again, better is:
> 
> 		lockdep_assert_held_once(&tr->mutex);

Will fix. 

> 
>> +
>> +		/* Instead of updating the trampoline here, we propagate
>> +		 * -EAGAIN to register_ftrace_direct_multi(). Then we can
>> +		 * retry register_ftrace_direct_multi() after updating the
>> +		 * trampoline.
>> +		 */
>> +		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
>> +		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
>> +			if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
>> +				return -EBUSY;
>> +
>> +			tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
>> +			return -EAGAIN;
>> +		}
>> +
>> +		return 0;
>> +	}
>> +
>> +	/* The normal locking order is
>> +	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
>> +	 *
>> +	 * The following two commands are called from
>> +	 *
>> +	 *   prepare_direct_functions_for_ipmodify
>> +	 *   cleanup_direct_functions_after_ipmodify
>> +	 *
>> +	 * In both cases, direct_mutex is already locked. Use
>> +	 * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
>> +	 * (something else is making changes to this same trampoline).
>> +	 */
>> +	if (!mutex_trylock(&tr->mutex)) {
>> +		/* sleep 1 ms to make sure whatever holding tr->mutex makes
>> +		 * some progress.
>> +		 */
>> +		msleep(1);
>> +		return -EAGAIN;
>> +	}
> 
> Huh, this looks horrible. And I do not get it. The above block prints
> a warning when the mutex is not taken. Why it is already taken
> when cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF
> and why it has to be explicitly taken otherwise?

There are two different scenarios:

1. Livepatch is applied first, then the bpf trampoline is registered.

In this case, we call ops_func(FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF).
_SELF means we are making changes to the DIRECT ops (the bpf trampoline)
itself. In this case, tr->mutex is already taken.

2. The bpf trampoline is registered first, then livepatch is applied.

In this case, ftrace cannot take tr->mutex first. Instead, ftrace has to
lock direct_mutex, find any conflicting DIRECT ops, and call
ops_func(FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER). _PEER means this is
called on behalf of a peer ftrace_ops (livepatch).
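
To make the locking concrete, here is a minimal userspace sketch of the
same pattern (pthread names are purely illustrative, this is not kernel
code): the normal path takes tr->mutex before direct_mutex, while the
PEER path already holds direct_mutex and therefore may only trylock:

	#include <errno.h>
	#include <pthread.h>

	static pthread_mutex_t tr_mutex = PTHREAD_MUTEX_INITIALIZER;     /* plays tr->mutex */
	static pthread_mutex_t direct_mutex = PTHREAD_MUTEX_INITIALIZER; /* plays ftrace's direct_mutex */

	/* Normal order (trampoline update): tr_mutex => direct_mutex */
	static void normal_path(void)
	{
		pthread_mutex_lock(&tr_mutex);
		pthread_mutex_lock(&direct_mutex);
		/* ... update the trampoline ... */
		pthread_mutex_unlock(&direct_mutex);
		pthread_mutex_unlock(&tr_mutex);
	}

	/* Inverted order (livepatch/PEER path): direct_mutex is already
	 * held, so blocking on tr_mutex here could deadlock against
	 * normal_path(). Trylock and return -EAGAIN instead.
	 */
	static int peer_path(void)
	{
		int ret = 0;

		pthread_mutex_lock(&direct_mutex);
		if (pthread_mutex_trylock(&tr_mutex)) {
			ret = -EAGAIN;	/* tr_mutex busy; caller retries */
		} else {
			/* ... handle the SHARE_IPMODIFY change ... */
			pthread_mutex_unlock(&tr_mutex);
		}
		pthread_mutex_unlock(&direct_mutex);
		return ret;
	}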

> 
> Would it be possible to call prepare_direct_functions_for_ipmodify(),
> cleanup_direct_functions_after_ipmodify() with rt->mutex already taken
> so that the ordering is correct even in this case.
> 
> That said, this is the first version when I am in Cc. I am not sure
> if it has already been discussed.
> 
> 
>> +	switch (cmd) {
>> +	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
>> +		tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
>> +
>> +		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
>> +		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
>> +			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
>> +		break;
>> +	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
>> +		tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
>> +
>> +		if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
>> +			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
>> +		break;
>> +	default:
>> +		ret = -EINVAL;
>> +		break;
>> +	};
>> +
>> +	mutex_unlock(&tr->mutex);
>> +	return ret;
>> +}
>> +#endif
>> +
>> bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
>> {
>> 	enum bpf_attach_type eatype = prog->expected_attach_type;
> 
> Note that I did not do proper review. I not much familiar with the
> ftrace code. I just wanted to check how much this patchset affects
> livepatching and noticed the commented things.

Before this set, livepatch and a bpf trampoline cannot work on the same
function; whichever is applied later will fail. After this, the two
work almost perfectly together. By "almost" I mean the race
condition protected by the mutex_trylock:

	if (!mutex_trylock(&tr->mutex)) {
		/* sleep 1 ms to make sure whatever holding tr->mutex makes
		 * some progress.
		 */
		msleep(1);
		return -EAGAIN;
	}

If livepatch is applied while something is making changes to the bpf
trampoline at the same time, the livepatch code will get -EAGAIN from
register_ftrace_function(). Then we need to retry from the livepatch side.
AFAICT, kpatch user space already does the retry, so it is going to work
as-is. I am not sure about other user space though.
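
For a caller that does not retry yet, the fix is just a bounded retry
loop. A rough hypothetical sketch (the helper name is made up; it is not
part of this patch or of livepatch):

	#include <linux/errno.h>
	#include <linux/ftrace.h>

	/* Hypothetical helper, for illustration only: retry registration
	 * when a BPF trampoline on the same function is being updated
	 * concurrently and ftrace returns -EAGAIN.
	 */
	static int register_ipmodify_ops_retry(struct ftrace_ops *ops)
	{
		int ret, tries = 10;

		do {
			ret = register_ftrace_function(ops);
		} while (ret == -EAGAIN && --tries);

		return ret;
	}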

The msleep(1) was suggested by Steven to avoid a deadlock in the RT case [1].

Does this make sense?

Thanks,
Song

[1] https://lore.kernel.org/bpf/20220706211858.67f9254d@rorschach.local.home/

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7496842a4671..f35c59e0b742 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -47,6 +47,7 @@  struct kobject;
 struct mem_cgroup;
 struct module;
 struct bpf_func_state;
+struct ftrace_ops;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -756,6 +757,11 @@  struct btf_func_model {
  */
 #define BPF_TRAMP_F_ORIG_STACK		BIT(5)
 
+/* This trampoline is on a function with another ftrace_ops with IPMODIFY,
+ * e.g., a live patch. This flag is set and cleared by ftrace callbacks.
+ */
+#define BPF_TRAMP_F_SHARE_IPMODIFY	BIT(6)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
@@ -838,9 +844,11 @@  struct bpf_tramp_image {
 struct bpf_trampoline {
 	/* hlist for trampoline_table */
 	struct hlist_node hlist;
+	struct ftrace_ops *fops;
 	/* serializes access to fields of this trampoline */
 	struct mutex mutex;
 	refcount_t refcnt;
+	u32 flags;
 	u64 key;
 	struct {
 		struct btf_func_model model;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index fd69812412ca..fa901aef7930 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,7 @@ 
 #include <linux/static_call.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
+#include <linux/delay.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -29,6 +30,81 @@  static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline_table */
 static DEFINE_MUTEX(trampoline_mutex);
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+{
+	struct bpf_trampoline *tr = ops->private;
+	int ret = 0;
+
+	if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+		/* This is called inside register_ftrace_direct_multi(), so
+		 * tr->mutex is already locked.
+		 */
+		WARN_ON_ONCE(!mutex_is_locked(&tr->mutex));
+
+		/* Instead of updating the trampoline here, we propagate
+		 * -EAGAIN to register_ftrace_direct_multi(). Then we can
+		 * retry register_ftrace_direct_multi() after updating the
+		 * trampoline.
+		 */
+		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+			if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+				return -EBUSY;
+
+			tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+			return -EAGAIN;
+		}
+
+		return 0;
+	}
+
+	/* The normal locking order is
+	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+	 *
+	 * The following two commands are called from
+	 *
+	 *   prepare_direct_functions_for_ipmodify
+	 *   cleanup_direct_functions_after_ipmodify
+	 *
+	 * In both cases, direct_mutex is already locked. Use
+	 * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+	 * (something else is making changes to this same trampoline).
+	 */
+	if (!mutex_trylock(&tr->mutex)) {
+		/* sleep 1 ms to make sure whatever holding tr->mutex makes
+		 * some progress.
+		 */
+		msleep(1);
+		return -EAGAIN;
+	}
+
+	switch (cmd) {
+	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+		tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+		break;
+	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+		tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+		if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	};
+
+	mutex_unlock(&tr->mutex);
+	return ret;
+}
+#endif
+
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 {
 	enum bpf_attach_type eatype = prog->expected_attach_type;
@@ -89,6 +165,16 @@  static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
 	if (!tr)
 		goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+	if (!tr->fops) {
+		kfree(tr);
+		tr = NULL;
+		goto out;
+	}
+	tr->fops->private = tr;
+	tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
 
 	tr->key = key;
 	INIT_HLIST_NODE(&tr->hlist);
@@ -128,7 +214,7 @@  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	int ret;
 
 	if (tr->func.ftrace_managed)
-		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
@@ -137,15 +223,20 @@  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	return ret;
 }
 
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+			 bool lock_direct_mutex)
 {
 	void *ip = tr->func.addr;
 	int ret;
 
-	if (tr->func.ftrace_managed)
-		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
-	else
+	if (tr->func.ftrace_managed) {
+		if (lock_direct_mutex)
+			ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+		else
+			ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+	}
 	return ret;
 }
 
@@ -163,10 +254,12 @@  static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 	if (bpf_trampoline_module_get(tr))
 		return -ENOENT;
 
-	if (tr->func.ftrace_managed)
-		ret = register_ftrace_direct((long)ip, (long)new_addr);
-	else
+	if (tr->func.ftrace_managed) {
+		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0);
+		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+	}
 
 	if (ret)
 		bpf_trampoline_module_put(tr);
@@ -332,11 +425,11 @@  static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 	return ERR_PTR(err);
 }
 
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
 {
 	struct bpf_tramp_image *im;
 	struct bpf_tramp_links *tlinks;
-	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+	u32 orig_flags = tr->flags;
 	bool ip_arg = false;
 	int err, total;
 
@@ -358,18 +451,31 @@  static int bpf_trampoline_update(struct bpf_trampoline *tr)
 		goto out;
 	}
 
+	/* clear all bits except SHARE_IPMODIFY */
+	tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+
 	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
-	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links)
+	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
 		/* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
 		 * should not be set together.
 		 */
-		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+		tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+	} else {
+		tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+	}
 
 	if (ip_arg)
-		flags |= BPF_TRAMP_F_IP_ARG;
+		tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+	if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
+	    (tr->flags & BPF_TRAMP_F_CALL_ORIG))
+		tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
 
 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
-					  &tr->func.model, flags, tlinks,
+					  &tr->func.model, tr->flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
 		goto out;
@@ -378,17 +484,34 @@  static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	WARN_ON(!tr->cur_image && tr->selector);
 	if (tr->cur_image)
 		/* progs already running at this address */
-		err = modify_fentry(tr, tr->cur_image->image, im->image);
+		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
 	else
 		/* first time registering */
 		err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	if (err == -EAGAIN) {
+		/* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+		 * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+		 * trampoline again, and retry register.
+		 */
+		/* reset fops->func and fops->trampoline for re-register */
+		tr->fops->func = NULL;
+		tr->fops->trampoline = 0;
+		goto again;
+	}
+#endif
 	if (err)
 		goto out;
+
 	if (tr->cur_image)
 		bpf_tramp_image_put(tr->cur_image);
 	tr->cur_image = im;
 	tr->selector++;
 out:
+	/* If any error happens, restore previous flags */
+	if (err)
+		tr->flags = orig_flags;
 	kfree(tlinks);
 	return err;
 }
@@ -454,7 +577,7 @@  static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
 
 	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
 	tr->progs_cnt[kind]++;
-	err = bpf_trampoline_update(tr);
+	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 	if (err) {
 		hlist_del_init(&link->tramp_hlist);
 		tr->progs_cnt[kind]--;
@@ -487,7 +610,7 @@  static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
 	}
 	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
-	return bpf_trampoline_update(tr);
+	return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 }
 
 /* bpf_trampoline_unlink_prog() should never fail. */
@@ -715,6 +838,7 @@  void bpf_trampoline_put(struct bpf_trampoline *tr)
 	 * multiple rcu callbacks.
 	 */
 	hlist_del(&tr->hlist);
+	kfree(tr->fops);
 	kfree(tr);
 out:
 	mutex_unlock(&trampoline_mutex);