
[v2,bpf-next,5/5] bpf: trampoline: support FTRACE_OPS_FL_SHARE_IPMODIFY

Message ID 20220602193706.2607681-6-song@kernel.org (mailing list archive)
State New, archived
Delegated to: BPF
Series ftrace: host klp and bpf trampoline together

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1444 this patch: 1444
netdev/cc_maintainers warning 5 maintainers not CCed: songliubraving@fb.com yhs@fb.com john.fastabend@gmail.com kafai@fb.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 177 this patch: 177
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1451 this patch: 1451
netdev/checkpatch warning WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 fail Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 fail Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc

Commit Message

Song Liu June 2, 2022, 7:37 p.m. UTC
This allows bpf trampolines to trace kernel functions that are covered by a
live patch. Also, move the bpf trampoline to the *_ftrace_direct_multi APIs,
which allow setting different ftrace_ops flags.

Signed-off-by: Song Liu <song@kernel.org>
---
 include/linux/bpf.h     |   3 ++
 kernel/bpf/trampoline.c | 109 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 99 insertions(+), 13 deletions(-)

Comments

Steven Rostedt July 6, 2022, 7:38 p.m. UTC | #1
On Thu, 2 Jun 2022 12:37:06 -0700
Song Liu <song@kernel.org> wrote:


> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -27,6 +27,44 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
>  /* serializes access to trampoline_table */
>  static DEFINE_MUTEX(trampoline_mutex);
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
> +
> +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
> +{
> +	struct bpf_trampoline *tr = ops->private;
> +	int ret;
> +
> +	/*
> +	 * The normal locking order is
> +	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
> +	 *
> +	 * This is called from prepare_direct_functions_for_ipmodify, with
> +	 * direct_mutex locked. Use mutex_trylock() to avoid dead lock.
> +	 * Also, bpf_trampoline_update here should not lock direct_mutex.
> +	 */
> +	if (!mutex_trylock(&tr->mutex))

Can you comment here that returning -EAGAIN will not cause this to repeat.
That it will change things where the next try will not return -EAGAIN?

> +		return -EAGAIN;
> +
> +	switch (cmd) {
> +	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY:
> +		tr->indirect_call = true;
> +		ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
> +		break;
> +	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY:
> +		tr->indirect_call = false;
> +		tr->fops->flags &= ~FTRACE_OPS_FL_SHARE_IPMODIFY;
> +		ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	};
> +	mutex_unlock(&tr->mutex);
> +	return ret;
> +}
> +#endif
> +
> 


> @@ -330,7 +387,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
>  	return ERR_PTR(err);
>  }
>  
> -static int bpf_trampoline_update(struct bpf_trampoline *tr)
> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
>  {
>  	struct bpf_tramp_image *im;
>  	struct bpf_tramp_links *tlinks;
> @@ -363,20 +420,45 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
>  	if (ip_arg)
>  		flags |= BPF_TRAMP_F_IP_ARG;
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +again:
> +	if (tr->indirect_call)
> +		flags |= BPF_TRAMP_F_ORIG_STACK;
> +#endif
> +
>  	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
>  					  &tr->func.model, flags, tlinks,
>  					  tr->func.addr);
>  	if (err < 0)
>  		goto out;
>  
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +	if (tr->indirect_call)
> +		tr->fops->flags |= FTRACE_OPS_FL_SHARE_IPMODIFY;
> +#endif
> +
>  	WARN_ON(tr->cur_image && tr->selector == 0);
>  	WARN_ON(!tr->cur_image && tr->selector);
>  	if (tr->cur_image)
>  		/* progs already running at this address */
> -		err = modify_fentry(tr, tr->cur_image->image, im->image);
> +		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
>  	else
>  		/* first time registering */
>  		err = register_fentry(tr, im->image);
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> +	if (err == -EAGAIN) {
> +		if (WARN_ON_ONCE(tr->indirect_call))
> +			goto out;
> +		/* should only retry on the first register */
> +		if (WARN_ON_ONCE(tr->cur_image))
> +			goto out;
> +		tr->indirect_call = true;
> +		tr->fops->func = NULL;
> +		tr->fops->trampoline = 0;
> +		goto again;

I'm assuming that the above will prevent a return of -EAGAIN again. As if
it can, then this could turn into a dead lock.

Can you please comment that?

Thanks,

-- Steve

> +	}
> +#endif
>  	if (err)
>  		goto out;
>  	if (tr->cur_image)
> @@ -460,7 +542,7 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
>  
>  	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
>  	tr->progs_cnt[kind]++;
> -	err = bpf_trampoline_update(tr);
> +	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
>  	if (err) {
>  		hlist_del_init(&link->tramp_hlist);
>  		tr->progs_cnt[kind]--;
> @@ -487,7 +569,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin
>  	}
>  	hlist_del_init(&link->tramp_hlist);
>  	tr->progs_cnt[kind]--;
> -	err = bpf_trampoline_update(tr);
> +	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
>  out:
>  	mutex_unlock(&tr->mutex);
>  	return err;
> @@ -535,6 +617,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
>  	 * multiple rcu callbacks.
>  	 */
>  	hlist_del(&tr->hlist);
> +	kfree(tr->fops);
>  	kfree(tr);
>  out:
>  	mutex_unlock(&trampoline_mutex);
Song Liu July 6, 2022, 9:37 p.m. UTC | #2
> On Jul 6, 2022, at 12:38 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Thu, 2 Jun 2022 12:37:06 -0700
> Song Liu <song@kernel.org> wrote:
> 
> 
>> --- a/kernel/bpf/trampoline.c
>> +++ b/kernel/bpf/trampoline.c
>> @@ -27,6 +27,44 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
>> /* serializes access to trampoline_table */
>> static DEFINE_MUTEX(trampoline_mutex);
>> 
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
>> +
>> +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
>> +{
>> +	struct bpf_trampoline *tr = ops->private;
>> +	int ret;
>> +
>> +	/*
>> +	 * The normal locking order is
>> +	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
>> +	 *
>> +	 * This is called from prepare_direct_functions_for_ipmodify, with
>> +	 * direct_mutex locked. Use mutex_trylock() to avoid dead lock.
>> +	 * Also, bpf_trampoline_update here should not lock direct_mutex.
>> +	 */
>> +	if (!mutex_trylock(&tr->mutex))
> 
> Can you comment here that returning -EAGAIN will not cause this to repeat.
> That it will change things where the next try will not return -EAGAIN?

Hmm.. this is not the guarantee here. This conflict is a real race condition 
that an IPMODIFY function (i.e. livepatch) is being registered at the same time 
when something else, for example bpftrace, is updating the BPF trampoline. 

This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch),
and we need to retry there. In the case of livepatch, the retry is initiated 
from user space. 

> 
>> +		return -EAGAIN;
>> +
>> +	switch (cmd) {
>> +	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY:
>> +		tr->indirect_call = true;
>> +		ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
>> +		break;
>> +	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY:
>> +		tr->indirect_call = false;
>> +		tr->fops->flags &= ~FTRACE_OPS_FL_SHARE_IPMODIFY;
>> +		ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
>> +		break;
>> +	default:
>> +		ret = -EINVAL;
>> +		break;
>> +	};
>> +	mutex_unlock(&tr->mutex);
>> +	return ret;
>> +}
>> +#endif
>> +
>> 
> 
> 
>> @@ -330,7 +387,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
>> 	return ERR_PTR(err);
>> }
>> 
>> -static int bpf_trampoline_update(struct bpf_trampoline *tr)
>> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
>> {
>> 	struct bpf_tramp_image *im;
>> 	struct bpf_tramp_links *tlinks;
>> @@ -363,20 +420,45 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
>> 	if (ip_arg)
>> 		flags |= BPF_TRAMP_F_IP_ARG;
>> 
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>> +again:
>> +	if (tr->indirect_call)
>> +		flags |= BPF_TRAMP_F_ORIG_STACK;
>> +#endif
>> +
>> 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
>> 					  &tr->func.model, flags, tlinks,
>> 					  tr->func.addr);
>> 	if (err < 0)
>> 		goto out;
>> 
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>> +	if (tr->indirect_call)
>> +		tr->fops->flags |= FTRACE_OPS_FL_SHARE_IPMODIFY;
>> +#endif
>> +
>> 	WARN_ON(tr->cur_image && tr->selector == 0);
>> 	WARN_ON(!tr->cur_image && tr->selector);
>> 	if (tr->cur_image)
>> 		/* progs already running at this address */
>> -		err = modify_fentry(tr, tr->cur_image->image, im->image);
>> +		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
>> 	else
>> 		/* first time registering */
>> 		err = register_fentry(tr, im->image);
>> +
>> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>> +	if (err == -EAGAIN) {
>> +		if (WARN_ON_ONCE(tr->indirect_call))
>> +			goto out;
>> +		/* should only retry on the first register */
>> +		if (WARN_ON_ONCE(tr->cur_image))
>> +			goto out;
>> +		tr->indirect_call = true;
>> +		tr->fops->func = NULL;
>> +		tr->fops->trampoline = 0;
>> +		goto again;
> 
> I'm assuming that the above will prevent a return of -EAGAIN again. As if
> it can, then this could turn into a dead lock.
> 
> Can you please comment that?

This is a different case. This EAGAIN happens when the live patch came first,
and we register the bpf trampoline later. By enabling tr->indirect_call, we should
not get -EAGAIN from register_fentry.

I will add more comments for both cases. 

Thanks,
Song
Steven Rostedt July 6, 2022, 9:40 p.m. UTC | #3
On Wed, 6 Jul 2022 21:37:52 +0000
Song Liu <songliubraving@fb.com> wrote:

> > Can you comment here that returning -EAGAIN will not cause this to repeat.
> > That it will change things where the next try will not return -EAGAIN?
> 
> Hmm.. this is not the guarantee here. This conflict is a real race condition 
> that an IPMODIFY function (i.e. livepatch) is being registered at the same time 
> when something else, for example bpftrace, is updating the BPF trampoline. 
> 
> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch),
> and we need to retry there. In the case of livepatch, the retry is initiated 
> from user space. 

We need to be careful here then. If there's a userspace application that
runs at real-time and does a:

	do {
		errno = 0;
		register_bpf();
	} while (errno != -EAGAIN);

it could in theory preempt the owner of the lock and never make any
progress.

-- Steve
Song Liu July 6, 2022, 9:50 p.m. UTC | #4
> On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Wed, 6 Jul 2022 21:37:52 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>> Can you comment here that returning -EAGAIN will not cause this to repeat.
>>> That it will change things where the next try will not return -EAGAIN?
>> 
>> Hmm.. this is not the guarantee here. This conflict is a real race condition 
>> that an IPMODIFY function (i.e. livepatch) is being registered at the same time 
>> when something else, for example bpftrace, is updating the BPF trampoline. 
>> 
>> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch),
>> and we need to retry there. In the case of livepatch, the retry is initiated 
>> from user space. 
> 
> We need to be careful here then. If there's a userspace application that
> runs at real-time and does a:
> 
> 	do {
> 		errno = 0;
> 		register_bpf();
> 	} while (errno != -EAGAIN);
> 
> it could in theory preempt the owner of the lock and never make any
> progress.

We can probably work around this with some trick on tr->indirect_call. However,
I don't think this is a real concern from the livepatch side. We have seen many
other issues that cause live patch to fail and require a retry. This race
condition in theory shouldn't cause real-world issues.

Thanks,
Song
Song Liu July 6, 2022, 10:15 p.m. UTC | #5
> On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Wed, 6 Jul 2022 21:37:52 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>> Can you comment here that returning -EAGAIN will not cause this to repeat.
>>> That it will change things where the next try will not return -EAGAIN?
>> 
>> Hmm.. this is not the guarantee here. This conflict is a real race condition 
>> that an IPMODIFY function (i.e. livepatch) is being registered at the same time 
>> when something else, for example bpftrace, is updating the BPF trampoline. 
>> 
>> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch),
>> and we need to retry there. In the case of livepatch, the retry is initiated 
>> from user space. 
> 
> We need to be careful here then. If there's a userspace application that
> runs at real-time and does a:
> 
> 	do {
> 		errno = 0;
> 		register_bpf();
> 	} while (errno != -EAGAIN);

Actually, do you mean:

	do {
		errno = 0;
		register_bpf();
	} while (errno == -EAGAIN);

(== -EAGAIN) here?

In this specific race condition, register_bpf() will succeed, as it already
got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. 

Since both livepatch and bpf trampoline changes are rare operations, I think 
the chance of the race condition is low enough. 

Does this make sense?

Thanks,
Song
Steven Rostedt July 6, 2022, 10:29 p.m. UTC | #6
On Wed, 6 Jul 2022 22:15:47 +0000
Song Liu <songliubraving@fb.com> wrote:

> > On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > On Wed, 6 Jul 2022 21:37:52 +0000
> > Song Liu <songliubraving@fb.com> wrote:
> >   
> >>> Can you comment here that returning -EAGAIN will not cause this to repeat.
> >>> That it will change things where the next try will not return -EAGAIN?
> >> 
> >> Hmm.. this is not the guarantee here. This conflict is a real race condition 
> >> that an IPMODIFY function (i.e. livepatch) is being registered at the same time 
> >> when something else, for example bpftrace, is updating the BPF trampoline. 
> >> 
> >> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch),
> >> and we need to retry there. In the case of livepatch, the retry is initiated 
> >> from user space.   
> > 
> > We need to be careful here then. If there's a userspace application that
> > runs at real-time and does a:
> > 
> > 	do {
> > 		errno = 0;
> > 		register_bpf();
> > 	} while (errno != -EAGAIN);  
> 
> Actually, do you mean:
> 
> 	do {
> 		errno = 0;
> 		register_bpf();
> 	} while (errno == -EAGAIN);
> 
> (== -EAGAIN) here?

Yeah, of course.

> 
> In this specific race condition, register_bpf() will succeed, as it already
> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. 

What else takes the tr->mutex ?

If it preempts anything else taking that mutex, when this runs, then it
needs to be careful.

You said this can happen when the live patch came first. This isn't racing
against live patch, it's racing against anything that takes the tr->mutex
and then adds a bpf trampoline to a location that has a live patch.

> 
> Since both livepatch and bpf trampoline changes are rare operations, I think 
> the chance of the race condition is low enough. 
> 
> Does this make sense?
> 

It's low, and if it is also a privileged operation then there's less to be
concerned about. As if it is not, then we could have a way to deadlock the
system. I'm more concerned that this will lead to a CVE than it just
happening randomly. In other words, it only takes something that can run at
a real-time priority to connect to a live patch location, and something
that runs at a low priority to take a tr->mutex. If an attacker has both,
then it can pin both to a CPU and then cause the deadlock to the system.

One hack to fix this is to add a msleep(1) in the failed case of the
trylock. This will at least give the owner of the lock a millisecond to
release it. This was what the RT patch used to do with spin_trylock() that
was converted to a mutex (and we worked hard to remove all of them).

-- Steve
Song Liu July 7, 2022, 12:19 a.m. UTC | #7
> On Jul 6, 2022, at 3:29 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Wed, 6 Jul 2022 22:15:47 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>> On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>>> 
>>> On Wed, 6 Jul 2022 21:37:52 +0000
>>> Song Liu <songliubraving@fb.com> wrote:
>>> 
>>>>> Can you comment here that returning -EAGAIN will not cause this to repeat.
>>>>> That it will change things where the next try will not return -EAGAIN?
>>>> 
>>>> Hmm.. this is not the guarantee here. This conflict is a real race condition 
>>>> that an IPMODIFY function (i.e. livepatch) is being registered at the same time 
>>>> when something else, for example bpftrace, is updating the BPF trampoline. 
>>>> 
>>>> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch),
>>>> and we need to retry there. In the case of livepatch, the retry is initiated 
>>>> from user space.   
>>> 
>>> We need to be careful here then. If there's a userspace application that
>>> runs at real-time and does a:
>>> 
>>> 	do {
>>> 		errno = 0;
>>> 		register_bpf();
>>> 	} while (errno != -EAGAIN);  
>> 
>> Actually, do you mean:
>> 
>> 	do {
>> 		errno = 0;
>> 		register_bpf();
>> 	} while (errno == -EAGAIN);
>> 
>> (== -EAGAIN) here?
> 
> Yeah, of course.
> 
>> 
>> In this specific race condition, register_bpf() will succeed, as it already
>> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. 
> 
> What else takes the tr->mutex ?

tr->mutex is the local mutex for a single BPF trampoline, we only need to take
it when we make changes to the trampoline (add/remove fentry/fexit programs). 

> 
> If it preempts anything else taking that mutex, when this runs, then it
> needs to be careful.
> 
> You said this can happen when the live patch came first. This isn't racing
> against live patch, it's racing against anything that takes the tr->mutex
> and then adds a bpf trampoline to a location that has a live patch.

There are a few scenarios here:
1. Live patch is already applied, then a BPF trampoline is being registered 
to the same function. In bpf_trampoline_update(), register_fentry returns
-EAGAIN, and this will be resolved. 

2. BPF trampoline is already registered, then a live patch is being applied 
to the same function. The live patch code needs to ask the bpf trampoline to
prepare the trampoline before live patch. This is done by calling 
bpf_tramp_ftrace_ops_func. 

2.1 If nothing else is modifying the trampoline at the same time, 
bpf_tramp_ftrace_ops_func will succeed. 

2.2 In rare cases, if something else is holding tr->mutex to make changes to 
the trampoline (add/remove fentry functions, etc.), mutex_trylock in 
bpf_tramp_ftrace_ops_func will fail, and live patch will fail. However, the 
change to BPF trampoline will still succeed. It is common for live patch to
retry, so we just need to try live patch again when no one is making changes 
to the BPF trampoline in parallel. 
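
To make scenario 2 a bit more concrete, the ftrace side (patch 4/5 of this
series) does roughly the following while registering the IPMODIFY ops, with
direct_mutex already held. This is a simplified sketch, not the exact code:

	/* ops here is the direct ops of the BPF trampoline (tr->fops) */
	ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY);
	if (ret)
		return ret;	/* -EAGAIN is case 2.2: mutex_trylock(&tr->mutex) failed */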

> 
>> 
>> Since both livepatch and bpf trampoline changes are rare operations, I think 
>> the chance of the race condition is low enough. 
>> 
>> Does this make sense?
>> 
> 
> It's low, and if it is also a privileged operation then there's less to be
> concerned about.

Both live patch and BPF trampoline are privileged operations. 

> As if it is not, then we could have a way to deadlock the
> system. I'm more concerned that this will lead to a CVE than it just
> happening randomly. In other words, it only takes something that can run at
> a real-time priority to connect to a live patch location, and something
> that runs at a low priority to take a tr->mutex. If an attacker has both,
> then it can pin both to a CPU and then cause the deadlock to the system.
> 
> One hack to fix this is to add a msleep(1) in the failed case of the
> trylock. This will at least give the owner of the lock a millisecond to
> release it. This was what the RT patch used to do with spin_trylock() that
> was converted to a mutex (and we worked hard to remove all of them).

The fix is really simple. But I still think we don't need it. We only hit
the trylock case for something with IPMODIFY. The non-privileged user 
should not be able to do that, right?

Thanks,
Song
Steven Rostedt July 7, 2022, 1:18 a.m. UTC | #8
On Thu, 7 Jul 2022 00:19:07 +0000
Song Liu <songliubraving@fb.com> wrote:

> >> In this specific race condition, register_bpf() will succeed, as it already
> >> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry.   
> > 
> > What else takes the tr->mutex ?  
> 
> tr->mutex is the local mutex for a single BPF trampoline, we only need to take
> it when we make changes to the trampoline (add/remove fentry/fexit programs). 
> 
> > 
> > If it preempts anything else taking that mutex, when this runs, then it
> > needs to be careful.
> > 
> > You said this can happen when the live patch came first. This isn't racing
> > against live patch, it's racing against anything that takes the tr->mutex
> > and then adds a bpf trampoline to a location that has a live patch.  
> 
> There are a few scenarios here:
> 1. Live patch is already applied, then a BPF trampoline is being registered 
> to the same function. In bpf_trampoline_update(), register_fentry returns
> -EAGAIN, and this will be resolved. 

Where will it be resolved?

> 
> 2. BPF trampoline is already registered, then a live patch is being applied 
> to the same function. The live patch code needs to ask the bpf trampoline to
> prepare the trampoline before live patch. This is done by calling 
> bpf_tramp_ftrace_ops_func. 
> 
> 2.1 If nothing else is modifying the trampoline at the same time, 
> bpf_tramp_ftrace_ops_func will succeed. 
> 
> 2.2 In rare cases, if something else is holding tr->mutex to make changes to 
> the trampoline (add/remove fentry functions, etc.), mutex_trylock in 
> bpf_tramp_ftrace_ops_func will fail, and live patch will fail. However, the 
> change to BPF trampoline will still succeed. It is common for live patch to
> retry, so we just need to try live patch again when no one is making changes 
> to the BPF trampoline in parallel. 

If the live patch is going to try again, and the task doing the live
patch is SCHED_FIFO, and the task holding the tr->mutex is SCHED_OTHER
(or just a lower priority), then there is a chance that the live patch
task preempted the tr->mutex owner, and let's say the tr->mutex owner
is pinned to the CPU (by the user or whatever), then because the live
patch task is in a loop trying to take that mutex, it will never let
the owner continue.

Yes, this is a real scenario with trylock on mutexes. We hit it all the
time in RT.

> 
> >   
> >> 
> >> Since both livepatch and bpf trampoline changes are rare operations, I think 
> >> the chance of the race condition is low enough. 


A low race condition in a world that does this a billion times a day,
ends up being not so rare.

I like to say, "I live in a world where the unlikely is very much likely!"


> >> 
> >> Does this make sense?
> >>   
> > 
> > It's low, and if it is also a privileged operation then there's less to be
> > concerned about.
> 
> Both live patch and BPF trampoline are privileged operations. 

This makes the issue less of an issue, but if there's an application
that does this with setuid or something, there's a chance that it can
be used by an attacker as well.

> 
> > As if it is not, then we could have a way to deadlock the
> > system. I'm more concerned that this will lead to a CVE than it just
> > happening randomly. In other words, it only takes something that can run at
> > a real-time priority to connect to a live patch location, and something
> > that runs at a low priority to take a tr->mutex. If an attacker has both,
> > then it can pin both to a CPU and then cause the deadlock to the system.
> > 
> > One hack to fix this is to add a msleep(1) in the failed case of the
> > trylock. This will at least give the owner of the lock a millisecond to
> > release it. This was what the RT patch use to do with spin_trylock() that
> > was converted to a mutex (and we worked hard to remove all of them).  
> 
> The fix is really simple. But I still think we don't need it. We only hit
> the trylock case for something with IPMODIFY. The non-privileged user 
> should not be able to do that, right?

For now, perhaps. But what useful applications are there going to be in
the future that perform these privileged operations on behalf of a
non-privileged user?

In other words, if we can fix it now, we should, and avoid debugging
this issue 5 years from now where it may take months to figure out.

-- Steve
Song Liu July 7, 2022, 2:11 a.m. UTC | #9
> On Jul 6, 2022, at 6:18 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Thu, 7 Jul 2022 00:19:07 +0000
> Song Liu <songliubraving@fb.com> wrote:
> 
>>>> In this specific race condition, register_bpf() will succeed, as it already
>>>> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry.   
>>> 
>>> What else takes the tr->mutex ?  
>> 
>> tr->mutex is the local mutex for a single BPF trampoline, we only need to take
>> it when we make changes to the trampoline (add/remove fentry/fexit programs). 
>> 
>>> 
>>> If it preempts anything else taking that mutex, when this runs, then it
>>> needs to be careful.
>>> 
>>> You said this can happen when the live patch came first. This isn't racing
>>> against live patch, it's racing against anything that takes the tr->mutex
>>> and then adds a bpf trampoline to a location that has a live patch.  
>> 
>> There are a few scenarios here:
>> 1. Live patch is already applied, then a BPF trampoline is being registered 
>> to the same function. In bpf_trampoline_update(), register_fentry returns
>> -EAGAIN, and this will be resolved. 
> 
> Where will it be resolved?

bpf_trampoline_update() will set tr->indirect_call and goto again. Then the 
trampoline is re-prepared to be able to share with the IPMODIFY functions 
and register_fentry will succeed. 
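
Condensed from the bpf_trampoline_update() hunk in this patch (comments added
to spell out the flow):

	err = register_fentry(tr, im->image);
	if (err == -EAGAIN) {
		/* ip already has an IPMODIFY user (live patch); retry once */
		tr->indirect_call = true;	/* rebuild with BPF_TRAMP_F_ORIG_STACK */
		tr->fops->func = NULL;
		tr->fops->trampoline = 0;
		/* "again" re-prepares the image, sets FTRACE_OPS_FL_SHARE_IPMODIFY,
		 * and calls register_fentry() again, which should now succeed
		 */
		goto again;
	}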

> 
>> 
>> 2. BPF trampoline is already registered, then a live patch is being applied 
>> to the same function. The live patch code need to ask the bpf trampoline to
>> prepare the trampoline before live patch. This is done by calling 
>> bpf_tramp_ftrace_ops_func. 
>> 
>> 2.1 If nothing else is modifying the trampoline at the same time, 
>> bpf_tramp_ftrace_ops_func will succeed. 
>> 
>> 2.2 In rare cases, if something else is holding tr->mutex to make changes to 
>> the trampoline (add/remove fentry functions, etc.), mutex_trylock in 
>> bpf_tramp_ftrace_ops_func will fail, and live patch will fail. However, the 
>> change to BPF trampoline will still succeed. It is common for live patch to
>> retry, so we just need to try live patch again when no one is making changes 
>> to the BPF trampoline in parallel. 
> 
> If the live patch is going to try again, and the task doing the live
> patch is SCHED_FIFO, and the task holding the tr->mutex is SCHED_OTHER
> (or just a lower priority), then there is a chance that the live patch
> task preempted the tr->mutex owner, and let's say the tr->mutex owner
> is pinned to the CPU (by the user or whatever), then because the live
> patch task is in a loop trying to take that mutex, it will never let
> the owner continue.

Yeah, I got this scenario. I just don't think we should run live patch
with high priority. Well, maybe we shouldn't make such assumptions.  

> 
> Yes, this is a real scenario with trylock on mutexes. We hit it all the
> time in RT.
> 
>> 
>>> 
>>>> 
>>>> Since both livepatch and bpf trampoline changes are rare operations, I think 
>>>> the chance of the race condition is low enough. 
> 
> 
> A low race condition in a world that does this a billion times a day,
> ends up being not so rare.
> 
> I like to say, "I live in a world where the unlikely is very much likely!"
> 
> 
>>>> 
>>>> Does this make sense?
>>>> 
>>> 
>>> It's low, and if it is also a privileged operation then there's less to be
>>> concerned about.
>> 
>> Both live patch and BPF trampoline are privileged operations. 
> 
> This makes the issue less of an issue, but if there's an application
> that does this with setuid or something, there's a chance that it can
> be used by an attacker as well.
> 
>> 
>>> As if it is not, then we could have a way to deadlock the
>>> system. I'm more concerned that this will lead to a CVE than it just
>>> happening randomly. In other words, it only takes something that can run at
>>> a real-time priority to connect to a live patch location, and something
>>> that runs at a low priority to take a tr->mutex. If an attacker has both,
>>> then it can pin both to a CPU and then cause the deadlock to the system.
>>> 
>>> One hack to fix this is to add a msleep(1) in the failed case of the
>>> trylock. This will at least give the owner of the lock a millisecond to
>>> release it. This was what the RT patch used to do with spin_trylock() that
>>> was converted to a mutex (and we worked hard to remove all of them).  
>> 
>> The fix is really simple. But I still think we don't need it. We only hit
>> the trylock case for something with IPMODIFY. The non-privileged user 
>> should not be able to do that, right?
> 
> For now, perhaps. But what useful applications are there going to be in
> the future that perform these privileged operations on behalf of a
> non-privileged user?
> 
> In other words, if we can fix it now, we should, and avoid debugging
> this issue 5 years from now where it may take months to figure out.

Fair enough.. I guess we will just add the msleep(1) in the -EAGAIN
path. If this sounds good, I will send v3 with this change and more
comments. 
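
Roughly something like this (sketch only, not the final v3 code; sleeping is
fine here because the caller already holds direct_mutex, a sleeping lock):

	if (!mutex_trylock(&tr->mutex)) {
		/* msleep() from <linux/delay.h>: give a lower-priority
		 * tr->mutex owner a chance to run and release the lock
		 * before the IPMODIFY side retries
		 */
		msleep(1);
		return -EAGAIN;
	}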

Thanks,
Song

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a6e06f384e81..20a8ed600ca6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -44,6 +44,7 @@  struct kobject;
 struct mem_cgroup;
 struct module;
 struct bpf_func_state;
+struct ftrace_ops;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -816,6 +817,7 @@  struct bpf_tramp_image {
 struct bpf_trampoline {
 	/* hlist for trampoline_table */
 	struct hlist_node hlist;
+	struct ftrace_ops *fops;
 	/* serializes access to fields of this trampoline */
 	struct mutex mutex;
 	refcount_t refcnt;
@@ -838,6 +840,7 @@  struct bpf_trampoline {
 	struct bpf_tramp_image *cur_image;
 	u64 selector;
 	struct module *mod;
+	bool indirect_call;
 };
 
 struct bpf_attach_target_info {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 93c7675f0c9e..447c788c5520 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -27,6 +27,44 @@  static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline_table */
 static DEFINE_MUTEX(trampoline_mutex);
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+{
+	struct bpf_trampoline *tr = ops->private;
+	int ret;
+
+	/*
+	 * The normal locking order is
+	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+	 *
+	 * This is called from prepare_direct_functions_for_ipmodify, with
+	 * direct_mutex locked. Use mutex_trylock() to avoid dead lock.
+	 * Also, bpf_trampoline_update here should not lock direct_mutex.
+	 */
+	if (!mutex_trylock(&tr->mutex))
+		return -EAGAIN;
+
+	switch (cmd) {
+	case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY:
+		tr->indirect_call = true;
+		ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+		break;
+	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY:
+		tr->indirect_call = false;
+		tr->fops->flags &= ~FTRACE_OPS_FL_SHARE_IPMODIFY;
+		ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	};
+	mutex_unlock(&tr->mutex);
+	return ret;
+}
+#endif
+
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 {
 	enum bpf_attach_type eatype = prog->expected_attach_type;
@@ -87,7 +125,16 @@  static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
 	if (!tr)
 		goto out;
-
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+	if (!tr->fops) {
+		kfree(tr);
+		tr = NULL;
+		goto out;
+	}
+	tr->fops->private = tr;
+	tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
 	tr->key = key;
 	INIT_HLIST_NODE(&tr->hlist);
 	hlist_add_head(&tr->hlist, head);
@@ -126,7 +173,7 @@  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	int ret;
 
 	if (tr->func.ftrace_managed)
-		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
@@ -135,15 +182,20 @@  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	return ret;
 }
 
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+			 bool lock_direct_mutex)
 {
 	void *ip = tr->func.addr;
 	int ret;
 
-	if (tr->func.ftrace_managed)
-		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
-	else
+	if (tr->func.ftrace_managed) {
+		if (lock_direct_mutex)
+			ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+		else
+			ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+	}
 	return ret;
 }
 
@@ -161,10 +213,15 @@  static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 	if (bpf_trampoline_module_get(tr))
 		return -ENOENT;
 
-	if (tr->func.ftrace_managed)
-		ret = register_ftrace_direct((long)ip, (long)new_addr);
-	else
+	if (tr->func.ftrace_managed) {
+		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0);
+		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+		if (ret)
+			ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 1, 0);
+
+	} else {
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+	}
 
 	if (ret)
 		bpf_trampoline_module_put(tr);
@@ -330,7 +387,7 @@  static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 	return ERR_PTR(err);
 }
 
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
 {
 	struct bpf_tramp_image *im;
 	struct bpf_tramp_links *tlinks;
@@ -363,20 +420,45 @@  static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	if (ip_arg)
 		flags |= BPF_TRAMP_F_IP_ARG;
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+	if (tr->indirect_call)
+		flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
+
 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
 					  &tr->func.model, flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
 		goto out;
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	if (tr->indirect_call)
+		tr->fops->flags |= FTRACE_OPS_FL_SHARE_IPMODIFY;
+#endif
+
 	WARN_ON(tr->cur_image && tr->selector == 0);
 	WARN_ON(!tr->cur_image && tr->selector);
 	if (tr->cur_image)
 		/* progs already running at this address */
-		err = modify_fentry(tr, tr->cur_image->image, im->image);
+		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
 	else
 		/* first time registering */
 		err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	if (err == -EAGAIN) {
+		if (WARN_ON_ONCE(tr->indirect_call))
+			goto out;
+		/* should only retry on the first register */
+		if (WARN_ON_ONCE(tr->cur_image))
+			goto out;
+		tr->indirect_call = true;
+		tr->fops->func = NULL;
+		tr->fops->trampoline = 0;
+		goto again;
+	}
+#endif
 	if (err)
 		goto out;
 	if (tr->cur_image)
@@ -460,7 +542,7 @@  int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
 
 	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
 	tr->progs_cnt[kind]++;
-	err = bpf_trampoline_update(tr);
+	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 	if (err) {
 		hlist_del_init(&link->tramp_hlist);
 		tr->progs_cnt[kind]--;
@@ -487,7 +569,7 @@  int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin
 	}
 	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
-	err = bpf_trampoline_update(tr);
+	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 out:
 	mutex_unlock(&tr->mutex);
 	return err;
@@ -535,6 +617,7 @@  void bpf_trampoline_put(struct bpf_trampoline *tr)
 	 * multiple rcu callbacks.
 	 */
 	hlist_del(&tr->hlist);
+	kfree(tr->fops);
 	kfree(tr);
 out:
 	mutex_unlock(&trampoline_mutex);