Message ID | 20220602193706.2607681-6-song@kernel.org |
---|---|
State | New, archived |
Delegated to: | BPF |
Series | ftrace: host klp and bpf trampoline together |
On Thu, 2 Jun 2022 12:37:06 -0700 Song Liu <song@kernel.org> wrote: > --- a/kernel/bpf/trampoline.c > +++ b/kernel/bpf/trampoline.c > @@ -27,6 +27,44 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; > /* serializes access to trampoline_table */ > static DEFINE_MUTEX(trampoline_mutex); > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS > +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); > + > +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) > +{ > + struct bpf_trampoline *tr = ops->private; > + int ret; > + > + /* > + * The normal locking order is > + * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) > + * > + * This is called from prepare_direct_functions_for_ipmodify, with > + * direct_mutex locked. Use mutex_trylock() to avoid dead lock. > + * Also, bpf_trampoline_update here should not lock direct_mutex. > + */ > + if (!mutex_trylock(&tr->mutex)) Can you comment here that returning -EAGAIN will not cause this to repeat. That it will change things where the next try will not return -EGAIN? > + return -EAGAIN; > + > + switch (cmd) { > + case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY: > + tr->indirect_call = true; > + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); > + break; > + case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY: > + tr->indirect_call = false; > + tr->fops->flags &= ~FTRACE_OPS_FL_SHARE_IPMODIFY; > + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); > + break; > + default: > + ret = -EINVAL; > + break; > + }; > + mutex_unlock(&tr->mutex); > + return ret; > +} > +#endif > + > > @@ -330,7 +387,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) > return ERR_PTR(err); > } > > -static int bpf_trampoline_update(struct bpf_trampoline *tr) > +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) > { > struct bpf_tramp_image *im; > struct bpf_tramp_links *tlinks; > @@ -363,20 +420,45 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) > if (ip_arg) > flags |= BPF_TRAMP_F_IP_ARG; > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS > +again: > + if (tr->indirect_call) > + flags |= BPF_TRAMP_F_ORIG_STACK; > +#endif > + > err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, > &tr->func.model, flags, tlinks, > tr->func.addr); > if (err < 0) > goto out; > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS > + if (tr->indirect_call) > + tr->fops->flags |= FTRACE_OPS_FL_SHARE_IPMODIFY; > +#endif > + > WARN_ON(tr->cur_image && tr->selector == 0); > WARN_ON(!tr->cur_image && tr->selector); > if (tr->cur_image) > /* progs already running at this address */ > - err = modify_fentry(tr, tr->cur_image->image, im->image); > + err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); > else > /* first time registering */ > err = register_fentry(tr, im->image); > + > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS > + if (err == -EAGAIN) { > + if (WARN_ON_ONCE(tr->indirect_call)) > + goto out; > + /* should only retry on the first register */ > + if (WARN_ON_ONCE(tr->cur_image)) > + goto out; > + tr->indirect_call = true; > + tr->fops->func = NULL; > + tr->fops->trampoline = 0; > + goto again; I'm assuming that the above will prevent a return of -EAGAIN again. As if it can, then this could turn into a dead lock. Can you please comment that? 
Thanks, -- Steve > + } > +#endif > if (err) > goto out; > if (tr->cur_image) > @@ -460,7 +542,7 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline > > hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); > tr->progs_cnt[kind]++; > - err = bpf_trampoline_update(tr); > + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); > if (err) { > hlist_del_init(&link->tramp_hlist); > tr->progs_cnt[kind]--; > @@ -487,7 +569,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin > } > hlist_del_init(&link->tramp_hlist); > tr->progs_cnt[kind]--; > - err = bpf_trampoline_update(tr); > + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); > out: > mutex_unlock(&tr->mutex); > return err; > @@ -535,6 +617,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) > * multiple rcu callbacks. > */ > hlist_del(&tr->hlist); > + kfree(tr->fops); > kfree(tr); > out: > mutex_unlock(&trampoline_mutex);
> On Jul 6, 2022, at 12:38 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > On Thu, 2 Jun 2022 12:37:06 -0700 > Song Liu <song@kernel.org> wrote: > > >> --- a/kernel/bpf/trampoline.c >> +++ b/kernel/bpf/trampoline.c >> @@ -27,6 +27,44 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; >> /* serializes access to trampoline_table */ >> static DEFINE_MUTEX(trampoline_mutex); >> >> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS >> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); >> + >> +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) >> +{ >> + struct bpf_trampoline *tr = ops->private; >> + int ret; >> + >> + /* >> + * The normal locking order is >> + * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) >> + * >> + * This is called from prepare_direct_functions_for_ipmodify, with >> + * direct_mutex locked. Use mutex_trylock() to avoid dead lock. >> + * Also, bpf_trampoline_update here should not lock direct_mutex. >> + */ >> + if (!mutex_trylock(&tr->mutex)) > > Can you comment here that returning -EAGAIN will not cause this to repeat. > That it will change things where the next try will not return -EGAIN? Hmm.. this is not the guarantee here. This conflict is a real race condition that an IPMODIFY function (i.e. livepatch) is being registered at the same time when something else, for example bpftrace, is updating the BPF trampoline. This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch), and we need to retry there. In the case of livepatch, the retry is initiated from user space. > >> + return -EAGAIN; >> + >> + switch (cmd) { >> + case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY: >> + tr->indirect_call = true; >> + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); >> + break; >> + case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY: >> + tr->indirect_call = false; >> + tr->fops->flags &= ~FTRACE_OPS_FL_SHARE_IPMODIFY; >> + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); >> + break; >> + default: >> + ret = -EINVAL; >> + break; >> + }; >> + mutex_unlock(&tr->mutex); >> + return ret; >> +} >> +#endif >> + >> > > >> @@ -330,7 +387,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) >> return ERR_PTR(err); >> } >> >> -static int bpf_trampoline_update(struct bpf_trampoline *tr) >> +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) >> { >> struct bpf_tramp_image *im; >> struct bpf_tramp_links *tlinks; >> @@ -363,20 +420,45 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) >> if (ip_arg) >> flags |= BPF_TRAMP_F_IP_ARG; >> >> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS >> +again: >> + if (tr->indirect_call) >> + flags |= BPF_TRAMP_F_ORIG_STACK; >> +#endif >> + >> err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, >> &tr->func.model, flags, tlinks, >> tr->func.addr); >> if (err < 0) >> goto out; >> >> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS >> + if (tr->indirect_call) >> + tr->fops->flags |= FTRACE_OPS_FL_SHARE_IPMODIFY; >> +#endif >> + >> WARN_ON(tr->cur_image && tr->selector == 0); >> WARN_ON(!tr->cur_image && tr->selector); >> if (tr->cur_image) >> /* progs already running at this address */ >> - err = modify_fentry(tr, tr->cur_image->image, im->image); >> + err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); >> else >> /* first time registering */ >> err = register_fentry(tr, im->image); >> + >> +#ifdef 
CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS >> + if (err == -EAGAIN) { >> + if (WARN_ON_ONCE(tr->indirect_call)) >> + goto out; >> + /* should only retry on the first register */ >> + if (WARN_ON_ONCE(tr->cur_image)) >> + goto out; >> + tr->indirect_call = true; >> + tr->fops->func = NULL; >> + tr->fops->trampoline = 0; >> + goto again; > > I'm assuming that the above will prevent a return of -EAGAIN again. As if > it can, then this could turn into a dead lock. > > Can you please comment that? This is a different case. This EAGAIN happens when the live patch came first, and we register bpf trampoline later. By enabling tr->indirect_call, we should not get EAGAIN from register_fentry. I will add more comments for both cases. Thanks, Song
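For reference, a sketch of that retry block with comments along the lines Song describes; the comment wording below is illustrative, not the v3 patch, and the code is the hunk from this series:

#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
	if (err == -EAGAIN) {
		/*
		 * -EAGAIN from register_fentry() means an IPMODIFY user
		 * (e.g. a live patch) already owns this function.  Rebuild
		 * the trampoline with BPF_TRAMP_F_ORIG_STACK and set
		 * FTRACE_OPS_FL_SHARE_IPMODIFY so it can share the function
		 * with the IPMODIFY user.  Because indirect_call is now
		 * true, the retried register_fentry() cannot fail with
		 * -EAGAIN again, so "goto again" runs at most once.
		 */
		if (WARN_ON_ONCE(tr->indirect_call))
			goto out;
		/* should only retry on the first register */
		if (WARN_ON_ONCE(tr->cur_image))
			goto out;
		tr->indirect_call = true;
		tr->fops->func = NULL;
		tr->fops->trampoline = 0;
		goto again;
	}
#endif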
On Wed, 6 Jul 2022 21:37:52 +0000 Song Liu <songliubraving@fb.com> wrote: > > Can you comment here that returning -EAGAIN will not cause this to repeat. > > That it will change things where the next try will not return -EGAIN? > > Hmm.. this is not the guarantee here. This conflict is a real race condition > that an IPMODIFY function (i.e. livepatch) is being registered at the same time > when something else, for example bpftrace, is updating the BPF trampoline. > > This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch), > and we need to retry there. In the case of livepatch, the retry is initiated > from user space. We need to be careful here then. If there's a userspace application that runs at real-time and does a: do { errno = 0; regsiter_bpf(); } while (errno != -EAGAIN); it could in theory preempt the owner of the lock and never make any progress. -- Steve
> On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > On Wed, 6 Jul 2022 21:37:52 +0000 > Song Liu <songliubraving@fb.com> wrote: > >>> Can you comment here that returning -EAGAIN will not cause this to repeat. >>> That it will change things where the next try will not return -EGAIN? >> >> Hmm.. this is not the guarantee here. This conflict is a real race condition >> that an IPMODIFY function (i.e. livepatch) is being registered at the same time >> when something else, for example bpftrace, is updating the BPF trampoline. >> >> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch), >> and we need to retry there. In the case of livepatch, the retry is initiated >> from user space. > > We need to be careful here then. If there's a userspace application that > runs at real-time and does a: > > do { > errno = 0; > regsiter_bpf(); > } while (errno != -EAGAIN); > > it could in theory preempt the owner of the lock and never make any > progress. We can probably workaround this with some trick on tr->indirect_call. However, I don't think this is a real concern from livepatch side. We have seen many other issues that cause live patch to fail and requires retry. This race condition in theory shouldn't cause real world issues. Thanks, Song
> On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > On Wed, 6 Jul 2022 21:37:52 +0000 > Song Liu <songliubraving@fb.com> wrote: > >>> Can you comment here that returning -EAGAIN will not cause this to repeat. >>> That it will change things where the next try will not return -EGAIN? >> >> Hmm.. this is not the guarantee here. This conflict is a real race condition >> that an IPMODIFY function (i.e. livepatch) is being registered at the same time >> when something else, for example bpftrace, is updating the BPF trampoline. >> >> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch), >> and we need to retry there. In the case of livepatch, the retry is initiated >> from user space. > > We need to be careful here then. If there's a userspace application that > runs at real-time and does a: > > do { > errno = 0; > regsiter_bpf(); > } while (errno != -EAGAIN); Actually, do you mean: do { errno = 0; regsiter_bpf(); } while (errno == -EAGAIN); (== -EAGAIN) here? In this specific race condition, register_bpf() will succeed, as it already got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. Since both livepatch and bpf trampoline changes are rare operations, I think the chance of the race condition is low enough. Does this make sense? Thanks, Song
On Wed, 6 Jul 2022 22:15:47 +0000 Song Liu <songliubraving@fb.com> wrote: > > On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > On Wed, 6 Jul 2022 21:37:52 +0000 > > Song Liu <songliubraving@fb.com> wrote: > > > >>> Can you comment here that returning -EAGAIN will not cause this to repeat. > >>> That it will change things where the next try will not return -EGAIN? > >> > >> Hmm.. this is not the guarantee here. This conflict is a real race condition > >> that an IPMODIFY function (i.e. livepatch) is being registered at the same time > >> when something else, for example bpftrace, is updating the BPF trampoline. > >> > >> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch), > >> and we need to retry there. In the case of livepatch, the retry is initiated > >> from user space. > > > > We need to be careful here then. If there's a userspace application that > > runs at real-time and does a: > > > > do { > > errno = 0; > > regsiter_bpf(); > > } while (errno != -EAGAIN); > > Actually, do you mean: > > do { > errno = 0; > regsiter_bpf(); > } while (errno == -EAGAIN); > > (== -EAGAIN) here? Yeah, of course. > > In this specific race condition, register_bpf() will succeed, as it already > got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. What else takes the tr->mutex ? If it preempts anything else taking that mutex, when this runs, then it needs to be careful. You said this can happen when the live patch came first. This isn't racing against live patch, it's racing against anything that takes the tr->mutex and then adds a bpf trampoline to a location that has a live patch. > > Since both livepatch and bpf trampoline changes are rare operations, I think > the chance of the race condition is low enough. > > Does this make sense? > It's low, and if it is also a privileged operation then there's less to be concern about. As if it is not, then we could have a way to deadlock the system. I'm more concerned that this will lead to a CVE than it just happening randomly. In other words, it only takes something that can run at a real-time priority to connect to a live patch location, and something that runs at a low priority to take a tr->mutex. If an attacker has both, then it can pin both to a CPU and then cause the deadlock to the system. One hack to fix this is to add a msleep(1) in the failed case of the trylock. This will at least give the owner of the lock a millisecond to release it. This was what the RT patch use to do with spin_trylock() that was converted to a mutex (and we worked hard to remove all of them). -- Steve
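A minimal sketch of the msleep(1) backoff suggested above, applied to the trylock in bpf_tramp_ftrace_ops_func(); this assumes linux/delay.h is available there, and the exact placement is up to a later revision:

	/*
	 * Back off for a millisecond before reporting -EAGAIN, so a
	 * higher-priority task looping on the retry cannot keep the
	 * tr->mutex owner from ever running.
	 */
	if (!mutex_trylock(&tr->mutex)) {
		msleep(1);
		return -EAGAIN;
	}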
> On Jul 6, 2022, at 3:29 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > On Wed, 6 Jul 2022 22:15:47 +0000 > Song Liu <songliubraving@fb.com> wrote: > >>> On Jul 6, 2022, at 2:40 PM, Steven Rostedt <rostedt@goodmis.org> wrote: >>> >>> On Wed, 6 Jul 2022 21:37:52 +0000 >>> Song Liu <songliubraving@fb.com> wrote: >>> >>>>> Can you comment here that returning -EAGAIN will not cause this to repeat. >>>>> That it will change things where the next try will not return -EGAIN? >>>> >>>> Hmm.. this is not the guarantee here. This conflict is a real race condition >>>> that an IPMODIFY function (i.e. livepatch) is being registered at the same time >>>> when something else, for example bpftrace, is updating the BPF trampoline. >>>> >>>> This EAGAIN will propagate to the user of the IPMODIFY function (i.e. livepatch), >>>> and we need to retry there. In the case of livepatch, the retry is initiated >>>> from user space. >>> >>> We need to be careful here then. If there's a userspace application that >>> runs at real-time and does a: >>> >>> do { >>> errno = 0; >>> regsiter_bpf(); >>> } while (errno != -EAGAIN); >> >> Actually, do you mean: >> >> do { >> errno = 0; >> regsiter_bpf(); >> } while (errno == -EAGAIN); >> >> (== -EAGAIN) here? > > Yeah, of course. > >> >> In this specific race condition, register_bpf() will succeed, as it already >> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. > > What else takes the tr->mutex ? tr->mutex is the local mutex for a single BPF trampoline, we only need to take it when we make changes to the trampoline (add/remove fentry/fexit programs). > > If it preempts anything else taking that mutex, when this runs, then it > needs to be careful. > > You said this can happen when the live patch came first. This isn't racing > against live patch, it's racing against anything that takes the tr->mutex > and then adds a bpf trampoline to a location that has a live patch. There are a few scenarios here: 1. Live patch is already applied, then a BPF trampoline is being registered to the same function. In bpf_trampoline_update(), register_fentry returns -EAGAIN, and this will be resolved. 2. BPF trampoline is already registered, then a live patch is being applied to the same function. The live patch code need to ask the bpf trampoline to prepare the trampoline before live patch. This is done by calling bpf_tramp_ftrace_ops_func. 2.1 If nothing else is modifying the trampoline at the same time, bpf_tramp_ftrace_ops_func will succeed. 2.2 In rare cases, if something else is holding tr->mutex to make changes to the trampoline (add/remove fentry functions, etc.), mutex_trylock in bpf_tramp_ftrace_ops_func will fail, and live patch will fail. However, the change to BPF trampoline will still succeed. It is common for live patch to retry, so we just need to try live patch again when no one is making changes to the BPF trampoline in parallel. > >> >> Since both livepatch and bpf trampoline changes are rare operations, I think >> the chance of the race condition is low enough. >> >> Does this make sense? >> > > It's low, and if it is also a privileged operation then there's less to be > concern about. Both live patch and BPF trampoline are privileged operations. > As if it is not, then we could have a way to deadlock the > system. I'm more concerned that this will lead to a CVE than it just > happening randomly. 
In other words, it only takes something that can run at > a real-time priority to connect to a live patch location, and something > that runs at a low priority to take a tr->mutex. If an attacker has both, > then it can pin both to a CPU and then cause the deadlock to the system. > > One hack to fix this is to add a msleep(1) in the failed case of the > trylock. This will at least give the owner of the lock a millisecond to > release it. This was what the RT patch use to do with spin_trylock() that > was converted to a mutex (and we worked hard to remove all of them). The fix is really simple. But I still think we don't need it. We only hit the trylock case for something with IPMODIFY. The non-privileged user should not be able to do that, right? Thanks, Song
On Thu, 7 Jul 2022 00:19:07 +0000 Song Liu <songliubraving@fb.com> wrote: > >> In this specific race condition, register_bpf() will succeed, as it already > >> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. > > > > What else takes the tr->mutex ? > > tr->mutex is the local mutex for a single BPF trampoline, we only need to take > it when we make changes to the trampoline (add/remove fentry/fexit programs). > > > > > If it preempts anything else taking that mutex, when this runs, then it > > needs to be careful. > > > > You said this can happen when the live patch came first. This isn't racing > > against live patch, it's racing against anything that takes the tr->mutex > > and then adds a bpf trampoline to a location that has a live patch. > > There are a few scenarios here: > 1. Live patch is already applied, then a BPF trampoline is being registered > to the same function. In bpf_trampoline_update(), register_fentry returns > -EAGAIN, and this will be resolved. Where will it be resolved? > > 2. BPF trampoline is already registered, then a live patch is being applied > to the same function. The live patch code need to ask the bpf trampoline to > prepare the trampoline before live patch. This is done by calling > bpf_tramp_ftrace_ops_func. > > 2.1 If nothing else is modifying the trampoline at the same time, > bpf_tramp_ftrace_ops_func will succeed. > > 2.2 In rare cases, if something else is holding tr->mutex to make changes to > the trampoline (add/remove fentry functions, etc.), mutex_trylock in > bpf_tramp_ftrace_ops_func will fail, and live patch will fail. However, the > change to BPF trampoline will still succeed. It is common for live patch to > retry, so we just need to try live patch again when no one is making changes > to the BPF trampoline in parallel. If the live patch is going to try again, and the task doing the live patch is SCHED_FIFO, and the task holding the tr->mutex is SCHED_OTHER (or just a lower priority), then there is a chance that the live patch task preempted the tr->mutex owner, and let's say the tr->mutex owner is pinned to the CPU (by the user or whatever), then because the live patch task is in a loop trying to take that mutex, it will never let the owner continue. Yes, this is a real scenario with trylock on mutexes. We hit it all the time in RT. > > > > >> > >> Since both livepatch and bpf trampoline changes are rare operations, I think > >> the chance of the race condition is low enough. A low race condition in a world that does this a billion times a day, ends up being not so rare. I like to say, "I live in a world where the unlikely is very much likely!" > >> > >> Does this make sense? > >> > > > > It's low, and if it is also a privileged operation then there's less to be > > concern about. > > Both live patch and BPF trampoline are privileged operations. This makes the issue less of an issue, but if there's an application that does this with setuid or something, there's a chance that it can be used by an attacker as well. > > > As if it is not, then we could have a way to deadlock the > > system. I'm more concerned that this will lead to a CVE than it just > > happening randomly. In other words, it only takes something that can run at > > a real-time priority to connect to a live patch location, and something > > that runs at a low priority to take a tr->mutex. If an attacker has both, > > then it can pin both to a CPU and then cause the deadlock to the system. 
> > > > One hack to fix this is to add a msleep(1) in the failed case of the > > trylock. This will at least give the owner of the lock a millisecond to > > release it. This was what the RT patch use to do with spin_trylock() that > > was converted to a mutex (and we worked hard to remove all of them). > > The fix is really simple. But I still think we don't need it. We only hit > the trylock case for something with IPMODIFY. The non-privileged user > should not be able to do that, right? For now, perhaps. But what useful applications are there going to be in the future that performs these privileged operations on behalf of a non-privileged user? In other words, if we can fix it now, we should, and avoid debugging this issue 5 years from now where it may take months to figure out. -- Steve
> On Jul 6, 2022, at 6:18 PM, Steven Rostedt <rostedt@goodmis.org> wrote: > > On Thu, 7 Jul 2022 00:19:07 +0000 > Song Liu <songliubraving@fb.com> wrote: > >>>> In this specific race condition, register_bpf() will succeed, as it already >>>> got tr->mutex. But the IPMODIFY (livepatch) side will fail and retry. >>> >>> What else takes the tr->mutex ? >> >> tr->mutex is the local mutex for a single BPF trampoline, we only need to take >> it when we make changes to the trampoline (add/remove fentry/fexit programs). >> >>> >>> If it preempts anything else taking that mutex, when this runs, then it >>> needs to be careful. >>> >>> You said this can happen when the live patch came first. This isn't racing >>> against live patch, it's racing against anything that takes the tr->mutex >>> and then adds a bpf trampoline to a location that has a live patch. >> >> There are a few scenarios here: >> 1. Live patch is already applied, then a BPF trampoline is being registered >> to the same function. In bpf_trampoline_update(), register_fentry returns >> -EAGAIN, and this will be resolved. > > Where will it be resolved? bpf_trampoline_update() will set tr->indirect_call and goto again. Then the trampoline is re-prepared to be able to share with the IPMODIFY functions and register_fentry will succeed. > >> >> 2. BPF trampoline is already registered, then a live patch is being applied >> to the same function. The live patch code need to ask the bpf trampoline to >> prepare the trampoline before live patch. This is done by calling >> bpf_tramp_ftrace_ops_func. >> >> 2.1 If nothing else is modifying the trampoline at the same time, >> bpf_tramp_ftrace_ops_func will succeed. >> >> 2.2 In rare cases, if something else is holding tr->mutex to make changes to >> the trampoline (add/remove fentry functions, etc.), mutex_trylock in >> bpf_tramp_ftrace_ops_func will fail, and live patch will fail. However, the >> change to BPF trampoline will still succeed. It is common for live patch to >> retry, so we just need to try live patch again when no one is making changes >> to the BPF trampoline in parallel. > > If the live patch is going to try again, and the task doing the live > patch is SCHED_FIFO, and the task holding the tr->mutex is SCHED_OTHER > (or just a lower priority), then there is a chance that the live patch > task preempted the tr->mutex owner, and let's say the tr->mutex owner > is pinned to the CPU (by the user or whatever), then because the live > patch task is in a loop trying to take that mutex, it will never let > the owner continue. Yeah, I got this scenario. I just don't think we should run live patch with high priority. Well, maybe we shouldn't make such assumptions. > > Yes, this is a real scenario with trylock on mutexes. We hit it all the > time in RT. > >> >>> >>>> >>>> Since both livepatch and bpf trampoline changes are rare operations, I think >>>> the chance of the race condition is low enough. > > > A low race condition in a world that does this a billion times a day, > ends up being not so rare. > > I like to say, "I live in a world where the unlikely is very much likely!" > > >>>> >>>> Does this make sense? >>>> >>> >>> It's low, and if it is also a privileged operation then there's less to be >>> concern about. >> >> Both live patch and BPF trampoline are privileged operations. > > This makes the issue less of an issue, but if there's an application > that does this with setuid or something, there's a chance that it can > be used by an attacker as well. 
> >> >>> As if it is not, then we could have a way to deadlock the >>> system. I'm more concerned that this will lead to a CVE than it just >>> happening randomly. In other words, it only takes something that can run at >>> a real-time priority to connect to a live patch location, and something >>> that runs at a low priority to take a tr->mutex. If an attacker has both, >>> then it can pin both to a CPU and then cause the deadlock to the system. >>> >>> One hack to fix this is to add a msleep(1) in the failed case of the >>> trylock. This will at least give the owner of the lock a millisecond to >>> release it. This was what the RT patch use to do with spin_trylock() that >>> was converted to a mutex (and we worked hard to remove all of them). >> >> The fix is really simple. But I still think we don't need it. We only hit >> the trylock case for something with IPMODIFY. The non-privileged user >> should not be able to do that, right? > > For now, perhaps. But what useful applications are there going to be in > the future that performs these privileged operations on behalf of a > non-privileged user? > > In other words, if we can fix it now, we should, and avoid debugging > this issue 5 years from now where it may take months to figure out. Fair enough.. I guess we will just add the msleep(1) in the -EAGAIN path. If this sounds good, I will send v3 with this change and more comments. Thanks, Song
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6e06f384e81..20a8ed600ca6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -44,6 +44,7 @@ struct kobject; struct mem_cgroup; struct module; struct bpf_func_state; +struct ftrace_ops; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -816,6 +817,7 @@ struct bpf_tramp_image { struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; + struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; @@ -838,6 +840,7 @@ struct bpf_trampoline { struct bpf_tramp_image *cur_image; u64 selector; struct module *mod; + bool indirect_call; }; struct bpf_attach_target_info { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 93c7675f0c9e..447c788c5520 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -27,6 +27,44 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); + +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +{ + struct bpf_trampoline *tr = ops->private; + int ret; + + /* + * The normal locking order is + * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * + * This is called from prepare_direct_functions_for_ipmodify, with + * direct_mutex locked. Use mutex_trylock() to avoid dead lock. + * Also, bpf_trampoline_update here should not lock direct_mutex. + */ + if (!mutex_trylock(&tr->mutex)) + return -EAGAIN; + + switch (cmd) { + case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY: + tr->indirect_call = true; + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY: + tr->indirect_call = false; + tr->fops->flags &= ~FTRACE_OPS_FL_SHARE_IPMODIFY; + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + default: + ret = -EINVAL; + break; + }; + mutex_unlock(&tr->mutex); + return ret; +} +#endif + bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; @@ -87,7 +125,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; - +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) { + kfree(tr); + tr = NULL; + goto out; + } + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; +#endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); @@ -126,7 +173,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct((long)ip, (long)old_addr); + ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); @@ -135,15 +182,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) +static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, + bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; - if (tr->func.ftrace_managed) - ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); - else + if 
(tr->func.ftrace_managed) { + if (lock_direct_mutex) + ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr); + else + ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + } return ret; } @@ -161,10 +213,15 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) if (bpf_trampoline_module_get(tr)) return -ENOENT; - if (tr->func.ftrace_managed) - ret = register_ftrace_direct((long)ip, (long)new_addr); - else + if (tr->func.ftrace_managed) { + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 0); + ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); + if (ret) + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 1, 0); + + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + } if (ret) bpf_trampoline_module_put(tr); @@ -330,7 +387,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; @@ -363,20 +420,45 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (ip_arg) flags |= BPF_TRAMP_F_IP_ARG; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +again: + if (tr->indirect_call) + flags |= BPF_TRAMP_F_ORIG_STACK; +#endif + err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, flags, tlinks, tr->func.addr); if (err < 0) goto out; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + if (tr->indirect_call) + tr->fops->flags |= FTRACE_OPS_FL_SHARE_IPMODIFY; +#endif + WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image); + err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + if (err == -EAGAIN) { + if (WARN_ON_ONCE(tr->indirect_call)) + goto out; + /* should only retry on the first register */ + if (WARN_ON_ONCE(tr->cur_image)) + goto out; + tr->indirect_call = true; + tr->fops->func = NULL; + tr->fops->trampoline = 0; + goto again; + } +#endif if (err) goto out; if (tr->cur_image) @@ -460,7 +542,7 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(tr); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; @@ -487,7 +569,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(tr); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); out: mutex_unlock(&tr->mutex); return err; @@ -535,6 +617,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * multiple rcu callbacks. */ hlist_del(&tr->hlist); + kfree(tr->fops); kfree(tr); out: mutex_unlock(&trampoline_mutex);
This allows bpf trampolines to trace kernel functions that have live patches applied. Also, move bpf trampolines to the *_ftrace_direct_multi APIs, which allow setting different ftrace_ops flags.

Signed-off-by: Song Liu <song@kernel.org>
---
 include/linux/bpf.h     |   3 ++
 kernel/bpf/trampoline.c | 109 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 99 insertions(+), 13 deletions(-)
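As a usage illustration of what the series enables, a minimal fentry program of this shape can now attach to a function that already carries a live patch; cmdline_proc_show() is the function patched by samples/livepatch/livepatch-sample.c, and loading through a standard libbpf skeleton is assumed:

// SPDX-License-Identifier: GPL-2.0
/* fentry_cmdline.bpf.c - hypothetical example, loaded with a libbpf skeleton */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/*
 * With this series, attaching here succeeds even while the
 * livepatch-sample module has cmdline_proc_show() patched.
 */
SEC("fentry/cmdline_proc_show")
int BPF_PROG(trace_cmdline_proc_show)
{
	bpf_printk("cmdline_proc_show() called");
	return 0;
}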