Message ID | 20200820164753.3256899-1-jackmanb@chromium.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] security: replace indirect calls with static calls | expand |
On Thu, 20 Aug 2020, Brendan Jackman wrote: > With this implementation, any overhead of the indirect call in the LSM > framework is completely mitigated (performance results: [7]). This > facilitates the adoption of "bpf" LSM on production machines and also > benefits all other LSMs. This looks like a potentially useful improvement, although I wonder if it would be overshadowed by an LSM hook doing real work. Do you have any more benchmarking beyond eventfd_write() ? > > [1]: https://lwn.net/ml/linux-kernel/20200710133831.943894387@infradead.org/ > [2]: https://lwn.net/Articles/798157/ > [3] measurements: https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/62437b1416829ca0e8a0ed9101530bc90fd42d69/lsm-performance.png > protocol: https://gist.github.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5#file-measurement-protocol-md > [4]: https://lwn.net/Articles/813261/ > [5]: git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git x86/static_call > [6]: https://lwn.net/ml/linux-kernel/20200710133831.943894387@infradead.org/#t > [7]: https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/00e414b73e0c38c2eae8f05d5363a745179ba285/faster-lsm-results.png > > Cc: Alexei Starovoitov <ast@kernel.org> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: James Morris <jmorris@namei.org> > Cc: pjt@google.com > Cc: jannh@google.com > Cc: peterz@infradead.org > Cc: rafael.j.wysocki@intel.com > Cc: keescook@chromium.org > Cc: thgarnie@chromium.org > Cc: kpsingh@google.com > Cc: paul.renauld.epfl@gmail.com > > Signed-off-by: Paul Renauld <renauld@google.com> > Signed-off-by: KP Singh <kpsingh@google.com> > Signed-off-by: Brendan Jackman <jackmanb@google.com> > --- > include/linux/lsm_hooks.h | 1 + > include/linux/lsm_static_call.h | 134 ++++++++++++++++++++ > security/security.c | 217 ++++++++++++++++++++++++++++---- > 3 files changed, 331 insertions(+), 21 deletions(-) > create mode 100644 
include/linux/lsm_static_call.h > > diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h > index 95b7c1d32062..d11e116b588e 100644 > --- a/include/linux/lsm_hooks.h > +++ b/include/linux/lsm_hooks.h > @@ -1524,6 +1524,7 @@ union security_list_options { > #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); > #include "lsm_hook_defs.h" > #undef LSM_HOOK > + void *generic_func; > }; > > struct security_hook_heads { > diff --git a/include/linux/lsm_static_call.h b/include/linux/lsm_static_call.h > new file mode 100644 > index 000000000000..f5f5698292e0 > --- /dev/null > +++ b/include/linux/lsm_static_call.h > @@ -0,0 +1,134 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +/* > + * Copyright (C) 2020 Google LLC. > + */ > + > +#ifndef __LINUX_LSM_STATIC_CALL_H > +#define __LINUX_LSM_STATIC_CALL_H > + > +/* > + * Static slots are used in security/security.c to avoid costly > + * indirect calls by replacing them with static calls. > + * The number of static calls for each LSM hook is fixed. > + */ > +#define SECURITY_STATIC_SLOT_COUNT 11 > + > +/* > + * Identifier for the LSM static slots. > + * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h > + * IDX is the index of the slot. 0 <= NUM < SECURITY_STATIC_SLOT_COUNT > + */ > +#define STATIC_SLOT(HOOK, IDX) security_static_slot_##HOOK##_##IDX > + > +/* > + * Call the macro M for each LSM hook slot. > + * M should take as first argument the index and then > + * the same __VA_ARGS__ > + * Essentially, this will expand to: > + * M(0, ...) > + * M(1, ...) > + * M(2, ...) > + * ... > + * Note that no trailing semicolon is placed so M should be defined > + * accordingly. > + * This adapts to a change to SECURITY_STATIC_SLOT_COUNT. > + */ > +#define SECURITY_FOREACH_STATIC_SLOT(M, ...) \ > + UNROLL_MACRO_LOOP(SECURITY_STATIC_SLOT_COUNT, M, __VA_ARGS__) > + > +/* > + * Intermediate macros to expand SECURITY_STATIC_SLOT_COUNT > + */ > +#define UNROLL_MACRO_LOOP(N, MACRO, ...) 
\ > + _UNROLL_MACRO_LOOP(N, MACRO, __VA_ARGS__) > + > +#define _UNROLL_MACRO_LOOP(N, MACRO, ...) \ > + __UNROLL_MACRO_LOOP(N, MACRO, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP(N, MACRO, ...) \ > + __UNROLL_MACRO_LOOP_##N(MACRO, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_0(MACRO, ...) > + > +#define __UNROLL_MACRO_LOOP_1(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_0(MACRO, __VA_ARGS__) \ > + MACRO(0, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_2(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_1(MACRO, __VA_ARGS__) \ > + MACRO(1, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_3(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_2(MACRO, __VA_ARGS__) \ > + MACRO(2, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_4(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_3(MACRO, __VA_ARGS__) \ > + MACRO(3, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_5(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_4(MACRO, __VA_ARGS__) \ > + MACRO(4, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_6(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_5(MACRO, __VA_ARGS__) \ > + MACRO(5, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_7(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_6(MACRO, __VA_ARGS__) \ > + MACRO(6, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_8(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_7(MACRO, __VA_ARGS__) \ > + MACRO(7, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_9(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_8(MACRO, __VA_ARGS__) \ > + MACRO(8, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_10(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_9(MACRO, __VA_ARGS__) \ > + MACRO(9, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_11(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_10(MACRO, __VA_ARGS__) \ > + MACRO(10, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_12(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_11(MACRO, __VA_ARGS__) \ > + MACRO(11, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_13(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_12(MACRO, __VA_ARGS__) \ > + MACRO(12, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_14(MACRO, ...) 
\ > + __UNROLL_MACRO_LOOP_13(MACRO, __VA_ARGS__) \ > + MACRO(13, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_15(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_14(MACRO, __VA_ARGS__) \ > + MACRO(14, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_16(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_15(MACRO, __VA_ARGS__) \ > + MACRO(15, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_17(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_16(MACRO, __VA_ARGS__) \ > + MACRO(16, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_18(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_17(MACRO, __VA_ARGS__) \ > + MACRO(17, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_19(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_18(MACRO, __VA_ARGS__) \ > + MACRO(18, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ > + MACRO(19, __VA_ARGS__) > + > +#endif /* __LINUX_LSM_STATIC_CALL_H */ > diff --git a/security/security.c b/security/security.c > index 70a7ad357bc6..15026bc716f2 100644 > --- a/security/security.c > +++ b/security/security.c > @@ -28,6 +28,8 @@ > #include <linux/string.h> > #include <linux/msg.h> > #include <net/flow.h> > +#include <linux/static_call.h> > +#include <linux/lsm_static_call.h> > > #define MAX_LSM_EVM_XATTR 2 > > @@ -86,6 +88,128 @@ static __initconst const char * const builtin_lsm_order = CONFIG_LSM; > static __initdata struct lsm_info **ordered_lsms; > static __initdata struct lsm_info *exclusive; > > +/* > + * Necessary information about a static > + * slot to call __static_call_update > + */ > +struct static_slot { > + /* static call key as defined by STATIC_CALL_KEY */ > + struct static_call_key *key; > + /* static call trampoline as defined by STATIC_CALL_TRAMP */ > + void *trampoline; > +}; > + > +/* > + * Table of the static calls for each LSM hook. > + * Once the LSMs are initialized, their callbacks will be copied to these > + * tables such that the slots are filled backwards (from last to first). 
> + * This way, we can jump directly to the first used slot, and execute > + * all of them after. This essentially makes the entry point > + * dynamic to adapt the number of slots to the number of callbacks. > + */ > +struct static_slot_list { > + #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + struct static_slot NAME[SECURITY_STATIC_SLOT_COUNT]; > + #include <linux/lsm_hook_defs.h> > + #undef LSM_HOOK > +} __randomize_layout; > + > +/* > + * Index of the first used static call for each LSM hook > + * in the corresponding static_slot_list table. > + * All slots with greater indices are used. > + * If no slot is used, the default value is INT_MAX. > + */ > +struct base_slot_idx { > + #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + int NAME; > + #include <linux/lsm_hook_defs.h> > + #undef LSM_HOOK > +} __randomize_layout; > + > +/* > + * Create the static slots for each LSM hook, initially empty. > + * This will expand to: > + * > + * [...] > + * > + * DEFINE_STATIC_CALL_NULL(security_static_slot_file_permission_0, > + * *((int(*)(struct file *file, int mask)))NULL); > + * DEFINE_STATIC_CALL_NULL(security_static_slot_file_permission_1, ...); > + * > + * [...] > + */ > +#define CREATE_STATIC_SLOT(NUM, NAME, RET, ...) \ > + DEFINE_STATIC_CALL_NULL(STATIC_SLOT(NAME, NUM), \ > + *((RET(*)(__VA_ARGS__))NULL)); > + > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + SECURITY_FOREACH_STATIC_SLOT(CREATE_STATIC_SLOT, NAME, RET, __VA_ARGS__) > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +#undef CREATE_STATIC_SLOT > + > +/* > + * Initialise a table of static slots for each LSM hook. > + * When defined with DEFINE_STATIC_CALL_NULL as above, a static call is > + * a key and a trampoline. Both are needed to use __static_call_update. > + * This will expand to: > + * struct static_slot_list static_slots = { > + * [...] 
> + * .file_permission = { > + * (struct static_slot) { > + * .key = &STATIC_CALL_KEY( > + * security_static_slot_file_permission_0), > + * .trampoline = &STATIC_CALL_TRAMP( > + * security_static_slot_file_permission_0) > + * }, > + * (struct static_slot) { > + * .key = &STATIC_CALL_KEY( > + * security_static_slot_file_permission_1), > + * .trampoline = &STATIC_CALL_TRAMP( > + * security_static_slot_file_permission_1) > + * }, > + * [...] > + * }, > + * .file_alloc_security = { > + * [...] > + * }, > + * [...] > + * } > + */ > +static struct static_slot_list static_slots __initdata = { > +#define DEFINE_SLOT(NUM, NAME) \ > + (struct static_slot) { \ > + .key = &STATIC_CALL_KEY(STATIC_SLOT(NAME, NUM)), \ > + .trampoline = &STATIC_CALL_TRAMP(STATIC_SLOT(NAME, NUM))\ > + }, > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + .NAME = { \ > + SECURITY_FOREACH_STATIC_SLOT(DEFINE_SLOT, NAME) \ > + }, > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +#undef DEFINE_SLOT > +}; > + > +/* > + * The base slot index for each is initially INT_MAX, which means > + * that no slot is used yet. > + * When expanded, this results in: > + * struct base_slot_idx base_slot_idx = { > + * [...] > + * .file_permission = INT_MAX, > + * .file_alloc_security = INT_MAX, > + * [...] > + * } > + */ > +static struct base_slot_idx base_slot_idx __lsm_ro_after_init = { > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + .NAME = INT_MAX, > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +}; > + > static __initdata bool debug; > #define init_debug(...) 
\ > do { \ > @@ -307,6 +431,46 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) > kfree(sep); > } > > +static void __init lsm_init_hook_static_slot(struct static_slot *slots, > + struct hlist_head *head, > + int *first_slot_idx) > +{ > + struct security_hook_list *pos; > + struct static_slot *slot; > + int slot_cnt; > + > + slot_cnt = 0; > + hlist_for_each_entry_rcu(pos, head, list) > + slot_cnt++; > + > + if (slot_cnt > SECURITY_STATIC_SLOT_COUNT) > + panic("%s - No static hook slot remaining to add LSM hook.\n", > + __func__); > + > + if (slot_cnt == 0) { > + *first_slot_idx = INT_MAX; > + return; > + } > + > + *first_slot_idx = SECURITY_STATIC_SLOT_COUNT - slot_cnt; > + slot = slots + *first_slot_idx; > + hlist_for_each_entry_rcu(pos, head, list) { > + __static_call_update(slot->key, slot->trampoline, > + pos->hook.generic_func); > + slot++; > + } > +} > + > +static void __init lsm_init_static_slots(void) > +{ > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + lsm_init_hook_static_slot(static_slots.NAME, \ > + &security_hook_heads.NAME, \ > + &base_slot_idx.NAME); > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +} > + > static void __init lsm_early_cred(struct cred *cred); > static void __init lsm_early_task(struct task_struct *task); > > @@ -354,6 +518,7 @@ static void __init ordered_lsm_init(void) > lsm_early_task(current); > for (lsm = ordered_lsms; *lsm; lsm++) > initialize_lsm(*lsm); > + lsm_init_static_slots(); > > kfree(ordered_lsms); > } > @@ -374,6 +539,7 @@ int __init early_security_init(void) > prepare_lsm(lsm); > initialize_lsm(lsm); > } > + lsm_init_static_slots(); > > return 0; > } > @@ -696,27 +862,36 @@ static void __init lsm_early_task(struct task_struct *task) > * call_int_hook: > * This is a hook that returns a value. > */ > - > -#define call_void_hook(FUNC, ...) 
\ > - do { \ > - struct security_hook_list *P; \ > - \ > - hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \ > - P->hook.FUNC(__VA_ARGS__); \ > - } while (0) > - > -#define call_int_hook(FUNC, IRC, ...) ({ \ > - int RC = IRC; \ > - do { \ > - struct security_hook_list *P; \ > - \ > - hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \ > - RC = P->hook.FUNC(__VA_ARGS__); \ > - if (RC != 0) \ > - break; \ > - } \ > - } while (0); \ > - RC; \ > +#define __CASE_CALL_STATIC_VOID(NUM, HOOK, ...) \ > + case NUM: \ > + static_call(STATIC_SLOT(HOOK, NUM))(__VA_ARGS__); \ > + fallthrough; > + > +#define call_void_hook(FUNC, ...) do { \ > + switch (base_slot_idx.FUNC) { \ > + SECURITY_FOREACH_STATIC_SLOT(__CASE_CALL_STATIC_VOID, \ > + FUNC, __VA_ARGS__) \ > + default : \ > + break; \ > + } \ > +} while (0) > + > +#define __CASE_CALL_STATIC_INT(NUM, R, HOOK, ...) \ > + case NUM: \ > + R = static_call(STATIC_SLOT(HOOK, NUM))(__VA_ARGS__); \ > + if (R != 0) \ > + break; \ > + fallthrough; > + > +#define call_int_hook(FUNC, IRC, ...) ({ \ > + int RC = IRC; \ > + switch (base_slot_idx.FUNC) { \ > + SECURITY_FOREACH_STATIC_SLOT(__CASE_CALL_STATIC_INT, \ > + RC, FUNC, __VA_ARGS__) \ > + default : \ > + break; \ > + } \ > + RC; \ > }) > > /* Security operations */ >
On Thu, Aug 20, 2020 at 8:43 PM James Morris <jmorris@namei.org> wrote: > > On Thu, 20 Aug 2020, Brendan Jackman wrote: > > > With this implementation, any overhead of the indirect call in the LSM > > framework is completely mitigated (performance results: [7]). This > > facilitates the adoption of "bpf" LSM on production machines and also > > benefits all other LSMs. > > This looks like a potentially useful improvement, although I wonder if it > would be overshadowed by an LSM hook doing real work. > Thanks for taking a look! We can surely look at other examples, but the real goal is to optimize the case where the "bpf" LSM adds callbacks to every LSM hook which don't do any real work and cause an avoidable overhead. This makes it not very practical for data center environments where one would want a framework that adds a zero base case overhead and allows the user to decide where to hook / add performance penalties. (at boot time for other LSMs and at runtime for bpf) I also think this would be beneficial for LSMs which use a cache for a faster policy decision (e.g. access vector caching in SELinux). - KP > Do you have any more benchmarking beyond eventfd_write() ? > > > > > > > [1]: https://lwn.net/ml/linux-kernel/20200710133831.943894387@infradead.org/ [...] > > > > /* Security operations */ > > > > -- > James Morris > <jmorris@namei.org> >
On Thu, Aug 20, 2020 at 06:47:53PM +0200, Brendan Jackman wrote: > From: Paul Renauld <renauld@google.com> > > LSMs have high overhead due to indirect function calls through > retpolines. This RPC proposes to replace these with static calls [1] typo: RFC > instead. Yay! :) > [...] > This overhead prevents the adoption of bpf LSM on performance critical > systems, and also, in general, slows down all LSMs. I'd be curious to see other workloads too. (Your measurements are a bit synthetic, mostly showing "worst case": one short syscall in a tight loop. I'm curious how much performance gain can be had -- we should still do it, it'll be a direct performance improvement, but I'm curious about "real world" impact too.) > [...] > Previously, the code for this hook would have looked like this: > > ret = DEFAULT_RET; > > for each cb in [A, B, C]: > ret = cb(args); <--- costly indirect call here > if ret != 0: > break; > > return ret; > > Static calls are defined at build time and are initially empty (NOP > instructions). When the LSMs are initialized, the slots are filled as > follows: > > slot idx content > |-----------| > 0 | | > |-----------| > 1 | | > |-----------| > 2 | call A | <-- base_slot_idx = 2 > |-----------| > 3 | call B | > |-----------| > 4 | call C | > |-----------| > > The generated code will unroll the foreach loop to have a static call for > each possible LSM: > > ret = DEFAULT_RET; > switch(base_slot_idx): > > case 0: > NOP > if ret != 0: > break; > // fallthrough > case 1: > NOP > if ret != 0: > break; > // fallthrough > case 2: > ret = A(args); <--- direct call, no retpoline > if ret != 0: > break; > // fallthrough > case 3: > ret = B(args); <--- direct call, no retpoline > if ret != 0: > break; > // fallthrough > > [...] > > default: > break; > > return ret; > > A similar logic is applied for void hooks. > > Why this trick with a switch statement? The table of static call is defined > at compile time. 
The number of hook callbacks that will be defined is > unknown at that time, and the table cannot be resized at runtime. Static > calls do not define a conditional execution for a non-void function, so the > executed slots must be non-empty. With this use of the table and the > switch, it is possible to jump directly to the first used slot and execute > all of the slots after. This essentially makes the entry point of the table > dynamic. Instead, it would also be possible to start from 0 and break after > the final populated slot, but that would require an additional conditional > after each slot. Instead of just "NOP", having the static branches perform a jump would solve this pretty cleanly, yes? Something like: ret = DEFAULT_RET; ret = A(args); <--- direct call, no retpoline if ret != 0: goto out; ret = B(args); <--- direct call, no retpoline if ret != 0: goto out; goto out; if ret != 0: goto out; out: return ret; > [...] > The number of available slots for each LSM hook is currently fixed at > 11 (the number of LSMs in the kernel). Ideally, it should automatically > adapt to the number of LSMs compiled into the kernel. Seems like a reasonable thing to do and could be a separate patch. > If there’s no practical way to implement such automatic adaptation, an > option instead would be to remove the panic call by falling-back to the old > linked-list mechanism, which is still present anyway (see below). > > A few special cases of LSM don't use the macro call_[int/void]_hook but > have their own calling logic. The linked-lists are kept as a possible slow > path fallback for them. I assume you mean the integrity subsystem? That just needs to be fixed correctly. If we switch to this, let's ditch the linked list entirely. Fixing integrity's stacking can be a separate patch too. > [...] 
> Signed-off-by: Paul Renauld <renauld@google.com> > Signed-off-by: KP Singh <kpsingh@google.com> > Signed-off-by: Brendan Jackman <jackmanb@google.com> This implies a maintainership chain, with Paul as the sole author. If you mean all of you worked on the patch, include Co-developed-by: as needed[1]. -Kees [1] https://www.kernel.org/doc/html/latest/process/submitting-patches.html#when-to-use-acked-by-cc-and-co-developed-by
On 8/20/2020 9:47 AM, Brendan Jackman wrote: > From: Paul Renauld <renauld@google.com> > > LSMs have high overhead due to indirect function calls through > retpolines. This RPC proposes to replace these with static calls [1] > instead. > > This overhead is especially significant for the "bpf" LSM which supports > the implementation of LSM hooks with eBPF programs (security/bpf)[2]. In > order to facilitate this, the "bpf" LSM provides a default nop callback for > all LSM hooks. When enabled, the "bpf", LSM incurs an unnecessary / > avoidable indirect call to this nop callback. > > The performance impact on a simple syscall eventfd_write (which triggers > the file_permission hook) was measured with and without "bpf" LSM > enabled. Activating the LSM resulted in an overhead of 4% [3]. > > This overhead prevents the adoption of bpf LSM on performance critical > systems, and also, in general, slows down all LSMs. > > Currently, the LSM hook callbacks are stored in a linked list and > dispatched as indirect calls. Using static calls can remove this overhead > by replacing all indirect calls with direct calls. > > During the discussion of the "bpf" LSM patch-set it was proposed to special > case BPF LSM to avoid the overhead by using static keys. This was however > not accepted and it was decided to [4]: > > - Not special-case the "bpf" LSM. > - Implement a general solution benefitting the whole LSM framework. > > This is based on the static call branch [5]. > > For each LSM hook, a table of static calls is defined (referred to as > "static slots", or "slots"). When all the LSMs are initialized and linked > lists are filled, the hook callbacks are copied to the appropriate static > slot. The callbacks are continuously added at the end of the table, and the > index of the first slot that is non empty is stored. 
Then, when a LSM hook > is called (macro call_[int/void]_hook), the execution jumps to this first > non-empty slot and all of the subsequent static slots are executed. > > The static calls are re-initialized every time the linked list is modified, > i.e. after the early LSM init, and the LSM init. > > Let's say, there are 5 static slots per LSM hook, and 3 LSMs implement some > hook with the callbacks A, B, C. > > Previously, the code for this hook would have looked like this: > > ret = DEFAULT_RET; > > for each cb in [A, B, C]: > ret = cb(args); <--- costly indirect call here > if ret != 0: > break; > > return ret; > > Static calls are defined at build time and are initially empty (NOP > instructions). When the LSMs are initialized, the slots are filled as > follows: > > slot idx content > |-----------| > 0 | | > |-----------| > 1 | | > |-----------| > 2 | call A | <-- base_slot_idx = 2 > |-----------| > 3 | call B | > |-----------| > 4 | call C | > |-----------| > > The generated code will unroll the foreach loop to have a static call for > each possible LSM: > > ret = DEFAULT_RET; > switch(base_slot_idx): > > case 0: > NOP What does NOP really look like? > if ret != 0: I assume you'd want "ret != DEFAULT_RET" instead of "ret != 0". > break; > // fallthrough > case 1: > NOP > if ret != 0: > break; > // fallthrough > case 2: > ret = A(args); <--- direct call, no retpoline > if ret != 0: > break; > // fallthrough > case 3: > ret = B(args); <--- direct call, no retpoline > if ret != 0: > break; > // fallthrough > > [...] > > default: > break; > > return ret; > > A similar logic is applied for void hooks. > > Why this trick with a switch statement? The table of static call is defined > at compile time. The number of hook callbacks that will be defined is > unknown at that time, and the table cannot be resized at runtime. Static > calls do not define a conditional execution for a non-void function, so the > executed slots must be non-empty. 
So what goes in for empty slots? What about gaps in the table? > With this use of the table and the > switch, it is possible to jump directly to the first used slot and execute > all of the slots after. This essentially makes the entry point of the table > dynamic. Instead, it would also be possible to start from 0 and break after > the final populated slot, but that would require an additional conditional > after each slot. > > This macro is used to generate the code for each static slot, (e.g. each > case statement in the previous example). This will expand into a call to > MACRO for each static slot defined. For example, if with again 5 slots: > > SECURITY_FOREACH_STATIC_SLOT(MACRO, x, y) -> > > MACRO(0, x, y) > MACRO(1, x, y) > MACRO(2, x, y) > MACRO(3, x, y) > MACRO(4, x, y) > > This is used in conjunction with LSM_HOOK definitions in > linux/lsm_hook_defs.h to execute a macro for each static slot of each LSM > hook. > > The patches for static calls [6] are not upstreamed yet. > > The number of available slots for each LSM hook is currently fixed at > 11 (the number of LSMs in the kernel). Ideally, it should automatically > adapt to the number of LSMs compiled into the kernel. #define SECURITY_STATIC_SLOT_COUNT ( \ 1 + /* Capability module is always there */ \ (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \ ... \ (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0)) > If there’s no practical way to implement such automatic adaptation, an > option instead would be to remove the panic call by falling-back to the old > linked-list mechanism, which is still present anyway (see below). > > A few special cases of LSM don't use the macro call_[int/void]_hook but > have their own calling logic. The linked-lists are kept as a possible slow > path fallback for them. Unfortunately, the stacking effort is increasing the number of hooks that will require special handling. security_secid_to_secctx() is one example. 
> > Before: > > https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/62437b1416829ca0e8a0ed9101530bc90fd42d69/lsm-performance.png > > After: > > https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/00e414b73e0c38c2eae8f05d5363a745179ba285/faster-lsm-results.png > > With this implementation, any overhead of the indirect call in the LSM > framework is completely mitigated (performance results: [7]). This > facilitates the adoption of "bpf" LSM on production machines and also > benefits all other LSMs. Your numbers for a system with BPF are encouraging. What do the numbers look like for a system with SELinux, Smack or AppArmor? > > [1]: https://lwn.net/ml/linux-kernel/20200710133831.943894387@infradead.org/ > [2]: https://lwn.net/Articles/798157/ > [3] measurements: https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/62437b1416829ca0e8a0ed9101530bc90fd42d69/lsm-performance.png > protocol: https://gist.github.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5#file-measurement-protocol-md > [4]: https://lwn.net/Articles/813261/ > [5]: git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git x86/static_call > [6]: https://lwn.net/ml/linux-kernel/20200710133831.943894387@infradead.org/#t > [7]: https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/00e414b73e0c38c2eae8f05d5363a745179ba285/faster-lsm-results.png > > Cc: Alexei Starovoitov <ast@kernel.org> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: James Morris <jmorris@namei.org> > Cc: pjt@google.com > Cc: jannh@google.com > Cc: peterz@infradead.org > Cc: rafael.j.wysocki@intel.com > Cc: keescook@chromium.org > Cc: thgarnie@chromium.org > Cc: kpsingh@google.com > Cc: paul.renauld.epfl@gmail.com > > Signed-off-by: Paul Renauld <renauld@google.com> > Signed-off-by: KP Singh <kpsingh@google.com> > Signed-off-by: Brendan Jackman <jackmanb@google.com> > --- > include/linux/lsm_hooks.h | 1 + > 
include/linux/lsm_static_call.h | 134 ++++++++++++++++++++ > security/security.c | 217 ++++++++++++++++++++++++++++---- > 3 files changed, 331 insertions(+), 21 deletions(-) > create mode 100644 include/linux/lsm_static_call.h > > diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h > index 95b7c1d32062..d11e116b588e 100644 > --- a/include/linux/lsm_hooks.h > +++ b/include/linux/lsm_hooks.h > @@ -1524,6 +1524,7 @@ union security_list_options { > #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); > #include "lsm_hook_defs.h" > #undef LSM_HOOK > + void *generic_func; > }; > > struct security_hook_heads { > diff --git a/include/linux/lsm_static_call.h b/include/linux/lsm_static_call.h > new file mode 100644 > index 000000000000..f5f5698292e0 > --- /dev/null > +++ b/include/linux/lsm_static_call.h > @@ -0,0 +1,134 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +/* > + * Copyright (C) 2020 Google LLC. > + */ > + > +#ifndef __LINUX_LSM_STATIC_CALL_H > +#define __LINUX_LSM_STATIC_CALL_H > + > +/* > + * Static slots are used in security/security.c to avoid costly > + * indirect calls by replacing them with static calls. > + * The number of static calls for each LSM hook is fixed. > + */ > +#define SECURITY_STATIC_SLOT_COUNT 11 See suggested code above. > + > +/* > + * Identifier for the LSM static slots. > + * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h > + * IDX is the index of the slot. 0 <= NUM < SECURITY_STATIC_SLOT_COUNT > + */ > +#define STATIC_SLOT(HOOK, IDX) security_static_slot_##HOOK##_##IDX > + > +/* > + * Call the macro M for each LSM hook slot. > + * M should take as first argument the index and then > + * the same __VA_ARGS__ > + * Essentially, this will expand to: > + * M(0, ...) > + * M(1, ...) > + * M(2, ...) > + * ... > + * Note that no trailing semicolon is placed so M should be defined > + * accordingly. > + * This adapts to a change to SECURITY_STATIC_SLOT_COUNT. 
> + */ > +#define SECURITY_FOREACH_STATIC_SLOT(M, ...) \ > + UNROLL_MACRO_LOOP(SECURITY_STATIC_SLOT_COUNT, M, __VA_ARGS__) > + > +/* > + * Intermediate macros to expand SECURITY_STATIC_SLOT_COUNT > + */ > +#define UNROLL_MACRO_LOOP(N, MACRO, ...) \ > + _UNROLL_MACRO_LOOP(N, MACRO, __VA_ARGS__) > + > +#define _UNROLL_MACRO_LOOP(N, MACRO, ...) \ > + __UNROLL_MACRO_LOOP(N, MACRO, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP(N, MACRO, ...) \ > + __UNROLL_MACRO_LOOP_##N(MACRO, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_0(MACRO, ...) > + > +#define __UNROLL_MACRO_LOOP_1(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_0(MACRO, __VA_ARGS__) \ > + MACRO(0, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_2(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_1(MACRO, __VA_ARGS__) \ > + MACRO(1, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_3(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_2(MACRO, __VA_ARGS__) \ > + MACRO(2, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_4(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_3(MACRO, __VA_ARGS__) \ > + MACRO(3, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_5(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_4(MACRO, __VA_ARGS__) \ > + MACRO(4, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_6(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_5(MACRO, __VA_ARGS__) \ > + MACRO(5, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_7(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_6(MACRO, __VA_ARGS__) \ > + MACRO(6, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_8(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_7(MACRO, __VA_ARGS__) \ > + MACRO(7, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_9(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_8(MACRO, __VA_ARGS__) \ > + MACRO(8, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_10(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_9(MACRO, __VA_ARGS__) \ > + MACRO(9, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_11(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_10(MACRO, __VA_ARGS__) \ > + MACRO(10, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_12(MACRO, ...) 
\ > + __UNROLL_MACRO_LOOP_11(MACRO, __VA_ARGS__) \ > + MACRO(11, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_13(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_12(MACRO, __VA_ARGS__) \ > + MACRO(12, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_14(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_13(MACRO, __VA_ARGS__) \ > + MACRO(13, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_15(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_14(MACRO, __VA_ARGS__) \ > + MACRO(14, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_16(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_15(MACRO, __VA_ARGS__) \ > + MACRO(15, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_17(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_16(MACRO, __VA_ARGS__) \ > + MACRO(16, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_18(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_17(MACRO, __VA_ARGS__) \ > + MACRO(17, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_19(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_18(MACRO, __VA_ARGS__) \ > + MACRO(18, __VA_ARGS__) > + > +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ > + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ > + MACRO(19, __VA_ARGS__) > + Where does "20" come from? Why are you unrolling beyond 11? 
> +#endif /* __LINUX_LSM_STATIC_CALL_H */ > diff --git a/security/security.c b/security/security.c > index 70a7ad357bc6..15026bc716f2 100644 > --- a/security/security.c > +++ b/security/security.c > @@ -28,6 +28,8 @@ > #include <linux/string.h> > #include <linux/msg.h> > #include <net/flow.h> > +#include <linux/static_call.h> > +#include <linux/lsm_static_call.h> > > #define MAX_LSM_EVM_XATTR 2 > > @@ -86,6 +88,128 @@ static __initconst const char * const builtin_lsm_order = CONFIG_LSM; > static __initdata struct lsm_info **ordered_lsms; > static __initdata struct lsm_info *exclusive; > > +/* > + * Necessary information about a static > + * slot to call __static_call_update > + */ > +struct static_slot { > + /* static call key as defined by STATIC_CALL_KEY */ > + struct static_call_key *key; > + /* static call trampoline as defined by STATIC_CALL_TRAMP */ > + void *trampoline; > +}; > + > +/* > + * Table of the static calls for each LSM hook. > + * Once the LSMs are initialized, their callbacks will be copied to these > + * tables such that the slots are filled backwards (from last to first). > + * This way, we can jump directly to the first used slot, and execute > + * all of them after. This essentially makes the entry point point > + * dynamic to adapt the number of slot to the number of callbacks. > + */ > +struct static_slot_list { > + #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + struct static_slot NAME[SECURITY_STATIC_SLOT_COUNT]; > + #include <linux/lsm_hook_defs.h> > + #undef LSM_HOOK > +} __randomize_layout; > + > +/* > + * Index of the first used static call for each LSM hook > + * in the corresponding static_slot_list table. > + * All slots with greater indices are used. Again, what about gaps? > + * If no slot is used, the default value is INT_MAX. > + */ > +struct base_slot_idx { > + #define LSM_HOOK(RET, DEFAULT, NAME, ...) 
\ > + int NAME; > + #include <linux/lsm_hook_defs.h> > + #undef LSM_HOOK > +} __randomize_layout; > + > +/* > + * Create the static slots for each LSM hook, initially empty. > + * This will expand to: > + * > + * [...] > + * > + * DEFINE_STATIC_CALL_NULL(security_static_slot_file_permission_0, > + * *((int(*)(struct file *file, int mask)))NULL); > + * DEFINE_STATIC_CALL_NULL(security_static_slot_file_permission_1, ...); > + * > + * [...] > + */ > +#define CREATE_STATIC_SLOT(NUM, NAME, RET, ...) \ > + DEFINE_STATIC_CALL_NULL(STATIC_SLOT(NAME, NUM), \ > + *((RET(*)(__VA_ARGS__))NULL)); > + > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + SECURITY_FOREACH_STATIC_SLOT(CREATE_STATIC_SLOT, NAME, RET, __VA_ARGS__) > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +#undef CREATE_STATIC_SLOT > + > +/* > + * Initialise a table of static slots for each LSM hook. > + * When defined with DEFINE_STATIC_CALL_NULL as above, a static call is > + * a key and a trampoline. Both are needed to use __static_call_update. > + * This will expand to: > + * struct static_slot_list static_slots = { > + * [...] > + * .file_permission = { > + * (struct static_slot) { > + * .key = &STATIC_CALL_KEY( > + * security_static_slot_file_permission_0), > + * .trampoline = &STATIC_CALL_TRAMP( > + * security_static_slot_file_permission_0) > + * }, > + * (struct static_slot) { > + * .key = &STATIC_CALL_KEY( > + * security_static_slot_file_permission_1), > + * .trampoline = &STATIC_CALL_TRAMP( > + * security_static_slot_file_permission_1) > + * }, > + * [...] > + * }, > + * .file_alloc_security = { > + * [...] > + * }, > + * [...] > + * } > + */ > +static struct static_slot_list static_slots __initdata = { > +#define DEFINE_SLOT(NUM, NAME) \ > + (struct static_slot) { \ > + .key = &STATIC_CALL_KEY(STATIC_SLOT(NAME, NUM)), \ > + .trampoline = &STATIC_CALL_TRAMP(STATIC_SLOT(NAME, NUM))\ > + }, > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) 
\ > + .NAME = { \ > + SECURITY_FOREACH_STATIC_SLOT(DEFINE_SLOT, NAME) \ > + }, > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +#undef DEFINE_SLOT > +}; > + > +/* > + * The base slot index for each is initially INT_MAX, which means > + * that no slot is used yet. > + * When expanded, this results in: > + * struct base_slot_idx base_slot_idx = { > + * [...] > + * .file_permission = INT_MAX, > + * .file_alloc_security = INT_MAX, > + * [...] > + * } > + */ > +static struct base_slot_idx base_slot_idx __lsm_ro_after_init = { > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ > + .NAME = INT_MAX, > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +}; > + > static __initdata bool debug; > #define init_debug(...) \ > do { \ > @@ -307,6 +431,46 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) > kfree(sep); > } > > +static void __init lsm_init_hook_static_slot(struct static_slot *slots, > + struct hlist_head *head, > + int *first_slot_idx) > +{ > + struct security_hook_list *pos; > + struct static_slot *slot; > + int slot_cnt; > + > + slot_cnt = 0; > + hlist_for_each_entry_rcu(pos, head, list) > + slot_cnt++; > + > + if (slot_cnt > SECURITY_STATIC_SLOT_COUNT) > + panic("%s - No static hook slot remaining to add LSM hook.\n", > + __func__); > + > + if (slot_cnt == 0) { > + *first_slot_idx = INT_MAX; > + return; > + } > + > + *first_slot_idx = SECURITY_STATIC_SLOT_COUNT - slot_cnt; > + slot = slots + *first_slot_idx; > + hlist_for_each_entry_rcu(pos, head, list) { > + __static_call_update(slot->key, slot->trampoline, > + pos->hook.generic_func); > + slot++; > + } > +} > + > +static void __init lsm_init_static_slots(void) > +{ > +#define LSM_HOOK(RET, DEFAULT, NAME, ...) 
\ > + lsm_init_hook_static_slot(static_slots.NAME, \ > + &security_hook_heads.NAME, \ > + &base_slot_idx.NAME); > +#include <linux/lsm_hook_defs.h> > +#undef LSM_HOOK > +} > + > static void __init lsm_early_cred(struct cred *cred); > static void __init lsm_early_task(struct task_struct *task); > > @@ -354,6 +518,7 @@ static void __init ordered_lsm_init(void) > lsm_early_task(current); > for (lsm = ordered_lsms; *lsm; lsm++) > initialize_lsm(*lsm); > + lsm_init_static_slots(); > > kfree(ordered_lsms); > } > @@ -374,6 +539,7 @@ int __init early_security_init(void) > prepare_lsm(lsm); > initialize_lsm(lsm); > } > + lsm_init_static_slots(); > > return 0; > } > @@ -696,27 +862,36 @@ static void __init lsm_early_task(struct task_struct *task) > * call_int_hook: > * This is a hook that returns a value. > */ > - > -#define call_void_hook(FUNC, ...) \ > - do { \ > - struct security_hook_list *P; \ > - \ > - hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \ > - P->hook.FUNC(__VA_ARGS__); \ > - } while (0) > - > -#define call_int_hook(FUNC, IRC, ...) ({ \ > - int RC = IRC; \ > - do { \ > - struct security_hook_list *P; \ > - \ > - hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \ > - RC = P->hook.FUNC(__VA_ARGS__); \ > - if (RC != 0) \ > - break; \ > - } \ > - } while (0); \ > - RC; \ > +#define __CASE_CALL_STATIC_VOID(NUM, HOOK, ...) \ > + case NUM: \ > + static_call(STATIC_SLOT(HOOK, NUM))(__VA_ARGS__); \ > + fallthrough; > + > +#define call_void_hook(FUNC, ...) do { \ > + switch (base_slot_idx.FUNC) { \ > + SECURITY_FOREACH_STATIC_SLOT(__CASE_CALL_STATIC_VOID, \ > + FUNC, __VA_ARGS__) \ > + default : \ > + break; \ > + } \ > +} while (0) > + > +#define __CASE_CALL_STATIC_INT(NUM, R, HOOK, ...) \ > + case NUM: \ > + R = static_call(STATIC_SLOT(HOOK, NUM))(__VA_ARGS__); \ > + if (R != 0) \ > + break; \ > + fallthrough; > + > +#define call_int_hook(FUNC, IRC, ...) 
({ \ > + int RC = IRC; \ > + switch (base_slot_idx.FUNC) { \ > + SECURITY_FOREACH_STATIC_SLOT(__CASE_CALL_STATIC_INT, \ > + RC, FUNC, __VA_ARGS__) \ > + default : \ > + break; \ > + } \ > + RC; \ > }) > > /* Security operations */
On Thu, 20 Aug 2020 at 23:46, Kees Cook <keescook@chromium.org> wrote: > > On Thu, Aug 20, 2020 at 06:47:53PM +0200, Brendan Jackman wrote: > > From: Paul Renauld <renauld@google.com> > > > > LSMs have high overhead due to indirect function calls through > > retpolines. This RPC proposes to replace these with static calls [1] > > typo: RFC Oops, thanks - I also meant to have the [RFC] subject prefix. > > > instead. > > Yay! :) > > > [...] > > This overhead prevents the adoption of bpf LSM on performance critical > > systems, and also, in general, slows down all LSMs. > > I'd be curious to see other workloads too. (Your measurements are a bit > synthetic, mostly showing "worst case": one short syscall in a tight > loop. I'm curious how much performance gain can be had -- we should > still do it, it'll be a direct performance improvement, but I'm curious > about "real world" impact too.) > Sounds good - I'll gather some more data and get back. (I would also reiterate what KP said in response to James: the "worst case" relative indirect call overhead (i.e. the case where the hook callback does minimal work) is exactly the case we care about here. If the callback is doing enough work that the indirect call overhead becomes negligible, that callback is probably anyway too heavyweight for the use cases that motivated this work). > > [...] > > Previously, the code for this hook would have looked like this: > > > > ret = DEFAULT_RET; > > > > for each cb in [A, B, C]: > > ret = cb(args); <--- costly indirect call here > > if ret != 0: > > break; > > > > return ret; > > > > Static calls are defined at build time and are initially empty (NOP > > instructions). 
When the LSMs are initialized, the slots are filled as > > follows: > > > > slot idx content > > |-----------| > > 0 | | > > |-----------| > > 1 | | > > |-----------| > > 2 | call A | <-- base_slot_idx = 2 > > |-----------| > > 3 | call B | > > |-----------| > > 4 | call C | > > |-----------| > > > > The generated code will unroll the foreach loop to have a static call for > > each possible LSM: > > > > ret = DEFAULT_RET; > > switch(base_slot_idx): > > > > case 0: > > NOP > > if ret != 0: > > break; > > // fallthrough > > case 1: > > NOP > > if ret != 0: > > break; > > // fallthrough > > case 2: > > ret = A(args); <--- direct call, no retpoline > > if ret != 0: > > break; > > // fallthrough > > case 3: > > ret = B(args); <--- direct call, no retpoline > > if ret != 0: > > break; > > // fallthrough > > > > [...] > > > > default: > > break; > > > > return ret; > > > > A similar logic is applied for void hooks. > > > > Why this trick with a switch statement? The table of static call is defined > > at compile time. The number of hook callbacks that will be defined is > > unknown at that time, and the table cannot be resized at runtime. Static > > calls do not define a conditional execution for a non-void function, so the > > executed slots must be non-empty. With this use of the table and the > > switch, it is possible to jump directly to the first used slot and execute > > all of the slots after. This essentially makes the entry point of the table > > dynamic. Instead, it would also be possible to start from 0 and break after > > the final populated slot, but that would require an additional conditional > > after each slot. > > Instead of just "NOP", having the static branches perform a jump would > solve this pretty cleanly, yes? 
Something like: > > ret = DEFAULT_RET; > > ret = A(args); <--- direct call, no retpoline > if ret != 0: > goto out; > > ret = B(args); <--- direct call, no retpoline > if ret != 0: > goto out; > > goto out; > if ret != 0: > goto out; > > out: > return ret; Hmm yeah that's a cool idea. This would either need to be implemented with custom code-modification logic for the LSM hooks, or we'd need to think of a way to express it in a sensible addition to the static_call API. I do wonder if the latter could take the form of a generic system for arrays of static calls. It would also need to handle the fact that IIUC at the moment the last static_call may be a tail call, so we'd be patching an existing jump into a jump to a different target, I don't know if we can do that atomically. More research required on my side here, on both points. > [...] > > Signed-off-by: Paul Renauld <renauld@google.com> > > Signed-off-by: KP Singh <kpsingh@google.com> > > Signed-off-by: Brendan Jackman <jackmanb@google.com> > > This implies a maintainership chain, with Paul as the sole author. If > you mean all of you worked on the patch, include Co-developed-by: as > needed[1]. Yep, this is intentional - Paul is the sole author so far (I suppose KP's sign-off is not technically required since he's also at the Google).
On Mon, Aug 24, 2020 at 04:09:09PM +0200, Brendan Jackman wrote: > > > Why this trick with a switch statement? The table of static call is defined > > > at compile time. The number of hook callbacks that will be defined is > > > unknown at that time, and the table cannot be resized at runtime. Static > > > calls do not define a conditional execution for a non-void function, so the > > > executed slots must be non-empty. With this use of the table and the > > > switch, it is possible to jump directly to the first used slot and execute > > > all of the slots after. This essentially makes the entry point of the table > > > dynamic. Instead, it would also be possible to start from 0 and break after > > > the final populated slot, but that would require an additional conditional > > > after each slot. > > > > Instead of just "NOP", having the static branches perform a jump would > > solve this pretty cleanly, yes? Something like: > > > > ret = DEFAULT_RET; > > > > ret = A(args); <--- direct call, no retpoline > > if ret != 0: > > goto out; > > > > ret = B(args); <--- direct call, no retpoline > > if ret != 0: > > goto out; > > > > goto out; > > if ret != 0: > > goto out; > > > > out: > > return ret; > > Hmm yeah that's a cool idea. This would either need to be implemented > with custom code-modification logic for the LSM hooks, or we'd need to > think of a way to express it in a sensible addition to the static_call > API. I do wonder if the latter could take the form of a generic system > for arrays of static calls. So you basically want something like: if (A[0] && (ret = static_call(A[0])(...))) return ret; if (A[1] && (ret = static_call(A[1])(...))) return ret; .... return ret; Right? The problem with static_call_cond() is that we don't know what to do with the return value when !func, which is why it's limited to void return type. You can however construct something like the above with a combination of static_branch() and static_call() though. 
It'll not be pretty, but it ought to work: if (static_branch_likely(A[0].key)) { ret = static_call(A[0].call)(...); if (ret) return ret; } ... return ret; > It would also need to handle the fact that IIUC at the moment the last > static_call may be a tail call, so we'd be patching an existing jump > into a jump to a different target, I don't know if we can do that > atomically. Of course we can, the static_call() series supports tail-calls just fine. In fact, patching jumps is far easier, it was patching call that was the real problem because it mucks about with the stack.
On Mon, Aug 24, 2020 at 04:33:44PM +0200, Peter Zijlstra wrote: > On Mon, Aug 24, 2020 at 04:09:09PM +0200, Brendan Jackman wrote: > > > > > Why this trick with a switch statement? The table of static call is defined > > > > at compile time. The number of hook callbacks that will be defined is > > > > unknown at that time, and the table cannot be resized at runtime. Static > > > > calls do not define a conditional execution for a non-void function, so the > > > > executed slots must be non-empty. With this use of the table and the > > > > switch, it is possible to jump directly to the first used slot and execute > > > > all of the slots after. This essentially makes the entry point of the table > > > > dynamic. Instead, it would also be possible to start from 0 and break after > > > > the final populated slot, but that would require an additional conditional > > > > after each slot. > > > > > > Instead of just "NOP", having the static branches perform a jump would > > > solve this pretty cleanly, yes? Something like: > > > > > > ret = DEFAULT_RET; > > > > > > ret = A(args); <--- direct call, no retpoline > > > if ret != 0: > > > goto out; > > > > > > ret = B(args); <--- direct call, no retpoline > > > if ret != 0: > > > goto out; > > > > > > goto out; > > > if ret != 0: > > > goto out; > > > > > > out: > > > return ret; > > > > Hmm yeah that's a cool idea. This would either need to be implemented > > with custom code-modification logic for the LSM hooks, or we'd need to > > think of a way to express it in a sensible addition to the static_call > > API. I do wonder if the latter could take the form of a generic system > > for arrays of static calls. > > So you basically want something like: > > if (A[0] && (ret = static_call(A[0])(...))) > return ret; > > if (A[1] && (ret = static_call(A[1])(...))) > return ret; > > .... > > return ret; > > Right? 
The problem with static_call_cond() is that we don't know what to > do with the return value when !func, which is why it's limited to void > return type. > > You can however construct something like the above with a combination of > static_branch() and static_call() though. It'll not be pretty, but it > ought to work: > > if (static_branch_likely(A[0].key)) { > ret = static_call(A[0].call)(...); > if (ret) > return ret; > } > > ... > > return ret; > Right. That's actually exactly what Paul's first implementation looked like for call_int_hook. But we thought the switch thing was easier to understand. > > > It would also need to handle the fact that IIUC at the moment the last > > static_call may be a tail call, so we'd be patching an existing jump > > into a jump to a different target, I don't know if we can do that > > atomically. > > Of course we can, the static_call() series supports tail-calls just > fine. In fact, patching jumps is far easier, it was patching call that > was the real problem because it mucks about with the stack. > OK great. I had a vague apprehension that we could only patch to or from a NOP.
On Fri, 21 Aug 2020 at 00:46, Casey Schaufler <casey@schaufler-ca.com> wrote: > > On 8/20/2020 9:47 AM, Brendan Jackman wrote: [...] > What does NOP really look like? The NOP is the same as a regular function call but the CALL instruction is replaced with a NOP instruction. The code that sets up the call parameters is unchanged, and so is the code that expects to get the return value in eax or whatever. That means we cannot actually call the static_calls for NULL slots, we'd get undefined behaviour (except for void hooks) - this is what Peter is talking about in the sibling thread. For this reason, there are _no gaps_ in the callback table. For a given LSM hook, all the slots after base_slot_idx are filled, and all before are empty, so jumping to base_slot_idx ensures that we don't reach an empty slot. That's what the switch trick is all about. > > > if ret != 0: > > I assume you'd want "ret != DEFAULT_RET" instead of "ret != 0". Yeah that's a good question - but the existing behaviour is to always check against 0 (DEFAULT_RET is called IRC in the real code), which does seem strange. > So what goes in for empty slots? What about gaps in the table? It's a NOP, but we never execute it (explained above). There are no gaps. >> +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ >> + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ >> + MACRO(19, __VA_ARGS__) >> + > Where does "20" come from? Why are you unrolling beyond 11? It's just an arbitrary limit on the unrolling macro implementation, we aren't actually unrolling beyond 11 where the macro is used (N is set to 11). > > > With this use of the table and the > > switch, it is possible to jump directly to the first used slot and execute > > all of the slots after. This essentially makes the entry point of the table > > dynamic. Instead, it would also be possible to start from 0 and break after > > the final populated slot, but that would require an additional conditional > > after each slot. 
> > > > This macro is used to generate the code for each static slot, (e.g. each > > case statement in the previous example). This will expand into a call to > > MACRO for each static slot defined. For example, if with again 5 slots: > > > > SECURITY_FOREACH_STATIC_SLOT(MACRO, x, y) -> > > > > MACRO(0, x, y) > > MACRO(1, x, y) > > MACRO(2, x, y) > > MACRO(3, x, y) > > MACRO(4, x, y) > > > > This is used in conjunction with LSM_HOOK definitions in > > linux/lsm_hook_defs.h to execute a macro for each static slot of each LSM > > hook. > > > > The patches for static calls [6] are not upstreamed yet. > > > > The number of available slots for each LSM hook is currently fixed at > > 11 (the number of LSMs in the kernel). Ideally, it should automatically > > adapt to the number of LSMs compiled into the kernel. > > #define SECURITY_STATIC_SLOT_COUNT ( \ > 1 + /* Capability module is always there */ \ > (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \ > (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \ > ... \ > (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0)) > Yeah, that's exactly what we need but it needs to be expanded to an integer literal at preprocessor time, those +s won't work :( > > If there’s no practical way to implement such automatic adaptation, an > > option instead would be to remove the panic call by falling-back to the old > > linked-list mechanism, which is still present anyway (see below). > > > > A few special cases of LSM don't use the macro call_[int/void]_hook but > > have their own calling logic. The linked-lists are kept as a possible slow > > path fallback for them. > > Unfortunately, the stacking effort is increasing the number of hooks > that will require special handling. security_secid_to_secctx() is one > example. 
> > > > > Before: > > > > https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/62437b1416829ca0e8a0ed9101530bc90fd42d69/lsm-performance.png > > > > After: > > > > https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/00e414b73e0c38c2eae8f05d5363a745179ba285/faster-lsm-results.png > > > > With this implementation, any overhead of the indirect call in the LSM > > framework is completely mitigated (performance results: [7]). This > > facilitates the adoption of "bpf" LSM on production machines and also > > benefits all other LSMs. > > Your numbers for a system with BPF are encouraging. What do the numbers > look like for a system with SELinux, Smack or AppArmor? Yeah that's a good question. As I said in the sibling thread the motivating example is very lightweight LSM callbacks in very hot codepaths, but I'll get some broader data too and report back.
On 8/24/2020 8:20 AM, Brendan Jackman wrote: > On Fri, 21 Aug 2020 at 00:46, Casey Schaufler <casey@schaufler-ca.com> wrote: >> On 8/20/2020 9:47 AM, Brendan Jackman wrote: > [...] >> What does NOP really look like? > The NOP is the same as a regular function call but the CALL > instruction is replaced with a NOP instruction. The code that sets up > the call parameters is unchanged, and so is the code that expects to > get the return value in eax or whatever. Right. Are you saying that NOP is in-line assembler in your switch? > That means we cannot actually > call the static_calls for NULL slots, we'd get undefined behaviour > (except for void hooks) - this is what Peter is talking about in the > sibling thread. Referring to the "sibling thread" is kinda confusing, and assumes everyone is one all the right mailing lists, and knows which other thread you're talking about. > > For this reason, there are _no gaps_ in the callback table. For a > given LSM hook, all the slots after base_slot_idx are filled, Why go to all the trouble of maintaining the base_slot_idx if NOP is so cheap? Why not fill all unused slots with NOP? Worst case would be a hook with no users, in which case you have 11 NOPS in the void hook case and 11 "if (ret != DEFAULT_RET)" and 11 NOPS in the int case. No switch magic required. Even better, in the int case you have two calls/slot, the first is the module supplied function (or NOP) and the second is int isit(int ret) { return (ret != DEFAULT_RET) ? ret : 0; } (or NOP). The no security module case degenerates to 22 NOP instructions and no if checks of any sort. I'm not the performance guy, but that seems better than maintaining and checking base_slot_idx to me. > and all > before are empty, so jumping to base_slot_idx ensures that we don't > reach an empty slot. That's what the switch trick is all about. I hates tricks. They're so susceptible to clever attacks. >>> if ret != 0: >> I assume you'd want "ret != DEFAULT_RET" instead of "ret != 0". 
> Yeah that's a good question - but the existing behaviour is to always > check against 0 (DEFAULT_RET is called IRC in the real code), > which does seem strange. If you don't do this correctly you'll make a real mess of the security. >> So what goes in for empty slots? What about gaps in the table? > It's a NOP, but we never execute it (explained above). There are no gaps. Right. Unused slots have NOP. NOP is (assumed to be) cheap. >>> +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ >>> + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ >>> + MACRO(19, __VA_ARGS__) >>> + >> Where does "20" come from? Why are you unrolling beyond 11? > It's just an arbitrary limit on the unrolling macro implementation, we > aren't actually unrolling beyond 11 where the macro is used (N is set > to 11). I'm not a fan of including macros you can't use, especially when they're just obvious variants of other macros. >>> With this use of the table and the >>> switch, it is possible to jump directly to the first used slot and execute >>> all of the slots after. This essentially makes the entry point of the table >>> dynamic. Instead, it would also be possible to start from 0 and break after >>> the final populated slot, but that would require an additional conditional >>> after each slot. >>> >>> This macro is used to generate the code for each static slot, (e.g. each >>> case statement in the previous example). This will expand into a call to >>> MACRO for each static slot defined. For example, if with again 5 slots: >>> >>> SECURITY_FOREACH_STATIC_SLOT(MACRO, x, y) -> >>> >>> MACRO(0, x, y) >>> MACRO(1, x, y) >>> MACRO(2, x, y) >>> MACRO(3, x, y) >>> MACRO(4, x, y) >>> >>> This is used in conjunction with LSM_HOOK definitions in >>> linux/lsm_hook_defs.h to execute a macro for each static slot of each LSM >>> hook. >>> >>> The patches for static calls [6] are not upstreamed yet. 
>>> >>> The number of available slots for each LSM hook is currently fixed at >>> 11 (the number of LSMs in the kernel). Ideally, it should automatically >>> adapt to the number of LSMs compiled into the kernel. >> #define SECURITY_STATIC_SLOT_COUNT ( \ >> 1 + /* Capability module is always there */ \ >> (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \ >> (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \ >> ... \ >> (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0)) >> > Yeah, that's exactly what we need but it needs to be expanded to an > integer literal at preprocessor time, those +s won't work :( ???? Gosh. It works in my module stacking code. >>> If there’s no practical way to implement such automatic adaptation, an >>> option instead would be to remove the panic call by falling-back to the old >>> linked-list mechanism, which is still present anyway (see below). >>> >>> A few special cases of LSM don't use the macro call_[int/void]_hook but >>> have their own calling logic. The linked-lists are kept as a possible slow >>> path fallback for them. >> Unfortunately, the stacking effort is increasing the number of hooks >> that will require special handling. security_secid_to_secctx() is one >> example. >> >>> Before: >>> >>> https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/62437b1416829ca0e8a0ed9101530bc90fd42d69/lsm-performance.png >>> >>> After: >>> >>> https://gist.githubusercontent.com/PaulRenauld/fe3ee7b51121556e03c181432c8b3dd5/raw/00e414b73e0c38c2eae8f05d5363a745179ba285/faster-lsm-results.png >>> >>> With this implementation, any overhead of the indirect call in the LSM >>> framework is completely mitigated (performance results: [7]). This >>> facilitates the adoption of "bpf" LSM on production machines and also >>> benefits all other LSMs. >> Your numbers for a system with BPF are encouraging. What do the numbers >> look like for a system with SELinux, Smack or AppArmor? > Yeah that's a good question. 
As I said in the sibling thread the > motivating example is very lightweight LSM callbacks in very hot > codepaths, but I'll get some broader data too and report back. Even IoT systems are using security modules these days. You'll be hard pressed to identify a class of systems that don't use an LSM or two. My bet is that your fiendishly clever scheme is going to make everyone's life better, but as with all things, it does need to have hard evidence.
On Mon, 24 Aug 2020 at 18:43, Casey Schaufler <casey@schaufler-ca.com> wrote: > > On 8/24/2020 8:20 AM, Brendan Jackman wrote: > > On Fri, 21 Aug 2020 at 00:46, Casey Schaufler <casey@schaufler-ca.com> wrote: > >> On 8/20/2020 9:47 AM, Brendan Jackman wrote: > > [...] > >> What does NOP really look like? > > The NOP is the same as a regular function call but the CALL > > instruction is replaced with a NOP instruction. The code that sets up > > the call parameters is unchanged, and so is the code that expects to > > get the return value in eax or whatever. > > Right. Are you saying that NOP is in-line assembler in your switch? That's right - although it's behind the static_call API that the patch depends on ([5] in the original mail). > > That means we cannot actually > > call the static_calls for NULL slots, we'd get undefined behaviour > > (except for void hooks) - this is what Peter is talking about in the > > sibling thread. > > Referring to the "sibling thread" is kinda confusing, and > assumes everyone is one all the right mailing lists, and knows > which other thread you're talking about. Sure, sorry - here's the Lore link for future reference: https://lore.kernel.org/lkml/20200820164753.3256899-1-jackmanb@chromium.org/T/#m5a6fb3f10141049ce43e18a41f154796090ae1d5 > > > > For this reason, there are _no gaps_ in the callback table. For a > > given LSM hook, all the slots after base_slot_idx are filled, > > Why go to all the trouble of maintaining the base_slot_idx > if NOP is so cheap? Why not fill all unused slots with NOP? > Worst case would be a hook with no users, in which case you > have 11 NOPS in the void hook case and 11 "if (ret != DEFAULT_RET)" > and 11 NOPS in the int case. No switch magic required. Even > better, in the int case you have two calls/slot, the first is the > module supplied function (or NOP) and the second is > int isit(int ret) { return (ret != DEFAULT_RET) ? ret : 0; } > (or NOP). 
> > The no security module case degenerates to 22 NOP instructions > and no if checks of any sort. I'm not the performance guy, but > that seems better than maintaining and checking base_slot_idx > to me. The switch trick is not really motivated by performance. I think all the focus on the NOPs themselves is a bit misleading here - we _can't_ execute the NOPs for the int hooks, because there are instructions after them that expect a function to have just returned a value, which NOP doesn't do. When there is a NOP in the slot instead of a CALL, it would appear to "return" whatever value is leftover in the return register. At the C level, this is why the static_call API doesn't allow static_call_cond to return a value (which is what PeterZ is referring to in the thread I linked above). So, we could drop the switch trick for void hooks and just use static_call_cond, but this doesn't work for int hooks. IMO that variation between the two hook types would just add confusion. > >>> +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ > >>> + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ > >>> + MACRO(19, __VA_ARGS__) > >>> + > >> Where does "20" come from? Why are you unrolling beyond 11? > > It's just an arbitrary limit on the unrolling macro implementation, we > > aren't actually unrolling beyond 11 where the macro is used (N is set > > to 11). > > I'm not a fan of including macros you can't use, especially > when they're just obvious variants of other macros. Not sure what you mean here - is there already a macro that does what UNROLL_MACRO_LOOP does?
On 8/24/2020 10:04 AM, Brendan Jackman wrote: > On Mon, 24 Aug 2020 at 18:43, Casey Schaufler <casey@schaufler-ca.com> wrote: >> On 8/24/2020 8:20 AM, Brendan Jackman wrote: >>> On Fri, 21 Aug 2020 at 00:46, Casey Schaufler <casey@schaufler-ca.com> wrote: >>>> On 8/20/2020 9:47 AM, Brendan Jackman wrote: >>> [...] >>>> What does NOP really look like? >>> The NOP is the same as a regular function call but the CALL >>> instruction is replaced with a NOP instruction. The code that sets up >>> the call parameters is unchanged, and so is the code that expects to >>> get the return value in eax or whatever. >> Right. Are you saying that NOP is in-line assembler in your switch? > That's right - although it's behind the static_call API that the patch > depends on ([5] in the original mail). > >>> That means we cannot actually >>> call the static_calls for NULL slots, we'd get undefined behaviour >>> (except for void hooks) - this is what Peter is talking about in the >>> sibling thread. >> Referring to the "sibling thread" is kinda confusing, and >> assumes everyone is one all the right mailing lists, and knows >> which other thread you're talking about. > Sure, sorry - here's the Lore link for future reference: > > https://lore.kernel.org/lkml/20200820164753.3256899-1-jackmanb@chromium.org/T/#m5a6fb3f10141049ce43e18a41f154796090ae1d5 > >>> For this reason, there are _no gaps_ in the callback table. For a >>> given LSM hook, all the slots after base_slot_idx are filled, >> Why go to all the trouble of maintaining the base_slot_idx >> if NOP is so cheap? Why not fill all unused slots with NOP? >> Worst case would be a hook with no users, in which case you >> have 11 NOPS in the void hook case and 11 "if (ret != DEFAULT_RET)" >> and 11 NOPS in the int case. No switch magic required. Even >> better, in the int case you have two calls/slot, the first is the >> module supplied function (or NOP) and the second is >> int isit(int ret) { return (ret != DEFAULT_RET) ? 
ret : 0; } >> (or NOP). >> >> The no security module case degenerates to 22 NOP instructions >> and no if checks of any sort. I'm not the performance guy, but >> that seems better than maintaining and checking base_slot_idx >> to me. > The switch trick is not really motivated by performance. Then what is its motivation? It makes the code more complicated, and is unnecessary. > I think all the focus on the NOPs themselves is a bit misleading here > - we _can't_ execute the NOPs for the int hooks, because there are > instructions after them that expect a function to have just returned a > value, which NOP doesn't do. That's what I was hoping to address with the second call in the slot. The first call in the slot would be either the module supplied code or a NOP if the module isn't using the hook. The second would be supplied by the LSM infrastructure and would be NOP if the module didn't use the hook. The LSM supplied function would do the necessary checking. It's more complicated than the void case, but not that much more complicated than the existing list based scheme. The concern about the non-existent return on a NOP can be dealt with by setting up initial conditions correctly in most cases. Dealing with multiple security modules providing information (e.g. secid_to_secctx) is where it gets tricky. > When there is a NOP in the slot instead > of a CALL, it would appear to "return" whatever value is leftover in > the return register. At the C level, this is why the static_call API > doesn't allow static_call_cond to return a value (which is what PeterZ > is referring to in the thread I linked above). > > So, we could drop the switch trick for void hooks and just use > static_call_cond, but this doesn't work for int hooks. IMO that > variation between the two hook types would just add confusion. With the number of cases where the switch trick isn't going to work in the long term I'm disinclined to think it makes things less confusing. 
>>>>> +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ >>>>> + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ >>>>> + MACRO(19, __VA_ARGS__) >>>>> + >>>> Where does "20" come from? Why are you unrolling beyond 11? >>> It's just an arbitrary limit on the unrolling macro implementation, we >>> aren't actually unrolling beyond 11 where the macro is used (N is set >>> to 11). >> I'm not a fan of including macros you can't use, especially >> when they're just obvious variants of other macros. > Not sure what you mean here - is there already a macro that does what > UNROLL_MACRO_LOOP does? No, I'm saying that __UNROLL_MACRO_LOOP_20() will never be used on a system that has at most 11 security modules. You've added a bunch of text that serves no purpose. "Future expansion" is pretty silly here.
On 20-Aug-2020 06:47:53 PM, Brendan Jackman wrote: > From: Paul Renauld <renauld@google.com> > > LSMs have high overhead due to indirect function calls through > retpolines. This RFC proposes to replace these with static calls [1] > instead. > > This overhead is especially significant for the "bpf" LSM which supports > the implementation of LSM hooks with eBPF programs (security/bpf)[2]. In > order to facilitate this, the "bpf" LSM provides a default nop callback for > all LSM hooks. When enabled, the "bpf" LSM incurs an unnecessary / > avoidable indirect call to this nop callback. > > The performance impact on a simple syscall eventfd_write (which triggers > the file_permission hook) was measured with and without "bpf" LSM > enabled. Activating the LSM resulted in an overhead of 4% [3]. > > This overhead prevents the adoption of bpf LSM on performance critical > systems, and also, in general, slows down all LSMs. > > Currently, the LSM hook callbacks are stored in a linked list and > dispatched as indirect calls. Using static calls can remove this overhead > by replacing all indirect calls with direct calls. > > During the discussion of the "bpf" LSM patch-set it was proposed to special > case BPF LSM to avoid the overhead by using static keys. This was however > not accepted and it was decided to [4]: > > - Not special-case the "bpf" LSM. > - Implement a general solution benefitting the whole LSM framework. > > This is based on the static call branch [5]. Hi! So I reviewed this quickly, and hopefully my understanding is correct. AFAIU, your approach is limited to scenarios where the callbacks are known at compile-time. It also appears to add the overhead of a switch/case for every function call on the fast-path. I am the original author of the tracepoint infrastructure in the Linux kernel, which also needs to iterate on an array of callbacks. 
Recently, Steven Rostedt pushed a change which accelerates the single-callback case using static calls to reduce retpoline mitigation overhead, but I would prefer if we could accelerate the multiple-callback case as well. Note that for tracepoints, the callbacks are not known at compile-time. This is where I think we could come up with a generic solution that would fit both LSM and tracepoint use-cases. Here is what I have in mind. Let's say we generate code to accelerate up to N calls, and after that we have a fallback using indirect calls. Then we should be able to generate the following using static keys as a jump table and N static calls: jump <static key label target> label_N: stack setup call label_N-1: stack setup call label_N-2: stack setup call ... label_0: jump end label_fallback: <iteration and indirect calls> end: So the static keys would be used to jump to the appropriate label (using a static branch, which has pretty much 0 overhead). Static calls would be used to implement each of the calls. Thoughts ? Thanks, Mathieu
On Fri, Feb 05, 2021 at 10:09:26AM -0500, Mathieu Desnoyers wrote: > Then we should be able to generate the following using static keys as a > jump table and N static calls: > > jump <static key label target> > label_N: > stack setup > call > label_N-1: > stack setup > call > label_N-2: > stack setup > call > ... > label_0: > jump end > label_fallback: > <iteration and indirect calls> > end: > > So the static keys would be used to jump to the appropriate label (using > a static branch, which has pretty much 0 overhead). Static calls would > be used to implement each of the calls. > > Thoughts ? At some point I tried to extend the static_branch infra to do multiple targets and while the low level plumbing is trivial, I ran into trouble trying to get a sane C level API for it.
----- On Feb 5, 2021, at 10:40 AM, Peter Zijlstra peterz@infradead.org wrote: > On Fri, Feb 05, 2021 at 10:09:26AM -0500, Mathieu Desnoyers wrote: >> Then we should be able to generate the following using static keys as a >> jump table and N static calls: >> >> jump <static key label target> >> label_N: >> stack setup >> call >> label_N-1: >> stack setup >> call >> label_N-2: >> stack setup >> call >> ... >> label_0: >> jump end >> label_fallback: >> <iteration and indirect calls> >> end: >> >> So the static keys would be used to jump to the appropriate label (using >> a static branch, which has pretty much 0 overhead). Static calls would >> be used to implement each of the calls. >> >> Thoughts ? > > At some point I tried to extend the static_branch infra to do multiple > targets and while the low level plumbing is trivial, I ran into trouble > trying to get a sane C level API for it. Did you try doing an API for a variable number of targets, or was it for a specific number of targets ? It might be easier to just duplicate some of the API code for number of targets between 2 and 12, and let the users code choose the maximum number of targets they want to accelerate. Thanks, Mathieu
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 95b7c1d32062..d11e116b588e 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1524,6 +1524,7 @@ union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); #include "lsm_hook_defs.h" #undef LSM_HOOK + void *generic_func; }; struct security_hook_heads { diff --git a/include/linux/lsm_static_call.h b/include/linux/lsm_static_call.h new file mode 100644 index 000000000000..f5f5698292e0 --- /dev/null +++ b/include/linux/lsm_static_call.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2020 Google LLC. + */ + +#ifndef __LINUX_LSM_STATIC_CALL_H +#define __LINUX_LSM_STATIC_CALL_H + +/* + * Static slots are used in security/security.c to avoid costly + * indirect calls by replacing them with static calls. + * The number of static calls for each LSM hook is fixed. + */ +#define SECURITY_STATIC_SLOT_COUNT 11 + +/* + * Identifier for the LSM static slots. + * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h + * IDX is the index of the slot. 0 <= NUM < SECURITY_STATIC_SLOT_COUNT + */ +#define STATIC_SLOT(HOOK, IDX) security_static_slot_##HOOK##_##IDX + +/* + * Call the macro M for each LSM hook slot. + * M should take as first argument the index and then + * the same __VA_ARGS__ + * Essentially, this will expand to: + * M(0, ...) + * M(1, ...) + * M(2, ...) + * ... + * Note that no trailing semicolon is placed so M should be defined + * accordingly. + * This adapts to a change to SECURITY_STATIC_SLOT_COUNT. + */ +#define SECURITY_FOREACH_STATIC_SLOT(M, ...) \ + UNROLL_MACRO_LOOP(SECURITY_STATIC_SLOT_COUNT, M, __VA_ARGS__) + +/* + * Intermediate macros to expand SECURITY_STATIC_SLOT_COUNT + */ +#define UNROLL_MACRO_LOOP(N, MACRO, ...) \ + _UNROLL_MACRO_LOOP(N, MACRO, __VA_ARGS__) + +#define _UNROLL_MACRO_LOOP(N, MACRO, ...) 
\ + __UNROLL_MACRO_LOOP(N, MACRO, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP(N, MACRO, ...) \ + __UNROLL_MACRO_LOOP_##N(MACRO, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_0(MACRO, ...) + +#define __UNROLL_MACRO_LOOP_1(MACRO, ...) \ + __UNROLL_MACRO_LOOP_0(MACRO, __VA_ARGS__) \ + MACRO(0, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_2(MACRO, ...) \ + __UNROLL_MACRO_LOOP_1(MACRO, __VA_ARGS__) \ + MACRO(1, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_3(MACRO, ...) \ + __UNROLL_MACRO_LOOP_2(MACRO, __VA_ARGS__) \ + MACRO(2, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_4(MACRO, ...) \ + __UNROLL_MACRO_LOOP_3(MACRO, __VA_ARGS__) \ + MACRO(3, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_5(MACRO, ...) \ + __UNROLL_MACRO_LOOP_4(MACRO, __VA_ARGS__) \ + MACRO(4, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_6(MACRO, ...) \ + __UNROLL_MACRO_LOOP_5(MACRO, __VA_ARGS__) \ + MACRO(5, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_7(MACRO, ...) \ + __UNROLL_MACRO_LOOP_6(MACRO, __VA_ARGS__) \ + MACRO(6, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_8(MACRO, ...) \ + __UNROLL_MACRO_LOOP_7(MACRO, __VA_ARGS__) \ + MACRO(7, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_9(MACRO, ...) \ + __UNROLL_MACRO_LOOP_8(MACRO, __VA_ARGS__) \ + MACRO(8, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_10(MACRO, ...) \ + __UNROLL_MACRO_LOOP_9(MACRO, __VA_ARGS__) \ + MACRO(9, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_11(MACRO, ...) \ + __UNROLL_MACRO_LOOP_10(MACRO, __VA_ARGS__) \ + MACRO(10, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_12(MACRO, ...) \ + __UNROLL_MACRO_LOOP_11(MACRO, __VA_ARGS__) \ + MACRO(11, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_13(MACRO, ...) \ + __UNROLL_MACRO_LOOP_12(MACRO, __VA_ARGS__) \ + MACRO(12, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_14(MACRO, ...) \ + __UNROLL_MACRO_LOOP_13(MACRO, __VA_ARGS__) \ + MACRO(13, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_15(MACRO, ...) 
\ + __UNROLL_MACRO_LOOP_14(MACRO, __VA_ARGS__) \ + MACRO(14, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_16(MACRO, ...) \ + __UNROLL_MACRO_LOOP_15(MACRO, __VA_ARGS__) \ + MACRO(15, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_17(MACRO, ...) \ + __UNROLL_MACRO_LOOP_16(MACRO, __VA_ARGS__) \ + MACRO(16, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_18(MACRO, ...) \ + __UNROLL_MACRO_LOOP_17(MACRO, __VA_ARGS__) \ + MACRO(17, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_19(MACRO, ...) \ + __UNROLL_MACRO_LOOP_18(MACRO, __VA_ARGS__) \ + MACRO(18, __VA_ARGS__) + +#define __UNROLL_MACRO_LOOP_20(MACRO, ...) \ + __UNROLL_MACRO_LOOP_19(MACRO, __VA_ARGS__) \ + MACRO(19, __VA_ARGS__) + +#endif /* __LINUX_LSM_STATIC_CALL_H */ diff --git a/security/security.c b/security/security.c index 70a7ad357bc6..15026bc716f2 100644 --- a/security/security.c +++ b/security/security.c @@ -28,6 +28,8 @@ #include <linux/string.h> #include <linux/msg.h> #include <net/flow.h> +#include <linux/static_call.h> +#include <linux/lsm_static_call.h> #define MAX_LSM_EVM_XATTR 2 @@ -86,6 +88,128 @@ static __initconst const char * const builtin_lsm_order = CONFIG_LSM; static __initdata struct lsm_info **ordered_lsms; static __initdata struct lsm_info *exclusive; +/* + * Necessary information about a static + * slot to call __static_call_update + */ +struct static_slot { + /* static call key as defined by STATIC_CALL_KEY */ + struct static_call_key *key; + /* static call trampoline as defined by STATIC_CALL_TRAMP */ + void *trampoline; +}; + +/* + * Table of the static calls for each LSM hook. + * Once the LSMs are initialized, their callbacks will be copied to these + * tables such that the slots are filled backwards (from last to first). + * This way, we can jump directly to the first used slot, and execute + * all of them after. This essentially makes the entry point point + * dynamic to adapt the number of slot to the number of callbacks. 
+ */ +struct static_slot_list { + #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + struct static_slot NAME[SECURITY_STATIC_SLOT_COUNT]; + #include <linux/lsm_hook_defs.h> + #undef LSM_HOOK +} __randomize_layout; + +/* + * Index of the first used static call for each LSM hook + * in the corresponding static_slot_list table. + * All slots with greater indices are used. + * If no slot is used, the default value is INT_MAX. + */ +struct base_slot_idx { + #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + int NAME; + #include <linux/lsm_hook_defs.h> + #undef LSM_HOOK +} __randomize_layout; + +/* + * Create the static slots for each LSM hook, initially empty. + * This will expand to: + * + * [...] + * + * DEFINE_STATIC_CALL_NULL(security_static_slot_file_permission_0, + * *((int(*)(struct file *file, int mask)))NULL); + * DEFINE_STATIC_CALL_NULL(security_static_slot_file_permission_1, ...); + * + * [...] + */ +#define CREATE_STATIC_SLOT(NUM, NAME, RET, ...) \ + DEFINE_STATIC_CALL_NULL(STATIC_SLOT(NAME, NUM), \ + *((RET(*)(__VA_ARGS__))NULL)); + +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + SECURITY_FOREACH_STATIC_SLOT(CREATE_STATIC_SLOT, NAME, RET, __VA_ARGS__) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +#undef CREATE_STATIC_SLOT + +/* + * Initialise a table of static slots for each LSM hook. + * When defined with DEFINE_STATIC_CALL_NULL as above, a static call is + * a key and a trampoline. Both are needed to use __static_call_update. + * This will expand to: + * struct static_slot_list static_slots = { + * [...] + * .file_permission = { + * (struct static_slot) { + * .key = &STATIC_CALL_KEY( + * security_static_slot_file_permission_0), + * .trampoline = &STATIC_CALL_TRAMP( + * security_static_slot_file_permission_0) + * }, + * (struct static_slot) { + * .key = &STATIC_CALL_KEY( + * security_static_slot_file_permission_1), + * .trampoline = &STATIC_CALL_TRAMP( + * security_static_slot_file_permission_1) + * }, + * [...] + * }, + * .file_alloc_security = { + * [...] 
+ * }, + * [...] + * } + */ +static struct static_slot_list static_slots __initdata = { +#define DEFINE_SLOT(NUM, NAME) \ + (struct static_slot) { \ + .key = &STATIC_CALL_KEY(STATIC_SLOT(NAME, NUM)), \ + .trampoline = &STATIC_CALL_TRAMP(STATIC_SLOT(NAME, NUM))\ + }, +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + .NAME = { \ + SECURITY_FOREACH_STATIC_SLOT(DEFINE_SLOT, NAME) \ + }, +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +#undef DEFINE_SLOT +}; + +/* + * The base slot index for each is initially INT_MAX, which means + * that no slot is used yet. + * When expanded, this results in: + * struct base_slot_idx base_slot_idx = { + * [...] + * .file_permission = INT_MAX, + * .file_alloc_security = INT_MAX, + * [...] + * } + */ +static struct base_slot_idx base_slot_idx __lsm_ro_after_init = { +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + .NAME = INT_MAX, +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +}; + static __initdata bool debug; #define init_debug(...) \ do { \ @@ -307,6 +431,46 @@ static void __init ordered_lsm_parse(const char *order, const char *origin) kfree(sep); } +static void __init lsm_init_hook_static_slot(struct static_slot *slots, + struct hlist_head *head, + int *first_slot_idx) +{ + struct security_hook_list *pos; + struct static_slot *slot; + int slot_cnt; + + slot_cnt = 0; + hlist_for_each_entry_rcu(pos, head, list) + slot_cnt++; + + if (slot_cnt > SECURITY_STATIC_SLOT_COUNT) + panic("%s - No static hook slot remaining to add LSM hook.\n", + __func__); + + if (slot_cnt == 0) { + *first_slot_idx = INT_MAX; + return; + } + + *first_slot_idx = SECURITY_STATIC_SLOT_COUNT - slot_cnt; + slot = slots + *first_slot_idx; + hlist_for_each_entry_rcu(pos, head, list) { + __static_call_update(slot->key, slot->trampoline, + pos->hook.generic_func); + slot++; + } +} + +static void __init lsm_init_static_slots(void) +{ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) 
\ + lsm_init_hook_static_slot(static_slots.NAME, \ + &security_hook_heads.NAME, \ + &base_slot_idx.NAME); +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +} + static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); @@ -354,6 +518,7 @@ static void __init ordered_lsm_init(void) lsm_early_task(current); for (lsm = ordered_lsms; *lsm; lsm++) initialize_lsm(*lsm); + lsm_init_static_slots(); kfree(ordered_lsms); } @@ -374,6 +539,7 @@ int __init early_security_init(void) prepare_lsm(lsm); initialize_lsm(lsm); } + lsm_init_static_slots(); return 0; } @@ -696,27 +862,36 @@ static void __init lsm_early_task(struct task_struct *task) * call_int_hook: * This is a hook that returns a value. */ - -#define call_void_hook(FUNC, ...) \ - do { \ - struct security_hook_list *P; \ - \ - hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \ - P->hook.FUNC(__VA_ARGS__); \ - } while (0) - -#define call_int_hook(FUNC, IRC, ...) ({ \ - int RC = IRC; \ - do { \ - struct security_hook_list *P; \ - \ - hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \ - RC = P->hook.FUNC(__VA_ARGS__); \ - if (RC != 0) \ - break; \ - } \ - } while (0); \ - RC; \ +#define __CASE_CALL_STATIC_VOID(NUM, HOOK, ...) \ + case NUM: \ + static_call(STATIC_SLOT(HOOK, NUM))(__VA_ARGS__); \ + fallthrough; + +#define call_void_hook(FUNC, ...) do { \ + switch (base_slot_idx.FUNC) { \ + SECURITY_FOREACH_STATIC_SLOT(__CASE_CALL_STATIC_VOID, \ + FUNC, __VA_ARGS__) \ + default : \ + break; \ + } \ +} while (0) + +#define __CASE_CALL_STATIC_INT(NUM, R, HOOK, ...) \ + case NUM: \ + R = static_call(STATIC_SLOT(HOOK, NUM))(__VA_ARGS__); \ + if (R != 0) \ + break; \ + fallthrough; + +#define call_int_hook(FUNC, IRC, ...) ({ \ + int RC = IRC; \ + switch (base_slot_idx.FUNC) { \ + SECURITY_FOREACH_STATIC_SLOT(__CASE_CALL_STATIC_INT, \ + RC, FUNC, __VA_ARGS__) \ + default : \ + break; \ + } \ + RC; \ }) /* Security operations */