@@ -125,6 +125,7 @@ struct aa_label {
long flags;
struct aa_proxy *proxy;
struct rb_node node;
+ struct llist_node reclaim_node;
struct rcu_head rcu;
__counted char *hname;
u32 secid;
@@ -465,4 +466,6 @@ static inline void aa_put_proxy(struct aa_proxy *proxy)
void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new);
+void aa_label_reclaim_add_label(struct aa_label *label);
+
#endif /* __AA_LABEL_H */
@@ -64,6 +64,143 @@ static LIST_HEAD(aa_global_buffers);
static DEFINE_SPINLOCK(aa_buffers_lock);
static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers);
+static struct workqueue_struct *aa_label_reclaim_wq;
+static void aa_label_reclaim_work_fn(struct work_struct *work);
+
+/*
+ * Dummy llist nodes, for lockless list traveral and deletions by
+ * the reclaim worker, while nodes are added from normal label
+ * insertion paths.
+ */
+struct aa_label_reclaim_node {
+ bool inuse;
+ struct llist_node node;
+};
+
+/*
+ * We need two dummy head nodes for lockless list manipulations from reclaim
+ * worker - first dummy node will be used in current reclaim iteration;
+ * the second one will be used in next iteration. Next iteration marks
+ * the first dummy node as free, for use in following iteration.
+ */
+#define AA_LABEL_RECLAIM_NODE_MAX 2
+
+#define AA_MAX_LABEL_RECLAIMS 100
+#define AA_LABEL_RECLAIM_INTERVAL_MS 5000
+
+static LLIST_HEAD(aa_label_reclaim_head);
+static struct llist_node *last_reclaim_label;
+static struct aa_label_reclaim_node aa_label_reclaim_nodes[AA_LABEL_RECLAIM_NODE_MAX];
+static DECLARE_DELAYED_WORK(aa_label_reclaim_work, aa_label_reclaim_work_fn);
+
+void aa_label_reclaim_add_label(struct aa_label *label)
+{
+ percpu_ref_get(&label->count);
+ llist_add(&label->reclaim_node, &aa_label_reclaim_head);
+}
+
+static bool aa_label_is_reclaim_node(struct llist_node *node)
+{
+ return &aa_label_reclaim_nodes[0].node <= node &&
+ node <= &aa_label_reclaim_nodes[AA_LABEL_RECLAIM_NODE_MAX - 1].node;
+}
+
+static struct llist_node *aa_label_get_reclaim_node(void)
+{
+ int i;
+ struct aa_label_reclaim_node *rn;
+
+ for (i = 0; i < AA_LABEL_RECLAIM_NODE_MAX; i++) {
+ rn = &aa_label_reclaim_nodes[i];
+ if (!rn->inuse) {
+ rn->inuse = true;
+ return &rn->node;
+ }
+ }
+
+ return NULL;
+}
+
+static void aa_label_put_reclaim_node(struct llist_node *node)
+{
+ struct aa_label_reclaim_node *rn = container_of(node, struct aa_label_reclaim_node, node);
+
+ rn->inuse = false;
+}
+
+static void aa_put_all_reclaim_nodes(void)
+{
+ int i;
+
+ for (i = 0; i < AA_LABEL_RECLAIM_NODE_MAX; i++)
+ aa_label_reclaim_nodes[i].inuse = false;
+}
+
+static void aa_label_reclaim_work_fn(struct work_struct *work)
+{
+ struct llist_node *pos, *first, *head, *prev, *next;
+ struct llist_node *reclaim_node;
+ struct aa_label *label;
+ int cnt = 0;
+ bool held;
+
+ first = aa_label_reclaim_head.first;
+ if (!first)
+ goto queue_reclaim_work;
+
+ if (last_reclaim_label == NULL || last_reclaim_label->next == NULL) {
+ reclaim_node = aa_label_get_reclaim_node();
+ WARN_ONCE(!reclaim_node, "Reclaim heads exhausted\n");
+ if (unlikely(!reclaim_node)) {
+ head = first->next;
+ if (!head) {
+ aa_put_all_reclaim_nodes();
+ goto queue_reclaim_work;
+ }
+ prev = first;
+ } else {
+ llist_add(reclaim_node, &aa_label_reclaim_head);
+ prev = reclaim_node;
+ head = prev->next;
+ }
+ } else {
+ prev = last_reclaim_label;
+ head = prev->next;
+ }
+
+ last_reclaim_label = NULL;
+ llist_for_each_safe(pos, next, head) {
+ /* Free reclaim node, which is present in the list */
+ if (aa_label_is_reclaim_node(pos)) {
+ prev->next = pos->next;
+ aa_label_put_reclaim_node(pos);
+ continue;
+ }
+
+ label = container_of(pos, struct aa_label, reclaim_node);
+ percpu_ref_switch_to_atomic_sync(&label->count);
+ rcu_read_lock();
+ percpu_ref_put(&label->count);
+ held = percpu_ref_tryget(&label->count);
+ if (!held)
+ prev->next = pos->next;
+ rcu_read_unlock();
+ if (!held)
+ continue;
+ percpu_ref_switch_to_percpu(&label->count);
+ cnt++;
+ if (cnt == AA_MAX_LABEL_RECLAIMS) {
+ last_reclaim_label = pos;
+ break;
+ }
+ prev = pos;
+ }
+
+queue_reclaim_work:
+ queue_delayed_work(aa_label_reclaim_wq, &aa_label_reclaim_work,
+ msecs_to_jiffies(AA_LABEL_RECLAIM_INTERVAL_MS));
+}
+
/*
* LSM hook functions
*/
@@ -2277,6 +2414,14 @@ static int __init apparmor_init(void)
aa_free_root_ns();
goto buffers_out;
}
+
+ aa_label_reclaim_wq = alloc_workqueue("aa_label_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+ WARN_ON(!aa_label_reclaim_wq);
+ if (aa_label_reclaim_wq)
+ queue_delayed_work(aa_label_reclaim_wq, &aa_label_reclaim_work,
+ AA_LABEL_RECLAIM_INTERVAL_MS);
+
security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
&apparmor_lsmid);
Nginx performance testing with Apparmor enabled (with nginx running in unconfined profile), on kernel versions 6.1 and 6.5 show significant drop in throughput scalability, when Nginx workers are scaled to higher number of CPUs across various L3 cache domains. Below is one sample data on the throughput scalability loss, based on results on AMD Zen4 system with 96 CPUs with SMT core count 2; so, overall, 192 CPUs: Config Cache Domains apparmor=off apparmor=on scaling eff (%) scaling eff (%) 8C16T 1 100% 100% 16C32T 2 95% 94% 24C48T 3 94% 93% 48C96T 6 92% 88% 96C192T 12 85% 68% There is a significant drop in scaling efficiency, when we move to 96 CPUs/192 SMT threads. Perf tool shows most of the contention coming from below 6.56% nginx [kernel.vmlinux] [k] apparmor_current_getsecid_subj 6.22% nginx [kernel.vmlinux] [k] apparmor_file_open The majority of the CPU cycles is found to be due to memory contention in atomic_fetch_add and atomic_fetch_sub operations from kref_get() and kref_put() operations on label. A part of the contention was remove with commit 2516fde1fa00 ("apparmor: Optimize retrieving current task secid"), which is part of 6.7-rc1 release. After including this commit, the scaling efficiency improved as shown below: Config Cache Domains apparmor=on apparmor=on (patched) scaling eff (%) scaling eff (%) 8C16T 1 100% 100% 16C32T 2 97% 93% 24C48T 3 94% 92% 48C96T 6 88% 88% 96C192T 12 65% 79% However, the scaling efficiency impact is still significant even after including the commit. In addition, the performance impact is even higher when we move to >192 CPUs. In addition, the memory contention impact would increase whem there is a high frequency of label update operations and labels are marked stale more frequently. This patch adds a mechanism to manage reclaim of apparmor labels, when they are working in percpu mode. Using percpu refcount for apparmor label refcount helps solve the throughput scalability drop problem seen on nginx. Config Cache Domains apparmor=on (percpuref) scaling eff (%) 8C16T 1 100% 16C32T 2 96% 24C48T 3 94% 48C96T 6 93% 96C192T 12 90% Below is the sequence of operations, which shows the refcount management with this approach: 1. During label initialization, the percpu ref is initialized in atomic mode. This is done to ensure that, for cases where the label hasn't gone live (->ns isn't assigned), mostly during initialization error paths. 2. Labels are switched to percpu mode at various points - when a label is added to labelset tree, when a unconfined profile has been assigned a namespace. 3. As part of the initial prototype, only the in tree labels are managed by the kworker. These labels are added to a lockless list. The unconfined labels invoke a percpu_ref_kill() operation when the namespace is destroyed. 4. The kworker does a periodic scan of all the labels in the llist. It does below sequence of operations: a. Enqueue a dummy node to mark the start of scan. This dummy node is used as start point of scan and ensures that we there is no additional synchronization required with new label node additions to the llist. Any new labels will be processed in next run of the kworker. SCAN START PTR | v +----------+ +------+ +------+ +------+ | | | | | | | | | head ------> dummy|--->|label |--->| label|--->NULL | | | node | | | | | +----------+ +------+ +------+ +------+ New label addition: SCAN START PTR | v +----------+ +------+ +------+ +------+ +------+ | | | | | | | | | | | head |--> label|--> dummy|--->|label |--->| label|--->NULL | | | | | node | | | | | +----------+ +------+ +------+ +------+ +------+ b. Traverse through the llist, starting from dummy->next. If the node is a dummy node, mark it free. If the node is a label node, do, i) Switch the label ref to atomic mode. The ref switch wait for the existing percpu_ref_get() and percpu_ref_put() operations to complete, by waiting for a RCU grace period. Once the switch is complete, from this point onwards, any percpu_ref_get(), percpu_ref_put() operations use atomic operations. ii) Drop the initial reference, which was taken while adding the label node to the llist. iii) Use a percpu_ref_tryget() increment operation on the ref, to see if we dropped the last ref count. if we dropped the last count, we remove the node from the llist. All of these operations are done inside a RCU critical section, to avoid race with the release operations, which can potentially trigger, as soon as we drop the initial ref count. iv) If we didn't drop the last ref, switch back the counter to percpu mode. Signed-off-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com> --- security/apparmor/include/label.h | 3 + security/apparmor/lsm.c | 145 ++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+)