@@ -1,8 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-
+#define _GNU_SOURCE
#include <test_progs.h>
#include <network_helpers.h>
+#include <pthread.h>
+#include <sched.h>
#include "refcounted_kptr.skel.h"
#include "refcounted_kptr_fail.skel.h"
@@ -16,3 +18,103 @@ void test_refcounted_kptr_fail(void)
{
RUN_TESTS(refcounted_kptr_fail);
}
+
+static void force_cpu(pthread_t thread, int cpunum)
+{
+ cpu_set_t cpuset;
+ int err;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpunum, &cpuset);
+ err = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
+ ASSERT_OK(err, "pthread_setaffinity_np");
+}
+
+/* Shared with the helper thread spawned in test_refcounted_kptr_races() */
+static struct refcounted_kptr *skel;
+
+static void *run_unstash_acq_ref(void *unused)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+ int unstash_acq_ref_fd;
+ long ret;
+
+ force_cpu(pthread_self(), 1);
+
+ unstash_acq_ref_fd = bpf_program__fd(skel->progs.unstash_add_and_acquire_refcount);
+
+ ret = bpf_prog_test_run_opts(unstash_acq_ref_fd, &opts);
+ ASSERT_OK(ret, "unstash_add_and_acquire_refcount run");
+ ASSERT_EQ(opts.retval, 0, "unstash_add_and_acquire_refcount retval");
+ ASSERT_EQ(skel->bss->ref_check_3, 2, "ref_check_3");
+ ASSERT_EQ(skel->bss->ref_check_4, 1, "ref_check_4");
+ ASSERT_EQ(skel->bss->ref_check_5, 0, "ref_check_5");
+ pthread_exit((void *)ret);
+}
+
+void test_refcounted_kptr_races(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+ int ref_acq_lock_fd, ref_acq_unlock_fd, rem_node_lock_fd;
+ int add_stash_fd, remove_tree_fd;
+ pthread_t thread_id;
+ int ret;
+
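+ /* Pin the main thread to CPU 0 and the helper thread (created below) to
+  * CPU 1 so the interleaving mirrors the two-CPU scenario described in
+  * progs/refcounted_kptr.c
+  */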
+ force_cpu(pthread_self(), 0);
+ skel = refcounted_kptr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load"))
+ return;
+
+ add_stash_fd = bpf_program__fd(skel->progs.add_refcounted_node_to_tree_and_stash);
+ remove_tree_fd = bpf_program__fd(skel->progs.remove_refcounted_node_from_tree);
+ ref_acq_lock_fd = bpf_program__fd(skel->progs.unsafe_ref_acq_lock);
+ ref_acq_unlock_fd = bpf_program__fd(skel->progs.unsafe_ref_acq_unlock);
+ rem_node_lock_fd = bpf_program__fd(skel->progs.unsafe_rem_node_lock);
+
+ ret = bpf_prog_test_run_opts(rem_node_lock_fd, &opts);
+ if (!ASSERT_OK(ret, "rem_node_lock"))
+ return;
+
+ ret = bpf_prog_test_run_opts(ref_acq_lock_fd, &opts);
+ if (!ASSERT_OK(ret, "ref_acq_lock"))
+ return;
+
+ ret = bpf_prog_test_run_opts(add_stash_fd, &opts);
+ if (!ASSERT_OK(ret, "add_stash"))
+ return;
+ if (!ASSERT_OK(opts.retval, "add_stash retval"))
+ return;
+
+ ret = pthread_create(&thread_id, NULL, &run_unstash_acq_ref, NULL);
+ if (!ASSERT_OK(ret, "pthread_create"))
+ goto cleanup;
+
+ force_cpu(thread_id, 1);
+
+ /* This program executes before unstash_acq_ref's bpf_refcount_acquire;
+  * unstash_acq_ref can only proceed once ref_acq_unlock below releases
+  * ref_acq_lock
+  */
+ ret = bpf_prog_test_run_opts(remove_tree_fd, &opts);
+ if (!ASSERT_OK(ret, "remove_tree"))
+ goto cleanup;
+
+ ret = bpf_prog_test_run_opts(ref_acq_unlock_fd, &opts);
+ if (!ASSERT_OK(ret, "ref_acq_unlock"))
+ goto cleanup;
+
+ ret = pthread_join(thread_id, NULL);
+ if (!ASSERT_OK(ret, "pthread_join"))
+ goto cleanup;
+
+ refcounted_kptr__destroy(skel);
+ return;
+cleanup:
+ bpf_prog_test_run_opts(ref_acq_unlock_fd, &opts);
+destroy:
+ refcounted_kptr__destroy(skel);
+ return;
+}
diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -39,9 +39,20 @@ private(A) struct bpf_spin_lock lock;
private(A) struct bpf_rb_root root __contains(node_data, r);
private(A) struct bpf_list_head head __contains(node_data, l);
+private(C) struct bpf_spin_lock lock2;
+private(C) struct bpf_rb_root root2 __contains(node_data, r);
+
private(B) struct bpf_spin_lock alock;
private(B) struct bpf_rb_root aroot __contains(node_acquire, node);
+private(D) struct bpf_spin_lock ref_acq_lock;
+private(E) struct bpf_spin_lock rem_node_lock;
+
+/* Provided by bpf_testmod */
+extern void bpf__unsafe_spin_lock(void *lock__ign) __ksym;
+extern void bpf__unsafe_spin_unlock(void *lock__ign) __ksym;
+extern volatile int bpf_refcount_read(void *refcount__ign) __ksym;
+
static bool less(struct bpf_rb_node *node_a, const struct bpf_rb_node *node_b)
{
struct node_data *a;
@@ -405,4 +416,151 @@ long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx)
return 0;
}
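+
+/* The following three trivial programs let userspace grab and release the
+ * unsafe test locks at precise points in order to force the interleaving
+ * described in the long comment below
+ */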
+SEC("tc")
+long unsafe_ref_acq_lock(void *ctx)
+{
+ bpf__unsafe_spin_lock(&ref_acq_lock);
+ return 0;
+}
+
+SEC("tc")
+long unsafe_ref_acq_unlock(void *ctx)
+{
+ bpf__unsafe_spin_unlock(&ref_acq_lock);
+ return 0;
+}
+
+SEC("tc")
+long unsafe_rem_node_lock(void *ctx)
+{
+ bpf__unsafe_spin_lock(&rem_node_lock);
+ return 0;
+}
+
+/* The following 3 progs are used in concert to test a bpf_refcount-related
+ * race. Consider the following pseudocode interleaving of rbtree operations:
+ *
+ * (Assumptions: n, m, o, p, q are pointers to nodes, t1 and t2 are different
+ * rbtrees, l1 and l2 are locks accompanying the trees, mapval is some
+ * kptr_xchg'able ptr_to_map_value. A single node is being manipulated by both
+ * programs. Irrelevant error-checking and casting are omitted.)
+ *
+ * CPU 0                              CPU 1
+ * ---------------------------------|------------------------------
+ * n = bpf_obj_new [0]              |
+ * lock(l1)                         |
+ * bpf_rbtree_add(t1, &n->r, less)  |
+ * m = bpf_refcount_acquire(n) [1]  |
+ * unlock(l1)                       |
+ * kptr_xchg(mapval, m) [2]         |
+ * ----------------------------------------------------------------
+ *                                  | o = kptr_xchg(mapval, NULL) [3]
+ *                                  | lock(l2)
+ *                                  | rbtree_add(t2, &o->r, less) [4]
+ * ----------------------------------------------------------------
+ * lock(l1)                         |
+ * p = rbtree_first(t1)             |
+ * p = rbtree_remove(t1, p)         |
+ * unlock(l1)                       |
+ * if (p)                           |
+ *   bpf_obj_drop(p) [5]            |
+ * ----------------------------------------------------------------
+ *                                  | q = bpf_refcount_acquire(o) [6]
+ *                                  | unlock(l2)
+ *
+ * If bpf_refcount_acquire can't fail, the sequence of operations on the node's
+ * refcount is:
+ * [0] - refcount initialized to 1
+ * [1] - refcount bumped to 2
+ * [2] - refcount is still 2, but m's ownership passed to mapval
+ * [3] - refcount is still 2, mapval's ownership passed to o
+ * [4] - refcount is decr'd to 1, rbtree_add fails because the node is
+ *       already in t1; o is converted to a non-owning reference
+ * [5] - refcount is decr'd to 0, node free'd
+ * [6] - refcount is incr'd to 1 from 0, ERROR
+ *
+ * To prevent [6], bpf_refcount_acquire was made fallible. This interleaving
+ * is used to test the fallible refcount_acquire.
+ *
+ * The two halves of CPU 0's operations are implemented by
+ * add_refcounted_node_to_tree_and_stash and remove_refcounted_node_from_tree.
+ * We can't split CPU 1's operations the same way because they all occur
+ * within the l2 critical section.
+ * Instead, bpf__unsafe_spin_{lock, unlock} are used to ensure the expected
+ * order of operations.
+ */
+
+SEC("tc")
+long add_refcounted_node_to_tree_and_stash(void *ctx)
+{
+ long err;
+
+ err = __stash_map_insert_tree(0, 42, &root, &lock);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+SEC("tc")
+long remove_refcounted_node_from_tree(void *ctx)
+{
+ long ret = 0;
+
+ /* rem_node_lock is held by another program to force race */
+ bpf__unsafe_spin_lock(&rem_node_lock);
+ ret = __read_from_tree(&root, &lock, true);
+
+ /* Release rem_node_lock on the error path as well so it isn't leaked */
+ bpf__unsafe_spin_unlock(&rem_node_lock);
+ if (ret != 42)
+ return ret;
+
+ return 0;
+}
+
+/* ref_check_n numbers correspond to refcount operation points in comment above */
+int ref_check_3, ref_check_4, ref_check_5;
+
+SEC("tc")
+long unstash_add_and_acquire_refcount(void *ctx)
+{
+ struct map_value *mapval;
+ struct node_data *n, *m;
+ int idx = 0;
+
+ mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
+ if (!mapval)
+ return -1;
+
+ n = bpf_kptr_xchg(&mapval->node, NULL);
+ if (!n)
+ return -2;
+ ref_check_3 = bpf_refcount_read(&n->ref);
+
+ bpf_spin_lock(&lock2);
+ bpf_rbtree_add(&root2, &n->r, less);
+ ref_check_4 = bpf_refcount_read(&n->ref);
+
+ /* Let CPU 0 do first->remove->drop */
+ bpf__unsafe_spin_unlock(&rem_node_lock);
+
+ /* ref_acq_lock is held by another program to force the race; by the time
+  * this program acquires the lock, remove_refcounted_node_from_tree has
+  * finished
+  */
+ bpf__unsafe_spin_lock(&ref_acq_lock);
+ ref_check_5 = bpf_refcount_read(&n->ref);
+
+ /* Point [6] in the long comment above: without fallible refcount_acquire
+  * this would be a use-after-free incr
+  */
+ m = bpf_refcount_acquire(n);
+ bpf__unsafe_spin_unlock(&ref_acq_lock);
+
+ bpf_spin_unlock(&lock2);
+
+ if (m) {
+ bpf_obj_drop(m);
+ return -3;
+ }
+
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
The selftest added in this patch is the exact scenario described by Kumar
in [0] and fixed by earlier patches in this series. The long comment added
in progs/refcounted_kptr.c restates the use-after-free scenario. The added
test uses bpf__unsafe_spin_{lock, unlock} to force the specific problematic
interleaving we're interested in testing, and bpf_refcount_read to confirm
refcount incr/decr work as expected.

  [0]: https://lore.kernel.org/bpf/atfviesiidev4hu53hzravmtlau3wdodm2vqs7rd7tnwft34e3@xktodqeqevir/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
---
 .../bpf/prog_tests/refcounted_kptr.c          | 104 +++++++++++-
 .../selftests/bpf/progs/refcounted_kptr.c     | 158 ++++++++++++++++++
 2 files changed, 261 insertions(+), 1 deletion(-)
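For context, the calling pattern that a fallible bpf_refcount_acquire() demands
looks roughly like the sketch below. This is not part of this patch; all names
(sketch_node, sketch_root, sketch_acquire_may_fail, ...) are illustrative, and
the rbtree/refcount kfunc declarations are assumed to come from the selftests'
bpf_experimental.h. The only point it demonstrates is that callers must handle
a NULL return, which is what prevents step [6] above from becoming a
use-after-free.

/* Hypothetical sketch, not part of this patch: handling a fallible
 * bpf_refcount_acquire(). Names are illustrative only.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

struct sketch_node {
	long key;
	struct bpf_rb_node r;
	struct bpf_refcount ref;
};

#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
private(S) struct bpf_spin_lock sketch_lock;
private(S) struct bpf_rb_root sketch_root __contains(sketch_node, r);

static bool sketch_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	/* Ordering is irrelevant for this sketch */
	return true;
}

SEC("tc")
long sketch_acquire_may_fail(void *ctx)
{
	struct sketch_node *n, *m;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 1;

	bpf_spin_lock(&sketch_lock);
	bpf_rbtree_add(&sketch_root, &n->r, sketch_less);
	/* n is now a non-owning reference; acquiring it can fail if the
	 * refcount already hit zero on another CPU, so check for NULL
	 */
	m = bpf_refcount_acquire(n);
	bpf_spin_unlock(&sketch_lock);

	if (!m)
		return 2;
	bpf_obj_drop(m);
	return 0;
}

char _license[] SEC("license") = "GPL";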