@@ -10,12 +10,15 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
+#include <linux/hash.h>
#include <linux/irq_work.h>
+#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
+#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
@@ -82,6 +85,10 @@ static const struct kernel_param_ops sam
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+/* Pool usage% threshold when currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __ro_after_init;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -105,6 +112,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocatio
/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probablity of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
+ */
+#define ALLOC_COVERED_HNUM 2
+#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Randomness for stack hashes, making the same collisions across reboots and
+ * different machines less likely.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
/* Statistics counters for debugfs. */
enum kfence_counter_id {
KFENCE_COUNTER_ALLOCATED,
@@ -114,6 +147,7 @@ enum kfence_counter_id {
KFENCE_COUNTER_BUGS,
KFENCE_COUNTER_SKIP_INCOMPAT,
KFENCE_COUNTER_SKIP_CAPACITY,
+ KFENCE_COUNTER_SKIP_COVERED,
KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -125,11 +159,57 @@ static const char *const counter_names[]
[KFENCE_COUNTER_BUGS] = "total bugs",
[KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
[KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
+ [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
/* === Internals ============================================================ */
+static inline bool should_skip_covered(void)
+{
+ unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+ return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+ num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+ num_entries = filter_irq_stacks(stack_entries, num_entries);
+ return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash from Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+ return false;
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+
+ return true;
+}
+
static bool kfence_protect(unsigned long addr)
{
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -269,7 +349,8 @@ static __always_inline void for_each_can
}
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
- unsigned long *stack_entries, size_t num_stack_entries)
+ unsigned long *stack_entries, size_t num_stack_entries,
+ u32 alloc_stack_hash)
{
struct kfence_metadata *meta = NULL;
unsigned long flags;
@@ -332,6 +413,8 @@ static void *kfence_guarded_alloc(struct
/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
WRITE_ONCE(meta->cache, cache);
meta->size = size;
+ meta->alloc_stack_hash = alloc_stack_hash;
+
for_each_canary(meta, set_canary_byte);
/* Set required struct page fields. */
@@ -344,6 +427,8 @@ static void *kfence_guarded_alloc(struct
raw_spin_unlock_irqrestore(&meta->lock, flags);
+ alloc_covered_add(alloc_stack_hash, 1);
+
/* Memory initialization. */
/*
@@ -412,6 +497,8 @@ static void kfence_guarded_free(void *ad
raw_spin_unlock_irqrestore(&meta->lock, flags);
+ alloc_covered_add(meta->alloc_stack_hash, -1);
+
/* Protect to detect use-after-frees. */
kfence_protect((unsigned long)addr);
@@ -677,6 +764,7 @@ void __init kfence_init(void)
if (!kfence_sample_interval)
return;
+ stack_hash_seed = (u32)random_get_entropy();
if (!kfence_init_pool()) {
pr_err("%s failed\n", __func__);
return;
@@ -752,6 +840,7 @@ void *__kfence_alloc(struct kmem_cache *
{
unsigned long stack_entries[KFENCE_STACK_DEPTH];
size_t num_stack_entries;
+ u32 alloc_stack_hash;
/*
* Perform size check before switching kfence_allocation_gate, so that
@@ -799,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *
num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
- return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
+ /*
+ * Do expensive check for coverage of allocation in slow-path after
+ * allocation_gate has already become non-zero, even though it might
+ * mean not making any allocation within a given sample interval.
+ *
+ * This ensures reasonable allocation coverage when the pool is almost
+ * full, including avoiding long-lived allocations of the same source
+ * filling up the pool (e.g. pagecache allocations).
+ */
+ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+ if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+ return NULL;
+ }
+
+ return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+ alloc_stack_hash);
}
size_t kfence_ksize(const void *addr)
@@ -87,6 +87,8 @@ struct kfence_metadata {
/* Allocation and free stack information. */
struct kfence_track alloc_track;
struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
};
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];