@@ -19,6 +19,7 @@ struct bpf_mem_alloc {
enum {
BPF_MA_PERCPU = 1U << 0,
BPF_MA_REUSE_AFTER_RCU_GP = 1U << 1,
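+	/* Free objects back to the kernel allocators only after an RCU grace period */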
+ BPF_MA_FREE_AFTER_RCU_GP = 1U << 2,
};
/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
@@ -63,7 +63,26 @@ static u8 size_index[24] __ro_after_init = {
2 /* 192 */
};
+#define BPF_MA_FREE_TYPE_NR 2
+
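+/* Per-CPU context for objects freed under BPF_MA_FREE_AFTER_RCU_GP.
+ * Freed objects are staged on the lists below and handed back to the
+ * kernel allocators by dwork after an RCU grace period.
+ */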
+struct bpf_ma_free_ctx {
+ raw_spinlock_t lock;
+ int cpu;
+ local_t active;
+	/* Index 0 of the arrays below is for non-percpu objects, index 1 for percpu */
+ struct llist_head to_free[BPF_MA_FREE_TYPE_NR];
+ unsigned int to_free_cnt[BPF_MA_FREE_TYPE_NR];
+ struct llist_head to_free_extra[BPF_MA_FREE_TYPE_NR];
+ struct delayed_work dwork;
+};
+
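+/* Carries the spliced to_free lists through call_rcu(); index 0 holds
+ * non-percpu objects and index 1 holds percpu objects.
+ */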
+struct bpf_free_batch {
+ struct rcu_head rcu;
+ struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+};
+
static struct workqueue_struct *bpf_ma_wq;
+static DEFINE_PER_CPU(struct bpf_ma_free_ctx, percpu_free_ctx);
static void bpf_ma_prepare_reuse_work(struct work_struct *work);
@@ -910,6 +929,112 @@ static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_n
irq_work_raise(c);
}
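+/* RCU callback: a grace period has passed, so the batched objects can be
+ * returned to the kernel allocators.
+ */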
+static void bpf_ma_batch_free_cb(struct rcu_head *rcu)
+{
+ struct bpf_free_batch *batch = container_of(rcu, struct bpf_free_batch, rcu);
+
+ free_all(batch->to_free[0], false);
+ free_all(batch->to_free[1], true);
+ kfree(batch);
+}
+
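+/* Schedule the free dwork: run it immediately once at least 256 objects
+ * have accumulated, otherwise after one second, shrinking an already
+ * pending delay if necessary. Called with ctx->lock held.
+ */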
+static void bpf_ma_schedule_free_dwork(struct bpf_ma_free_ctx *ctx)
+{
+ long delay, left;
+ u64 to_free_cnt;
+
+	/* TODO: use a more reasonable threshold? */
+ to_free_cnt = ctx->to_free_cnt[0] + ctx->to_free_cnt[1];
+ delay = to_free_cnt >= 256 ? 0 : HZ;
+ if (delayed_work_pending(&ctx->dwork)) {
+ left = ctx->dwork.timer.expires - jiffies;
+ if (delay < left)
+ mod_delayed_work(bpf_ma_wq, &ctx->dwork, delay);
+ return;
+ }
+ queue_delayed_work(bpf_ma_wq, &ctx->dwork, delay);
+}
+
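+/* Atomically take all nodes from @llist and prepend them to the list
+ * headed by *@head.
+ */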
+static void splice_llist(struct llist_head *llist, struct llist_node **head)
+{
+ struct llist_node *first, *last;
+
+ first = llist_del_all(llist);
+ if (!first)
+ return;
+
+ last = first;
+ while (last->next)
+ last = last->next;
+ last->next = *head;
+ *head = first;
+}
+
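+/* Splice out every object queued on @ctx. On the owning CPU the active
+ * counter is raised so that an NMI-context free diverts to to_free_extra
+ * instead of spinning on the already-held lock; on other CPUs the lock
+ * alone serializes against wait_gp_direct_free().
+ */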
+static void bpf_ma_splice_to_free_list(struct bpf_ma_free_ctx *ctx, struct llist_node **to_free)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+	/* Might be interrupted by an NMI that invokes unit_free() */
+ if (ctx->cpu == smp_processor_id())
+ WARN_ON_ONCE(local_inc_return(&ctx->active) != 1);
+ raw_spin_lock(&ctx->lock);
+ to_free[0] = __llist_del_all(&ctx->to_free[0]);
+ to_free[1] = __llist_del_all(&ctx->to_free[1]);
+ ctx->to_free_cnt[0] = 0;
+ ctx->to_free_cnt[1] = 0;
+ raw_spin_unlock(&ctx->lock);
+ if (ctx->cpu == smp_processor_id())
+ local_dec(&ctx->active);
+ local_irq_restore(flags);
+
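+	/* to_free_extra is only accessed with atomic llist helpers,
+	 * so it can be drained without holding the lock.
+	 */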
+ splice_llist(&ctx->to_free_extra[0], &to_free[0]);
+ splice_llist(&ctx->to_free_extra[1], &to_free[1]);
+}
+
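+/* Delayed work: hand the queued objects back to the kernel allocators
+ * after an RCU grace period has elapsed.
+ */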
+static void bpf_ma_free_dwork(struct work_struct *work)
+{
+ struct bpf_ma_free_ctx *ctx = container_of(to_delayed_work(work),
+ struct bpf_ma_free_ctx, dwork);
+ struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+ struct bpf_free_batch *batch;
+
+ bpf_ma_splice_to_free_list(ctx, to_free);
+
+ batch = kmalloc(sizeof(*batch), GFP_KERNEL);
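+	/* If the batch cannot be allocated, fall back to an expedited
+	 * grace period and free the objects inline.
+	 */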
+ if (!batch) {
+ synchronize_rcu_expedited();
+ free_all(to_free[0], false);
+ free_all(to_free[1], true);
+ return;
+ }
+
+ batch->to_free[0] = to_free[0];
+ batch->to_free[1] = to_free[1];
+ call_rcu(&batch->rcu, bpf_ma_batch_free_cb);
+}
+
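+/* Queue a freed object on the per-CPU free context so it is returned to
+ * the kernel allocators only after an RCU grace period, instead of being
+ * reused immediately.
+ */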
+static void notrace wait_gp_direct_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+ bool percpu = !!c->percpu_size;
+ struct bpf_ma_free_ctx *ctx;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ctx = this_cpu_ptr(&percpu_free_ctx);
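+	/* active != 1 means we interrupted a section that may already hold
+	 * the lock (e.g. an NMI hit during a free on this CPU), so divert
+	 * to the lock-free to_free_extra list instead.
+	 */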
+ if (local_inc_return(&ctx->active) == 1) {
+ raw_spin_lock(&ctx->lock);
+ __llist_add(llnode, &ctx->to_free[percpu]);
+ ctx->to_free_cnt[percpu] += 1;
+ bpf_ma_schedule_free_dwork(ctx);
+ raw_spin_unlock(&ctx->lock);
+ } else {
+ llist_add(llnode, &ctx->to_free_extra[percpu]);
+ }
+ local_dec(&ctx->active);
+ local_irq_restore(flags);
+}
+
static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
struct llist_node *llnode = ptr - LLIST_NODE_SZ;
@@ -918,6 +1043,8 @@ static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP)
wait_gp_reuse_free(c, llnode);
+ else if (c->flags & BPF_MA_FREE_AFTER_RCU_GP)
+ wait_gp_direct_free(c, llnode);
else
immediate_reuse_free(c, llnode);
}
@@ -1016,8 +1143,20 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
static int __init bpf_ma_init(void)
{
+ int cpu;
+
bpf_ma_wq = alloc_workqueue("bpf_ma", WQ_MEM_RECLAIM, 0);
BUG_ON(!bpf_ma_wq);
+
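+	/* Set up the per-CPU contexts used for BPF_MA_FREE_AFTER_RCU_GP */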
+ for_each_possible_cpu(cpu) {
+ struct bpf_ma_free_ctx *ctx;
+
+ ctx = per_cpu_ptr(&percpu_free_ctx, cpu);
+ raw_spin_lock_init(&ctx->lock);
+ ctx->cpu = cpu;
+ INIT_DELAYED_WORK(&ctx->dwork, bpf_ma_free_dwork);
+ }
+
return 0;
}
late_initcall(bpf_ma_init);