Message ID | 20240819165939.745801-10-kent.overstreet@linux.dev (mailing list archive) |
---|---|
State | New, archived |
Series | rcu_pending |
On Mon, Aug 19, 2024 at 12:59:35PM -0400, Kent Overstreet wrote:
> This nets us a slight performance increase, and converts to common
> code.

How did you measure the performance, and what was the actual amount
of the increase?

Either way, the performance increase needs to be verified by the people
for whom the current code works well. As noted a couple months ago,
I am having a hard time imagining a tree beating a linked list of pages
of pointers in terms of cache locality. I added linux-arch on CC in
case I am out of date on how computer systems work.

In the meantime, I need you to hold off on this one.

							Thanx, Paul

> Todo - re-add the shrinker, so that memory reclaim can free expired
> objects and expedite a grace period when necessary.
>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
>  include/linux/rcu_pending.h |   2 +
>  init/main.c                 |   2 +
>  kernel/rcu/pending.c        |  20 +
>  kernel/rcu/tree.c           | 746 ------------------------------------
>  kernel/rcu/update.c         |   1 -
>  5 files changed, 24 insertions(+), 747 deletions(-)
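For readers following along, the page-of-pointers layout Paul refers to is the struct kvfree_rcu_bulk_data removed further down in this patch: a small header plus a flexible array of object pointers sized to fill exactly one page, so reclaim reads the queued pointers as a linear stream. Below is a stand-alone user-space sketch of that layout; the structure fields and the sizing rule mirror the removed code, while the list walk and main() are purely illustrative (the real code uses list_heads, kfree_bulk() and vfree()).

```c
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Mirrors struct kvfree_rcu_bulk_data from the code being removed:
 * a header followed by an array of object pointers filling one page. */
struct bulk_block {
	struct bulk_block *next;	/* stand-in for the kernel's list_head */
	unsigned long nr_records;
	void *records[];		/* flexible array of queued pointers */
};

/* Same sizing rule as KVFREE_BULK_MAX_ENTR in the removed code. */
#define BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct bulk_block)) / sizeof(void *))

/*
 * Reclaim walks each page-sized block front to back, so the pointers being
 * freed are read as one sequential stream per page rather than chased
 * through per-object links.
 */
static void walk_blocks(struct bulk_block *b, void (*free_fn)(void *))
{
	for (; b; b = b->next)
		for (unsigned long i = 0; i < b->nr_records; i++)
			free_fn(b->records[i]);
}

int main(void)
{
	printf("pointers per page-sized block: %zu\n", (size_t)BULK_MAX_ENTR);
	walk_blocks(NULL, free);	/* nothing queued in this sketch */
	return 0;
}
```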
On Mon, Aug 19, 2024 at 03:18:02PM GMT, Paul E. McKenney wrote:
> On Mon, Aug 19, 2024 at 12:59:35PM -0400, Kent Overstreet wrote:
> > This nets us a slight performance increase, and converts to common
> > code.
>
> How did you measure the performance, and what was the actual amount
> of the increase?
>
> Either way, the performance increase needs to be verified by the people
> for whom the current code works well. As noted a couple months ago,
> I am having a hard time imagining a tree beating a linked list of pages
> of pointers in terms of cache locality. I added linux-arch on CC in
> case I am out of date on how computer systems work.

The point isn't performance; I was just noting what I saw, and the small
performance increase came from careful optimization, nothing special.
The old and new versions are close enough that it doesn't matter.

The point of the patchset is to consolidate kvfree_rcu() and
SLAB_TYPESAFE_BY_RCU into something more general, and dare I say it,
something a bit saner.
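For a sense of what that consolidation would buy, here is a rough sketch of what a consumer of rcu_pending might look like. It is based only on the rcu_pending_init() prototype and the __rcu_pending_enqueue() call visible in this patch; the process-callback signature, the foo_* names, and the use of __rcu_pending_enqueue() as an entry point outside kernel/rcu/pending.c are assumptions, not part of the series.

```c
#include <linux/init.h>
#include <linux/rcu_pending.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical consumer: defer freeing of struct foo objects until after
 * a grace period, using one rcu_pending instance instead of bespoke
 * per-subsystem batching. */
struct foo {
	struct rcu_head	rcu;
	/* ... payload ... */
};

static struct rcu_pending foo_pending;

/* Assumed callback shape: invoked once the enqueued object's grace period
 * has elapsed; only rcu_pending_init()'s prototype is shown in the patch. */
static void foo_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct foo, rcu));
}

static int __init foo_pending_setup(void)
{
	/* NULL: plain RCU rather than an srcu_struct, as in the kvfree hookup. */
	return rcu_pending_init(&foo_pending, NULL, foo_process);
}

static void foo_free_rcu(struct foo *f)
{
	/* Mirrors kvfree_call_rcu() in the patch: enqueue with an embedded
	 * rcu_head, so the final (headless/may-sleep) argument is false. */
	__rcu_pending_enqueue(&foo_pending, &f->rcu, f, false);
}
```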
diff --git a/include/linux/rcu_pending.h b/include/linux/rcu_pending.h index a875c640da8d..5ef6392ce180 100644 --- a/include/linux/rcu_pending.h +++ b/include/linux/rcu_pending.h @@ -22,4 +22,6 @@ int rcu_pending_init(struct rcu_pending *pending, struct srcu_struct *srcu, rcu_pending_process_fn process); +void __init kvfree_rcu_pending_init(void); + #endif /* _LINUX_RCU_PENDING_H */ diff --git a/init/main.c b/init/main.c index 206acdde51f5..a3f0fd6ec3da 100644 --- a/init/main.c +++ b/init/main.c @@ -43,6 +43,7 @@ #include <linux/profile.h> #include <linux/kfence.h> #include <linux/rcupdate.h> +#include <linux/rcu_pending.h> #include <linux/srcu.h> #include <linux/moduleparam.h> #include <linux/kallsyms.h> @@ -993,6 +994,7 @@ void start_kernel(void) workqueue_init_early(); rcu_init(); + kvfree_rcu_pending_init(); /* Trace events are available after this */ trace_init(); diff --git a/kernel/rcu/pending.c b/kernel/rcu/pending.c index c0e2351ba198..9c57f373d494 100644 --- a/kernel/rcu/pending.c +++ b/kernel/rcu/pending.c @@ -601,3 +601,23 @@ int rcu_pending_init(struct rcu_pending *pending, return 0; } + +#ifndef CONFIG_TINY_RCU +/* kvfree_rcu */ + +static struct rcu_pending kvfree_rcu_pending; + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + BUG_ON(!ptr); + + __rcu_pending_enqueue(&kvfree_rcu_pending, head, ptr, head == NULL); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +void __init kvfree_rcu_pending_init(void) +{ + if (rcu_pending_init(&kvfree_rcu_pending, NULL, RCU_PENDING_KVFREE_FN)) + panic("%s failed\n", __func__); +} +#endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 52f9f0bf1b8e..3bb6c81e2d30 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3216,702 +3216,6 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_N_BATCHES 2 #define FREE_N_CHANNELS 2 -/** - * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers - * @list: List node. All blocks are linked between each other - * @gp_snap: Snapshot of RCU state for objects placed to this bulk - * @nr_records: Number of active pointers in the array - * @records: Array of the kvfree_rcu() pointers - */ -struct kvfree_rcu_bulk_data { - struct list_head list; - struct rcu_gp_oldstate gp_snap; - unsigned long nr_records; - void *records[]; -}; - -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kvfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KVFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) - -/** - * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period - * @head_free: List of kfree_rcu() objects waiting for a grace period - * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. 
- * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period - * @krcp: Pointer to @kfree_rcu_cpu structure - */ - -struct kfree_rcu_cpu_work { - struct rcu_work rcu_work; - struct rcu_head *head_free; - struct rcu_gp_oldstate head_free_gp_snap; - struct list_head bulk_head_free[FREE_N_CHANNELS]; - struct kfree_rcu_cpu *krcp; -}; - -/** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period - * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" - * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period - * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period - * @lock: Synchronize access to this structure - * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @initialized: The @rcu_work fields have been initialized - * @head_count: Number of objects in rcu_head singular list - * @bulk_count: Number of objects in bulk-list - * @bkvcache: - * A simple cache list that contains objects for reuse purpose. - * In order to save some per-cpu space the list is singular. - * Even though it is lockless an access has to be protected by the - * per-cpu lock. - * @page_cache_work: A work to refill the cache when it is empty - * @backoff_page_cache_fill: Delay cache refills - * @work_in_progress: Indicates that page_cache_work is running - * @hrtimer: A hrtimer for scheduling a page_cache_work - * @nr_bkv_objs: number of allocated objects at @bkvcache. - * - * This is a per-CPU structure. The reason that it is not included in - * the rcu_data structure is to permit this code to be extracted from - * the RCU files. Such extraction could allow further optimization of - * the interactions with the slab allocators. - */ -struct kfree_rcu_cpu { - // Objects queued on a linked list - // through their rcu_head structures. - struct rcu_head *head; - unsigned long head_gp_snap; - atomic_t head_count; - - // Objects queued on a bulk-list. - struct list_head bulk_head[FREE_N_CHANNELS]; - atomic_t bulk_count[FREE_N_CHANNELS]; - - struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - raw_spinlock_t lock; - struct delayed_work monitor_work; - bool initialized; - - struct delayed_work page_cache_work; - atomic_t backoff_page_cache_fill; - atomic_t work_in_progress; - struct hrtimer hrtimer; - - struct llist_head bkvcache; - int nr_bkv_objs; -}; - -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), -}; - -static __always_inline void -debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - int i; - - for (i = 0; i < bhead->nr_records; i++) - debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); -#endif -} - -static inline struct kfree_rcu_cpu * -krc_this_cpu_lock(unsigned long *flags) -{ - struct kfree_rcu_cpu *krcp; - - local_irq_save(*flags); // For safely calling this_cpu_ptr(). 
- krcp = this_cpu_ptr(&krc); - raw_spin_lock(&krcp->lock); - - return krcp; -} - -static inline void -krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static inline struct kvfree_rcu_bulk_data * -get_cached_bnode(struct kfree_rcu_cpu *krcp) -{ - if (!krcp->nr_bkv_objs) - return NULL; - - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); - return (struct kvfree_rcu_bulk_data *) - llist_del_first(&krcp->bkvcache); -} - -static inline bool -put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode) -{ - // Check the limit. - if (krcp->nr_bkv_objs >= rcu_min_cached_objs) - return false; - - llist_add((struct llist_node *) bnode, &krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); - return true; -} - -static int -drain_page_cache(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - struct llist_node *page_list, *pos, *n; - int freed = 0; - - if (!rcu_min_cached_objs) - return 0; - - raw_spin_lock_irqsave(&krcp->lock, flags); - page_list = llist_del_all(&krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, 0); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - llist_for_each_safe(pos, n, page_list) { - free_page((unsigned long)pos); - freed++; - } - - return freed; -} - -static void -kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode, int idx) -{ - unsigned long flags; - int i; - - if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { - debug_rcu_bhead_unqueue(bnode); - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); - } - } - rcu_lock_release(&rcu_callback_map); - } - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bnode)) - bnode = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bnode) - free_page((unsigned long) bnode); - - cond_resched_tasks_rcu_qs(); -} - -static void -kvfree_rcu_list(struct rcu_head *head) -{ - struct rcu_head *next; - - for (; head; head = next) { - void *ptr = (void *) head->func; - unsigned long offset = (void *) head - ptr; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } -} - -/* - * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bulk_head_free or ->head_free. - */ -static void kfree_rcu_work(struct work_struct *work) -{ - unsigned long flags; - struct kvfree_rcu_bulk_data *bnode, *n; - struct list_head bulk_head[FREE_N_CHANNELS]; - struct rcu_head *head; - struct kfree_rcu_cpu *krcp; - struct kfree_rcu_cpu_work *krwp; - struct rcu_gp_oldstate head_gp_snap; - int i; - - krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); - krcp = krwp->krcp; - - raw_spin_lock_irqsave(&krcp->lock, flags); - // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) - list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); - - // Channel 3. 
- head = krwp->head_free; - krwp->head_free = NULL; - head_gp_snap = krwp->head_free_gp_snap; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - // Handle the first two channels. - for (i = 0; i < FREE_N_CHANNELS; i++) { - // Start from the tail page, so a GP is likely passed for it. - list_for_each_entry_safe(bnode, n, &bulk_head[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - /* - * This is used when the "bulk" path can not be used for the - * double-argument of kvfree_rcu(). This happens when the - * page-cache is empty, which means that objects are instead - * queued on a linked list through their rcu_head structures. - * This list is named "Channel 3". - */ - if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) - kvfree_rcu_list(head); -} - -static bool -need_offload_krc(struct kfree_rcu_cpu *krcp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krcp->bulk_head[i])) - return true; - - return !!READ_ONCE(krcp->head); -} - -static bool -need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krwp->bulk_head_free[i])) - return true; - - return !!krwp->head_free; -} - -static int krc_count(struct kfree_rcu_cpu *krcp) -{ - int sum = atomic_read(&krcp->head_count); - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - sum += atomic_read(&krcp->bulk_count[i]); - - return sum; -} - -static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - long delay, delay_left; - - delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; - if (delayed_work_pending(&krcp->monitor_work)) { - delay_left = krcp->monitor_work.timer.expires - jiffies; - if (delay < delay_left) - mod_delayed_work(system_wq, &krcp->monitor_work, delay); - return; - } - queue_delayed_work(system_wq, &krcp->monitor_work, delay); -} - -static void -kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) -{ - struct list_head bulk_ready[FREE_N_CHANNELS]; - struct kvfree_rcu_bulk_data *bnode, *n; - struct rcu_head *head_ready = NULL; - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&krcp->lock, flags); - for (i = 0; i < FREE_N_CHANNELS; i++) { - INIT_LIST_HEAD(&bulk_ready[i]); - - list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) - break; - - atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); - list_move(&bnode->list, &bulk_ready[i]); - } - } - - if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { - head_ready = krcp->head; - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - for (i = 0; i < FREE_N_CHANNELS; i++) { - list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - if (head_ready) - kvfree_rcu_list(head_ready); -} - -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. - */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); - unsigned long flags; - int i, j; - - // Drain ready for reclaim. - kvfree_rcu_drain_ready(krcp); - - raw_spin_lock_irqsave(&krcp->lock, flags); - - // Attempt to start a new batch. - for (i = 0; i < KFREE_N_BATCHES; i++) { - struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - - // Try to detach bulk_head or head and attach it, only when - // all channels are free. 
Any channel is not free means at krwp - // there is on-going rcu work to handle krwp's free business. - if (need_wait_for_krwp_work(krwp)) - continue; - - // kvfree_rcu_drain_ready() might handle this krcp, if so give up. - if (need_offload_krc(krcp)) { - // Channel 1 corresponds to the SLAB-pointer bulk path. - // Channel 2 corresponds to vmalloc-pointer bulk path. - for (j = 0; j < FREE_N_CHANNELS; j++) { - if (list_empty(&krwp->bulk_head_free[j])) { - atomic_set(&krcp->bulk_count[j], 0); - list_replace_init(&krcp->bulk_head[j], - &krwp->bulk_head_free[j]); - } - } - - // Channel 3 corresponds to both SLAB and vmalloc - // objects queued on the linked list. - if (!krwp->head_free) { - krwp->head_free = krcp->head; - get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - - // One work is per one batch, so there are three - // "free channels", the batch can handle. It can - // be that the work is in the pending state when - // channels have been detached following by each - // other. - queue_rcu_work(system_wq, &krwp->rcu_work); - } - } - - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - // If there is nothing to detach, it means that our job is - // successfully done here. In case of having at least one - // of the channels that is still busy we should rearm the - // work to repeat an attempt. Because previous batches are - // still in progress. - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); -} - -static enum hrtimer_restart -schedule_page_work_fn(struct hrtimer *t) -{ - struct kfree_rcu_cpu *krcp = - container_of(t, struct kfree_rcu_cpu, hrtimer); - - queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); - return HRTIMER_NORESTART; -} - -static void fill_page_cache_func(struct work_struct *work) -{ - struct kvfree_rcu_bulk_data *bnode; - struct kfree_rcu_cpu *krcp = - container_of(work, struct kfree_rcu_cpu, - page_cache_work.work); - unsigned long flags; - int nr_pages; - bool pushed; - int i; - - nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? - 1 : rcu_min_cached_objs; - - for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - - if (!bnode) - break; - - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (!pushed) { - free_page((unsigned long) bnode); - break; - } - } - - atomic_set(&krcp->work_in_progress, 0); - atomic_set(&krcp->backoff_page_cache_fill, 0); -} - -static void -run_page_cache_worker(struct kfree_rcu_cpu *krcp) -{ - // If cache disabled, bail out. - if (!rcu_min_cached_objs) - return; - - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !atomic_xchg(&krcp->work_in_progress, 1)) { - if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_wq, - &krcp->page_cache_work, - msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); - } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); - } - } -} - -// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() -// state specified by flags. If can_alloc is true, the caller must -// be schedulable and not be holding any locks or mutexes that might be -// acquired by the memory allocator or anything that it might invoke. 
-// Returns true if ptr was successfully recorded, else the caller must -// use a fallback. -static inline bool -add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, - unsigned long *flags, void *ptr, bool can_alloc) -{ - struct kvfree_rcu_bulk_data *bnode; - int idx; - - *krcp = krc_this_cpu_lock(flags); - if (unlikely(!(*krcp)->initialized)) - return false; - - idx = !!is_vmalloc_addr(ptr); - bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], - struct kvfree_rcu_bulk_data, list); - - /* Check if a new block is required. */ - if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(*krcp); - if (!bnode && can_alloc) { - krc_this_cpu_unlock(*krcp, *flags); - - // __GFP_NORETRY - allows a light-weight direct reclaim - // what is OK from minimizing of fallback hitting point of - // view. Apart of that it forbids any OOM invoking what is - // also beneficial since we are about to release memory soon. - // - // __GFP_NOMEMALLOC - prevents from consuming of all the - // memory reserves. Please note we have a fallback path. - // - // __GFP_NOWARN - it is supposed that an allocation can - // be failed under low memory or high memory pressure - // scenarios. - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - raw_spin_lock_irqsave(&(*krcp)->lock, *flags); - } - - if (!bnode) - return false; - - // Initialize the new block and attach it. - bnode->nr_records = 0; - list_add(&bnode->list, &(*krcp)->bulk_head[idx]); - } - - // Finally insert and update the GP for this page. - bnode->records[bnode->nr_records++] = ptr; - get_state_synchronize_rcu_full(&bnode->gp_snap); - atomic_inc(&(*krcp)->bulk_count[idx]); - - return true; -} - -/* - * Queue a request for lazy invocation of the appropriate free routine - * after a grace period. Please note that three paths are maintained, - * two for the common case using arrays of pointers and a third one that - * is used only when the main paths cannot be used, for example, due to - * memory pressure. - * - * Each kvfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will - * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. - */ -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - unsigned long flags; - struct kfree_rcu_cpu *krcp; - bool success; - - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ - if (!head) - might_sleep(); - - // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(ptr)) { - // Probable double kfree_rcu(), just leak. - WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", - __func__, head); - - // Mark as success and leave. - return; - } - - kasan_record_aux_stack_noalloc(ptr); - success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); - if (!success) { - run_page_cache_worker(krcp); - - if (head == NULL) - // Inline if kvfree_rcu(one_arg) call. - goto unlock_return; - - head->func = ptr; - head->next = krcp->head; - WRITE_ONCE(krcp->head, head); - atomic_inc(&krcp->head_count); - - // Take a snapshot for this krcp. 
- krcp->head_gp_snap = get_state_synchronize_rcu(); - success = true; - } - - /* - * The kvfree_rcu() caller considers the pointer freed at this point - * and likely removes any references to it. Since the actual slab - * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore - * this object (no scanning or false positives reporting). - */ - kmemleak_ignore(ptr); - - // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_monitor_work(krcp); - -unlock_return: - krc_this_cpu_unlock(krcp, flags); - - /* - * Inline kvfree() after synchronize_rcu(). We can do - * it from might_sleep() context only, so the current - * CPU can pass the QS state. - */ - if (!success) { - debug_rcu_head_unqueue((struct rcu_head *) ptr); - synchronize_rcu(); - kvfree(ptr); - } -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); - -static unsigned long -kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu; - unsigned long count = 0; - - /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count += krc_count(krcp); - count += READ_ONCE(krcp->nr_bkv_objs); - atomic_set(&krcp->backoff_page_cache_fill, 1); - } - - return count == 0 ? SHRINK_EMPTY : count; -} - -static unsigned long -kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu, freed = 0; - - for_each_possible_cpu(cpu) { - int count; - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count = krc_count(krcp); - count += drain_page_cache(krcp); - kfree_rcu_monitor(&krcp->monitor_work.work); - - sc->nr_to_scan -= count; - freed += count; - - if (sc->nr_to_scan <= 0) - break; - } - - return freed == 0 ? SHRINK_STOP : freed; -} - -void __init kfree_rcu_scheduler_running(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - } -} - /* * During early boot, any blocking grace-period wait automatically * implies a grace period. @@ -5567,62 +4871,12 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; -static void __init kfree_rcu_batch_init(void) -{ - int cpu; - int i, j; - struct shrinker *kfree_rcu_shrinker; - - /* Clamp it to [0:100] seconds interval. 
*/ - if (rcu_delay_page_cache_fill_msec < 0 || - rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { - - rcu_delay_page_cache_fill_msec = - clamp(rcu_delay_page_cache_fill_msec, 0, - (int) (100 * MSEC_PER_SEC)); - - pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", - rcu_delay_page_cache_fill_msec); - } - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); - krcp->krw_arr[i].krcp = krcp; - - for (j = 0; j < FREE_N_CHANNELS; j++) - INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); - } - - for (i = 0; i < FREE_N_CHANNELS; i++) - INIT_LIST_HEAD(&krcp->bulk_head[i]); - - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); - krcp->initialized = true; - } - - kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); - if (!kfree_rcu_shrinker) { - pr_err("Failed to allocate kfree_rcu() shrinker!\n"); - return; - } - - kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; - kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; - - shrinker_register(kfree_rcu_shrinker); -} - void __init rcu_init(void) { int cpu = smp_processor_id(); rcu_early_boot_tests(); - kfree_rcu_batch_init(); rcu_bootup_announce(); sanitize_kthread_prio(); rcu_init_geometry(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f8436969e0c8..82ca9f490d7c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -273,7 +273,6 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; }
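The replacement kvfree_call_rcu() in kernel/rcu/pending.c keeps the existing caller-side conventions unchanged: the two-argument kvfree_rcu() passes an embedded rcu_head, and the head-less kvfree_rcu_mightsleep() passes NULL, which is the head == NULL case the enqueue above distinguishes. A minimal sketch of the two forms (the widget/buffer names are illustrative):

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct widget {
	int		id;
	struct rcu_head	rcu;
};

/* Two-argument form: the rcu_head is embedded in the object, so this can
 * be called from any context; it ends up in kvfree_call_rcu(&w->rcu, w). */
static void widget_free(struct widget *w)
{
	kvfree_rcu(w, rcu);
}

/* Head-less form: no rcu_head available, so the caller must be able to
 * sleep; it ends up in kvfree_call_rcu(NULL, buf). */
static void buffer_free(void *buf)
{
	kvfree_rcu_mightsleep(buf);
}
```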
This nets us a slight performance increase, and converts to common code.

Todo - re-add the shrinker, so that memory reclaim can free expired
objects and expedite a grace period when necessary.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/rcu_pending.h |   2 +
 init/main.c                 |   2 +
 kernel/rcu/pending.c        |  20 +
 kernel/rcu/tree.c           | 746 ------------------------------------
 kernel/rcu/update.c         |   1 -
 5 files changed, 24 insertions(+), 747 deletions(-)
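On the Todo item, here is a minimal sketch of how the shrinker could be re-added, assuming it lives next to kvfree_rcu_pending in kernel/rcu/pending.c. The shrinker_alloc()/shrinker_register() calls mirror the kfree_rcu_batch_init() code this patch removes; rcu_pending_count() and rcu_pending_reclaim() are hypothetical hooks standing in for whatever rcu_pending would need to expose.

```c
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/rcu_pending.h>
#include <linux/shrinker.h>

static unsigned long
kvfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	/* Hypothetical hook: number of objects queued on kvfree_rcu_pending. */
	unsigned long count = rcu_pending_count(&kvfree_rcu_pending);

	return count ? count : SHRINK_EMPTY;
}

static unsigned long
kvfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	/* Hypothetical hook: free already-expired objects and expedite a
	 * grace period for the rest, returning the number reclaimed. */
	return rcu_pending_reclaim(&kvfree_rcu_pending, sc->nr_to_scan);
}

static int __init kvfree_rcu_shrinker_init(void)
{
	/* Same registration pattern as the removed kfree_rcu_batch_init(). */
	struct shrinker *s = shrinker_alloc(0, "rcu-kvfree");

	if (!s)
		return -ENOMEM;

	s->count_objects = kvfree_rcu_shrink_count;
	s->scan_objects  = kvfree_rcu_shrink_scan;
	shrinker_register(s);
	return 0;
}
```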