Message ID | 20200428205903.61704-10-urezki@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce kvfree_rcu(1 or 2 arguments) | expand |
On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote: > Cache some extra objects per-CPU. During reclaim process > some pages are cached instead of releasing by linking them > into the list. Such approach provides O(1) access time to > the cache. > > That reduces number of requests to the page allocator, also > that makes it more helpful if a low memory condition occurs. > > A parameter reflecting the minimum allowed pages to be > cached per one CPU is propagated via sysfs, it is read > only, the name is "rcu_min_cached_objs". > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > --- > kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 60 insertions(+), 4 deletions(-) > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index 89e9ca3f4e3e..d8975819b1c9 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444); > static int gp_cleanup_delay; > module_param(gp_cleanup_delay, int, 0444); > > +/* > + * This rcu parameter is read-only, but can be write also. You mean that although the parameter is read-only, you see no reason why it could not be converted to writeable? If it was writeable, and a given CPU had the maximum numbr of cached objects, the rcu_min_cached_objs value was decreased, but that CPU never saw another kfree_rcu(), would the number of cached objects change? (Just curious, not asking for a change in functionality.) > + * It reflects the minimum allowed number of objects which > + * can be cached per-CPU. Object size is equal to one page. > + */ > +int rcu_min_cached_objs = 2; > +module_param(rcu_min_cached_objs, int, 0444); > + > /* Retrieve RCU kthreads priority for rcutorture */ > int rcu_get_gp_kthreads_prio(void) > { > @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work { > * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period > * @head: List of kfree_rcu() objects not yet waiting for a grace period > * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period > - * @bcached: Keeps at most one object for later reuse when build chain blocks > * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period > * @lock: Synchronize access to this structure > * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES > @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work { > struct kfree_rcu_cpu { > struct rcu_head *head; > struct kfree_rcu_bulk_data *bhead; > - struct kfree_rcu_bulk_data *bcached; > struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; > raw_spinlock_t lock; > struct delayed_work monitor_work; > @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu { > bool initialized; > // Number of objects for which GP not started > int count; > + > + /* > + * Number of cached objects which are queued into > + * the lock-less list. This cache is used by the > + * kvfree_call_rcu() function and as of now its > + * size is static. > + */ > + struct llist_head bkvcache; > + int nr_bkv_objs; > }; > > static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { > @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) > local_irq_restore(flags); > } > > +static inline struct kfree_rcu_bulk_data * > +get_cached_bnode(struct kfree_rcu_cpu *krcp) > +{ > + if (!krcp->nr_bkv_objs) > + return NULL; > + > + krcp->nr_bkv_objs--; > + return (struct kfree_rcu_bulk_data *) > + llist_del_first(&krcp->bkvcache); > +} > + > +static inline bool > +put_cached_bnode(struct kfree_rcu_cpu *krcp, > + struct kfree_rcu_bulk_data *bnode) > +{ > + /* Check the limit. */ > + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) > + return false; > + > + llist_add((struct llist_node *) bnode, &krcp->bkvcache); > + krcp->nr_bkv_objs++; > + return true; > + > +} > + > /* > * This function is invoked in workqueue context after a grace period. > * It frees all the objects queued on ->bhead_free or ->head_free. > @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work) > kfree_bulk(bhead->nr_records, bhead->records); > rcu_lock_release(&rcu_callback_map); > > - if (cmpxchg(&krcp->bcached, NULL, bhead)) > + krcp = krc_this_cpu_lock(&flags); Presumably the list can also be accessed without holding this lock, because otherwise we shouldn't need llist... Thanx, Paul > + if (put_cached_bnode(krcp, bhead)) > + bhead = NULL; > + krc_this_cpu_unlock(krcp, flags); > + > + if (bhead) > free_page((unsigned long) bhead); > > cond_resched_tasks_rcu_qs(); > @@ -3114,7 +3159,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, > /* Check if a new block is required. */ > if (!krcp->bhead || > krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { > - bnode = xchg(&krcp->bcached, NULL); > + bnode = get_cached_bnode(krcp); > if (!bnode) { > WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); > > @@ -4167,12 +4212,23 @@ static void __init kfree_rcu_batch_init(void) > > for_each_possible_cpu(cpu) { > struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); > + struct kfree_rcu_bulk_data *bnode; > > for (i = 0; i < KFREE_N_BATCHES; i++) { > INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); > krcp->krw_arr[i].krcp = krcp; > } > > + for (i = 0; i < rcu_min_cached_objs; i++) { > + bnode = (struct kfree_rcu_bulk_data *) > + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); > + > + if (bnode) > + put_cached_bnode(krcp, bnode); > + else > + pr_err("Failed to preallocate for %d CPU!\n", cpu); > + } > + > INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); > krcp->initialized = true; > } > -- > 2.20.1 >
On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote: > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote: > > Cache some extra objects per-CPU. During reclaim process > > some pages are cached instead of releasing by linking them > > into the list. Such approach provides O(1) access time to > > the cache. > > > > That reduces number of requests to the page allocator, also > > that makes it more helpful if a low memory condition occurs. > > > > A parameter reflecting the minimum allowed pages to be > > cached per one CPU is propagated via sysfs, it is read > > only, the name is "rcu_min_cached_objs". > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > --- > > kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- > > 1 file changed, 60 insertions(+), 4 deletions(-) > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > index 89e9ca3f4e3e..d8975819b1c9 100644 > > --- a/kernel/rcu/tree.c > > +++ b/kernel/rcu/tree.c > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444); > > static int gp_cleanup_delay; > > module_param(gp_cleanup_delay, int, 0444); > > > > +/* > > + * This rcu parameter is read-only, but can be write also. > > You mean that although the parameter is read-only, you see no reason > why it could not be converted to writeable? > I added just a note. If it is writable, then we can change the size of the per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs" would cache 5 pages. But i do not have a strong opinion if it should be writable. > > If it was writeable, and a given CPU had the maximum numbr of cached > objects, the rcu_min_cached_objs value was decreased, but that CPU never > saw another kfree_rcu(), would the number of cached objects change? > No. It works the way: unqueue the page from cache in the kfree_rcu(), whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs, if >= will free the page. > > (Just curious, not asking for a change in functionality.) > > > + * It reflects the minimum allowed number of objects which > > + * can be cached per-CPU. Object size is equal to one page. > > + */ > > +int rcu_min_cached_objs = 2; > > +module_param(rcu_min_cached_objs, int, 0444); > > + > > /* Retrieve RCU kthreads priority for rcutorture */ > > int rcu_get_gp_kthreads_prio(void) > > { > > @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work { > > * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period > > * @head: List of kfree_rcu() objects not yet waiting for a grace period > > * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period > > - * @bcached: Keeps at most one object for later reuse when build chain blocks > > * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period > > * @lock: Synchronize access to this structure > > * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES > > @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work { > > struct kfree_rcu_cpu { > > struct rcu_head *head; > > struct kfree_rcu_bulk_data *bhead; > > - struct kfree_rcu_bulk_data *bcached; > > struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; > > raw_spinlock_t lock; > > struct delayed_work monitor_work; > > @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu { > > bool initialized; > > // Number of objects for which GP not started > > int count; > > + > > + /* > > + * Number of cached objects which are queued into > > + * the lock-less list. This cache is used by the > > + * kvfree_call_rcu() function and as of now its > > + * size is static. > > + */ > > + struct llist_head bkvcache; > > + int nr_bkv_objs; > > }; > > > > static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { > > @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) > > local_irq_restore(flags); > > } > > > > +static inline struct kfree_rcu_bulk_data * > > +get_cached_bnode(struct kfree_rcu_cpu *krcp) > > +{ > > + if (!krcp->nr_bkv_objs) > > + return NULL; > > + > > + krcp->nr_bkv_objs--; > > + return (struct kfree_rcu_bulk_data *) > > + llist_del_first(&krcp->bkvcache); > > +} > > + > > +static inline bool > > +put_cached_bnode(struct kfree_rcu_cpu *krcp, > > + struct kfree_rcu_bulk_data *bnode) > > +{ > > + /* Check the limit. */ > > + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) > > + return false; > > + > > + llist_add((struct llist_node *) bnode, &krcp->bkvcache); > > + krcp->nr_bkv_objs++; > > + return true; > > + > > +} > > + > > /* > > * This function is invoked in workqueue context after a grace period. > > * It frees all the objects queued on ->bhead_free or ->head_free. > > @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work) > > kfree_bulk(bhead->nr_records, bhead->records); > > rcu_lock_release(&rcu_callback_map); > > > > - if (cmpxchg(&krcp->bcached, NULL, bhead)) > > + krcp = krc_this_cpu_lock(&flags); > > Presumably the list can also be accessed without holding this lock, > because otherwise we shouldn't need llist... > Hm... We increase the number of elements in cache, therefore it is not lockless. From the other hand i used llist_head to maintain the cache because it is single linked list, we do not need "*prev" link. Also we do not need to init the list. But i can change it to list_head. Please let me know if i need :) -- Vlad Rezki
On Mon, May 04, 2020 at 02:43:23PM +0200, Uladzislau Rezki wrote: > On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote: > > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote: > > > Cache some extra objects per-CPU. During reclaim process > > > some pages are cached instead of releasing by linking them > > > into the list. Such approach provides O(1) access time to > > > the cache. > > > > > > That reduces number of requests to the page allocator, also > > > that makes it more helpful if a low memory condition occurs. > > > > > > A parameter reflecting the minimum allowed pages to be > > > cached per one CPU is propagated via sysfs, it is read > > > only, the name is "rcu_min_cached_objs". > > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > > --- > > > kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- > > > 1 file changed, 60 insertions(+), 4 deletions(-) > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > index 89e9ca3f4e3e..d8975819b1c9 100644 > > > --- a/kernel/rcu/tree.c > > > +++ b/kernel/rcu/tree.c > > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444); > > > static int gp_cleanup_delay; > > > module_param(gp_cleanup_delay, int, 0444); > > > > > > +/* > > > + * This rcu parameter is read-only, but can be write also. > > > > You mean that although the parameter is read-only, you see no reason > > why it could not be converted to writeable? > > > I added just a note. If it is writable, then we can change the size of the > per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs" > would cache 5 pages. But i do not have a strong opinion if it should be > writable. > > > If it was writeable, and a given CPU had the maximum numbr of cached > > objects, the rcu_min_cached_objs value was decreased, but that CPU never > > saw another kfree_rcu(), would the number of cached objects change? > > > No. It works the way: unqueue the page from cache in the kfree_rcu(), > whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs, > if >= will free the page. Just to make sure I understand... If someone writes a smaller number to the sysfs variable, the per-CPU caches will be decreased at that point, immediately during that sysfs write? Or are you saying something else? > > (Just curious, not asking for a change in functionality.) > > > > > + * It reflects the minimum allowed number of objects which > > > + * can be cached per-CPU. Object size is equal to one page. > > > + */ > > > +int rcu_min_cached_objs = 2; > > > +module_param(rcu_min_cached_objs, int, 0444); > > > + > > > /* Retrieve RCU kthreads priority for rcutorture */ > > > int rcu_get_gp_kthreads_prio(void) > > > { > > > @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work { > > > * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period > > > * @head: List of kfree_rcu() objects not yet waiting for a grace period > > > * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period > > > - * @bcached: Keeps at most one object for later reuse when build chain blocks > > > * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period > > > * @lock: Synchronize access to this structure > > > * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES > > > @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work { > > > struct kfree_rcu_cpu { > > > struct rcu_head *head; > > > struct kfree_rcu_bulk_data *bhead; > > > - struct kfree_rcu_bulk_data *bcached; > > > struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; > > > raw_spinlock_t lock; > > > struct delayed_work monitor_work; > > > @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu { > > > bool initialized; > > > // Number of objects for which GP not started > > > int count; > > > + > > > + /* > > > + * Number of cached objects which are queued into > > > + * the lock-less list. This cache is used by the > > > + * kvfree_call_rcu() function and as of now its > > > + * size is static. > > > + */ > > > + struct llist_head bkvcache; > > > + int nr_bkv_objs; > > > }; > > > > > > static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { > > > @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) > > > local_irq_restore(flags); > > > } > > > > > > +static inline struct kfree_rcu_bulk_data * > > > +get_cached_bnode(struct kfree_rcu_cpu *krcp) > > > +{ > > > + if (!krcp->nr_bkv_objs) > > > + return NULL; > > > + > > > + krcp->nr_bkv_objs--; > > > + return (struct kfree_rcu_bulk_data *) > > > + llist_del_first(&krcp->bkvcache); > > > +} > > > + > > > +static inline bool > > > +put_cached_bnode(struct kfree_rcu_cpu *krcp, > > > + struct kfree_rcu_bulk_data *bnode) > > > +{ > > > + /* Check the limit. */ > > > + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) > > > + return false; > > > + > > > + llist_add((struct llist_node *) bnode, &krcp->bkvcache); > > > + krcp->nr_bkv_objs++; > > > + return true; > > > + > > > +} > > > + > > > /* > > > * This function is invoked in workqueue context after a grace period. > > > * It frees all the objects queued on ->bhead_free or ->head_free. > > > @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work) > > > kfree_bulk(bhead->nr_records, bhead->records); > > > rcu_lock_release(&rcu_callback_map); > > > > > > - if (cmpxchg(&krcp->bcached, NULL, bhead)) > > > + krcp = krc_this_cpu_lock(&flags); > > > > Presumably the list can also be accessed without holding this lock, > > because otherwise we shouldn't need llist... > > > Hm... We increase the number of elements in cache, therefore it is not > lockless. From the other hand i used llist_head to maintain the cache > because it is single linked list, we do not need "*prev" link. Also > we do not need to init the list. > > But i can change it to list_head. Please let me know if i need :) Hmmm... Maybe it is time for a non-atomic singly linked list? In the RCU callback processing, the operations were open-coded, but they have been pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*. Maybe some non-atomic/protected/whatever macros in the llist.h file? Or maybe just open-code the singly linked list? (Probably not the best choice, though.) Add comments stating that the atomic properties of the llist functions aren't neded? Something else? The comments would be a good start. Just to take pity on people seeing the potential for concurrency and wondering how the concurrent accesses actually happen. ;-) Thanx, Paul
On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote: > On Mon, May 04, 2020 at 02:43:23PM +0200, Uladzislau Rezki wrote: > > On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote: > > > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote: > > > > Cache some extra objects per-CPU. During reclaim process > > > > some pages are cached instead of releasing by linking them > > > > into the list. Such approach provides O(1) access time to > > > > the cache. > > > > > > > > That reduces number of requests to the page allocator, also > > > > that makes it more helpful if a low memory condition occurs. > > > > > > > > A parameter reflecting the minimum allowed pages to be > > > > cached per one CPU is propagated via sysfs, it is read > > > > only, the name is "rcu_min_cached_objs". > > > > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > > > --- > > > > kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- > > > > 1 file changed, 60 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > > index 89e9ca3f4e3e..d8975819b1c9 100644 > > > > --- a/kernel/rcu/tree.c > > > > +++ b/kernel/rcu/tree.c > > > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444); > > > > static int gp_cleanup_delay; > > > > module_param(gp_cleanup_delay, int, 0444); > > > > > > > > +/* > > > > + * This rcu parameter is read-only, but can be write also. > > > > > > You mean that although the parameter is read-only, you see no reason > > > why it could not be converted to writeable? > > > > > I added just a note. If it is writable, then we can change the size of the > > per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs" > > would cache 5 pages. But i do not have a strong opinion if it should be > > writable. > > > > > If it was writeable, and a given CPU had the maximum numbr of cached > > > objects, the rcu_min_cached_objs value was decreased, but that CPU never > > > saw another kfree_rcu(), would the number of cached objects change? > > > > > No. It works the way: unqueue the page from cache in the kfree_rcu(), > > whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs, > > if >= will free the page. > > Just to make sure I understand... If someone writes a smaller number to > the sysfs variable, the per-CPU caches will be decreased at that point, > immediately during that sysfs write? Or are you saying something else? > This patch defines it as read-only. It defines the minimum threshold that controls number of elements in the per-CPU cache. If we decide to make it write also, then we will have full of freedom how to define its behavior, i.e. it is not defined because it is read only. > > > Presumably the list can also be accessed without holding this lock, > > > because otherwise we shouldn't need llist... > > > > > Hm... We increase the number of elements in cache, therefore it is not > > lockless. From the other hand i used llist_head to maintain the cache > > because it is single linked list, we do not need "*prev" link. Also > > we do not need to init the list. > > > > But i can change it to list_head. Please let me know if i need :) > > Hmmm... Maybe it is time for a non-atomic singly linked list? In the RCU > callback processing, the operations were open-coded, but they have been > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*. > > Maybe some non-atomic/protected/whatever macros in the llist.h file? > Or maybe just open-code the singly linked list? (Probably not the > best choice, though.) Add comments stating that the atomic properties > of the llist functions aren't neded? Something else? > In order to keep it simple i can replace llist_head by the list_head? > > The comments would be a good start. Just to take pity on people seeing > the potential for concurrency and wondering how the concurrent accesses > actually happen. ;-) > Sounds like you are kidding me :) -- Vlad Rezki
On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote: > On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote: > > On Mon, May 04, 2020 at 02:43:23PM +0200, Uladzislau Rezki wrote: > > > On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote: > > > > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote: > > > > > Cache some extra objects per-CPU. During reclaim process > > > > > some pages are cached instead of releasing by linking them > > > > > into the list. Such approach provides O(1) access time to > > > > > the cache. > > > > > > > > > > That reduces number of requests to the page allocator, also > > > > > that makes it more helpful if a low memory condition occurs. > > > > > > > > > > A parameter reflecting the minimum allowed pages to be > > > > > cached per one CPU is propagated via sysfs, it is read > > > > > only, the name is "rcu_min_cached_objs". > > > > > > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > > > > --- > > > > > kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- > > > > > 1 file changed, 60 insertions(+), 4 deletions(-) > > > > > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > > > index 89e9ca3f4e3e..d8975819b1c9 100644 > > > > > --- a/kernel/rcu/tree.c > > > > > +++ b/kernel/rcu/tree.c > > > > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444); > > > > > static int gp_cleanup_delay; > > > > > module_param(gp_cleanup_delay, int, 0444); > > > > > > > > > > +/* > > > > > + * This rcu parameter is read-only, but can be write also. > > > > > > > > You mean that although the parameter is read-only, you see no reason > > > > why it could not be converted to writeable? > > > > > > > I added just a note. If it is writable, then we can change the size of the > > > per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs" > > > would cache 5 pages. But i do not have a strong opinion if it should be > > > writable. > > > > > > > If it was writeable, and a given CPU had the maximum numbr of cached > > > > objects, the rcu_min_cached_objs value was decreased, but that CPU never > > > > saw another kfree_rcu(), would the number of cached objects change? > > > > > > > No. It works the way: unqueue the page from cache in the kfree_rcu(), > > > whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs, > > > if >= will free the page. > > > > Just to make sure I understand... If someone writes a smaller number to > > the sysfs variable, the per-CPU caches will be decreased at that point, > > immediately during that sysfs write? Or are you saying something else? > > > This patch defines it as read-only. It defines the minimum threshold that > controls number of elements in the per-CPU cache. If we decide to make it > write also, then we will have full of freedom how to define its behavior, > i.e. it is not defined because it is read only. And runtime-read-only sounds like an excellent state for it. > > > > Presumably the list can also be accessed without holding this lock, > > > > because otherwise we shouldn't need llist... > > > > > > > Hm... We increase the number of elements in cache, therefore it is not > > > lockless. From the other hand i used llist_head to maintain the cache > > > because it is single linked list, we do not need "*prev" link. Also > > > we do not need to init the list. > > > > > > But i can change it to list_head. Please let me know if i need :) > > > > Hmmm... Maybe it is time for a non-atomic singly linked list? In the RCU > > callback processing, the operations were open-coded, but they have been > > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*. > > > > Maybe some non-atomic/protected/whatever macros in the llist.h file? > > Or maybe just open-code the singly linked list? (Probably not the > > best choice, though.) Add comments stating that the atomic properties > > of the llist functions aren't neded? Something else? > > > In order to keep it simple i can replace llist_head by the list_head? Fine by me! > > The comments would be a good start. Just to take pity on people seeing > > the potential for concurrency and wondering how the concurrent accesses > > actually happen. ;-) > > > Sounds like you are kidding me :) "Only those who have gone too far can possibly tell you how far you can go!" ;-) Thanx, Paul
On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote: > On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote: [..] > > > > Presumably the list can also be accessed without holding this lock, > > > > because otherwise we shouldn't need llist... > > > > > > > Hm... We increase the number of elements in cache, therefore it is not > > > lockless. From the other hand i used llist_head to maintain the cache > > > because it is single linked list, we do not need "*prev" link. Also > > > we do not need to init the list. > > > > > > But i can change it to list_head. Please let me know if i need :) > > > > Hmmm... Maybe it is time for a non-atomic singly linked list? In the RCU > > callback processing, the operations were open-coded, but they have been > > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*. > > > > Maybe some non-atomic/protected/whatever macros in the llist.h file? > > Or maybe just open-code the singly linked list? (Probably not the > > best choice, though.) Add comments stating that the atomic properties > > of the llist functions aren't neded? Something else? > > > In order to keep it simple i can replace llist_head by the list_head? Just to clarify for me, what is the disadvantage of using llist here? Since we don't care about traversing backwards, isn't it better to use llist for this usecase? I think Vlad is using locking as we're also tracking the size of the llist to know when to free pages. This tracking could suffer from the lost-update problem without any locking, 2 lockless llist_add happened simulatenously. Also if list_head is used, it will take more space and still use locking. Thoughts? thanks, - Joel > > > > The comments would be a good start. Just to take pity on people seeing > > the potential for concurrency and wondering how the concurrent accesses > > actually happen. ;-) > > > Sounds like you are kidding me :) > > -- > Vlad Rezki
On Mon, May 04, 2020 at 02:08:05PM -0400, Joel Fernandes wrote: > On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote: > > On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote: > [..] > > > > > Presumably the list can also be accessed without holding this lock, > > > > > because otherwise we shouldn't need llist... > > > > > > > > > Hm... We increase the number of elements in cache, therefore it is not > > > > lockless. From the other hand i used llist_head to maintain the cache > > > > because it is single linked list, we do not need "*prev" link. Also > > > > we do not need to init the list. > > > > > > > > But i can change it to list_head. Please let me know if i need :) > > > > > > Hmmm... Maybe it is time for a non-atomic singly linked list? In the RCU > > > callback processing, the operations were open-coded, but they have been > > > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*. > > > > > > Maybe some non-atomic/protected/whatever macros in the llist.h file? > > > Or maybe just open-code the singly linked list? (Probably not the > > > best choice, though.) Add comments stating that the atomic properties > > > of the llist functions aren't neded? Something else? > > > > > In order to keep it simple i can replace llist_head by the list_head? > > Just to clarify for me, what is the disadvantage of using llist here? Are there some llist APIs that are not set up for concurrency? I am not seeing any. The overhead isn't that much of a concern, given that these are not on the hotpath, but people reading the code and seeing the cmpxchg operations might be forgiven for believing that there is some concurrency involved somewhere. Or am I confused and there are now single-threaded add/delete operations for llist? > Since we don't care about traversing backwards, isn't it better to use llist > for this usecase? > > I think Vlad is using locking as we're also tracking the size of the llist to > know when to free pages. This tracking could suffer from the lost-update > problem without any locking, 2 lockless llist_add happened simulatenously. > > Also if list_head is used, it will take more space and still use locking. Indeed, it would be best to use a non-concurrent singly linked list. Thanx, Paul > Thoughts? > > thanks, > > - Joel > > > > > > > The comments would be a good start. Just to take pity on people seeing > > > the potential for concurrency and wondering how the concurrent accesses > > > actually happen. ;-) > > > > > Sounds like you are kidding me :) > > > > -- > > Vlad Rezki
Hi Paul, On Mon, May 4, 2020 at 3:01 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > On Mon, May 04, 2020 at 02:08:05PM -0400, Joel Fernandes wrote: > > On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote: > > > On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote: > > [..] > > > > > > Presumably the list can also be accessed without holding this lock, > > > > > > because otherwise we shouldn't need llist... > > > > > > > > > > > Hm... We increase the number of elements in cache, therefore it is not > > > > > lockless. From the other hand i used llist_head to maintain the cache > > > > > because it is single linked list, we do not need "*prev" link. Also > > > > > we do not need to init the list. > > > > > > > > > > But i can change it to list_head. Please let me know if i need :) > > > > > > > > Hmmm... Maybe it is time for a non-atomic singly linked list? In the RCU > > > > callback processing, the operations were open-coded, but they have been > > > > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*. > > > > > > > > Maybe some non-atomic/protected/whatever macros in the llist.h file? > > > > Or maybe just open-code the singly linked list? (Probably not the > > > > best choice, though.) Add comments stating that the atomic properties > > > > of the llist functions aren't neded? Something else? > > > > > > > In order to keep it simple i can replace llist_head by the list_head? > > > > Just to clarify for me, what is the disadvantage of using llist here? > > Are there some llist APIs that are not set up for concurrency? I am > not seeing any. llist deletion racing with another llist deletion will need locking. So strictly speaking, some locking is possible with llist usage? The locklessness as I understand comes when adding and deleting at the same time. For that no lock is needed. But in the current patch, it locks anyway to avoid the lost-update of the size of the list. > The overhead isn't that much of a concern, given that these are not on the > hotpath, but people reading the code and seeing the cmpxchg operations > might be forgiven for believing that there is some concurrency involved > somewhere. > > Or am I confused and there are now single-threaded add/delete operations > for llist? I do see some examples of llist usage with locking in the kernel code. One case is: do_init_module() calling llist_add to add to the init_free_list under module_mutex. > > Since we don't care about traversing backwards, isn't it better to use llist > > for this usecase? > > > > I think Vlad is using locking as we're also tracking the size of the llist to > > know when to free pages. This tracking could suffer from the lost-update > > problem without any locking, 2 lockless llist_add happened simulatenously. > > > > Also if list_head is used, it will take more space and still use locking. > > Indeed, it would be best to use a non-concurrent singly linked list. Ok cool :-) Is it safe to say something like the following is ruled out? ;-) :-D #define kfree_rcu_list_add llist_add Thanks, - Joel
> > > Since we don't care about traversing backwards, isn't it better to use llist > > > for this usecase? > > > > > > I think Vlad is using locking as we're also tracking the size of the llist to > > > know when to free pages. This tracking could suffer from the lost-update > > > problem without any locking, 2 lockless llist_add happened simulatenously. > > > > > > Also if list_head is used, it will take more space and still use locking. > > > > Indeed, it would be best to use a non-concurrent singly linked list. > > Ok cool :-) > > Is it safe to say something like the following is ruled out? ;-) :-D > #define kfree_rcu_list_add llist_add > In that case i think it is better just to add a comment about using llist_head. To state that it used as a singular list to save space and the access is synchronized by the lock :) IMHO. -- Vlad Rezki
On May 4, 2020 3:51:28 PM EDT, Uladzislau Rezki <urezki@gmail.com> wrote: >> > > Since we don't care about traversing backwards, isn't it better >to use llist >> > > for this usecase? >> > > >> > > I think Vlad is using locking as we're also tracking the size of >the llist to >> > > know when to free pages. This tracking could suffer from the >lost-update >> > > problem without any locking, 2 lockless llist_add happened >simulatenously. >> > > >> > > Also if list_head is used, it will take more space and still use >locking. >> > >> > Indeed, it would be best to use a non-concurrent singly linked >list. >> >> Ok cool :-) >> >> Is it safe to say something like the following is ruled out? ;-) :-D >> #define kfree_rcu_list_add llist_add >> >In that case i think it is better just to add a comment about using >llist_head. To state that it used as a singular list to save space >and the access is synchronized by the lock :) > >IMHO. Sounds good to me. thanks, - Joel > >-- >Vlad Rezki
On Mon, May 04, 2020 at 09:51:28PM +0200, Uladzislau Rezki wrote: > > > > Since we don't care about traversing backwards, isn't it better to use llist > > > > for this usecase? > > > > > > > > I think Vlad is using locking as we're also tracking the size of the llist to > > > > know when to free pages. This tracking could suffer from the lost-update > > > > problem without any locking, 2 lockless llist_add happened simulatenously. > > > > > > > > Also if list_head is used, it will take more space and still use locking. > > > > > > Indeed, it would be best to use a non-concurrent singly linked list. > > > > Ok cool :-) > > > > Is it safe to say something like the following is ruled out? ;-) :-D > > #define kfree_rcu_list_add llist_add > > > In that case i think it is better just to add a comment about using > llist_head. To state that it used as a singular list to save space > and the access is synchronized by the lock :) > > IMHO. But adding such a comment would be fine as well. Thanx, Paul
On Mon, May 04, 2020 at 01:16:41PM -0700, Paul E. McKenney wrote: > On Mon, May 04, 2020 at 09:51:28PM +0200, Uladzislau Rezki wrote: > > > > > Since we don't care about traversing backwards, isn't it better to use llist > > > > > for this usecase? > > > > > > > > > > I think Vlad is using locking as we're also tracking the size of the llist to > > > > > know when to free pages. This tracking could suffer from the lost-update > > > > > problem without any locking, 2 lockless llist_add happened simulatenously. > > > > > > > > > > Also if list_head is used, it will take more space and still use locking. > > > > > > > > Indeed, it would be best to use a non-concurrent singly linked list. > > > > > > Ok cool :-) > > > > > > Is it safe to say something like the following is ruled out? ;-) :-D > > > #define kfree_rcu_list_add llist_add > > > > > In that case i think it is better just to add a comment about using > > llist_head. To state that it used as a singular list to save space > > and the access is synchronized by the lock :) > > > > IMHO. > > But adding such a comment would be fine as well. > Thank you Paul and Joel! -- Vlad Rezki
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 89e9ca3f4e3e..d8975819b1c9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is read-only, but can be write also. + * It reflects the minimum allowed number of objects which + * can be cached per-CPU. Object size is equal to one page. + */ +int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work { * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work { struct kfree_rcu_cpu { struct rcu_head *head; struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu { bool initialized; // Number of objects for which GP not started int count; + + /* + * Number of cached objects which are queued into + * the lock-less list. This cache is used by the + * kvfree_call_rcu() function and as of now its + * size is static. + */ + struct llist_head bkvcache; + int nr_bkv_objs; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } +static inline struct kfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kfree_rcu_bulk_data *bnode) +{ + /* Check the limit. */ + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work) kfree_bulk(bhead->nr_records, bhead->records); rcu_lock_release(&rcu_callback_map); - if (cmpxchg(&krcp->bcached, NULL, bhead)) + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bhead)) + bhead = NULL; + krc_this_cpu_unlock(krcp, flags); + + if (bhead) free_page((unsigned long) bhead); cond_resched_tasks_rcu_qs(); @@ -3114,7 +3159,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Check if a new block is required. */ if (!krcp->bhead || krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + bnode = get_cached_bnode(krcp); if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); @@ -4167,12 +4212,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; }
Cache some extra objects per-CPU. During reclaim process some pages are cached instead of releasing by linking them into the list. Such approach provides O(1) access time to the cache. That reduces number of requests to the page allocator, also that makes it more helpful if a low memory condition occurs. A parameter reflecting the minimum allowed pages to be cached per one CPU is propagated via sysfs, it is read only, the name is "rcu_min_cached_objs". Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> --- kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-)