Message ID: 20230220091637.64865-3-zhengqi.arch@bytedance.com (mailing list archive)
State: New
Series: make slab shrink lockless
On 20.02.2023 12:16, Qi Zheng wrote: > Like global slab shrink, since commit 1cd0bd06093c > ("rcu: Remove CONFIG_SRCU"), it's time to use SRCU > to protect readers who previously held shrinker_rwsem. > > We can test with the following script: > > ``` > DIR="/root/shrinker/memcg/mnt" > > do_create() > { > mkdir /sys/fs/cgroup/memory/test > echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes > for i in `seq 0 $1`; > do > mkdir /sys/fs/cgroup/memory/test/$i; > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > mkdir -p $DIR/$i; > done > } > > do_mount() > { > for i in `seq $1 $2`; > do > mount -t tmpfs $i $DIR/$i; > done > } > > do_touch() > { > for i in `seq $1 $2`; > do > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & > done > } > > do_create 2000 > do_mount 0 2000 > do_touch 0 1000 > ``` > > Before applying: > > 46.60% [kernel] [k] down_read_trylock > 18.70% [kernel] [k] up_read > 15.44% [kernel] [k] shrink_slab > 4.37% [kernel] [k] _find_next_bit > 2.75% [kernel] [k] xa_load > 2.07% [kernel] [k] idr_find > 1.73% [kernel] [k] do_shrink_slab > 1.42% [kernel] [k] shrink_lruvec > 0.74% [kernel] [k] shrink_node > 0.60% [kernel] [k] list_lru_count_one > > After applying: > > 19.53% [kernel] [k] _find_next_bit > 14.63% [kernel] [k] do_shrink_slab > 14.58% [kernel] [k] shrink_slab > 11.83% [kernel] [k] shrink_lruvec > 9.33% [kernel] [k] __blk_flush_plug > 6.67% [kernel] [k] mem_cgroup_iter > 3.73% [kernel] [k] list_lru_count_one > 2.43% [kernel] [k] shrink_node > 1.96% [kernel] [k] super_cache_count > 1.78% [kernel] [k] __rcu_read_unlock > 1.38% [kernel] [k] __srcu_read_lock > 1.30% [kernel] [k] xas_descend > > We can see that the readers is no longer blocked. > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> > --- > mm/vmscan.c | 56 ++++++++++++++++++++++++++++++----------------------- > 1 file changed, 32 insertions(+), 24 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 95a3d6ddc6c1..dc47396ecd0e 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -57,6 +57,7 @@ > #include <linux/khugepaged.h> > #include <linux/rculist_nulls.h> > #include <linux/random.h> > +#include <linux/srcu.h> > > #include <asm/tlbflush.h> > #include <asm/div64.h> > @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items) > static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, > int nid) > { > - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, > - lockdep_is_held(&shrinker_rwsem)); > + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu, > + lockdep_is_held(&shrinker_rwsem)); > +} > + > +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, > + int nid) > +{ > + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu); > +} > + > +static void free_shrinker_info_rcu(struct rcu_head *head) > +{ > + kvfree(container_of(head, struct shrinker_info, rcu)); > } > > static int expand_one_shrinker_info(struct mem_cgroup *memcg, > @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, > defer_size - old_defer_size); > > rcu_assign_pointer(pn->shrinker_info, new); > - kvfree_rcu(old, rcu); > + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); > } > > return 0; > @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) > { > if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { > struct shrinker_info *info; > + int 
srcu_idx; > > - rcu_read_lock(); > - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > /* Pairs with smp mb in shrink_slab() */ > smp_mb__before_atomic(); > set_bit(shrinker_id, info->map); > - rcu_read_unlock(); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > } > > @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) > return -ENOSYS; > > down_write(&shrinker_rwsem); > - /* This may call shrinker, so it must use down_read_trylock() */ > id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); > if (id < 0) > goto unlock; > @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); > } > > @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); > } > > void reparent_shrinker_deferred(struct mem_cgroup *memcg) > { > - int i, nid; > + int i, nid, srcu_idx; > long nr; > struct mem_cgroup *parent; > struct shrinker_info *child_info, *parent_info; > @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) > parent = root_mem_cgroup; > > /* Prevent from concurrent shrinker_info expand */ > - down_read(&shrinker_rwsem); > + srcu_idx = srcu_read_lock(&shrinker_srcu); Don't we still have to be protected against parallel expand_one_shrinker_info()? It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand* right after we've dereferenced it here. > for_each_node(nid) { > - child_info = shrinker_info_protected(memcg, nid); > - parent_info = shrinker_info_protected(parent, nid); > + child_info = shrinker_info_srcu(memcg, nid); > + parent_info = shrinker_info_srcu(parent, nid); > for (i = 0; i < shrinker_nr_max; i++) { > nr = atomic_long_read(&child_info->nr_deferred[i]); > atomic_long_add(nr, &parent_info->nr_deferred[i]); > } > } > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > > static bool cgroup_reclaim(struct scan_control *sc) > @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > { > struct shrinker_info *info; > unsigned long ret, freed = 0; > + int srcu_idx; > int i; > > if (!mem_cgroup_online(memcg)) > return 0; > > - if (!down_read_trylock(&shrinker_rwsem)) > - return 0; > - > - info = shrinker_info_protected(memcg, nid); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > if (unlikely(!info)) > goto unlock; > > @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > set_shrinker_bit(memcg, nid, i); > } > freed += ret; > - > - if (rwsem_is_contended(&shrinker_rwsem)) { > - freed = freed ? : 1; > - break; > - } > } > unlock: > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > return freed; > } > #else /* CONFIG_MEMCG */
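To make the window being described concrete, a rough two-CPU interleaving is sketched below; the function names are taken from the patch, the schedule itself is illustrative and not part of the original mail.

```c
/* CPU 0: reparent_shrinker_deferred(), inside srcu_read_lock() */
parent_info = shrinker_info_srcu(parent, nid);     /* dereferences the old info */

/* CPU 1: expand_one_shrinker_info() on the same parent memcg */
new = kvmalloc_node(size, GFP_KERNEL, nid);
memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);        /* new info is now the visible one */
call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);

/* CPU 0, still inside its SRCU read-side section */
atomic_long_add(nr, &parent_info->nr_deferred[i]);
/*
 * The add lands in the old shrinker_info, which CPU 1 has already copied
 * from and queued for freeing, so the reparented count is lost. The SRCU
 * read lock only delays the kvfree(); it does not stop the pointer from
 * being republished underneath the reader.
 */
```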
On 20.02.2023 12:16, Qi Zheng wrote: > Like global slab shrink, since commit 1cd0bd06093c > ("rcu: Remove CONFIG_SRCU"), it's time to use SRCU > to protect readers who previously held shrinker_rwsem. > > We can test with the following script: > > ``` > DIR="/root/shrinker/memcg/mnt" > > do_create() > { > mkdir /sys/fs/cgroup/memory/test > echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes > for i in `seq 0 $1`; > do > mkdir /sys/fs/cgroup/memory/test/$i; > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > mkdir -p $DIR/$i; > done > } > > do_mount() > { > for i in `seq $1 $2`; > do > mount -t tmpfs $i $DIR/$i; > done > } > > do_touch() > { > for i in `seq $1 $2`; > do > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & > done > } > > do_create 2000 > do_mount 0 2000 > do_touch 0 1000 > ``` > > Before applying: > > 46.60% [kernel] [k] down_read_trylock > 18.70% [kernel] [k] up_read > 15.44% [kernel] [k] shrink_slab > 4.37% [kernel] [k] _find_next_bit > 2.75% [kernel] [k] xa_load > 2.07% [kernel] [k] idr_find > 1.73% [kernel] [k] do_shrink_slab > 1.42% [kernel] [k] shrink_lruvec > 0.74% [kernel] [k] shrink_node > 0.60% [kernel] [k] list_lru_count_one > > After applying: > > 19.53% [kernel] [k] _find_next_bit > 14.63% [kernel] [k] do_shrink_slab > 14.58% [kernel] [k] shrink_slab > 11.83% [kernel] [k] shrink_lruvec > 9.33% [kernel] [k] __blk_flush_plug > 6.67% [kernel] [k] mem_cgroup_iter > 3.73% [kernel] [k] list_lru_count_one > 2.43% [kernel] [k] shrink_node > 1.96% [kernel] [k] super_cache_count > 1.78% [kernel] [k] __rcu_read_unlock > 1.38% [kernel] [k] __srcu_read_lock > 1.30% [kernel] [k] xas_descend > > We can see that the readers is no longer blocked. > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> > --- > mm/vmscan.c | 56 ++++++++++++++++++++++++++++++----------------------- > 1 file changed, 32 insertions(+), 24 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 95a3d6ddc6c1..dc47396ecd0e 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -57,6 +57,7 @@ > #include <linux/khugepaged.h> > #include <linux/rculist_nulls.h> > #include <linux/random.h> > +#include <linux/srcu.h> > > #include <asm/tlbflush.h> > #include <asm/div64.h> > @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items) > static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, > int nid) > { > - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, > - lockdep_is_held(&shrinker_rwsem)); > + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu, > + lockdep_is_held(&shrinker_rwsem)); > +} > + > +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, > + int nid) > +{ > + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu); > +} > + > +static void free_shrinker_info_rcu(struct rcu_head *head) > +{ > + kvfree(container_of(head, struct shrinker_info, rcu)); > } > > static int expand_one_shrinker_info(struct mem_cgroup *memcg, > @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, > defer_size - old_defer_size); > > rcu_assign_pointer(pn->shrinker_info, new); > - kvfree_rcu(old, rcu); > + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); > } > > return 0; > @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) > { > if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { > struct shrinker_info *info; > + int 
srcu_idx; > > - rcu_read_lock(); > - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > /* Pairs with smp mb in shrink_slab() */ > smp_mb__before_atomic(); > set_bit(shrinker_id, info->map); > - rcu_read_unlock(); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > } > > @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) > return -ENOSYS; > > down_write(&shrinker_rwsem); > - /* This may call shrinker, so it must use down_read_trylock() */ > id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); > if (id < 0) > goto unlock; > @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); > } > > @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); > } > > void reparent_shrinker_deferred(struct mem_cgroup *memcg) > { > - int i, nid; > + int i, nid, srcu_idx; > long nr; > struct mem_cgroup *parent; > struct shrinker_info *child_info, *parent_info; > @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) > parent = root_mem_cgroup; > > /* Prevent from concurrent shrinker_info expand */ > - down_read(&shrinker_rwsem); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > for_each_node(nid) { > - child_info = shrinker_info_protected(memcg, nid); > - parent_info = shrinker_info_protected(parent, nid); > + child_info = shrinker_info_srcu(memcg, nid); > + parent_info = shrinker_info_srcu(parent, nid); > for (i = 0; i < shrinker_nr_max; i++) { > nr = atomic_long_read(&child_info->nr_deferred[i]); > atomic_long_add(nr, &parent_info->nr_deferred[i]); > } > } > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > > static bool cgroup_reclaim(struct scan_control *sc) > @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > { > struct shrinker_info *info; > unsigned long ret, freed = 0; > + int srcu_idx; > int i; > > if (!mem_cgroup_online(memcg)) > return 0; > > - if (!down_read_trylock(&shrinker_rwsem)) > - return 0; > - > - info = shrinker_info_protected(memcg, nid); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > if (unlikely(!info)) > goto unlock; There is shrinker_nr_max dereference under this hunk. It's not in the patch: for_each_set_bit(i, info->map, shrinker_nr_max) { Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :( It looks like we should save size of info->map as a new member of struct shrinker_info. > > @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > set_shrinker_bit(memcg, nid, i); > } > freed += ret; > - > - if (rwsem_is_contended(&shrinker_rwsem)) { > - freed = freed ? : 1; > - break; > - } > } > unlock: > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > return freed; > } > #else /* CONFIG_MEMCG */
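A minimal sketch of the overrun being described; the names follow the patch, and the sizes in the comments are only examples.

```c
/*
 * info was allocated when shrinker_nr_max was, say, 64, so info->map holds
 * 64 bits. A parallel expand_shrinker_info() can bump shrinker_nr_max to
 * 128 and publish a bigger shrinker_info, but this reader still holds the
 * old one via SRCU.
 */
srcu_idx = srcu_read_lock(&shrinker_srcu);
info = shrinker_info_srcu(memcg, nid);                  /* old, 64-bit map */

for_each_set_bit(i, info->map, shrinker_nr_max) {       /* bound may now be 128 */
        /* bits 64..127 are read from memory past the old allocation */
}

srcu_read_unlock(&shrinker_srcu, srcu_idx);
```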
On 2023/2/22 05:28, Kirill Tkhai wrote: > On 20.02.2023 12:16, Qi Zheng wrote: <...> >> >> void reparent_shrinker_deferred(struct mem_cgroup *memcg) >> { >> - int i, nid; >> + int i, nid, srcu_idx; >> long nr; >> struct mem_cgroup *parent; >> struct shrinker_info *child_info, *parent_info; >> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) >> parent = root_mem_cgroup; >> >> /* Prevent from concurrent shrinker_info expand */ >> - down_read(&shrinker_rwsem); >> + srcu_idx = srcu_read_lock(&shrinker_srcu); > > Don't we still have to be protected against parallel expand_one_shrinker_info()? > > It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand* > right after we've dereferenced it here. Hi Kirill, Oh, indeed. We may wrongly reparent the child's nr_deferred to the old parent's nr_deferred (it is about to be freed by call_srcu). The reparent_shrinker_deferred() will only be called on the offline path (not a hotspot path), so we may be able to use shrinker_mutex introduced later for protection. What do you think? Thanks, Qi > >> for_each_node(nid) { >> - child_info = shrinker_info_protected(memcg, nid); >> - parent_info = shrinker_info_protected(parent, nid); >> + child_info = shrinker_info_srcu(memcg, nid); >> + parent_info = shrinker_info_srcu(parent, nid); >> for (i = 0; i < shrinker_nr_max; i++) { >> nr = atomic_long_read(&child_info->nr_deferred[i]); >> atomic_long_add(nr, &parent_info->nr_deferred[i]); >> } >> } >> - up_read(&shrinker_rwsem); >> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >> } >> >> static bool cgroup_reclaim(struct scan_control *sc) >> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >> { >> struct shrinker_info *info; >> unsigned long ret, freed = 0; >> + int srcu_idx; >> int i; >> >> if (!mem_cgroup_online(memcg)) >> return 0; >> >> - if (!down_read_trylock(&shrinker_rwsem)) >> - return 0; >> - >> - info = shrinker_info_protected(memcg, nid); >> + srcu_idx = srcu_read_lock(&shrinker_srcu); >> + info = shrinker_info_srcu(memcg, nid); >> if (unlikely(!info)) >> goto unlock; >> >> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >> set_shrinker_bit(memcg, nid, i); >> } >> freed += ret; >> - >> - if (rwsem_is_contended(&shrinker_rwsem)) { >> - freed = freed ? : 1; >> - break; >> - } >> } >> unlock: >> - up_read(&shrinker_rwsem); >> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >> return freed; >> } >> #else /* CONFIG_MEMCG */ >
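For reference, the "offline path" remark refers to the fact that, in current mainline, reparent_shrinker_deferred() is only reached from the memcg offline callback, roughly as sketched below (the surrounding code is paraphrased from mm/memcontrol.c, not quoted, so treat it as an assumption).

```c
/* Sketch: the only caller is the cgroup offline callback, not a reclaim hot path. */
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        /* ... */
        reparent_shrinker_deferred(memcg);
        /* ... */
}
```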
Hi Kirill, On 2023/2/22 05:43, Kirill Tkhai wrote: > On 20.02.2023 12:16, Qi Zheng wrote: >> Like global slab shrink, since commit 1cd0bd06093c<...> >> static bool cgroup_reclaim(struct scan_control *sc) >> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >> { >> struct shrinker_info *info; >> unsigned long ret, freed = 0; >> + int srcu_idx; >> int i; >> >> if (!mem_cgroup_online(memcg)) >> return 0; >> >> - if (!down_read_trylock(&shrinker_rwsem)) >> - return 0; >> - >> - info = shrinker_info_protected(memcg, nid); >> + srcu_idx = srcu_read_lock(&shrinker_srcu); >> + info = shrinker_info_srcu(memcg, nid); >> if (unlikely(!info)) >> goto unlock; > > There is shrinker_nr_max dereference under this hunk. It's not in the patch: > > for_each_set_bit(i, info->map, shrinker_nr_max) { > > Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :( Oh, indeed. > > It looks like we should save size of info->map as a new member of struct shrinker_info. Agree, then we only traverse info->map_size each time. Like below: diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b6eda2ab205d..f1b5d2803007 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -97,6 +97,7 @@ struct shrinker_info { struct rcu_head rcu; atomic_long_t *nr_deferred; unsigned long *map; + int map_size; }; struct lruvec_stats_percpu { diff --git a/mm/vmscan.c b/mm/vmscan.c index f94bfe540675..dd07eb107915 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head) kvfree(container_of(head, struct shrinker_info, rcu)); } -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max, + int old_nr_max) { + int map_size, defer_size, old_map_size, old_defer_size; struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; - int size = map_size + defer_size; + int size; + + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, new->nr_deferred = (atomic_long_t *)(new + 1); new->map = (void *)new->nr_deferred + defer_size; + new->map_size = new_nr_max; /* map: set all old bits, clear all new bits */ memset(new->map, (int)0xff, old_map_size); @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) } info->nr_deferred = (atomic_long_t *)(info + 1); info->map = (void *)info->nr_deferred + defer_size; + info->map_size = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id) { int ret = 0; int new_nr_max = new_id + 1; - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; struct mem_cgroup *memcg; if (!need_expand(new_nr_max)) @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id) lockdep_assert_held(&shrinker_mutex); - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); - memcg = mem_cgroup_iter(NULL, 
NULL, NULL); do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size); + ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (unlikely(!info)) goto unlock; - for_each_set_bit(i, info->map, shrinker_nr_max) { + for_each_set_bit(i, info->map, info->map_size) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, I will send the v2. Thanks, Qi > >> >> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >> set_shrinker_bit(memcg, nid, i); >> } >> freed += ret; >> - >> - if (rwsem_is_contended(&shrinker_rwsem)) { >> - freed = freed ? : 1; >> - break; >> - } >> } >> unlock: >> - up_read(&shrinker_rwsem); >> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >> return freed; >> } >> #else /* CONFIG_MEMCG */ >
On 2023/2/22 16:16, Qi Zheng wrote: > Hi Kirill, > > On 2023/2/22 05:43, Kirill Tkhai wrote: >> On 20.02.2023 12:16, Qi Zheng wrote: >>> Like global slab shrink, since commit 1cd0bd06093c<...> >>> static bool cgroup_reclaim(struct scan_control *sc) >>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t >>> gfp_mask, int nid, >>> { >>> struct shrinker_info *info; >>> unsigned long ret, freed = 0; >>> + int srcu_idx; >>> int i; >>> if (!mem_cgroup_online(memcg)) >>> return 0; >>> - if (!down_read_trylock(&shrinker_rwsem)) >>> - return 0; >>> - >>> - info = shrinker_info_protected(memcg, nid); >>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>> + info = shrinker_info_srcu(memcg, nid); >>> if (unlikely(!info)) >>> goto unlock; >> >> There is shrinker_nr_max dereference under this hunk. It's not in the >> patch: >> >> for_each_set_bit(i, info->map, shrinker_nr_max) { >> >> Since shrinker_nr_max may grow in parallel, this leads to access >> beyond allocated memory :( > > Oh, indeed. > >> >> It looks like we should save size of info->map as a new member of >> struct shrinker_info. > > Agree, then we only traverse info->map_size each time. Like below: > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index b6eda2ab205d..f1b5d2803007 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -97,6 +97,7 @@ struct shrinker_info { > struct rcu_head rcu; > atomic_long_t *nr_deferred; > unsigned long *map; > + int map_size; > }; > > struct lruvec_stats_percpu { > diff --git a/mm/vmscan.c b/mm/vmscan.c > index f94bfe540675..dd07eb107915 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head > *head) > kvfree(container_of(head, struct shrinker_info, rcu)); > } > > -static int expand_one_shrinker_info(struct mem_cgroup *memcg, > - int map_size, int defer_size, > - int old_map_size, int old_defer_size) > +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int > new_nr_max, > + int old_nr_max) > { > + int map_size, defer_size, old_map_size, old_defer_size; > struct shrinker_info *new, *old; > struct mem_cgroup_per_node *pn; > int nid; > - int size = map_size + defer_size; > + int size; > + > + map_size = shrinker_map_size(new_nr_max); > + defer_size = shrinker_defer_size(new_nr_max); > + old_map_size = shrinker_map_size(shrinker_nr_max); > + old_defer_size = shrinker_defer_size(shrinker_nr_max); Perhaps these should still be calculated outside the loop as before. 
> + size = map_size + defer_size; > > for_each_node(nid) { > pn = memcg->nodeinfo[nid]; > @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct > mem_cgroup *memcg, > > new->nr_deferred = (atomic_long_t *)(new + 1); > new->map = (void *)new->nr_deferred + defer_size; > + new->map_size = new_nr_max; > > /* map: set all old bits, clear all new bits */ > memset(new->map, (int)0xff, old_map_size); > @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) > } > info->nr_deferred = (atomic_long_t *)(info + 1); > info->map = (void *)info->nr_deferred + defer_size; > + info->map_size = shrinker_nr_max; > rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, > info); > } > mutex_unlock(&shrinker_mutex); > @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id) > { > int ret = 0; > int new_nr_max = new_id + 1; > - int map_size, defer_size = 0; > - int old_map_size, old_defer_size = 0; > struct mem_cgroup *memcg; > > if (!need_expand(new_nr_max)) > @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id) > > lockdep_assert_held(&shrinker_mutex); > > - map_size = shrinker_map_size(new_nr_max); > - defer_size = shrinker_defer_size(new_nr_max); > - old_map_size = shrinker_map_size(shrinker_nr_max); > - old_defer_size = shrinker_defer_size(shrinker_nr_max); > - > memcg = mem_cgroup_iter(NULL, NULL, NULL); > do { > - ret = expand_one_shrinker_info(memcg, map_size, defer_size, > - old_map_size, > old_defer_size); > + ret = expand_one_shrinker_info(memcg, new_nr_max, > shrinker_nr_max); > if (ret) { > mem_cgroup_iter_break(NULL, memcg); > goto out; > @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t > gfp_mask, int nid, > if (unlikely(!info)) > goto unlock; > > - for_each_set_bit(i, info->map, shrinker_nr_max) { > + for_each_set_bit(i, info->map, info->map_size) { > struct shrink_control sc = { > .gfp_mask = gfp_mask, > .nid = nid, > > I will send the v2. > > Thanks, > Qi > >> >>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t >>> gfp_mask, int nid, >>> set_shrinker_bit(memcg, nid, i); >>> } >>> freed += ret; >>> - >>> - if (rwsem_is_contended(&shrinker_rwsem)) { >>> - freed = freed ? : 1; >>> - break; >>> - } >>> } >>> unlock: >>> - up_read(&shrinker_rwsem); >>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>> return freed; >>> } >>> #else /* CONFIG_MEMCG */ >> >
On 22.02.2023 10:32, Qi Zheng wrote: > > > On 2023/2/22 05:28, Kirill Tkhai wrote: >> On 20.02.2023 12:16, Qi Zheng wrote: > <...> >>> void reparent_shrinker_deferred(struct mem_cgroup *memcg) >>> { >>> - int i, nid; >>> + int i, nid, srcu_idx; >>> long nr; >>> struct mem_cgroup *parent; >>> struct shrinker_info *child_info, *parent_info; >>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) >>> parent = root_mem_cgroup; >>> /* Prevent from concurrent shrinker_info expand */ >>> - down_read(&shrinker_rwsem); >>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >> >> Don't we still have to be protected against parallel expand_one_shrinker_info()? >> >> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand* >> right after we've dereferenced it here. > > Hi Kirill, > > Oh, indeed. We may wrongly reparent the child's nr_deferred to the old > parent's nr_deferred (it is about to be freed by call_srcu). > > The reparent_shrinker_deferred() will only be called on the offline > path (not a hotspot path), so we may be able to use shrinker_mutex > introduced later for protection. What do you think? It looks good for me. One more thing I'd analyzed is whether we want to have is two reparent_shrinker_deferred() are executing in parallel. Possible, we should leave rwsem there as it was used before.. >> >>> for_each_node(nid) { >>> - child_info = shrinker_info_protected(memcg, nid); >>> - parent_info = shrinker_info_protected(parent, nid); >>> + child_info = shrinker_info_srcu(memcg, nid); >>> + parent_info = shrinker_info_srcu(parent, nid); >>> for (i = 0; i < shrinker_nr_max; i++) { >>> nr = atomic_long_read(&child_info->nr_deferred[i]); >>> atomic_long_add(nr, &parent_info->nr_deferred[i]); >>> } >>> } >>> - up_read(&shrinker_rwsem); >>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>> } >>> static bool cgroup_reclaim(struct scan_control *sc) >>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>> { >>> struct shrinker_info *info; >>> unsigned long ret, freed = 0; >>> + int srcu_idx; >>> int i; >>> if (!mem_cgroup_online(memcg)) >>> return 0; >>> - if (!down_read_trylock(&shrinker_rwsem)) >>> - return 0; >>> - >>> - info = shrinker_info_protected(memcg, nid); >>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>> + info = shrinker_info_srcu(memcg, nid); >>> if (unlikely(!info)) >>> goto unlock; >>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>> set_shrinker_bit(memcg, nid, i); >>> } >>> freed += ret; >>> - >>> - if (rwsem_is_contended(&shrinker_rwsem)) { >>> - freed = freed ? : 1; >>> - break; >>> - } >>> } >>> unlock: >>> - up_read(&shrinker_rwsem); >>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>> return freed; >>> } >>> #else /* CONFIG_MEMCG */ >> >
On 22.02.2023 11:21, Qi Zheng wrote: > > > On 2023/2/22 16:16, Qi Zheng wrote: >> Hi Kirill, >> >> On 2023/2/22 05:43, Kirill Tkhai wrote: >>> On 20.02.2023 12:16, Qi Zheng wrote: >>>> Like global slab shrink, since commit 1cd0bd06093c<...> >>>> static bool cgroup_reclaim(struct scan_control *sc) >>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>> { >>>> struct shrinker_info *info; >>>> unsigned long ret, freed = 0; >>>> + int srcu_idx; >>>> int i; >>>> if (!mem_cgroup_online(memcg)) >>>> return 0; >>>> - if (!down_read_trylock(&shrinker_rwsem)) >>>> - return 0; >>>> - >>>> - info = shrinker_info_protected(memcg, nid); >>>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>>> + info = shrinker_info_srcu(memcg, nid); >>>> if (unlikely(!info)) >>>> goto unlock; >>> >>> There is shrinker_nr_max dereference under this hunk. It's not in the patch: >>> >>> for_each_set_bit(i, info->map, shrinker_nr_max) { >>> >>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :( >> >> Oh, indeed. >> >>> >>> It looks like we should save size of info->map as a new member of struct shrinker_info. >> >> Agree, then we only traverse info->map_size each time. Like below: >> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h >> index b6eda2ab205d..f1b5d2803007 100644 >> --- a/include/linux/memcontrol.h >> +++ b/include/linux/memcontrol.h >> @@ -97,6 +97,7 @@ struct shrinker_info { >> struct rcu_head rcu; >> atomic_long_t *nr_deferred; >> unsigned long *map; >> + int map_size; Sure, like this. The only thing (after rethinking) I want to say is that despite "size" was may suggestion, now it makes me think that name "size" is about size in bytes. Possible, something like map_nr_max would be better here. >> }; >> >> struct lruvec_stats_percpu { >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index f94bfe540675..dd07eb107915 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head) >> kvfree(container_of(head, struct shrinker_info, rcu)); >> } >> >> -static int expand_one_shrinker_info(struct mem_cgroup *memcg, >> - int map_size, int defer_size, >> - int old_map_size, int old_defer_size) >> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max, >> + int old_nr_max) >> { >> + int map_size, defer_size, old_map_size, old_defer_size; >> struct shrinker_info *new, *old; >> struct mem_cgroup_per_node *pn; >> int nid; >> - int size = map_size + defer_size; >> + int size; >> + >> + map_size = shrinker_map_size(new_nr_max); >> + defer_size = shrinker_defer_size(new_nr_max); >> + old_map_size = shrinker_map_size(shrinker_nr_max); >> + old_defer_size = shrinker_defer_size(shrinker_nr_max); > > Perhaps these should still be calculated outside the loop as before. Yeah, for me it's also better. 
>> + size = map_size + defer_size; >> >> for_each_node(nid) { >> pn = memcg->nodeinfo[nid]; >> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, >> >> new->nr_deferred = (atomic_long_t *)(new + 1); >> new->map = (void *)new->nr_deferred + defer_size; >> + new->map_size = new_nr_max; >> >> /* map: set all old bits, clear all new bits */ >> memset(new->map, (int)0xff, old_map_size); >> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) >> } >> info->nr_deferred = (atomic_long_t *)(info + 1); >> info->map = (void *)info->nr_deferred + defer_size; >> + info->map_size = shrinker_nr_max; >> rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); >> } >> mutex_unlock(&shrinker_mutex); >> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id) >> { >> int ret = 0; >> int new_nr_max = new_id + 1; >> - int map_size, defer_size = 0; >> - int old_map_size, old_defer_size = 0; >> struct mem_cgroup *memcg; >> >> if (!need_expand(new_nr_max)) >> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id) >> >> lockdep_assert_held(&shrinker_mutex); >> >> - map_size = shrinker_map_size(new_nr_max); >> - defer_size = shrinker_defer_size(new_nr_max); >> - old_map_size = shrinker_map_size(shrinker_nr_max); >> - old_defer_size = shrinker_defer_size(shrinker_nr_max); >> - >> memcg = mem_cgroup_iter(NULL, NULL, NULL); >> do { >> - ret = expand_one_shrinker_info(memcg, map_size, defer_size, >> - old_map_size, old_defer_size); >> + ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max); >> if (ret) { >> mem_cgroup_iter_break(NULL, memcg); >> goto out; >> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >> if (unlikely(!info)) >> goto unlock; >> >> - for_each_set_bit(i, info->map, shrinker_nr_max) { >> + for_each_set_bit(i, info->map, info->map_size) { >> struct shrink_control sc = { >> .gfp_mask = gfp_mask, >> .nid = nid, >> >> I will send the v2. >> >> Thanks, >> Qi >> >>> >>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>> set_shrinker_bit(memcg, nid, i); >>>> } >>>> freed += ret; >>>> - >>>> - if (rwsem_is_contended(&shrinker_rwsem)) { >>>> - freed = freed ? : 1; >>>> - break; >>>> - } >>>> } >>>> unlock: >>>> - up_read(&shrinker_rwsem); >>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>>> return freed; >>>> } >>>> #else /* CONFIG_MEMCG */ >>> >> >
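One possible shape of that compromise is sketched below: the byte sizes stay in the caller, computed once per expansion rather than once per memcg, and only the new bit count is passed through so the callee can record it. The signature is illustrative, not necessarily what v2 will use.

```c
/* Caller (expand_shrinker_info): compute byte sizes once per expansion. */
map_size = shrinker_map_size(new_nr_max);
defer_size = shrinker_defer_size(new_nr_max);
old_map_size = shrinker_map_size(shrinker_nr_max);
old_defer_size = shrinker_defer_size(shrinker_nr_max);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
        ret = expand_one_shrinker_info(memcg, map_size, defer_size,
                                       old_map_size, old_defer_size,
                                       new_nr_max);
        if (ret) {
                mem_cgroup_iter_break(NULL, memcg);
                goto out;
        }
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);

/* Callee then only needs new_nr_max to fill in the new field:
 *         new->map_nr_max = new_nr_max;
 */
```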
On 2023/2/23 03:58, Kirill Tkhai wrote: > On 22.02.2023 10:32, Qi Zheng wrote: >> >> >> On 2023/2/22 05:28, Kirill Tkhai wrote: >>> On 20.02.2023 12:16, Qi Zheng wrote: >> <...> >>>> void reparent_shrinker_deferred(struct mem_cgroup *memcg) >>>> { >>>> - int i, nid; >>>> + int i, nid, srcu_idx; >>>> long nr; >>>> struct mem_cgroup *parent; >>>> struct shrinker_info *child_info, *parent_info; >>>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) >>>> parent = root_mem_cgroup; >>>> /* Prevent from concurrent shrinker_info expand */ >>>> - down_read(&shrinker_rwsem); >>>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>> >>> Don't we still have to be protected against parallel expand_one_shrinker_info()? >>> >>> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand* >>> right after we've dereferenced it here. >> >> Hi Kirill, >> >> Oh, indeed. We may wrongly reparent the child's nr_deferred to the old >> parent's nr_deferred (it is about to be freed by call_srcu). >> >> The reparent_shrinker_deferred() will only be called on the offline >> path (not a hotspot path), so we may be able to use shrinker_mutex >> introduced later for protection. What do you think? > > It looks good for me. One more thing I'd analyzed is whether we want to have > is two reparent_shrinker_deferred() are executing in parallel. I see that mem_cgroup_css_offline() is already protected by cgroup_mutex, so maybe shrinker_mutex is enough here. :) > > Possible, we should leave rwsem there as it was used before.. > >>> >>>> for_each_node(nid) { >>>> - child_info = shrinker_info_protected(memcg, nid); >>>> - parent_info = shrinker_info_protected(parent, nid); >>>> + child_info = shrinker_info_srcu(memcg, nid); >>>> + parent_info = shrinker_info_srcu(parent, nid); >>>> for (i = 0; i < shrinker_nr_max; i++) { >>>> nr = atomic_long_read(&child_info->nr_deferred[i]); >>>> atomic_long_add(nr, &parent_info->nr_deferred[i]); >>>> } >>>> } >>>> - up_read(&shrinker_rwsem); >>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>>> } >>>> static bool cgroup_reclaim(struct scan_control *sc) >>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>> { >>>> struct shrinker_info *info; >>>> unsigned long ret, freed = 0; >>>> + int srcu_idx; >>>> int i; >>>> if (!mem_cgroup_online(memcg)) >>>> return 0; >>>> - if (!down_read_trylock(&shrinker_rwsem)) >>>> - return 0; >>>> - >>>> - info = shrinker_info_protected(memcg, nid); >>>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>>> + info = shrinker_info_srcu(memcg, nid); >>>> if (unlikely(!info)) >>>> goto unlock; >>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>> set_shrinker_bit(memcg, nid, i); >>>> } >>>> freed += ret; >>>> - >>>> - if (rwsem_is_contended(&shrinker_rwsem)) { >>>> - freed = freed ? : 1; >>>> - break; >>>> - } >>>> } >>>> unlock: >>>> - up_read(&shrinker_rwsem); >>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>>> return freed; >>>> } >>>> #else /* CONFIG_MEMCG */ >>> >> >
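A rough sketch of what reparent_shrinker_deferred() could become under that scheme; shrinker_mutex is the lock introduced later in this series, so its use here is an assumption, and shrinker_info_protected()'s lockdep annotation would have to follow it.

```c
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
        int i, nid;
        long nr;
        struct mem_cgroup *parent;
        struct shrinker_info *child_info, *parent_info;

        parent = parent_mem_cgroup(memcg);
        if (!parent)
                parent = root_mem_cgroup;

        /*
         * Offlining is a slow path; a sleeping lock here is cheap and, unlike
         * an SRCU read lock alone, it keeps expand_one_shrinker_info() from
         * republishing parent_info underneath us.
         */
        mutex_lock(&shrinker_mutex);
        for_each_node(nid) {
                child_info = shrinker_info_protected(memcg, nid);
                parent_info = shrinker_info_protected(parent, nid);
                for (i = 0; i < shrinker_nr_max; i++) {
                        nr = atomic_long_read(&child_info->nr_deferred[i]);
                        atomic_long_add(nr, &parent_info->nr_deferred[i]);
                }
        }
        mutex_unlock(&shrinker_mutex);
}
```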
On 2023/2/23 04:05, Kirill Tkhai wrote: > On 22.02.2023 11:21, Qi Zheng wrote: >> >> >> On 2023/2/22 16:16, Qi Zheng wrote: >>> Hi Kirill, >>> >>> On 2023/2/22 05:43, Kirill Tkhai wrote: >>>> On 20.02.2023 12:16, Qi Zheng wrote: >>>>> Like global slab shrink, since commit 1cd0bd06093c<...> >>>>> static bool cgroup_reclaim(struct scan_control *sc) >>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>>> { >>>>> struct shrinker_info *info; >>>>> unsigned long ret, freed = 0; >>>>> + int srcu_idx; >>>>> int i; >>>>> if (!mem_cgroup_online(memcg)) >>>>> return 0; >>>>> - if (!down_read_trylock(&shrinker_rwsem)) >>>>> - return 0; >>>>> - >>>>> - info = shrinker_info_protected(memcg, nid); >>>>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>>>> + info = shrinker_info_srcu(memcg, nid); >>>>> if (unlikely(!info)) >>>>> goto unlock; >>>> >>>> There is shrinker_nr_max dereference under this hunk. It's not in the patch: >>>> >>>> for_each_set_bit(i, info->map, shrinker_nr_max) { >>>> >>>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :( >>> >>> Oh, indeed. >>> >>>> >>>> It looks like we should save size of info->map as a new member of struct shrinker_info. >>> >>> Agree, then we only traverse info->map_size each time. Like below: >>> >>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h >>> index b6eda2ab205d..f1b5d2803007 100644 >>> --- a/include/linux/memcontrol.h >>> +++ b/include/linux/memcontrol.h >>> @@ -97,6 +97,7 @@ struct shrinker_info { >>> struct rcu_head rcu; >>> atomic_long_t *nr_deferred; >>> unsigned long *map; >>> + int map_size; > > Sure, like this. The only thing (after rethinking) I want to say is that despite "size" was > may suggestion, now it makes me think that name "size" is about size in bytes. > > Possible, something like map_nr_max would be better here. Agree. Will change to it. > >>> }; >>> >>> struct lruvec_stats_percpu { >>> diff --git a/mm/vmscan.c b/mm/vmscan.c >>> index f94bfe540675..dd07eb107915 100644 >>> --- a/mm/vmscan.c >>> +++ b/mm/vmscan.c >>> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head) >>> kvfree(container_of(head, struct shrinker_info, rcu)); >>> } >>> >>> -static int expand_one_shrinker_info(struct mem_cgroup *memcg, >>> - int map_size, int defer_size, >>> - int old_map_size, int old_defer_size) >>> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max, >>> + int old_nr_max) >>> { >>> + int map_size, defer_size, old_map_size, old_defer_size; >>> struct shrinker_info *new, *old; >>> struct mem_cgroup_per_node *pn; >>> int nid; >>> - int size = map_size + defer_size; >>> + int size; >>> + >>> + map_size = shrinker_map_size(new_nr_max); >>> + defer_size = shrinker_defer_size(new_nr_max); >>> + old_map_size = shrinker_map_size(shrinker_nr_max); >>> + old_defer_size = shrinker_defer_size(shrinker_nr_max); >> >> Perhaps these should still be calculated outside the loop as before. > > Yeah, for me it's also better. 
> >>> + size = map_size + defer_size; >>> >>> for_each_node(nid) { >>> pn = memcg->nodeinfo[nid]; >>> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, >>> >>> new->nr_deferred = (atomic_long_t *)(new + 1); >>> new->map = (void *)new->nr_deferred + defer_size; >>> + new->map_size = new_nr_max; >>> >>> /* map: set all old bits, clear all new bits */ >>> memset(new->map, (int)0xff, old_map_size); >>> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) >>> } >>> info->nr_deferred = (atomic_long_t *)(info + 1); >>> info->map = (void *)info->nr_deferred + defer_size; >>> + info->map_size = shrinker_nr_max; >>> rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); >>> } >>> mutex_unlock(&shrinker_mutex); >>> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id) >>> { >>> int ret = 0; >>> int new_nr_max = new_id + 1; >>> - int map_size, defer_size = 0; >>> - int old_map_size, old_defer_size = 0; >>> struct mem_cgroup *memcg; >>> >>> if (!need_expand(new_nr_max)) >>> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id) >>> >>> lockdep_assert_held(&shrinker_mutex); >>> >>> - map_size = shrinker_map_size(new_nr_max); >>> - defer_size = shrinker_defer_size(new_nr_max); >>> - old_map_size = shrinker_map_size(shrinker_nr_max); >>> - old_defer_size = shrinker_defer_size(shrinker_nr_max); >>> - >>> memcg = mem_cgroup_iter(NULL, NULL, NULL); >>> do { >>> - ret = expand_one_shrinker_info(memcg, map_size, defer_size, >>> - old_map_size, old_defer_size); >>> + ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max); >>> if (ret) { >>> mem_cgroup_iter_break(NULL, memcg); >>> goto out; >>> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>> if (unlikely(!info)) >>> goto unlock; >>> >>> - for_each_set_bit(i, info->map, shrinker_nr_max) { >>> + for_each_set_bit(i, info->map, info->map_size) { >>> struct shrink_control sc = { >>> .gfp_mask = gfp_mask, >>> .nid = nid, >>> >>> I will send the v2. >>> >>> Thanks, >>> Qi >>> >>>> >>>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>>> set_shrinker_bit(memcg, nid, i); >>>>> } >>>>> freed += ret; >>>>> - >>>>> - if (rwsem_is_contended(&shrinker_rwsem)) { >>>>> - freed = freed ? : 1; >>>>> - break; >>>>> - } >>>>> } >>>>> unlock: >>>>> - up_read(&shrinker_rwsem); >>>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>>>> return freed; >>>>> } >>>>> #else /* CONFIG_MEMCG */ >>>> >>> >> >
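Putting the agreement above together, the structure in v2 would presumably carry the bound under the new name, something like the following sketch (not the posted v2):

```c
struct shrinker_info {
        struct rcu_head rcu;
        atomic_long_t *nr_deferred;
        unsigned long *map;
        int map_nr_max;         /* number of bits valid in @map for this allocation */
};
```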
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95a3d6ddc6c1..dc47396ecd0e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/srcu.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
 {
-	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
-					 lockdep_is_held(&shrinker_rwsem));
+	return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
+				      &shrinker_srcu,
+				      lockdep_is_held(&shrinker_rwsem));
+}
+
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+						int nid)
+{
+	return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+				&shrinker_srcu);
+}
+
+static void free_shrinker_info_rcu(struct rcu_head *head)
+{
+	kvfree(container_of(head, struct shrinker_info, rcu));
 }
 
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
@@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 		       defer_size - old_defer_size);
 
 		rcu_assign_pointer(pn->shrinker_info, new);
-		kvfree_rcu(old, rcu);
+		call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
 	}
 
 	return 0;
@@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 {
 	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 		struct shrinker_info *info;
+		int srcu_idx;
 
-		rcu_read_lock();
-		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+		srcu_idx = srcu_read_lock(&shrinker_srcu);
+		info = shrinker_info_srcu(memcg, nid);
 		/* Pairs with smp mb in shrink_slab() */
 		smp_mb__before_atomic();
 		set_bit(shrinker_id, info->map);
-		rcu_read_unlock();
+		srcu_read_unlock(&shrinker_srcu, srcu_idx);
 	}
 }
 
@@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 		return -ENOSYS;
 
 	down_write(&shrinker_rwsem);
-	/* This may call shrinker, so it must use down_read_trylock() */
 	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
 	if (id < 0)
 		goto unlock;
@@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 {
 	struct shrinker_info *info;
 
-	info = shrinker_info_protected(memcg, nid);
+	info = shrinker_info_srcu(memcg, nid);
 	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
 }
 
@@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 {
 	struct shrinker_info *info;
 
-	info = shrinker_info_protected(memcg, nid);
+	info = shrinker_info_srcu(memcg, nid);
 	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
 }
 
 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
 {
-	int i, nid;
+	int i, nid, srcu_idx;
 	long nr;
 	struct mem_cgroup *parent;
 	struct shrinker_info *child_info, *parent_info;
@@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
 		parent = root_mem_cgroup;
 
 	/* Prevent from concurrent shrinker_info expand */
-	down_read(&shrinker_rwsem);
+	srcu_idx = srcu_read_lock(&shrinker_srcu);
 	for_each_node(nid) {
-		child_info = shrinker_info_protected(memcg, nid);
-		parent_info = shrinker_info_protected(parent, nid);
+		child_info = shrinker_info_srcu(memcg, nid);
+		parent_info = shrinker_info_srcu(parent, nid);
 		for (i = 0; i < shrinker_nr_max; i++) {
 			nr = atomic_long_read(&child_info->nr_deferred[i]);
 			atomic_long_add(nr, &parent_info->nr_deferred[i]);
 		}
 	}
-	up_read(&shrinker_rwsem);
+	srcu_read_unlock(&shrinker_srcu, srcu_idx);
 }
 
 static bool cgroup_reclaim(struct scan_control *sc)
@@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
+	int srcu_idx;
 	int i;
 
 	if (!mem_cgroup_online(memcg))
 		return 0;
 
-	if (!down_read_trylock(&shrinker_rwsem))
-		return 0;
-
-	info = shrinker_info_protected(memcg, nid);
+	srcu_idx = srcu_read_lock(&shrinker_srcu);
+	info = shrinker_info_srcu(memcg, nid);
 	if (unlikely(!info))
 		goto unlock;
 
@@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			set_shrinker_bit(memcg, nid, i);
 		}
 		freed += ret;
-
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
-		}
 	}
 unlock:
-	up_read(&shrinker_rwsem);
+	srcu_read_unlock(&shrinker_srcu, srcu_idx);
 	return freed;
 }
 #else /* CONFIG_MEMCG */
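For context on the API the patch switches to, below is a minimal sketch of the SRCU pattern it relies on. The DEFINE_SRCU(shrinker_srcu) declaration is not part of this patch (it comes from an earlier patch in the series), so treat the declarations here as assumptions used only for illustration.

```c
#include <linux/memcontrol.h>
#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_SRCU(shrinker_srcu);

/* Read side: sleepable, and never blocks updaters the way a rwsem reader does. */
static void reader(struct mem_cgroup *memcg, int nid)
{
	struct shrinker_info *info;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&shrinker_srcu);
	info = srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
				&shrinker_srcu);
	/* ... use info; it cannot be freed until srcu_read_unlock() ... */
	srcu_read_unlock(&shrinker_srcu, srcu_idx);
}

/* Deferred-free callback, as added by the patch. */
static void free_shrinker_info_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct shrinker_info, rcu));
}

/* Update side (as in expand_one_shrinker_info()): publish new, free old after a grace period. */
static void updater(struct mem_cgroup_per_node *pn,
		    struct shrinker_info *old, struct shrinker_info *new)
{
	rcu_assign_pointer(pn->shrinker_info, new);
	call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}
```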
Like the global slab shrink path, and since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU") made SRCU unconditionally available, it's time to use SRCU to protect readers who previously held shrinker_rwsem.

We can test with the following script:

```
DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir /sys/fs/cgroup/memory/test
    echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

do_create 2000
do_mount 0 2000
do_touch 0 1000
```

Before applying:

  46.60%  [kernel]  [k] down_read_trylock
  18.70%  [kernel]  [k] up_read
  15.44%  [kernel]  [k] shrink_slab
   4.37%  [kernel]  [k] _find_next_bit
   2.75%  [kernel]  [k] xa_load
   2.07%  [kernel]  [k] idr_find
   1.73%  [kernel]  [k] do_shrink_slab
   1.42%  [kernel]  [k] shrink_lruvec
   0.74%  [kernel]  [k] shrink_node
   0.60%  [kernel]  [k] list_lru_count_one

After applying:

  19.53%  [kernel]  [k] _find_next_bit
  14.63%  [kernel]  [k] do_shrink_slab
  14.58%  [kernel]  [k] shrink_slab
  11.83%  [kernel]  [k] shrink_lruvec
   9.33%  [kernel]  [k] __blk_flush_plug
   6.67%  [kernel]  [k] mem_cgroup_iter
   3.73%  [kernel]  [k] list_lru_count_one
   2.43%  [kernel]  [k] shrink_node
   1.96%  [kernel]  [k] super_cache_count
   1.78%  [kernel]  [k] __rcu_read_unlock
   1.38%  [kernel]  [k] __srcu_read_lock
   1.30%  [kernel]  [k] xas_descend

We can see that the readers are no longer blocked.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 24 deletions(-)