Message ID | 20230307065605.58209-4-zhengqi.arch@bytedance.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | make slab shrink lockless | expand |
On 07.03.2023 09:56, Qi Zheng wrote: > Like global slab shrink, this commit also uses SRCU to make > memcg slab shrink lockless. > > We can reproduce the down_read_trylock() hotspot through the > following script: > > ``` > > DIR="/root/shrinker/memcg/mnt" > > do_create() > { > mkdir -p /sys/fs/cgroup/memory/test > mkdir -p /sys/fs/cgroup/perf_event/test > echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes > for i in `seq 0 $1`; > do > mkdir -p /sys/fs/cgroup/memory/test/$i; > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; > mkdir -p $DIR/$i; > done > } > > do_mount() > { > for i in `seq $1 $2`; > do > mount -t tmpfs $i $DIR/$i; > done > } > > do_touch() > { > for i in `seq $1 $2`; > do > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; > dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & > done > } > > case "$1" in > touch) > do_touch $2 $3 > ;; > test) > do_create 4000 > do_mount 0 4000 > do_touch 0 3000 > ;; > *) > exit 1 > ;; > esac > ``` > > Save the above script, then run test and touch commands. > Then we can use the following perf command to view hotspots: > > perf top -U -F 999 > > 1) Before applying this patchset: > > 32.31% [kernel] [k] down_read_trylock > 19.40% [kernel] [k] pv_native_safe_halt > 16.24% [kernel] [k] up_read > 15.70% [kernel] [k] shrink_slab > 4.69% [kernel] [k] _find_next_bit > 2.62% [kernel] [k] shrink_node > 1.78% [kernel] [k] shrink_lruvec > 0.76% [kernel] [k] do_shrink_slab > > 2) After applying this patchset: > > 27.83% [kernel] [k] _find_next_bit > 16.97% [kernel] [k] shrink_slab > 15.82% [kernel] [k] pv_native_safe_halt > 9.58% [kernel] [k] shrink_node > 8.31% [kernel] [k] shrink_lruvec > 5.64% [kernel] [k] do_shrink_slab > 3.88% [kernel] [k] mem_cgroup_iter > > At the same time, we use the following perf command to capture > IPC information: > > perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10 > > 1) Before applying this patchset: > > Performance counter stats for 'system wide' (5 runs): > > 454187219766 cycles test ( +- 1.84% ) > 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% ) > > 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% ) > > 2) After applying this patchset: > > Performance counter stats for 'system wide' (5 runs): > > 841954709443 cycles test ( +- 15.80% ) (98.69%) > 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%) > > 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% ) > > We can see that IPC drops very seriously when calling > down_read_trylock() at high frequency. After using SRCU, > the IPC is at a normal level. > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Acked-by: Kirill Tkhai <tkhai@ya.ru> > --- > mm/vmscan.c | 46 +++++++++++++++++++++++++++------------------- > 1 file changed, 27 insertions(+), 19 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 8515ac40bcaf..1de9bc3e5aa2 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -57,6 +57,7 @@ > #include <linux/khugepaged.h> > #include <linux/rculist_nulls.h> > #include <linux/random.h> > +#include <linux/srcu.h> > > #include <asm/tlbflush.h> > #include <asm/div64.h> > @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items) > static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, > int nid) > { > - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, > - lockdep_is_held(&shrinker_rwsem)); > + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu, > + lockdep_is_held(&shrinker_rwsem)); > +} > + > +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, > + int nid) > +{ > + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu); > +} > + > +static void free_shrinker_info_rcu(struct rcu_head *head) > +{ > + kvfree(container_of(head, struct shrinker_info, rcu)); > } > > static inline bool need_expand(int new_nr_max, int old_nr_max) > @@ -269,7 +283,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, > defer_size - old_defer_size); > > rcu_assign_pointer(pn->shrinker_info, new); > - kvfree_rcu(old, rcu); > + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); > } > > return 0; > @@ -355,15 +369,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) > { > if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { > struct shrinker_info *info; > + int srcu_idx; > > - rcu_read_lock(); > - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { > /* Pairs with smp mb in shrink_slab() */ > smp_mb__before_atomic(); > set_bit(shrinker_id, info->map); > } > - rcu_read_unlock(); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > } > > @@ -377,7 +392,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) > return -ENOSYS; > > down_write(&shrinker_rwsem); > - /* This may call shrinker, so it must use down_read_trylock() */ > id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); > if (id < 0) > goto unlock; > @@ -411,7 +425,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); > } > > @@ -420,7 +434,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); > } > > @@ -898,15 +912,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > { > struct shrinker_info *info; > unsigned long ret, freed = 0; > + int srcu_idx; > int i; > > if (!mem_cgroup_online(memcg)) > return 0; > > - if (!down_read_trylock(&shrinker_rwsem)) > - return 0; > - > - info = shrinker_info_protected(memcg, nid); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > if (unlikely(!info)) > goto unlock; > > @@ -956,14 +969,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > set_shrinker_bit(memcg, nid, i); > } > freed += ret; > - > - if (rwsem_is_contended(&shrinker_rwsem)) { > - freed = freed ? : 1; > - break; > - } > } > unlock: > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > return freed; > } > #else /* CONFIG_MEMCG */
On 3/7/23 07:56, Qi Zheng wrote: > Like global slab shrink, this commit also uses SRCU to make > memcg slab shrink lockless. > > We can reproduce the down_read_trylock() hotspot through the > following script: > > ``` > > DIR="/root/shrinker/memcg/mnt" > > do_create() > { > mkdir -p /sys/fs/cgroup/memory/test > mkdir -p /sys/fs/cgroup/perf_event/test > echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes > for i in `seq 0 $1`; > do > mkdir -p /sys/fs/cgroup/memory/test/$i; > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; > mkdir -p $DIR/$i; > done > } > > do_mount() > { > for i in `seq $1 $2`; > do > mount -t tmpfs $i $DIR/$i; > done > } > > do_touch() > { > for i in `seq $1 $2`; > do > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; > dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & > done > } > > case "$1" in > touch) > do_touch $2 $3 > ;; > test) > do_create 4000 > do_mount 0 4000 > do_touch 0 3000 > ;; > *) > exit 1 > ;; > esac > ``` > > Save the above script, then run test and touch commands. > Then we can use the following perf command to view hotspots: > > perf top -U -F 999 > > 1) Before applying this patchset: > > 32.31% [kernel] [k] down_read_trylock > 19.40% [kernel] [k] pv_native_safe_halt > 16.24% [kernel] [k] up_read > 15.70% [kernel] [k] shrink_slab > 4.69% [kernel] [k] _find_next_bit > 2.62% [kernel] [k] shrink_node > 1.78% [kernel] [k] shrink_lruvec > 0.76% [kernel] [k] do_shrink_slab > > 2) After applying this patchset: > > 27.83% [kernel] [k] _find_next_bit > 16.97% [kernel] [k] shrink_slab > 15.82% [kernel] [k] pv_native_safe_halt > 9.58% [kernel] [k] shrink_node > 8.31% [kernel] [k] shrink_lruvec > 5.64% [kernel] [k] do_shrink_slab > 3.88% [kernel] [k] mem_cgroup_iter > > At the same time, we use the following perf command to capture > IPC information: > > perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10 > > 1) Before applying this patchset: > > Performance counter stats for 'system wide' (5 runs): > > 454187219766 cycles test ( +- 1.84% ) > 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% ) > > 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% ) > > 2) After applying this patchset: > > Performance counter stats for 'system wide' (5 runs): > > 841954709443 cycles test ( +- 15.80% ) (98.69%) > 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%) > > 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% ) > > We can see that IPC drops very seriously when calling > down_read_trylock() at high frequency. After using SRCU, > the IPC is at a normal level. The interpretation looks somewhat weird to me. I'd say the workload is stalled a lot as it fails the trylock (there might be some optimistic spinning perhaps) and then goes to sleep. See how "pv_native_safe_halt" is also more prominent in before. And because of that sleeping, there's less instructions executed in the same amount of cycles (as it's a system wide collection, otherwise it wouldn't be collecting the sleeping processes). > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> Other than that: Acked-by: Vlastimil Babka <Vbabka@suse.cz> A small thing below: > --- > mm/vmscan.c | 46 +++++++++++++++++++++++++++------------------- > 1 file changed, 27 insertions(+), 19 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 8515ac40bcaf..1de9bc3e5aa2 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -57,6 +57,7 @@ > #include <linux/khugepaged.h> > #include <linux/rculist_nulls.h> > #include <linux/random.h> > +#include <linux/srcu.h> I guess this should have been in patch 2/8 already? It may work accidentaly because some other header pulls it transitively...
Hi Vlastimil, On 2023/3/9 06:46, Vlastimil Babka wrote: > On 3/7/23 07:56, Qi Zheng wrote: >> Like global slab shrink, this commit also uses SRCU to make >> memcg slab shrink lockless. >> >> We can reproduce the down_read_trylock() hotspot through the >> following script: >> >> ``` >> >> DIR="/root/shrinker/memcg/mnt" >> >> do_create() >> { >> mkdir -p /sys/fs/cgroup/memory/test >> mkdir -p /sys/fs/cgroup/perf_event/test >> echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes >> for i in `seq 0 $1`; >> do >> mkdir -p /sys/fs/cgroup/memory/test/$i; >> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; >> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; >> mkdir -p $DIR/$i; >> done >> } >> >> do_mount() >> { >> for i in `seq $1 $2`; >> do >> mount -t tmpfs $i $DIR/$i; >> done >> } >> >> do_touch() >> { >> for i in `seq $1 $2`; >> do >> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; >> echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; >> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & >> done >> } >> >> case "$1" in >> touch) >> do_touch $2 $3 >> ;; >> test) >> do_create 4000 >> do_mount 0 4000 >> do_touch 0 3000 >> ;; >> *) >> exit 1 >> ;; >> esac >> ``` >> >> Save the above script, then run test and touch commands. >> Then we can use the following perf command to view hotspots: >> >> perf top -U -F 999 >> >> 1) Before applying this patchset: >> >> 32.31% [kernel] [k] down_read_trylock >> 19.40% [kernel] [k] pv_native_safe_halt >> 16.24% [kernel] [k] up_read >> 15.70% [kernel] [k] shrink_slab >> 4.69% [kernel] [k] _find_next_bit >> 2.62% [kernel] [k] shrink_node >> 1.78% [kernel] [k] shrink_lruvec >> 0.76% [kernel] [k] do_shrink_slab >> >> 2) After applying this patchset: >> >> 27.83% [kernel] [k] _find_next_bit >> 16.97% [kernel] [k] shrink_slab >> 15.82% [kernel] [k] pv_native_safe_halt >> 9.58% [kernel] [k] shrink_node >> 8.31% [kernel] [k] shrink_lruvec >> 5.64% [kernel] [k] do_shrink_slab >> 3.88% [kernel] [k] mem_cgroup_iter >> >> At the same time, we use the following perf command to capture >> IPC information: >> >> perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10 >> >> 1) Before applying this patchset: >> >> Performance counter stats for 'system wide' (5 runs): >> >> 454187219766 cycles test ( +- 1.84% ) >> 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% ) >> >> 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% ) >> >> 2) After applying this patchset: >> >> Performance counter stats for 'system wide' (5 runs): >> >> 841954709443 cycles test ( +- 15.80% ) (98.69%) >> 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%) >> >> 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% ) >> >> We can see that IPC drops very seriously when calling >> down_read_trylock() at high frequency. After using SRCU, >> the IPC is at a normal level. > > The interpretation looks somewhat weird to me. I'd say the workload is > stalled a lot as it fails the trylock (there might be some optimistic > spinning perhaps) and then goes to sleep. See how "pv_native_safe_halt" is > also more prominent in before. And because of that sleeping, there's less > instructions executed in the same amount of cycles (as it's a system wide > collection, otherwise it wouldn't be collecting the sleeping processes). But in my tests, the trylock basically did not fail, so I think it is caused by high-frequency atomic operation. bpftrace -e 'kr:down_read_trylock {@[kstack, retval]=count();} interval:s:1 {exit();}' Attaching 2 probes... <...> @[ shrink_slab+288 shrink_node+640 do_try_to_free_pages+203 try_to_free_mem_cgroup_pages+266 try_charge_memcg+412 charge_memcg+51 __mem_cgroup_charge+44 __handle_mm_fault+2119 handle_mm_fault+272 do_user_addr_fault+712 exc_page_fault+124 asm_exc_page_fault+38 clear_user_erms+14 read_zero+86 vfs_read+173 ksys_read+93 do_syscall_64+56 entry_SYSCALL_64_after_hwframe+99 , 1]: 617019 @[ shrink_slab+288 shrink_node+640 do_try_to_free_pages+203 try_to_free_mem_cgroup_pages+266 try_charge_memcg+412 charge_memcg+51 __mem_cgroup_charge+44 shmem_add_to_page_cache+545 shmem_get_folio_gfp+621 shmem_write_begin+95 generic_perform_write+257 __generic_file_write_iter+202 generic_file_write_iter+97 vfs_write+704 ksys_write+93 do_syscall_64+56 entry_SYSCALL_64_after_hwframe+99 , 1]: 617065 > >> >> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> > > Other than that: > > Acked-by: Vlastimil Babka <Vbabka@suse.cz> Thanks. > > A small thing below: > >> --- >> mm/vmscan.c | 46 +++++++++++++++++++++++++++------------------- >> 1 file changed, 27 insertions(+), 19 deletions(-) >> >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index 8515ac40bcaf..1de9bc3e5aa2 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -57,6 +57,7 @@ >> #include <linux/khugepaged.h> >> #include <linux/rculist_nulls.h> >> #include <linux/random.h> >> +#include <linux/srcu.h> > > I guess this should have been in patch 2/8 already? It may work accidentaly > because some other header pulls it transitively... Yeah, in fact, patch 3/8 also can compile successfully without srcu.h, but maybe it is better to explicitly include this header file, I will add it in patch 2/8. Thanks, Qi > >
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8515ac40bcaf..1de9bc3e5aa2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,7 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> +#include <linux/srcu.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, + &shrinker_srcu, + lockdep_is_held(&shrinker_rwsem)); +} + +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, + int nid) +{ + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, + &shrinker_srcu); +} + +static void free_shrinker_info_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct shrinker_info, rcu)); } static inline bool need_expand(int new_nr_max, int old_nr_max) @@ -269,7 +283,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - kvfree_rcu(old, rcu); + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); } return 0; @@ -355,15 +369,16 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; + int srcu_idx; - rcu_read_lock(); - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + srcu_idx = srcu_read_lock(&shrinker_srcu); + info = shrinker_info_srcu(memcg, nid); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - rcu_read_unlock(); + srcu_read_unlock(&shrinker_srcu, srcu_idx); } } @@ -377,7 +392,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -411,7 +425,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_protected(memcg, nid); + info = shrinker_info_srcu(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -420,7 +434,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_protected(memcg, nid); + info = shrinker_info_srcu(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -898,15 +912,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; + int srcu_idx; int i; if (!mem_cgroup_online(memcg)) return 0; - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); + srcu_idx = srcu_read_lock(&shrinker_srcu); + info = shrinker_info_srcu(memcg, nid); if (unlikely(!info)) goto unlock; @@ -956,14 +969,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, i); } freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } } unlock: - up_read(&shrinker_rwsem); + srcu_read_unlock(&shrinker_srcu, srcu_idx); return freed; } #else /* CONFIG_MEMCG */
Like global slab shrink, this commit also uses SRCU to make memcg slab shrink lockless. We can reproduce the down_read_trylock() hotspot through the following script: ``` DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test mkdir -p /sys/fs/cgroup/perf_event/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 32.31% [kernel] [k] down_read_trylock 19.40% [kernel] [k] pv_native_safe_halt 16.24% [kernel] [k] up_read 15.70% [kernel] [k] shrink_slab 4.69% [kernel] [k] _find_next_bit 2.62% [kernel] [k] shrink_node 1.78% [kernel] [k] shrink_lruvec 0.76% [kernel] [k] do_shrink_slab 2) After applying this patchset: 27.83% [kernel] [k] _find_next_bit 16.97% [kernel] [k] shrink_slab 15.82% [kernel] [k] pv_native_safe_halt 9.58% [kernel] [k] shrink_node 8.31% [kernel] [k] shrink_lruvec 5.64% [kernel] [k] do_shrink_slab 3.88% [kernel] [k] mem_cgroup_iter At the same time, we use the following perf command to capture IPC information: perf stat -e cycles,instructions -G test -a --repeat 5 -- sleep 10 1) Before applying this patchset: Performance counter stats for 'system wide' (5 runs): 454187219766 cycles test ( +- 1.84% ) 78896433101 instructions test # 0.17 insn per cycle ( +- 0.44% ) 10.0020430 +- 0.0000366 seconds time elapsed ( +- 0.00% ) 2) After applying this patchset: Performance counter stats for 'system wide' (5 runs): 841954709443 cycles test ( +- 15.80% ) (98.69%) 527258677936 instructions test # 0.63 insn per cycle ( +- 15.11% ) (98.68%) 10.01064 +- 0.00831 seconds time elapsed ( +- 0.08% ) We can see that IPC drops very seriously when calling down_read_trylock() at high frequency. After using SRCU, the IPC is at a normal level. Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com> --- mm/vmscan.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-)