| Message ID | 20240925171020.32142-6-ryncsn@gmail.com (mailing list archive) |
|---|---|
| State | New |
| Series | Split list_lru lock into per-cgroup scope |
On 25/09/2024 18:10, Kairui Song wrote: > From: Kairui Song <kasong@tencent.com> > > Currently, every list_lru has a per-node lock that protects adding, > deletion, isolation, and reparenting of all list_lru_one instances > belonging to this list_lru on this node. This lock contention is > heavy when multiple cgroups modify the same list_lru. > > This lock can be split into per-cgroup scope to reduce contention. > > To achieve this, we need a stable list_lru_one for every cgroup. > This commit adds a lock to each list_lru_one and introduces a > helper function lock_list_lru_of_memcg, making it possible to pin > the list_lru of a memcg. The reparenting process is then reworked. > > Reparenting will switch the list_lru_one instances one by one. > By locking each instance and marking it dead using the nr_items > counter, reparenting ensures that all items in the corresponding > cgroup (on-list or not, because items have a stable cgroup, see below) > will see the list_lru_one switch synchronously. > > Objcg reparent is also moved after list_lru reparent so items will have > a stable mem cgroup until all list_lru_one instances are drained. > > The only callers that don't go through the *_obj interfaces are direct > calls to list_lru_{add,del}, but they are only used by zswap, which > is also based on objcg, so it's fine. > > This also changes the behaviour of the isolation function when > LRU_RETRY or LRU_REMOVED_RETRY is returned: because releasing the > lock can now unblock reparenting and free the list_lru_one, the isolation > function has to return without re-locking the lru. > > prepare() { > mkdir /tmp/test-fs > modprobe brd rd_nr=1 rd_size=33554432 > mkfs.xfs -f /dev/ram0 > mount -t xfs /dev/ram0 /tmp/test-fs > for i in $(seq 1 512); do > mkdir "/tmp/test-fs/$i" > for j in $(seq 1 10240); do > echo TEST-CONTENT > "/tmp/test-fs/$i/$j" > done & > done; wait > } > > do_test() { > read_worker() { > sleep 1 > tar -cv "$1" &>/dev/null > } > read_in_all() { > cd "/tmp/test-fs" && ls > for i in $(seq 1 512); do > (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs" > read_worker "$i" & > done; wait > } > for i in $(seq 1 512); do > mkdir -p "/sys/fs/cgroup/benchmark/$i" > done > echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control > echo 512M > /sys/fs/cgroup/benchmark/memory.max > echo 3 > /proc/sys/vm/drop_caches > time read_in_all > } > > The above script simulates compression of small files in multiple cgroups > with memory pressure. 
Run prepare() then do_test for 6 times: > > Before: > real 0m7.762s user 0m11.340s sys 3m11.224s > real 0m8.123s user 0m11.548s sys 3m2.549s > real 0m7.736s user 0m11.515s sys 3m11.171s > real 0m8.539s user 0m11.508s sys 3m7.618s > real 0m7.928s user 0m11.349s sys 3m13.063s > real 0m8.105s user 0m11.128s sys 3m14.313s > > After this commit (about ~15% faster): > real 0m6.953s user 0m11.327s sys 2m42.912s > real 0m7.453s user 0m11.343s sys 2m51.942s > real 0m6.916s user 0m11.269s sys 2m43.957s > real 0m6.894s user 0m11.528s sys 2m45.346s > real 0m6.911s user 0m11.095s sys 2m43.168s > real 0m6.773s user 0m11.518s sys 2m40.774s > > Signed-off-by: Kairui Song <kasong@tencent.com> > --- > drivers/android/binder_alloc.c | 1 - > fs/inode.c | 1 - > fs/xfs/xfs_qm.c | 1 - > include/linux/list_lru.h | 6 +- > mm/list_lru.c | 215 +++++++++++++++++++-------------- > mm/memcontrol.c | 7 +- > mm/workingset.c | 1 - > mm/zswap.c | 5 +- > 8 files changed, 134 insertions(+), 103 deletions(-) > > diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c > index b3acbc4174fb..86bbe40f4bcd 100644 > --- a/drivers/android/binder_alloc.c > +++ b/drivers/android/binder_alloc.c > @@ -1106,7 +1106,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item, > mmput_async(mm); > __free_page(page_to_free); > > - spin_lock(lock); > return LRU_REMOVED_RETRY; > > err_invalid_vma: > diff --git a/fs/inode.c b/fs/inode.c > index 471ae4a31549..b95d80cb140b 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -932,7 +932,6 @@ static enum lru_status inode_lru_isolate(struct list_head *item, > mm_account_reclaimed_pages(reap); > } > inode_unpin_lru_isolating(inode); > - spin_lock(lru_lock); > return LRU_RETRY; > } > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > index 7e2307921deb..665d26990b78 100644 > --- a/fs/xfs/xfs_qm.c > +++ b/fs/xfs/xfs_qm.c > @@ -496,7 +496,6 @@ xfs_qm_dquot_isolate( > trace_xfs_dqreclaim_busy(dqp); > XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); > xfs_dqunlock(dqp); > - spin_lock(lru_lock); > return LRU_RETRY; > } > > diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h > index eba93f6511f3..10ba9a54d42c 100644 > --- a/include/linux/list_lru.h > +++ b/include/linux/list_lru.h > @@ -32,6 +32,8 @@ struct list_lru_one { > struct list_head list; > /* may become negative during memcg reparenting */ > long nr_items; > + /* protects all fields above */ > + spinlock_t lock; > }; > > struct list_lru_memcg { > @@ -41,11 +43,9 @@ struct list_lru_memcg { > }; > > struct list_lru_node { > - /* protects all lists on the node, including per cgroup */ > - spinlock_t lock; > /* global list, used for the root cgroup in cgroup aware lrus */ > struct list_lru_one lru; > - long nr_items; > + atomic_long_t nr_items; > } ____cacheline_aligned_in_smp; > > struct list_lru { > diff --git a/mm/list_lru.c b/mm/list_lru.c > index 172b16146e15..6d1c187d9019 100644 > --- a/mm/list_lru.c > +++ b/mm/list_lru.c > @@ -61,18 +61,48 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) > } > > static inline struct list_lru_one * > -list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) > +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, > + bool irq, bool skip_empty) > { > struct list_lru_one *l; > + rcu_read_lock(); > again: > l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > - if (likely(l)) > - return l; > - > - memcg = parent_mem_cgroup(memcg); > + if (likely(l)) { > + if (irq) > + spin_lock_irq(&l->lock); > + 
else > + spin_lock(&l->lock); > + if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) { > + WARN_ON(l->nr_items < 0); > + rcu_read_unlock(); > + return l; > + } > + if (irq) > + spin_unlock_irq(&l->lock); > + else > + spin_unlock(&l->lock); > + } > + /* > + * Caller may simply bail out if raced with reparenting or > + * may iterate through the list_lru and expect empty slots. > + */ > + if (skip_empty) { > + rcu_read_unlock(); > + return NULL; > + } > VM_WARN_ON(!css_is_dying(&memcg->css)); > + memcg = parent_mem_cgroup(memcg); > goto again; > } > + > +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) > +{ > + if (irq_off) > + spin_unlock_irq(&l->lock); > + else > + spin_unlock(&l->lock); > +} > #else > static void list_lru_register(struct list_lru *lru) > { > @@ -99,31 +129,48 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) > } > > static inline struct list_lru_one * > -list_lru_from_memcg(struct list_lru *lru, int nid, int idx) > +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, > + bool irq, bool skip_empty) > { > - return &lru->node[nid].lru; > + struct list_lru_one *l = &lru->node[nid].lru; > + > + if (irq) > + spin_lock_irq(&l->lock); > + else > + spin_lock(&l->lock); > + > + return l; > +} > + > +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) > +{ > + if (irq_off) > + spin_unlock_irq(&l->lock); > + else > + spin_unlock(&l->lock); > } > #endif /* CONFIG_MEMCG */ > > /* The caller must ensure the memcg lifetime. */ > bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, > - struct mem_cgroup *memcg) > + struct mem_cgroup *memcg) > { > struct list_lru_node *nlru = &lru->node[nid]; > struct list_lru_one *l; > > - spin_lock(&nlru->lock); > + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); > + if (!l) > + return false; > if (list_empty(item)) { > - l = list_lru_from_memcg(lru, nid, memcg); > list_add_tail(item, &l->list); > /* Set shrinker bit if the first element was added */ > if (!l->nr_items++) > set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); > - nlru->nr_items++; > - spin_unlock(&nlru->lock); > + unlock_list_lru(l, false); > + atomic_long_inc(&nlru->nr_items); > return true; > } > - spin_unlock(&nlru->lock); > + unlock_list_lru(l, false); > return false; > } > > @@ -146,24 +193,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj); > > /* The caller must ensure the memcg lifetime. 
*/ > bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, > - struct mem_cgroup *memcg) > + struct mem_cgroup *memcg) > { > struct list_lru_node *nlru = &lru->node[nid]; > struct list_lru_one *l; > - > - spin_lock(&nlru->lock); > + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); > + if (!l) > + return false; > if (!list_empty(item)) { > - l = list_lru_from_memcg(lru, nid, memcg); > list_del_init(item); > l->nr_items--; > - nlru->nr_items--; > - spin_unlock(&nlru->lock); > + unlock_list_lru(l, false); > + atomic_long_dec(&nlru->nr_items); > return true; > } > - spin_unlock(&nlru->lock); > + unlock_list_lru(l, false); > return false; > } > -EXPORT_SYMBOL_GPL(list_lru_del); > > bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) > { > @@ -220,25 +266,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) > struct list_lru_node *nlru; > > nlru = &lru->node[nid]; > - return nlru->nr_items; > + return atomic_long_read(&nlru->nr_items); > } > EXPORT_SYMBOL_GPL(list_lru_count_node); > > static unsigned long > -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, > +__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, > list_lru_walk_cb isolate, void *cb_arg, > - unsigned long *nr_to_walk) > + unsigned long *nr_to_walk, bool irq_off) > { > struct list_lru_node *nlru = &lru->node[nid]; > - struct list_lru_one *l; > + struct list_lru_one *l = NULL; > struct list_head *item, *n; > unsigned long isolated = 0; > > restart: > - l = list_lru_from_memcg_idx(lru, nid, memcg_idx); > + l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); > if (!l) > - goto out; > - > + return isolated; > list_for_each_safe(item, n, &l->list) { > enum lru_status ret; > > @@ -250,19 +295,20 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, > break; > --*nr_to_walk; > > - ret = isolate(item, l, &nlru->lock, cb_arg); > + ret = isolate(item, l, &l->lock, cb_arg); > switch (ret) { > + /* > + * LRU_RETRY and LRU_REMOVED_RETRY will drop the lru lock, > + * the list traversal will be invalid and have to restart from > + * scratch. > + */ > + case LRU_RETRY: > + goto restart; > case LRU_REMOVED_RETRY: > - assert_spin_locked(&nlru->lock); > fallthrough; > case LRU_REMOVED: > isolated++; > - nlru->nr_items--; > - /* > - * If the lru lock has been dropped, our list > - * traversal is now invalid and so we have to > - * restart from scratch. > - */ > + atomic_long_dec(&nlru->nr_items); > if (ret == LRU_REMOVED_RETRY) > goto restart; > break; > @@ -271,21 +317,15 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, > break; > case LRU_SKIP: > break; > - case LRU_RETRY: > - /* > - * The lru lock has been dropped, our list traversal is > - * now invalid and so we have to restart from scratch. 
> - */ > - assert_spin_locked(&nlru->lock); > - goto restart; > case LRU_STOP: > - assert_spin_locked(&nlru->lock); > + assert_spin_locked(&l->lock); > goto out; > default: > BUG(); > } > } > out: > + unlock_list_lru(l, irq_off); > return isolated; > } > > @@ -294,14 +334,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, > list_lru_walk_cb isolate, void *cb_arg, > unsigned long *nr_to_walk) > { > - struct list_lru_node *nlru = &lru->node[nid]; > - unsigned long ret; > - > - spin_lock(&nlru->lock); > - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, > - cb_arg, nr_to_walk); > - spin_unlock(&nlru->lock); > - return ret; > + return __list_lru_walk_one(lru, nid, memcg, isolate, > + cb_arg, nr_to_walk, false); > } > EXPORT_SYMBOL_GPL(list_lru_walk_one); > > @@ -310,14 +344,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, > list_lru_walk_cb isolate, void *cb_arg, > unsigned long *nr_to_walk) > { > - struct list_lru_node *nlru = &lru->node[nid]; > - unsigned long ret; > - > - spin_lock_irq(&nlru->lock); > - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, > - cb_arg, nr_to_walk); > - spin_unlock_irq(&nlru->lock); > - return ret; > + return __list_lru_walk_one(lru, nid, memcg, isolate, > + cb_arg, nr_to_walk, true); > } > > unsigned long list_lru_walk_node(struct list_lru *lru, int nid, > @@ -332,16 +360,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, > #ifdef CONFIG_MEMCG > if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { > struct list_lru_memcg *mlru; > + struct mem_cgroup *memcg; > unsigned long index; > > xa_for_each(&lru->xa, index, mlru) { > - struct list_lru_node *nlru = &lru->node[nid]; > - > - spin_lock(&nlru->lock); > - isolated += __list_lru_walk_one(lru, nid, index, > + rcu_read_lock(); > + memcg = mem_cgroup_from_id(index); > + if (!mem_cgroup_tryget(memcg)) { > + rcu_read_unlock(); > + continue; > + } > + rcu_read_unlock(); > + isolated += __list_lru_walk_one(lru, nid, memcg, > isolate, cb_arg, > - nr_to_walk); > - spin_unlock(&nlru->lock); > + nr_to_walk, false); > + mem_cgroup_put(memcg); > > if (*nr_to_walk <= 0) > break; > @@ -353,14 +386,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, > } > EXPORT_SYMBOL_GPL(list_lru_walk_node); > > -static void init_one_lru(struct list_lru_one *l) > +static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) > { > INIT_LIST_HEAD(&l->list); > + spin_lock_init(&l->lock); > l->nr_items = 0; > +#ifdef CONFIG_LOCKDEP > + if (lru->key) > + lockdep_set_class(&l->lock, lru->key); > +#endif > } > > #ifdef CONFIG_MEMCG > -static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) > +static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) > { > int nid; > struct list_lru_memcg *mlru; > @@ -370,7 +408,7 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) > return NULL; > > for_each_node(nid) > - init_one_lru(&mlru->node[nid]); > + init_one_lru(lru, &mlru->node[nid]); > > return mlru; > } > @@ -398,28 +436,27 @@ static void memcg_destroy_list_lru(struct list_lru *lru) > xas_unlock_irq(&xas); > } > > -static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, > - struct list_lru_one *src, > - struct mem_cgroup *dst_memcg) > +static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, > + struct list_lru_one *src, > + struct mem_cgroup *dst_memcg) > { > - struct list_lru_node *nlru = &lru->node[nid]; > + int dst_idx = 
dst_memcg->kmemcg_id; > struct list_lru_one *dst; > > - /* > - * Since list_lru_{add,del} may be called under an IRQ-safe lock, > - * we have to use IRQ-safe primitives here to avoid deadlock. > - */ > - spin_lock_irq(&nlru->lock); > - dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); > + spin_lock_irq(&src->lock); > + dst = list_lru_from_memcg_idx(lru, nid, dst_idx); > + spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); > > list_splice_init(&src->list, &dst->list); > - > if (src->nr_items) { > dst->nr_items += src->nr_items; > set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); > - src->nr_items = 0; > } > - spin_unlock_irq(&nlru->lock); > + /* Mark the list_lru_one dead */ > + src->nr_items = LONG_MIN; > + > + spin_unlock(&dst->lock); > + spin_unlock_irq(&src->lock); > } > > void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) > @@ -448,7 +485,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren > * safe to reparent. > */ > for_each_node(i) > - memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); > + memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); > > /* > * Here all list_lrus corresponding to the cgroup are guaranteed > @@ -497,7 +534,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, > parent = parent_mem_cgroup(pos); > } > > - mlru = memcg_init_list_lru_one(gfp); > + mlru = memcg_init_list_lru_one(lru, gfp); > if (!mlru) > return -ENOMEM; > xas_set(&xas, pos->kmemcg_id); > @@ -544,14 +581,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr > if (!lru->node) > return -ENOMEM; > > - for_each_node(i) { > - spin_lock_init(&lru->node[i].lock); > -#ifdef CONFIG_LOCKDEP > - if (lru->key) > - lockdep_set_class(&lru->node[i].lock, lru->key); > -#endif > - init_one_lru(&lru->node[i].lru); > - } > + for_each_node(i) > + init_one_lru(lru, &lru->node[i].lru); > > memcg_init_list_lru(lru, memcg_aware); > list_lru_register(lru); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 8e90aa026c47..f421dfcfe8a1 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -3098,8 +3098,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) > if (!parent) > parent = root_mem_cgroup; > > - memcg_reparent_objcgs(memcg, parent); > memcg_reparent_list_lrus(memcg, parent); > + > + /* > + * Objcg's reparenting must be after list_lru's, make sure list_lru > + * helpers won't use parent's list_lru until child is drained. > + */ > + memcg_reparent_objcgs(memcg, parent); > } > > #ifdef CONFIG_CGROUP_WRITEBACK > diff --git a/mm/workingset.c b/mm/workingset.c > index df3937c5eedc..8c4b6738dcad 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -777,7 +777,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, > ret = LRU_REMOVED_RETRY; > out: > cond_resched(); > - spin_lock_irq(lru_lock); > return ret; > } > > diff --git a/mm/zswap.c b/mm/zswap.c > index 95a28301b7cd..cb37a25fd00c 100644 > --- a/mm/zswap.c > +++ b/mm/zswap.c > @@ -706,9 +706,9 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) > * Note that it is safe to use rcu_read_lock() here, even in the face of > * concurrent memcg offlining: > * > - * 1. list_lru_add() is called before list_lru_memcg is erased. The > + * 1. list_lru_add() is called before list_lru_one is dead. The > * new entry will be reparented to memcg's parent's list_lru. > - * 2. list_lru_add() is called after list_lru_memcg is erased. The > + * 2. 
list_lru_add() is called after list_lru_one is dead. The > * new entry will be added directly to memcg's parent's list_lru. > * > * Similar reasoning holds for list_lru_del(). > @@ -1173,7 +1173,6 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o > zswap_written_back_pages++; > } > > - spin_lock(lock); > return ret; > } > Hi Kairui, I was testing zswap writeback in mm-unstable, and I think this patch might be breaking things. I have added the panic below 130.051024] ------------[ cut here ]------------ [ 130.051489] kernel BUG at mm/list_lru.c:321! [ 130.051732] Oops: invalid opcode: 0000 [#1] SMP [ 130.052133] CPU: 1 UID: 0 PID: 4976 Comm: cc1 Not tainted 6.12.0-rc1-00084-g278bd01cdaf1 #276 [ 130.052595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.el9 04/01/2014 [ 130.053276] RIP: 0010:__list_lru_walk_one+0x1ae/0x1b0 [ 130.053983] Code: 7c 24 78 00 74 03 fb eb 00 48 89 d8 48 83 c4 40 5b 41 5c 41 5d 41 5e 41 5f 5d c3 41 c6 07 00 eb e8 41 c6 07 00 fb eb e1 0f 0b <0f> 0b 0f 1f 44 00 00 6a 01 e8 44 fe ff ff 48 83 c4 08 c3 66 2e 0f [ 130.055557] RSP: 0000:ffffc90004a2b9a0 EFLAGS: 00010246 [ 130.056084] RAX: ffff88805dedf6e8 RBX: 0000000000000071 RCX: 0000000000000005 [ 130.057407] RDX: 0000000000000000 RSI: 0000000000000022 RDI: ffff888008a26400 [ 130.057794] RBP: ffff88805dedf6d0 R08: 0000000000000402 R09: 0000000000000001 [ 130.058579] R10: ffffc90004a2b7e8 R11: 0000000000000000 R12: ffffffff81342930 [ 130.058962] R13: ffff888017532ca0 R14: ffffc90004a2bae8 R15: ffff8880175322c8 [ 130.059773] FS: 00007ff3f1e21f00(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000 [ 130.060242] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 130.060563] CR2: 00007f428e2e2ed8 CR3: 0000000067db6001 CR4: 0000000000770ef0 [ 130.060952] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 130.061658] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 130.062425] PKRU: 55555554 [ 130.062578] Call Trace: [ 130.062720] <TASK> [ 130.062941] ? __die_body+0x66/0xb0 [ 130.063145] ? die+0x88/0xb0 [ 130.063309] ? do_trap+0x9d/0x170 [ 130.063499] ? __list_lru_walk_one+0x1ae/0x1b0 [ 130.063745] ? __list_lru_walk_one+0x1ae/0x1b0 [ 130.063995] ? handle_invalid_op+0x65/0x80 [ 130.064223] ? __list_lru_walk_one+0x1ae/0x1b0 [ 130.064467] ? exc_invalid_op+0x2f/0x40 [ 130.064681] ? asm_exc_invalid_op+0x16/0x20 [ 130.064912] ? zswap_shrinker_count+0x1c0/0x1c0 [ 130.065172] ? 
__list_lru_walk_one+0x1ae/0x1b0 [ 130.065417] list_lru_walk_one+0xc/0x20 [ 130.065630] zswap_shrinker_scan+0x4b/0x80 [ 130.065856] do_shrink_slab+0x15f/0x2f0 [ 130.066075] shrink_slab+0x2bf/0x3d0 [ 130.066276] shrink_node+0x4f0/0x8a0 [ 130.066477] do_try_to_free_pages+0x131/0x4d0 [ 130.066717] try_to_free_mem_cgroup_pages+0x143/0x220 [ 130.067000] try_charge_memcg+0x22a/0x610 [ 130.067224] __mem_cgroup_charge+0x74/0x100 [ 130.068060] do_pte_missing+0xaa8/0x1020 [ 130.068280] handle_mm_fault+0x75d/0x1120 [ 130.068502] do_user_addr_fault+0x1c2/0x6f0 [ 130.068802] exc_page_fault+0x4f/0xb0 [ 130.069014] asm_exc_page_fault+0x22/0x30 [ 130.069240] RIP: 0033:0x7ff3f19ede49 [ 130.069441] Code: c9 62 e1 7f 29 7f 00 c3 66 0f 1f 84 00 00 00 00 00 40 0f b6 c6 48 89 d1 48 89 fa f3 aa 48 89 d0 c3 48 3b 15 c9 a3 06 00 77 e7 <62> e1 fe 28 7f 07 62 e1 fe 28 7f 47 01 48 81 fa 80 00 00 00 76 89 [ 130.070477] RSP: 002b:00007ffc5c818078 EFLAGS: 00010283 [ 130.070830] RAX: 00007ff3efac9000 RBX: 00007ff3f02d1940 RCX: 0000000000000001 [ 130.071522] RDX: 00000000000005a8 RSI: 0000000000000000 RDI: 00007ff3efac9000 [ 130.072146] RBP: 00007ffc5c8180c0 R08: 0000000003007320 R09: 0000000000000007 [ 130.072594] R10: 0000000003007320 R11: 0000000000000012 R12: 00007ff3f1f0e000 [ 130.072981] R13: 000000007ffa1e74 R14: 00000000000005a8 R15: 00000000000000b5 [ 130.073369] </TASK> [ 130.073496] Modules linked in: [ 130.073701] ---[ end trace 0000000000000000 ]--- [ 130.073960] RIP: 0010:__list_lru_walk_one+0x1ae/0x1b0 [ 130.074319] Code: 7c 24 78 00 74 03 fb eb 00 48 89 d8 48 83 c4 40 5b 41 5c 41 5d 41 5e 41 5f 5d c3 41 c6 07 00 eb e8 41 c6 07 00 fb eb e1 0f 0b <0f> 0b 0f 1f 44 00 00 6a 01 e8 44 fe ff ff 48 83 c4 08 c3 66 2e 0f [ 130.075564] RSP: 0000:ffffc90004a2b9a0 EFLAGS: 00010246 [ 130.075897] RAX: ffff88805dedf6e8 RBX: 0000000000000071 RCX: 0000000000000005 [ 130.076342] RDX: 0000000000000000 RSI: 0000000000000022 RDI: ffff888008a26400 [ 130.076739] RBP: ffff88805dedf6d0 R08: 0000000000000402 R09: 0000000000000001 [ 130.077192] R10: ffffc90004a2b7e8 R11: 0000000000000000 R12: ffffffff81342930 [ 130.077739] R13: ffff888017532ca0 R14: ffffc90004a2bae8 R15: ffff8880175322c8 [ 130.078149] FS: 00007ff3f1e21f00(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000 [ 130.078764] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 130.079095] CR2: 00007f428e2e2ed8 CR3: 0000000067db6001 CR4: 0000000000770ef0 [ 130.079521] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 130.080009] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 130.080402] PKRU: 55555554 [ 130.080713] Kernel panic - not syncing: Fatal exception [ 130.081198] Kernel Offset: disabled [ 130.081396] ---[ end Kernel panic - not syncing: Fatal exception ]--- Thanks, Usama
Hi Usama,

On Sat, Oct 26, 2024 at 5:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> On 25/09/2024 18:10, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > [full patch quote trimmed; see the message above]
>
> Hi Kairui,
>
> I was testing zswap writeback in mm-unstable, and I think this patch might be breaking things.
>
> I have added the panic below
>
> [ 130.051489] kernel BUG at mm/list_lru.c:321!
> [ 130.053276] RIP: 0010:__list_lru_walk_one+0x1ae/0x1b0
> [rest of the panic trace trimmed; see the message above]
>
> Thanks,
> Usama

Thanks for the report. I converted the list_lru_walk callbacks to keep the list unlocked when LRU_RETRY or LRU_REMOVED_RETRY is returned, but didn't notice that shrink_memcg_cb in zswap.c can return LRU_STOP after it has unlocked the list.

The fix should be simple. Is it easy to reproduce? Can you help verify?
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 79c2d21504a2..1a3caf4c4e14 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -298,9 +298,9 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
 		ret = isolate(item, l, cb_arg);
 		switch (ret) {
 		/*
-		 * LRU_RETRY and LRU_REMOVED_RETRY will drop the lru lock,
-		 * the list traversal will be invalid and have to restart from
-		 * scratch.
+		 * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru
+		 * lock, the list traversal will be invalid and have to restart
+		 * from scratch.
 		 */
 		case LRU_RETRY:
 			goto restart;
@@ -318,14 +318,13 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
 		case LRU_SKIP:
 			break;
 		case LRU_STOP:
-			assert_spin_locked(&l->lock);
 			goto out;
 		default:
 			BUG();
 		}
 	}
-out:
 	unlock_list_lru(l, irq_off);
+out:
 	return isolated;
 }
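For illustration, the isolate-callback contract the reworked walker relies on can be sketched as below. This is a minimal, hypothetical example, not code from the series: the object type and its fields are made up, and it assumes the four-argument callback signature used by the patch in this thread (the fix above calls isolate() without the lock argument, so a later patch in the series presumably drops it). The key point is that once the callback drops the per-cgroup lock it must not re-acquire it, and must return LRU_RETRY, LRU_REMOVED_RETRY or, with the fix applied, LRU_STOP, so the walker either restarts through lock_list_lru_of_memcg() or bails out instead of touching a list_lru_one that may have been reparented and freed.

/* Hypothetical isolate callback under the per-cgroup list_lru locking. */
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_obj {
	struct list_head lru;	/* linked into a list_lru */
	bool busy;		/* made-up "still in use" flag */
};

static enum lru_status demo_isolate(struct list_head *item,
				    struct list_lru_one *list,
				    spinlock_t *lock, void *cb_arg)
{
	struct demo_obj *obj = container_of(item, struct demo_obj, lru);

	/* The per-cgroup lock is held here; cheap checks are done directly. */
	if (obj->busy)
		return LRU_ROTATE;	/* lock still held, walk continues */

	/* Remove from the lru while the lock is still held. */
	list_lru_isolate(list, item);

	/*
	 * Dropping the lock may unblock reparenting, which can mark this
	 * list_lru_one dead and free it, so neither 'list' nor 'lock' may
	 * be touched again after this point.
	 */
	spin_unlock(lock);

	kfree(obj);		/* slow or blocking work would go here */

	/* The walker re-locks via lock_list_lru_of_memcg() and restarts. */
	return LRU_REMOVED_RETRY;
}

Callbacks that return with the lock still held (LRU_REMOVED, LRU_ROTATE, LRU_SKIP) are unaffected by the new contract.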
On 27/10/2024 17:26, Kairui Song wrote:
> Hi Usama,
>
>> Hi Kairui,
>>
>> I was testing zswap writeback in mm-unstable, and I think this patch might be breaking things.
>>
>> [panic trace trimmed; see the first message above]
>>
>> Thanks,
>> Usama
>
> Thanks for the report. I converted the list_lru_walk callbacks to keep the
> list unlocked when LRU_RETRY or LRU_REMOVED_RETRY is returned, but didn't
> notice that shrink_memcg_cb in zswap.c can return LRU_STOP after it has
> unlocked the list.
>
> The fix should be simple. Is it easy to reproduce? Can you help verify?
> [fix diff trimmed; see the message above]

Hi Kairui,

With this fix there are no more crashes. Thanks for the quick fix.

Just FYI, to test it, enable zswap and the zswap shrinker (echo Y > /sys/module/zswap/parameters/shrinker_enabled) and build the kernel in a memory-constrained environment (memory.max 1G).

Thanks,
Usama
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index b3acbc4174fb..86bbe40f4bcd 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1106,7 +1106,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mmput_async(mm); __free_page(page_to_free); - spin_lock(lock); return LRU_REMOVED_RETRY; err_invalid_vma: diff --git a/fs/inode.c b/fs/inode.c index 471ae4a31549..b95d80cb140b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -932,7 +932,6 @@ static enum lru_status inode_lru_isolate(struct list_head *item, mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); - spin_lock(lru_lock); return LRU_RETRY; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 7e2307921deb..665d26990b78 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -496,7 +496,6 @@ xfs_qm_dquot_isolate( trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); - spin_lock(lru_lock); return LRU_RETRY; } diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index eba93f6511f3..10ba9a54d42c 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -32,6 +32,8 @@ struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; + /* protects all fields above */ + spinlock_t lock; }; struct list_lru_memcg { @@ -41,11 +43,9 @@ struct list_lru_memcg { }; struct list_lru_node { - /* protects all lists on the node, including per cgroup */ - spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; - long nr_items; + atomic_long_t nr_items; } ____cacheline_aligned_in_smp; struct list_lru { diff --git a/mm/list_lru.c b/mm/list_lru.c index 172b16146e15..6d1c187d9019 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -61,18 +61,48 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } static inline struct list_lru_one * -list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + bool irq, bool skip_empty) { struct list_lru_one *l; + rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - if (likely(l)) - return l; - - memcg = parent_mem_cgroup(memcg); + if (likely(l)) { + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) { + WARN_ON(l->nr_items < 0); + rcu_read_unlock(); + return l; + } + if (irq) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); + } + /* + * Caller may simply bail out if raced with reparenting or + * may iterate through the list_lru and expect empty slots. 
+ */ + if (skip_empty) { + rcu_read_unlock(); + return NULL; + } VM_WARN_ON(!css_is_dying(&memcg->css)); + memcg = parent_mem_cgroup(memcg); goto again; } + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); +} #else static void list_lru_register(struct list_lru *lru) { @@ -99,31 +129,48 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } static inline struct list_lru_one * -list_lru_from_memcg(struct list_lru *lru, int nid, int idx) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + bool irq, bool skip_empty) { - return &lru->node[nid].lru; + struct list_lru_one *l = &lru->node[nid].lru; + + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + + return l; +} + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - spin_lock(&nlru->lock); + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + if (!l) + return false; if (list_empty(item)) { - l = list_lru_from_memcg(lru, nid, memcg); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - nlru->nr_items++; - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); + atomic_long_inc(&nlru->nr_items); return true; } - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); return false; } @@ -146,24 +193,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. 
*/ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - - spin_lock(&nlru->lock); + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + if (!l) + return false; if (!list_empty(item)) { - l = list_lru_from_memcg(lru, nid, memcg); list_del_init(item); l->nr_items--; - nlru->nr_items--; - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); + atomic_long_dec(&nlru->nr_items); return true; } - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); return false; } -EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { @@ -220,25 +266,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) struct list_lru_node *nlru; nlru = &lru->node[nid]; - return nlru->nr_items; + return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, +__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, - unsigned long *nr_to_walk) + unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; + struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: - l = list_lru_from_memcg_idx(lru, nid, memcg_idx); + l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) - goto out; - + return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; @@ -250,19 +295,20 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, break; --*nr_to_walk; - ret = isolate(item, l, &nlru->lock, cb_arg); + ret = isolate(item, l, &l->lock, cb_arg); switch (ret) { + /* + * LRU_RETRY and LRU_REMOVED_RETRY will drop the lru lock, + * the list traversal will be invalid and have to restart from + * scratch. + */ + case LRU_RETRY: + goto restart; case LRU_REMOVED_RETRY: - assert_spin_locked(&nlru->lock); fallthrough; case LRU_REMOVED: isolated++; - nlru->nr_items--; - /* - * If the lru lock has been dropped, our list - * traversal is now invalid and so we have to - * restart from scratch. - */ + atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; @@ -271,21 +317,15 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, break; case LRU_SKIP: break; - case LRU_RETRY: - /* - * The lru lock has been dropped, our list traversal is - * now invalid and so we have to restart from scratch. 
- */ - assert_spin_locked(&nlru->lock); - goto restart; case LRU_STOP: - assert_spin_locked(&nlru->lock); + assert_spin_locked(&l->lock); goto out; default: BUG(); } } out: + unlock_list_lru(l, irq_off); return isolated; } @@ -294,14 +334,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; - unsigned long ret; - - spin_lock(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, - cb_arg, nr_to_walk); - spin_unlock(&nlru->lock); - return ret; + return __list_lru_walk_one(lru, nid, memcg, isolate, + cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); @@ -310,14 +344,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; - unsigned long ret; - - spin_lock_irq(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, - cb_arg, nr_to_walk); - spin_unlock_irq(&nlru->lock); - return ret; + return __list_lru_walk_one(lru, nid, memcg, isolate, + cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, @@ -332,16 +360,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; + struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { - struct list_lru_node *nlru = &lru->node[nid]; - - spin_lock(&nlru->lock); - isolated += __list_lru_walk_one(lru, nid, index, + rcu_read_lock(); + memcg = mem_cgroup_from_id(index); + if (!mem_cgroup_tryget(memcg)) { + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, - nr_to_walk); - spin_unlock(&nlru->lock); + nr_to_walk, false); + mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; @@ -353,14 +386,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, } EXPORT_SYMBOL_GPL(list_lru_walk_node); -static void init_one_lru(struct list_lru_one *l) +static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); + spin_lock_init(&l->lock); l->nr_items = 0; +#ifdef CONFIG_LOCKDEP + if (lru->key) + lockdep_set_class(&l->lock, lru->key); +#endif } #ifdef CONFIG_MEMCG -static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) +static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; @@ -370,7 +408,7 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) return NULL; for_each_node(nid) - init_one_lru(&mlru->node[nid]); + init_one_lru(lru, &mlru->node[nid]); return mlru; } @@ -398,28 +436,27 @@ static void memcg_destroy_list_lru(struct list_lru *lru) xas_unlock_irq(&xas); } -static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, - struct list_lru_one *src, - struct mem_cgroup *dst_memcg) +static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, + struct list_lru_one *src, + struct mem_cgroup *dst_memcg) { - struct list_lru_node *nlru = &lru->node[nid]; + int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; - /* - * Since list_lru_{add,del} may be called under an IRQ-safe lock, - * we have to use IRQ-safe primitives here to avoid deadlock. 
- */ - spin_lock_irq(&nlru->lock); - dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); + spin_lock_irq(&src->lock); + dst = list_lru_from_memcg_idx(lru, nid, dst_idx); + spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); - if (src->nr_items) { dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); - src->nr_items = 0; } - spin_unlock_irq(&nlru->lock); + /* Mark the list_lru_one dead */ + src->nr_items = LONG_MIN; + + spin_unlock(&dst->lock); + spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) @@ -448,7 +485,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * safe to reparent. */ for_each_node(i) - memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); + memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed @@ -497,7 +534,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, parent = parent_mem_cgroup(pos); } - mlru = memcg_init_list_lru_one(gfp); + mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; xas_set(&xas, pos->kmemcg_id); @@ -544,14 +581,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr if (!lru->node) return -ENOMEM; - for_each_node(i) { - spin_lock_init(&lru->node[i].lock); -#ifdef CONFIG_LOCKDEP - if (lru->key) - lockdep_set_class(&lru->node[i].lock, lru->key); -#endif - init_one_lru(&lru->node[i].lru); - } + for_each_node(i) + init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e90aa026c47..f421dfcfe8a1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3098,8 +3098,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!parent) parent = root_mem_cgroup; - memcg_reparent_objcgs(memcg, parent); memcg_reparent_list_lrus(memcg, parent); + + /* + * Objcg's reparenting must be after list_lru's, make sure list_lru + * helpers won't use parent's list_lru until child is drained. + */ + memcg_reparent_objcgs(memcg, parent); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/workingset.c b/mm/workingset.c index df3937c5eedc..8c4b6738dcad 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -777,7 +777,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, ret = LRU_REMOVED_RETRY; out: cond_resched(); - spin_lock_irq(lru_lock); return ret; } diff --git a/mm/zswap.c b/mm/zswap.c index 95a28301b7cd..cb37a25fd00c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -706,9 +706,9 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * Note that it is safe to use rcu_read_lock() here, even in the face of * concurrent memcg offlining: * - * 1. list_lru_add() is called before list_lru_memcg is erased. The + * 1. list_lru_add() is called before list_lru_one is dead. The * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after list_lru_memcg is erased. The + * 2. list_lru_add() is called after list_lru_one is dead. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del(). @@ -1173,7 +1173,6 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_written_back_pages++; } - spin_lock(lock); return ret; }
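One last illustration of why the reparent ordering in memcontrol.c above
matters for callers: a condensed, hypothetical sketch of the objcg-based add
path that the zswap.c comment describes (demo_entry and demo_lru_add are
made-up names; the real zswap code does more bookkeeping). The entry pins an
obj_cgroup, and the memcg is resolved from it under RCU only around the
list_lru call. Because list_lrus are reparented before objcgs, the resolved
memcg either still has a live list_lru_one, or that list_lru_one is already
marked dead and lock_list_lru_of_memcg() falls back to the parent, so no item
can be left behind on a drained child list:

#include <linux/list_lru.h>
#include <linux/memcontrol.h>
#include <linux/rcupdate.h>

struct demo_entry {
	struct obj_cgroup *objcg;	/* pinned when the entry is created */
	struct list_head lru;
	int nid;
};

static void demo_lru_add(struct list_lru *list_lru, struct demo_entry *entry)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = obj_cgroup_memcg(entry->objcg);
	/*
	 * Safe against concurrent offlining: the objcg keeps pointing at the
	 * child memcg until memcg_reparent_objcgs(), which now runs after
	 * memcg_reparent_list_lrus(), so list_lru_add() either hits the
	 * child's still-live list_lru_one or transparently uses the parent's.
	 */
	list_lru_add(list_lru, &entry->lru, entry->nid, memcg);
	rcu_read_unlock();
}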