[RFC,06/28] mm: thp: introduce folio_split_queue_lock and its variants

Message ID: 20250415024532.26632-7-songmuchun@bytedance.com (mailing list archive)
State: New
Series: Eliminate Dying Memory Cgroup

Commit Message

Muchun Song April 15, 2025, 2:45 a.m. UTC
Once folios can be reparented away from a dying memcg, the binding between a
folio and its memcg may change, so a split queue lock taken from that memcg
can become unstable while it is held.

A new approach is needed so the split queue itself can be reparented to the
parent memcg. As a first step, this patch introduces a unified set of helpers
for acquiring and releasing the split queue lock, which later patches will
build on.

This is a code-only refactoring with no functional changes.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/memcontrol.h |  10 ++++
 mm/huge_memory.c           | 100 +++++++++++++++++++++++++++----------
 2 files changed, 83 insertions(+), 27 deletions(-)
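
The new lock helpers are meant to be taken and released in matched pairs. A
minimal usage sketch, condensed from the __folio_unqueue_deferred_split() hunk
below (partially-mapped accounting is omitted, and example_unqueue() itself is
just an illustrative wrapper, not part of the patch):

	static bool example_unqueue(struct folio *folio)
	{
		struct deferred_split *ds_queue;
		unsigned long flags;
		bool unqueued = false;

		/* Resolve the memcg or per-node queue and take its lock. */
		ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
		if (!list_empty(&folio->_deferred_list)) {
			ds_queue->split_queue_len--;
			list_del_init(&folio->_deferred_list);
			unqueued = true;
		}
		split_queue_unlock_irqrestore(ds_queue, flags);

		return unqueued;
	}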

Comments

Johannes Weiner April 17, 2025, 2:58 p.m. UTC | #1
On Tue, Apr 15, 2025 at 10:45:10AM +0800, Muchun Song wrote:
> Once folios can be reparented away from a dying memcg, the binding between a
> folio and its memcg may change, so a split queue lock taken from that memcg
> can become unstable while it is held.
> 
> A new approach is needed so the split queue itself can be reparented to the
> parent memcg. As a first step, this patch introduces a unified set of helpers
> for acquiring and releasing the split queue lock, which later patches will
> build on.
> 
> This is a code-only refactoring with no functional changes.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Johannes Weiner April 18, 2025, 7:50 p.m. UTC | #2
On Tue, Apr 15, 2025 at 10:45:10AM +0800, Muchun Song wrote:
> @@ -4202,7 +4248,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  		if (!--sc->nr_to_scan)
>  			break;
>  	}
> -	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> +	split_queue_unlock_irqrestore(ds_queue, flags);
>  
>  	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
>  		bool did_split = false;
> @@ -4251,7 +4297,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>  	list_splice_tail(&list, &ds_queue->split_queue);
>  	ds_queue->split_queue_len -= removed;
> -	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> +	split_queue_unlock_irqrestore(ds_queue, flags);

These tripped up lockdep in my testing. You use the new helpers for the
unlock, but not for the lock path. That's fine within this patch, but once
"mm: thp: prepare for reparenting LRU pages for split queue lock" adds
the rcu locking to the helpers, it results in missing rcu read locks:

[  108.814880]
[  108.816378] =====================================
[  108.821069] WARNING: bad unlock balance detected!
[  108.825762] 6.15.0-rc2-00028-g570c8034f057 #192 Not tainted
[  108.831323] -------------------------------------
[  108.836016] cc1/2031 is trying to release lock (rcu_read_lock) at:
[  108.842181] [<ffffffff815f9d05>] deferred_split_scan+0x235/0x4b0
[  108.848179] but there are no more locks to release!
[  108.853046]
[  108.853046] other info that might help us debug this:
[  108.859553] 2 locks held by cc1/2031:
[  108.863211]  #0: ffff88801ddbbd88 (vm_lock){....}-{0:0}, at: do_user_addr_fault+0x19c/0x6b0
[  108.871544]  #1: ffffffff83042400 (fs_reclaim){....}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x337/0xf20
[  108.881511]
[  108.881511] stack backtrace:
[  108.885862] CPU: 4 UID: 0 PID: 2031 Comm: cc1 Not tainted 6.15.0-rc2-00028-g570c8034f057 #192 PREEMPT(voluntary)
[  108.885865] Hardware name: Micro-Star International Co., Ltd. MS-7B98/Z390-A PRO (MS-7B98), BIOS 1.80 12/25/2019
[  108.885866] Call Trace:
[  108.885867]  <TASK>
[  108.885868]  dump_stack_lvl+0x57/0x80
[  108.885871]  ? deferred_split_scan+0x235/0x4b0
[  108.885874]  print_unlock_imbalance_bug.part.0+0xfb/0x110
[  108.885877]  ? deferred_split_scan+0x235/0x4b0
[  108.885878]  lock_release+0x258/0x3e0
[  108.885880]  ? deferred_split_scan+0x85/0x4b0
[  108.885881]  deferred_split_scan+0x23a/0x4b0
[  108.885885]  ? find_held_lock+0x32/0x80
[  108.885886]  ? local_clock_noinstr+0x9/0xd0
[  108.885887]  ? lock_release+0x17e/0x3e0
[  108.885889]  do_shrink_slab+0x155/0x480
[  108.885891]  shrink_slab+0x33c/0x480
[  108.885892]  ? shrink_slab+0x1c1/0x480
[  108.885893]  shrink_node+0x324/0x840
[  108.885895]  do_try_to_free_pages+0xdf/0x550
[  108.885897]  try_to_free_pages+0xeb/0x260
[  108.885899]  __alloc_pages_slowpath.constprop.0+0x35c/0xf20
[  108.885901]  __alloc_frozen_pages_noprof+0x339/0x360
[  108.885903]  __folio_alloc_noprof+0x10/0x90
[  108.885904]  __handle_mm_fault+0xca5/0x1930
[  108.885906]  handle_mm_fault+0xb6/0x310
[  108.885908]  do_user_addr_fault+0x21e/0x6b0
[  108.885910]  exc_page_fault+0x62/0x1d0
[  108.885911]  asm_exc_page_fault+0x22/0x30
[  108.885912] RIP: 0033:0xf64890
[  108.885914] Code: 4e 64 31 d2 b9 01 00 00 00 31 f6 4c 89 45 98 e8 66 b3 88 ff 4c 8b 45 98 bf 28 00 00 00 b9 08 00 00 00 49 8b 70 18 48 8b 56 58 <48> 89 10 48 8b 13 48 89 46 58 c7 46 60 00 00 00 00 e9 62 01 00 00
[  108.885915] RSP: 002b:00007ffcf3c7d920 EFLAGS: 00010206
[  108.885916] RAX: 00007f7bf07c5000 RBX: 00007ffcf3c7d9a0 RCX: 0000000000000008
[  108.885917] RDX: 00007f7bf06aa000 RSI: 00007f7bf09dd400 RDI: 0000000000000028
[  108.885917] RBP: 00007ffcf3c7d990 R08: 00007f7bf080c540 R09: 0000000000000007
[  108.885918] R10: 000000000000009a R11: 000000003e969900 R12: 00007f7bf07bbe70
[  108.885918] R13: 0000000000000000 R14: 00007f7bf07bbec0 R15: 00007ffcf3c7d930
[  108.885920]  </TASK>
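
To make the imbalance concrete: once the follow-up patch referenced above makes
the helpers also enter and leave an RCU read-side section (an assumption based
on its title rather than on code shown here), pairing a raw spin_lock_irqsave()
with the helper unlock releases an rcu_read_lock() that was never taken. A
rough sketch of the requeue path that lockdep complains about:

	/* raw lock: no rcu_read_lock() is taken here */
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	ds_queue->split_queue_len -= removed;
	/*
	 * helper unlock: with the follow-up patch applied it would also call
	 * rcu_read_unlock(), hence the "bad unlock balance" splat above.
	 */
	split_queue_unlock_irqrestore(ds_queue, flags);
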
Muchun Song April 19, 2025, 2:20 p.m. UTC | #3
> On Apr 19, 2025, at 03:50, Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> On Tue, Apr 15, 2025 at 10:45:10AM +0800, Muchun Song wrote:
>> @@ -4202,7 +4248,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>> if (!--sc->nr_to_scan)
>> break;
>> }
>> - 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>> + 	split_queue_unlock_irqrestore(ds_queue, flags);
>> 
>> 	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
>> 		bool did_split = false;
>> @@ -4251,7 +4297,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>> 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> 	list_splice_tail(&list, &ds_queue->split_queue);
>> 	ds_queue->split_queue_len -= removed;
>> - 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>> + 	split_queue_unlock_irqrestore(ds_queue, flags);
> 
> These tripped up lockdep in my testing. You use the new helpers for the
> unlock, but not for the lock path. That's fine within this patch, but once
> "mm: thp: prepare for reparenting LRU pages for split queue lock" adds
> the rcu locking to the helpers, it results in missing rcu read locks:

Good catch! Thanks for pointing that out. You are right: I shouldn't use the
new unlock helpers here without the corresponding new lock helpers. I'll
revert this change in this function.

Thanks,
Muchun

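
For reference, the revert Muchun describes keeps the requeue path in
deferred_split_scan() as a raw lock/unlock pair, so neither side implies an RCU
section (a sketch of the post-revert code, matching the pre-patch lines in the
hunk further below):

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	ds_queue->split_queue_len -= removed;
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);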

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a045819bcf40..bb4f203733f3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1639,6 +1639,11 @@  int alloc_shrinker_info(struct mem_cgroup *memcg);
 void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
 void reparent_shrinker_deferred(struct mem_cgroup *memcg);
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+	return shrinker->id;
+}
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1652,6 +1657,11 @@  static inline void set_shrinker_bit(struct mem_cgroup *memcg,
 				    int nid, int shrinker_id)
 {
 }
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+	return -1;
+}
 #endif
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a81e89987ca2..70820fa75c1f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1059,26 +1059,75 @@  pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 
 #ifdef CONFIG_MEMCG
 static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+					   struct deferred_split *queue)
+{
+	if (mem_cgroup_disabled())
+		return NULL;
+	if (&NODE_DATA(folio_nid(folio))->deferred_split_queue == queue)
+		return NULL;
+	return container_of(queue, struct mem_cgroup, deferred_split_queue);
+}
+
+static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio)
 {
 	struct mem_cgroup *memcg = folio_memcg(folio);
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
 
-	if (memcg)
-		return &memcg->deferred_split_queue;
-	else
-		return &pgdat->deferred_split_queue;
+	return memcg ? &memcg->deferred_split_queue : NULL;
 }
 #else
 static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+					   struct deferred_split *queue)
 {
-	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+	return NULL;
+}
 
-	return &pgdat->deferred_split_queue;
+static inline struct deferred_split *folio_memcg_split_queue(struct folio *folio)
+{
+	return NULL;
 }
 #endif
 
+static struct deferred_split *folio_split_queue(struct folio *folio)
+{
+	struct deferred_split *queue = folio_memcg_split_queue(folio);
+
+	return queue ? : &NODE_DATA(folio_nid(folio))->deferred_split_queue;
+}
+
+static struct deferred_split *folio_split_queue_lock(struct folio *folio)
+{
+	struct deferred_split *queue;
+
+	queue = folio_split_queue(folio);
+	spin_lock(&queue->split_queue_lock);
+
+	return queue;
+}
+
+static struct deferred_split *
+folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
+{
+	struct deferred_split *queue;
+
+	queue = folio_split_queue(folio);
+	spin_lock_irqsave(&queue->split_queue_lock, *flags);
+
+	return queue;
+}
+
+static inline void split_queue_unlock(struct deferred_split *queue)
+{
+	spin_unlock(&queue->split_queue_lock);
+}
+
+static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
+						 unsigned long flags)
+{
+	spin_unlock_irqrestore(&queue->split_queue_lock, flags);
+}
+
 static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -3723,7 +3772,7 @@  static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
 		struct list_head *list, bool uniform_split)
 {
-	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
+	struct deferred_split *ds_queue;
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
 	bool is_anon = folio_test_anon(folio);
 	struct address_space *mapping = NULL;
@@ -3857,7 +3906,7 @@  static int __folio_split(struct folio *folio, unsigned int new_order,
 	}
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock(&ds_queue->split_queue_lock);
+	ds_queue = folio_split_queue_lock(folio);
 	if (folio_ref_freeze(folio, 1 + extra_pins)) {
 		if (folio_order(folio) > 1 &&
 		    !list_empty(&folio->_deferred_list)) {
@@ -3875,7 +3924,7 @@  static int __folio_split(struct folio *folio, unsigned int new_order,
 			 */
 			list_del_init(&folio->_deferred_list);
 		}
-		spin_unlock(&ds_queue->split_queue_lock);
+		split_queue_unlock(ds_queue);
 		if (mapping) {
 			int nr = folio_nr_pages(folio);
 
@@ -3896,7 +3945,7 @@  static int __folio_split(struct folio *folio, unsigned int new_order,
 				split_at, lock_at, list, end, &xas, mapping,
 				uniform_split);
 	} else {
-		spin_unlock(&ds_queue->split_queue_lock);
+		split_queue_unlock(ds_queue);
 fail:
 		if (mapping)
 			xas_unlock(&xas);
@@ -4050,8 +4099,7 @@  bool __folio_unqueue_deferred_split(struct folio *folio)
 	WARN_ON_ONCE(folio_ref_count(folio));
 	WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
 
-	ds_queue = get_deferred_split_queue(folio);
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
 		if (folio_test_partially_mapped(folio)) {
@@ -4062,7 +4110,7 @@  bool __folio_unqueue_deferred_split(struct folio *folio)
 		list_del_init(&folio->_deferred_list);
 		unqueued = true;
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 
 	return unqueued;	/* useful for debug warnings */
 }
@@ -4070,10 +4118,7 @@  bool __folio_unqueue_deferred_split(struct folio *folio)
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
-	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
-#ifdef CONFIG_MEMCG
-	struct mem_cgroup *memcg = folio_memcg(folio);
-#endif
+	struct deferred_split *ds_queue;
 	unsigned long flags;
 
 	/*
@@ -4096,7 +4141,7 @@  void deferred_split_folio(struct folio *folio, bool partially_mapped)
 	if (folio_test_swapcache(folio))
 		return;
 
-	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
 	if (partially_mapped) {
 		if (!folio_test_partially_mapped(folio)) {
 			folio_set_partially_mapped(folio);
@@ -4111,15 +4156,16 @@  void deferred_split_folio(struct folio *folio, bool partially_mapped)
 		VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
 	}
 	if (list_empty(&folio->_deferred_list)) {
+		struct mem_cgroup *memcg;
+
+		memcg = folio_split_queue_memcg(folio, ds_queue);
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
-#ifdef CONFIG_MEMCG
 		if (memcg)
 			set_shrinker_bit(memcg, folio_nid(folio),
-					 deferred_split_shrinker->id);
-#endif
+					 shrinker_id(deferred_split_shrinker));
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -4202,7 +4248,7 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 		if (!--sc->nr_to_scan)
 			break;
 	}
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 
 	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
 		bool did_split = false;
@@ -4251,7 +4297,7 @@  static unsigned long deferred_split_scan(struct shrinker *shrink,
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	list_splice_tail(&list, &ds_queue->split_queue);
 	ds_queue->split_queue_len -= removed;
-	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	split_queue_unlock_irqrestore(ds_queue, flags);
 
 	if (prev)
 		folio_put(prev);