diff mbox series

[v2,3/3] mm, swap_cgroup: remove global swap cgroup lock

Message ID 20241210092805.87281-4-ryncsn@gmail.com (mailing list archive)
State New
Headers show
Series mm/swap_cgroup: remove global swap cgroup lock | expand

Commit Message

Kairui Song Dec. 10, 2024, 9:28 a.m. UTC
From: Kairui Song <kasong@tencent.com>

commit e9e58a4ec3b1 ("memcg: avoid use cmpxchg in swap cgroup maintainance")
replaced the cmpxchg/xchg with a global irq spinlock because some archs
doesn't support 2 bytes cmpxchg/xchg. Clearly this won't scale well.

And as commented in swap_cgroup.c, this lock is not needed for map
synchronization.

Emulation of 2 bytes xchg with atomic cmpxchg isn't hard, so implement
it to get rid of this lock. Introduced two helpers for doing so and they
can be easily dropped if a generic 2 byte xchg is support.

Testing using 64G brd and build with build kernel with make -j96 in 1.5G
memory cgroup using 4k folios showed below improvement (10 test run):

Before this series:
Sys time: 10809.46 (stdev 80.831491)
Real time: 171.41 (stdev 1.239894)

After this commit:
Sys time: 9621.26 (stdev 34.620000), -10.42%
Real time: 160.00 (stdev 0.497814), -6.57%

With 64k folios and 2G memcg:
Before this series:
Sys time: 8231.99 (stdev 30.030994)
Real time: 143.57 (stdev 0.577394)

After this commit:
Sys time: 7403.47 (stdev 6.270000), -10.06%
Real time: 135.18 (stdev 0.605000), -5.84%

Sequential swapout of 8G 64k zero folios with madvise (24 test run):
Before this series:
5461409.12 us (stdev 183957.827084)

After this commit:
5420447.26 us (stdev 196419.240317)

Sequential swapin of 8G 4k zero folios (24 test run):
Before this series:
19736958.916667 us (stdev 189027.246676)

After this commit:
19662182.629630 us (stdev 172717.640614)

Performance is better or at least not worse for all tests above.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swap_cgroup.c | 73 +++++++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 28 deletions(-)

Comments

Roman Gushchin Dec. 11, 2024, 1:19 a.m. UTC | #1
On Tue, Dec 10, 2024 at 05:28:05PM +0800, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> 
> commit e9e58a4ec3b1 ("memcg: avoid use cmpxchg in swap cgroup maintainance")
> replaced the cmpxchg/xchg with a global irq spinlock because some archs
> doesn't support 2 bytes cmpxchg/xchg. Clearly this won't scale well.
> 
> And as commented in swap_cgroup.c, this lock is not needed for map
> synchronization.
> 
> Emulation of 2 bytes xchg with atomic cmpxchg isn't hard, so implement
> it to get rid of this lock. Introduced two helpers for doing so and they
> can be easily dropped if a generic 2 byte xchg is support.
> 
> Testing using 64G brd and build with build kernel with make -j96 in 1.5G
> memory cgroup using 4k folios showed below improvement (10 test run):
> 
> Before this series:
> Sys time: 10809.46 (stdev 80.831491)
> Real time: 171.41 (stdev 1.239894)
> 
> After this commit:
> Sys time: 9621.26 (stdev 34.620000), -10.42%
> Real time: 160.00 (stdev 0.497814), -6.57%
> 
> With 64k folios and 2G memcg:
> Before this series:
> Sys time: 8231.99 (stdev 30.030994)
> Real time: 143.57 (stdev 0.577394)
> 
> After this commit:
> Sys time: 7403.47 (stdev 6.270000), -10.06%
> Real time: 135.18 (stdev 0.605000), -5.84%
> 
> Sequential swapout of 8G 64k zero folios with madvise (24 test run):
> Before this series:
> 5461409.12 us (stdev 183957.827084)
> 
> After this commit:
> 5420447.26 us (stdev 196419.240317)
> 
> Sequential swapin of 8G 4k zero folios (24 test run):
> Before this series:
> 19736958.916667 us (stdev 189027.246676)
> 
> After this commit:
> 19662182.629630 us (stdev 172717.640614)
> 
> Performance is better or at least not worse for all tests above.
> 
> Signed-off-by: Kairui Song <kasong@tencent.com>

Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>

Thanks!
diff mbox series

Patch

diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 1770b076f6b7..a0a8547dc85d 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -7,19 +7,20 @@ 
 
 static DEFINE_MUTEX(swap_cgroup_mutex);
 
+/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */
+#define ID_PER_SC (sizeof(atomic_t) / sizeof(unsigned short))
+#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
+#define ID_MASK (BIT(ID_SHIFT) - 1)
 struct swap_cgroup {
-	unsigned short		id;
+	atomic_t ids;
 };
 
 struct swap_cgroup_ctrl {
 	struct swap_cgroup *map;
-	spinlock_t	lock;
 };
 
 static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 
-#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
-
 /*
  * SwapCgroup implements "lookup" and "exchange" operations.
  * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
@@ -30,19 +31,32 @@  static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
  *    SwapCache(and its swp_entry) is under lock.
  *  - When called via swap_free(), there is no user of this entry and no race.
  * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
  */
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
-					struct swap_cgroup_ctrl **ctrlp)
+static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
+					      pgoff_t offset)
 {
-	pgoff_t offset = swp_offset(ent);
-	struct swap_cgroup_ctrl *ctrl;
+	unsigned int shift = (offset & 1) ? 0 : ID_SHIFT;
+	unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);
 
-	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
-	if (ctrlp)
-		*ctrlp = ctrl;
-	return &ctrl->map[offset];
+	return (old_ids & (ID_MASK << shift)) >> shift;
+}
+
+static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
+					    pgoff_t offset,
+					    unsigned short new_id)
+{
+	unsigned short old_id;
+	unsigned int shift = (offset & 1) ? 0 : ID_SHIFT;
+	struct swap_cgroup *sc = &map[offset / ID_PER_SC];
+	unsigned int new_ids, old_ids = atomic_read(&sc->ids);
+
+	do {
+		old_id = (old_ids & (ID_MASK << shift)) >> shift;
+		new_ids = (old_ids & ~(ID_MASK << shift));
+		new_ids |= ((unsigned int)new_id) << shift;
+	} while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));
+
+	return old_id;
 }
 
 /**
@@ -58,21 +72,19 @@  unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 				  unsigned int nr_ents)
 {
 	struct swap_cgroup_ctrl *ctrl;
-	struct swap_cgroup *sc;
-	unsigned short old;
-	unsigned long flags;
 	pgoff_t offset = swp_offset(ent);
 	pgoff_t end = offset + nr_ents;
+	unsigned short old, iter;
+	struct swap_cgroup *map;
 
-	sc = lookup_swap_cgroup(ent, &ctrl);
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	map = ctrl->map;
 
-	spin_lock_irqsave(&ctrl->lock, flags);
-	old = sc->id;
-	for (; offset < end; offset++, sc++) {
-		VM_BUG_ON(sc->id != old);
-		sc->id = id;
-	}
-	spin_unlock_irqrestore(&ctrl->lock, flags);
+	old = __swap_cgroup_id_lookup(map, offset);
+	do {
+		iter = __swap_cgroup_id_xchg(map, offset, id);
+		VM_BUG_ON(iter != old);
+	} while (++offset != end);
 
 	return old;
 }
@@ -85,9 +97,13 @@  unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
  */
 unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
 {
+	struct swap_cgroup_ctrl *ctrl;
+
 	if (mem_cgroup_disabled())
 		return 0;
-	return lookup_swap_cgroup(ent, NULL)->id;
+
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
 }
 
 int swap_cgroup_swapon(int type, unsigned long max_pages)
@@ -98,14 +114,15 @@  int swap_cgroup_swapon(int type, unsigned long max_pages)
 	if (mem_cgroup_disabled())
 		return 0;
 
-	map = vcalloc(max_pages, sizeof(struct swap_cgroup));
+	BUILD_BUG_ON(!ID_PER_SC);
+	map = vcalloc(DIV_ROUND_UP(max_pages, ID_PER_SC),
+		      sizeof(struct swap_cgroup));
 	if (!map)
 		goto nomem;
 
 	ctrl = &swap_cgroup_ctrl[type];
 	mutex_lock(&swap_cgroup_mutex);
 	ctrl->map = map;
-	spin_lock_init(&ctrl->lock);
 	mutex_unlock(&swap_cgroup_mutex);
 
 	return 0;