
[4/4] cgroup: save memory by splitting cgroup_rstat_cpu into compact and full versions

Message ID 20250319221634.71128-5-inwardvessel@gmail.com (mailing list archive)
State New
Series cgroup: separate rstat trees

Commit Message

JP Kobryn March 19, 2025, 10:16 p.m. UTC
The css_rstat_cpu struct contains the rstat node pointers as well as the
base stat objects. Since ownership of this per-cpu struct has shifted from
cgroup to cgroup_subsys_state, css's other than cgroup::self now carry
along base stat objects that go unused. Eliminate this wasted memory by
splitting css_rstat_cpu into two separate structs.

The css_rstat_cpu struct is reduced so that it now contains only the rstat
node pointers. css's that are associated with a subsystem (memory, io) use
this compact struct to participate in rstat without the memory overhead of
the base stat objects.

For the css's represented by cgroup::self, a new css_rstat_base_cpu struct
is introduced. It contains the new compact css_rstat_cpu struct as its
first field, followed by the base stat objects.

Because the rstat pointers sit at the same offset (the beginning) of both
structs, cgroup_subsys_state is modified to hold a union of the two per-cpu
pointers. Where css initialization is done, the compact struct is allocated
when the css is associated with a subsystem; when it is not, the full
struct is allocated. The union allows the existing rstat updated/flush
routines to work with any css regardless of subsystem association, while
the base stat routines are modified to access the full struct field of the
union.
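
For reference, a condensed sketch of the resulting layout (the field list
is abbreviated from the existing definitions; the exact declarations and
comments are in the hunks below):

    /* compact form: rstat node pointers only (subsystem css's) */
    struct css_rstat_cpu {
        struct cgroup_subsys_state *updated_children;
        struct cgroup_subsys_state *updated_next;
    };

    /* full form: the compact struct first, then the base stat objects
     * (cgroup::self only)
     */
    struct css_rstat_base_cpu {
        struct css_rstat_cpu rstat_cpu;
        struct u64_stats_sync bsync;
        struct cgroup_base_stat bstat;
        struct cgroup_base_stat last_bstat;
        struct cgroup_base_stat subtree_bstat;
        struct cgroup_base_stat last_subtree_bstat;
    };

    /* in cgroup_subsys_state: a single per-cpu allocation, accessed
     * through whichever union member matches what was allocated
     */
    union {
        struct css_rstat_cpu __percpu *rstat_cpu;
        struct css_rstat_base_cpu __percpu *rstat_base_cpu;
    };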

The change in memory on a per-cpu basis is shown below.

before:
    struct size
    sizeof(css_rstat_cpu) =~ 144 bytes /* can vary based on config */

    per-cpu overhead
    nr_cgroups * (
	sizeof(css_rstat_cpu) * (1 + nr_rstat_subsystems)
    )
    nr_cgroups * (144 * (1 + 2))
    nr_cgroups * 432

    432 bytes per cgroup per cpu

after:
    struct sizes
    sizeof(css_rstat_base_cpu) =~ 144 bytes
    sizeof(css_rstat_cpu) = 16 bytes

    per-cpu overhead
    nr_cgroups * (
	sizeof(css_rstat_base_cpu) +
	sizeof(css_rstat_cpu) * (nr_rstat_subsystems)
    )
    nr_cgroups * (144 + 16 * 2)
    nr_cgroups * 176

    176 bytes per cgroup per cpu

savings:
    256 bytes per cgroup per cpu
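
As a purely illustrative example (hypothetical machine size, not a
measurement): with 64 possible CPUs and 1000 cgroups, this works out to:

    before: 432 * 1000 * 64 = 27,648,000 bytes (~26.4 MiB)
    after:  176 * 1000 * 64 = 11,264,000 bytes (~10.7 MiB)
    saved:  256 * 1000 * 64 = 16,384,000 bytes (~15.6 MiB)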

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
---
 include/linux/cgroup-defs.h |  41 +++++++++------
 kernel/cgroup/rstat.c       | 100 ++++++++++++++++++++++--------------
 2 files changed, 86 insertions(+), 55 deletions(-)

Patch

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 0ffc8438c6d9..f9b84e7f718d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -170,7 +170,10 @@  struct cgroup_subsys_state {
 	struct percpu_ref refcnt;
 
 	/* per-cpu recursive resource statistics */
-	struct css_rstat_cpu __percpu *rstat_cpu;
+	union {
+		struct css_rstat_cpu __percpu *rstat_cpu;
+		struct css_rstat_base_cpu __percpu *rstat_base_cpu;
+	};
 
 	/*
 	 * siblings list anchored at the parent's ->children
@@ -358,6 +361,26 @@  struct cgroup_base_stat {
  * resource statistics on top of it - bsync, bstat and last_bstat.
  */
 struct css_rstat_cpu {
+	/*
+	 * Child css's with stat updates on this cpu since the last read
+	 * are linked on the parent's ->updated_children through
+	 * ->updated_next.
+	 *
+	 * In addition to being more compact, the singly-linked list
+	 * pointing to the css makes it unnecessary for each per-cpu
+	 * struct to point back to the associated css.
+	 *
+	 * Protected by the per-cpu rstat_base_cpu_lock when
+	 * css->ss == NULL; otherwise protected by the per-cpu
+	 * css->ss->rstat_cpu_lock.
+	 */
+	struct cgroup_subsys_state *updated_children;	/* terminated by self */
+	struct cgroup_subsys_state *updated_next;	/* NULL if not on list */
+};
+
+struct css_rstat_base_cpu {
+	struct css_rstat_cpu rstat_cpu;
+
 	/*
 	 * ->bsync protects ->bstat.  These are the only fields which get
 	 * updated in the hot path.
@@ -384,22 +407,6 @@  struct css_rstat_cpu {
 	 * deltas to propagate to the per-cpu subtree_bstat.
 	 */
 	struct cgroup_base_stat last_subtree_bstat;
-
-	/*
-	 * Child cgroups with stat updates on this cpu since the last read
-	 * are linked on the parent's ->updated_children through
-	 * ->updated_next.
-	 *
-	 * In addition to being more compact, singly-linked list pointing
-	 * to the cgroup makes it unnecessary for each per-cpu struct to
-	 * point back to the associated cgroup.
-	 *
-	 * Protected by per-cpu rstat_base_cpu_lock when css->ss == NULL
-	 * otherwise,
-	 * Protected by per-cpu css->ss->rstat_cpu_lock
-	 */
-	struct cgroup_subsys_state *updated_children;	/* terminated by self */
-	struct cgroup_subsys_state *updated_next;	/* NULL if not on list */
 };
 
 struct cgroup_freezer_state {
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ffd7ac6bcefc..250f0987407e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -20,6 +20,12 @@  static struct css_rstat_cpu *css_rstat_cpu(
 	return per_cpu_ptr(css->rstat_cpu, cpu);
 }
 
+static struct css_rstat_base_cpu *css_rstat_base_cpu(
+		struct cgroup_subsys_state *css, int cpu)
+{
+	return per_cpu_ptr(css->rstat_base_cpu, cpu);
+}
+
 static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
 {
 	if (ss)
@@ -425,17 +431,35 @@  int css_rstat_init(struct cgroup_subsys_state *css)
 
 	/* the root cgrp's self css has rstat_cpu preallocated */
 	if (!css->rstat_cpu) {
-		css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
-		if (!css->rstat_cpu)
-			return -ENOMEM;
+		/*
+		 * One of the union fields must be initialized. Allocate the
+		 * larger rstat struct for base stats when the css is
+		 * cgroup::self; otherwise, allocate the compact rstat struct
+		 * since the css is associated with a subsystem.
+		 */
+		if (css_is_cgroup(css)) {
+			css->rstat_base_cpu = alloc_percpu(struct css_rstat_base_cpu);
+			if (!css->rstat_base_cpu)
+				return -ENOMEM;
+		} else {
+			css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
+			if (!css->rstat_cpu)
+				return -ENOMEM;
+		}
 	}
 
-	/* ->updated_children list is self terminated */
 	for_each_possible_cpu(cpu) {
-		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+		struct css_rstat_cpu *rstatc;
 
+		rstatc = css_rstat_cpu(css, cpu);
 		rstatc->updated_children = css;
-		u64_stats_init(&rstatc->bsync);
+
+		if (css_is_cgroup(css)) {
+			struct css_rstat_base_cpu *rstatbc;
+
+			rstatbc = css_rstat_base_cpu(css, cpu);
+			u64_stats_init(&rstatbc->bsync);
+		}
 	}
 
 	return 0;
@@ -522,9 +546,9 @@  static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 {
-	struct css_rstat_cpu *rstatc = css_rstat_cpu(&cgrp->self, cpu);
+	struct css_rstat_base_cpu *rstatbc = css_rstat_base_cpu(&cgrp->self, cpu);
 	struct cgroup *parent = cgroup_parent(cgrp);
-	struct css_rstat_cpu *prstatc;
+	struct css_rstat_base_cpu *prstatbc;
 	struct cgroup_base_stat delta;
 	unsigned seq;
 
@@ -534,15 +558,15 @@  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 
 	/* fetch the current per-cpu values */
 	do {
-		seq = __u64_stats_fetch_begin(&rstatc->bsync);
-		delta = rstatc->bstat;
-	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+		seq = __u64_stats_fetch_begin(&rstatbc->bsync);
+		delta = rstatbc->bstat;
+	} while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));
 
 	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
-	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
+	cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
 	cgroup_base_stat_add(&cgrp->bstat, &delta);
-	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
-	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
+	cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
+	cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);
 
 	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
 	if (cgroup_parent(parent)) {
@@ -551,73 +575,73 @@  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 		cgroup_base_stat_add(&parent->bstat, &delta);
 		cgroup_base_stat_add(&cgrp->last_bstat, &delta);
 
-		delta = rstatc->subtree_bstat;
-		prstatc = css_rstat_cpu(&parent->self, cpu);
-		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
-		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
-		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
+		delta = rstatbc->subtree_bstat;
+		prstatbc = css_rstat_base_cpu(&parent->self, cpu);
+		cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
+		cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
+		cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
 	}
 }
 
-static struct css_rstat_cpu *
+static struct css_rstat_base_cpu *
 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
 {
-	struct css_rstat_cpu *rstatc;
+	struct css_rstat_base_cpu *rstatbc;
 
-	rstatc = get_cpu_ptr(cgrp->self.rstat_cpu);
-	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
-	return rstatc;
+	rstatbc = get_cpu_ptr(cgrp->self.rstat_base_cpu);
+	*flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
+	return rstatbc;
 }
 
 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
-						 struct css_rstat_cpu *rstatc,
+						 struct css_rstat_base_cpu *rstatbc,
 						 unsigned long flags)
 {
-	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
+	u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
 	css_rstat_updated(&cgrp->self, smp_processor_id());
-	put_cpu_ptr(rstatc);
+	put_cpu_ptr(rstatbc);
 }
 
 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
 {
-	struct css_rstat_cpu *rstatc;
+	struct css_rstat_base_cpu *rstatbc;
 	unsigned long flags;
 
-	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
-	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
-	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+	rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
+	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
 }
 
 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 				    enum cpu_usage_stat index, u64 delta_exec)
 {
-	struct css_rstat_cpu *rstatc;
+	struct css_rstat_base_cpu *rstatbc;
 	unsigned long flags;
 
-	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 
 	switch (index) {
 	case CPUTIME_NICE:
-		rstatc->bstat.ntime += delta_exec;
+		rstatbc->bstat.ntime += delta_exec;
 		fallthrough;
 	case CPUTIME_USER:
-		rstatc->bstat.cputime.utime += delta_exec;
+		rstatbc->bstat.cputime.utime += delta_exec;
 		break;
 	case CPUTIME_SYSTEM:
 	case CPUTIME_IRQ:
 	case CPUTIME_SOFTIRQ:
-		rstatc->bstat.cputime.stime += delta_exec;
+		rstatbc->bstat.cputime.stime += delta_exec;
 		break;
 #ifdef CONFIG_SCHED_CORE
 	case CPUTIME_FORCEIDLE:
-		rstatc->bstat.forceidle_sum += delta_exec;
+		rstatbc->bstat.forceidle_sum += delta_exec;
 		break;
 #endif
 	default:
 		break;
 	}
 
-	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
 }
 
 /*