
[v2,2/7] memcg: dynamically allocate lruvec_stats

Message ID 20240427003733.3898961-3-shakeel.butt@linux.dev (mailing list archive)
State New
Series memcg: reduce memory consumption by memcg stats

Commit Message

Shakeel Butt April 27, 2024, 12:37 a.m. UTC
To decouple lruvec_stats from NR_VM_NODE_STAT_ITEMS, we need to
dynamically allocate lruvec_stats in the mem_cgroup_per_node structure.
Also move the definitions of lruvec_stats_percpu and lruvec_stats, along
with the related functions, to memcontrol.c to facilitate later patches.
No functional changes in this patch.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/memcontrol.h | 62 +++------------------------
 mm/memcontrol.c            | 87 ++++++++++++++++++++++++++++++++------
 2 files changed, 81 insertions(+), 68 deletions(-)
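
For context on why the dynamic allocation helps: once struct lruvec_stats
lives behind a pointer and its layout is private to memcontrol.c, a later
patch can size it by a memcg-specific item count instead of
NR_VM_NODE_STAT_ITEMS without touching memcontrol.h users. A minimal sketch
of that direction (illustrative only; the item list, macro and layout below
are assumptions, not part of this patch):

	/* Hypothetical subset of node stat items that memcg actually tracks. */
	static const unsigned int memcg_node_stat_items[] = {
		NR_INACTIVE_ANON,
		NR_ACTIVE_ANON,
		NR_INACTIVE_FILE,
		NR_ACTIVE_FILE,
		/* ... */
	};
	#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)

	struct lruvec_stats {
		long state[NR_MEMCG_NODE_STAT_ITEMS];
		long state_local[NR_MEMCG_NODE_STAT_ITEMS];
		long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
	};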

Comments

Roman Gushchin April 29, 2024, 3:50 p.m. UTC | #1
On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we
> need to dynamically allocate lruvec_stats in the mem_cgroup_per_node
> structure. Also move the definition of lruvec_stats_percpu and
> lruvec_stats and related functions to the memcontrol.c to facilitate
> later patches. No functional changes in the patch.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>  include/linux/memcontrol.h | 62 +++------------------------
>  mm/memcontrol.c            | 87 ++++++++++++++++++++++++++++++++------
>  2 files changed, 81 insertions(+), 68 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9aba0d0462ca..ab8a6e884375 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -83,6 +83,8 @@ enum mem_cgroup_events_target {
>  
>  struct memcg_vmstats_percpu;
>  struct memcg_vmstats;
> +struct lruvec_stats_percpu;
> +struct lruvec_stats;
>  
>  struct mem_cgroup_reclaim_iter {
>  	struct mem_cgroup *position;
> @@ -90,25 +92,6 @@ struct mem_cgroup_reclaim_iter {
>  	unsigned int generation;
>  };
>  
> -struct lruvec_stats_percpu {
> -	/* Local (CPU and cgroup) state */
> -	long state[NR_VM_NODE_STAT_ITEMS];
> -
> -	/* Delta calculation for lockless upward propagation */
> -	long state_prev[NR_VM_NODE_STAT_ITEMS];
> -};
> -
> -struct lruvec_stats {
> -	/* Aggregated (CPU and subtree) state */
> -	long state[NR_VM_NODE_STAT_ITEMS];
> -
> -	/* Non-hierarchical (CPU aggregated) state */
> -	long state_local[NR_VM_NODE_STAT_ITEMS];
> -
> -	/* Pending child counts during tree propagation */
> -	long state_pending[NR_VM_NODE_STAT_ITEMS];
> -};
> -
>  /*
>   * per-node information in memory controller.
>   */
> @@ -116,7 +99,7 @@ struct mem_cgroup_per_node {
>  	struct lruvec		lruvec;
>  
>  	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
> -	struct lruvec_stats			lruvec_stats;
> +	struct lruvec_stats			*lruvec_stats;
>  
>  	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
>  
> @@ -1037,42 +1020,9 @@ static inline void mod_memcg_page_state(struct page *page,
>  }
>  
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
> -
> -static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> -					      enum node_stat_item idx)
> -{
> -	struct mem_cgroup_per_node *pn;
> -	long x;
> -
> -	if (mem_cgroup_disabled())
> -		return node_page_state(lruvec_pgdat(lruvec), idx);
> -
> -	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -	x = READ_ONCE(pn->lruvec_stats.state[idx]);
> -#ifdef CONFIG_SMP
> -	if (x < 0)
> -		x = 0;
> -#endif
> -	return x;
> -}
> -
> -static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> -						    enum node_stat_item idx)
> -{
> -	struct mem_cgroup_per_node *pn;
> -	long x = 0;
> -
> -	if (mem_cgroup_disabled())
> -		return node_page_state(lruvec_pgdat(lruvec), idx);
> -
> -	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -	x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
> -#ifdef CONFIG_SMP
> -	if (x < 0)
> -		x = 0;
> -#endif
> -	return x;
> -}
> +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
> +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> +				      enum node_stat_item idx);
>  
>  void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 53769d06053f..5e337ed6c6bf 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,6 +576,60 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>  	return mz;
>  }
>  
> +struct lruvec_stats_percpu {
> +	/* Local (CPU and cgroup) state */
> +	long state[NR_VM_NODE_STAT_ITEMS];
> +
> +	/* Delta calculation for lockless upward propagation */
> +	long state_prev[NR_VM_NODE_STAT_ITEMS];
> +};
> +
> +struct lruvec_stats {
> +	/* Aggregated (CPU and subtree) state */
> +	long state[NR_VM_NODE_STAT_ITEMS];
> +
> +	/* Non-hierarchical (CPU aggregated) state */
> +	long state_local[NR_VM_NODE_STAT_ITEMS];
> +
> +	/* Pending child counts during tree propagation */
> +	long state_pending[NR_VM_NODE_STAT_ITEMS];
> +};
> +
> +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	long x;
> +
> +	if (mem_cgroup_disabled())
> +		return node_page_state(lruvec_pgdat(lruvec), idx);
> +
> +	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +	x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +#ifdef CONFIG_SMP
> +	if (x < 0)
> +		x = 0;
> +#endif
> +	return x;
> +}
> +
> +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> +				      enum node_stat_item idx)
> +{
> +	struct mem_cgroup_per_node *pn;
> +	long x = 0;
> +
> +	if (mem_cgroup_disabled())
> +		return node_page_state(lruvec_pgdat(lruvec), idx);
> +
> +	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +	x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +#ifdef CONFIG_SMP
> +	if (x < 0)
> +		x = 0;
> +#endif

Not directly related to your change, but do we still need it? And if yes,
do we really care about the !CONFIG_SMP case enough to justify these #ifdefs?

> +	return x;
> +}
> +
>  /* Subset of vm_event_item to report for memcg event stats */
>  static const unsigned int memcg_vm_event_stat[] = {
>  	PGPGIN,
> @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  	if (!pn)
>  		return 1;
>  
> +	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> +					node);

Why not GFP_KERNEL_ACCOUNT?

> +	if (!pn->lruvec_stats)
> +		goto fail;
> +
>  	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
>  						   GFP_KERNEL_ACCOUNT);
> -	if (!pn->lruvec_stats_percpu) {
> -		kfree(pn);
> -		return 1;
> -	}
> +	if (!pn->lruvec_stats_percpu)
> +		goto fail;
>  
>  	lruvec_init(&pn->lruvec);
>  	pn->memcg = memcg;
>  
>  	memcg->nodeinfo[node] = pn;
>  	return 0;
> +fail:
> +	kfree(pn->lruvec_stats);
> +	kfree(pn);
> +	return 1;
>  }
>  
>  static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> @@ -5514,6 +5575,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  		return;
>  
>  	free_percpu(pn->lruvec_stats_percpu);
> +	kfree(pn->lruvec_stats);
>  	kfree(pn);
>  }
>  
> @@ -5866,18 +5928,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  
>  	for_each_node_state(nid, N_MEMORY) {
>  		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
> -		struct mem_cgroup_per_node *ppn = NULL;
> +		struct lruvec_stats *lstats = pn->lruvec_stats;
> +		struct lruvec_stats *plstats = NULL;
>  		struct lruvec_stats_percpu *lstatc;
>  
>  		if (parent)
> -			ppn = parent->nodeinfo[nid];
> +			plstats = parent->nodeinfo[nid]->lruvec_stats;
>  
>  		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>  
>  		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> -			delta = pn->lruvec_stats.state_pending[i];
> +			delta = lstats->state_pending[i];
>  			if (delta)
> -				pn->lruvec_stats.state_pending[i] = 0;
> +				lstats->state_pending[i] = 0;
>  
>  			delta_cpu = 0;
>  			v = READ_ONCE(lstatc->state[i]);
> @@ -5888,12 +5951,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  			}
>  
>  			if (delta_cpu)
> -				pn->lruvec_stats.state_local[i] += delta_cpu;
> +				lstats->state_local[i] += delta_cpu;
>  
>  			if (delta) {
> -				pn->lruvec_stats.state[i] += delta;
> -				if (ppn)
> -					ppn->lruvec_stats.state_pending[i] += delta;
> +				lstats->state[i] += delta;
> +				if (plstats)
> +					plstats->state_pending[i] += delta;
>  			}
>  		}
>  	}
> -- 
> 2.43.0
>
Shakeel Butt April 29, 2024, 7:46 p.m. UTC | #2
On Mon, Apr 29, 2024 at 08:50:11AM -0700, Roman Gushchin wrote:
> On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
[...]
> > +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > +				      enum node_stat_item idx)
> > +{
> > +	struct mem_cgroup_per_node *pn;
> > +	long x = 0;
> > +
> > +	if (mem_cgroup_disabled())
> > +		return node_page_state(lruvec_pgdat(lruvec), idx);
> > +
> > +	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > +	x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> > +#ifdef CONFIG_SMP
> > +	if (x < 0)
> > +		x = 0;
> > +#endif
> 
> Not directly related to your change, but do we still need it? And if yes,
> do we really care about !CONFIG_SMP case enough to justify these #ifdefs?
> 

That's a good question, and I think this is still needed. Particularly on
large machines with a large number of CPUs, we can have a situation where
the flusher is currently flushing CPU 100 while, in parallel, some workload
allocates a lot of pages on, let's say, CPU 0 and frees them on CPU 200.
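
A rough, self-contained illustration (not kernel code; the three-CPU setup
and the numbers are made up) of how the aggregate can transiently be seen
negative while such a flush is in progress, which is what the CONFIG_SMP
clamp guards against:

	/* Illustrative only: one stat item, pages allocated on CPU 0 and
	 * freed on CPU 2 after CPU 0 has already been flushed. */
	#include <stdio.h>

	int main(void)
	{
		long percpu_delta[3] = { +512, 0, -512 };
		long aggregated = 0;

		aggregated += percpu_delta[2];		/* flusher folds CPU 2 */
		printf("mid-flush view: %ld\n", aggregated);	/* -512, reader clamps to 0 */
		aggregated += percpu_delta[0];		/* next pass folds CPU 0 */
		printf("settled view: %ld\n", aggregated);	/* 0 */
		return 0;
	}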

> > +	return x;
> > +}
> > +
> >  /* Subset of vm_event_item to report for memcg event stats */
> >  static const unsigned int memcg_vm_event_stat[] = {
> >  	PGPGIN,
> > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> >  	if (!pn)
> >  		return 1;
> >  
> > +	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> > +					node);
> 
> Why not GFP_KERNEL_ACCOUNT?
> 

Previously struct lruvec_stats was part of struct mem_cgroup_per_node,
and we use GFP_KERNEL to allocate struct mem_cgroup_per_node. I kept the
behavior the same; if we want to switch to GFP_KERNEL_ACCOUNT, I think it
should be a separate patch.
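
For reference, GFP_KERNEL_ACCOUNT only adds the kmem-accounting bit on top
of GFP_KERNEL (definition as found in the kernel's gfp headers), so such a
switch would change how the allocation is charged, not the allocation
itself:

	#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)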

Thanks for the review.
Shakeel
Roman Gushchin April 29, 2024, 9:02 p.m. UTC | #3
On Mon, Apr 29, 2024 at 12:46:32PM -0700, Shakeel Butt wrote:
> On Mon, Apr 29, 2024 at 08:50:11AM -0700, Roman Gushchin wrote:
> > On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> [...]
> 
> > > +	return x;
> > > +}
> > > +
> > >  /* Subset of vm_event_item to report for memcg event stats */
> > >  static const unsigned int memcg_vm_event_stat[] = {
> > >  	PGPGIN,
> > > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> > >  	if (!pn)
> > >  		return 1;
> > >  
> > > +	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> > > +					node);
> > 
> > Why not GFP_KERNEL_ACCOUNT?
> > 
> 
> Previously struct lruvec_stats was part of struct mem_cgroup_per_node
> and we use GFP_KERNEL to allocate struct mem_cgroup_per_node. I kept the
> behavior same and if we want to switch to GFP_KERNEL_ACCOUNT, I think it
> should be a separate patch.

Agree. Here is the patch:

--

From fd6854c0b272c5314bce6c9dee7d3c8f8cee3a86 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Mon, 29 Apr 2024 13:57:26 -0700
Subject: [PATCH] mm: memcg: account memory used for memcg vmstats and lruvec
 stats

The percpu memory used by memcg's memory statistics is already accounted.
For consistency, let's enable accounting for vmstats and lruvec stats
as well.

Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 mm/memcontrol.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d11536ef59ef..2fe25d49cfaa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5661,8 +5661,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
-	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
-					node);
+	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
+					GFP_KERNEL_ACCOUNT, node);
 	if (!pn->lruvec_stats)
 		goto fail;
 
@@ -5733,7 +5733,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 		goto fail;
 	}
 
-	memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
+	memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
+				 GFP_KERNEL_ACCOUNT);
 	if (!memcg->vmstats)
 		goto fail;
Shakeel Butt April 29, 2024, 9:59 p.m. UTC | #4
On Mon, Apr 29, 2024 at 02:02:23PM -0700, Roman Gushchin wrote:
> On Mon, Apr 29, 2024 at 12:46:32PM -0700, Shakeel Butt wrote:
> > On Mon, Apr 29, 2024 at 08:50:11AM -0700, Roman Gushchin wrote:
> > > On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> > [...]
> > 
> > > > +	return x;
> > > > +}
> > > > +
> > > >  /* Subset of vm_event_item to report for memcg event stats */
> > > >  static const unsigned int memcg_vm_event_stat[] = {
> > > >  	PGPGIN,
> > > > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> > > >  	if (!pn)
> > > >  		return 1;
> > > >  
> > > > +	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> > > > +					node);
> > > 
> > > Why not GFP_KERNEL_ACCOUNT?
> > > 
> > 
> > Previously struct lruvec_stats was part of struct mem_cgroup_per_node
> > and we use GFP_KERNEL to allocate struct mem_cgroup_per_node. I kept the
> > behavior same and if we want to switch to GFP_KERNEL_ACCOUNT, I think it
> > should be a separate patch.
> 
> Agree. Here is the patch:

Awesome, I will pull this in my v3 post.

> 
> --
> 
> From fd6854c0b272c5314bce6c9dee7d3c8f8cee3a86 Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <roman.gushchin@linux.dev>
> Date: Mon, 29 Apr 2024 13:57:26 -0700
> Subject: [PATCH] mm: memcg: account memory used for memcg vmstats and lruvec
>  stats
> 
> The percpu memory used by memcg's memory statistics is already accounted.
> For consistency, let's enable accounting for vmstats and lruvec stats
> as well.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>

> ---
>  mm/memcontrol.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d11536ef59ef..2fe25d49cfaa 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5661,8 +5661,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
>  	if (!pn)
>  		return 1;
>  
> -	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> -					node);
> +	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
> +					GFP_KERNEL_ACCOUNT, node);
>  	if (!pn->lruvec_stats)
>  		goto fail;
>  
> @@ -5733,7 +5733,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>  		goto fail;
>  	}
>  
> -	memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
> +	memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
> +				 GFP_KERNEL_ACCOUNT);
>  	if (!memcg->vmstats)
>  		goto fail;
>  
> -- 
> 2.43.2
>

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9aba0d0462ca..ab8a6e884375 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -83,6 +83,8 @@  enum mem_cgroup_events_target {
 
 struct memcg_vmstats_percpu;
 struct memcg_vmstats;
+struct lruvec_stats_percpu;
+struct lruvec_stats;
 
 struct mem_cgroup_reclaim_iter {
 	struct mem_cgroup *position;
@@ -90,25 +92,6 @@  struct mem_cgroup_reclaim_iter {
 	unsigned int generation;
 };
 
-struct lruvec_stats_percpu {
-	/* Local (CPU and cgroup) state */
-	long state[NR_VM_NODE_STAT_ITEMS];
-
-	/* Delta calculation for lockless upward propagation */
-	long state_prev[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct lruvec_stats {
-	/* Aggregated (CPU and subtree) state */
-	long state[NR_VM_NODE_STAT_ITEMS];
-
-	/* Non-hierarchical (CPU aggregated) state */
-	long state_local[NR_VM_NODE_STAT_ITEMS];
-
-	/* Pending child counts during tree propagation */
-	long state_pending[NR_VM_NODE_STAT_ITEMS];
-};
-
 /*
  * per-node information in memory controller.
  */
@@ -116,7 +99,7 @@  struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
 
 	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
-	struct lruvec_stats			lruvec_stats;
+	struct lruvec_stats			*lruvec_stats;
 
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
@@ -1037,42 +1020,9 @@  static inline void mod_memcg_page_state(struct page *page,
 }
 
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
-
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
-					      enum node_stat_item idx)
-{
-	struct mem_cgroup_per_node *pn;
-	long x;
-
-	if (mem_cgroup_disabled())
-		return node_page_state(lruvec_pgdat(lruvec), idx);
-
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats.state[idx]);
-#ifdef CONFIG_SMP
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
-
-static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
-						    enum node_stat_item idx)
-{
-	struct mem_cgroup_per_node *pn;
-	long x = 0;
-
-	if (mem_cgroup_disabled())
-		return node_page_state(lruvec_pgdat(lruvec), idx);
-
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
-#ifdef CONFIG_SMP
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
+unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
+unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+				      enum node_stat_item idx);
 
 void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53769d06053f..5e337ed6c6bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -576,6 +576,60 @@  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
+struct lruvec_stats_percpu {
+	/* Local (CPU and cgroup) state */
+	long state[NR_VM_NODE_STAT_ITEMS];
+
+	/* Delta calculation for lockless upward propagation */
+	long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+	/* Aggregated (CPU and subtree) state */
+	long state[NR_VM_NODE_STAT_ITEMS];
+
+	/* Non-hierarchical (CPU aggregated) state */
+	long state_local[NR_VM_NODE_STAT_ITEMS];
+
+	/* Pending child counts during tree propagation */
+	long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
+unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
+{
+	struct mem_cgroup_per_node *pn;
+	long x;
+
+	if (mem_cgroup_disabled())
+		return node_page_state(lruvec_pgdat(lruvec), idx);
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	x = READ_ONCE(pn->lruvec_stats->state[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
+unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+				      enum node_stat_item idx)
+{
+	struct mem_cgroup_per_node *pn;
+	long x = 0;
+
+	if (mem_cgroup_disabled())
+		return node_page_state(lruvec_pgdat(lruvec), idx);
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 /* Subset of vm_event_item to report for memcg event stats */
 static const unsigned int memcg_vm_event_stat[] = {
 	PGPGIN,
@@ -5492,18 +5546,25 @@  static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
+	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
+					node);
+	if (!pn->lruvec_stats)
+		goto fail;
+
 	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
 						   GFP_KERNEL_ACCOUNT);
-	if (!pn->lruvec_stats_percpu) {
-		kfree(pn);
-		return 1;
-	}
+	if (!pn->lruvec_stats_percpu)
+		goto fail;
 
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
 
 	memcg->nodeinfo[node] = pn;
 	return 0;
+fail:
+	kfree(pn->lruvec_stats);
+	kfree(pn);
+	return 1;
 }
 
 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
@@ -5514,6 +5575,7 @@  static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 		return;
 
 	free_percpu(pn->lruvec_stats_percpu);
+	kfree(pn->lruvec_stats);
 	kfree(pn);
 }
 
@@ -5866,18 +5928,19 @@  static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 
 	for_each_node_state(nid, N_MEMORY) {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
-		struct mem_cgroup_per_node *ppn = NULL;
+		struct lruvec_stats *lstats = pn->lruvec_stats;
+		struct lruvec_stats *plstats = NULL;
 		struct lruvec_stats_percpu *lstatc;
 
 		if (parent)
-			ppn = parent->nodeinfo[nid];
+			plstats = parent->nodeinfo[nid]->lruvec_stats;
 
 		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
 
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
-			delta = pn->lruvec_stats.state_pending[i];
+			delta = lstats->state_pending[i];
 			if (delta)
-				pn->lruvec_stats.state_pending[i] = 0;
+				lstats->state_pending[i] = 0;
 
 			delta_cpu = 0;
 			v = READ_ONCE(lstatc->state[i]);
@@ -5888,12 +5951,12 @@  static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			}
 
 			if (delta_cpu)
-				pn->lruvec_stats.state_local[i] += delta_cpu;
+				lstats->state_local[i] += delta_cpu;
 
 			if (delta) {
-				pn->lruvec_stats.state[i] += delta;
-				if (ppn)
-					ppn->lruvec_stats.state_pending[i] += delta;
+				lstats->state[i] += delta;
+				if (plstats)
+					plstats->state_pending[i] += delta;
 			}
 		}
 	}