Message ID | 20230726002904.655377-2-yosryahmed@google.com (mailing list archive)
---|---
State | New
Series | [v2] mm: memcg: use rstat for non-hierarchical stats
On Tue, Jul 25, 2023 at 5:29 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> Currently, memcg uses rstat to maintain hierarchical stats. Counters are
> maintained for hierarchical stats at each memcg. Rstat tracks which
> cgroups have updates on which cpus to keep those counters fresh on the
> read-side.
>
> For non-hierarchical stats, we do not maintain counters. Instead, the
> percpu counters for a given stat need to be summed to get the
> non-hierarchical stat value. The original implementation did the same.
> At some point before rstat, non-hierarchical counters were introduced by
> commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> memory.stat reporting"). However, those counters were updated on the
> performance critical write-side, which caused regressions, so they were
> later removed by commit 815744d75152 ("mm: memcontrol: don't batch
> updates of local VM stats and events"). See [1] for more detailed
> history.
>
> Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
> half) enjoyed cheap reads of non-hierarchical stats, specifically on
> cgroup v1. When moving to more recent kernels, a performance regression
> for reading non-hierarchical stats is observed.
>
> Now that we have rstat, we know exactly which percpu counters have
> updates for each stat. We can maintain non-hierarchical counters again,
> making reads much more efficient, without affecting the performance
> critical write-side. Hence, add non-hierarchical (i.e local) counters
> for the stats, and extend rstat flushing to keep those up-to-date.
>
> A caveat is that we now a stats flush before reading
> local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
> or memcg_events_local(), where we previously only needed a flush to
> read hierarchical stats. Most contexts reading non-hierarchical stats
> are already doing a flush, add a flush to the only missing context in
> count_shadow_nodes().
>
> With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
> machine with 256 cpus on cgroup v1:
>  # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
>  # time cat /dev/cgroup/memory/cg*/memory.stat > /dev/null
>  real	0m0.125s
>  user	0m0.005s
>  sys	0m0.120s
>
> After:
>  real	0m0.032s
>  user	0m0.005s
>  sys	0m0.027s
>
> [1]https://lore.kernel.org/lkml/20230725201811.GA1231514@cmpxchg.org/
>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> ---
>
> v1 -> v2:
> - Rewrite the changelog based on the history context provided by
>   Johannes (Thanks!).
> - Fix a subtle bug where updating a local counter would be missed if it
>   was cancelled out by a pending update from child memcgs.

Johannes, I fixed a subtle bug here and I kept your Ack, I wasn't sure
what the Ack retention policy should be here. A quick look at the fix
would be great. Thanks!
>
> ---
>  include/linux/memcontrol.h |  7 ++--
>  mm/memcontrol.c            | 67 +++++++++++++++++++++-----------------
>  mm/workingset.c            |  1 +
>  3 files changed, 43 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5818af8eca5a..a9f2861a57a5 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -112,6 +112,9 @@ struct lruvec_stats {
>  	/* Aggregated (CPU and subtree) state */
>  	long state[NR_VM_NODE_STAT_ITEMS];
>
> +	/* Non-hierarchical (CPU aggregated) state */
> +	long state_local[NR_VM_NODE_STAT_ITEMS];
> +
>  	/* Pending child counts during tree propagation */
>  	long state_pending[NR_VM_NODE_STAT_ITEMS];
>  };
> @@ -1020,14 +1023,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  {
>  	struct mem_cgroup_per_node *pn;
>  	long x = 0;
> -	int cpu;
>
>  	if (mem_cgroup_disabled())
>  		return node_page_state(lruvec_pgdat(lruvec), idx);
>
>  	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -	for_each_possible_cpu(cpu)
> -		x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
> +	x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
>  #ifdef CONFIG_SMP
>  	if (x < 0)
>  		x = 0;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e8ca4bdcb03c..50f8035e998a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -742,6 +742,10 @@ struct memcg_vmstats {
>  	long state[MEMCG_NR_STAT];
>  	unsigned long events[NR_MEMCG_EVENTS];
>
> +	/* Non-hierarchical (CPU aggregated) page state & events */
> +	long state_local[MEMCG_NR_STAT];
> +	unsigned long events_local[NR_MEMCG_EVENTS];
> +
>  	/* Pending child counts during tree propagation */
>  	long state_pending[MEMCG_NR_STAT];
>  	unsigned long events_pending[NR_MEMCG_EVENTS];
> @@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
>  static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
>  {
> -	long x = 0;
> -	int cpu;
> +	long x = READ_ONCE(memcg->vmstats->state_local[idx]);
>
> -	for_each_possible_cpu(cpu)
> -		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
>  #ifdef CONFIG_SMP
>  	if (x < 0)
>  		x = 0;
> @@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
>
>  static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
>  {
> -	long x = 0;
> -	int cpu;
>  	int index = memcg_events_index(event);
>
>  	if (index < 0)
>  		return 0;
>
> -	for_each_possible_cpu(cpu)
> -		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
> -	return x;
> +	return READ_ONCE(memcg->vmstats->events_local[index]);
>  }
>
>  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
> @@ -5526,7 +5523,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>  	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
>  	struct memcg_vmstats_percpu *statc;
> -	long delta, v;
> +	long delta, delta_cpu, v;
>  	int i, nid;
>
>  	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
> @@ -5542,19 +5539,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  			memcg->vmstats->state_pending[i] = 0;
>
>  		/* Add CPU changes on this level since the last flush */
> +		delta_cpu = 0;
>  		v = READ_ONCE(statc->state[i]);
>  		if (v != statc->state_prev[i]) {
> -			delta += v - statc->state_prev[i];
> +			delta_cpu = v - statc->state_prev[i];
> +			delta += delta_cpu;
>  			statc->state_prev[i] = v;
>  		}
>
> -		if (!delta)
> -			continue;
> -
>  		/* Aggregate counts on this level and propagate upwards */
> -		memcg->vmstats->state[i] += delta;
> -		if (parent)
> -			parent->vmstats->state_pending[i] += delta;
> +		if (delta_cpu)
> +			memcg->vmstats->state_local[i] += delta_cpu;
> +
> +		if (delta) {
> +			memcg->vmstats->state[i] += delta;
> +			if (parent)
> +				parent->vmstats->state_pending[i] += delta;
> +		}
>  	}
>
>  	for (i = 0; i < NR_MEMCG_EVENTS; i++) {
> @@ -5562,18 +5563,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  		if (delta)
>  			memcg->vmstats->events_pending[i] = 0;
>
> +		delta_cpu = 0;
>  		v = READ_ONCE(statc->events[i]);
>  		if (v != statc->events_prev[i]) {
> -			delta += v - statc->events_prev[i];
> +			delta_cpu = v - statc->events_prev[i];
> +			delta += delta_cpu;
>  			statc->events_prev[i] = v;
>  		}
>
> -		if (!delta)
> -			continue;
> +		if (delta_cpu)
> +			memcg->vmstats->events_local[i] += delta_cpu;
>
> -		memcg->vmstats->events[i] += delta;
> -		if (parent)
> -			parent->vmstats->events_pending[i] += delta;
> +		if (delta) {
> +			memcg->vmstats->events[i] += delta;
> +			if (parent)
> +				parent->vmstats->events_pending[i] += delta;
> +		}
>  	}
>
>  	for_each_node_state(nid, N_MEMORY) {
> @@ -5591,18 +5596,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>  			if (delta)
>  				pn->lruvec_stats.state_pending[i] = 0;
>
> +			delta_cpu = 0;
>  			v = READ_ONCE(lstatc->state[i]);
>  			if (v != lstatc->state_prev[i]) {
> -				delta += v - lstatc->state_prev[i];
> +				delta_cpu = v - lstatc->state_prev[i];
> +				delta += delta_cpu;
>  				lstatc->state_prev[i] = v;
>  			}
>
> -			if (!delta)
> -				continue;
> +			if (delta_cpu)
> +				pn->lruvec_stats.state_local[i] += delta_cpu;
>
> -			pn->lruvec_stats.state[i] += delta;
> -			if (ppn)
> -				ppn->lruvec_stats.state_pending[i] += delta;
> +			if (delta) {
> +				pn->lruvec_stats.state[i] += delta;
> +				if (ppn)
> +					ppn->lruvec_stats.state_pending[i] += delta;
> +			}
>  		}
>  	}
>  }
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 4686ae363000..da58a26d0d4d 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
>  	struct lruvec *lruvec;
>  	int i;
>
> +	mem_cgroup_flush_stats();
>  	lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
>  	for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
>  		pages += lruvec_page_state_local(lruvec,
> --
> 2.41.0.487.g6d72f3e995-goog
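[Editorial illustration] To see how the three counters the patch leaves each memcg with fit together, here is a toy user-space model of one flush pass. This is a sketch, not the kernel code: the struct, the single flattened cpu_delta, and the numbers are all illustrative stand-ins for the fields the patch touches.

	/* Toy model (not kernel code) of the counters, flushed bottom-up
	 * the way rstat walks the tree. */
	#include <stdio.h>

	struct memcg {
		struct memcg *parent;
		long state;         /* hierarchical: self + all descendants */
		long state_local;   /* non-hierarchical: self only */
		long state_pending; /* child deltas parked here between flushes */
		long cpu_delta;     /* net per-cpu updates since the last flush */
	};

	/* One flush step for one cgroup: fold in pending child deltas and
	 * this cgroup's own per-cpu delta, then propagate the sum upwards. */
	static void flush_one(struct memcg *mc)
	{
		long delta = mc->state_pending;
		long delta_cpu = mc->cpu_delta;

		mc->state_pending = 0;
		mc->cpu_delta = 0;

		if (delta_cpu)
			mc->state_local += delta_cpu; /* self-only counter */
		delta += delta_cpu;
		if (delta) {
			mc->state += delta;           /* hierarchical counter */
			if (mc->parent)
				mc->parent->state_pending += delta;
		}
	}

	int main(void)
	{
		struct memcg parent = { 0 }, child = { .parent = &parent };

		child.cpu_delta = 10;  /* e.g. 10 pages charged in the child */
		parent.cpu_delta = 3;  /* 3 pages charged directly in the parent */

		/* rstat flushes children before their parent */
		flush_one(&child);
		flush_one(&parent);

		printf("child:  state=%ld local=%ld\n", child.state, child.state_local);
		printf("parent: state=%ld local=%ld\n", parent.state, parent.state_local);
		/* child:  state=10 local=10
		 * parent: state=13 local=3 -- hierarchical includes the child,
		 * local does not. */
		return 0;
	}

The hierarchical counter ends up including the child's pages while the local counter does not, which is exactly the distinction the patch makes cheap to read again.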
On Wed, Jul 26, 2023 at 12:29:04AM +0000, Yosry Ahmed wrote:
> Currently, memcg uses rstat to maintain hierarchical stats. Counters are
> maintained for hierarchical stats at each memcg. Rstat tracks which
> cgroups have updates on which cpus to keep those counters fresh on the
> read-side.
>
> For non-hierarchical stats, we do not maintain counters. Instead, the

global?

> percpu counters for a given stat need to be summed to get the
> non-hierarchical stat value. The original implementation did the same.
> At some point before rstat, non-hierarchical counters were introduced by
> commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> memory.stat reporting"). However, those counters were updated on the
> performance critical write-side, which caused regressions, so they were
> later removed by commit 815744d75152 ("mm: memcontrol: don't batch
> updates of local VM stats and events"). See [1] for more detailed
> history.
>
> Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
> half) enjoyed cheap reads of non-hierarchical stats, specifically on
> cgroup v1. When moving to more recent kernels, a performance regression
> for reading non-hierarchical stats is observed.
>
> Now that we have rstat, we know exactly which percpu counters have
> updates for each stat. We can maintain non-hierarchical counters again,
> making reads much more efficient, without affecting the performance
> critical write-side. Hence, add non-hierarchical (i.e local) counters
> for the stats, and extend rstat flushing to keep those up-to-date.
>
> A caveat is that we now a stats flush before reading

need?

> local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
> or memcg_events_local(), where we previously only needed a flush to
> read hierarchical stats. Most contexts reading non-hierarchical stats
> are already doing a flush, add a flush to the only missing context in
> count_shadow_nodes().
>
> With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
> machine with 256 cpus on cgroup v1:
>  # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
>  # time cat /dev/cgroup/memory/cg*/memory.stat > /dev/null
>  real	0m0.125s
>  user	0m0.005s
>  sys	0m0.120s
>
> After:
>  real	0m0.032s
>  user	0m0.005s
>  sys	0m0.027s
>
> [1]https://lore.kernel.org/lkml/20230725201811.GA1231514@cmpxchg.org/
>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>

Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

Thank you!
On Tue, Jul 25, 2023 at 7:15 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> On Wed, Jul 26, 2023 at 12:29:04AM +0000, Yosry Ahmed wrote:
> > Currently, memcg uses rstat to maintain hierarchical stats. Counters are
> > maintained for hierarchical stats at each memcg. Rstat tracks which
> > cgroups have updates on which cpus to keep those counters fresh on the
> > read-side.
> >
> > For non-hierarchical stats, we do not maintain counters. Instead, the
> global?

Do you mean "we do not maintain global counters"? I think "global" is
confusing, because it can be thought of as all cpus or as including
the subtree (as opposed to local for non-hierarchical stats).

> > percpu counters for a given stat need to be summed to get the
> > non-hierarchical stat value. The original implementation did the same.
> > At some point before rstat, non-hierarchical counters were introduced by
> > commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> > memory.stat reporting"). However, those counters were updated on the
> > performance critical write-side, which caused regressions, so they were
> > later removed by commit 815744d75152 ("mm: memcontrol: don't batch
> > updates of local VM stats and events"). See [1] for more detailed
> > history.
> >
> > Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
> > half) enjoyed cheap reads of non-hierarchical stats, specifically on
> > cgroup v1. When moving to more recent kernels, a performance regression
> > for reading non-hierarchical stats is observed.
> >
> > Now that we have rstat, we know exactly which percpu counters have
> > updates for each stat. We can maintain non-hierarchical counters again,
> > making reads much more efficient, without affecting the performance
> > critical write-side. Hence, add non-hierarchical (i.e local) counters
> > for the stats, and extend rstat flushing to keep those up-to-date.
> >
> > A caveat is that we now a stats flush before reading
> need?

Ah yes. I am hoping Andrew can amend this but I am happy to send a v3
as well.

> > local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
> > or memcg_events_local(), where we previously only needed a flush to
> > read hierarchical stats. Most contexts reading non-hierarchical stats
> > are already doing a flush, add a flush to the only missing context in
> > count_shadow_nodes().
> >
> > With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
> > machine with 256 cpus on cgroup v1:
> >  # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
> >  # time cat /dev/cgroup/memory/cg*/memory.stat > /dev/null
> >  real	0m0.125s
> >  user	0m0.005s
> >  sys	0m0.120s
> >
> > After:
> >  real	0m0.032s
> >  user	0m0.005s
> >  sys	0m0.027s
> >
> > [1]https://lore.kernel.org/lkml/20230725201811.GA1231514@cmpxchg.org/
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > Acked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

Thanks!

>
> Thank you!
On Tue, Jul 25, 2023 at 05:36:45PM -0700, Yosry Ahmed wrote:
> On Tue, Jul 25, 2023 at 5:29 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > - Fix a subtle bug where updating a local counter would be missed if it
> >   was cancelled out by a pending update from child memcgs.
> >
>
> Johannes, I fixed a subtle bug here and I kept your Ack, I wasn't sure
> what the Ack retention policy should be here. A quick look at the fix
> would be great.

Ah, I found it:

> > @@ -5542,19 +5539,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> >  			memcg->vmstats->state_pending[i] = 0;
> >
> >  		/* Add CPU changes on this level since the last flush */
> > +		delta_cpu = 0;
> >  		v = READ_ONCE(statc->state[i]);
> >  		if (v != statc->state_prev[i]) {
> > -			delta += v - statc->state_prev[i];
> > +			delta_cpu = v - statc->state_prev[i];
> > +			delta += delta_cpu;
> >  			statc->state_prev[i] = v;
> >  		}
> >
> > -		if (!delta)
> > -			continue;
> > -
> >  		/* Aggregate counts on this level and propagate upwards */
> > -		memcg->vmstats->state[i] += delta;
> > -		if (parent)
> > -			parent->vmstats->state_pending[i] += delta;
> > +		if (delta_cpu)
> > +			memcg->vmstats->state_local[i] += delta_cpu;

When delta nulls out, but delta_cpu is non-zero... subtle.

This fixed version looks good, please keep my ack :)
On Wed, Jul 26, 2023 at 8:13 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jul 25, 2023 at 05:36:45PM -0700, Yosry Ahmed wrote:
> > On Tue, Jul 25, 2023 at 5:29 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > - Fix a subtle bug where updating a local counter would be missed if it
> > >   was cancelled out by a pending update from child memcgs.
> > >
> >
> > Johannes, I fixed a subtle bug here and I kept your Ack, I wasn't sure
> > what the Ack retention policy should be here. A quick look at the fix
> > would be great.
>
> Ah, I found it:
>
> > > @@ -5542,19 +5539,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> > >  			memcg->vmstats->state_pending[i] = 0;
> > >
> > >  		/* Add CPU changes on this level since the last flush */
> > > +		delta_cpu = 0;
> > >  		v = READ_ONCE(statc->state[i]);
> > >  		if (v != statc->state_prev[i]) {
> > > -			delta += v - statc->state_prev[i];
> > > +			delta_cpu = v - statc->state_prev[i];
> > > +			delta += delta_cpu;
> > >  			statc->state_prev[i] = v;
> > >  		}
> > >
> > > -		if (!delta)
> > > -			continue;
> > > -
> > >  		/* Aggregate counts on this level and propagate upwards */
> > > -		memcg->vmstats->state[i] += delta;
> > > -		if (parent)
> > > -			parent->vmstats->state_pending[i] += delta;
> > > +		if (delta_cpu)
> > > +			memcg->vmstats->state_local[i] += delta_cpu;
>
> When delta nulls out, but delta_cpu is non-zero... subtle.

Yup, led to a few missed updates, was not trivial to track down :)

>
> This fixed version looks good, please keep my ack :)

Thanks!
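[Editorial illustration] The cancellation being discussed here is easiest to see with concrete numbers. A minimal sketch (plain C, not kernel code; the field names mirror the patch but everything else is made up) of a flush where a pending child delta exactly cancels this CPU's local delta:

	#include <stdio.h>

	struct node {
		long state;         /* hierarchical counter */
		long state_local;   /* non-hierarchical counter (v2 only) */
		long state_pending; /* deltas propagated up from children */
		long cpu_state;     /* per-cpu counter (one CPU for simplicity) */
		long cpu_prev;      /* snapshot from the previous flush */
	};

	static void flush_v2(struct node *n)
	{
		long delta = n->state_pending;
		long delta_cpu = 0;

		n->state_pending = 0;

		if (n->cpu_state != n->cpu_prev) {
			delta_cpu = n->cpu_state - n->cpu_prev;
			delta += delta_cpu;
			n->cpu_prev = n->cpu_state;
		}

		/* v1 did the equivalent of "if (!delta) return;" here: with
		 * delta == 0 but delta_cpu == +8, the local update below was
		 * skipped even though cpu_prev had already been advanced, so
		 * the +8 was lost for good. */
		if (delta_cpu)
			n->state_local += delta_cpu;
		if (delta)
			n->state += delta;
	}

	int main(void)
	{
		/* A child moved 8 pages away (pending -8) while this CPU
		 * charged 8 pages locally (+8): the deltas cancel out. */
		struct node n = { .state_pending = -8, .cpu_state = 8 };

		flush_v2(&n);
		printf("state=%ld state_local=%ld\n", n.state, n.state_local);
		/* v2 prints state=0 state_local=8; the v1 ordering would
		 * have left state_local at 0. */
		return 0;
	}

Skipping the hierarchical update when delta is zero remains correct (the net change really is zero); only the self-only counter needs the separate delta_cpu check.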
On Tue, Jul 25, 2023 at 07:20:02PM -0700, Yosry Ahmed wrote:
> On Tue, Jul 25, 2023 at 7:15 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> >
> > On Wed, Jul 26, 2023 at 12:29:04AM +0000, Yosry Ahmed wrote:
> > > Currently, memcg uses rstat to maintain hierarchical stats. Counters are
> > > maintained for hierarchical stats at each memcg. Rstat tracks which
> > > cgroups have updates on which cpus to keep those counters fresh on the
> > > read-side.
> > >
> > > For non-hierarchical stats, we do not maintain counters. Instead, the
> > global?
>
> Do you mean "we do not maintain global counters"? I think "global" is
> confusing, because it can be thought of as all cpus or as including
> the subtree (as opposed to local for non-hierarchical stats).

"global" seems fine to me, I don't think it's ambiguous in the direct
comparison with per-cpu counts.

Alternatively, rephrase the whole thing? Something like:

"Non-hierarchical stats are currently not covered by rstat. Their
per-cpu counters are summed up on every read, which is expensive."
On Wed, Jul 26, 2023 at 8:20 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jul 25, 2023 at 07:20:02PM -0700, Yosry Ahmed wrote:
> > On Tue, Jul 25, 2023 at 7:15 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
> > >
> > > On Wed, Jul 26, 2023 at 12:29:04AM +0000, Yosry Ahmed wrote:
> > > > Currently, memcg uses rstat to maintain hierarchical stats. Counters are
> > > > maintained for hierarchical stats at each memcg. Rstat tracks which
> > > > cgroups have updates on which cpus to keep those counters fresh on the
> > > > read-side.
> > > >
> > > > For non-hierarchical stats, we do not maintain counters. Instead, the
> > > global?
> >
> > Do you mean "we do not maintain global counters"? I think "global" is
> > confusing, because it can be thought of as all cpus or as including
> > the subtree (as opposed to local for non-hierarchical stats).
>
> "global" seems fine to me, I don't think it's ambiguous in the direct
> comparison with per-cpu counts.
>
> Alternatively, rephrase the whole thing? Something like:
>
> "Non-hierarchical stats are currently not covered by rstat. Their
> per-cpu counters are summed up on every read, which is expensive."

Rephrasing sounds good to me. I will send a v3 with the correct commit
log and collected Acks to make Andrew's life easier.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5818af8eca5a..a9f2861a57a5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -112,6 +112,9 @@ struct lruvec_stats {
 	/* Aggregated (CPU and subtree) state */
 	long state[NR_VM_NODE_STAT_ITEMS];
 
+	/* Non-hierarchical (CPU aggregated) state */
+	long state_local[NR_VM_NODE_STAT_ITEMS];
+
 	/* Pending child counts during tree propagation */
 	long state_pending[NR_VM_NODE_STAT_ITEMS];
 };
@@ -1020,14 +1023,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 {
 	struct mem_cgroup_per_node *pn;
 	long x = 0;
-	int cpu;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	for_each_possible_cpu(cpu)
-		x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
+	x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..50f8035e998a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -742,6 +742,10 @@ struct memcg_vmstats {
 	long state[MEMCG_NR_STAT];
 	unsigned long events[NR_MEMCG_EVENTS];
 
+	/* Non-hierarchical (CPU aggregated) page state & events */
+	long state_local[MEMCG_NR_STAT];
+	unsigned long events_local[NR_MEMCG_EVENTS];
+
 	/* Pending child counts during tree propagation */
 	long state_pending[MEMCG_NR_STAT];
 	unsigned long events_pending[NR_MEMCG_EVENTS];
@@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 {
-	long x = 0;
-	int cpu;
+	long x = READ_ONCE(memcg->vmstats->state_local[idx]);
 
-	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-	long x = 0;
-	int cpu;
 	int index = memcg_events_index(event);
 
 	if (index < 0)
 		return 0;
 
-	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
-	return x;
+	return READ_ONCE(memcg->vmstats->events_local[index]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -5526,7 +5523,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	struct memcg_vmstats_percpu *statc;
-	long delta, v;
+	long delta, delta_cpu, v;
 	int i, nid;
 
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5542,19 +5539,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			memcg->vmstats->state_pending[i] = 0;
 
 		/* Add CPU changes on this level since the last flush */
+		delta_cpu = 0;
 		v = READ_ONCE(statc->state[i]);
 		if (v != statc->state_prev[i]) {
-			delta += v - statc->state_prev[i];
+			delta_cpu = v - statc->state_prev[i];
+			delta += delta_cpu;
 			statc->state_prev[i] = v;
 		}
 
-		if (!delta)
-			continue;
-
 		/* Aggregate counts on this level and propagate upwards */
-		memcg->vmstats->state[i] += delta;
-		if (parent)
-			parent->vmstats->state_pending[i] += delta;
+		if (delta_cpu)
+			memcg->vmstats->state_local[i] += delta_cpu;
+
+		if (delta) {
+			memcg->vmstats->state[i] += delta;
+			if (parent)
+				parent->vmstats->state_pending[i] += delta;
+		}
 	}
 
 	for (i = 0; i < NR_MEMCG_EVENTS; i++) {
@@ -5562,18 +5563,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (delta)
 			memcg->vmstats->events_pending[i] = 0;
 
+		delta_cpu = 0;
 		v = READ_ONCE(statc->events[i]);
 		if (v != statc->events_prev[i]) {
-			delta += v - statc->events_prev[i];
+			delta_cpu = v - statc->events_prev[i];
+			delta += delta_cpu;
 			statc->events_prev[i] = v;
 		}
 
-		if (!delta)
-			continue;
+		if (delta_cpu)
+			memcg->vmstats->events_local[i] += delta_cpu;
 
-		memcg->vmstats->events[i] += delta;
-		if (parent)
-			parent->vmstats->events_pending[i] += delta;
+		if (delta) {
+			memcg->vmstats->events[i] += delta;
+			if (parent)
+				parent->vmstats->events_pending[i] += delta;
+		}
 	}
 
 	for_each_node_state(nid, N_MEMORY) {
@@ -5591,18 +5596,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			if (delta)
 				pn->lruvec_stats.state_pending[i] = 0;
 
+			delta_cpu = 0;
 			v = READ_ONCE(lstatc->state[i]);
 			if (v != lstatc->state_prev[i]) {
-				delta += v - lstatc->state_prev[i];
+				delta_cpu = v - lstatc->state_prev[i];
+				delta += delta_cpu;
 				lstatc->state_prev[i] = v;
 			}
 
-			if (!delta)
-				continue;
+			if (delta_cpu)
+				pn->lruvec_stats.state_local[i] += delta_cpu;
 
-			pn->lruvec_stats.state[i] += delta;
-			if (ppn)
-				ppn->lruvec_stats.state_pending[i] += delta;
+			if (delta) {
+				pn->lruvec_stats.state[i] += delta;
+				if (ppn)
+					ppn->lruvec_stats.state_pending[i] += delta;
+			}
 		}
 	}
 }
diff --git a/mm/workingset.c b/mm/workingset.c
index 4686ae363000..da58a26d0d4d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 	struct lruvec *lruvec;
 	int i;
 
+	mem_cgroup_flush_stats();
 	lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
 	for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
 		pages += lruvec_page_state_local(lruvec,
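[Editorial illustration] For reference, the read-side cost the benchmark above is measuring, as a toy sketch (not kernel code; the array sizes and names are assumed stand-ins for the percpu counters and the new state_local field): a local stat read used to walk every possible CPU, and now it is a single load, with the summation work moved into the rstat flush.

	#include <stdio.h>

	#define NR_POSSIBLE_CPUS 256

	static long percpu_state[NR_POSSIBLE_CPUS]; /* per-cpu updates, write-side */
	static long state_local;                    /* maintained by the flush */

	/* Before: every local read sums one counter per possible CPU. */
	static long page_state_local_before(void)
	{
		long x = 0;

		for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
			x += percpu_state[cpu];
		return x;
	}

	/* After: one load; a stats flush beforehand keeps it fresh, and the
	 * flush only visits (cgroup, cpu) pairs with pending updates. */
	static long page_state_local_after(void)
	{
		return state_local;
	}

	int main(void)
	{
		percpu_state[0] = 5;
		percpu_state[7] = -2;
		state_local = 3; /* pretend a flush already folded the above in */

		printf("before-style read: %ld\n", page_state_local_before());
		printf("after-style read:  %ld\n", page_state_local_after());
		return 0;
	}

At 1000 cgroups on a 256-CPU machine, reading one stat across all cgroups is on the order of 256,000 additions before versus 1,000 loads after, which is what the 3x drop in sys time reflects.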