Message ID | 20201202182725.265020-7-shy828301@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Series | Make shrinker's nr_deferred memcg aware |
On Wed, Dec 02, 2020 at 10:27:22AM -0800, Yang Shi wrote: > Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > will be used in the following cases: > 1. Non memcg aware shrinkers > 2. !CONFIG_MEMCG > 3. memcg is disabled by boot parameter > > Signed-off-by: Yang Shi <shy828301@gmail.com> > --- > mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 82 insertions(+), 6 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index cba0bc8d4661..d569fdcaba79 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > static DEFINE_IDR(shrinker_idr); > static int shrinker_nr_max; > > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > +{ > + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > + !mem_cgroup_disabled(); > +} > + > static int prealloc_memcg_shrinker(struct shrinker *shrinker) > { > int id, ret = -ENOMEM; > @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > #endif > return false; > } > + > +static inline long count_nr_deferred(struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > + struct memcg_shrinker_deferred *deferred; > + struct mem_cgroup *memcg = sc->memcg; > + int nid = sc->nid; > + int id = shrinker->id; > + long nr; > + > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > + nid = 0; > + > + if (per_memcg_deferred) { > + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > + true); > + nr = atomic_long_xchg(&deferred->nr_deferred[id], 0); > + } else > + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); > + > + return nr; > +} > + > +static inline long set_nr_deferred(long nr, struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > + struct memcg_shrinker_deferred *deferred; > + struct mem_cgroup *memcg = sc->memcg; > + int nid = sc->nid; > + int id = shrinker->id; > + long new_nr; > + > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > + nid = 0; > + > + if (per_memcg_deferred) { > + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > + true); > + new_nr = atomic_long_add_return(nr, &deferred->nr_deferred[id]); > + } else > + new_nr = atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); > + > + return new_nr; > +} > #else > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > +{ > + return false; > +} > + > static int prealloc_memcg_shrinker(struct shrinker *shrinker) > { > return 0; > @@ -290,6 +347,29 @@ static bool writeback_throttling_sane(struct scan_control *sc) > { > return true; > } > + > +static inline long count_nr_deferred(struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + int nid = sc->nid; > + > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > + nid = 0; > + > + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); > +} > + > +static inline long set_nr_deferred(long nr, struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + int nid = sc->nid; > + > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > + nid = 0; > + > + return atomic_long_add_return(nr, > + &shrinker->nr_deferred[nid]); > +} > #endif > > /* > @@ -429,13 +509,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, > long freeable; > long nr; > long new_nr; > - int nid = shrinkctl->nid; > long batch_size = 
shrinker->batch ? shrinker->batch > : SHRINK_BATCH; > long scanned = 0, next_deferred; > > - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > - nid = 0; > > freeable = shrinker->count_objects(shrinker, shrinkctl); > if (freeable == 0 || freeable == SHRINK_EMPTY) > @@ -446,7 +523,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, > * and zero it so that other concurrent shrinker invocations > * don't also do this scanning work. > */ > - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); > + nr = count_nr_deferred(shrinker, shrinkctl); > > total_scan = nr; > if (shrinker->seeks) { > @@ -539,8 +616,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, > * move the unused scan count back into the shrinker in a > * manner that handles concurrent updates. > */ > - new_nr = atomic_long_add_return(next_deferred, > - &shrinker->nr_deferred[nid]); > + new_nr = set_nr_deferred(next_deferred, shrinker, shrinkctl); Ok, I think patch (1) can be just merged into this and then it would make total sense.
On Wed, Dec 2, 2020 at 7:08 PM Roman Gushchin <guro@fb.com> wrote: > > On Wed, Dec 02, 2020 at 10:27:22AM -0800, Yang Shi wrote: > > Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > > will be used in the following cases: > > 1. Non memcg aware shrinkers > > 2. !CONFIG_MEMCG > > 3. memcg is disabled by boot parameter > > > > Signed-off-by: Yang Shi <shy828301@gmail.com> > > --- > > mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > > 1 file changed, 82 insertions(+), 6 deletions(-) > > > > diff --git a/mm/vmscan.c b/mm/vmscan.c > > index cba0bc8d4661..d569fdcaba79 100644 > > --- a/mm/vmscan.c > > +++ b/mm/vmscan.c > > @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > > static DEFINE_IDR(shrinker_idr); > > static int shrinker_nr_max; > > > > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > > +{ > > + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > > + !mem_cgroup_disabled(); > > +} > > + > > static int prealloc_memcg_shrinker(struct shrinker *shrinker) > > { > > int id, ret = -ENOMEM; > > @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > > #endif > > return false; > > } > > + > > +static inline long count_nr_deferred(struct shrinker *shrinker, > > + struct shrink_control *sc) > > +{ > > + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > > + struct memcg_shrinker_deferred *deferred; > > + struct mem_cgroup *memcg = sc->memcg; > > + int nid = sc->nid; > > + int id = shrinker->id; > > + long nr; > > + > > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > + nid = 0; > > + > > + if (per_memcg_deferred) { > > + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > > + true); > > + nr = atomic_long_xchg(&deferred->nr_deferred[id], 0); > > + } else > > + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); > > + > > + return nr; > > +} > > + > > +static inline long set_nr_deferred(long nr, struct shrinker *shrinker, > > + struct shrink_control *sc) > > +{ > > + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > > + struct memcg_shrinker_deferred *deferred; > > + struct mem_cgroup *memcg = sc->memcg; > > + int nid = sc->nid; > > + int id = shrinker->id; > > + long new_nr; > > + > > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > + nid = 0; > > + > > + if (per_memcg_deferred) { > > + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > > + true); > > + new_nr = atomic_long_add_return(nr, &deferred->nr_deferred[id]); > > + } else > > + new_nr = atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); > > + > > + return new_nr; > > +} > > #else > > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > > +{ > > + return false; > > +} > > + > > static int prealloc_memcg_shrinker(struct shrinker *shrinker) > > { > > return 0; > > @@ -290,6 +347,29 @@ static bool writeback_throttling_sane(struct scan_control *sc) > > { > > return true; > > } > > + > > +static inline long count_nr_deferred(struct shrinker *shrinker, > > + struct shrink_control *sc) > > +{ > > + int nid = sc->nid; > > + > > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > + nid = 0; > > + > > + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); > > +} > > + > > +static inline long set_nr_deferred(long nr, struct shrinker *shrinker, > > + struct shrink_control *sc) > > +{ > > + int nid = sc->nid; > > + > > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > + nid = 0; > > + 
> > + return atomic_long_add_return(nr, > > + &shrinker->nr_deferred[nid]); > > +} > > #endif > > > > /* > > @@ -429,13 +509,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, > > long freeable; > > long nr; > > long new_nr; > > - int nid = shrinkctl->nid; > > long batch_size = shrinker->batch ? shrinker->batch > > : SHRINK_BATCH; > > long scanned = 0, next_deferred; > > > > - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > - nid = 0; > > > > freeable = shrinker->count_objects(shrinker, shrinkctl); > > if (freeable == 0 || freeable == SHRINK_EMPTY) > > @@ -446,7 +523,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, > > * and zero it so that other concurrent shrinker invocations > > * don't also do this scanning work. > > */ > > - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); > > + nr = count_nr_deferred(shrinker, shrinkctl); > > > > total_scan = nr; > > if (shrinker->seeks) { > > @@ -539,8 +616,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, > > * move the unused scan count back into the shrinker in a > > * manner that handles concurrent updates. > > */ > > - new_nr = atomic_long_add_return(next_deferred, > > - &shrinker->nr_deferred[nid]); > > + new_nr = set_nr_deferred(next_deferred, shrinker, shrinkctl); > > Ok, I think patch (1) can be just merged into this and then it would make total sense. Sure. Makes sense to me.
On 02.12.2020 21:27, Yang Shi wrote: > Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > will be used in the following cases: > 1. Non memcg aware shrinkers > 2. !CONFIG_MEMCG > 3. memcg is disabled by boot parameter > > Signed-off-by: Yang Shi <shy828301@gmail.com> > --- > mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 82 insertions(+), 6 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index cba0bc8d4661..d569fdcaba79 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > static DEFINE_IDR(shrinker_idr); > static int shrinker_nr_max; > > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > +{ > + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > + !mem_cgroup_disabled(); > +} > + > static int prealloc_memcg_shrinker(struct shrinker *shrinker) > { > int id, ret = -ENOMEM; > @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > #endif > return false; > } > + > +static inline long count_nr_deferred(struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > + struct memcg_shrinker_deferred *deferred; > + struct mem_cgroup *memcg = sc->memcg; > + int nid = sc->nid; > + int id = shrinker->id; > + long nr; > + > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > + nid = 0; > + > + if (per_memcg_deferred) { > + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > + true); My comment is about both 5/9 and 6/9 patches. shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur because of processor reordering on !x86 (there is no a common lock or memory barriers). Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). The map can't be NULL there. Regarding to shrinker_deferred you should prove either this is not a problem too, or to add proper synchronization (maybe, based on barriers) or to add some similar check (maybe, in shrink_slab_memcg() too). Kirill
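For illustration, a minimal userspace sketch of the reordering described above (C11 atomics and pthreads, not kernel code; css_online and shrinker_deferred here are stand-ins for the CSS_ONLINE flag and memcg->nodeinfo[nid]->shrinker_deferred). The writer publishes the pointer and then the flag with no ordering between the two stores, so a reader on a weakly ordered CPU can observe the flag set while the pointer load still returns NULL:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct deferred { long nr_deferred[1]; };

static _Atomic(struct deferred *) shrinker_deferred;	/* stand-in for the per-memcg pointer */
static atomic_int css_online;				/* stand-in for CSS_ONLINE */

static void *onliner(void *arg)
{
	struct deferred *d = calloc(1, sizeof(*d));

	/* No ordering between the two stores: they may become visible in either order. */
	atomic_store_explicit(&shrinker_deferred, d, memory_order_relaxed);
	atomic_store_explicit(&css_online, 1, memory_order_relaxed);
	return NULL;
}

static void *reclaimer(void *arg)
{
	/* The race Kirill points at: "online" is visible, the pointer is not yet. */
	if (atomic_load_explicit(&css_online, memory_order_relaxed) &&
	    !atomic_load_explicit(&shrinker_deferred, memory_order_relaxed))
		puts("saw online, but shrinker_deferred is still NULL");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, onliner, NULL);
	pthread_create(&b, NULL, reclaimer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}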
On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > > On 02.12.2020 21:27, Yang Shi wrote: > > Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > > will be used in the following cases: > > 1. Non memcg aware shrinkers > > 2. !CONFIG_MEMCG > > 3. memcg is disabled by boot parameter > > > > Signed-off-by: Yang Shi <shy828301@gmail.com> > > --- > > mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > > 1 file changed, 82 insertions(+), 6 deletions(-) > > > > diff --git a/mm/vmscan.c b/mm/vmscan.c > > index cba0bc8d4661..d569fdcaba79 100644 > > --- a/mm/vmscan.c > > +++ b/mm/vmscan.c > > @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > > static DEFINE_IDR(shrinker_idr); > > static int shrinker_nr_max; > > > > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > > +{ > > + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > > + !mem_cgroup_disabled(); > > +} > > + > > static int prealloc_memcg_shrinker(struct shrinker *shrinker) > > { > > int id, ret = -ENOMEM; > > @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > > #endif > > return false; > > } > > + > > +static inline long count_nr_deferred(struct shrinker *shrinker, > > + struct shrink_control *sc) > > +{ > > + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > > + struct memcg_shrinker_deferred *deferred; > > + struct mem_cgroup *memcg = sc->memcg; > > + int nid = sc->nid; > > + int id = shrinker->id; > > + long nr; > > + > > + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > + nid = 0; > > + > > + if (per_memcg_deferred) { > > + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > > + true); > > My comment is about both 5/9 and 6/9 patches. Sorry for the late reply, I don't know why Gmail filtered this out to spam. > > shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag > in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see > memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur > because of processor reordering on !x86 (there is no a common lock or memory barriers). > > Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). > The map can't be NULL there. > > Regarding to shrinker_deferred you should prove either this is not a problem too, > or to add proper synchronization (maybe, based on barriers) or to add some similar check > (maybe, in shrink_slab_memcg() too). It seems shrink_slab_memcg() might see shrinker_deferred as NULL either due to the same reason. I don't think there is a guarantee it won't happen. We just need guarantee CSS_ONLINE is seen after shrinker_maps and shrinker_deferred are allocated, so I'm supposed barriers before "css->flags |= CSS_ONLINE" should work. So the below patch may be ok: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df128cab900f..9f7fb0450d69 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } + /* + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees shirnker_maps + * and shrinker_deferred before CSS_ONLINE. + */ + smp_mb(); + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); Or add one more check for shrinker_deferred sounds acceptable as well. > Kirill >
On 08.12.2020 20:13, Yang Shi wrote: > On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: >> >> On 02.12.2020 21:27, Yang Shi wrote: >>> Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred >>> will be used in the following cases: >>> 1. Non memcg aware shrinkers >>> 2. !CONFIG_MEMCG >>> 3. memcg is disabled by boot parameter >>> >>> Signed-off-by: Yang Shi <shy828301@gmail.com> >>> --- >>> mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- >>> 1 file changed, 82 insertions(+), 6 deletions(-) >>> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c >>> index cba0bc8d4661..d569fdcaba79 100644 >>> --- a/mm/vmscan.c >>> +++ b/mm/vmscan.c >>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); >>> static DEFINE_IDR(shrinker_idr); >>> static int shrinker_nr_max; >>> >>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) >>> +{ >>> + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && >>> + !mem_cgroup_disabled(); >>> +} >>> + >>> static int prealloc_memcg_shrinker(struct shrinker *shrinker) >>> { >>> int id, ret = -ENOMEM; >>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) >>> #endif >>> return false; >>> } >>> + >>> +static inline long count_nr_deferred(struct shrinker *shrinker, >>> + struct shrink_control *sc) >>> +{ >>> + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; >>> + struct memcg_shrinker_deferred *deferred; >>> + struct mem_cgroup *memcg = sc->memcg; >>> + int nid = sc->nid; >>> + int id = shrinker->id; >>> + long nr; >>> + >>> + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) >>> + nid = 0; >>> + >>> + if (per_memcg_deferred) { >>> + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, >>> + true); >> >> My comment is about both 5/9 and 6/9 patches. > > Sorry for the late reply, I don't know why Gmail filtered this out to spam. > >> >> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag >> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see >> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur >> because of processor reordering on !x86 (there is no a common lock or memory barriers). >> >> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). >> The map can't be NULL there. >> >> Regarding to shrinker_deferred you should prove either this is not a problem too, >> or to add proper synchronization (maybe, based on barriers) or to add some similar check >> (maybe, in shrink_slab_memcg() too). > > It seems shrink_slab_memcg() might see shrinker_deferred as NULL > either due to the same reason. I don't think there is a guarantee it > won't happen. > > We just need guarantee CSS_ONLINE is seen after shrinker_maps and > shrinker_deferred are allocated, so I'm supposed barriers before > "css->flags |= CSS_ONLINE" should work. > > So the below patch may be ok: > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index df128cab900f..9f7fb0450d69 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct > cgroup_subsys_state *css) > return -ENOMEM; > } > > > + /* > + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees > shirnker_maps > + * and shrinker_deferred before CSS_ONLINE. 
> + */ > + smp_mb(); > + > /* Online state pins memcg ID, memcg ID pins CSS */ > refcount_set(&memcg->id.ref, 1); > css_get(css);

smp barriers synchronize data access from different CPUs, so they should go in pairs. If you add the smp barrier into mem_cgroup_css_online(), we should also add one more smp barrier in the other place that we want to synchronize with this one. Also, every place should contain a comment referring to its pair: "Pairs with...".

Kirill
On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > > On 08.12.2020 20:13, Yang Shi wrote: > > On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > >> > >> On 02.12.2020 21:27, Yang Shi wrote: > >>> Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > >>> will be used in the following cases: > >>> 1. Non memcg aware shrinkers > >>> 2. !CONFIG_MEMCG > >>> 3. memcg is disabled by boot parameter > >>> > >>> Signed-off-by: Yang Shi <shy828301@gmail.com> > >>> --- > >>> mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > >>> 1 file changed, 82 insertions(+), 6 deletions(-) > >>> > >>> diff --git a/mm/vmscan.c b/mm/vmscan.c > >>> index cba0bc8d4661..d569fdcaba79 100644 > >>> --- a/mm/vmscan.c > >>> +++ b/mm/vmscan.c > >>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > >>> static DEFINE_IDR(shrinker_idr); > >>> static int shrinker_nr_max; > >>> > >>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > >>> +{ > >>> + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > >>> + !mem_cgroup_disabled(); > >>> +} > >>> + > >>> static int prealloc_memcg_shrinker(struct shrinker *shrinker) > >>> { > >>> int id, ret = -ENOMEM; > >>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > >>> #endif > >>> return false; > >>> } > >>> + > >>> +static inline long count_nr_deferred(struct shrinker *shrinker, > >>> + struct shrink_control *sc) > >>> +{ > >>> + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > >>> + struct memcg_shrinker_deferred *deferred; > >>> + struct mem_cgroup *memcg = sc->memcg; > >>> + int nid = sc->nid; > >>> + int id = shrinker->id; > >>> + long nr; > >>> + > >>> + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > >>> + nid = 0; > >>> + > >>> + if (per_memcg_deferred) { > >>> + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > >>> + true); > >> > >> My comment is about both 5/9 and 6/9 patches. > > > > Sorry for the late reply, I don't know why Gmail filtered this out to spam. > > > >> > >> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag > >> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see > >> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur > >> because of processor reordering on !x86 (there is no a common lock or memory barriers). > >> > >> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). > >> The map can't be NULL there. > >> > >> Regarding to shrinker_deferred you should prove either this is not a problem too, > >> or to add proper synchronization (maybe, based on barriers) or to add some similar check > >> (maybe, in shrink_slab_memcg() too). > > > > It seems shrink_slab_memcg() might see shrinker_deferred as NULL > > either due to the same reason. I don't think there is a guarantee it > > won't happen. > > > > We just need guarantee CSS_ONLINE is seen after shrinker_maps and > > shrinker_deferred are allocated, so I'm supposed barriers before > > "css->flags |= CSS_ONLINE" should work. 
> > > > So the below patch may be ok: > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > > index df128cab900f..9f7fb0450d69 100644 > > --- a/mm/memcontrol.c > > +++ b/mm/memcontrol.c > > @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct > > cgroup_subsys_state *css) > > return -ENOMEM; > > } > > > > > > + /* > > + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees > > shirnker_maps > > + * and shrinker_deferred before CSS_ONLINE. > > + */ > > + smp_mb(); > > + > > /* Online state pins memcg ID, memcg ID pins CSS */ > > refcount_set(&memcg->id.ref, 1); > > css_get(css); > > smp barriers synchronize data access from different cpus. They should go in a pair. > In case of you add the smp barrier into mem_cgroup_css_online(), we should also > add one more smp barrier in another place, which we want to synchonize with this. > Also, every place should contain a comment referring to its pair: "Pairs with...". Thanks, I think you are correct. Looked into it further, it seems the race pattern looks like: CPU A CPU B store shrinker_maps pointer load CSS_ONLINE store CSS_ONLINE load shrinker_maps pointer By checking the memory-barriers document, it seems we need write barrier/read barrier pair as below: CPU A CPU B store shrinker_maps pointer load CSS_ONLINE <write barrier> <read barrier> store CSS_ONLINE load shrinker_maps pointer So, the patch should look like: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df128cab900f..489c0a84f82b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) return -ENOMEM; } + /* + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees shirnker_maps + * and shrinker_deferred before CSS_ONLINE. It pairs with the read barrier + * in shrink_slab_memcg(). + */ + smp_wmb(); + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9d2a6485e982..fc9bda576d98 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -603,13 +603,15 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!mem_cgroup_online(memcg)) return 0; + /* Pairs with write barrier in mem_cgroup_css_online */ + smp_rmb(); + if (!down_read_trylock(&shrinker_rwsem)) return 0; + /* Once memcg is online it can't be NULL */ map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, true); - if (unlikely(!map)) - goto unlock; for_each_set_bit(i, map->map, shrinker_nr_max) { struct shrink_control sc = { Does this seem correct? > > Kirill >
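The pairing requirement can be seen in the same userspace sketch (again with stand-in names; this is not the proposed kernel patch): the release store that marks the object online must pair with an acquire load on the reader side, which is what the smp_wmb() in mem_cgroup_css_online() and the smp_rmb() in shrink_slab_memcg() are meant to provide between CSS_ONLINE and the shrinker_maps/shrinker_deferred pointers.

#include <stdatomic.h>
#include <stddef.h>

struct deferred { long nr_deferred[1]; };

static _Atomic(struct deferred *) shrinker_deferred;
static atomic_int css_online;

/* Writer: publish the pointer, then mark online with release semantics
 * (the analogue of the smp_wmb() before setting CSS_ONLINE). */
static void mark_online(struct deferred *d)
{
	atomic_store_explicit(&shrinker_deferred, d, memory_order_relaxed);
	atomic_store_explicit(&css_online, 1, memory_order_release);
}

/* Reader: the acquire load of the flag (the analogue of the smp_rmb() after
 * the mem_cgroup_online() check) pairs with the release store above, so a
 * non-NULL shrinker_deferred is guaranteed once css_online is seen. */
static struct deferred *get_deferred_if_online(void)
{
	if (!atomic_load_explicit(&css_online, memory_order_acquire))
		return NULL;
	return atomic_load_explicit(&shrinker_deferred, memory_order_relaxed);
}

Each side is only meaningful together with its counterpart, which is why the "Pairs with ..." comments Kirill asks for are more than documentation style: they record which two accesses form the ordering contract.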
On Wed, Dec 09, 2020 at 09:32:37AM -0800, Yang Shi wrote: > On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > > > > On 08.12.2020 20:13, Yang Shi wrote: > > > On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > > >> > > >> On 02.12.2020 21:27, Yang Shi wrote: > > >>> Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > > >>> will be used in the following cases: > > >>> 1. Non memcg aware shrinkers > > >>> 2. !CONFIG_MEMCG > > >>> 3. memcg is disabled by boot parameter > > >>> > > >>> Signed-off-by: Yang Shi <shy828301@gmail.com> > > >>> --- > > >>> mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > > >>> 1 file changed, 82 insertions(+), 6 deletions(-) > > >>> > > >>> diff --git a/mm/vmscan.c b/mm/vmscan.c > > >>> index cba0bc8d4661..d569fdcaba79 100644 > > >>> --- a/mm/vmscan.c > > >>> +++ b/mm/vmscan.c > > >>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > > >>> static DEFINE_IDR(shrinker_idr); > > >>> static int shrinker_nr_max; > > >>> > > >>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > > >>> +{ > > >>> + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > > >>> + !mem_cgroup_disabled(); > > >>> +} > > >>> + > > >>> static int prealloc_memcg_shrinker(struct shrinker *shrinker) > > >>> { > > >>> int id, ret = -ENOMEM; > > >>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > > >>> #endif > > >>> return false; > > >>> } > > >>> + > > >>> +static inline long count_nr_deferred(struct shrinker *shrinker, > > >>> + struct shrink_control *sc) > > >>> +{ > > >>> + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > > >>> + struct memcg_shrinker_deferred *deferred; > > >>> + struct mem_cgroup *memcg = sc->memcg; > > >>> + int nid = sc->nid; > > >>> + int id = shrinker->id; > > >>> + long nr; > > >>> + > > >>> + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > > >>> + nid = 0; > > >>> + > > >>> + if (per_memcg_deferred) { > > >>> + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > > >>> + true); > > >> > > >> My comment is about both 5/9 and 6/9 patches. > > > > > > Sorry for the late reply, I don't know why Gmail filtered this out to spam. > > > > > >> > > >> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag > > >> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see > > >> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur > > >> because of processor reordering on !x86 (there is no a common lock or memory barriers). > > >> > > >> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). > > >> The map can't be NULL there. > > >> > > >> Regarding to shrinker_deferred you should prove either this is not a problem too, > > >> or to add proper synchronization (maybe, based on barriers) or to add some similar check > > >> (maybe, in shrink_slab_memcg() too). > > > > > > It seems shrink_slab_memcg() might see shrinker_deferred as NULL > > > either due to the same reason. I don't think there is a guarantee it > > > won't happen. > > > > > > We just need guarantee CSS_ONLINE is seen after shrinker_maps and > > > shrinker_deferred are allocated, so I'm supposed barriers before > > > "css->flags |= CSS_ONLINE" should work. 
> > > > > > So the below patch may be ok: > > > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > > > index df128cab900f..9f7fb0450d69 100644 > > > --- a/mm/memcontrol.c > > > +++ b/mm/memcontrol.c > > > @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct > > > cgroup_subsys_state *css) > > > return -ENOMEM; > > > } > > > > > > > > > + /* > > > + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees > > > shirnker_maps > > > + * and shrinker_deferred before CSS_ONLINE. > > > + */ > > > + smp_mb(); > > > + > > > /* Online state pins memcg ID, memcg ID pins CSS */ > > > refcount_set(&memcg->id.ref, 1); > > > css_get(css); > > > > smp barriers synchronize data access from different cpus. They should go in a pair. > > In case of you add the smp barrier into mem_cgroup_css_online(), we should also > > add one more smp barrier in another place, which we want to synchonize with this. > > Also, every place should contain a comment referring to its pair: "Pairs with...". > > Thanks, I think you are correct. Looked into it further, it seems the > race pattern looks like: > > CPU A CPU B > store shrinker_maps pointer load CSS_ONLINE > store CSS_ONLINE load shrinker_maps pointer > > By checking the memory-barriers document, it seems we need write > barrier/read barrier pair as below: > > CPU A CPU B > store shrinker_maps pointer load CSS_ONLINE > <write barrier> <read barrier> > store CSS_ONLINE load shrinker_maps pointer > > > So, the patch should look like: > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index df128cab900f..489c0a84f82b 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct > cgroup_subsys_state *css) > return -ENOMEM; > } > > + /* > + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees > shirnker_maps > + * and shrinker_deferred before CSS_ONLINE. It pairs with the > read barrier > + * in shrink_slab_memcg(). > + */ > + smp_wmb(); Is there a reason why the shrinker allocations aren't done in .css_alloc()? That would take care of all necessary ordering: #0 css = ->css_alloc() list_add_tail_rcu(css, parent->children) rcu_assign_pointer() ->css_online(css) css->flags |= CSS_ONLINE #1 memcg = mem_cgroup_iter() list_entry_rcu() rcu_dereference() shrink_slab(.., memcg) RCU ensures that once the cgroup shows up in the reclaim cgroup it's also fully allocated. > /* Online state pins memcg ID, memcg ID pins CSS */ > refcount_set(&memcg->id.ref, 1); > css_get(css); > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 9d2a6485e982..fc9bda576d98 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -603,13 +603,15 @@ static unsigned long shrink_slab_memcg(gfp_t > gfp_mask, int nid, > if (!mem_cgroup_online(memcg)) > return 0; ...then we should be able to delete this online check here entirely: A not-yet online cgroup is guaranteed to have a shrinker map, just with no bits set. The shrinker code handles that just fine. An offlined cgroup will eventually have an empty bitmap as the called shrinkers return SHRINK_EMPTY. This could also be shortcut by clearing the bit in memcg_drain_list_lru_node() the same way we set it in the parent when we move all objects upward, but seems correct as-is.
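For reference, a kernel-style sketch of the publish/subscribe ordering Johannes is relying on (simplified, illustrative structure and names only, not the actual memcg code): anything initialized before list_add_tail_rcu() is guaranteed to be visible to a reader that finds the object through the RCU list iteration, because the publication itself is the ordering point.

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/bug.h>

struct foo {
	struct list_head sibling;
	unsigned long *shrinker_map;	/* stand-in for the per-node maps */
};

static int publish_foo(struct foo *foo, struct list_head *children)
{
	foo->shrinker_map = kzalloc(sizeof(unsigned long), GFP_KERNEL);
	if (!foo->shrinker_map)
		return -ENOMEM;
	/*
	 * list_add_tail_rcu() uses rcu_assign_pointer() internally, so the
	 * initialization above cannot be reordered past the publication.
	 */
	list_add_tail_rcu(&foo->sibling, children);
	return 0;
}

static void walk_foos(struct list_head *children)
{
	struct foo *foo;

	rcu_read_lock();
	list_for_each_entry_rcu(foo, children, sibling) {
		/*
		 * The iterator's rcu_dereference() pairs with the publication
		 * above: once foo is reachable here, its shrinker_map is too.
		 */
		WARN_ON_ONCE(!foo->shrinker_map);
	}
	rcu_read_unlock();
}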
On 10.12.2020 18:13, Johannes Weiner wrote: > On Wed, Dec 09, 2020 at 09:32:37AM -0800, Yang Shi wrote: >> On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: >>> >>> On 08.12.2020 20:13, Yang Shi wrote: >>>> On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: >>>>> >>>>> On 02.12.2020 21:27, Yang Shi wrote: >>>>>> Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred >>>>>> will be used in the following cases: >>>>>> 1. Non memcg aware shrinkers >>>>>> 2. !CONFIG_MEMCG >>>>>> 3. memcg is disabled by boot parameter >>>>>> >>>>>> Signed-off-by: Yang Shi <shy828301@gmail.com> >>>>>> --- >>>>>> mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- >>>>>> 1 file changed, 82 insertions(+), 6 deletions(-) >>>>>> >>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c >>>>>> index cba0bc8d4661..d569fdcaba79 100644 >>>>>> --- a/mm/vmscan.c >>>>>> +++ b/mm/vmscan.c >>>>>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); >>>>>> static DEFINE_IDR(shrinker_idr); >>>>>> static int shrinker_nr_max; >>>>>> >>>>>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) >>>>>> +{ >>>>>> + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && >>>>>> + !mem_cgroup_disabled(); >>>>>> +} >>>>>> + >>>>>> static int prealloc_memcg_shrinker(struct shrinker *shrinker) >>>>>> { >>>>>> int id, ret = -ENOMEM; >>>>>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) >>>>>> #endif >>>>>> return false; >>>>>> } >>>>>> + >>>>>> +static inline long count_nr_deferred(struct shrinker *shrinker, >>>>>> + struct shrink_control *sc) >>>>>> +{ >>>>>> + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; >>>>>> + struct memcg_shrinker_deferred *deferred; >>>>>> + struct mem_cgroup *memcg = sc->memcg; >>>>>> + int nid = sc->nid; >>>>>> + int id = shrinker->id; >>>>>> + long nr; >>>>>> + >>>>>> + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) >>>>>> + nid = 0; >>>>>> + >>>>>> + if (per_memcg_deferred) { >>>>>> + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, >>>>>> + true); >>>>> >>>>> My comment is about both 5/9 and 6/9 patches. >>>> >>>> Sorry for the late reply, I don't know why Gmail filtered this out to spam. >>>> >>>>> >>>>> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag >>>>> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see >>>>> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur >>>>> because of processor reordering on !x86 (there is no a common lock or memory barriers). >>>>> >>>>> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). >>>>> The map can't be NULL there. >>>>> >>>>> Regarding to shrinker_deferred you should prove either this is not a problem too, >>>>> or to add proper synchronization (maybe, based on barriers) or to add some similar check >>>>> (maybe, in shrink_slab_memcg() too). >>>> >>>> It seems shrink_slab_memcg() might see shrinker_deferred as NULL >>>> either due to the same reason. I don't think there is a guarantee it >>>> won't happen. >>>> >>>> We just need guarantee CSS_ONLINE is seen after shrinker_maps and >>>> shrinker_deferred are allocated, so I'm supposed barriers before >>>> "css->flags |= CSS_ONLINE" should work. 
>>>> >>>> So the below patch may be ok: >>>> >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >>>> index df128cab900f..9f7fb0450d69 100644 >>>> --- a/mm/memcontrol.c >>>> +++ b/mm/memcontrol.c >>>> @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct >>>> cgroup_subsys_state *css) >>>> return -ENOMEM; >>>> } >>>> >>>> >>>> + /* >>>> + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees >>>> shirnker_maps >>>> + * and shrinker_deferred before CSS_ONLINE. >>>> + */ >>>> + smp_mb(); >>>> + >>>> /* Online state pins memcg ID, memcg ID pins CSS */ >>>> refcount_set(&memcg->id.ref, 1); >>>> css_get(css); >>> >>> smp barriers synchronize data access from different cpus. They should go in a pair. >>> In case of you add the smp barrier into mem_cgroup_css_online(), we should also >>> add one more smp barrier in another place, which we want to synchonize with this. >>> Also, every place should contain a comment referring to its pair: "Pairs with...". >> >> Thanks, I think you are correct. Looked into it further, it seems the >> race pattern looks like: >> >> CPU A CPU B >> store shrinker_maps pointer load CSS_ONLINE >> store CSS_ONLINE load shrinker_maps pointer >> >> By checking the memory-barriers document, it seems we need write >> barrier/read barrier pair as below: >> >> CPU A CPU B >> store shrinker_maps pointer load CSS_ONLINE >> <write barrier> <read barrier> >> store CSS_ONLINE load shrinker_maps pointer >> >> >> So, the patch should look like: >> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >> index df128cab900f..489c0a84f82b 100644 >> --- a/mm/memcontrol.c >> +++ b/mm/memcontrol.c >> @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct >> cgroup_subsys_state *css) >> return -ENOMEM; >> } >> >> + /* >> + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees >> shirnker_maps >> + * and shrinker_deferred before CSS_ONLINE. It pairs with the >> read barrier >> + * in shrink_slab_memcg(). >> + */ >> + smp_wmb(); > > Is there a reason why the shrinker allocations aren't done in > .css_alloc()? That would take care of all necessary ordering: The reason is that allocations have to be made in a place, where mem-cgroup_iter() can't miss it, since memcg_expand_shrinker_maps() shouldn't miss allocated shrinker maps. > > #0 > css = ->css_alloc() > list_add_tail_rcu(css, parent->children) > rcu_assign_pointer() > ->css_online(css) > css->flags |= CSS_ONLINE > > #1 > memcg = mem_cgroup_iter() > list_entry_rcu() > rcu_dereference() > shrink_slab(.., memcg) > > RCU ensures that once the cgroup shows up in the reclaim cgroup it's > also fully allocated. > >> /* Online state pins memcg ID, memcg ID pins CSS */ >> refcount_set(&memcg->id.ref, 1); >> css_get(css); >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index 9d2a6485e982..fc9bda576d98 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -603,13 +603,15 @@ static unsigned long shrink_slab_memcg(gfp_t >> gfp_mask, int nid, >> if (!mem_cgroup_online(memcg)) >> return 0; > > ...then we should be able to delete this online check here entirely: > > A not-yet online cgroup is guaranteed to have a shrinker map, just > with no bits set. The shrinker code handles that just fine. > > An offlined cgroup will eventually have an empty bitmap as the called > shrinkers return SHRINK_EMPTY. This could also be shortcut by clearing > the bit in memcg_drain_list_lru_node() the same way we set it in the > parent when we move all objects upward, but seems correct as-is. >
On Thu, Dec 10, 2020 at 06:17:54PM +0300, Kirill Tkhai wrote: > On 10.12.2020 18:13, Johannes Weiner wrote: > > On Wed, Dec 09, 2020 at 09:32:37AM -0800, Yang Shi wrote: > >> On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > >>> > >>> On 08.12.2020 20:13, Yang Shi wrote: > >>>> On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote: > >>>>> > >>>>> On 02.12.2020 21:27, Yang Shi wrote: > >>>>>> Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred > >>>>>> will be used in the following cases: > >>>>>> 1. Non memcg aware shrinkers > >>>>>> 2. !CONFIG_MEMCG > >>>>>> 3. memcg is disabled by boot parameter > >>>>>> > >>>>>> Signed-off-by: Yang Shi <shy828301@gmail.com> > >>>>>> --- > >>>>>> mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---- > >>>>>> 1 file changed, 82 insertions(+), 6 deletions(-) > >>>>>> > >>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c > >>>>>> index cba0bc8d4661..d569fdcaba79 100644 > >>>>>> --- a/mm/vmscan.c > >>>>>> +++ b/mm/vmscan.c > >>>>>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem); > >>>>>> static DEFINE_IDR(shrinker_idr); > >>>>>> static int shrinker_nr_max; > >>>>>> > >>>>>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker) > >>>>>> +{ > >>>>>> + return (shrinker->flags & SHRINKER_MEMCG_AWARE) && > >>>>>> + !mem_cgroup_disabled(); > >>>>>> +} > >>>>>> + > >>>>>> static int prealloc_memcg_shrinker(struct shrinker *shrinker) > >>>>>> { > >>>>>> int id, ret = -ENOMEM; > >>>>>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc) > >>>>>> #endif > >>>>>> return false; > >>>>>> } > >>>>>> + > >>>>>> +static inline long count_nr_deferred(struct shrinker *shrinker, > >>>>>> + struct shrink_control *sc) > >>>>>> +{ > >>>>>> + bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg; > >>>>>> + struct memcg_shrinker_deferred *deferred; > >>>>>> + struct mem_cgroup *memcg = sc->memcg; > >>>>>> + int nid = sc->nid; > >>>>>> + int id = shrinker->id; > >>>>>> + long nr; > >>>>>> + > >>>>>> + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) > >>>>>> + nid = 0; > >>>>>> + > >>>>>> + if (per_memcg_deferred) { > >>>>>> + deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred, > >>>>>> + true); > >>>>> > >>>>> My comment is about both 5/9 and 6/9 patches. > >>>> > >>>> Sorry for the late reply, I don't know why Gmail filtered this out to spam. > >>>> > >>>>> > >>>>> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag > >>>>> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see > >>>>> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur > >>>>> because of processor reordering on !x86 (there is no a common lock or memory barriers). > >>>>> > >>>>> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg(). > >>>>> The map can't be NULL there. > >>>>> > >>>>> Regarding to shrinker_deferred you should prove either this is not a problem too, > >>>>> or to add proper synchronization (maybe, based on barriers) or to add some similar check > >>>>> (maybe, in shrink_slab_memcg() too). > >>>> > >>>> It seems shrink_slab_memcg() might see shrinker_deferred as NULL > >>>> either due to the same reason. I don't think there is a guarantee it > >>>> won't happen. 
> >>>> > >>>> We just need guarantee CSS_ONLINE is seen after shrinker_maps and > >>>> shrinker_deferred are allocated, so I'm supposed barriers before > >>>> "css->flags |= CSS_ONLINE" should work. > >>>> > >>>> So the below patch may be ok: > >>>> > >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c > >>>> index df128cab900f..9f7fb0450d69 100644 > >>>> --- a/mm/memcontrol.c > >>>> +++ b/mm/memcontrol.c > >>>> @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct > >>>> cgroup_subsys_state *css) > >>>> return -ENOMEM; > >>>> } > >>>> > >>>> > >>>> + /* > >>>> + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees > >>>> shirnker_maps > >>>> + * and shrinker_deferred before CSS_ONLINE. > >>>> + */ > >>>> + smp_mb(); > >>>> + > >>>> /* Online state pins memcg ID, memcg ID pins CSS */ > >>>> refcount_set(&memcg->id.ref, 1); > >>>> css_get(css); > >>> > >>> smp barriers synchronize data access from different cpus. They should go in a pair. > >>> In case of you add the smp barrier into mem_cgroup_css_online(), we should also > >>> add one more smp barrier in another place, which we want to synchonize with this. > >>> Also, every place should contain a comment referring to its pair: "Pairs with...". > >> > >> Thanks, I think you are correct. Looked into it further, it seems the > >> race pattern looks like: > >> > >> CPU A CPU B > >> store shrinker_maps pointer load CSS_ONLINE > >> store CSS_ONLINE load shrinker_maps pointer > >> > >> By checking the memory-barriers document, it seems we need write > >> barrier/read barrier pair as below: > >> > >> CPU A CPU B > >> store shrinker_maps pointer load CSS_ONLINE > >> <write barrier> <read barrier> > >> store CSS_ONLINE load shrinker_maps pointer > >> > >> > >> So, the patch should look like: > >> > >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c > >> index df128cab900f..489c0a84f82b 100644 > >> --- a/mm/memcontrol.c > >> +++ b/mm/memcontrol.c > >> @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct > >> cgroup_subsys_state *css) > >> return -ENOMEM; > >> } > >> > >> + /* > >> + * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees > >> shirnker_maps > >> + * and shrinker_deferred before CSS_ONLINE. It pairs with the > >> read barrier > >> + * in shrink_slab_memcg(). > >> + */ > >> + smp_wmb(); > > > > Is there a reason why the shrinker allocations aren't done in > > .css_alloc()? That would take care of all necessary ordering: > > The reason is that allocations have to be made in a place, where > mem-cgroup_iter() can't miss it, since memcg_expand_shrinker_maps() > shouldn't miss allocated shrinker maps. I see, because we could have this: .css_alloc() memcg_alloc_shrinker_maps() down_read(&shrinker_sem) map = alloc(shrinker_nr_max * sizeof(long)); rcu_assign_pointer(memcg->...->shrinker_map = map); up_read(&shrinker_sem); register_shrinker() down_write(&shrinker_sem) shrinker_nr_max = id + 1; memcg_expand_shrinker_maps() for_each_mem_cgroup() realloc up_write(&shrinker_sem) list_add_tail_rcu(&css->sibling, &parent->children); /* boom: missed new shrinker, map too small */ Thanks for the clarification.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cba0bc8d4661..d569fdcaba79 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
 static DEFINE_IDR(shrinker_idr);
 static int shrinker_nr_max;
 
+static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
+{
+	return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
+		!mem_cgroup_disabled();
+}
+
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
 	int id, ret = -ENOMEM;
@@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 #endif
 	return false;
 }
+
+static inline long count_nr_deferred(struct shrinker *shrinker,
+				     struct shrink_control *sc)
+{
+	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
+	struct memcg_shrinker_deferred *deferred;
+	struct mem_cgroup *memcg = sc->memcg;
+	int nid = sc->nid;
+	int id = shrinker->id;
+	long nr;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	if (per_memcg_deferred) {
+		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
+						     true);
+		nr = atomic_long_xchg(&deferred->nr_deferred[id], 0);
+	} else
+		nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	return nr;
+}
+
+static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
+				   struct shrink_control *sc)
+{
+	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
+	struct memcg_shrinker_deferred *deferred;
+	struct mem_cgroup *memcg = sc->memcg;
+	int nid = sc->nid;
+	int id = shrinker->id;
+	long new_nr;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	if (per_memcg_deferred) {
+		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
+						     true);
+		new_nr = atomic_long_add_return(nr, &deferred->nr_deferred[id]);
+	} else
+		new_nr = atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+
+	return new_nr;
+}
 #else
+static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
+{
+	return false;
+}
+
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
 	return 0;
@@ -290,6 +347,29 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 {
 	return true;
 }
+
+static inline long count_nr_deferred(struct shrinker *shrinker,
+				     struct shrink_control *sc)
+{
+	int nid = sc->nid;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
+				   struct shrink_control *sc)
+{
+	int nid = sc->nid;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	return atomic_long_add_return(nr,
+				      &shrinker->nr_deferred[nid]);
+}
 #endif
 
 /*
@@ -429,13 +509,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	long freeable;
 	long nr;
 	long new_nr;
-	int nid = shrinkctl->nid;
 	long batch_size = shrinker->batch ? shrinker->batch
 					  : SHRINK_BATCH;
 	long scanned = 0, next_deferred;
 
-	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-		nid = 0;
 
 	freeable = shrinker->count_objects(shrinker, shrinkctl);
 	if (freeable == 0 || freeable == SHRINK_EMPTY)
@@ -446,7 +523,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 * and zero it so that other concurrent shrinker invocations
 	 * don't also do this scanning work.
 	 */
-	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+	nr = count_nr_deferred(shrinker, shrinkctl);
 
 	total_scan = nr;
 	if (shrinker->seeks) {
@@ -539,8 +616,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 * move the unused scan count back into the shrinker in a
 	 * manner that handles concurrent updates.
 	 */
-	new_nr = atomic_long_add_return(next_deferred,
-					&shrinker->nr_deferred[nid]);
+	new_nr = set_nr_deferred(next_deferred, shrinker, shrinkctl);
 	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr,
 					total_scan);
 	return freed;
Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred
will be used in the following cases:
    1. Non memcg aware shrinkers
    2. !CONFIG_MEMCG
    3. memcg is disabled by boot parameter

Signed-off-by: Yang Shi <shy828301@gmail.com>
---
 mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 82 insertions(+), 6 deletions(-)
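For context, this is roughly how a shrinker opts in to the per-memcg deferred accounting added by the patch (a hypothetical module sketch; the demo_* names are made up and the callbacks are deliberately trivial). With SHRINKER_MEMCG_AWARE set, the work left unscanned after scan_objects() ends up in the memcg's nr_deferred via set_nr_deferred() in the patch above, while non-memcg-aware shrinkers keep using shrinker->nr_deferred:

#include <linux/module.h>
#include <linux/shrinker.h>

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	/*
	 * sc->memcg and sc->nid identify the memcg/node being reclaimed.
	 * A real shrinker would count its memcg-charged objects here
	 * (e.g. via list_lru_shrink_count()); a constant keeps the sketch small.
	 */
	return 128;
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	/*
	 * Pretend nothing could be freed this time; the unscanned portion is
	 * what do_shrink_slab() accumulates as (per-memcg) nr_deferred.
	 */
	return SHRINK_STOP;
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE,
};

static int __init demo_init(void)
{
	return register_shrinker(&demo_shrinker);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	unregister_shrinker(&demo_shrinker);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");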