[6/9] mm: vmscan: use per memcg nr_deferred of shrinker

Message ID 20201202182725.265020-7-shy828301@gmail.com (mailing list archive)
State New, archived
Series: Make shrinker's nr_deferred memcg aware

Commit Message

Yang Shi Dec. 2, 2020, 6:27 p.m. UTC
Use per-memcg nr_deferred for memcg-aware shrinkers.  The shrinker's own
nr_deferred will still be used in the following cases:
    1. Non memcg aware shrinkers
    2. !CONFIG_MEMCG
    3. memcg is disabled by boot parameter

Signed-off-by: Yang Shi <shy828301@gmail.com>
---
 mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 82 insertions(+), 6 deletions(-)

Comments

Roman Gushchin Dec. 3, 2020, 3:08 a.m. UTC | #1
On Wed, Dec 02, 2020 at 10:27:22AM -0800, Yang Shi wrote:
> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> will be used in the following cases:
>     1. Non memcg aware shrinkers
>     2. !CONFIG_MEMCG
>     3. memcg is disabled by boot parameter
> 
> Signed-off-by: Yang Shi <shy828301@gmail.com>
> ---
>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 82 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index cba0bc8d4661..d569fdcaba79 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
>  static DEFINE_IDR(shrinker_idr);
>  static int shrinker_nr_max;
>  
> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> +{
> +	return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> +		!mem_cgroup_disabled();
> +}
> +
>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  {
>  	int id, ret = -ENOMEM;
> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  #endif
>  	return false;
>  }
> +
> +static inline long count_nr_deferred(struct shrinker *shrinker,
> +				     struct shrink_control *sc)
> +{
> +	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> +	struct memcg_shrinker_deferred *deferred;
> +	struct mem_cgroup *memcg = sc->memcg;
> +	int nid = sc->nid;
> +	int id = shrinker->id;
> +	long nr;
> +
> +	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> +		nid = 0;
> +
> +	if (per_memcg_deferred) {
> +		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> +						     true);
> +		nr = atomic_long_xchg(&deferred->nr_deferred[id], 0);
> +	} else
> +		nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
> +
> +	return nr;
> +}
> +
> +static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
> +				   struct shrink_control *sc)
> +{
> +	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> +	struct memcg_shrinker_deferred *deferred;
> +	struct mem_cgroup *memcg = sc->memcg;
> +	int nid = sc->nid;
> +	int id = shrinker->id;
> +	long new_nr;
> +
> +	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> +		nid = 0;
> +
> +	if (per_memcg_deferred) {
> +		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> +						     true);
> +		new_nr = atomic_long_add_return(nr, &deferred->nr_deferred[id]);
> +	} else
> +		new_nr = atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
> +
> +	return new_nr;
> +}
>  #else
> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> +{
> +	return false;
> +}
> +
>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  {
>  	return 0;
> @@ -290,6 +347,29 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  {
>  	return true;
>  }
> +
> +static inline long count_nr_deferred(struct shrinker *shrinker,
> +				     struct shrink_control *sc)
> +{
> +	int nid = sc->nid;
> +
> +	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> +		nid = 0;
> +
> +	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
> +}
> +
> +static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
> +				   struct shrink_control *sc)
> +{
> +	int nid = sc->nid;
> +
> +	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> +		nid = 0;
> +
> +	return atomic_long_add_return(nr,
> +				      &shrinker->nr_deferred[nid]);
> +}
>  #endif
>  
>  /*
> @@ -429,13 +509,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>  	long freeable;
>  	long nr;
>  	long new_nr;
> -	int nid = shrinkctl->nid;
>  	long batch_size = shrinker->batch ? shrinker->batch
>  					  : SHRINK_BATCH;
>  	long scanned = 0, next_deferred;
>  
> -	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> -		nid = 0;
>  
>  	freeable = shrinker->count_objects(shrinker, shrinkctl);
>  	if (freeable == 0 || freeable == SHRINK_EMPTY)
> @@ -446,7 +523,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>  	 * and zero it so that other concurrent shrinker invocations
>  	 * don't also do this scanning work.
>  	 */
> -	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
> +	nr = count_nr_deferred(shrinker, shrinkctl);
>  
>  	total_scan = nr;
>  	if (shrinker->seeks) {
> @@ -539,8 +616,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
>  	 * move the unused scan count back into the shrinker in a
>  	 * manner that handles concurrent updates.
>  	 */
> -	new_nr = atomic_long_add_return(next_deferred,
> -					&shrinker->nr_deferred[nid]);
> +	new_nr = set_nr_deferred(next_deferred, shrinker, shrinkctl);

Ok, I think patch (1) can be just merged into this and then it would make total sense.
Yang Shi Dec. 3, 2020, 5:01 a.m. UTC | #2
On Wed, Dec 2, 2020 at 7:08 PM Roman Gushchin <guro@fb.com> wrote:
>
> On Wed, Dec 02, 2020 at 10:27:22AM -0800, Yang Shi wrote:
> > Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> > will be used in the following cases:
> >     1. Non memcg aware shrinkers
> >     2. !CONFIG_MEMCG
> >     3. memcg is disabled by boot parameter
> >
> > Signed-off-by: Yang Shi <shy828301@gmail.com>
> > ---
> >  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 82 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index cba0bc8d4661..d569fdcaba79 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >  static DEFINE_IDR(shrinker_idr);
> >  static int shrinker_nr_max;
> >
> > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> > +{
> > +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> > +             !mem_cgroup_disabled();
> > +}
> > +
> >  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> >  {
> >       int id, ret = -ENOMEM;
> > @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >  #endif
> >       return false;
> >  }
> > +
> > +static inline long count_nr_deferred(struct shrinker *shrinker,
> > +                                  struct shrink_control *sc)
> > +{
> > +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> > +     struct memcg_shrinker_deferred *deferred;
> > +     struct mem_cgroup *memcg = sc->memcg;
> > +     int nid = sc->nid;
> > +     int id = shrinker->id;
> > +     long nr;
> > +
> > +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > +             nid = 0;
> > +
> > +     if (per_memcg_deferred) {
> > +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> > +                                                  true);
> > +             nr = atomic_long_xchg(&deferred->nr_deferred[id], 0);
> > +     } else
> > +             nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
> > +
> > +     return nr;
> > +}
> > +
> > +static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
> > +                                struct shrink_control *sc)
> > +{
> > +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> > +     struct memcg_shrinker_deferred *deferred;
> > +     struct mem_cgroup *memcg = sc->memcg;
> > +     int nid = sc->nid;
> > +     int id = shrinker->id;
> > +     long new_nr;
> > +
> > +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > +             nid = 0;
> > +
> > +     if (per_memcg_deferred) {
> > +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> > +                                                  true);
> > +             new_nr = atomic_long_add_return(nr, &deferred->nr_deferred[id]);
> > +     } else
> > +             new_nr = atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
> > +
> > +     return new_nr;
> > +}
> >  #else
> > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> > +{
> > +     return false;
> > +}
> > +
> >  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> >  {
> >       return 0;
> > @@ -290,6 +347,29 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >  {
> >       return true;
> >  }
> > +
> > +static inline long count_nr_deferred(struct shrinker *shrinker,
> > +                                  struct shrink_control *sc)
> > +{
> > +     int nid = sc->nid;
> > +
> > +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > +             nid = 0;
> > +
> > +     return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
> > +}
> > +
> > +static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
> > +                                struct shrink_control *sc)
> > +{
> > +     int nid = sc->nid;
> > +
> > +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > +             nid = 0;
> > +
> > +     return atomic_long_add_return(nr,
> > +                                   &shrinker->nr_deferred[nid]);
> > +}
> >  #endif
> >
> >  /*
> > @@ -429,13 +509,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> >       long freeable;
> >       long nr;
> >       long new_nr;
> > -     int nid = shrinkctl->nid;
> >       long batch_size = shrinker->batch ? shrinker->batch
> >                                         : SHRINK_BATCH;
> >       long scanned = 0, next_deferred;
> >
> > -     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > -             nid = 0;
> >
> >       freeable = shrinker->count_objects(shrinker, shrinkctl);
> >       if (freeable == 0 || freeable == SHRINK_EMPTY)
> > @@ -446,7 +523,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> >        * and zero it so that other concurrent shrinker invocations
> >        * don't also do this scanning work.
> >        */
> > -     nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
> > +     nr = count_nr_deferred(shrinker, shrinkctl);
> >
> >       total_scan = nr;
> >       if (shrinker->seeks) {
> > @@ -539,8 +616,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
> >        * move the unused scan count back into the shrinker in a
> >        * manner that handles concurrent updates.
> >        */
> > -     new_nr = atomic_long_add_return(next_deferred,
> > -                                     &shrinker->nr_deferred[nid]);
> > +     new_nr = set_nr_deferred(next_deferred, shrinker, shrinkctl);
>
> Ok, I think patch (1) can be just merged into this and then it would make total sense.

Sure. Makes sense to me.
Kirill Tkhai Dec. 3, 2020, 11:40 a.m. UTC | #3
On 02.12.2020 21:27, Yang Shi wrote:
> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> will be used in the following cases:
>     1. Non memcg aware shrinkers
>     2. !CONFIG_MEMCG
>     3. memcg is disabled by boot parameter
> 
> Signed-off-by: Yang Shi <shy828301@gmail.com>
> ---
>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 82 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index cba0bc8d4661..d569fdcaba79 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
>  static DEFINE_IDR(shrinker_idr);
>  static int shrinker_nr_max;
>  
> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> +{
> +	return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> +		!mem_cgroup_disabled();
> +}
> +
>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  {
>  	int id, ret = -ENOMEM;
> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  #endif
>  	return false;
>  }
> +
> +static inline long count_nr_deferred(struct shrinker *shrinker,
> +				     struct shrink_control *sc)
> +{
> +	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> +	struct memcg_shrinker_deferred *deferred;
> +	struct mem_cgroup *memcg = sc->memcg;
> +	int nid = sc->nid;
> +	int id = shrinker->id;
> +	long nr;
> +
> +	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> +		nid = 0;
> +
> +	if (per_memcg_deferred) {
> +		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> +						     true);

My comment is about both 5/9 and 6/9 patches.

shrink_slab_memcg() races with mem_cgroup_css_online(). Visibility of the CSS_ONLINE flag
in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
because of processor reordering on !x86 (there is no common lock or memory barrier).

Regarding shrinker_map, this is not a problem due to the map check in shrink_slab_memcg().
The map can't be NULL there.

Regarding shrinker_deferred, you should either prove that this is not a problem too,
or add proper synchronization (maybe based on barriers), or add a similar check
(maybe in shrink_slab_memcg() too).
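
To make the reordering concrete, here is a minimal userspace sketch of the same
publish-then-flag pattern (plain C11 atomics and pthreads; css_online and
shrinker_deferred are stand-in variables, not the kernel objects):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct deferred { long nr_deferred[1]; };

static _Atomic(struct deferred *) shrinker_deferred;	/* the data          */
static atomic_bool css_online;				/* the "online" flag */

static void *onliner(void *arg)		/* models mem_cgroup_css_online() */
{
	static struct deferred d;

	(void)arg;
	/* data first, flag second -- but relaxed stores may be reordered */
	atomic_store_explicit(&shrinker_deferred, &d, memory_order_relaxed);
	atomic_store_explicit(&css_online, true, memory_order_relaxed);
	return NULL;
}

static void *reclaimer(void *arg)	/* models shrink_slab_memcg() */
{
	(void)arg;
	if (atomic_load_explicit(&css_online, memory_order_relaxed) &&
	    !atomic_load_explicit(&shrinker_deferred, memory_order_relaxed))
		puts("online seen, but shrinker_deferred still NULL");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, onliner, NULL);
	pthread_create(&b, NULL, reclaimer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

With relaxed accesses nothing forbids the reader from observing the flag before
the pointer; making the flag store a release and the flag load an acquire (the
C11 analogue of an smp_wmb()/smp_rmb() pair) rules that outcome out. On x86 the
plain stores and loads are already ordered, which is why the problem only shows
up on weaker architectures.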

Kirill
Yang Shi Dec. 8, 2020, 5:13 p.m. UTC | #4
On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>
> On 02.12.2020 21:27, Yang Shi wrote:
> > Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> > will be used in the following cases:
> >     1. Non memcg aware shrinkers
> >     2. !CONFIG_MEMCG
> >     3. memcg is disabled by boot parameter
> >
> > Signed-off-by: Yang Shi <shy828301@gmail.com>
> > ---
> >  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 82 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index cba0bc8d4661..d569fdcaba79 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >  static DEFINE_IDR(shrinker_idr);
> >  static int shrinker_nr_max;
> >
> > +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> > +{
> > +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> > +             !mem_cgroup_disabled();
> > +}
> > +
> >  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> >  {
> >       int id, ret = -ENOMEM;
> > @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >  #endif
> >       return false;
> >  }
> > +
> > +static inline long count_nr_deferred(struct shrinker *shrinker,
> > +                                  struct shrink_control *sc)
> > +{
> > +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> > +     struct memcg_shrinker_deferred *deferred;
> > +     struct mem_cgroup *memcg = sc->memcg;
> > +     int nid = sc->nid;
> > +     int id = shrinker->id;
> > +     long nr;
> > +
> > +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > +             nid = 0;
> > +
> > +     if (per_memcg_deferred) {
> > +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> > +                                                  true);
>
> My comment is about both 5/9 and 6/9 patches.

Sorry for the late reply, I don't know why Gmail filtered this out as spam.

>
> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag
> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
> because of processor reordering on !x86 (there is no a common lock or memory barriers).
>
> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg().
> The map can't be NULL there.
>
> Regarding to shrinker_deferred you should prove either this is not a problem too,
> or to add proper synchronization (maybe, based on barriers) or to add some similar check
> (maybe, in shrink_slab_memcg() too).

It seems shrink_slab_memcg() might see shrinker_deferred as NULL
too, for the same reason. I don't think there is a guarantee it
won't happen.

We just need to guarantee CSS_ONLINE is seen after shrinker_maps and
shrinker_deferred are allocated, so I suppose a barrier before
"css->flags |= CSS_ONLINE" should work.

So the below patch may be ok:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index df128cab900f..9f7fb0450d69 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
                return -ENOMEM;
        }


+       /*
+        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees shrinker_maps
+        * and shrinker_deferred before CSS_ONLINE.
+        */
+       smp_mb();
+
        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
        css_get(css);


Or adding one more check for shrinker_deferred sounds acceptable as well.
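
For completeness, one possible shape of that extra check (a sketch only,
mirroring the existing NULL-map check in shrink_slab_memcg(), not anything
already in this series):

	if (per_memcg_deferred) {
		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
						     true);
		if (unlikely(!deferred))	/* not allocated yet, nothing deferred */
			return 0;
		nr = atomic_long_xchg(&deferred->nr_deferred[id], 0);
	}

set_nr_deferred() would need the symmetric check, simply dropping the deferral
when the array is not there yet.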

> Kirill
>
Kirill Tkhai Dec. 9, 2020, 3:41 p.m. UTC | #5
On 08.12.2020 20:13, Yang Shi wrote:
> On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>
>> On 02.12.2020 21:27, Yang Shi wrote:
>>> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
>>> will be used in the following cases:
>>>     1. Non memcg aware shrinkers
>>>     2. !CONFIG_MEMCG
>>>     3. memcg is disabled by boot parameter
>>>
>>> Signed-off-by: Yang Shi <shy828301@gmail.com>
>>> ---
>>>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
>>>  1 file changed, 82 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index cba0bc8d4661..d569fdcaba79 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
>>>  static DEFINE_IDR(shrinker_idr);
>>>  static int shrinker_nr_max;
>>>
>>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
>>> +{
>>> +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
>>> +             !mem_cgroup_disabled();
>>> +}
>>> +
>>>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>>>  {
>>>       int id, ret = -ENOMEM;
>>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>>>  #endif
>>>       return false;
>>>  }
>>> +
>>> +static inline long count_nr_deferred(struct shrinker *shrinker,
>>> +                                  struct shrink_control *sc)
>>> +{
>>> +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
>>> +     struct memcg_shrinker_deferred *deferred;
>>> +     struct mem_cgroup *memcg = sc->memcg;
>>> +     int nid = sc->nid;
>>> +     int id = shrinker->id;
>>> +     long nr;
>>> +
>>> +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
>>> +             nid = 0;
>>> +
>>> +     if (per_memcg_deferred) {
>>> +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
>>> +                                                  true);
>>
>> My comment is about both 5/9 and 6/9 patches.
> 
> Sorry for the late reply, I don't know why Gmail filtered this out to spam.
> 
>>
>> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag
>> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
>> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
>> because of processor reordering on !x86 (there is no a common lock or memory barriers).
>>
>> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg().
>> The map can't be NULL there.
>>
>> Regarding to shrinker_deferred you should prove either this is not a problem too,
>> or to add proper synchronization (maybe, based on barriers) or to add some similar check
>> (maybe, in shrink_slab_memcg() too).
> 
> It seems shrink_slab_memcg() might see shrinker_deferred as NULL
> either due to the same reason. I don't think there is a guarantee it
> won't happen.
> 
> We just need guarantee CSS_ONLINE is seen after shrinker_maps and
> shrinker_deferred are allocated, so I'm supposed barriers before
> "css->flags |= CSS_ONLINE" should work.
> 
> So the below patch may be ok:
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index df128cab900f..9f7fb0450d69 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct
> cgroup_subsys_state *css)
>                 return -ENOMEM;
>         }
> 
> 
> +       /*
> +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
> shirnker_maps
> +        * and shrinker_deferred before CSS_ONLINE.
> +        */
> +       smp_mb();
> +
>         /* Online state pins memcg ID, memcg ID pins CSS */
>         refcount_set(&memcg->id.ref, 1);
>         css_get(css);

smp barriers synchronize data access from different cpus, and they should go in a pair.
In case you add the smp barrier into mem_cgroup_css_online(), we should also
add one more smp barrier in the other place we want to synchronize with this one.
Also, every place should contain a comment referring to its pair: "Pairs with...".
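
For illustration, the shape being asked for is roughly this (a sketch of the
barrier pairing only, not a patch -- Yang's follow-up below turns it into an
actual change to mem_cgroup_css_online() and shrink_slab_memcg()):

	/* writer side, after the maps/deferred pointers have been stored */
	smp_wmb();	/* Pairs with smp_rmb() in shrink_slab_memcg() */
	css->flags |= CSS_ONLINE;

	/* reader side, after mem_cgroup_online() has returned true */
	smp_rmb();	/* Pairs with smp_wmb() in mem_cgroup_css_online() */
	deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
					     true);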

Kirill
Yang Shi Dec. 9, 2020, 5:32 p.m. UTC | #6
On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>
> On 08.12.2020 20:13, Yang Shi wrote:
> > On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> >>
> >> On 02.12.2020 21:27, Yang Shi wrote:
> >>> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> >>> will be used in the following cases:
> >>>     1. Non memcg aware shrinkers
> >>>     2. !CONFIG_MEMCG
> >>>     3. memcg is disabled by boot parameter
> >>>
> >>> Signed-off-by: Yang Shi <shy828301@gmail.com>
> >>> ---
> >>>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
> >>>  1 file changed, 82 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >>> index cba0bc8d4661..d569fdcaba79 100644
> >>> --- a/mm/vmscan.c
> >>> +++ b/mm/vmscan.c
> >>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >>>  static DEFINE_IDR(shrinker_idr);
> >>>  static int shrinker_nr_max;
> >>>
> >>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> >>> +{
> >>> +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> >>> +             !mem_cgroup_disabled();
> >>> +}
> >>> +
> >>>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> >>>  {
> >>>       int id, ret = -ENOMEM;
> >>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >>>  #endif
> >>>       return false;
> >>>  }
> >>> +
> >>> +static inline long count_nr_deferred(struct shrinker *shrinker,
> >>> +                                  struct shrink_control *sc)
> >>> +{
> >>> +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> >>> +     struct memcg_shrinker_deferred *deferred;
> >>> +     struct mem_cgroup *memcg = sc->memcg;
> >>> +     int nid = sc->nid;
> >>> +     int id = shrinker->id;
> >>> +     long nr;
> >>> +
> >>> +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> >>> +             nid = 0;
> >>> +
> >>> +     if (per_memcg_deferred) {
> >>> +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> >>> +                                                  true);
> >>
> >> My comment is about both 5/9 and 6/9 patches.
> >
> > Sorry for the late reply, I don't know why Gmail filtered this out to spam.
> >
> >>
> >> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag
> >> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
> >> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
> >> because of processor reordering on !x86 (there is no a common lock or memory barriers).
> >>
> >> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg().
> >> The map can't be NULL there.
> >>
> >> Regarding to shrinker_deferred you should prove either this is not a problem too,
> >> or to add proper synchronization (maybe, based on barriers) or to add some similar check
> >> (maybe, in shrink_slab_memcg() too).
> >
> > It seems shrink_slab_memcg() might see shrinker_deferred as NULL
> > either due to the same reason. I don't think there is a guarantee it
> > won't happen.
> >
> > We just need guarantee CSS_ONLINE is seen after shrinker_maps and
> > shrinker_deferred are allocated, so I'm supposed barriers before
> > "css->flags |= CSS_ONLINE" should work.
> >
> > So the below patch may be ok:
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index df128cab900f..9f7fb0450d69 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct
> > cgroup_subsys_state *css)
> >                 return -ENOMEM;
> >         }
> >
> >
> > +       /*
> > +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
> > shirnker_maps
> > +        * and shrinker_deferred before CSS_ONLINE.
> > +        */
> > +       smp_mb();
> > +
> >         /* Online state pins memcg ID, memcg ID pins CSS */
> >         refcount_set(&memcg->id.ref, 1);
> >         css_get(css);
>
> smp barriers synchronize data access from different cpus. They should go in a pair.
> In case of you add the smp barrier into mem_cgroup_css_online(), we should also
> add one more smp barrier in another place, which we want to synchonize with this.
> Also, every place should contain a comment referring to its pair: "Pairs with...".

Thanks, I think you are correct. Looking into it further, it seems the
race pattern looks like:

CPU A                                    CPU B
store shrinker_maps pointer              load CSS_ONLINE
store CSS_ONLINE                         load shrinker_maps pointer

By checking the memory-barriers document, it seems we need a write
barrier/read barrier pair as below:

CPU A                                    CPU B
store shrinker_maps pointer              load CSS_ONLINE
<write barrier>                          <read barrier>
store CSS_ONLINE                         load shrinker_maps pointer


So, the patch should look like:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index df128cab900f..489c0a84f82b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
                return -ENOMEM;
        }

+       /*
+        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees shrinker_maps
+        * and shrinker_deferred before CSS_ONLINE. It pairs with the read barrier
+        * in shrink_slab_memcg().
+        */
+       smp_wmb();
+
        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
        css_get(css);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9d2a6485e982..fc9bda576d98 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -603,13 +603,15 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
        if (!mem_cgroup_online(memcg))
                return 0;

+       /* Pairs with write barrier in mem_cgroup_css_online */
+       smp_rmb();
+
        if (!down_read_trylock(&shrinker_rwsem))
                return 0;

+       /* Once memcg is online it can't be NULL */
        map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
                                        true);
-       if (unlikely(!map))
-               goto unlock;

        for_each_set_bit(i, map->map, shrinker_nr_max) {
                struct shrink_control sc = {


Does this seem correct?

>
> Kirill
>
Johannes Weiner Dec. 10, 2020, 3:13 p.m. UTC | #7
On Wed, Dec 09, 2020 at 09:32:37AM -0800, Yang Shi wrote:
> On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> >
> > On 08.12.2020 20:13, Yang Shi wrote:
> > > On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> > >>
> > >> On 02.12.2020 21:27, Yang Shi wrote:
> > >>> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> > >>> will be used in the following cases:
> > >>>     1. Non memcg aware shrinkers
> > >>>     2. !CONFIG_MEMCG
> > >>>     3. memcg is disabled by boot parameter
> > >>>
> > >>> Signed-off-by: Yang Shi <shy828301@gmail.com>
> > >>> ---
> > >>>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
> > >>>  1 file changed, 82 insertions(+), 6 deletions(-)
> > >>>
> > >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> > >>> index cba0bc8d4661..d569fdcaba79 100644
> > >>> --- a/mm/vmscan.c
> > >>> +++ b/mm/vmscan.c
> > >>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
> > >>>  static DEFINE_IDR(shrinker_idr);
> > >>>  static int shrinker_nr_max;
> > >>>
> > >>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> > >>> +{
> > >>> +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> > >>> +             !mem_cgroup_disabled();
> > >>> +}
> > >>> +
> > >>>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> > >>>  {
> > >>>       int id, ret = -ENOMEM;
> > >>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> > >>>  #endif
> > >>>       return false;
> > >>>  }
> > >>> +
> > >>> +static inline long count_nr_deferred(struct shrinker *shrinker,
> > >>> +                                  struct shrink_control *sc)
> > >>> +{
> > >>> +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> > >>> +     struct memcg_shrinker_deferred *deferred;
> > >>> +     struct mem_cgroup *memcg = sc->memcg;
> > >>> +     int nid = sc->nid;
> > >>> +     int id = shrinker->id;
> > >>> +     long nr;
> > >>> +
> > >>> +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> > >>> +             nid = 0;
> > >>> +
> > >>> +     if (per_memcg_deferred) {
> > >>> +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> > >>> +                                                  true);
> > >>
> > >> My comment is about both 5/9 and 6/9 patches.
> > >
> > > Sorry for the late reply, I don't know why Gmail filtered this out to spam.
> > >
> > >>
> > >> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag
> > >> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
> > >> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
> > >> because of processor reordering on !x86 (there is no a common lock or memory barriers).
> > >>
> > >> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg().
> > >> The map can't be NULL there.
> > >>
> > >> Regarding to shrinker_deferred you should prove either this is not a problem too,
> > >> or to add proper synchronization (maybe, based on barriers) or to add some similar check
> > >> (maybe, in shrink_slab_memcg() too).
> > >
> > > It seems shrink_slab_memcg() might see shrinker_deferred as NULL
> > > either due to the same reason. I don't think there is a guarantee it
> > > won't happen.
> > >
> > > We just need guarantee CSS_ONLINE is seen after shrinker_maps and
> > > shrinker_deferred are allocated, so I'm supposed barriers before
> > > "css->flags |= CSS_ONLINE" should work.
> > >
> > > So the below patch may be ok:
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index df128cab900f..9f7fb0450d69 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct
> > > cgroup_subsys_state *css)
> > >                 return -ENOMEM;
> > >         }
> > >
> > >
> > > +       /*
> > > +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
> > > shirnker_maps
> > > +        * and shrinker_deferred before CSS_ONLINE.
> > > +        */
> > > +       smp_mb();
> > > +
> > >         /* Online state pins memcg ID, memcg ID pins CSS */
> > >         refcount_set(&memcg->id.ref, 1);
> > >         css_get(css);
> >
> > smp barriers synchronize data access from different cpus. They should go in a pair.
> > In case of you add the smp barrier into mem_cgroup_css_online(), we should also
> > add one more smp barrier in another place, which we want to synchonize with this.
> > Also, every place should contain a comment referring to its pair: "Pairs with...".
> 
> Thanks, I think you are correct. Looked into it further, it seems the
> race pattern looks like:
> 
> CPU A                                                                  CPU B
> store shrinker_maps pointer                      load CSS_ONLINE
> store CSS_ONLINE                                   load shrinker_maps pointer
> 
> By checking the memory-barriers document, it seems we need write
> barrier/read barrier pair as below:
> 
> CPU A                                                                  CPU B
> store shrinker_maps pointer                       load CSS_ONLINE
> <write barrier>                                             <read barrier>
> store CSS_ONLINE                                    load shrinker_maps pointer
> 
> 
> So, the patch should look like:
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index df128cab900f..489c0a84f82b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct
> cgroup_subsys_state *css)
>                 return -ENOMEM;
>         }
> 
> +       /*
> +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
> shirnker_maps
> +        * and shrinker_deferred before CSS_ONLINE. It pairs with the
> read barrier
> +        * in shrink_slab_memcg().
> +        */
> +       smp_wmb();

Is there a reason why the shrinker allocations aren't done in
.css_alloc()? That would take care of all necessary ordering:

      #0
      css = ->css_alloc()
      list_add_tail_rcu(css, parent->children)
        rcu_assign_pointer()
      ->css_online(css)
      css->flags |= CSS_ONLINE

      #1
      memcg = mem_cgroup_iter()
        list_entry_rcu()
	  rcu_dereference()
      shrink_slab(.., memcg)

RCU ensures that once the cgroup shows up in the reclaim cgroup it's
also fully allocated.
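
Expanded a little, the guarantee looks like this (illustrative, compressed
sketch -- the RCU primitives are the real API, the surrounding names are not
taken from this series):

	/* #0: ->css_alloc(): initialize fully, then publish */
	pn->shrinker_map = kvzalloc_node(map_size, GFP_KERNEL, nid);
	/*
	 * list_add_tail_rcu() publishes with rcu_assign_pointer(): any
	 * reader that finds the new css also sees the stores made before
	 * the insertion, i.e. the allocated map.
	 */
	list_add_tail_rcu(&css->sibling, &parent_css->children);

	/* #1: reclaim walks the children list under RCU via mem_cgroup_iter() */
	rcu_read_lock();
	list_for_each_entry_rcu(css, &parent_css->children, sibling)
		/* this css's shrinker_map is guaranteed to be non-NULL */
		shrink_slab(gfp_mask, nid, mem_cgroup_from_css(css), priority);
	rcu_read_unlock();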

>         /* Online state pins memcg ID, memcg ID pins CSS */
>         refcount_set(&memcg->id.ref, 1);
>         css_get(css);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9d2a6485e982..fc9bda576d98 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -603,13 +603,15 @@ static unsigned long shrink_slab_memcg(gfp_t
> gfp_mask, int nid,
>         if (!mem_cgroup_online(memcg))
>                 return 0;

...then we should be able to delete this online check here entirely:

A not-yet online cgroup is guaranteed to have a shrinker map, just
with no bits set. The shrinker code handles that just fine.

An offlined cgroup will eventually have an empty bitmap as the called
shrinkers return SHRINK_EMPTY. This could also be shortcut by clearing
the bit in memcg_drain_list_lru_node() the same way we set it in the
parent when we move all objects upward, but seems correct as-is.
Kirill Tkhai Dec. 10, 2020, 3:17 p.m. UTC | #8
On 10.12.2020 18:13, Johannes Weiner wrote:
> On Wed, Dec 09, 2020 at 09:32:37AM -0800, Yang Shi wrote:
>> On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>>
>>> On 08.12.2020 20:13, Yang Shi wrote:
>>>> On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>>>>
>>>>> On 02.12.2020 21:27, Yang Shi wrote:
>>>>>> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
>>>>>> will be used in the following cases:
>>>>>>     1. Non memcg aware shrinkers
>>>>>>     2. !CONFIG_MEMCG
>>>>>>     3. memcg is disabled by boot parameter
>>>>>>
>>>>>> Signed-off-by: Yang Shi <shy828301@gmail.com>
>>>>>> ---
>>>>>>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>>>  1 file changed, 82 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>> index cba0bc8d4661..d569fdcaba79 100644
>>>>>> --- a/mm/vmscan.c
>>>>>> +++ b/mm/vmscan.c
>>>>>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
>>>>>>  static DEFINE_IDR(shrinker_idr);
>>>>>>  static int shrinker_nr_max;
>>>>>>
>>>>>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
>>>>>> +{
>>>>>> +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
>>>>>> +             !mem_cgroup_disabled();
>>>>>> +}
>>>>>> +
>>>>>>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>>>>>>  {
>>>>>>       int id, ret = -ENOMEM;
>>>>>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>>>>>>  #endif
>>>>>>       return false;
>>>>>>  }
>>>>>> +
>>>>>> +static inline long count_nr_deferred(struct shrinker *shrinker,
>>>>>> +                                  struct shrink_control *sc)
>>>>>> +{
>>>>>> +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
>>>>>> +     struct memcg_shrinker_deferred *deferred;
>>>>>> +     struct mem_cgroup *memcg = sc->memcg;
>>>>>> +     int nid = sc->nid;
>>>>>> +     int id = shrinker->id;
>>>>>> +     long nr;
>>>>>> +
>>>>>> +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
>>>>>> +             nid = 0;
>>>>>> +
>>>>>> +     if (per_memcg_deferred) {
>>>>>> +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
>>>>>> +                                                  true);
>>>>>
>>>>> My comment is about both 5/9 and 6/9 patches.
>>>>
>>>> Sorry for the late reply, I don't know why Gmail filtered this out to spam.
>>>>
>>>>>
>>>>> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag
>>>>> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
>>>>> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
>>>>> because of processor reordering on !x86 (there is no a common lock or memory barriers).
>>>>>
>>>>> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg().
>>>>> The map can't be NULL there.
>>>>>
>>>>> Regarding to shrinker_deferred you should prove either this is not a problem too,
>>>>> or to add proper synchronization (maybe, based on barriers) or to add some similar check
>>>>> (maybe, in shrink_slab_memcg() too).
>>>>
>>>> It seems shrink_slab_memcg() might see shrinker_deferred as NULL
>>>> either due to the same reason. I don't think there is a guarantee it
>>>> won't happen.
>>>>
>>>> We just need guarantee CSS_ONLINE is seen after shrinker_maps and
>>>> shrinker_deferred are allocated, so I'm supposed barriers before
>>>> "css->flags |= CSS_ONLINE" should work.
>>>>
>>>> So the below patch may be ok:
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index df128cab900f..9f7fb0450d69 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct
>>>> cgroup_subsys_state *css)
>>>>                 return -ENOMEM;
>>>>         }
>>>>
>>>>
>>>> +       /*
>>>> +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
>>>> shirnker_maps
>>>> +        * and shrinker_deferred before CSS_ONLINE.
>>>> +        */
>>>> +       smp_mb();
>>>> +
>>>>         /* Online state pins memcg ID, memcg ID pins CSS */
>>>>         refcount_set(&memcg->id.ref, 1);
>>>>         css_get(css);
>>>
>>> smp barriers synchronize data access from different cpus. They should go in a pair.
>>> In case of you add the smp barrier into mem_cgroup_css_online(), we should also
>>> add one more smp barrier in another place, which we want to synchonize with this.
>>> Also, every place should contain a comment referring to its pair: "Pairs with...".
>>
>> Thanks, I think you are correct. Looked into it further, it seems the
>> race pattern looks like:
>>
>> CPU A                                                                  CPU B
>> store shrinker_maps pointer                      load CSS_ONLINE
>> store CSS_ONLINE                                   load shrinker_maps pointer
>>
>> By checking the memory-barriers document, it seems we need write
>> barrier/read barrier pair as below:
>>
>> CPU A                                                                  CPU B
>> store shrinker_maps pointer                       load CSS_ONLINE
>> <write barrier>                                             <read barrier>
>> store CSS_ONLINE                                    load shrinker_maps pointer
>>
>>
>> So, the patch should look like:
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index df128cab900f..489c0a84f82b 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct
>> cgroup_subsys_state *css)
>>                 return -ENOMEM;
>>         }
>>
>> +       /*
>> +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
>> shirnker_maps
>> +        * and shrinker_deferred before CSS_ONLINE. It pairs with the
>> read barrier
>> +        * in shrink_slab_memcg().
>> +        */
>> +       smp_wmb();
> 
> Is there a reason why the shrinker allocations aren't done in
> .css_alloc()? That would take care of all necessary ordering:

The reason is that the allocations have to be made in a place where
mem_cgroup_iter() can't miss them, since memcg_expand_shrinker_maps()
shouldn't miss already-allocated shrinker maps.

> 
>       #0
>       css = ->css_alloc()
>       list_add_tail_rcu(css, parent->children)
>         rcu_assign_pointer()
>       ->css_online(css)
>       css->flags |= CSS_ONLINE
> 
>       #1
>       memcg = mem_cgroup_iter()
>         list_entry_rcu()
> 	  rcu_dereference()
>       shrink_slab(.., memcg)
> 
> RCU ensures that once the cgroup shows up in the reclaim cgroup it's
> also fully allocated.
> 
>>         /* Online state pins memcg ID, memcg ID pins CSS */
>>         refcount_set(&memcg->id.ref, 1);
>>         css_get(css);
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 9d2a6485e982..fc9bda576d98 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -603,13 +603,15 @@ static unsigned long shrink_slab_memcg(gfp_t
>> gfp_mask, int nid,
>>         if (!mem_cgroup_online(memcg))
>>                 return 0;
> 
> ...then we should be able to delete this online check here entirely:
> 
> A not-yet online cgroup is guaranteed to have a shrinker map, just
> with no bits set. The shrinker code handles that just fine.
> 
> An offlined cgroup will eventually have an empty bitmap as the called
> shrinkers return SHRINK_EMPTY. This could also be shortcut by clearing
> the bit in memcg_drain_list_lru_node() the same way we set it in the
> parent when we move all objects upward, but seems correct as-is.
>
Johannes Weiner Dec. 15, 2020, 4:44 p.m. UTC | #9
On Thu, Dec 10, 2020 at 06:17:54PM +0300, Kirill Tkhai wrote:
> On 10.12.2020 18:13, Johannes Weiner wrote:
> > On Wed, Dec 09, 2020 at 09:32:37AM -0800, Yang Shi wrote:
> >> On Wed, Dec 9, 2020 at 7:42 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> >>>
> >>> On 08.12.2020 20:13, Yang Shi wrote:
> >>>> On Thu, Dec 3, 2020 at 3:40 AM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> >>>>>
> >>>>> On 02.12.2020 21:27, Yang Shi wrote:
> >>>>>> Use per memcg's nr_deferred for memcg aware shrinkers.  The shrinker's nr_deferred
> >>>>>> will be used in the following cases:
> >>>>>>     1. Non memcg aware shrinkers
> >>>>>>     2. !CONFIG_MEMCG
> >>>>>>     3. memcg is disabled by boot parameter
> >>>>>>
> >>>>>> Signed-off-by: Yang Shi <shy828301@gmail.com>
> >>>>>> ---
> >>>>>>  mm/vmscan.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----
> >>>>>>  1 file changed, 82 insertions(+), 6 deletions(-)
> >>>>>>
> >>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >>>>>> index cba0bc8d4661..d569fdcaba79 100644
> >>>>>> --- a/mm/vmscan.c
> >>>>>> +++ b/mm/vmscan.c
> >>>>>> @@ -203,6 +203,12 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >>>>>>  static DEFINE_IDR(shrinker_idr);
> >>>>>>  static int shrinker_nr_max;
> >>>>>>
> >>>>>> +static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
> >>>>>> +{
> >>>>>> +     return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
> >>>>>> +             !mem_cgroup_disabled();
> >>>>>> +}
> >>>>>> +
> >>>>>>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> >>>>>>  {
> >>>>>>       int id, ret = -ENOMEM;
> >>>>>> @@ -271,7 +277,58 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >>>>>>  #endif
> >>>>>>       return false;
> >>>>>>  }
> >>>>>> +
> >>>>>> +static inline long count_nr_deferred(struct shrinker *shrinker,
> >>>>>> +                                  struct shrink_control *sc)
> >>>>>> +{
> >>>>>> +     bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
> >>>>>> +     struct memcg_shrinker_deferred *deferred;
> >>>>>> +     struct mem_cgroup *memcg = sc->memcg;
> >>>>>> +     int nid = sc->nid;
> >>>>>> +     int id = shrinker->id;
> >>>>>> +     long nr;
> >>>>>> +
> >>>>>> +     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
> >>>>>> +             nid = 0;
> >>>>>> +
> >>>>>> +     if (per_memcg_deferred) {
> >>>>>> +             deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
> >>>>>> +                                                  true);
> >>>>>
> >>>>> My comment is about both 5/9 and 6/9 patches.
> >>>>
> >>>> Sorry for the late reply, I don't know why Gmail filtered this out to spam.
> >>>>
> >>>>>
> >>>>> shrink_slab_memcg() races with mem_cgroup_css_online(). A visibility of CSS_ONLINE flag
> >>>>> in shrink_slab_memcg()->mem_cgroup_online() does not guarantee that you will see
> >>>>> memcg->nodeinfo[nid]->shrinker_deferred != NULL in count_nr_deferred(). This may occur
> >>>>> because of processor reordering on !x86 (there is no a common lock or memory barriers).
> >>>>>
> >>>>> Regarding to shrinker_map this is not a problem due to map check in shrink_slab_memcg().
> >>>>> The map can't be NULL there.
> >>>>>
> >>>>> Regarding to shrinker_deferred you should prove either this is not a problem too,
> >>>>> or to add proper synchronization (maybe, based on barriers) or to add some similar check
> >>>>> (maybe, in shrink_slab_memcg() too).
> >>>>
> >>>> It seems shrink_slab_memcg() might see shrinker_deferred as NULL
> >>>> either due to the same reason. I don't think there is a guarantee it
> >>>> won't happen.
> >>>>
> >>>> We just need guarantee CSS_ONLINE is seen after shrinker_maps and
> >>>> shrinker_deferred are allocated, so I'm supposed barriers before
> >>>> "css->flags |= CSS_ONLINE" should work.
> >>>>
> >>>> So the below patch may be ok:
> >>>>
> >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >>>> index df128cab900f..9f7fb0450d69 100644
> >>>> --- a/mm/memcontrol.c
> >>>> +++ b/mm/memcontrol.c
> >>>> @@ -5539,6 +5539,12 @@ static int mem_cgroup_css_online(struct
> >>>> cgroup_subsys_state *css)
> >>>>                 return -ENOMEM;
> >>>>         }
> >>>>
> >>>>
> >>>> +       /*
> >>>> +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
> >>>> shirnker_maps
> >>>> +        * and shrinker_deferred before CSS_ONLINE.
> >>>> +        */
> >>>> +       smp_mb();
> >>>> +
> >>>>         /* Online state pins memcg ID, memcg ID pins CSS */
> >>>>         refcount_set(&memcg->id.ref, 1);
> >>>>         css_get(css);
> >>>
> >>> smp barriers synchronize data access from different cpus. They should go in a pair.
> >>> In case of you add the smp barrier into mem_cgroup_css_online(), we should also
> >>> add one more smp barrier in another place, which we want to synchonize with this.
> >>> Also, every place should contain a comment referring to its pair: "Pairs with...".
> >>
> >> Thanks, I think you are correct. Looked into it further, it seems the
> >> race pattern looks like:
> >>
> >> CPU A                                                                  CPU B
> >> store shrinker_maps pointer                      load CSS_ONLINE
> >> store CSS_ONLINE                                   load shrinker_maps pointer
> >>
> >> By checking the memory-barriers document, it seems we need write
> >> barrier/read barrier pair as below:
> >>
> >> CPU A                                                                  CPU B
> >> store shrinker_maps pointer                       load CSS_ONLINE
> >> <write barrier>                                             <read barrier>
> >> store CSS_ONLINE                                    load shrinker_maps pointer
> >>
> >>
> >> So, the patch should look like:
> >>
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index df128cab900f..489c0a84f82b 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -5539,6 +5539,13 @@ static int mem_cgroup_css_online(struct
> >> cgroup_subsys_state *css)
> >>                 return -ENOMEM;
> >>         }
> >>
> >> +       /*
> >> +        * Barrier for CSS_ONLINE, so that shrink_slab_memcg() sees
> >> shirnker_maps
> >> +        * and shrinker_deferred before CSS_ONLINE. It pairs with the
> >> read barrier
> >> +        * in shrink_slab_memcg().
> >> +        */
> >> +       smp_wmb();
> > 
> > Is there a reason why the shrinker allocations aren't done in
> > .css_alloc()? That would take care of all necessary ordering:
> 
> The reason is that allocations have to be made in a place, where
> mem-cgroup_iter() can't miss it, since memcg_expand_shrinker_maps()
> shouldn't miss allocated shrinker maps.

I see, because we could have this:

.css_alloc()
  memcg_alloc_shrinker_maps()
    down_read(&shrinker_sem)
    map = alloc(shrinker_nr_max * sizeof(long));
    rcu_assign_pointer(memcg->...->shrinker_map = map);
    up_read(&shrinker_sem);
                                                            register_shrinker()
                                                              down_write(&shrinker_sem)
                                                              shrinker_nr_max = id + 1;
                                                              memcg_expand_shrinker_maps()
                                                                for_each_mem_cgroup()
                                                                  realloc
                                                              up_write(&shrinker_sem)
  list_add_tail_rcu(&css->sibling, &parent->children);

  /* boom: missed new shrinker, map too small */

Thanks for the clarification.

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index cba0bc8d4661..d569fdcaba79 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -203,6 +203,12 @@  static DECLARE_RWSEM(shrinker_rwsem);
 static DEFINE_IDR(shrinker_idr);
 static int shrinker_nr_max;
 
+static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
+{
+	return (shrinker->flags & SHRINKER_MEMCG_AWARE) &&
+		!mem_cgroup_disabled();
+}
+
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
 	int id, ret = -ENOMEM;
@@ -271,7 +277,58 @@  static bool writeback_throttling_sane(struct scan_control *sc)
 #endif
 	return false;
 }
+
+static inline long count_nr_deferred(struct shrinker *shrinker,
+				     struct shrink_control *sc)
+{
+	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
+	struct memcg_shrinker_deferred *deferred;
+	struct mem_cgroup *memcg = sc->memcg;
+	int nid = sc->nid;
+	int id = shrinker->id;
+	long nr;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	if (per_memcg_deferred) {
+		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
+						     true);
+		nr = atomic_long_xchg(&deferred->nr_deferred[id], 0);
+	} else
+		nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	return nr;
+}
+
+static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
+				   struct shrink_control *sc)
+{
+	bool per_memcg_deferred = is_deferred_memcg_aware(shrinker) && sc->memcg;
+	struct memcg_shrinker_deferred *deferred;
+	struct mem_cgroup *memcg = sc->memcg;
+	int nid = sc->nid;
+	int id = shrinker->id;
+	long new_nr;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	if (per_memcg_deferred) {
+		deferred = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_deferred,
+						     true);
+		new_nr = atomic_long_add_return(nr, &deferred->nr_deferred[id]);
+	} else
+		new_nr = atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+
+	return new_nr;
+}
 #else
+static inline bool is_deferred_memcg_aware(struct shrinker *shrinker)
+{
+	return false;
+}
+
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
 	return 0;
@@ -290,6 +347,29 @@  static bool writeback_throttling_sane(struct scan_control *sc)
 {
 	return true;
 }
+
+static inline long count_nr_deferred(struct shrinker *shrinker,
+				     struct shrink_control *sc)
+{
+	int nid = sc->nid;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+static inline long set_nr_deferred(long nr, struct shrinker *shrinker,
+				   struct shrink_control *sc)
+{
+	int nid = sc->nid;
+
+	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+		nid = 0;
+
+	return atomic_long_add_return(nr,
+				      &shrinker->nr_deferred[nid]);
+}
 #endif
 
 /*
@@ -429,13 +509,10 @@  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	long freeable;
 	long nr;
 	long new_nr;
-	int nid = shrinkctl->nid;
 	long batch_size = shrinker->batch ? shrinker->batch
 					  : SHRINK_BATCH;
 	long scanned = 0, next_deferred;
 
-	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-		nid = 0;
 
 	freeable = shrinker->count_objects(shrinker, shrinkctl);
 	if (freeable == 0 || freeable == SHRINK_EMPTY)
@@ -446,7 +523,7 @@  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 * and zero it so that other concurrent shrinker invocations
 	 * don't also do this scanning work.
 	 */
-	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+	nr = count_nr_deferred(shrinker, shrinkctl);
 
 	total_scan = nr;
 	if (shrinker->seeks) {
@@ -539,8 +616,7 @@  static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 * move the unused scan count back into the shrinker in a
 	 * manner that handles concurrent updates.
 	 */
-	new_nr = atomic_long_add_return(next_deferred,
-					&shrinker->nr_deferred[nid]);
+	new_nr = set_nr_deferred(next_deferred, shrinker, shrinkctl);
 
 	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
 	return freed;