diff mbox series

mm, memcg: clear page protection when memcg oom group happens

Message ID 1574676893-1571-1-git-send-email-laoar.shao@gmail.com (mailing list archive)
State New, archived
Headers show
Series mm, memcg: clear page protection when memcg oom group happens | expand

Commit Message

Yafang Shao Nov. 25, 2019, 10:14 a.m. UTC
We set memory.oom.group so that all processes in this memcg are killed by
the OOM killer to free more pages. In this case, it doesn't make sense to
keep protecting the pages with memory.{min, low} if they are set.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/memcontrol.h | 11 +++++++++++
 mm/memcontrol.c            |  4 +---
 mm/oom_kill.c              |  1 +
 3 files changed, 13 insertions(+), 3 deletions(-)

Comments

Michal Hocko Nov. 25, 2019, 11:08 a.m. UTC | #1
On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> We set memory.oom.group to make all processes in this memcg are killed by
> OOM killer to free more pages. In this case, it doesn't make sense to
> protect the pages with memroy.{min, low} again if they are set.

I do not see why. What does group OOM killing have to do with
the reclaim protection? What is the actual problem you are trying to
solve?

> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  include/linux/memcontrol.h | 11 +++++++++++
>  mm/memcontrol.c            |  4 +---
>  mm/oom_kill.c              |  1 +
>  3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0c762e8..f68a1a5 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -369,6 +369,13 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
>  		   READ_ONCE(memcg->memory.elow));
>  }
>  
> +static inline void mem_cgroup_clear_protection(struct mem_cgroup *memcg)
> +{
> +
> +	page_counter_set_min(&memcg->memory, 0);
> +	page_counter_set_low(&memcg->memory, 0);
> +}
> +
>  enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
>  						struct mem_cgroup *memcg);
>  
> @@ -850,6 +857,10 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
>  	return 0;
>  }
>  
> +static inline void mem_cgroup_clear_protection(struct mem_cgroup *memcg)
> +{
> +}
> +
>  static inline enum mem_cgroup_protection mem_cgroup_protected(
>  	struct mem_cgroup *root, struct mem_cgroup *memcg)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1c4c08b..e5ab119 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5190,9 +5190,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	}
>  	spin_unlock(&memcg->event_list_lock);
>  
> -	page_counter_set_min(&memcg->memory, 0);
> -	page_counter_set_low(&memcg->memory, 0);
> -
> +	mem_cgroup_clear_protection(memcg);
>  	memcg_offline_kmem(memcg);
>  	wb_memcg_offline(memcg);
>  
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 93eae76..550f830 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -988,6 +988,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
>  		mem_cgroup_print_oom_group(oom_group);
>  		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
>  				      (void*)message);
> +		mem_cgroup_clear_protection(oom_group);
>  		mem_cgroup_put(oom_group);
>  	}
>  }
> -- 
> 1.8.3.1
Yafang Shao Nov. 25, 2019, 11:37 a.m. UTC | #2
On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > We set memory.oom.group to make all processes in this memcg are killed by
> > OOM killer to free more pages. In this case, it doesn't make sense to
> > protect the pages with memroy.{min, low} again if they are set.
>
> I do not see why? What does group OOM killing has anything to do with
> the reclaim protection? What is the actual problem you are trying to
> solve?
>

The cgroup is treated as an indivisible workload when cgroup.oom.group
is set and the OOM killer is trying to kill a process in this cgroup.
We set cgroup.oom.group to guarantee the workload's integrity; now
that the processes are all killed, why keep the page cache here?

Thanks
Yafang
Michal Hocko Nov. 25, 2019, 11:54 a.m. UTC | #3
On Mon 25-11-19 19:37:59, Yafang Shao wrote:
> On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > > We set memory.oom.group to make all processes in this memcg are killed by
> > > OOM killer to free more pages. In this case, it doesn't make sense to
> > > protect the pages with memroy.{min, low} again if they are set.
> >
> > I do not see why? What does group OOM killing has anything to do with
> > the reclaim protection? What is the actual problem you are trying to
> > solve?
> >
> 
> The cgroup is treated as a indivisible  workload when cgroup.oom.group
> is set and OOM killer is trying to kill a prcess in this cgroup.

Yes this is true.

> We set cgroup.oom.group is to  guarantee the workload integrity, now
> that processes ara all killed, why keeps the page cache here?

Because an administrator has configured the reclaim protection in a
certain way and hopefully had a good reason to do that. We are not going
to override that configuration just because the OOM killer was invoked
and killed tasks in that memcg. The workload might get restarted and it
would suddenly run under different constraints, which is not
expected.

In short, the kernel should never silently change the configuration made by
an administrator.
Yafang Shao Nov. 25, 2019, 12:17 p.m. UTC | #4
On Mon, Nov 25, 2019 at 7:54 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 25-11-19 19:37:59, Yafang Shao wrote:
> > On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > > > We set memory.oom.group to make all processes in this memcg are killed by
> > > > OOM killer to free more pages. In this case, it doesn't make sense to
> > > > protect the pages with memroy.{min, low} again if they are set.
> > >
> > > I do not see why? What does group OOM killing has anything to do with
> > > the reclaim protection? What is the actual problem you are trying to
> > > solve?
> > >
> >
> > The cgroup is treated as a indivisible  workload when cgroup.oom.group
> > is set and OOM killer is trying to kill a prcess in this cgroup.
>
> Yes this is true.
>
> > We set cgroup.oom.group is to  guarantee the workload integrity, now
> > that processes ara all killed, why keeps the page cache here?
>
> Because an administrator has configured the reclaim protection in a
> certain way and hopefully had a good reason to do that. We are not going
> to override that configure just because there is on OOM killer invoked
> and killed tasks in that memcg. The workload might get restarted and it
> would run under a different constrains all of the sudden which is not
> expected.
>
> In short kernel should never silently change the configuration made by
> an admistrator.

Understood.

So what about the changes below? We don't override the admin setting,
but we reclaim the page caches from the memcg if it is OOM killed.
Something like,

mem_cgroup_protected
{
...
+       if (!cgroup_is_populated(memcg->css.cgroup) &&
mem_cgroup_under_oom_group_kill(memcg))
+               return MEMCG_PROT_NONE;
+
        usage = page_counter_read(&memcg->memory);
        if (!usage)
                return MEMCG_PROT_NONE;
}
Michal Hocko Nov. 25, 2019, 12:31 p.m. UTC | #5
On Mon 25-11-19 20:17:15, Yafang Shao wrote:
> On Mon, Nov 25, 2019 at 7:54 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 25-11-19 19:37:59, Yafang Shao wrote:
> > > On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > >
> > > > On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > > > > We set memory.oom.group to make all processes in this memcg are killed by
> > > > > OOM killer to free more pages. In this case, it doesn't make sense to
> > > > > protect the pages with memroy.{min, low} again if they are set.
> > > >
> > > > I do not see why? What does group OOM killing has anything to do with
> > > > the reclaim protection? What is the actual problem you are trying to
> > > > solve?
> > > >
> > >
> > > The cgroup is treated as a indivisible  workload when cgroup.oom.group
> > > is set and OOM killer is trying to kill a prcess in this cgroup.
> >
> > Yes this is true.
> >
> > > We set cgroup.oom.group is to  guarantee the workload integrity, now
> > > that processes ara all killed, why keeps the page cache here?
> >
> > Because an administrator has configured the reclaim protection in a
> > certain way and hopefully had a good reason to do that. We are not going
> > to override that configure just because there is on OOM killer invoked
> > and killed tasks in that memcg. The workload might get restarted and it
> > would run under a different constrains all of the sudden which is not
> > expected.
> >
> > In short kernel should never silently change the configuration made by
> > an admistrator.
> 
> Understood.
> 
> So what about bellow changes ? We don't override the admin setting,
> but we reclaim the page caches from it if this memcg is oom killed.
> Something like,
> 
> mem_cgroup_protected
> {
> ...
> +       if (!cgroup_is_populated(memcg->css.cgroup) &&
> mem_cgroup_under_oom_group_kill(memcg))
> +               return MEMCG_PROT_NONE;
> +
>         usage = page_counter_read(&memcg->memory);
>         if (!usage)
>                 return MEMCG_PROT_NONE;
> }

I assume that mem_cgroup_under_oom_group_kill is essentially
	memcg->under_oom && memcg->oom_group
But that doesn't really help much because all the reclaim attempts have
been already attempted and failed. I do not remember exact details about
under_oom but I have a recollection that it wouldn't really work for
cgroup v2 because the oom_control is not in place and so the state would
be set for only very short time period.

Again, what is a problem that you are trying to fix?
Yafang Shao Nov. 25, 2019, 12:37 p.m. UTC | #6
On Mon, Nov 25, 2019 at 8:31 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 25-11-19 20:17:15, Yafang Shao wrote:
> > On Mon, Nov 25, 2019 at 7:54 PM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Mon 25-11-19 19:37:59, Yafang Shao wrote:
> > > > On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > > >
> > > > > On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > > > > > We set memory.oom.group to make all processes in this memcg are killed by
> > > > > > OOM killer to free more pages. In this case, it doesn't make sense to
> > > > > > protect the pages with memroy.{min, low} again if they are set.
> > > > >
> > > > > I do not see why? What does group OOM killing has anything to do with
> > > > > the reclaim protection? What is the actual problem you are trying to
> > > > > solve?
> > > > >
> > > >
> > > > The cgroup is treated as a indivisible  workload when cgroup.oom.group
> > > > is set and OOM killer is trying to kill a prcess in this cgroup.
> > >
> > > Yes this is true.
> > >
> > > > We set cgroup.oom.group is to  guarantee the workload integrity, now
> > > > that processes ara all killed, why keeps the page cache here?
> > >
> > > Because an administrator has configured the reclaim protection in a
> > > certain way and hopefully had a good reason to do that. We are not going
> > > to override that configure just because there is on OOM killer invoked
> > > and killed tasks in that memcg. The workload might get restarted and it
> > > would run under a different constrains all of the sudden which is not
> > > expected.
> > >
> > > In short kernel should never silently change the configuration made by
> > > an admistrator.
> >
> > Understood.
> >
> > So what about bellow changes ? We don't override the admin setting,
> > but we reclaim the page caches from it if this memcg is oom killed.
> > Something like,
> >
> > mem_cgroup_protected
> > {
> > ...
> > +       if (!cgroup_is_populated(memcg->css.cgroup) &&
> > mem_cgroup_under_oom_group_kill(memcg))
> > +               return MEMCG_PROT_NONE;
> > +
> >         usage = page_counter_read(&memcg->memory);
> >         if (!usage)
> >                 return MEMCG_PROT_NONE;
> > }
>
> I assume that mem_cgroup_under_oom_group_kill is essentially
>         memcg->under_oom && memcg->oom_group
> But that doesn't really help much because all the reclaim attempts have
> been already attempted and failed. I do not remember exact details about
> under_oom but I have a recollection that it wouldn't really work for
> cgroup v2 because the oom_control is not in place and so the state would
> be set for only very short time period.
>
> Again, what is a problem that you are trying to fix?

When there's no processes running in a memcg, for example if they are
killed by OOM killer, we can't reclaim the file page cache protected
by memory.min of this memcg. These file page caches are useless in
this case.
That's what I'm trying to fix.

Thanks
Yafang
Michal Hocko Nov. 25, 2019, 12:45 p.m. UTC | #7
On Mon 25-11-19 20:37:52, Yafang Shao wrote:
> On Mon, Nov 25, 2019 at 8:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 25-11-19 20:17:15, Yafang Shao wrote:
> > > On Mon, Nov 25, 2019 at 7:54 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > >
> > > > On Mon 25-11-19 19:37:59, Yafang Shao wrote:
> > > > > On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > > > >
> > > > > > On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > > > > > > We set memory.oom.group to make all processes in this memcg are killed by
> > > > > > > OOM killer to free more pages. In this case, it doesn't make sense to
> > > > > > > protect the pages with memroy.{min, low} again if they are set.
> > > > > >
> > > > > > I do not see why? What does group OOM killing has anything to do with
> > > > > > the reclaim protection? What is the actual problem you are trying to
> > > > > > solve?
> > > > > >
> > > > >
> > > > > The cgroup is treated as a indivisible  workload when cgroup.oom.group
> > > > > is set and OOM killer is trying to kill a prcess in this cgroup.
> > > >
> > > > Yes this is true.
> > > >
> > > > > We set cgroup.oom.group is to  guarantee the workload integrity, now
> > > > > that processes ara all killed, why keeps the page cache here?
> > > >
> > > > Because an administrator has configured the reclaim protection in a
> > > > certain way and hopefully had a good reason to do that. We are not going
> > > > to override that configure just because there is on OOM killer invoked
> > > > and killed tasks in that memcg. The workload might get restarted and it
> > > > would run under a different constrains all of the sudden which is not
> > > > expected.
> > > >
> > > > In short kernel should never silently change the configuration made by
> > > > an admistrator.
> > >
> > > Understood.
> > >
> > > So what about bellow changes ? We don't override the admin setting,
> > > but we reclaim the page caches from it if this memcg is oom killed.
> > > Something like,
> > >
> > > mem_cgroup_protected
> > > {
> > > ...
> > > +       if (!cgroup_is_populated(memcg->css.cgroup) &&
> > > mem_cgroup_under_oom_group_kill(memcg))
> > > +               return MEMCG_PROT_NONE;
> > > +
> > >         usage = page_counter_read(&memcg->memory);
> > >         if (!usage)
> > >                 return MEMCG_PROT_NONE;
> > > }
> >
> > I assume that mem_cgroup_under_oom_group_kill is essentially
> >         memcg->under_oom && memcg->oom_group
> > But that doesn't really help much because all the reclaim attempts have
> > been already attempted and failed. I do not remember exact details about
> > under_oom but I have a recollection that it wouldn't really work for
> > cgroup v2 because the oom_control is not in place and so the state would
> > be set for only very short time period.
> >
> > Again, what is a problem that you are trying to fix?
> 
> When there's no processes running in a memcg, for example if they are
> killed by OOM killer, we can't reclaim the file page cache protected
> by memory.min of this memcg. These file page caches are useless in
> this case.
> That's what I'm trying to fix.

Could you be more specific please? I would assume that the group oom
configured memcg would either restart its workload when killed (that is
why you want to kill the whole workload to restart it cleanly in many
case) or simply tear down the memcg altogether.

In other words why do you care about the oom killer case so much? It is
not different that handling a lingering memcg with the workload already
finished. You simply have no way to know whether the reclaim protection
is still required. Admin is supposed to either offline the memcg that is
no longer used or drop the reclaim protection once it is not needed
because that has some visible consequences on the overall system
operation.
Yafang Shao Nov. 25, 2019, 2:11 p.m. UTC | #8
On Mon, Nov 25, 2019 at 8:45 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 25-11-19 20:37:52, Yafang Shao wrote:
> > On Mon, Nov 25, 2019 at 8:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Mon 25-11-19 20:17:15, Yafang Shao wrote:
> > > > On Mon, Nov 25, 2019 at 7:54 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > > >
> > > > > On Mon 25-11-19 19:37:59, Yafang Shao wrote:
> > > > > > On Mon, Nov 25, 2019 at 7:08 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > > > > >
> > > > > > > On Mon 25-11-19 05:14:53, Yafang Shao wrote:
> > > > > > > > We set memory.oom.group to make all processes in this memcg are killed by
> > > > > > > > OOM killer to free more pages. In this case, it doesn't make sense to
> > > > > > > > protect the pages with memroy.{min, low} again if they are set.
> > > > > > >
> > > > > > > I do not see why? What does group OOM killing has anything to do with
> > > > > > > the reclaim protection? What is the actual problem you are trying to
> > > > > > > solve?
> > > > > > >
> > > > > >
> > > > > > The cgroup is treated as a indivisible  workload when cgroup.oom.group
> > > > > > is set and OOM killer is trying to kill a prcess in this cgroup.
> > > > >
> > > > > Yes this is true.
> > > > >
> > > > > > We set cgroup.oom.group is to  guarantee the workload integrity, now
> > > > > > that processes ara all killed, why keeps the page cache here?
> > > > >
> > > > > Because an administrator has configured the reclaim protection in a
> > > > > certain way and hopefully had a good reason to do that. We are not going
> > > > > to override that configure just because there is on OOM killer invoked
> > > > > and killed tasks in that memcg. The workload might get restarted and it
> > > > > would run under a different constrains all of the sudden which is not
> > > > > expected.
> > > > >
> > > > > In short kernel should never silently change the configuration made by
> > > > > an admistrator.
> > > >
> > > > Understood.
> > > >
> > > > So what about bellow changes ? We don't override the admin setting,
> > > > but we reclaim the page caches from it if this memcg is oom killed.
> > > > Something like,
> > > >
> > > > mem_cgroup_protected
> > > > {
> > > > ...
> > > > +       if (!cgroup_is_populated(memcg->css.cgroup) &&
> > > > mem_cgroup_under_oom_group_kill(memcg))
> > > > +               return MEMCG_PROT_NONE;
> > > > +
> > > >         usage = page_counter_read(&memcg->memory);
> > > >         if (!usage)
> > > >                 return MEMCG_PROT_NONE;
> > > > }
> > >
> > > I assume that mem_cgroup_under_oom_group_kill is essentially
> > >         memcg->under_oom && memcg->oom_group
> > > But that doesn't really help much because all the reclaim attempts have
> > > been already attempted and failed. I do not remember exact details about
> > > under_oom but I have a recollection that it wouldn't really work for
> > > cgroup v2 because the oom_control is not in place and so the state would
> > > be set for only very short time period.
> > >
> > > Again, what is a problem that you are trying to fix?
> >
> > When there's no processes running in a memcg, for example if they are
> > killed by OOM killer, we can't reclaim the file page cache protected
> > by memory.min of this memcg. These file page caches are useless in
> > this case.
> > That's what I'm trying to fix.
>
> Could you be more specific please? I would assume that the group oom
> configured memcg would either restart its workload when killed (that is
> why you want to kill the whole workload to restart it cleanly in many
> case) or simply tear down the memcg altogether.
>

Yes, we always restart it automatically if these processes exit
(no matter whether because of OOM or some other reason).
It is safe to do that if OOM happens, because OOM is always caused by
leaked anon pages, and the restart can free these anon pages.
But there may be some cases in which we can't successfully restart it, and
if that happens the protected pages will never be reclaimed until
the admin resets the protection or takes this memcg offline.
When there are no processes, we don't need to protect the pages. You
can consider it as 'fault tolerance'.


> In other words why do you care about the oom killer case so much? It is
> not different that handling a lingering memcg with the workload already
> finished. You simply have no way to know whether the reclaim protection
> is still required. Admin is supposed to either offline the memcg that is
> no longer used or drop the reclaim protection once it is not needed
> because that has some visible consequences on the overall system
> operation.

Actually what I'm concerned about is the case where there's no process
running but memory protection continues protecting the file pages.
OOM is just one such case.

Thanks
Yafang
Michal Hocko Nov. 25, 2019, 2:21 p.m. UTC | #9
On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> On Mon, Nov 25, 2019 at 8:45 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Mon 25-11-19 20:37:52, Yafang Shao wrote:
> > > On Mon, Nov 25, 2019 at 8:31 PM Michal Hocko <mhocko@kernel.org> wrote:
[...]
> > > > Again, what is a problem that you are trying to fix?
> > >
> > > When there's no processes running in a memcg, for example if they are
> > > killed by OOM killer, we can't reclaim the file page cache protected
> > > by memory.min of this memcg. These file page caches are useless in
> > > this case.
> > > That's what I'm trying to fix.
> >
> > Could you be more specific please? I would assume that the group oom
> > configured memcg would either restart its workload when killed (that is
> > why you want to kill the whole workload to restart it cleanly in many
> > case) or simply tear down the memcg altogether.
> >
> 
> Yes, we always restart it automatically if these processes are exit
> (no matter because of OOM or some other reason).
> It is safe to do that if OOM happens, because OOM is always because of
> anon pages leaked and the restart can free these anon pages.

No this is an incorrect assumption. The OOM might happen for many
different reasons.

> But there may be some cases that we can't success to restart it, while
> if that happens the protected pages will be never be reclaimed until
> the admin reset it or make this memcg offline.

If the workload cannot be restarted for whatever reason then you need an
admin intervention and a proper cleanup. That would include resetting
reclaim protection when in use.

> When there're no processes, we don't need to protect the pages. You
> can consider it as 'fault tolerance' .

I have already tried to explain why this is a bold statement that
doesn't really hold universally and that the kernel doesn't really have
enough information to make an educated guess.

> > In other words why do you care about the oom killer case so much? It is
> > not different that handling a lingering memcg with the workload already
> > finished. You simply have no way to know whether the reclaim protection
> > is still required. Admin is supposed to either offline the memcg that is
> > no longer used or drop the reclaim protection once it is not needed
> > because that has some visible consequences on the overall system
> > operation.
> 
> Actually what I concern is the  case that there's no process running
> but memory protection coninues protecting the file pages.
> OOM is just one case of them.

This sounds like a misconfiguration which should be handled by an admin.
Johannes Weiner Nov. 25, 2019, 2:42 p.m. UTC | #10
On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > When there're no processes, we don't need to protect the pages. You
> > can consider it as 'fault tolerance' .
> 
> I have already tried to explain why this is a bold statement that
> doesn't really hold universally and that the kernel doesn't really have
> enough information to make an educated guess.

I agree, this is not obviously true. And the kernel shouldn't try to
guess whether the explicit userspace configuration is still desirable
to userspace or not. Should we also delete the cgroup when it becomes
empty for example?

It's better to implement these kinds of policy decisions from
userspace.

There is a cgroup.events file that can be polled, and its "populated"
field shows conveniently whether there are tasks in a subtree or
not. You can use that to clear protection settings.
Yafang Shao Nov. 25, 2019, 2:44 p.m. UTC | #11
On Mon, Nov 25, 2019 at 10:21 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > On Mon, Nov 25, 2019 at 8:45 PM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Mon 25-11-19 20:37:52, Yafang Shao wrote:
> > > > On Mon, Nov 25, 2019 at 8:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> [...]
> > > > > Again, what is a problem that you are trying to fix?
> > > >
> > > > When there's no processes running in a memcg, for example if they are
> > > > killed by OOM killer, we can't reclaim the file page cache protected
> > > > by memory.min of this memcg. These file page caches are useless in
> > > > this case.
> > > > That's what I'm trying to fix.
> > >
> > > Could you be more specific please? I would assume that the group oom
> > > configured memcg would either restart its workload when killed (that is
> > > why you want to kill the whole workload to restart it cleanly in many
> > > case) or simply tear down the memcg altogether.
> > >
> >
> > Yes, we always restart it automatically if these processes are exit
> > (no matter because of OOM or some other reason).
> > It is safe to do that if OOM happens, because OOM is always because of
> > anon pages leaked and the restart can free these anon pages.
>
> No this is an incorrect assumption. The OOM might happen for many
> different reasons.
>
> > But there may be some cases that we can't success to restart it, while
> > if that happens the protected pages will be never be reclaimed until
> > the admin reset it or make this memcg offline.
>
> If the workload cannot be restarted for whatever reason then you need an
> admin intervention and a proper cleanup. That would include resetting
> reclaim protection when in use.
>
> > When there're no processes, we don't need to protect the pages. You
> > can consider it as 'fault tolerance' .
>
> I have already tried to explain why this is a bold statement that
> doesn't really hold universally and that the kernel doesn't really have
> enough information to make an educated guess.
>

I didn't mean that we must reclaim the protected pages in all cases;
I meant that sometimes we should reclaim the protected pages.
If the kernel can't make an educated guess, we can tell the kernel to
do it, for example, by introducing a new controller file that tells the
kernel whether or not to reclaim the protected pages if there are no
processes running.

> > > In other words why do you care about the oom killer case so much? It is
> > > not different that handling a lingering memcg with the workload already
> > > finished. You simply have no way to know whether the reclaim protection
> > > is still required. Admin is supposed to either offline the memcg that is
> > > no longer used or drop the reclaim protection once it is not needed
> > > because that has some visible consequences on the overall system
> > > operation.
> >
> > Actually what I concern is the  case that there's no process running
> > but memory protection coninues protecting the file pages.
> > OOM is just one case of them.
>
> This sounds like a misconfiguration which should be handled by an admin.

That may be a misconfiguration, but the kernel can do something
before the admin notices it.

Thanks

Yafang
Yafang Shao Nov. 25, 2019, 2:45 p.m. UTC | #12
On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > When there're no processes, we don't need to protect the pages. You
> > > can consider it as 'fault tolerance' .
> >
> > I have already tried to explain why this is a bold statement that
> > doesn't really hold universally and that the kernel doesn't really have
> > enough information to make an educated guess.
>
> I agree, this is not obviously true. And the kernel shouldn't try to
> guess whether the explicit userspace configuration is still desirable
> to userspace or not. Should we also delete the cgroup when it becomes
> empty for example?
>
> It's better to implement these kinds of policy decisions from
> userspace.
>
> There is a cgroup.events file that can be polled, and its "populated"
> field shows conveniently whether there are tasks in a subtree or
> not. You can use that to clear protection settings.

Thanks for you information. I will take a look at it.

Thanks
Yafang
Yafang Shao Nov. 26, 2019, 3:52 a.m. UTC | #13
On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > When there're no processes, we don't need to protect the pages. You
> > > can consider it as 'fault tolerance' .
> >
> > I have already tried to explain why this is a bold statement that
> > doesn't really hold universally and that the kernel doesn't really have
> > enough information to make an educated guess.
>
> I agree, this is not obviously true. And the kernel shouldn't try to
> guess whether the explicit userspace configuration is still desirable
> to userspace or not. Should we also delete the cgroup when it becomes
> empty for example?
>
> It's better to implement these kinds of policy decisions from
> userspace.
>
> There is a cgroup.events file that can be polled, and its "populated"
> field shows conveniently whether there are tasks in a subtree or
> not. You can use that to clear protection settings.

Why isn't force_empty supported in cgroup2?
In this case we could free the protected file pages immediately with force_empty.
The advantage of it is to avoid scanning all other memcgs in the
kswapd/direct reclaim paths, because currently the reclaimer will
fairly scan all memcgs and reclaim pages from them.
What's the problem with force_empty?

Thanks
Yafang
Michal Hocko Nov. 26, 2019, 7:31 a.m. UTC | #14
On Tue 26-11-19 11:52:19, Yafang Shao wrote:
> On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > > When there're no processes, we don't need to protect the pages. You
> > > > can consider it as 'fault tolerance' .
> > >
> > > I have already tried to explain why this is a bold statement that
> > > doesn't really hold universally and that the kernel doesn't really have
> > > enough information to make an educated guess.
> >
> > I agree, this is not obviously true. And the kernel shouldn't try to
> > guess whether the explicit userspace configuration is still desirable
> > to userspace or not. Should we also delete the cgroup when it becomes
> > empty for example?
> >
> > It's better to implement these kinds of policy decisions from
> > userspace.
> >
> > There is a cgroup.events file that can be polled, and its "populated"
> > field shows conveniently whether there are tasks in a subtree or
> > not. You can use that to clear protection settings.
> 
> Why isn't force_empty supported in cgroup2 ?

There wasn't any sound usecase AFAIR.

> In this case we can free the protected file pages immdiately with force_empty.

You can do the same thing by setting the hard limit to 0.
Yafang Shao Nov. 26, 2019, 9:35 a.m. UTC | #15
On Tue, Nov 26, 2019 at 3:31 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Tue 26-11-19 11:52:19, Yafang Shao wrote:
> > On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > > > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > > > When there're no processes, we don't need to protect the pages. You
> > > > > can consider it as 'fault tolerance' .
> > > >
> > > > I have already tried to explain why this is a bold statement that
> > > > doesn't really hold universally and that the kernel doesn't really have
> > > > enough information to make an educated guess.
> > >
> > > I agree, this is not obviously true. And the kernel shouldn't try to
> > > guess whether the explicit userspace configuration is still desirable
> > > to userspace or not. Should we also delete the cgroup when it becomes
> > > empty for example?
> > >
> > > It's better to implement these kinds of policy decisions from
> > > userspace.
> > >
> > > There is a cgroup.events file that can be polled, and its "populated"
> > > field shows conveniently whether there are tasks in a subtree or
> > > not. You can use that to clear protection settings.
> >
> > Why isn't force_empty supported in cgroup2 ?
>
> There wasn't any sound usecase AFAIR.
>
> > In this case we can free the protected file pages immdiately with force_empty.
>
> You can do the same thing by setting the hard limit to 0.

I looked through the code, and the difference between setting the hard
limit to 0 and force_empty is that setting the hard limit to 0 will
generate some OOM reports, which should not happen in this case.
I think we should make a small improvement as below,

@@ -6137,9 +6137,11 @@ static ssize_t memory_max_write(struct
kernfs_open_file *of,
                        continue;
                }

-               memcg_memory_event(memcg, MEMCG_OOM);
-               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
-                       break;
+               if (cgroup_is_populated(memcg->css.cgroup)) {
+                       memcg_memory_event(memcg, MEMCG_OOM);
+                       if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+                               break;
+               }
        }

Well, if someone doesn't want to kill processes but only wants to drop
page caches, setting the hard limit to 0 won't work.

Thanks
Yafang


Thanks
Yafang
Michal Hocko Nov. 26, 2019, 9:50 a.m. UTC | #16
On Tue 26-11-19 17:35:59, Yafang Shao wrote:
> On Tue, Nov 26, 2019 at 3:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Tue 26-11-19 11:52:19, Yafang Shao wrote:
> > > On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > > > > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > > > > When there're no processes, we don't need to protect the pages. You
> > > > > > can consider it as 'fault tolerance' .
> > > > >
> > > > > I have already tried to explain why this is a bold statement that
> > > > > doesn't really hold universally and that the kernel doesn't really have
> > > > > enough information to make an educated guess.
> > > >
> > > > I agree, this is not obviously true. And the kernel shouldn't try to
> > > > guess whether the explicit userspace configuration is still desirable
> > > > to userspace or not. Should we also delete the cgroup when it becomes
> > > > empty for example?
> > > >
> > > > It's better to implement these kinds of policy decisions from
> > > > userspace.
> > > >
> > > > There is a cgroup.events file that can be polled, and its "populated"
> > > > field shows conveniently whether there are tasks in a subtree or
> > > > not. You can use that to clear protection settings.
> > >
> > > Why isn't force_empty supported in cgroup2 ?
> >
> > There wasn't any sound usecase AFAIR.
> >
> > > In this case we can free the protected file pages immdiately with force_empty.
> >
> > You can do the same thing by setting the hard limit to 0.
> 
> I look though the code, and the difference between setting the hard
> limit to 0 and force empty is that setting the hard limit to 0 will
> generate some OOM reports, that should not happen in this case.
> I think we should make little improvement as bellow,

Yes, if you are not able to reclaim all of the memory then the OOM
killer is triggered. And that was not the case with force_empty. I
didn't mean that the two are equivalent, sorry if I misled you.
I merely wanted to point out that you have means to clean up the memcg
with the existing API.
 
> @@ -6137,9 +6137,11 @@ static ssize_t memory_max_write(struct
> kernfs_open_file *of,
>                         continue;
>                 }
> 
> -               memcg_memory_event(memcg, MEMCG_OOM);
> -               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> -                       break;
> +               if (cgroup_is_populated(memcg->css.cgroup)) {
> +                       memcg_memory_event(memcg, MEMCG_OOM);
> +                       if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> +                               break;
> +               }
>         }

If there are no killable tasks then 
	"Out of memory and no killable processes..."
is printed and that really reflects the situation and is the right thing
to do. Your above patch would suppress that information which might be
important.
 
> Well,  if someone don't want to kill proesses but only want ot drop
> page caches, setting the hard limit to 0 won't work.

Could you be more specific about a real world example when somebody
wants to drop per-memcg pagecache?
Yafang Shao Nov. 26, 2019, 10:02 a.m. UTC | #17
On Tue, Nov 26, 2019 at 5:50 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Tue 26-11-19 17:35:59, Yafang Shao wrote:
> > On Tue, Nov 26, 2019 at 3:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Tue 26-11-19 11:52:19, Yafang Shao wrote:
> > > > On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > >
> > > > > On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > > > > > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > > > > > When there're no processes, we don't need to protect the pages. You
> > > > > > > can consider it as 'fault tolerance' .
> > > > > >
> > > > > > I have already tried to explain why this is a bold statement that
> > > > > > doesn't really hold universally and that the kernel doesn't really have
> > > > > > enough information to make an educated guess.
> > > > >
> > > > > I agree, this is not obviously true. And the kernel shouldn't try to
> > > > > guess whether the explicit userspace configuration is still desirable
> > > > > to userspace or not. Should we also delete the cgroup when it becomes
> > > > > empty for example?
> > > > >
> > > > > It's better to implement these kinds of policy decisions from
> > > > > userspace.
> > > > >
> > > > > There is a cgroup.events file that can be polled, and its "populated"
> > > > > field shows conveniently whether there are tasks in a subtree or
> > > > > not. You can use that to clear protection settings.
> > > >
> > > > Why isn't force_empty supported in cgroup2 ?
> > >
> > > There wasn't any sound usecase AFAIR.
> > >
> > > > In this case we can free the protected file pages immdiately with force_empty.
> > >
> > > You can do the same thing by setting the hard limit to 0.
> >
> > I look though the code, and the difference between setting the hard
> > limit to 0 and force empty is that setting the hard limit to 0 will
> > generate some OOM reports, that should not happen in this case.
> > I think we should make little improvement as bellow,
>
> Yes, if you are not able to reclaim all of the memory then the OOM
> killer is triggered. And that was not the case with force_empty. I
> didn't mean that the two are equivalent, sorry if I misled you.
> I merely wanted to point out that you have means to cleanup the memcg
> with the existing API.
>
> > @@ -6137,9 +6137,11 @@ static ssize_t memory_max_write(struct
> > kernfs_open_file *of,
> >                         continue;
> >                 }
> >
> > -               memcg_memory_event(memcg, MEMCG_OOM);
> > -               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> > -                       break;
> > +               if (cgroup_is_populated(memcg->css.cgroup)) {
> > +                       memcg_memory_event(memcg, MEMCG_OOM);
> > +                       if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> > +                               break;
> > +               }
> >         }
>
> If there are no killable tasks then
>         "Out of memory and no killable processes..."
> is printed and that really reflects the situation and is the right thing
> to do. Your above patch would suppress that information which might be
> important.
>

Not only this output.
Please see dump_header(): there are many more outputs, and even worse,
dump_stack() is also executed.


> > Well,  if someone don't want to kill proesses but only want ot drop
> > page caches, setting the hard limit to 0 won't work.
>
> Could you be more specific about a real world example when somebody
> wants to drop per-memcg pagecache?

For example, if one memcg has lots of negative dentries, that causes
the file page cache to be continuously reclaimed, so we want to drop all
these negative dentries. force_empty is a better workaround so far, and
that can give us more of a chance to analyze why negative dentries are
generated.

Thanks
Yafang
Michal Hocko Nov. 26, 2019, 10:22 a.m. UTC | #18
On Tue 26-11-19 18:02:27, Yafang Shao wrote:
> On Tue, Nov 26, 2019 at 5:50 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Tue 26-11-19 17:35:59, Yafang Shao wrote:
> > > On Tue, Nov 26, 2019 at 3:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > >
> > > > On Tue 26-11-19 11:52:19, Yafang Shao wrote:
> > > > > On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > >
> > > > > > On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > > > > > > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > > > > > > When there're no processes, we don't need to protect the pages. You
> > > > > > > > can consider it as 'fault tolerance' .
> > > > > > >
> > > > > > > I have already tried to explain why this is a bold statement that
> > > > > > > doesn't really hold universally and that the kernel doesn't really have
> > > > > > > enough information to make an educated guess.
> > > > > >
> > > > > > I agree, this is not obviously true. And the kernel shouldn't try to
> > > > > > guess whether the explicit userspace configuration is still desirable
> > > > > > to userspace or not. Should we also delete the cgroup when it becomes
> > > > > > empty for example?
> > > > > >
> > > > > > It's better to implement these kinds of policy decisions from
> > > > > > userspace.
> > > > > >
> > > > > > There is a cgroup.events file that can be polled, and its "populated"
> > > > > > field shows conveniently whether there are tasks in a subtree or
> > > > > > not. You can use that to clear protection settings.
> > > > >
> > > > > Why isn't force_empty supported in cgroup2 ?
> > > >
> > > > There wasn't any sound usecase AFAIR.
> > > >
> > > > > In this case we can free the protected file pages immdiately with force_empty.
> > > >
> > > > You can do the same thing by setting the hard limit to 0.
> > >
> > > I look though the code, and the difference between setting the hard
> > > limit to 0 and force empty is that setting the hard limit to 0 will
> > > generate some OOM reports, that should not happen in this case.
> > > I think we should make little improvement as bellow,
> >
> > Yes, if you are not able to reclaim all of the memory then the OOM
> > killer is triggered. And that was not the case with force_empty. I
> > didn't mean that the two are equivalent, sorry if I misled you.
> > I merely wanted to point out that you have means to cleanup the memcg
> > with the existing API.
> >
> > > @@ -6137,9 +6137,11 @@ static ssize_t memory_max_write(struct
> > > kernfs_open_file *of,
> > >                         continue;
> > >                 }
> > >
> > > -               memcg_memory_event(memcg, MEMCG_OOM);
> > > -               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> > > -                       break;
> > > +               if (cgroup_is_populated(memcg->css.cgroup)) {
> > > +                       memcg_memory_event(memcg, MEMCG_OOM);
> > > +                       if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> > > +                               break;
> > > +               }
> > >         }
> >
> > If there are no killable tasks then
> >         "Out of memory and no killable processes..."
> > is printed and that really reflects the situation and is the right thing
> > to do. Your above patch would suppress that information which might be
> > important.
> >
> 
> Not only this output.
> Pls. see dump_header(), many outputs and even worse is that the
> dump_stack() is also executed.

Yes, there will be the full oom report. I have outlined the "no
killable" part because this is the main distinguisher for the "no tasks"
case.

> > > Well,  if someone don't want to kill proesses but only want ot drop
> > > page caches, setting the hard limit to 0 won't work.
> >
> > Could you be more specific about a real world example when somebody
> > wants to drop per-memcg pagecache?
> 
> For example, if one memcg  has lots of negtive denties,  that causes
> the file page cache continuesly been reclaimed, so we want to drop all
> these negtive dentries. force_empty is a better workaround so far, and
> that can give us more chance to analyze why negtive dentries are
> generated.

force_empty sounds like a brute-force way to clean negative dentries, TBH.
And it is not really all that different from shrinking the hard
limit.

Why doesn't a normal reclaim work for those situation? Anyway, this is
getting really tangent to the original topic so I would suggest to start
a new email thread with a clear description of a problem you are facing
and we can go from there.
Yafang Shao Nov. 26, 2019, 10:56 a.m. UTC | #19
On Tue, Nov 26, 2019 at 6:22 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Tue 26-11-19 18:02:27, Yafang Shao wrote:
> > On Tue, Nov 26, 2019 at 5:50 PM Michal Hocko <mhocko@kernel.org> wrote:
> > >
> > > On Tue 26-11-19 17:35:59, Yafang Shao wrote:
> > > > On Tue, Nov 26, 2019 at 3:31 PM Michal Hocko <mhocko@kernel.org> wrote:
> > > > >
> > > > > On Tue 26-11-19 11:52:19, Yafang Shao wrote:
> > > > > > On Mon, Nov 25, 2019 at 10:42 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > > > >
> > > > > > > On Mon, Nov 25, 2019 at 03:21:50PM +0100, Michal Hocko wrote:
> > > > > > > > On Mon 25-11-19 22:11:15, Yafang Shao wrote:
> > > > > > > > > When there're no processes, we don't need to protect the pages. You
> > > > > > > > > can consider it as 'fault tolerance' .
> > > > > > > >
> > > > > > > > I have already tried to explain why this is a bold statement that
> > > > > > > > doesn't really hold universally and that the kernel doesn't really have
> > > > > > > > enough information to make an educated guess.
> > > > > > >
> > > > > > > I agree, this is not obviously true. And the kernel shouldn't try to
> > > > > > > guess whether the explicit userspace configuration is still desirable
> > > > > > > to userspace or not. Should we also delete the cgroup when it becomes
> > > > > > > empty for example?
> > > > > > >
> > > > > > > It's better to implement these kinds of policy decisions from
> > > > > > > userspace.
> > > > > > >
> > > > > > > There is a cgroup.events file that can be polled, and its "populated"
> > > > > > > field shows conveniently whether there are tasks in a subtree or
> > > > > > > not. You can use that to clear protection settings.
> > > > > >
> > > > > > Why isn't force_empty supported in cgroup2 ?
> > > > >
> > > > > There wasn't any sound usecase AFAIR.
> > > > >
> > > > > > In this case we can free the protected file pages immdiately with force_empty.
> > > > >
> > > > > You can do the same thing by setting the hard limit to 0.
> > > >
> > > > I look though the code, and the difference between setting the hard
> > > > limit to 0 and force empty is that setting the hard limit to 0 will
> > > > generate some OOM reports, that should not happen in this case.
> > > > I think we should make little improvement as bellow,
> > >
> > > Yes, if you are not able to reclaim all of the memory then the OOM
> > > killer is triggered. And that was not the case with force_empty. I
> > > didn't mean that the two are equivalent, sorry if I misled you.
> > > I merely wanted to point out that you have means to cleanup the memcg
> > > with the existing API.
> > >
> > > > @@ -6137,9 +6137,11 @@ static ssize_t memory_max_write(struct
> > > > kernfs_open_file *of,
> > > >                         continue;
> > > >                 }
> > > >
> > > > -               memcg_memory_event(memcg, MEMCG_OOM);
> > > > -               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> > > > -                       break;
> > > > +               if (cgroup_is_populated(memcg->css.cgroup)) {
> > > > +                       memcg_memory_event(memcg, MEMCG_OOM);
> > > > +                       if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> > > > +                               break;
> > > > +               }
> > > >         }
> > >
> > > If there are no killable tasks then
> > >         "Out of memory and no killable processes..."
> > > is printed and that really reflects the situation and is the right thing
> > > to do. Your above patch would suppress that information which might be
> > > important.
> > >
> >
> > Not only this output.
> > Pls. see dump_header(), many outputs and even worse is that the
> > dump_stack() is also executed.
>
> Yes, there will be the full oom report. I have outlined the "no
> killable" part because this is the main distinguisher for the "no tasks"
> case.
>

But the case here is that there are "no tasks"
(!cgroup_is_populated(memcg->css.cgroup)), rather than "no killable".
This output is really misleading.

> > > > Well,  if someone don't want to kill proesses but only want ot drop
> > > > page caches, setting the hard limit to 0 won't work.
> > >
> > > Could you be more specific about a real world example when somebody
> > > wants to drop per-memcg pagecache?
> >
> > For example, if one memcg  has lots of negtive denties,  that causes
> > the file page cache continuesly been reclaimed, so we want to drop all
> > these negtive dentries. force_empty is a better workaround so far, and
> > that can give us more chance to analyze why negtive dentries are
> > generated.
>
> force_empty sounds like a brute force to clean negative dentries TBH.
> And it is not really way too much different from shrinking the hard
> limit.
>
> Why doesn't a normal reclaim work for those situation? Anyway, this is
> getting really tangent to the original topic so I would suggest to start
> a new email thread with a clear description of a problem you are facing
> and we can go from there.

Sure.
I will start a new thread.
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c762e8..f68a1a5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -369,6 +369,13 @@  static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
 		   READ_ONCE(memcg->memory.elow));
 }
 
+static inline void mem_cgroup_clear_protection(struct mem_cgroup *memcg)
+{
+
+	page_counter_set_min(&memcg->memory, 0);
+	page_counter_set_low(&memcg->memory, 0);
+}
+
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 						struct mem_cgroup *memcg);
 
@@ -850,6 +857,10 @@  static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
 	return 0;
 }
 
+static inline void mem_cgroup_clear_protection(struct mem_cgroup *memcg)
+{
+}
+
 static inline enum mem_cgroup_protection mem_cgroup_protected(
 	struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c4c08b..e5ab119 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5190,9 +5190,7 @@  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	page_counter_set_min(&memcg->memory, 0);
-	page_counter_set_low(&memcg->memory, 0);
-
+	mem_cgroup_clear_protection(memcg);
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 93eae76..550f830 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -988,6 +988,7 @@  static void oom_kill_process(struct oom_control *oc, const char *message)
 		mem_cgroup_print_oom_group(oom_group);
 		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
 				      (void*)message);
+		mem_cgroup_clear_protection(oom_group);
 		mem_cgroup_put(oom_group);
 	}
 }