diff mbox series

mm, memcg: skip killing processes under memcg protection at first scan

Message ID 1566102294-14803-1-git-send-email-laoar.shao@gmail.com (mailing list archive)
State New, archived
Headers show
Series mm, memcg: skip killing processes under memcg protection at first scan | expand

Commit Message

Yafang Shao Aug. 18, 2019, 4:24 a.m. UTC
In the current memory.min design, the system is going to do OOM instead
of reclaiming the reclaimable pages protected by memory.min if the
system lacks free memory. Under this condition, the OOM killer may
kill the processes in the memcg protected by memory.min.
This behavior is very surprising.
In order to make it more reasonable, I make some changes in the OOM
killer. In this patch, the OOM killer will do two-round scan. It will
skip the processes under memcg protection at the first scan, and if it
can't kill any processes it will rescan all the processes.

Regarding the overhead this change may take, I don't think it will be a
problem because this only happens when the system is under memory
pressure and the OOM killer can't find any proper victims that are not
under memcg protection.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/memcontrol.h |  6 ++++++
 mm/memcontrol.c            | 16 ++++++++++++++++
 mm/oom_kill.c              | 23 +++++++++++++++++++++--
 3 files changed, 43 insertions(+), 2 deletions(-)

Comments

Souptick Joarder Aug. 18, 2019, 5:11 p.m. UTC | #1
On Sun, Aug 18, 2019 at 9:55 AM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> In the current memory.min design, the system is going to do OOM instead
> of reclaiming the reclaimable pages protected by memory.min if the
> system is lack of free memory. While under this condition, the OOM
> killer may kill the processes in the memcg protected by memory.min.
> This behavior is very weird.
> In order to make it more reasonable, I make some changes in the OOM
> killer. In this patch, the OOM killer will do two-round scan. It will
> skip the processes under memcg protection at the first scan, and if it
> can't kill any processes it will rescan all the processes.
>
> Regarding the overhead this change may takes, I don't think it will be a
> problem because this only happens under system  memory pressure and
> the OOM killer can't find any proper victims which are not under memcg
> protection.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Cc: Roman Gushchin <guro@fb.com>
> Cc: Randy Dunlap <rdunlap@infradead.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  include/linux/memcontrol.h |  6 ++++++
>  mm/memcontrol.c            | 16 ++++++++++++++++
>  mm/oom_kill.c              | 23 +++++++++++++++++++++--
>  3 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 44c4146..58bd86b 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -337,6 +337,7 @@ static inline bool mem_cgroup_disabled(void)
>
>  enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
>                                                 struct mem_cgroup *memcg);
> +int task_under_memcg_protection(struct task_struct *p);
>
>  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>                           gfp_t gfp_mask, struct mem_cgroup **memcgp,
> @@ -813,6 +814,11 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
>         return MEMCG_PROT_NONE;
>  }
>
> +int task_under_memcg_protection(struct task_struct *p)
> +{
> +       return 0;
> +}
> +
>  static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>                                         gfp_t gfp_mask,
>                                         struct mem_cgroup **memcgp,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index cdbb7a8..c4d8e53 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6030,6 +6030,22 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
>                 return MEMCG_PROT_NONE;
>  }
>
> +int task_under_memcg_protection(struct task_struct *p)
> +{
> +       struct mem_cgroup *memcg;
> +       int protected;
> +
> +       rcu_read_lock();
> +       memcg = mem_cgroup_from_task(p);
> +       if (memcg != root_mem_cgroup && memcg->memory.min)
> +               protected = 1;
> +       else
> +               protected = 0;
> +       rcu_read_unlock();
> +
> +       return protected;

I think returning a bool type would be more appropriate.

> +}
> +
>  /**
>   * mem_cgroup_try_charge - try charging a page
>   * @page: page to charge
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index eda2e2a..259dd2c 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -368,11 +368,30 @@ static void select_bad_process(struct oom_control *oc)
>                 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
>         else {
>                 struct task_struct *p;
> +               int memcg_check = 0;
> +               int memcg_skip = 0;
> +               int selected = 0;
>
>                 rcu_read_lock();
> -               for_each_process(p)
> -                       if (oom_evaluate_task(p, oc))
> +retry:
> +               for_each_process(p) {
> +                       if (!memcg_check && task_under_memcg_protection(p)) {
> +                               memcg_skip = 1;
> +                               continue;
> +                       }
> +                       selected = oom_evaluate_task(p, oc);
> +                       if (selected)
>                                 break;
> +               }
> +
> +               if (!selected) {
> +                       if (memcg_skip) {
> +                               if (!oc->chosen || oc->chosen == (void *)-1UL) {
> +                                       memcg_check = 1;
> +                                       goto retry;
> +                               }
> +                       }
> +               }
>                 rcu_read_unlock();
>         }
>  }
> --
> 1.8.3.1
>
>
Yafang Shao Aug. 19, 2019, 1:03 a.m. UTC | #2
On Mon, Aug 19, 2019 at 1:11 AM Souptick Joarder <jrdr.linux@gmail.com> wrote:
>
> On Sun, Aug 18, 2019 at 9:55 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> >
> > In the current memory.min design, the system is going to do OOM instead
> > of reclaiming the reclaimable pages protected by memory.min if the
> > system is lack of free memory. While under this condition, the OOM
> > killer may kill the processes in the memcg protected by memory.min.
> > This behavior is very weird.
> > In order to make it more reasonable, I make some changes in the OOM
> > killer. In this patch, the OOM killer will do two-round scan. It will
> > skip the processes under memcg protection at the first scan, and if it
> > can't kill any processes it will rescan all the processes.
> >
> > Regarding the overhead this change may takes, I don't think it will be a
> > problem because this only happens under system  memory pressure and
> > the OOM killer can't find any proper victims which are not under memcg
> > protection.
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > Cc: Roman Gushchin <guro@fb.com>
> > Cc: Randy Dunlap <rdunlap@infradead.org>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> > Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> > ---
> >  include/linux/memcontrol.h |  6 ++++++
> >  mm/memcontrol.c            | 16 ++++++++++++++++
> >  mm/oom_kill.c              | 23 +++++++++++++++++++++--
> >  3 files changed, 43 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 44c4146..58bd86b 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -337,6 +337,7 @@ static inline bool mem_cgroup_disabled(void)
> >
> >  enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> >                                                 struct mem_cgroup *memcg);
> > +int task_under_memcg_protection(struct task_struct *p);
> >
> >  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> >                           gfp_t gfp_mask, struct mem_cgroup **memcgp,
> > @@ -813,6 +814,11 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
> >         return MEMCG_PROT_NONE;
> >  }
> >
> > +int task_under_memcg_protection(struct task_struct *p)
> > +{
> > +       return 0;
> > +}
> > +
> >  static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> >                                         gfp_t gfp_mask,
> >                                         struct mem_cgroup **memcgp,
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index cdbb7a8..c4d8e53 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -6030,6 +6030,22 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> >                 return MEMCG_PROT_NONE;
> >  }
> >
> > +int task_under_memcg_protection(struct task_struct *p)
> > +{
> > +       struct mem_cgroup *memcg;
> > +       int protected;
> > +
> > +       rcu_read_lock();
> > +       memcg = mem_cgroup_from_task(p);
> > +       if (memcg != root_mem_cgroup && memcg->memory.min)
> > +               protected = 1;
> > +       else
> > +               protected = 0;
> > +       rcu_read_unlock();
> > +
> > +       return protected;
>
> I think returning a bool type would be more appropriate.
>

Sure. Will change it.

> > +}
> > +
> >  /**
> >   * mem_cgroup_try_charge - try charging a page
> >   * @page: page to charge
> > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > index eda2e2a..259dd2c 100644
> > --- a/mm/oom_kill.c
> > +++ b/mm/oom_kill.c
> > @@ -368,11 +368,30 @@ static void select_bad_process(struct oom_control *oc)
> >                 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
> >         else {
> >                 struct task_struct *p;
> > +               int memcg_check = 0;
> > +               int memcg_skip = 0;
> > +               int selected = 0;
> >
> >                 rcu_read_lock();
> > -               for_each_process(p)
> > -                       if (oom_evaluate_task(p, oc))
> > +retry:
> > +               for_each_process(p) {
> > +                       if (!memcg_check && task_under_memcg_protection(p)) {
> > +                               memcg_skip = 1;
> > +                               continue;
> > +                       }
> > +                       selected = oom_evaluate_task(p, oc);
> > +                       if (selected)
> >                                 break;
> > +               }
> > +
> > +               if (!selected) {
> > +                       if (memcg_skip) {
> > +                               if (!oc->chosen || oc->chosen == (void *)-1UL) {
> > +                                       memcg_check = 1;
> > +                                       goto retry;
> > +                               }
> > +                       }
> > +               }
> >                 rcu_read_unlock();
> >         }
> >  }
> > --
> > 1.8.3.1
> >
> >
Michal Hocko Aug. 19, 2019, 7:31 a.m. UTC | #3
On Sun 18-08-19 00:24:54, Yafang Shao wrote:
> In the current memory.min design, the system is going to do OOM instead
> of reclaiming the reclaimable pages protected by memory.min if the
> system is lack of free memory. While under this condition, the OOM
> killer may kill the processes in the memcg protected by memory.min.

Could you be more specific about the configuration that leads to this
situation?
Yafang Shao Aug. 19, 2019, 8:15 a.m. UTC | #4
On Mon, Aug 19, 2019 at 3:31 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Sun 18-08-19 00:24:54, Yafang Shao wrote:
> > In the current memory.min design, the system is going to do OOM instead
> > of reclaiming the reclaimable pages protected by memory.min if the
> > system is lack of free memory. While under this condition, the OOM
> > killer may kill the processes in the memcg protected by memory.min.
>
> Could you be more specific about the configuration that leads to this
> situation?

When I did a memory pressure test to verify memory.min, I found this issue.
This issue can be reproduced as below,
    memcg setting,
        memory.max: 1G
        memory.min: 512M
        some processes are running in this memcg, with both several
hundred MB of file mappings and several hundred MB of anon mappings.
    system setting,
         swap: off.
         some memory pressure test are running on the system.

When the memory usage of this memcg is below the memory.min, the
global reclaimers stop reclaiming pages in this memcg, and when
there's no available memory, the OOM killer will be invoked.
Unfortunately the OOM killer can chose the process running in the
protected memcg.

In order to reproduce it easily, you can increase the memory.min and set
-1000 as the oom_score_adj of the processes outside of the protected
memcg.

Is this setting proper ?

Thanks
Yafang
Michal Hocko Aug. 20, 2019, 6:31 a.m. UTC | #5
[hmm the email got stuck on my send queue - sending again]

On Mon 19-08-19 16:15:08, Yafang Shao wrote:
> On Mon, Aug 19, 2019 at 3:31 PM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Sun 18-08-19 00:24:54, Yafang Shao wrote:
> > > In the current memory.min design, the system is going to do OOM instead
> > > of reclaiming the reclaimable pages protected by memory.min if the
> > > system is lack of free memory. While under this condition, the OOM
> > > killer may kill the processes in the memcg protected by memory.min.
> >
> > Could you be more specific about the configuration that leads to this
> > situation?
> 
> When I did memory pressure test to verify memory.min I found that issue.
> This issue can be produced as bellow,
>     memcg setting,
>         memory.max: 1G
>         memory.min: 512M
>         some processes are running is this memcg, with both serveral
> hundreds MB  file mapping and serveral hundreds MB anon mapping.
>     system setting,
>          swap: off.
>          some memory pressure test are running on the system.
> 
> When the memory usage of this memcg is bellow the memory.min, the
> global reclaimers stop reclaiming pages in this memcg, and when
> there's no available memory, the OOM killer will be invoked.
> Unfortunately the OOM killer can chose the process running in the
> protected memcg.

Well, the memcg protection was designed to prevent from regular
memory reclaim.  It was not aimed at acting as a group wide oom
protection. The global oom killer (but memcg as well) simply cares only
about oom_score_adj when selecting a victim.

Adding yet another oom protection is likely to complicate the oom
selection logic and make it more surprising. E.g. why should workload
fitting inside the min limit be so special? Do you have any real world
example?
 
> In order to produce it easy, you can incease the memroy.min and set
> -1000 to the oom_socre_adj of the processes outside of the protected
> memcg.

This sounds like a very dubious configuration to me. There is no other
option than chosing from the protected group.

> Is this setting proper ?
> 
> Thanks
> Yafang
Yafang Shao Aug. 20, 2019, 7:02 a.m. UTC | #6
On Tue, Aug 20, 2019 at 2:31 PM Michal Hocko <mhocko@suse.com> wrote:
>
> [hmm the email got stuck on my send queue - sending again]
>
> On Mon 19-08-19 16:15:08, Yafang Shao wrote:
> > On Mon, Aug 19, 2019 at 3:31 PM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Sun 18-08-19 00:24:54, Yafang Shao wrote:
> > > > In the current memory.min design, the system is going to do OOM instead
> > > > of reclaiming the reclaimable pages protected by memory.min if the
> > > > system is lack of free memory. While under this condition, the OOM
> > > > killer may kill the processes in the memcg protected by memory.min.
> > >
> > > Could you be more specific about the configuration that leads to this
> > > situation?
> >
> > When I did memory pressure test to verify memory.min I found that issue.
> > This issue can be produced as bellow,
> >     memcg setting,
> >         memory.max: 1G
> >         memory.min: 512M
> >         some processes are running is this memcg, with both serveral
> > hundreds MB  file mapping and serveral hundreds MB anon mapping.
> >     system setting,
> >          swap: off.
> >          some memory pressure test are running on the system.
> >
> > When the memory usage of this memcg is bellow the memory.min, the
> > global reclaimers stop reclaiming pages in this memcg, and when
> > there's no available memory, the OOM killer will be invoked.
> > Unfortunately the OOM killer can chose the process running in the
> > protected memcg.
>
> Well, the memcg protection was designed to prevent from regular
> memory reclaim.  It was not aimed at acting as a group wide oom
> protection. The global oom killer (but memcg as well) simply cares only
> about oom_score_adj when selecting a victim.
>

OOM is a kind of memory reclaim, isn't it ?

> Adding yet another oom protection is likely to complicate the oom
> selection logic and make it more surprising. E.g. why should workload
> fitting inside the min limit be so special? Do you have any real world
> example?
>

The problem here is that we want to use it in the real world, but the
issues we found prevent us from using it in the real world.


> > In order to produce it easy, you can incease the memroy.min and set
> > -1000 to the oom_socre_adj of the processes outside of the protected
> > memcg.
>
> This sounds like a very dubious configuration to me. There is no other
> option than chosing from the protected group.
>

This is only an easy example to produce it.

> > Is this setting proper ?
> >
> > Thanks
> > Yafang
>
> --
> Michal Hocko
> SUSE Labs
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44c4146..58bd86b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -337,6 +337,7 @@  static inline bool mem_cgroup_disabled(void)
 
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 						struct mem_cgroup *memcg);
+int task_under_memcg_protection(struct task_struct *p);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -813,6 +814,11 @@  static inline enum mem_cgroup_protection mem_cgroup_protected(
 	return MEMCG_PROT_NONE;
 }
 
+int task_under_memcg_protection(struct task_struct *p)
+{
+	return 0;
+}
+
 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 					gfp_t gfp_mask,
 					struct mem_cgroup **memcgp,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cdbb7a8..c4d8e53 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6030,6 +6030,22 @@  enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 		return MEMCG_PROT_NONE;
 }
 
+int task_under_memcg_protection(struct task_struct *p)
+{
+	struct mem_cgroup *memcg;
+	int protected;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(p);
+	if (memcg != root_mem_cgroup && memcg->memory.min)
+		protected = 1;
+	else
+		protected = 0;
+	rcu_read_unlock();
+
+	return protected;
+}
+
 /**
  * mem_cgroup_try_charge - try charging a page
  * @page: page to charge
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a..259dd2c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -368,11 +368,30 @@  static void select_bad_process(struct oom_control *oc)
 		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
 	else {
 		struct task_struct *p;
+		int memcg_check = 0;
+		int memcg_skip = 0;
+		int selected = 0;
 
 		rcu_read_lock();
-		for_each_process(p)
-			if (oom_evaluate_task(p, oc))
+retry:
+		for_each_process(p) {
+			if (!memcg_check && task_under_memcg_protection(p)) {
+				memcg_skip = 1;
+				continue;
+			}
+			selected = oom_evaluate_task(p, oc);
+			if (selected)
 				break;
+		}
+
+		if (!selected) {
+			if (memcg_skip) {
+				if (!oc->chosen || oc->chosen == (void *)-1UL) {
+					memcg_check = 1;
+					goto retry;
+				}
+			}
+		}
 		rcu_read_unlock();
 	}
 }