diff mbox series

[5/5,v1] mm, oom: enable per numa node oom for CONSTRAINT_MEMORY_POLICY

Message ID 20220512044634.63586-6-ligang.bdlg@bytedance.com (mailing list archive)
State New
Headers show
Series mm, oom: Introduce per numa node oom for CONSTRAINT_MEMORY_POLICY | expand

Commit Message

Gang Li May 12, 2022, 4:46 a.m. UTC
The page allocator will only allocate pages on the node indicated by
`nodemask`. But oom will still select the bad process by total rss usage,
which may reclaim nothing on the node indicated by `nodemask`.

This patch lets oom calculate rss only on the given node when
oc->constraint equals CONSTRAINT_MEMORY_POLICY.

If `nodemask` is assigned, the process with the highest memory
consumption on the specific node will be killed. The oom_kill dmesg
output will look like this:

```
[ 1471.436027] Tasks state (memory values in pages):
[ 1471.438518] [  pid  ]   uid  tgid total_vm      rss (01)nrss  pgtables_bytes swapents oom_score_adj name
[ 1471.554703] [   1011]     0  1011   220005     8589     1872   823296        0             0 node
[ 1471.707912] [  12399]     0 12399  1311306  1311056   262170 10534912        0             0 a.out
[ 1471.712429] [  13135]     0 13135   787018   674666   674300  5439488        0             0 a.out
[ 1471.721506] [  13295]     0 13295      597      188        0    24576        0             0 sh
[ 1471.734600] oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=1,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/user.slice/user-0.slice/session-3.scope,task=a.out,pid=13135,uid=0
[ 1471.742583] Out of memory: Killed process 13135 (a.out) total-vm:3148072kB, anon-rss:2697304kB, file-rss:1360kB, shmem-rss:0kB, UID:0 pgtables:5312kB oom_score_adj:0
[ 1471.849615] oom_reaper: reaped process 13135 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
```

Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
 fs/proc/base.c      |  6 +++++-
 include/linux/oom.h |  2 +-
 mm/oom_kill.c       | 45 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 43 insertions(+), 10 deletions(-)
diff mbox series

Patch

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c1031843cc6a..caf0f51284d0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -552,8 +552,12 @@  static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 	long badness;
+	struct oom_control oc = {
+		.totalpages =  totalpages,
+		.gfp_mask = 0,
+	};
 
-	badness = oom_badness(task, totalpages);
+	badness = oom_badness(task, &oc);
 	/*
 	 * Special case OOM_SCORE_ADJ_MIN for all others scale the
 	 * badness value into [0, 2000] range which we have been
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..0cb6a60be776 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -109,7 +109,7 @@  static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 long oom_badness(struct task_struct *p,
-		unsigned long totalpages);
+		struct oom_control *oc);
 
 extern bool out_of_memory(struct oom_control *oc);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 757f5665ae94..75a80b5a63bf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -198,7 +198,7 @@  static bool should_dump_unreclaim_slab(void)
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, struct oom_control *oc)
 {
 	long points;
 	long adj;
@@ -227,12 +227,22 @@  long oom_badness(struct task_struct *p, unsigned long totalpages)
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+		struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+								oc->nodemask);
+		int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+		points = get_mm_counter(p->mm, -1, nid_to_find_victim) +
+			get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+			mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	} else {
+		points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+			mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	}
 	task_unlock(p);
 
 	/* Normalize to oom_score_adj units */
-	adj *= totalpages / 1000;
+	adj *= oc->totalpages / 1000;
 	points += adj;
 
 	return points;
@@ -338,7 +348,7 @@  static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, oc->totalpages);
+	points = oom_badness(task, oc);
 	if (points == LONG_MIN || points < oc->chosen_points)
 		goto next;
 
@@ -382,6 +392,7 @@  static int dump_task(struct task_struct *p, void *arg)
 {
 	struct oom_control *oc = arg;
 	struct task_struct *task;
+	unsigned long node_mm_rss;
 
 	if (oom_unkillable_task(p))
 		return 0;
@@ -399,9 +410,18 @@  static int dump_task(struct task_struct *p, void *arg)
 		return 0;
 	}
 
-	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+	if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+		struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+								oc->nodemask);
+		int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+		node_mm_rss = get_mm_counter(p->mm, -1, nid_to_find_victim);
+	} else {
+		node_mm_rss = 0;
+	}
+	pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8ld %8lu         %5hd %s\n",
 		task->pid, from_kuid(&init_user_ns, task_uid(task)),
-		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+		task->tgid, task->mm->total_vm, get_mm_rss(task->mm), node_mm_rss,
 		mm_pgtables_bytes(task->mm),
 		get_mm_counter(task->mm, MM_SWAPENTS, NUMA_NO_NODE),
 		task->signal->oom_score_adj, task->comm);
@@ -422,8 +442,17 @@  static int dump_task(struct task_struct *p, void *arg)
  */
 static void dump_tasks(struct oom_control *oc)
 {
+	int nid_to_find_victim;
+
+	if (oc->nodemask) {
+		struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+								oc->nodemask);
+		nid_to_find_victim = zone_to_nid(zoneref->zone);
+	} else {
+		nid_to_find_victim = -1;
+	}
 	pr_info("Tasks state (memory values in pages):\n");
-	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	pr_info("[  pid  ]   uid  tgid total_vm      rss (%02d)nrss  pgtables_bytes swapents oom_score_adj name\n", nid_to_find_victim);
 
 	if (is_memcg_oom(oc))
 		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);