@@ -552,8 +552,12 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
unsigned long totalpages = totalram_pages() + total_swap_pages;
unsigned long points = 0;
long badness;
+ struct oom_control oc = {
+ .totalpages = totalpages,
+ .gfp_mask = 0,
+ };
- badness = oom_badness(task, totalpages);
+ badness = oom_badness(task, &oc);
/*
* Special case OOM_SCORE_ADJ_MIN for all others scale the
* badness value into [0, 2000] range which we have been
@@ -109,7 +109,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
bool __oom_reap_task_mm(struct mm_struct *mm);
long oom_badness(struct task_struct *p,
- unsigned long totalpages);
+ struct oom_control *oc);
extern bool out_of_memory(struct oom_control *oc);
@@ -198,7 +198,7 @@ static bool should_dump_unreclaim_slab(void)
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, struct oom_control *oc)
{
long points;
long adj;
@@ -227,12 +227,22 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
- mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+ struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+ oc->nodemask);
+ int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+ points = get_mm_counter(p->mm, -1, nid_to_find_victim) +
+ get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+ mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ } else {
+ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+ mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ }
task_unlock(p);
/* Normalize to oom_score_adj units */
- adj *= totalpages / 1000;
+ adj *= oc->totalpages / 1000;
points += adj;
return points;
@@ -338,7 +348,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto select;
}
- points = oom_badness(task, oc->totalpages);
+ points = oom_badness(task, oc);
if (points == LONG_MIN || points < oc->chosen_points)
goto next;
@@ -382,6 +392,7 @@ static int dump_task(struct task_struct *p, void *arg)
{
struct oom_control *oc = arg;
struct task_struct *task;
+ unsigned long node_mm_rss;
if (oom_unkillable_task(p))
return 0;
@@ -399,9 +410,18 @@ static int dump_task(struct task_struct *p, void *arg)
return 0;
}
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+ struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+ oc->nodemask);
+ int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+ node_mm_rss = get_mm_counter(p->mm, -1, nid_to_find_victim);
+ } else {
+ node_mm_rss = 0;
+ }
+ pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm), node_mm_rss,
mm_pgtables_bytes(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS, NUMA_NO_NODE),
task->signal->oom_score_adj, task->comm);
@@ -422,8 +442,17 @@ static int dump_task(struct task_struct *p, void *arg)
*/
static void dump_tasks(struct oom_control *oc)
{
+ int nid_to_find_victim;
+
+ if (oc->nodemask) {
+ struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+ oc->nodemask);
+ nid_to_find_victim = zone_to_nid(zoneref->zone);
+ } else {
+ nid_to_find_victim = -1;
+ }
pr_info("Tasks state (memory values in pages):\n");
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ pr_info("[ pid ] uid tgid total_vm rss (%02d)nrss pgtables_bytes swapents oom_score_adj name\n", nid_to_find_victim);
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
The page allocator will only allocate pages on the node indicated by `nodemask`. But the OOM killer will still select the victim process by total rss usage, which may reclaim nothing on the node indicated by `nodemask`. This patch lets the OOM killer calculate rss only on the given node when oc->constraint equals CONSTRAINT_MEMORY_POLICY. If `nodemask` is assigned, the process with the highest memory consumption on the specific node will be killed. The oom_kill dmesg will look like this: ``` [ 1471.436027] Tasks state (memory values in pages): [ 1471.438518] [ pid ] uid tgid total_vm rss (01)nrss pgtables_bytes swapents oom_score_adj name [ 1471.554703] [ 1011] 0 1011 220005 8589 1872 823296 0 0 node [ 1471.707912] [ 12399] 0 12399 1311306 1311056 262170 10534912 0 0 a.out [ 1471.712429] [ 13135] 0 13135 787018 674666 674300 5439488 0 0 a.out [ 1471.721506] [ 13295] 0 13295 597 188 0 24576 0 0 sh [ 1471.734600] oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=1,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/user.slice/user-0.slice/session-3.scope,task=a.out,pid=13135,uid=0 [ 1471.742583] Out of memory: Killed process 13135 (a.out) total-vm:3148072kB, anon-rss:2697304kB, file-rss:1360kB, shmem-rss:0kB, UID:0 pgtables:5312kB oom_score_adj:0 [ 1471.849615] oom_reaper: reaped process 13135 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB ``` Signed-off-by: Gang Li <ligang.bdlg@bytedance.com> --- fs/proc/base.c | 6 +++++- include/linux/oom.h | 2 +- mm/oom_kill.c | 45 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 10 deletions(-)