@@ -192,6 +192,7 @@ read the file /proc/PID/status::
VmLib: 1412 kB
VmPTE: 20 kb
VmSwap: 0 kB
+ NumaB_enabled: default
HugetlbPages: 0 kB
CoreDumping: 0
THP_enabled: 1
@@ -273,6 +274,7 @@ It's slow but very precise.
VmPTE size of page table entries
VmSwap amount of swap used by anonymous private data
(shmem swap usage is not included)
+ NumaB_enabled numa balancing state, set by prctl(PR_PROCESS_NUMAB, ...)
HugetlbPages size of hugetlb memory portions
CoreDumping process's memory is currently being dumped
(killing the process may lead to a corrupted core)
@@ -20,6 +20,7 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/sched/numa_balancing.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -76,6 +77,24 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
" kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
seq_puts(m, " kB\n");
+#ifdef CONFIG_NUMA_BALANCING
+ seq_puts(m, "NumaB_enabled:\t");
+ switch (mm->numab_enabled) {
+ case PROCESS_NUMAB_DEFAULT:
+ seq_puts(m, "default");
+ break;
+ case PROCESS_NUMAB_DISABLED:
+ seq_puts(m, "disabled");
+ break;
+ case PROCESS_NUMAB_ENABLED:
+ seq_puts(m, "enabled");
+ break;
+ default:
+ seq_puts(m, "unknown");
+ break;
+ }
+ seq_putc(m, '\n');
+#endif
hugetlb_report_usage(m, mm);
}
#undef SEQ_PUT_DEC
@@ -626,6 +626,9 @@ struct mm_struct {
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
+
+ /* Controls whether NUMA balancing is active for this mm. */
+ int numab_enabled;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
@@ -16,6 +16,12 @@
#define TNF_MIGRATE_FAIL 0x10
#ifdef CONFIG_NUMA_BALANCING
+enum {
+ PROCESS_NUMAB_DISABLED,
+ PROCESS_NUMAB_ENABLED,
+ PROCESS_NUMAB_DEFAULT
+};
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
@@ -3,6 +3,7 @@
#define _LINUX_SCHED_SYSCTL_H
#include <linux/types.h>
+#include <linux/sched/numa_balancing.h>
struct ctl_table;
@@ -29,8 +30,26 @@ enum sched_tunable_scaling {
#ifdef CONFIG_NUMA_BALANCING
extern int sysctl_numa_balancing_mode;
+static inline int process_sysctl_numab_mode(struct mm_struct *mm)
+{
+ int numab = mm->numab_enabled;
+
+ switch (numab) {
+ case PROCESS_NUMAB_ENABLED:
+ return NUMA_BALANCING_NORMAL;
+ case PROCESS_NUMAB_DISABLED:
+ return NUMA_BALANCING_DISABLED;
+ case PROCESS_NUMAB_DEFAULT:
+ default:
+ return sysctl_numa_balancing_mode;
+ }
+}
#else
#define sysctl_numa_balancing_mode 0
+static inline int process_sysctl_numab_mode(struct mm_struct *mm)
+{
+ return NUMA_BALANCING_DISABLED;
+}
#endif
/*
@@ -275,4 +275,11 @@ struct prctl_mm_map {
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0
+/* Set/get per-process NUMA balancing; values match PROCESS_NUMAB_* */
+#define PR_PROCESS_NUMAB 63
+# define PR_SET_PROCESS_NUMAB_DISABLED 0
+# define PR_SET_PROCESS_NUMAB_ENABLED 1
+# define PR_SET_PROCESS_NUMAB_DEFAULT 2
+# define PR_GET_PROCESS_NUMAB 3
+
#endif /* _LINUX_PRCTL_H */
@@ -1059,6 +1059,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->numab_enabled = PROCESS_NUMAB_DEFAULT;
#endif
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
@@ -2575,6 +2575,23 @@ void task_numa_free(struct task_struct *p, bool final)
}
}
+static inline bool process_sched_numab_enabled(struct task_struct *p)
+{
+ if (p->mm) {
+ int numab = p->mm->numab_enabled;
+
+ switch (numab) {
+ case PROCESS_NUMAB_ENABLED:
+ return true;
+ case PROCESS_NUMAB_DISABLED:
+ return false;
+ case PROCESS_NUMAB_DEFAULT:
+ break;
+ }
+ }
+ return static_branch_unlikely(&sched_numa_balancing);
+}
+
/*
* Got a PROT_NONE fault for a page on @node.
*/
@@ -2587,13 +2604,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
struct numa_group *ng;
int priv;
- if (!static_branch_likely(&sched_numa_balancing))
- return;
-
/* for example, ksmd faulting in a user's mm */
if (!p->mm)
return;
+ if (!process_sched_numab_enabled(p))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2894,7 +2911,7 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
int src_nid = cpu_to_node(task_cpu(p));
int dst_nid = cpu_to_node(new_cpu);
- if (!static_branch_likely(&sched_numa_balancing))
+ if (!process_sched_numab_enabled(p))
return;
if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
@@ -2928,6 +2945,11 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
+static inline bool process_sched_numab_enabled(struct task_struct *p)
+{
+ return false;
+}
+
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}
@@ -7687,7 +7709,7 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
- if (!static_branch_likely(&sched_numa_balancing))
+ if (!process_sched_numab_enabled(p))
return -1;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
@@ -11164,7 +11186,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
- if (static_branch_unlikely(&sched_numa_balancing))
+ if (process_sched_numab_enabled(curr))
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
@@ -58,6 +58,7 @@
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/numa_balancing.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
@@ -2081,6 +2082,23 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
return 0;
}
+#ifdef CONFIG_NUMA_BALANCING
+static int prctl_pid_numa_balancing_write(int numa_balancing)
+{
+ if (numa_balancing != PR_SET_PROCESS_NUMAB_DEFAULT
+ && numa_balancing != PR_SET_PROCESS_NUMAB_DISABLED
+ && numa_balancing != PR_SET_PROCESS_NUMAB_ENABLED)
+ return -EINVAL;
+ current->mm->numab_enabled = numa_balancing;
+ return 0;
+}
+
+static int prctl_pid_numa_balancing_read(void)
+{
+ return current->mm->numab_enabled;
+}
+#endif
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
@@ -2585,6 +2603,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = set_syscall_user_dispatch(arg2, arg3, arg4,
(char __user *) arg5);
break;
+#ifdef CONFIG_NUMA_BALANCING
+ case PR_PROCESS_NUMAB:
+ switch (arg2) {
+ case PR_SET_PROCESS_NUMAB_DEFAULT:
+ case PR_SET_PROCESS_NUMAB_DISABLED:
+ case PR_SET_PROCESS_NUMAB_ENABLED:
+ error = prctl_pid_numa_balancing_write((int)arg2);
+ break;
+ case PR_GET_PROCESS_NUMAB:
+ error = put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ break;
+#endif
#ifdef CONFIG_SCHED_CORE
case PR_SCHED_CORE:
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
@@ -1760,7 +1760,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ if (!(process_sysctl_numab_mode(vma->vm_mm) & NUMA_BALANCING_NORMAL) &&
node_is_toptier(page_to_nid(page)))
goto unlock;
}
@@ -33,7 +33,7 @@
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
-
+#include <linux/sched/numa_balancing.h>
#include "internal.h"
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -119,8 +119,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(nid))
+ if (!(process_sysctl_numab_mode(vma->vm_mm) & NUMA_BALANCING_NORMAL)
+ && node_is_toptier(nid))
continue;
}
Add a new prctl option, PR_PROCESS_NUMAB. NUMA balancing triggers a
large number of page faults while it is running, which costs
performance. Processes that care about worst-case latency therefore
need NUMA balancing disabled, while others can accept a temporary
slowdown in exchange for higher average performance and are better
off with it enabled. Today NUMA balancing can only be controlled
globally via /proc/sys/kernel/numa_balancing, so add a per-process
switch instead.

Add a numab_enabled field to mm_struct and consult it in the NUMA
balancing paths (task_tick_fair(), the fault accounting and the PTE
scanning code).

Set per-process NUMA balancing:
	prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_DISABLED);
	prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_ENABLED);
	prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_DEFAULT);

Get the NUMA balancing state:
	prctl(PR_PROCESS_NUMAB, PR_GET_PROCESS_NUMAB, &ret);
	cat /proc/<pid>/status | grep NumaB_enabled

Cc: linux-api@vger.kernel.org
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
Changes in v4:
- Adapt to the new feature that optimizes page placement for memory
  tiering systems:
  https://lore.kernel.org/all/20220128082751.593478-3-ying.huang@intel.com/
- Wrap sched_numa_balancing and mm->numab_enabled with
  process_sched_numab_enabled().

Changes in v3:
- Fix a compile error.

Changes in v2:
- PR_NUMA_BALANCING now supports three states: enabled, disabled and
  default. "enabled" and "disabled" override the global setting;
  "default" follows it.

---
 Documentation/filesystems/proc.rst   |  2 ++
 fs/proc/task_mmu.c                   | 19 +++++++++++++++
 include/linux/mm_types.h             |  3 +++
 include/linux/sched/numa_balancing.h |  6 +++++
 include/linux/sched/sysctl.h         | 19 +++++++++++++++
 include/uapi/linux/prctl.h           |  7 ++++++
 kernel/fork.c                        |  3 +++
 kernel/sched/fair.c                  | 34 ++++++++++++++++++++++-----
 kernel/sys.c                         | 35 ++++++++++++++++++++++++++++
 mm/huge_memory.c                     |  2 +-
 mm/mprotect.c                        |  6 ++---
 11 files changed, 126 insertions(+), 10 deletions(-)
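
For reference, a minimal userspace sketch of the proposed interface.
The PR_PROCESS_NUMAB constants come from this patch and are not in any
released <linux/prctl.h>, so they are defined locally here under that
assumption:

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/prctl.h>

/* Constants proposed by this patch; defined here because no released
 * <linux/prctl.h> carries them yet. Values match PROCESS_NUMAB_*. */
#ifndef PR_PROCESS_NUMAB
#define PR_PROCESS_NUMAB		63
#define PR_SET_PROCESS_NUMAB_DISABLED	0
#define PR_SET_PROCESS_NUMAB_ENABLED	1
#define PR_SET_PROCESS_NUMAB_DEFAULT	2
#define PR_GET_PROCESS_NUMAB		3
#endif

int main(void)
{
	int state;

	/* Disable NUMA balancing for this process only, overriding the
	 * global /proc/sys/kernel/numa_balancing setting. */
	if (prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_DISABLED, 0, 0, 0)) {
		fprintf(stderr, "PR_PROCESS_NUMAB set: %s\n", strerror(errno));
		return 1;
	}

	/* Read the per-process state back; arg3 is a pointer to int. */
	if (prctl(PR_PROCESS_NUMAB, PR_GET_PROCESS_NUMAB, &state, 0, 0)) {
		fprintf(stderr, "PR_PROCESS_NUMAB get: %s\n", strerror(errno));
		return 1;
	}

	printf("numab_enabled = %d\n", state);	/* expect 0 (disabled) */
	return 0;
}

With the patch applied, the NumaB_enabled line added to
/proc/<pid>/status should report the same state ("disabled" here).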