--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -14,6 +14,7 @@
#define NUMA_NO_NODE (-1)
#define NUMA_NO_MEMBLK (-1)
+#define NUMA_TASK_MIG (1)
static inline bool numa_valid_node(int nid)
{
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,6 +64,7 @@ enum {
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
+#define MPOL_F_MOFT (1 << 5) /* allow task but not page migration on fault */
/*
* These bit locations are exposed in the vm.zone_reclaim_mode sysctl
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5683,7 +5683,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
writable, &last_cpupid);
- if (target_nid == NUMA_NO_NODE)
+ if (target_nid == NUMA_NO_NODE || target_nid == NUMA_TASK_MIG)
goto out_map;
if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
flags |= TNF_MIGRATE_FAIL;
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1510,6 +1510,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
if (*flags & MPOL_F_NUMA_BALANCING) {
if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
*flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ else if (*mode == MPOL_INTERLEAVE)
+ *flags |= (MPOL_F_MOF | MPOL_F_MOFT);
else
return -EINVAL;
}
@@ -2779,6 +2781,11 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
if (!(pol->flags & MPOL_F_MOF))
goto out;
+ if (pol->flags & MPOL_F_MOFT) {
+ ret = NUMA_TASK_MIG;
+ goto out;
+ }
+
switch (pol->mode) {
case MPOL_INTERLEAVE:
polnid = interleave_nid(pol, ilx);
MPOL_INTERLEAVE is used to allocate pages interleaved across different
NUMA nodes to make the best use of memory bandwidth. Under
MPOL_INTERLEAVE mode, NUMA balancing does not migrate pages, because
each page is already in its designated place. Nor does it migrate
tasks: mpol_misplaced() returns NUMA_NO_NODE, which instructs
do_numa_page() to skip both page and task migration.

However, there is a scenario in the production environment where NUMA
balancing could benefit MPOL_INTERLEAVE. In this typical scenario, the
tasks in cgroup g_A are bound to two SNC (Sub-NUMA Cluster) nodes via
cpuset, and their pages are allocated only on these two SNC nodes in an
interleaved manner using MPOL_INTERLEAVE. This setup gives g_A good
resource isolation while effectively utilizing the memory bandwidth of
the two SNC nodes. Still, tasks t1 and t2 in g_A may end up with remote
access patterns:

    Node 0        Node 1
    t1            t1.page
    t2.page       t2

Ideally, a NUMA balancing task swap would be beneficial:

    Node 0        Node 1
    t2            t1.page
    t2.page       t1

In other words, NUMA balancing can swap t1 and t2 to improve NUMA
locality without migrating any pages, thereby still honoring the
MPOL_INTERLEAVE policy.

To let NUMA balancing manage MPOL_INTERLEAVE, add MPOL_F_MOF to the
MPOL_INTERLEAVE policy if the user has requested it via
MPOL_F_NUMA_BALANCING (similar to MPOL_BIND). In summary, pages are not
migrated for MPOL_INTERLEAVE, but tasks are migrated to their preferred
nodes.

Tested on a system with 4 nodes, 40 cores (80 CPUs) per node, using
autonumabench NUMA01_THREADLOCAL, with some minor changes to support
MPOL_INTERLEAVE:

    p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    set_mempolicy(MPOL_INTERLEAVE | MPOL_F_NUMA_BALANCING,
                  &nodemask_global, max_nodes);
    ...
    /*
     * Each thread accesses 4K of data every 8K;
     * 1 thread should access the pages on 1 node.
     */

No obvious score difference was observed, but some NUMA balancing task
migrations were noticed:

                                    baseline_nocg_interleave     nb_nocg_interlave
Min       syst-NUMA01_THREADLOCAL      7156.34 (   0.00%)      7267.28 (  -1.55%)
Min       elsp-NUMA01_THREADLOCAL        90.73 (   0.00%)        90.88 (  -0.17%)
Amean     syst-NUMA01_THREADLOCAL      7156.34 (   0.00%)      7267.28 (  -1.55%)
Amean     elsp-NUMA01_THREADLOCAL        90.73 (   0.00%)        90.88 (  -0.17%)
Stddev    syst-NUMA01_THREADLOCAL         0.00 (   0.00%)         0.00 (   0.00%)
Stddev    elsp-NUMA01_THREADLOCAL         0.00 (   0.00%)         0.00 (   0.00%)
CoeffVar  syst-NUMA01_THREADLOCAL         0.00 (   0.00%)         0.00 (   0.00%)
CoeffVar  elsp-NUMA01_THREADLOCAL         0.00 (   0.00%)         0.00 (   0.00%)
Max       syst-NUMA01_THREADLOCAL      7156.34 (   0.00%)      7267.28 (  -1.55%)
Max       elsp-NUMA01_THREADLOCAL        90.73 (   0.00%)        90.88 (  -0.17%)
BAmean-50 syst-NUMA01_THREADLOCAL      7156.34 (   0.00%)      7267.28 (  -1.55%)
BAmean-50 elsp-NUMA01_THREADLOCAL        90.73 (   0.00%)        90.88 (  -0.17%)
BAmean-95 syst-NUMA01_THREADLOCAL      7156.34 (   0.00%)      7267.28 (  -1.55%)
BAmean-95 elsp-NUMA01_THREADLOCAL        90.73 (   0.00%)        90.88 (  -0.17%)
BAmean-99 syst-NUMA01_THREADLOCAL      7156.34 (   0.00%)      7267.28 (  -1.55%)
BAmean-99 elsp-NUMA01_THREADLOCAL        90.73 (   0.00%)        90.88 (  -0.17%)

Delta of /sys/fs/cgroup/mytest/memory.stat during the test:

    numa_pages_migrated: 0
    numa_pte_updates: 9156154
    numa_hint_faults: 8659673
    numa_task_migrated: 282   <--- introduced in previous patch
    numa_task_swaped: 114     <--- introduced in previous patch

More tests to come.
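For reference, below is a minimal, self-contained sketch of the kind of
test harness described above (mmap + set_mempolicy + per-thread
4K-in-8K access). NR_THREADS, SIZE, the loop count and the two-node
mask are illustrative assumptions, not the actual autonumabench
modification:

    /*
     * Minimal userspace sketch, built with: gcc -O2 -pthread sketch.c -lnuma
     * All sizes and the node mask below are illustrative only.
     */
    #include <numaif.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MPOL_F_NUMA_BALANCING
    #define MPOL_F_NUMA_BALANCING (1 << 13)   /* uapi mode flag since v5.15 */
    #endif

    #define NR_THREADS 4
    #define CHUNK      (8UL << 10)                 /* 8K stride */
    #define SIZE       (NR_THREADS * CHUNK * 4096) /* 128M in total */

    static char *p;

    /* Each thread touches the first 4K of every 8K chunk in its own slice. */
    static void *worker(void *arg)
    {
        long id = (long)arg;
        char *base = p + id * (SIZE / NR_THREADS);
        size_t off;
        int loop;

        for (loop = 0; loop < 100; loop++)
            for (off = 0; off < SIZE / NR_THREADS; off += CHUNK)
                memset(base + off, id, CHUNK / 2);
        return NULL;
    }

    int main(void)
    {
        /* assumed: nodes 0 and 1; adjust to the machine under test */
        unsigned long nodemask_global = (1UL << 0) | (1UL << 1);
        unsigned long max_nodes = 8 * sizeof(nodemask_global);
        pthread_t tid[NR_THREADS];
        long i;

        p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* interleave the pages and opt in to NUMA balancing task migration */
        if (set_mempolicy(MPOL_INTERLEAVE | MPOL_F_NUMA_BALANCING,
                          &nodemask_global, max_nodes)) {
            perror("set_mempolicy");
            return 1;
        }

        for (i = 0; i < NR_THREADS; i++)
            pthread_create(&tid[i], NULL, worker, (void *)i);
        for (i = 0; i < NR_THREADS; i++)
            pthread_join(tid[i], NULL);

        return 0;
    }

On a kernel without this patch, the set_mempolicy() call above fails
with EINVAL, since MPOL_F_NUMA_BALANCING is only accepted for MPOL_BIND
and MPOL_PREFERRED_MANY.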
Suggested-by: Aubrey Li <aubrey.li@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
 include/linux/numa.h           | 1 +
 include/uapi/linux/mempolicy.h | 1 +
 mm/memory.c                    | 2 +-
 mm/mempolicy.c                 | 7 +++++++
 4 files changed, 10 insertions(+), 1 deletion(-)