[RFC,-V3,3/5] autonuma, memory tiering: Hot page selection with hint page fault latency

Message ID	20200825002354.17038-4-ying.huang@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=AAJG=CD=kvack.org=owner-linux-mm@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org B98FF207D3 IronPort-SDR: jD2Zz1Lky9jYG/m6ta/XzhxNImd67o9YvXXLb83n1cjke6HfiO7nwBXjTOC9h5p5dpz/7thIfC Xheccfn7awOQ== IronPort-SDR: OXvxewSZ0AW+H04nzzd+EOFXoSQFKoGoY9qlYYUrW2WyCV/SrTzhBAg0K9y1Jbo5Erle6UhgmJ HvBcxOGmXh5Q== From: Huang Ying <ying.huang@intel.com> To: Peter Zijlstra <peterz@infradead.org> Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, Huang Ying <ying.huang@intel.com>, Andrew Morton <akpm@linux-foundation.org>, Michal Hocko <mhocko@suse.com>, Rik van Riel <riel@redhat.com>, Mel Gorman <mgorman@suse.de>, Ingo Molnar <mingo@kernel.org>, Dave Hansen <dave.hansen@linux.intel.com>, Dan Williams <dan.j.williams@intel.com> Subject: [RFC -V3 3/5] autonuma, memory tiering: Hot page selection with hint page fault latency Date: Tue, 25 Aug 2020 08:23:52 +0800 Message-Id: <20200825002354.17038-4-ying.huang@intel.com> In-Reply-To: <20200825002354.17038-1-ying.huang@intel.com> References: <20200825002354.17038-1-ying.huang@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	autonuma: Optimize memory placement for memory tiering system | expand [RFC,-V3,0/5] autonuma: Optimize memory placement for memory tiering system [RFC,-V3,1/5] autonuma: Optimize page placement for memory tiering system [RFC,-V3,2/5] autonuma, memory tiering: Skip to scan fast memory [RFC,-V3,3/5] autonuma, memory tiering: Hot page selection with hint page fault latency [RFC,-V3,4/5] autonuma, memory tiering: Rate limit NUMA migration throughput [RFC,-V3,5/5] autonuma, memory tiering: Adjust hot threshold automatically

diff --git a/include/linux/mm.h b/include/linux/mm.h index dc7b87310c10..0eac5049c153 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1278,6 +1278,18 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); @@ -1320,6 +1332,16 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); } +static inline unsigned int xchg_page_access_time(struct page *page, + unsigned int time) +{ + unsigned int last_time; + + last_time = xchg(&page->_last_cpupid, + (time >> PAGE_ACCESS_TIME_BUCKETS) & LAST_CPUPID_MASK); + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} + static inline int page_cpupid_last(struct page *page) { return page->_last_cpupid; @@ -1335,6 +1357,7 @@ static inline int page_cpupid_last(struct page *page) } extern int page_cpupid_xchg_last(struct page *page, int cpupid); +extern unsigned int xchg_page_access_time(struct page *page, unsigned int time); static inline void page_cpupid_reset_last(struct page *page) { @@ -1347,6 +1370,12 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) return page_to_nid(page); /* XXX */ } +static inline unsigned int xchg_page_access_time(struct page *page, + unsigned int time) +{ + return 0; +} + static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 
bdd38045d14c..435d66269d0a 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -48,6 +48,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_hot_threshold; #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04fa8dbcfa4d..62510b435a89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1076,6 +1076,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = 1000; + struct numa_group { refcount_t refcount; @@ -1416,6 +1419,37 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_mark; + + enough_mark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + high_wmark_pages(zone) + enough_mark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +static int numa_hint_fault_latency(struct page *page) +{ + unsigned int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = xchg_page_access_time(page, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1423,6 +1457,27 @@ 
bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of accessing CPU node. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(src_nid)) { + struct pglist_data *pgdat; + unsigned long latency, th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) + return true; + + th = sysctl_numa_balancing_hot_threshold; + latency = numa_hint_fault_latency(page); + if (latency > th) + return false; + + return true; + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); last_cpupid = page_cpupid_xchg_last(page, this_cpupid); @@ -2636,6 +2691,11 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* Numa faults statistics are unnecessary for the slow memory node */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(mem_node)) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2655,6 +2715,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) */ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { priv = 1; + } else if (unlikely(!cpu_online(cpupid_to_cpu(last_cpupid)))) { + /* + * In memory tiering mode, cpupid of slow memory page is + * used to record page access time, so its value may be + * invalid during numa balancing mode transition. 
+ */ + return; } else { priv = cpupid_match_pid(p, last_cpupid); if (!priv && !(flags & TNF_NO_GROUP)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75a6d35bd22e..69b93b1b20aa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1755,6 +1755,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "numa_balancing_hot_threshold_ms", + .data = &sysctl_numa_balancing_hot_threshold, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing", .data = &sysctl_numa_balancing_mode, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7d5db965a48c..7be0398dc973 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1399,7 +1399,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); - int target_nid, last_cpupid = -1; + int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); bool page_locked; bool migrated = false; bool was_writable; @@ -1426,7 +1426,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); - last_cpupid = page_cpupid_last(page); + if (node_is_toptier(page_nid)) + last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == this_nid) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); @@ -1824,6 +1825,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa) { struct page *page; + bool toptier; /* * Avoid trapping faults against the zero page. 
The read-only * data is likely to be read-cached on the local CPU and @@ -1836,13 +1838,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, goto unlock; page = pmd_page(*pmd); + toptier = node_is_toptier(page_to_nid(page)); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(page_to_nid(page))) + toptier) goto unlock; + + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical diff --git a/mm/memory.c b/mm/memory.c index 3ecad55103ad..69d416b88a09 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -71,6 +71,7 @@ #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/sched/sysctl.h> #include <trace/events/kmem.h> @@ -4077,8 +4078,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; - last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); + /* + * In memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(page_nid)) + last_cpupid = (-1 & LAST_CPUPID_MASK); + else + last_cpupid = page_cpupid_last(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); diff --git a/mm/migrate.c b/mm/migrate.c index 9c5aa588ea4f..20774c88a307 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -641,6 +641,18 @@ void migrate_page_states(struct page *newpage, struct page *page) * future migrations of this same page. 
*/ cpupid = page_cpupid_xchg_last(page, -1); + /* + * If migrate between slow and fast memory node, reset cpupid, + * because that is used to record page access time in slow + * memory node + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { + bool f_toptier = node_is_toptier(page_to_nid(page)); + bool t_toptier = node_is_toptier(page_to_nid(newpage)); + + if (f_toptier != t_toptier) + cpupid = -1; + } page_cpupid_xchg_last(newpage, cpupid); ksm_migrate_page(newpage, page); diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..aa94ec7176ed 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -112,4 +112,21 @@ int page_cpupid_xchg_last(struct page *page, int cpupid) return last_cpupid; } + +unsigned int xchg_page_access_time(struct page *page, unsigned int time) +{ + unsigned long old_flags, flags; + unsigned int last_time; + + time >>= PAGE_ACCESS_TIME_BUCKETS; + do { + old_flags = flags = page->flags; + last_time = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (time & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + + return last_time << PAGE_ACCESS_TIME_BUCKETS; +} #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 8abec0c267fa..7c617bed45ee 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -85,6 +85,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa) { struct page *page; int nid; + bool toptier; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) @@ -114,14 +115,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, nid = page_to_nid(page); if (target_node == nid) continue; + toptier = node_is_toptier(nid); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(nid)) + toptier) continue; + if 
(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, + jiffies_to_msecs(jiffies)); } oldpte = ptep_modify_prot_start(vma, addr, pte);

[RFC,-V3,3/5] autonuma, memory tiering: Hot page selection with hint page fault latency

Commit Message

Patch