Message ID | 1537327066-27852-4-git-send-email-kernelfans@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | introduce a new state 'isolate' for memblock to split the isolation and migration steps | expand |
Hi Pingfan, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on linus/master] [also build test WARNING on v4.19-rc4 next-20180918] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Pingfan-Liu/introduce-a-new-state-isolate-for-memblock-to-split-the-isolation-and-migration-steps/20180919-112650 config: x86_64-randconfig-x018-201837 (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All warnings (new ones prefixed by >>): mm/memory_hotplug.c: In function 'do_migrate_range': >> mm/memory_hotplug.c:1442:53: warning: passing argument 4 of 'migrate_pages' makes integer from pointer without a cast [-Wint-conversion] ret = migrate_pages(&source, new_node_page, NULL, &nmask, ^ In file included from mm/memory_hotplug.c:27:0: include/linux/migrate.h:68:12: note: expected 'long unsigned int' but argument is of type 'nodemask_t * {aka struct <anonymous> *}' extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, ^~~~~~~~~~~~~ vim +/migrate_pages +1442 mm/memory_hotplug.c 1356 1357 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1358 static int 1359 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1360 { 1361 unsigned long pfn; 1362 struct page *page; 1363 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1364 int not_managed = 0; 1365 int ret = 0; 1366 LIST_HEAD(source); 1367 int nid; 1368 nodemask_t nmask = node_states[N_MEMORY]; 1369 1370 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1371 if (!pfn_valid(pfn)) 1372 continue; 1373 page = pfn_to_page(pfn); 1374 1375 if (PageHuge(page)) { 1376 struct page *head = compound_head(page); 1377 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1378 if (compound_order(head) > PFN_SECTION_SHIFT) { 1379 ret = -EBUSY; 1380 break; 1381 } 1382 if (isolate_huge_page(page, &source)) 1383 
move_pages -= 1 << compound_order(head); 1384 continue; 1385 } else if (PageTransHuge(page)) 1386 pfn = page_to_pfn(compound_head(page)) 1387 + hpage_nr_pages(page) - 1; 1388 1389 if (!get_page_unless_zero(page)) 1390 continue; 1391 /* 1392 * We can skip free pages. And we can deal with pages on 1393 * LRU and non-lru movable pages. 1394 */ 1395 if (PageLRU(page)) 1396 ret = isolate_lru_page(page); 1397 else 1398 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); 1399 if (!ret) { /* Success */ 1400 put_page(page); 1401 list_add_tail(&page->lru, &source); 1402 move_pages--; 1403 if (!__PageMovable(page)) 1404 inc_node_page_state(page, NR_ISOLATED_ANON + 1405 page_is_file_cache(page)); 1406 1407 } else { 1408 #ifdef CONFIG_DEBUG_VM 1409 pr_alert("failed to isolate pfn %lx\n", pfn); 1410 dump_page(page, "isolation failed"); 1411 #endif 1412 put_page(page); 1413 /* Because we don't have big zone->lock. we should 1414 check this again here. */ 1415 if (page_count(page)) { 1416 not_managed++; 1417 ret = -EBUSY; 1418 break; 1419 } 1420 } 1421 } 1422 if (!list_empty(&source)) { 1423 if (not_managed) { 1424 putback_movable_pages(&source); 1425 goto out; 1426 } 1427 1428 page = list_entry(source.next, struct page, lru); 1429 nid = page_to_nid(page); 1430 if (!NODE_DATA(nid)->partial_offline) { 1431 /* 1432 * try to allocate from a different node but reuse this 1433 * node if there are no other online nodes to be used 1434 * (e.g. we are offlining a part of the only existing 1435 * node) 1436 */ 1437 node_clear(nid, nmask); 1438 if (nodes_empty(nmask)) 1439 node_set(nid, nmask); 1440 } 1441 /* Allocate a new page from the nearest neighbor node */ > 1442 ret = migrate_pages(&source, new_node_page, NULL, &nmask, 1443 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1444 if (ret) 1445 putback_movable_pages(&source); 1446 } 1447 out: 1448 return ret; 1449 } 1450 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
diff --git a/drivers/base/node.c b/drivers/base/node.c index 1ac4c36..64b0cb8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -25,6 +25,36 @@ static struct bus_type node_subsys = { .dev_name = "node", }; +static ssize_t read_partial_offline(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int nid = dev->id; + struct pglist_data *pgdat = NODE_DATA(nid); + ssize_t len = 0; + + if (pgdat->partial_offline) + len = sprintf(buf, "1\n"); + else + len = sprintf(buf, "0\n"); + + return len; +} + +static ssize_t write_partial_offline(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int nid = dev->id; + struct pglist_data *pgdat = NODE_DATA(nid); + + if (sysfs_streq(buf, "1")) + pgdat->partial_offline = true; + else if (sysfs_streq(buf, "0")) + pgdat->partial_offline = false; + else + return -EINVAL; + + return strlen(buf); +} static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf) { @@ -56,6 +86,8 @@ static inline ssize_t node_read_cpulist(struct device *dev, return node_read_cpumap(dev, true, buf); } +static DEVICE_ATTR(partial_offline, 0600, read_partial_offline, + write_partial_offline); static DEVICE_ATTR(cpumap, S_IRUGO, node_read_cpumask, NULL); static DEVICE_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL); @@ -235,6 +267,7 @@ static struct attribute *node_dev_attrs[] = { &dev_attr_numastat.attr, &dev_attr_distance.attr, &dev_attr_vmstat.attr, + &dev_attr_partial_offline.attr, NULL }; ATTRIBUTE_GROUPS(node_dev); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e22d96..80c44c8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -722,6 +722,7 @@ typedef struct pglist_data { /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; + bool partial_offline; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/memory_hotplug.c 
b/mm/memory_hotplug.c index 228de4d..3c66075 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1346,18 +1346,10 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) static struct page *new_node_page(struct page *page, unsigned long private) { - int nid = page_to_nid(page); - nodemask_t nmask = node_states[N_MEMORY]; - - /* - * try to allocate from a different node but reuse this node if there - * are no other online nodes to be used (e.g. we are offlining a part - * of the only existing node) - */ - node_clear(nid, nmask); - if (nodes_empty(nmask)) - node_set(nid, nmask); + nodemask_t nmask = *(nodemask_t *)private; + int nid; + nid = page_to_nid(page); return new_page_nodemask(page, nid, &nmask); } @@ -1371,6 +1363,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) int not_managed = 0; int ret = 0; LIST_HEAD(source); + int nid; + nodemask_t nmask = node_states[N_MEMORY]; for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { if (!pfn_valid(pfn)) @@ -1430,8 +1424,21 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } + page = list_entry(source.next, struct page, lru); + nid = page_to_nid(page); + if (!NODE_DATA(nid)->partial_offline) { + /* + * try to allocate from a different node but reuse this + * node if there are no other online nodes to be used + * (e.g. we are offlining a part of the only existing + * node) + */ + node_clear(nid, nmask); + if (nodes_empty(nmask)) + node_set(nid, nmask); + } /* Allocate a new page from the nearest neighbor node */ - ret = migrate_pages(&source, new_node_page, NULL, 0, + ret = migrate_pages(&source, new_node_page, NULL, &nmask, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source);
When offlining memory, there are two cases: 1st, offlining all memblocks under a node; 2nd, offlining and replacing only part of the memory under a node. In the 2nd case, there is no need to allocate new pages from other nodes, which may incur extra NUMA faults to resolve the resulting misplacement, and put unnecessary memory pressure on other nodes. This patch introduces an interface /sys/../node/nodeX/partial_offline to let the user choose how new pages are allocated, i.e. from the local node or from other nodes. Signed-off-by: Pingfan Liu <kernelfans@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Pavel Tatashin <pasha.tatashin@oracle.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Bharata B Rao <bharata@linux.vnet.ibm.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- drivers/base/node.c | 33 +++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 1 + mm/memory_hotplug.c | 31 +++++++++++++++++++------------ 3 files changed, 53 insertions(+), 12 deletions(-)