@@ -26,6 +26,7 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern int evacuate_mem_region(struct zone *z, struct zone_mem_region *zmr);
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -102,6 +103,12 @@ static inline bool compaction_deferred(struct zone *zone, int order)
return true;
}
+static inline int evacuate_mem_region(struct zone *z,
+ struct zone_mem_region *zmr)
+{
+ return 0;
+}
+
#endif /* CONFIG_COMPACTION */
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
@@ -351,6 +351,8 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
+int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count,
+ struct list_head *list, int migratetype, int cold);
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
/* This is different from alloc_pages_exact_node !!! */
@@ -30,7 +30,8 @@ enum migrate_reason {
MR_SYSCALL, /* also applies to cpusets */
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
- MR_CMA
+ MR_CMA,
+ MR_PWR_MGMT
};
#ifdef CONFIG_MIGRATION
@@ -470,6 +470,7 @@ void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
int split_free_page(struct page *page);
+void __split_free_page(struct page *page, unsigned int order);
/*
* Compound pages have a destructor function. Provide a
@@ -15,7 +15,8 @@
{MR_MEMORY_HOTPLUG, "memory_hotplug"}, \
{MR_SYSCALL, "syscall_or_cpuset"}, \
{MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \
- {MR_CMA, "cma"}
+ {MR_CMA, "cma"}, \
+ {MR_PWR_MGMT, "power_management"}
TRACE_EVENT(mm_migrate_pages,
@@ -1168,6 +1168,105 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
return rc;
}
+static struct page *power_mgmt_alloc(struct page *migratepage,
+ unsigned long data, int **result)
+{
+ struct compact_control *cc = (struct compact_control *)data;
+ struct page *freepage;
+
+ /*
+ * Try to allocate pages from lower memory regions. If it fails,
+ * abort.
+ */
+ if (list_empty(&cc->freepages)) {
+ struct zone *z = page_zone(migratepage);
+ unsigned int i, count, order = 0;
+ struct page *page, *tmp;
+ LIST_HEAD(list);
+
+ /* Get a bunch of order-0 pages from the buddy freelists */
+ count = rmqueue_bulk(z, order, cc->nr_migratepages, &list,
+ MIGRATE_MOVABLE, 1);
+
+ cc->nr_freepages = count * (1ULL << order);
+
+ if (list_empty(&list))
+ return NULL;
+
+ list_for_each_entry_safe(page, tmp, &list, lru) {
+ __split_free_page(page, order);
+
+ list_move_tail(&page->lru, &cc->freepages);
+
+ /*
+ * Now add all the order-0 subdivisions of this page
+ * to the freelist as well.
+ */
+ for (i = 1; i < (1ULL << order); i++) {
+ page++;
+ list_add(&page->lru, &cc->freepages);
+ }
+
+ }
+
+ VM_BUG_ON(!list_empty(&list));
+
+ /* Now map all the order-0 pages on the freelist. */
+ map_pages(&cc->freepages);
+ }
+
+ freepage = list_entry(cc->freepages.next, struct page, lru);
+
+ if (page_zone_region_id(freepage) >= page_zone_region_id(migratepage))
+ return NULL; /* Freepage is not from lower region, so abort */
+
+ list_del(&freepage->lru);
+ cc->nr_freepages--;
+
+ return freepage;
+}
+
+static unsigned long power_mgmt_release_freepages(unsigned long info)
+{
+ struct compact_control *cc = (struct compact_control *)info;
+
+ return release_freepages(&cc->freepages);
+}
+
+int evacuate_mem_region(struct zone *z, struct zone_mem_region *zmr)
+{
+ unsigned long start_pfn = zmr->start_pfn;
+ unsigned long end_pfn = zmr->end_pfn;
+
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start_pfn)),
+ .sync = false, /* Async migration */
+ .ignore_skip_hint = true,
+ };
+
+ struct aggression_control ac = {
+ .isolate_unevictable = false,
+ .prep_all = false,
+ .reclaim_clean = true,
+ .max_tries = 1,
+ .reason = MR_PWR_MGMT,
+ };
+
+ struct free_page_control fc = {
+ .free_page_alloc = power_mgmt_alloc,
+ .alloc_data = (unsigned long)&cc,
+ .release_freepages = power_mgmt_release_freepages,
+ .free_data = (unsigned long)&cc,
+ };
+
+ INIT_LIST_HEAD(&cc.migratepages);
+ INIT_LIST_HEAD(&cc.freepages);
+
+ return compact_range(&cc, &ac, &fc, start_pfn, end_pfn);
+}
+
/* Compact all zones within a node */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
@@ -1793,9 +1793,8 @@ retry:
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
-static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list,
- int migratetype, int cold)
+int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count,
+ struct list_head *list, int migratetype, int cold)
{
int mt = migratetype, i;
@@ -2111,6 +2110,20 @@ static int __isolate_free_page(struct page *page, unsigned int order)
return 1UL << order;
}
+
+/*
+ * The page is already free and isolated (removed) from the buddy system.
+ * Set up the refcounts appropriately. Note that we can't use page_order()
+ * here, since the buddy system would have invoked rmv_page_order() before
+ * handing the page over to us.
+ */
+void __split_free_page(struct page *page, unsigned int order)
+{
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+}
+
/*
* Similar to split_page except the page is already free. As this is only
* being used for migration, the migratetype of the block also changes.
@@ -2132,9 +2145,7 @@ int split_free_page(struct page *page)
if (!nr_pages)
return 0;
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
+ __split_free_page(page, order);
return nr_pages;
}
To enhance memory power savings, we need to be able to completely evacuate lightly allocated regions and move their used pages to lower regions, which helps consolidate all the allocations into a minimum number of regions. This can be done using some of the memory compaction and reclaim algorithms. Develop such an infrastructure to evacuate memory regions completely.

The traditional compaction algorithm uses a pfn walker to get free pages for compaction, but that would be far too costly here. So we do a pfn walk only to isolate the used pages; to get free pages, we simply depend on the fast buddy allocator itself. However, we are careful to abort the compaction run as soon as the buddy allocator starts giving out free pages from this region itself or from higher regions, because proceeding in that case would defeat the purpose of the entire effort.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/compaction.h     |    7 +++
 include/linux/gfp.h            |    2 +
 include/linux/migrate.h        |    3 +
 include/linux/mm.h             |    1 +
 include/trace/events/migrate.h |    3 +
 mm/compaction.c                |   99 ++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c                |   23 +++++++--

 7 files changed, 130 insertions(+), 8 deletions(-)
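For reference, here is a rough sketch (not part of this patch) of how a caller in the memory power management code might drive evacuate_mem_region() over a zone's regions. The zone->zone_regions[] array, zone->nr_zone_regions, and the per-region nr_free/present_pages fields are assumed from earlier patches in this series and may differ in detail:

/*
 * Hypothetical caller, for illustration only: walk the zone's memory
 * regions from the top and try to evacuate each partially used region
 * into lower regions. zone->zone_regions[], zone->nr_zone_regions and
 * the zmr->nr_free/present_pages fields are assumed from earlier
 * patches in this series.
 */
static void try_evacuate_zone_regions(struct zone *zone)
{
	int i, ret;

	/* Never evacuate region 0; it is the target of consolidation. */
	for (i = zone->nr_zone_regions - 1; i > 0; i--) {
		struct zone_mem_region *zmr = &zone->zone_regions[i];

		/* Skip regions that are already completely free */
		if (zmr->nr_free == zmr->present_pages)
			continue;

		ret = evacuate_mem_region(zone, zmr);
		if (ret)
			break;	/* Lower regions can't absorb the pages */
	}
}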