@@ -53,6 +53,7 @@ enum {
#define MPOL_MF_MOVE_MT (1<<6) /* Use multi-threaded page copy routine */
#define MPOL_MF_MOVE_CONCUR (1<<7) /* Move pages in a batch */
#define MPOL_MF_EXCHANGE (1<<8) /* Exchange pages */
+#define MPOL_MF_SHRINK_LISTS (1<<9) /* Shrink active/inactive lists */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
@@ -564,7 +564,7 @@ extern int copy_page_lists_mt(struct page **to,
extern int exchange_page_mthread(struct page *to, struct page *from,
int nr_pages);
extern int exchange_page_lists_mthread(struct page **to,
- struct page **from,
+ struct page **from,
int nr_pages);
extern int exchange_two_pages(struct page *page1, struct page *page2);
@@ -577,4 +577,95 @@ int expected_page_refs(struct address_space *mapping, struct page *page);
int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode);
+unsigned move_active_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list,
+ struct list_head *pages_to_free,
+ enum lru_list lru);
+void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list);
+
+struct scan_control {
+ /* How many pages shrink_list() should reclaim */
+ unsigned long nr_to_reclaim;
+
+ /*
+ * Nodemask of nodes allowed by the caller. If NULL, all nodes
+ * are scanned.
+ */
+ nodemask_t *nodemask;
+
+ /*
+ * The memory cgroup that hit its limit and as a result is the
+ * primary target of this reclaim invocation.
+ */
+ struct mem_cgroup *target_mem_cgroup;
+
+ /* Writepage batching in laptop mode; RECLAIM_WRITE */
+ unsigned int may_writepage:1;
+
+ /* Can mapped pages be reclaimed? */
+ unsigned int may_unmap:1;
+
+ /* Can pages be swapped as part of reclaim? */
+ unsigned int may_swap:1;
+
+ /* e.g. boosted watermark reclaim leaves slabs alone */
+ unsigned int may_shrinkslab:1;
+
+ /*
+ * Cgroups are not reclaimed below their configured memory.low,
+ * unless we threaten to OOM. If any cgroups are skipped due to
+ * memory.low and nothing was reclaimed, go back for memory.low.
+ */
+ unsigned int memcg_low_reclaim:1;
+ unsigned int memcg_low_skipped:1;
+
+ unsigned int hibernation_mode:1;
+
+ /* One of the zones is ready for compaction */
+ unsigned int compaction_ready:1;
+
+ /* Isolate only huge pages, or only base (non-compound) pages */
+ unsigned int isolate_only_huge_page:1;
+ unsigned int isolate_only_base_page:1;
+
+ /* Scan the lists without reclaiming; pages are kept for later migration */
+ unsigned int no_reclaim:1;
+
+ /* Allocation order */
+ s8 order;
+
+ /* Scan (total_size >> priority) pages at once */
+ s8 priority;
+
+ /* The highest zone to isolate pages for reclaim from */
+ s8 reclaim_idx;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
+ /* Incremented by the number of inactive pages that were scanned */
+ unsigned long nr_scanned;
+
+ /* Number of pages freed so far during a call to shrink_zones() */
+ unsigned long nr_reclaimed;
+
+ struct {
+ unsigned int dirty;
+ unsigned int unqueued_dirty;
+ unsigned int congested;
+ unsigned int writeback;
+ unsigned int immediate;
+ unsigned int file_taken;
+ unsigned int taken;
+ } nr;
+};
+
+unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct list_head *dst,
+ unsigned long *nr_scanned, struct scan_control *sc,
+ enum lru_list lru);
+void shrink_active_list(unsigned long nr_to_scan,
+ struct lruvec *lruvec,
+ struct scan_control *sc,
+ enum lru_list lru);
+unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, enum lru_list lru);
+
#endif /* __MM_INTERNAL_H */
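Note: the memory_manage.c hunks below call lruvec_size_memcg_node(), which is introduced elsewhere in this series. A minimal sketch of what such a helper would compute, assuming the mem_cgroup_node_nr_lru_pages() API available in this kernel (hypothetical implementation, not part of this patch):

    /* Hypothetical: per-memcg, per-node size of one LRU list. */
    static unsigned long lruvec_size_memcg_node(enum lru_list lru,
    		struct mem_cgroup *memcg, int nid)
    {
    	if (nid == MAX_NUMNODES)
    		return 0;

    	return mem_cgroup_node_nr_lru_pages(memcg, nid, BIT(lru));
    }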
@@ -5,13 +5,79 @@
#include <linux/sched/mm.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
#include <linux/nodemask.h>
+#include <linux/rmap.h>
#include <linux/security.h>
+#include <linux/swap.h>
#include <linux/syscalls.h>
#include "internal.h"
+static unsigned long shrink_lists_node_memcg(pg_data_t *pgdat,
+ struct mem_cgroup *memcg, unsigned long nr_to_scan)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ unsigned long nr_to_scan_local = lruvec_size_memcg_node(lru, memcg,
+ pgdat->node_id) / 2;
+ struct scan_control sc = {.may_unmap = 1, .no_reclaim = 1};
+ /*
+ * For the slow node, work on the active list, starting from its
+ * top; pages at the bottom of the inactive list can be rotated
+ * to the top of the inactive list.
+ *
+ * For the fast node, work on the inactive list, starting from
+ * its bottom; pages on the active list are left alone.
+ *
+ * Open questions: how many pages to scan per pass (nr_to_scan is
+ * currently unused; each list is scanned at half its size), and
+ * what criteria should govern moving pages between the active
+ * and inactive lists.
+ */
+ if (is_active_lru(lru))
+ shrink_active_list(nr_to_scan_local, lruvec, &sc, lru);
+ else
+ shrink_inactive_list(nr_to_scan_local, lruvec, &sc, lru);
+ }
+ cond_resched();
+
+ return 0;
+}
+
+static int shrink_lists(struct task_struct *p, struct mm_struct *mm,
+ const nodemask_t *slow, const nodemask_t *fast, unsigned long nr_to_scan)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_task(p);
+ int slow_nid, fast_nid;
+ int err = 0;
+
+ if (!memcg)
+ return 0;
+ /* Handle only the simplest case for now: one slow node, one fast node */
+ if (!(nodes_weight(*slow) == 1 && nodes_weight(*fast) == 1))
+ return 0;
+
+ if (memcg == root_mem_cgroup)
+ return 0;
+
+ slow_nid = first_node(*slow);
+ fast_nid = first_node(*fast);
+
+ /* move pages between page lists in slow node */
+ shrink_lists_node_memcg(NODE_DATA(slow_nid), memcg, nr_to_scan);
+
+ /* move pages between page lists in fast node */
+ shrink_lists_node_memcg(NODE_DATA(fast_nid), memcg, nr_to_scan);
+
+ return err;
+}
+
SYSCALL_DEFINE6(mm_manage, pid_t, pid, unsigned long, nr_pages,
unsigned long, maxnode,
const unsigned long __user *, slow_nodes,
@@ -42,10 +108,14 @@ SYSCALL_DEFINE6(mm_manage, pid_t, pid, unsigned long, nr_pages,
goto out;
/* Check flags */
- if (flags & ~(MPOL_MF_MOVE_MT|
+ if (flags & ~(MPOL_MF_MOVE|
+ MPOL_MF_MOVE_MT|
MPOL_MF_MOVE_DMA|
MPOL_MF_MOVE_CONCUR|
- MPOL_MF_EXCHANGE))
+ MPOL_MF_EXCHANGE|
+ MPOL_MF_SHRINK_LISTS|
+ MPOL_MF_MOVE_ALL))
return -EINVAL;
/* Find the mm_struct */
@@ -94,6 +164,8 @@ SYSCALL_DEFINE6(mm_manage, pid_t, pid, unsigned long, nr_pages,
set_bit(MMF_MM_MANAGE, &mm->flags);
}
+ if (flags & MPOL_MF_SHRINK_LISTS)
+ shrink_lists(task, mm, slow, fast, nr_pages);
clear_bit(MMF_MM_MANAGE, &mm->flags);
mmput(mm);
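For reference, a userspace caller would exercise the new flag roughly as follows. This is a sketch only: the mm_manage() wrapper and the syscall number are placeholders (the number comes from earlier patches in this series), and the flag values mirror the patched uapi header.

    #define _GNU_SOURCE
    #include <sys/syscall.h>
    #include <unistd.h>

    #define MPOL_MF_MOVE         (1<<1)  /* from uapi mempolicy.h */
    #define MPOL_MF_SHRINK_LISTS (1<<9)  /* added by this patch */

    #ifndef __NR_mm_manage
    #define __NR_mm_manage 333           /* placeholder syscall number */
    #endif

    /* Sort the task's LRU lists on the slow/fast nodes, then migrate. */
    static long mm_manage(pid_t pid, unsigned long nr_pages,
    		unsigned long maxnode, const unsigned long *slow_nodes,
    		const unsigned long *fast_nodes, int flags)
    {
    	return syscall(__NR_mm_manage, pid, nr_pages, maxnode,
    		       slow_nodes, fast_nodes, flags);
    }

    /* e.g.: mm_manage(pid, 1024, maxnode, slow_mask, fast_mask,
     *                 MPOL_MF_MOVE | MPOL_MF_SHRINK_LISTS);
     */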
@@ -63,75 +63,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
-struct scan_control {
- /* How many pages shrink_list() should reclaim */
- unsigned long nr_to_reclaim;
-
- /*
- * Nodemask of nodes allowed by the caller. If NULL, all nodes
- * are scanned.
- */
- nodemask_t *nodemask;
-
- /*
- * The memory cgroup that hit its limit and as a result is the
- * primary target of this reclaim invocation.
- */
- struct mem_cgroup *target_mem_cgroup;
-
- /* Writepage batching in laptop mode; RECLAIM_WRITE */
- unsigned int may_writepage:1;
-
- /* Can mapped pages be reclaimed? */
- unsigned int may_unmap:1;
-
- /* Can pages be swapped as part of reclaim? */
- unsigned int may_swap:1;
-
- /* e.g. boosted watermark reclaim leaves slabs alone */
- unsigned int may_shrinkslab:1;
-
- /*
- * Cgroups are not reclaimed below their configured memory.low,
- * unless we threaten to OOM. If any cgroups are skipped due to
- * memory.low and nothing was reclaimed, go back for memory.low.
- */
- unsigned int memcg_low_reclaim:1;
- unsigned int memcg_low_skipped:1;
-
- unsigned int hibernation_mode:1;
-
- /* One of the zones is ready for compaction */
- unsigned int compaction_ready:1;
-
- /* Allocation order */
- s8 order;
-
- /* Scan (total_size >> priority) pages at once */
- s8 priority;
-
- /* The highest zone to isolate pages for reclaim from */
- s8 reclaim_idx;
-
- /* This context's GFP mask */
- gfp_t gfp_mask;
-
- /* Incremented by the number of inactive pages that were scanned */
- unsigned long nr_scanned;
-
- /* Number of pages freed so far during a call to shrink_zones() */
- unsigned long nr_reclaimed;
-
- struct {
- unsigned int dirty;
- unsigned int unqueued_dirty;
- unsigned int congested;
- unsigned int writeback;
- unsigned int immediate;
- unsigned int file_taken;
- unsigned int taken;
- } nr;
-};
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
@@ -1261,6 +1192,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
; /* try to reclaim the page below */
}
+ /*
+ * Keep the page on the inactive list so it can be migrated in
+ * the next step.
+ */
+ if (sc->no_reclaim) {
+ stat->nr_ref_keep++;
+ goto keep_locked;
+ }
+
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
@@ -1613,7 +1551,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
*
* returns how many pages were moved onto *@dst.
*/
-static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
enum lru_list lru)
@@ -1634,6 +1572,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct page *page;
page = lru_to_page(src);
+ nr_pages = hpage_nr_pages(page);
+
+ /* Use pages_skipped so a filtered page is not rescanned forever */
+ if (sc->isolate_only_base_page && nr_pages != 1) {
+ list_move(&page->lru, &pages_skipped);
+ continue;
+ }
+ if (sc->isolate_only_huge_page && nr_pages == 1) {
+ list_move(&page->lru, &pages_skipped);
+ continue;
+ }
+
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON_PAGE(!PageLRU(page), page);
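To illustrate the new isolation filters, an in-kernel caller that wants only base pages off an LRU might set up scan_control as below. Hypothetical usage sketch: pgdat, lruvec, and nr_to_scan are assumed to be in scope, the node's lru_lock must be held around isolate_lru_pages() (as in shrink_active_list()/shrink_inactive_list()), and isolated-page accounting is omitted.

    struct scan_control sc = {
    	.reclaim_idx = MAX_NR_ZONES - 1,  /* no zone filtering */
    	.may_unmap = 1,                   /* mapped pages too */
    	.isolate_only_base_page = 1,      /* skip compound pages */
    	.no_reclaim = 1,                  /* sorting, not reclaiming */
    };
    unsigned long nr_scanned, nr_taken;
    LIST_HEAD(page_list);

    spin_lock_irq(&pgdat->lru_lock);
    nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
    			     &nr_scanned, &sc, LRU_INACTIVE_ANON);
    spin_unlock_irq(&pgdat->lru_lock);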
@@ -1653,7 +1598,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
scan++;
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
@@ -1855,7 +1799,7 @@ static int current_may_throttle(void)
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
-static noinline_for_stack unsigned long
+noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
@@ -2029,7 +1973,7 @@ unsigned move_active_pages_to_lru(struct lruvec *lruvec,
return nr_moved;
}
-static void shrink_active_list(unsigned long nr_to_scan,
+void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)