@@ -203,6 +203,7 @@ extern int user_min_free_kbytes;
extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);
+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -1161,6 +1161,10 @@ out:
return rc;
}
+/*
+ * Writes to this array occur without locking. READ_ONCE()
+ * is recommended for readers to ensure consistent reads.
+ */
static int node_demotion[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
/**
@@ -1174,7 +1178,13 @@ static int node_demotion[MAX_NUMNODES] =
*/
int next_demotion_node(int node)
{
- return node_demotion[node];
+ /*
+ * node_demotion[] is updated without excluding
+ * this function from running. READ_ONCE() avoids
+ * reading multiple, inconsistent 'node' values
+ * during an update.
+ */
+ return READ_ONCE(node_demotion[node]);
}
/*
@@ -3112,3 +3122,128 @@ void migrate_vma_finalize(struct migrate
}
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
+
+/* Disable reclaim-based migration. */
+static void disable_all_migrate_targets(void)
+{
+ int node;
+
+ for_each_online_node(node)
+ node_demotion[node] = NUMA_NO_NODE;
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK. It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+ int migration_target;
+
+ /*
+ * Can not set a migration target on a
+ * node with it already set.
+ *
+ * No need for READ_ONCE() here since this
+ * in the write path for node_demotion[].
+ * This should be the only thread writing.
+ */
+ if (node_demotion[node] != NUMA_NO_NODE)
+ return NUMA_NO_NODE;
+
+ migration_target = find_next_best_node(node, used);
+ if (migration_target == NUMA_NO_NODE)
+ return NUMA_NO_NODE;
+
+ node_demotion[node] = migration_target;
+
+ return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided. If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[]. However, it can not run simultaneously
+ * with itself. Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+void __set_migration_target_nodes(void)
+{
+ nodemask_t next_pass = NODE_MASK_NONE;
+ nodemask_t this_pass = NODE_MASK_NONE;
+ nodemask_t used_targets = NODE_MASK_NONE;
+ int node;
+
+ /*
+ * Avoid any oddities like cycles that could occur
+ * from changes in the topology. This will leave
+ * a momentary gap when migration is disabled.
+ */
+ disable_all_migrate_targets();
+
+ /*
+ * Ensure that the "disable" is visible across the system.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ *
+ * The before+after state together might have cycles and
+ * could cause readers to do things like loop until this
+ * function finishes. This ensures they can only see a
+ * single "bad" read and would, for instance, only loop
+ * once.
+ */
+ smp_wmb();
+
+ /*
+ * Allocations go close to CPUs, first. Assume that
+ * the migration path starts at the nodes with CPUs.
+ */
+ next_pass = node_states[N_CPU];
+again:
+ this_pass = next_pass;
+ next_pass = NODE_MASK_NONE;
+ /*
+ * To avoid cycles in the migration "graph", ensure
+ * that migration sources are not future targets by
+ * setting them in 'used_targets'. Do this only
+ * once per pass so that multiple source nodes can
+ * share a target node.
+ *
+ * 'used_targets' will become unavailable in future
+ * passes. This limits some opportunities for
+ * multiple source nodes to share a desintation.
+ */
+ nodes_or(used_targets, used_targets, this_pass);
+ for_each_node_mask(node, this_pass) {
+ int target_node = establish_migrate_target(node, &used_targets);
+
+ if (target_node == NUMA_NO_NODE)
+ continue;
+
+ /* Visit targets from this pass in the next pass: */
+ node_set(target_node, next_pass);
+ }
+ /* Is another pass necessary? */
+ if (!nodes_empty(next_pass))
+ goto again;
+}
+
+void set_migration_target_nodes(void)
+{
+ get_online_mems();
+ __set_migration_target_nodes();
+ put_online_mems();
+}
@@ -5632,7 +5632,7 @@ static int node_load[MAX_NUMNODES];
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;