
[v10,7/8] mm/demotion: Demote pages according to allocation fallback order

Message ID 20220720025920.1373558-8-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Series mm/demotion: Memory tiers and demotion

Commit Message

Aneesh Kumar K.V July 20, 2022, 2:59 a.m. UTC
From: Jagdish Gediya <jvgediya.oss@gmail.com>

Currently, a higher tier node can only be demoted to selected
nodes on the next lower tier as defined by the demotion path.
This strict, hard-coded demotion order does not work in all
use cases (e.g. some use cases may want to allow cross-socket
demotion to another node in the same demotion tier as a fallback
when the preferred demotion node is out of space). This demotion
order is also inconsistent with the page allocation fallback order
when all the nodes in a higher tier are out of space: The page
allocation can fall back to any node from any lower tier, whereas
the demotion order doesn't allow that currently.

This patch adds support for getting all the allowed demotion targets
for a memory tier. The demote_page_list() function is modified to use
this allowed node mask as the fallback allocation mask.

Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/memory-tiers.h | 11 +++++++
 mm/memory-tiers.c            | 54 +++++++++++++++++++++++++++++++--
 mm/vmscan.c                  | 58 ++++++++++++++++++++++++++----------
 3 files changed, 106 insertions(+), 17 deletions(-)
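
To make the new lower_tier_mask construction easier to follow, here is a
worked example for a hypothetical three-tier layout (illustrative only; the
tier names and node numbers are not from the patch). It assumes the
memory_tiers list is walked from the fastest tier to the slowest, which is
what the loop in establish_migration_targets() relies on:

/*
 * Hypothetical topology (assumed for illustration):
 *   tier A (fastest): nodes 0,1   e.g. DRAM
 *   tier B          : node  2     e.g. CXL-attached memory
 *   tier C (slowest): node  3     e.g. PMEM
 *
 * Starting from lower_tier = N_MEMORY = {0,1,2,3} and removing each
 * tier's nodelist in turn:
 *
 *   tier A: lower_tier_mask = {2,3}   demotion may fall back to B or C
 *   tier B: lower_tier_mask = {3}     demotion may fall back to C
 *   tier C: lower_tier_mask = {}      lowest tier, no demotion targets
 *
 * node_get_allowed_targets() hands the mask of the node's own tier to
 * demote_page_list(), which uses it as the fallback nodemask in
 * alloc_demote_page() once the __GFP_THISNODE attempt on the preferred
 * demotion node fails.
 */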

Comments

Huang, Ying July 26, 2022, 8:24 a.m. UTC | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> From: Jagdish Gediya <jvgediya.oss@gmail.com>
>
> Currently, a higher tier node can only be demoted to selected
> nodes on the next lower tier as defined by the demotion path.
> This strict, hard-coded demotion order does not work in all
> use cases (e.g. some use cases may want to allow cross-socket
> demotion to another node in the same demotion tier as a fallback
> when the preferred demotion node is out of space). This demotion
> order is also inconsistent with the page allocation fallback order
> when all the nodes in a higher tier are out of space: The page
> allocation can fall back to any node from any lower tier, whereas
> the demotion order doesn't allow that currently.
>
> This patch adds support to get all the allowed demotion targets
> for a memory tier. demote_page_list() function is now modified
> to utilize this allowed node mask as the fallback allocation mask.
>
> Signed-off-by: Jagdish Gediya <jvgediya@linux.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  include/linux/memory-tiers.h | 11 +++++++
>  mm/memory-tiers.c            | 54 +++++++++++++++++++++++++++++++--
>  mm/vmscan.c                  | 58 ++++++++++++++++++++++++++----------
>  3 files changed, 106 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index 852e86bd0a23..0e58588fa066 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -19,11 +19,17 @@
>  extern bool numa_demotion_enabled;
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
> +void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>  #else
>  static inline int next_demotion_node(int node)
>  {
>  	return NUMA_NO_NODE;
>  }
> +
> +static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
> +{
> +	*targets = NODE_MASK_NONE;
> +}
>  #endif
>  
>  #else
> @@ -33,5 +39,10 @@ static inline int next_demotion_node(int node)
>  {
>  	return NUMA_NO_NODE;
>  }
> +
> +static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
> +{
> +	*targets = NODE_MASK_NONE;
> +}
>  #endif	/* CONFIG_NUMA */
>  #endif  /* _LINUX_MEMORY_TIERS_H */
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 4715f9b96a44..4a96e4213d66 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -15,6 +15,7 @@ struct memory_tier {
>  	struct list_head list;
>  	int perf_level;
>  	nodemask_t nodelist;
> +	nodemask_t lower_tier_mask;
>  };
>  
>  struct demotion_nodes {
> @@ -153,6 +154,24 @@ static struct memory_tier *__node_get_memory_tier(int node)
>  }
>  
>  #ifdef CONFIG_MIGRATION
> +void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
> +{
> +	struct memory_tier *memtier;
> +
> +	/*
> +	 * pg_data_t.memtier updates includes a synchronize_rcu()
> +	 * which ensures that we either find NULL or a valid memtier
> +	 * in NODE_DATA. protect the access via rcu_read_lock();
> +	 */
> +	rcu_read_lock();
> +	memtier = rcu_dereference(pgdat->memtier);
> +	if (memtier)
> +		*targets = memtier->lower_tier_mask;
> +	else
> +		*targets = NODE_MASK_NONE;
> +	rcu_read_unlock();
> +}
> +
>  /**
>   * next_demotion_node() - Get the next node in the demotion path
>   * @node: The starting node to lookup the next node
> @@ -201,10 +220,19 @@ int next_demotion_node(int node)
>  /* Disable reclaim-based migration. */
>  static void __disable_all_migrate_targets(void)
>  {
> +	struct memory_tier *memtier;
>  	int node;
>  
> -	for_each_node_state(node, N_MEMORY)
> +	for_each_node_state(node, N_MEMORY) {
>  		node_demotion[node].preferred = NODE_MASK_NONE;
> +		/*
> +		 * We are holding memory_tier_lock, it is safe
> +		 * to access pgda->memtier.
> +		 */
> +		memtier = rcu_dereference_check(NODE_DATA(node)->memtier,
> +						lockdep_is_held(&memory_tier_lock));
> +		memtier->lower_tier_mask = NODE_MASK_NONE;
> +	}
>  }
>  
>  static void disable_all_migrate_targets(void)
> @@ -230,7 +258,7 @@ static void establish_migration_targets(void)
>  	struct demotion_nodes *nd;
>  	int target = NUMA_NO_NODE, node;
>  	int distance, best_distance;
> -	nodemask_t used;
> +	nodemask_t used, lower_tier = NODE_MASK_NONE;
>  
>  	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
>  		return;
> @@ -276,6 +304,28 @@ static void establish_migration_targets(void)
>  			}
>  		} while (1);
>  	}
> +	/*
> +	 * Now build the lower_tier mask for each node collecting node mask from
> +	 * all memory tier below it. This allows us to fallback demotion page
> +	 * allocation to a set of nodes that is closer the above selected
> +	 * perferred node.
> +	 */
> +	list_for_each_entry(memtier, &memory_tiers, list)
> +		nodes_or(lower_tier, lower_tier, memtier->nodelist);
> +	/*
> +	 * Removes nodes not yet in N_MEMORY.
> +	 */
> +	nodes_and(lower_tier, node_states[N_MEMORY], lower_tier);

The above code is equivalent to

        lower_tier = node_states[N_MEMORY];

?

> +
> +	list_for_each_entry(memtier, &memory_tiers, list) {
> +		/*
> +		 * Keep removing current tier from lower_tier nodes,
> +		 * This will remove all nodes in current and above
> +		 * memory tier from the lower_tier mask.
> +		 */
> +		nodes_andnot(lower_tier, lower_tier, memtier->nodelist);
> +		memtier->lower_tier_mask = lower_tier;
> +	}

This is per-memtier instead of per-node.  So we need not run this code
for each node?  That is, move the above code out of the for_each_node()
loop?

Best Regards,
Huang, Ying

>  }
>  #else
>  static inline void disable_all_migrate_targets(void) {}
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 3a8f78277f99..60a5235dd639 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1460,21 +1460,34 @@ static void folio_check_dirty_writeback(struct folio *folio,
>  		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
>  }
>  
> -static struct page *alloc_demote_page(struct page *page, unsigned long node)
> +static struct page *alloc_demote_page(struct page *page, unsigned long private)
>  {
> -	struct migration_target_control mtc = {
> -		/*
> -		 * Allocate from 'node', or fail quickly and quietly.
> -		 * When this happens, 'page' will likely just be discarded
> -		 * instead of migrated.
> -		 */
> -		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
> -			    __GFP_THISNODE  | __GFP_NOWARN |
> -			    __GFP_NOMEMALLOC | GFP_NOWAIT,
> -		.nid = node
> -	};
> +	struct page *target_page;
> +	nodemask_t *allowed_mask;
> +	struct migration_target_control *mtc;
> +
> +	mtc = (struct migration_target_control *)private;
> +
> +	allowed_mask = mtc->nmask;
> +	/*
> +	 * make sure we allocate from the target node first also trying to
> +	 * reclaim pages from the target node via kswapd if we are low on
           ~~~~~~~

demote or reclaim

> +	 * free memory on target node. If we don't do this and if we have low
                                                           ~~~~~~~~~~~~~~~~~~
> +	 * free memory on the target memtier, we would start allocating pages
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

and if we have free memory on the slower(lower) memtier,

> +	 * from higher memory tiers without even forcing a demotion of cold
                ~~~~~~

slower(lower)

> +	 * pages from the target memtier. This can result in the kernel placing
                                 ~~~~~~~

node

> +	 * hotpages in higher memory tiers.
           ~~~~~~~~    ~~~~~~

hot pages

slower(lower)

Best Regards,
Huang, Ying

> +	 */
> +	mtc->nmask = NULL;
> +	mtc->gfp_mask |= __GFP_THISNODE;
> +	target_page = alloc_migration_target(page, (unsigned long)mtc);
> +	if (target_page)
> +		return target_page;
>  
> -	return alloc_migration_target(page, (unsigned long)&mtc);
> +	mtc->gfp_mask &= ~__GFP_THISNODE;
> +	mtc->nmask = allowed_mask;
> +
> +	return alloc_migration_target(page, (unsigned long)mtc);
>  }
>  
>  /*
> @@ -1487,6 +1500,19 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
>  {
>  	int target_nid = next_demotion_node(pgdat->node_id);
>  	unsigned int nr_succeeded;
> +	nodemask_t allowed_mask;
> +
> +	struct migration_target_control mtc = {
> +		/*
> +		 * Allocate from 'node', or fail quickly and quietly.
> +		 * When this happens, 'page' will likely just be discarded
> +		 * instead of migrated.
> +		 */
> +		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
> +			__GFP_NOMEMALLOC | GFP_NOWAIT,
> +		.nid = target_nid,
> +		.nmask = &allowed_mask
> +	};
>  
>  	if (list_empty(demote_pages))
>  		return 0;
> @@ -1494,10 +1520,12 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
>  	if (target_nid == NUMA_NO_NODE)
>  		return 0;
>  
> +	node_get_allowed_targets(pgdat, &allowed_mask);
> +
>  	/* Demotion ignores all cpuset and mempolicy settings */
>  	migrate_pages(demote_pages, alloc_demote_page, NULL,
> -			    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
> -			    &nr_succeeded);
> +		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
> +		      &nr_succeeded);
>  
>  	if (current_is_kswapd())
>  		__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);

Patch

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 852e86bd0a23..0e58588fa066 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -19,11 +19,17 @@ 
 extern bool numa_demotion_enabled;
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
 #else
 static inline int next_demotion_node(int node)
 {
 	return NUMA_NO_NODE;
 }
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+	*targets = NODE_MASK_NONE;
+}
 #endif
 
 #else
@@ -33,5 +39,10 @@  static inline int next_demotion_node(int node)
 {
 	return NUMA_NO_NODE;
 }
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+	*targets = NODE_MASK_NONE;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 4715f9b96a44..4a96e4213d66 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -15,6 +15,7 @@  struct memory_tier {
 	struct list_head list;
 	int perf_level;
 	nodemask_t nodelist;
+	nodemask_t lower_tier_mask;
 };
 
 struct demotion_nodes {
@@ -153,6 +154,24 @@  static struct memory_tier *__node_get_memory_tier(int node)
 }
 
 #ifdef CONFIG_MIGRATION
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+	struct memory_tier *memtier;
+
+	/*
+	 * pg_data_t.memtier updates include a synchronize_rcu()
+	 * which ensures that we either find NULL or a valid memtier
+	 * in NODE_DATA. Protect the access via rcu_read_lock().
+	 */
+	rcu_read_lock();
+	memtier = rcu_dereference(pgdat->memtier);
+	if (memtier)
+		*targets = memtier->lower_tier_mask;
+	else
+		*targets = NODE_MASK_NONE;
+	rcu_read_unlock();
+}
+
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node
@@ -201,10 +220,19 @@  int next_demotion_node(int node)
 /* Disable reclaim-based migration. */
 static void __disable_all_migrate_targets(void)
 {
+	struct memory_tier *memtier;
 	int node;
 
-	for_each_node_state(node, N_MEMORY)
+	for_each_node_state(node, N_MEMORY) {
 		node_demotion[node].preferred = NODE_MASK_NONE;
+		/*
+		 * We are holding memory_tier_lock, so it is safe
+		 * to access pgdat->memtier.
+		 */
+		memtier = rcu_dereference_check(NODE_DATA(node)->memtier,
+						lockdep_is_held(&memory_tier_lock));
+		memtier->lower_tier_mask = NODE_MASK_NONE;
+	}
 }
 
 static void disable_all_migrate_targets(void)
@@ -230,7 +258,7 @@  static void establish_migration_targets(void)
 	struct demotion_nodes *nd;
 	int target = NUMA_NO_NODE, node;
 	int distance, best_distance;
-	nodemask_t used;
+	nodemask_t used, lower_tier = NODE_MASK_NONE;
 
 	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
 		return;
@@ -276,6 +304,28 @@  static void establish_migration_targets(void)
 			}
 		} while (1);
 	}
+	/*
+	 * Now build the lower_tier mask for each node, collecting the node mask
+	 * from all memory tiers below it. This allows us to fall back demotion
+	 * page allocation to a set of nodes that is closer to the above selected
+	 * preferred node.
+	 */
+	list_for_each_entry(memtier, &memory_tiers, list)
+		nodes_or(lower_tier, lower_tier, memtier->nodelist);
+	/*
+	 * Remove nodes not yet in N_MEMORY.
+	 */
+	nodes_and(lower_tier, node_states[N_MEMORY], lower_tier);
+
+	list_for_each_entry(memtier, &memory_tiers, list) {
+		/*
+		 * Keep removing the current tier from the lower_tier nodes.
+		 * This will remove all nodes in the current and above
+		 * memory tiers from the lower_tier mask.
+		 */
+		nodes_andnot(lower_tier, lower_tier, memtier->nodelist);
+		memtier->lower_tier_mask = lower_tier;
+	}
 }
 #else
 static inline void disable_all_migrate_targets(void) {}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3a8f78277f99..60a5235dd639 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1460,21 +1460,34 @@  static void folio_check_dirty_writeback(struct folio *folio,
 		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
 }
 
-static struct page *alloc_demote_page(struct page *page, unsigned long node)
+static struct page *alloc_demote_page(struct page *page, unsigned long private)
 {
-	struct migration_target_control mtc = {
-		/*
-		 * Allocate from 'node', or fail quickly and quietly.
-		 * When this happens, 'page' will likely just be discarded
-		 * instead of migrated.
-		 */
-		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
-			    __GFP_THISNODE  | __GFP_NOWARN |
-			    __GFP_NOMEMALLOC | GFP_NOWAIT,
-		.nid = node
-	};
+	struct page *target_page;
+	nodemask_t *allowed_mask;
+	struct migration_target_control *mtc;
+
+	mtc = (struct migration_target_control *)private;
+
+	allowed_mask = mtc->nmask;
+	/*
+	 * make sure we allocate from the target node first also trying to
+	 * reclaim pages from the target node via kswapd if we are low on
+	 * free memory on target node. If we don't do this and if we have low
+	 * free memory on the target memtier, we would start allocating pages
+	 * from higher memory tiers without even forcing a demotion of cold
+	 * pages from the target memtier. This can result in the kernel placing
+	 * hotpages in higher memory tiers.
+	 */
+	mtc->nmask = NULL;
+	mtc->gfp_mask |= __GFP_THISNODE;
+	target_page = alloc_migration_target(page, (unsigned long)mtc);
+	if (target_page)
+		return target_page;
 
-	return alloc_migration_target(page, (unsigned long)&mtc);
+	mtc->gfp_mask &= ~__GFP_THISNODE;
+	mtc->nmask = allowed_mask;
+
+	return alloc_migration_target(page, (unsigned long)mtc);
 }
 
 /*
@@ -1487,6 +1500,19 @@  static unsigned int demote_page_list(struct list_head *demote_pages,
 {
 	int target_nid = next_demotion_node(pgdat->node_id);
 	unsigned int nr_succeeded;
+	nodemask_t allowed_mask;
+
+	struct migration_target_control mtc = {
+		/*
+		 * Allocate from 'node', or fail quickly and quietly.
+		 * When this happens, 'page' will likely just be discarded
+		 * instead of migrated.
+		 */
+		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+			__GFP_NOMEMALLOC | GFP_NOWAIT,
+		.nid = target_nid,
+		.nmask = &allowed_mask
+	};
 
 	if (list_empty(demote_pages))
 		return 0;
@@ -1494,10 +1520,12 @@  static unsigned int demote_page_list(struct list_head *demote_pages,
 	if (target_nid == NUMA_NO_NODE)
 		return 0;
 
+	node_get_allowed_targets(pgdat, &allowed_mask);
+
 	/* Demotion ignores all cpuset and mempolicy settings */
 	migrate_pages(demote_pages, alloc_demote_page, NULL,
-			    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
-			    &nr_succeeded);
+		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
+		      &nr_succeeded);
 
 	if (current_is_kswapd())
 		__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);