diff mbox series

[v10,6/8] mm/demotion: Add pg_data_t member to track node memory tier details

Message ID 20220720025920.1373558-7-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series mm/demotion: Memory tiers and demotion | expand

Commit Message

Aneesh Kumar K.V July 20, 2022, 2:59 a.m. UTC
Also update different helpers to use NODE_DATA()->memtier. Since
node specific memtier can change based on the reassignment of
NUMA node to a different memory tier, accessing NODE_DATA()->memtier
needs to happen under an rcu read lock or memory_tier_lock.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/mmzone.h |  3 ++
 mm/memory-tiers.c      | 65 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 11 deletions(-)

Comments

Huang, Ying July 26, 2022, 8:02 a.m. UTC | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> Also update different helpes to use NODE_DATA()->memtier. Since
> node specific memtier can change based on the reassignment of
> NUMA node to a different memory tiers, accessing NODE_DATA()->memtier
> needs to happen under an rcu read lock or memory_tier_lock.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  include/linux/mmzone.h |  3 ++
>  mm/memory-tiers.c      | 65 +++++++++++++++++++++++++++++++++++-------
>  2 files changed, 57 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index aab70355d64f..353812495a70 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -928,6 +928,9 @@ typedef struct pglist_data {
>  	/* Per-node vmstats */
>  	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
>  	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
> +#ifdef CONFIG_NUMA
> +	struct memory_tier __rcu *memtier;
> +#endif
>  } pg_data_t;
>  
>  #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index a8cfe2ca3903..4715f9b96a44 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -138,13 +138,18 @@ static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
>  
>  static struct memory_tier *__node_get_memory_tier(int node)
>  {
> -	struct memory_tier *memtier;
> +	pg_data_t *pgdat;
>  
> -	list_for_each_entry(memtier, &memory_tiers, list) {
> -		if (node_isset(node, memtier->nodelist))
> -			return memtier;
> -	}
> -	return NULL;
> +	pgdat = NODE_DATA(node);
> +	if (!pgdat)
> +		return NULL;
> +	/*
> +	 * Since we hold memory_tier_lock, we can avoid
> +	 * RCU read locks when accessing the details. No
> +	 * parallel updates are possible here.
> +	 */
> +	return rcu_dereference_check(pgdat->memtier,
> +				     lockdep_is_held(&memory_tier_lock));
>  }
>  
>  #ifdef CONFIG_MIGRATION
> @@ -277,6 +282,29 @@ static inline void disable_all_migrate_targets(void) {}
>  static inline void establish_migration_targets(void) {}
>  #endif /* CONFIG_MIGRATION */
>  
> +static void memtier_node_set(int node, struct memory_tier *memtier)
> +{
> +	pg_data_t *pgdat;
> +	struct memory_tier *current_memtier;
> +
> +	pgdat = NODE_DATA(node);
> +	if (!pgdat)
> +		return;
> +	/*
> +	 * Make sure we mark the memtier NULL before we assign the new memory tier
> +	 * to the NUMA node. This make sure that anybody looking at NODE_DATA
> +	 * finds a NULL memtier or the one which is still valid.
> +	 */
> +	current_memtier = rcu_dereference_check(pgdat->memtier,
> +						lockdep_is_held(&memory_tier_lock));
> +	rcu_assign_pointer(pgdat->memtier, NULL);
> +	synchronize_rcu();
> +	if (current_memtier)
> +		node_clear(node, current_memtier->nodelist);

If pgdat->memtier == NULL, we don't need to set it to NULL and call
synchronize_rcu().  That is,

+	current_memtier = rcu_dereference_check(pgdat->memtier,
+						lockdep_is_held(&memory_tier_lock));
+	if (current_memtier) {
+               rcu_assign_pointer(pgdat->memtier, NULL);
+               synchronize_rcu();
+		node_clear(node, current_memtier->nodelist);
+       }

Same for clear_node_memory_tier().

Best Regards,
Huang, Ying

> +	node_set(node, memtier->nodelist);
> +	rcu_assign_pointer(pgdat->memtier, memtier);
> +}
> +
>  static void init_node_memory_tier(int node)
>  {
>  	int perf_level;
> @@ -295,7 +323,7 @@ static void init_node_memory_tier(int node)
>  	if (!memtier) {
>  		perf_level = node_devices[node]->perf_level;
>  		memtier = find_create_memory_tier(perf_level);
> -		node_set(node, memtier->nodelist);
> +		memtier_node_set(node, memtier);
>  	}
>  	establish_migration_targets();
>  	mutex_unlock(&memory_tier_lock);
> @@ -303,12 +331,25 @@ static void init_node_memory_tier(int node)
>  
>  static void clear_node_memory_tier(int node)
>  {
> -	struct memory_tier *memtier;
> +	pg_data_t *pgdat;
> +	struct memory_tier *current_memtier;
> +
> +	pgdat = NODE_DATA(node);
> +	if (!pgdat)
> +		return;
>  
>  	mutex_lock(&memory_tier_lock);
> -	memtier = __node_get_memory_tier(node);
> -	if (memtier) {
> -		node_clear(node, memtier->nodelist);
> +	/*
> +	 * Make sure we mark the memtier NULL before we assign the new memory tier
> +	 * to the NUMA node. This make sure that anybody looking at NODE_DATA
> +	 * finds a NULL memtier or the one which is still valid.
> +	 */
> +	current_memtier = rcu_dereference_check(pgdat->memtier,
> +						lockdep_is_held(&memory_tier_lock));
> +	rcu_assign_pointer(pgdat->memtier, NULL);
> +	synchronize_rcu();
> +	if (current_memtier) {
> +		node_clear(node, current_memtier->nodelist);
>  		establish_migration_targets();
>  	}
>  	mutex_unlock(&memory_tier_lock);
> @@ -383,6 +424,8 @@ static int __init memory_tier_init(void)
>  
>  		if (!node_property->perf_level)
>  			node_property->perf_level = default_memtier_perf_level;
> +
> +		rcu_assign_pointer(NODE_DATA(node)->memtier, memtier);
>  	}
>  	mutex_unlock(&memory_tier_lock);
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..353812495a70 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -928,6 +928,9 @@  typedef struct pglist_data {
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
+#ifdef CONFIG_NUMA
+	struct memory_tier __rcu *memtier;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index a8cfe2ca3903..4715f9b96a44 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -138,13 +138,18 @@  static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
 
 static struct memory_tier *__node_get_memory_tier(int node)
 {
-	struct memory_tier *memtier;
+	pg_data_t *pgdat;
 
-	list_for_each_entry(memtier, &memory_tiers, list) {
-		if (node_isset(node, memtier->nodelist))
-			return memtier;
-	}
-	return NULL;
+	pgdat = NODE_DATA(node);
+	if (!pgdat)
+		return NULL;
+	/*
+	 * Since we hold memory_tier_lock, we can avoid
+	 * RCU read locks when accessing the details. No
+	 * parallel updates are possible here.
+	 */
+	return rcu_dereference_check(pgdat->memtier,
+				     lockdep_is_held(&memory_tier_lock));
 }
 
 #ifdef CONFIG_MIGRATION
@@ -277,6 +282,29 @@  static inline void disable_all_migrate_targets(void) {}
 static inline void establish_migration_targets(void) {}
 #endif /* CONFIG_MIGRATION */
 
+static void memtier_node_set(int node, struct memory_tier *memtier)
+{
+	pg_data_t *pgdat;
+	struct memory_tier *current_memtier;
+
+	pgdat = NODE_DATA(node);
+	if (!pgdat)
+		return;
+	/*
+	 * Make sure we mark the memtier NULL before we assign the new memory tier
+	 * to the NUMA node. This make sure that anybody looking at NODE_DATA
+	 * finds a NULL memtier or the one which is still valid.
+	 */
+	current_memtier = rcu_dereference_check(pgdat->memtier,
+						lockdep_is_held(&memory_tier_lock));
+	rcu_assign_pointer(pgdat->memtier, NULL);
+	synchronize_rcu();
+	if (current_memtier)
+		node_clear(node, current_memtier->nodelist);
+	node_set(node, memtier->nodelist);
+	rcu_assign_pointer(pgdat->memtier, memtier);
+}
+
 static void init_node_memory_tier(int node)
 {
 	int perf_level;
@@ -295,7 +323,7 @@  static void init_node_memory_tier(int node)
 	if (!memtier) {
 		perf_level = node_devices[node]->perf_level;
 		memtier = find_create_memory_tier(perf_level);
-		node_set(node, memtier->nodelist);
+		memtier_node_set(node, memtier);
 	}
 	establish_migration_targets();
 	mutex_unlock(&memory_tier_lock);
@@ -303,12 +331,25 @@  static void init_node_memory_tier(int node)
 
 static void clear_node_memory_tier(int node)
 {
-	struct memory_tier *memtier;
+	pg_data_t *pgdat;
+	struct memory_tier *current_memtier;
+
+	pgdat = NODE_DATA(node);
+	if (!pgdat)
+		return;
 
 	mutex_lock(&memory_tier_lock);
-	memtier = __node_get_memory_tier(node);
-	if (memtier) {
-		node_clear(node, memtier->nodelist);
+	/*
+	 * Make sure we mark the memtier NULL before we assign the new memory tier
+	 * to the NUMA node. This make sure that anybody looking at NODE_DATA
+	 * finds a NULL memtier or the one which is still valid.
+	 */
+	current_memtier = rcu_dereference_check(pgdat->memtier,
+						lockdep_is_held(&memory_tier_lock));
+	rcu_assign_pointer(pgdat->memtier, NULL);
+	synchronize_rcu();
+	if (current_memtier) {
+		node_clear(node, current_memtier->nodelist);
 		establish_migration_targets();
 	}
 	mutex_unlock(&memory_tier_lock);
@@ -383,6 +424,8 @@  static int __init memory_tier_init(void)
 
 		if (!node_property->perf_level)
 			node_property->perf_level = default_memtier_perf_level;
+
+		rcu_assign_pointer(NODE_DATA(node)->memtier, memtier);
 	}
 	mutex_unlock(&memory_tier_lock);