diff mbox series

[mm-unstable] mm/demotion: Assign correct memory type for multiple dax devices with the same node affinity

Message ID 20220826100224.542312-1-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series [mm-unstable] mm/demotion: Assign correct memory type for multiple dax devices with the same node affinity | expand

Commit Message

Aneesh Kumar K.V Aug. 26, 2022, 10:02 a.m. UTC
With multiple dax devices having the same node affinity, the kernel wrongly assigned
default_dram memory type to some devices after the memory hotplug operation. Fix this by
not clearing node_memory_types on the dax device remove.

The current kernel cleared node_memory_type on successful removal of a dax device.
But then we can have multiple dax devices with the same node affinity. Clearing the
node_memory_type results in assigning other dax devices to the default dram type when
we bring them online.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/memory-tiers.c | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

Comments

Andrew Morton Aug. 27, 2022, 3 a.m. UTC | #1
On Fri, 26 Aug 2022 15:32:24 +0530 "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> wrote:

> With multiple dax devices having the same node affinity, the kernel wrongly assigned
> default_dram memory type to some devices after the memory hotplug operation. Fix this by
> not clearing node_memory_types on the dax device remove.
> 
> The current kernel cleared node_memory_type on successful removal of a dax device.
> But then we can have multiple dax devices with the same node affinity. Clearing the
> node_memory_type results in assigning other dax devices to the default dram type when
> we bring them online.

Thanks, I added this as a fix against "mm/demotion/dax/kmem: set node's
abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE".

Reworked this patch to apply, reworked all the subsequent patches as
a result of applying that, checked that this patch as-sent cleanly reverts
when all are applied, checked that everything compiled at each step of
the resulting series.
Huang, Ying Sept. 1, 2022, 6:15 a.m. UTC | #2
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> With multiple dax devices having the same node affinity, the kernel wrongly assigned
> default_dram memory type to some devices after the memory hotplug operation. Fix this by
> not clearing node_memory_types on the dax device remove.

Sorry for late reply.

Just for confirmation.  There are multiple dax devices in one NUMA node?

If you can show the bug reproducing steps, that will make it even easier
to understand.

Best Regards,
Huang, Ying

> The current kernel cleared node_memory_type on successful removal of a dax device.
> But then we can have multiple dax devices with the same node affinity. Clearing the
> node_memory_type results in assigning other dax devices to the default dram type when
> we bring them online.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/memory-tiers.c | 37 +++++++++++++++++++++++++++++--------
>  1 file changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index ba844fe9cc8c..c4bd6d052a33 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -27,9 +27,14 @@ struct demotion_nodes {
>  	nodemask_t preferred;
>  };
>  
> +struct node_memory_type_map {
> +	struct memory_dev_type *memtype;
> +	int map_count;
> +};
> +
>  static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
> -static struct memory_dev_type *node_memory_types[MAX_NUMNODES];
> +static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>  static struct memory_dev_type *default_dram_type;
>  #ifdef CONFIG_MIGRATION
>  static int top_tier_adistance;
> @@ -386,9 +391,19 @@ static inline void establish_demotion_targets(void) {}
>  
>  static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
>  {
> -	if (!node_memory_types[node]) {
> -		node_memory_types[node] = memtype;
> -		kref_get(&memtype->kref);
> +	if (!node_memory_types[node].memtype)
> +		node_memory_types[node].memtype = memtype;
> +	/*
> +	 * for each device getting added in the same NUMA node
> +	 * with this specific memtype, bump the map count. We
> +	 * Only take memtype device reference once, so that
> +	 * changing a node memtype can be done by droping the
> +	 * only reference count taken here.
> +	 */
> +
> +	if (node_memory_types[node].memtype == memtype) {
> +		if (!node_memory_types[node].map_count++)
> +			kref_get(&memtype->kref);
>  	}
>  }
>  
> @@ -406,7 +421,7 @@ static struct memory_tier *set_node_memory_tier(int node)
>  
>  	__init_node_memory_type(node, default_dram_type);
>  
> -	memtype = node_memory_types[node];
> +	memtype = node_memory_types[node].memtype;
>  	node_set(node, memtype->nodes);
>  	memtier = find_create_memory_tier(memtype);
>  	if (!IS_ERR(memtier))
> @@ -448,7 +463,7 @@ static bool clear_node_memory_tier(int node)
>  
>  		rcu_assign_pointer(pgdat->memtier, NULL);
>  		synchronize_rcu();
> -		memtype = node_memory_types[node];
> +		memtype = node_memory_types[node].memtype;
>  		node_clear(node, memtype->nodes);
>  		if (nodes_empty(memtype->nodes)) {
>  			list_del_init(&memtype->tier_sibiling);
> @@ -502,8 +517,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype)
>  {
>  	mutex_lock(&memory_tier_lock);
> -	if (node_memory_types[node] == memtype) {
> -		node_memory_types[node] = NULL;
> +	if (node_memory_types[node].memtype == memtype)
> +		node_memory_types[node].map_count--;
> +	/*
> +	 * If we umapped all the attached devices to this node,
> +	 * clear the node memory type.
> +	 */
> +	if (!node_memory_types[node].map_count) {
> +		node_memory_types[node].memtype = NULL;
>  		kref_put(&memtype->kref, release_memtype);
>  	}
>  	mutex_unlock(&memory_tier_lock);
Aneesh Kumar K.V Sept. 1, 2022, 6:24 a.m. UTC | #3
On 9/1/22 11:45 AM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> 
>> With multiple dax devices having the same node affinity, the kernel wrongly assigned
>> default_dram memory type to some devices after the memory hotplug operation. Fix this by
>> not clearing node_memory_types on the dax device remove.
> 
> Sorry for late reply.
> 
> Just for confirmation.  There are multiple dax devices in one NUMA node?
> 
> If you can show the bug reproducing steps, that will make it even easier
> to understand.
> 

NUMA nodes are assigned per region and you can have multiple devdax namespace with same NUMA node affinity. 

dax0.1 and dax0.2 are examples. To recreate you can follow the below steps


root@ubuntu-guest:/sys/devices/system/node/node3# ls
compact  cpumap    meminfo   memory34  memory36  memory38  memory41  memory43  memory45  memory47  memory50  memory52  memory54  numastat   uevent
cpulist  distance  memory33  memory35  memory37  memory39  memory42  memory44  memory46  memory49  memory51  memory53  memory55  subsystem  vmstat
root@ubuntu-guest:/sys/devices/system/node/node3#
root@ubuntu-guest:/sys/devices/system/node/node3# for mem in memory*; do echo 0 > $mem/online; done
root@ubuntu-guest:/sys/devices/system/node/node3# cd /sys/bus/dax/drivers
root@ubuntu-guest:/sys/bus/dax/drivers# echo dax0.0  > kmem/unbind
root@ubuntu-guest:/sys/bus/dax/drivers# cd /sys/devices/system/node/node3/
root@ubuntu-guest:/sys/devices/system/node/node3# ls
compact  cpumap    meminfo   memory42  memory44  memory46  memory49  memory51  memory53  memory55  subsystem  vmstat
cpulist  distance  memory41  memory43  memory45  memory47  memory50  memory52  memory54  numastat  uevent
root@ubuntu-guest:/sys/devices/system/node/node3# for mem in memory*; do echo 1 > $mem/online; done


-aneesh
Huang, Ying Sept. 1, 2022, 6:45 a.m. UTC | #4
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 9/1/22 11:45 AM, Huang, Ying wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> 
>>> With multiple dax devices having the same node affinity, the kernel wrongly assigned
>>> default_dram memory type to some devices after the memory hotplug operation. Fix this by
>>> not clearing node_memory_types on the dax device remove.
>> 
>> Sorry for late reply.
>> 
>> Just for confirmation.  There are multiple dax devices in one NUMA node?
>> 
>> If you can show the bug reproducing steps, that will make it even easier
>> to understand.
>> 
>
> NUMA nodes are assigned per region and you can have multiple devdax namespace with same NUMA node affinity. 

I think that the patch description will be clearer if we added the above
line into it.

> dax0.1 and dax0.2 are examples. To recreate you can follow the below steps
>
>
> root@ubuntu-guest:/sys/devices/system/node/node3# ls
> compact  cpumap    meminfo   memory34  memory36  memory38  memory41  memory43  memory45  memory47  memory50  memory52  memory54  numastat   uevent
> cpulist  distance  memory33  memory35  memory37  memory39  memory42  memory44  memory46  memory49  memory51  memory53  memory55  subsystem  vmstat
> root@ubuntu-guest:/sys/devices/system/node/node3#
> root@ubuntu-guest:/sys/devices/system/node/node3# for mem in memory*; do echo 0 > $mem/online; done
> root@ubuntu-guest:/sys/devices/system/node/node3# cd /sys/bus/dax/drivers
> root@ubuntu-guest:/sys/bus/dax/drivers# echo dax0.0  > kmem/unbind
> root@ubuntu-guest:/sys/bus/dax/drivers# cd /sys/devices/system/node/node3/
> root@ubuntu-guest:/sys/devices/system/node/node3# ls
> compact  cpumap    meminfo   memory42  memory44  memory46  memory49  memory51  memory53  memory55  subsystem  vmstat
> cpulist  distance  memory41  memory43  memory45  memory47  memory50  memory52  memory54  numastat  uevent
> root@ubuntu-guest:/sys/devices/system/node/node3# for mem in memory*; do echo 1 > $mem/online; done

Thanks a lot for detailed information.

Best Regards,
Huang, Ying
diff mbox series

Patch

diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index ba844fe9cc8c..c4bd6d052a33 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -27,9 +27,14 @@  struct demotion_nodes {
 	nodemask_t preferred;
 };
 
+struct node_memory_type_map {
+	struct memory_dev_type *memtype;
+	int map_count;
+};
+
 static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
-static struct memory_dev_type *node_memory_types[MAX_NUMNODES];
+static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 static struct memory_dev_type *default_dram_type;
 #ifdef CONFIG_MIGRATION
 static int top_tier_adistance;
@@ -386,9 +391,19 @@  static inline void establish_demotion_targets(void) {}
 
 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
 {
-	if (!node_memory_types[node]) {
-		node_memory_types[node] = memtype;
-		kref_get(&memtype->kref);
+	if (!node_memory_types[node].memtype)
+		node_memory_types[node].memtype = memtype;
+	/*
+	 * for each device getting added in the same NUMA node
+	 * with this specific memtype, bump the map count. We
+	 * Only take memtype device reference once, so that
+	 * changing a node memtype can be done by droping the
+	 * only reference count taken here.
+	 */
+
+	if (node_memory_types[node].memtype == memtype) {
+		if (!node_memory_types[node].map_count++)
+			kref_get(&memtype->kref);
 	}
 }
 
@@ -406,7 +421,7 @@  static struct memory_tier *set_node_memory_tier(int node)
 
 	__init_node_memory_type(node, default_dram_type);
 
-	memtype = node_memory_types[node];
+	memtype = node_memory_types[node].memtype;
 	node_set(node, memtype->nodes);
 	memtier = find_create_memory_tier(memtype);
 	if (!IS_ERR(memtier))
@@ -448,7 +463,7 @@  static bool clear_node_memory_tier(int node)
 
 		rcu_assign_pointer(pgdat->memtier, NULL);
 		synchronize_rcu();
-		memtype = node_memory_types[node];
+		memtype = node_memory_types[node].memtype;
 		node_clear(node, memtype->nodes);
 		if (nodes_empty(memtype->nodes)) {
 			list_del_init(&memtype->tier_sibiling);
@@ -502,8 +517,14 @@  EXPORT_SYMBOL_GPL(init_node_memory_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 {
 	mutex_lock(&memory_tier_lock);
-	if (node_memory_types[node] == memtype) {
-		node_memory_types[node] = NULL;
+	if (node_memory_types[node].memtype == memtype)
+		node_memory_types[node].map_count--;
+	/*
+	 * If we umapped all the attached devices to this node,
+	 * clear the node memory type.
+	 */
+	if (!node_memory_types[node].map_count) {
+		node_memory_types[node].memtype = NULL;
 		kref_put(&memtype->kref, release_memtype);
 	}
 	mutex_unlock(&memory_tier_lock);