diff mbox series

[v10,3/8] mm/demotion: Add hotplug callbacks to handle new numa node onlined

Message ID 20220720025920.1373558-4-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Headers show
Series mm/demotion: Memory tiers and demotion | expand

Commit Message

Aneesh Kumar K.V July 20, 2022, 2:59 a.m. UTC
If a newly onlined NUMA node doesn't have a performance level assigned,
the kernel adds that node to the default memory tier.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/memory-tiers.h |  1 +
 mm/memory-tiers.c            | 75 ++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

Comments

Huang, Ying July 26, 2022, 4:03 a.m. UTC | #1
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> If the new NUMA node onlined doesn't have a performance level assigned,
> the kernel adds the NUMA node to default memory tier.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  include/linux/memory-tiers.h |  1 +
>  mm/memory-tiers.c            | 75 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 76 insertions(+)
>
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index ef380a39db3a..3d5f14d57ae6 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -14,6 +14,7 @@
>  #define MEMTIER_PERF_LEVEL_DRAM	(1 << (MEMTIER_CHUNK_BITS + 2))
>  /* leave one tier below this slow pmem */
>  #define MEMTIER_PERF_LEVEL_PMEM	(1 << MEMTIER_CHUNK_BITS)
> +#define MEMTIER_HOTPLUG_PRIO	100
>  
>  extern bool numa_demotion_enabled;
>  
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 41a21cc5ae55..cc3a47ec18e4 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -5,6 +5,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/moduleparam.h>
>  #include <linux/node.h>
> +#include <linux/memory.h>
>  #include <linux/memory-tiers.h>
>  
>  struct memory_tier {
> @@ -64,6 +65,78 @@ static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
>  	return new_memtier;
>  }
>  
> +static struct memory_tier *__node_get_memory_tier(int node)
> +{
> +	struct memory_tier *memtier;
> +
> +	list_for_each_entry(memtier, &memory_tiers, list) {
> +		if (node_isset(node, memtier->nodelist))
> +			return memtier;
> +	}
> +	return NULL;
> +}
> +
> +static void init_node_memory_tier(int node)

set_node_memory_tier()?

> +{
> +	int perf_level;
> +	struct memory_tier *memtier;
> +
> +	mutex_lock(&memory_tier_lock);
> +
> +	memtier = __node_get_memory_tier(node);
> +	if (!memtier) {
> +		perf_level = node_devices[node]->perf_level;
> +		memtier = find_create_memory_tier(perf_level);
> +		node_set(node, memtier->nodelist);
> +	}
> +	mutex_unlock(&memory_tier_lock);
> +}
> +
> +static void clear_node_memory_tier(int node)
> +{
> +	struct memory_tier *memtier;
> +
> +	mutex_lock(&memory_tier_lock);
> +	memtier = __node_get_memory_tier(node);
> +	if (memtier)
> +		node_clear(node, memtier->nodelist);

When memtier->nodelist becomes empty, do we need to free memtier?

> +	mutex_unlock(&memory_tier_lock);
> +}
> +
> +/*
> + * This runs whether reclaim-based migration is enabled or not,
> + * which ensures that the user can turn reclaim-based migration
> + * at any time without needing to recalculate migration targets.
> + */

This comment doesn't apply here.

> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
> +						 unsigned long action, void *_arg)

Now we are building memory tiers instead of working on demotion.  So I
think we should rename the function to memtier_hotplug_callback().

> +{
> +	struct memory_notify *arg = _arg;
> +
> +	/*
> +	 * Only update the node migration order when a node is
> +	 * changing status, like online->offline.
> +	 */
> +	if (arg->status_change_nid < 0)
> +		return notifier_from_errno(0);
> +
> +	switch (action) {
> +	case MEM_OFFLINE:
> +		clear_node_memory_tier(arg->status_change_nid);
> +		break;
> +	case MEM_ONLINE:
> +		init_node_memory_tier(arg->status_change_nid);
> +		break;
> +	}
> +
> +	return notifier_from_errno(0);
> +}
> +
> +static void __init migrate_on_reclaim_init(void)
> +{
> +	hotplug_memory_notifier(migrate_on_reclaim_callback, MEMTIER_HOTPLUG_PRIO);
> +}

I suggest calling hotplug_memory_notifier() in memory_tier_init()
directly.  We are not working on demotion here.

> +
>  static int __init memory_tier_init(void)
>  {
>  	int node;
> @@ -96,6 +169,8 @@ static int __init memory_tier_init(void)
>  			node_property->perf_level = default_memtier_perf_level;
>  	}
>  	mutex_unlock(&memory_tier_lock);
> +
> +	migrate_on_reclaim_init();
>  	return 0;
>  }
>  subsys_initcall(memory_tier_init);

Best Regards,
Huang, Ying
Aneesh Kumar K.V July 26, 2022, 12:03 p.m. UTC | #2
On 7/26/22 9:33 AM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> 
>> If the new NUMA node onlined doesn't have a performance level assigned,
>> the kernel adds the NUMA node to default memory tier.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  include/linux/memory-tiers.h |  1 +
>>  mm/memory-tiers.c            | 75 ++++++++++++++++++++++++++++++++++++
>>  2 files changed, 76 insertions(+)
>>
>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>> index ef380a39db3a..3d5f14d57ae6 100644
>> --- a/include/linux/memory-tiers.h
>> +++ b/include/linux/memory-tiers.h
>> @@ -14,6 +14,7 @@
>>  #define MEMTIER_PERF_LEVEL_DRAM	(1 << (MEMTIER_CHUNK_BITS + 2))
>>  /* leave one tier below this slow pmem */
>>  #define MEMTIER_PERF_LEVEL_PMEM	(1 << MEMTIER_CHUNK_BITS)
>> +#define MEMTIER_HOTPLUG_PRIO	100
>>  
>>  extern bool numa_demotion_enabled;
>>  
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index 41a21cc5ae55..cc3a47ec18e4 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -5,6 +5,7 @@
>>  #include <linux/lockdep.h>
>>  #include <linux/moduleparam.h>
>>  #include <linux/node.h>
>> +#include <linux/memory.h>
>>  #include <linux/memory-tiers.h>
>>  
>>  struct memory_tier {
>> @@ -64,6 +65,78 @@ static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
>>  	return new_memtier;
>>  }
>>  
>> +static struct memory_tier *__node_get_memory_tier(int node)
>> +{
>> +	struct memory_tier *memtier;
>> +
>> +	list_for_each_entry(memtier, &memory_tiers, list) {
>> +		if (node_isset(node, memtier->nodelist))
>> +			return memtier;
>> +	}
>> +	return NULL;
>> +}
>> +
>> +static void init_node_memory_tier(int node)
> 
> set_node_memory_tier()?

That was done based on feedback from Alistair 

https://lore.kernel.org/linux-mm/87h73iapg1.fsf@nvdebian.thelocal


> 
>> +{
>> +	int perf_level;
>> +	struct memory_tier *memtier;
>> +
>> +	mutex_lock(&memory_tier_lock);
>> +
>> +	memtier = __node_get_memory_tier(node);
>> +	if (!memtier) {
>> +		perf_level = node_devices[node]->perf_level;
>> +		memtier = find_create_memory_tier(perf_level);
>> +		node_set(node, memtier->nodelist);
>> +	}
>> +	mutex_unlock(&memory_tier_lock);
>> +}
>> +
>> +static void clear_node_memory_tier(int node)
>> +{
>> +	struct memory_tier *memtier;
>> +
>> +	mutex_lock(&memory_tier_lock);
>> +	memtier = __node_get_memory_tier(node);
>> +	if (memtier)
>> +		node_clear(node, memtier->nodelist);
> 
> When memtier->nodelist becomes empty, we need to free memtier?
> 
>> +	mutex_unlock(&memory_tier_lock);
>> +}
>> +
>> +/*
>> + * This runs whether reclaim-based migration is enabled or not,
>> + * which ensures that the user can turn reclaim-based migration
>> + * at any time without needing to recalculate migration targets.
>> + */
> 
> The comments doesn't apply here.
> 
>> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>> +						 unsigned long action, void *_arg)
> 
> Now we are building memory tiers instead of working on demotion.  So I
> think we should rename the function to memtier_hotplug_callback().
> 
>> +{
>> +	struct memory_notify *arg = _arg;
>> +
>> +	/*
>> +	 * Only update the node migration order when a node is
>> +	 * changing status, like online->offline.
>> +	 */
>> +	if (arg->status_change_nid < 0)
>> +		return notifier_from_errno(0);
>> +
>> +	switch (action) {
>> +	case MEM_OFFLINE:
>> +		clear_node_memory_tier(arg->status_change_nid);
>> +		break;
>> +	case MEM_ONLINE:
>> +		init_node_memory_tier(arg->status_change_nid);
>> +		break;
>> +	}
>> +
>> +	return notifier_from_errno(0);
>> +}
>> +
>> +static void __init migrate_on_reclaim_init(void)
>> +{
>> +	hotplug_memory_notifier(migrate_on_reclaim_callback, MEMTIER_HOTPLUG_PRIO);
>> +}
> 
> I suggest to call hotplug_memory_notifier() in memory_tier_init()
> directly.  We are not working on demotion here.
> 
>> +
>>  static int __init memory_tier_init(void)
>>  {
>>  	int node;
>> @@ -96,6 +169,8 @@ static int __init memory_tier_init(void)
>>  			node_property->perf_level = default_memtier_perf_level;
>>  	}
>>  	mutex_unlock(&memory_tier_lock);
>> +
>> +	migrate_on_reclaim_init();
>>  	return 0;
>>  }
>>  subsys_initcall(memory_tier_init);
> 
> Best Regards,
> Huang, Ying


I will update the patch in the next iteration to take care of the other feedback.

Thanks
-aneesh
Huang, Ying July 27, 2022, 1:53 a.m. UTC | #3
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:

> On 7/26/22 9:33 AM, Huang, Ying wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> 
>>> If the new NUMA node onlined doesn't have a performance level assigned,
>>> the kernel adds the NUMA node to default memory tier.
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>>  include/linux/memory-tiers.h |  1 +
>>>  mm/memory-tiers.c            | 75 ++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 76 insertions(+)
>>>
>>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>>> index ef380a39db3a..3d5f14d57ae6 100644
>>> --- a/include/linux/memory-tiers.h
>>> +++ b/include/linux/memory-tiers.h
>>> @@ -14,6 +14,7 @@
>>>  #define MEMTIER_PERF_LEVEL_DRAM	(1 << (MEMTIER_CHUNK_BITS + 2))
>>>  /* leave one tier below this slow pmem */
>>>  #define MEMTIER_PERF_LEVEL_PMEM	(1 << MEMTIER_CHUNK_BITS)
>>> +#define MEMTIER_HOTPLUG_PRIO	100
>>>  
>>>  extern bool numa_demotion_enabled;
>>>  
>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>> index 41a21cc5ae55..cc3a47ec18e4 100644
>>> --- a/mm/memory-tiers.c
>>> +++ b/mm/memory-tiers.c
>>> @@ -5,6 +5,7 @@
>>>  #include <linux/lockdep.h>
>>>  #include <linux/moduleparam.h>
>>>  #include <linux/node.h>
>>> +#include <linux/memory.h>
>>>  #include <linux/memory-tiers.h>
>>>  
>>>  struct memory_tier {
>>> @@ -64,6 +65,78 @@ static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
>>>  	return new_memtier;
>>>  }
>>>  
>>> +static struct memory_tier *__node_get_memory_tier(int node)
>>> +{
>>> +	struct memory_tier *memtier;
>>> +
>>> +	list_for_each_entry(memtier, &memory_tiers, list) {
>>> +		if (node_isset(node, memtier->nodelist))
>>> +			return memtier;
>>> +	}
>>> +	return NULL;
>>> +}
>>> +
>>> +static void init_node_memory_tier(int node)
>> 
>> set_node_memory_tier()?
>
> That was done based on feedback from Alistair 
>
> https://lore.kernel.org/linux-mm/87h73iapg1.fsf@nvdebian.thelocal
>
>> 
>>> +{
>>> +	int perf_level;
>>> +	struct memory_tier *memtier;
>>> +
>>> +	mutex_lock(&memory_tier_lock);
>>> +
>>> +	memtier = __node_get_memory_tier(node);
>>> +	if (!memtier) {
>>> +		perf_level = node_devices[node]->perf_level;
>>> +		memtier = find_create_memory_tier(perf_level);
>>> +		node_set(node, memtier->nodelist);
>>> +	}

This is related to Alistair's comments too.  When will memtier != NULL
here?  Maybe we just need a VM_WARN_ON() here?

>>> +	mutex_unlock(&memory_tier_lock);
>>> +}
>>> +
>>> +static void clear_node_memory_tier(int node)
>>> +{
>>> +	struct memory_tier *memtier;
>>> +
>>> +	mutex_lock(&memory_tier_lock);
>>> +	memtier = __node_get_memory_tier(node);
>>> +	if (memtier)
>>> +		node_clear(node, memtier->nodelist);
>> 
>> When memtier->nodelist becomes empty, we need to free memtier?
>> 
>>> +	mutex_unlock(&memory_tier_lock);
>>> +}
>>> +
>>> +/*
>>> + * This runs whether reclaim-based migration is enabled or not,
>>> + * which ensures that the user can turn reclaim-based migration
>>> + * at any time without needing to recalculate migration targets.
>>> + */
>> 
>> The comments doesn't apply here.
>> 
>>> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>>> +						 unsigned long action, void *_arg)
>> 
>> Now we are building memory tiers instead of working on demotion.  So I
>> think we should rename the function to memtier_hotplug_callback().
>> 
>>> +{
>>> +	struct memory_notify *arg = _arg;
>>> +
>>> +	/*
>>> +	 * Only update the node migration order when a node is
>>> +	 * changing status, like online->offline.
>>> +	 */
>>> +	if (arg->status_change_nid < 0)
>>> +		return notifier_from_errno(0);
>>> +
>>> +	switch (action) {
>>> +	case MEM_OFFLINE:
>>> +		clear_node_memory_tier(arg->status_change_nid);
>>> +		break;
>>> +	case MEM_ONLINE:
>>> +		init_node_memory_tier(arg->status_change_nid);
>>> +		break;
>>> +	}
>>> +
>>> +	return notifier_from_errno(0);
>>> +}
>>> +
>>> +static void __init migrate_on_reclaim_init(void)
>>> +{
>>> +	hotplug_memory_notifier(migrate_on_reclaim_callback, MEMTIER_HOTPLUG_PRIO);
>>> +}
>> 
>> I suggest to call hotplug_memory_notifier() in memory_tier_init()
>> directly.  We are not working on demotion here.
>> 
>>> +
>>>  static int __init memory_tier_init(void)
>>>  {
>>>  	int node;
>>> @@ -96,6 +169,8 @@ static int __init memory_tier_init(void)
>>>  			node_property->perf_level = default_memtier_perf_level;
>>>  	}
>>>  	mutex_unlock(&memory_tier_lock);
>>> +
>>> +	migrate_on_reclaim_init();
>>>  	return 0;
>>>  }
>>>  subsys_initcall(memory_tier_init);
>> 
>> Best Regards,
>> Huang, Ying
>
>
> Will update the patch in next iteration to take care of other feedback.

Thanks!

Best Regards,
Huang, Ying
Aneesh Kumar K.V July 27, 2022, 4:38 a.m. UTC | #4
"Huang, Ying" <ying.huang@intel.com> writes:

> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>
>> On 7/26/22 9:33 AM, Huang, Ying wrote:
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>> 
>>>> If the new NUMA node onlined doesn't have a performance level assigned,
>>>> the kernel adds the NUMA node to default memory tier.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>>  include/linux/memory-tiers.h |  1 +
>>>>  mm/memory-tiers.c            | 75 ++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 76 insertions(+)
>>>>
>>>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>>>> index ef380a39db3a..3d5f14d57ae6 100644
>>>> --- a/include/linux/memory-tiers.h
>>>> +++ b/include/linux/memory-tiers.h
>>>> @@ -14,6 +14,7 @@
>>>>  #define MEMTIER_PERF_LEVEL_DRAM	(1 << (MEMTIER_CHUNK_BITS + 2))
>>>>  /* leave one tier below this slow pmem */
>>>>  #define MEMTIER_PERF_LEVEL_PMEM	(1 << MEMTIER_CHUNK_BITS)
>>>> +#define MEMTIER_HOTPLUG_PRIO	100
>>>>  
>>>>  extern bool numa_demotion_enabled;
>>>>  
>>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>>> index 41a21cc5ae55..cc3a47ec18e4 100644
>>>> --- a/mm/memory-tiers.c
>>>> +++ b/mm/memory-tiers.c
>>>> @@ -5,6 +5,7 @@
>>>>  #include <linux/lockdep.h>
>>>>  #include <linux/moduleparam.h>
>>>>  #include <linux/node.h>
>>>> +#include <linux/memory.h>
>>>>  #include <linux/memory-tiers.h>
>>>>  
>>>>  struct memory_tier {
>>>> @@ -64,6 +65,78 @@ static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
>>>>  	return new_memtier;
>>>>  }
>>>>  
>>>> +static struct memory_tier *__node_get_memory_tier(int node)
>>>> +{
>>>> +	struct memory_tier *memtier;
>>>> +
>>>> +	list_for_each_entry(memtier, &memory_tiers, list) {
>>>> +		if (node_isset(node, memtier->nodelist))
>>>> +			return memtier;
>>>> +	}
>>>> +	return NULL;
>>>> +}
>>>> +
>>>> +static void init_node_memory_tier(int node)
>>> 
>>> set_node_memory_tier()?
>>
>> That was done based on feedback from Alistair 
>>
>> https://lore.kernel.org/linux-mm/87h73iapg1.fsf@nvdebian.thelocal
>>
>>> 
>>>> +{
>>>> +	int perf_level;
>>>> +	struct memory_tier *memtier;
>>>> +
>>>> +	mutex_lock(&memory_tier_lock);
>>>> +
>>>> +	memtier = __node_get_memory_tier(node);
>>>> +	if (!memtier) {
>>>> +		perf_level = node_devices[node]->perf_level;
>>>> +		memtier = find_create_memory_tier(perf_level);
>>>> +		node_set(node, memtier->nodelist);
>>>> +	}
>
> It's related to Alistair's comments too.  When will memtier != NULL
> here?  We may need just VM_WARN_ON() here?

This happens when the platform driver sets the memory tier directly. With the
old code, it can happen when dax/kmem registers a node to a memory tier. With
the memory_type proposal, it can happen if the node is part of a memory type
that has already been added to a memory tier.

>
>>>> +	mutex_unlock(&memory_tier_lock);
>>>> +}
>>>> +
>>>> +static void clear_node_memory_tier(int node)
>>>> +{
>>>> +	struct memory_tier *memtier;
>>>> +
>>>> +	mutex_lock(&memory_tier_lock);
>>>> +	memtier = __node_get_memory_tier(node);
>>>> +	if (memtier)
>>>> +		node_clear(node, memtier->nodelist);
>>> 
>>> When memtier->nodelist becomes empty, we need to free memtier?
>>> 
>>>> +	mutex_unlock(&memory_tier_lock);
>>>> +}
>>>> +
>>>> +/*
>>>> + * This runs whether reclaim-based migration is enabled or not,
>>>> + * which ensures that the user can turn reclaim-based migration
>>>> + * at any time without needing to recalculate migration targets.
>>>> + */
>>> 
>>> The comments doesn't apply here.
>>> 
>>>> +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
>>>> +						 unsigned long action, void *_arg)
>>> 
>>> Now we are building memory tiers instead of working on demotion.  So I
>>> think we should rename the function to memtier_hotplug_callback().
>>> 
>>>> +{
>>>> +	struct memory_notify *arg = _arg;
>>>> +
>>>> +	/*
>>>> +	 * Only update the node migration order when a node is
>>>> +	 * changing status, like online->offline.
>>>> +	 */
>>>> +	if (arg->status_change_nid < 0)
>>>> +		return notifier_from_errno(0);
>>>> +
>>>> +	switch (action) {
>>>> +	case MEM_OFFLINE:
>>>> +		clear_node_memory_tier(arg->status_change_nid);
>>>> +		break;
>>>> +	case MEM_ONLINE:
>>>> +		init_node_memory_tier(arg->status_change_nid);
>>>> +		break;
>>>> +	}
>>>> +
>>>> +	return notifier_from_errno(0);
>>>> +}
>>>> +
>>>> +static void __init migrate_on_reclaim_init(void)
>>>> +{
>>>> +	hotplug_memory_notifier(migrate_on_reclaim_callback, MEMTIER_HOTPLUG_PRIO);
>>>> +}
>>> 
>>> I suggest to call hotplug_memory_notifier() in memory_tier_init()
>>> directly.  We are not working on demotion here.
>>> 
>>>> +
>>>>  static int __init memory_tier_init(void)
>>>>  {
>>>>  	int node;
>>>> @@ -96,6 +169,8 @@ static int __init memory_tier_init(void)
>>>>  			node_property->perf_level = default_memtier_perf_level;
>>>>  	}
>>>>  	mutex_unlock(&memory_tier_lock);
>>>> +
>>>> +	migrate_on_reclaim_init();
>>>>  	return 0;
>>>>  }
>>>>  subsys_initcall(memory_tier_init);
>>> 
>>> Best Regards,
>>> Huang, Ying
>>
>>
>> Will update the patch in next iteration to take care of other feedback.
>
> Thanks!
>
> Best Regards,
> Huang, Ying
Huang, Ying July 28, 2022, 6:42 a.m. UTC | #5
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> "Huang, Ying" <ying.huang@intel.com> writes:
>
>> Aneesh Kumar K V <aneesh.kumar@linux.ibm.com> writes:
>>
>>> On 7/26/22 9:33 AM, Huang, Ying wrote:
>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

[snip]

>>>>>  
>>>>> +static struct memory_tier *__node_get_memory_tier(int node)
>>>>> +{
>>>>> +	struct memory_tier *memtier;
>>>>> +
>>>>> +	list_for_each_entry(memtier, &memory_tiers, list) {
>>>>> +		if (node_isset(node, memtier->nodelist))
>>>>> +			return memtier;
>>>>> +	}
>>>>> +	return NULL;
>>>>> +}
>>>>> +
>>>>> +static void init_node_memory_tier(int node)
>>>> 
>>>> set_node_memory_tier()?
>>>
>>> That was done based on feedback from Alistair 
>>>
>>> https://lore.kernel.org/linux-mm/87h73iapg1.fsf@nvdebian.thelocal
>>>
>>>> 
>>>>> +{
>>>>> +	int perf_level;
>>>>> +	struct memory_tier *memtier;
>>>>> +
>>>>> +	mutex_lock(&memory_tier_lock);
>>>>> +
>>>>> +	memtier = __node_get_memory_tier(node);
>>>>> +	if (!memtier) {
>>>>> +		perf_level = node_devices[node]->perf_level;
>>>>> +		memtier = find_create_memory_tier(perf_level);
>>>>> +		node_set(node, memtier->nodelist);
>>>>> +	}
>>
>> It's related to Alistair's comments too.  When will memtier != NULL
>> here?  We may need just VM_WARN_ON() here?
>
> When the platform driver sets memory tier directly. With the old code
> it can happen when dax/kmem register a node to a memory tier. With
> memory_type proposal this can happen if the node is part of memory
> type that is already added to a memory tier. 

Let's look at what it looks like with memory_type in place.

Best Regards,
Huang, Ying

>>
>>>>> +	mutex_unlock(&memory_tier_lock);
>>>>> +}
>>>>> +

[snip]
diff mbox series

Patch

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index ef380a39db3a..3d5f14d57ae6 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -14,6 +14,7 @@ 
 #define MEMTIER_PERF_LEVEL_DRAM	(1 << (MEMTIER_CHUNK_BITS + 2))
 /* leave one tier below this slow pmem */
 #define MEMTIER_PERF_LEVEL_PMEM	(1 << MEMTIER_CHUNK_BITS)
+#define MEMTIER_HOTPLUG_PRIO	100
 
 extern bool numa_demotion_enabled;
 
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 41a21cc5ae55..cc3a47ec18e4 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -5,6 +5,7 @@ 
 #include <linux/lockdep.h>
 #include <linux/moduleparam.h>
 #include <linux/node.h>
+#include <linux/memory.h>
 #include <linux/memory-tiers.h>
 
 struct memory_tier {
@@ -64,6 +65,78 @@  static struct memory_tier *find_create_memory_tier(unsigned int perf_level)
 	return new_memtier;
 }
 
+static struct memory_tier *__node_get_memory_tier(int node)
+{
+	struct memory_tier *memtier;
+
+	list_for_each_entry(memtier, &memory_tiers, list) {
+		if (node_isset(node, memtier->nodelist))
+			return memtier;
+	}
+	return NULL;
+}
+
+static void init_node_memory_tier(int node)
+{
+	int perf_level;
+	struct memory_tier *memtier;
+
+	mutex_lock(&memory_tier_lock);
+
+	memtier = __node_get_memory_tier(node);
+	if (!memtier) {
+		perf_level = node_devices[node]->perf_level;
+		memtier = find_create_memory_tier(perf_level);
+		node_set(node, memtier->nodelist);
+	}
+	mutex_unlock(&memory_tier_lock);
+}
+
+static void clear_node_memory_tier(int node)
+{
+	struct memory_tier *memtier;
+
+	mutex_lock(&memory_tier_lock);
+	memtier = __node_get_memory_tier(node);
+	if (memtier)
+		node_clear(node, memtier->nodelist);
+	mutex_unlock(&memory_tier_lock);
+}
+
+/*
+ * This runs whether reclaim-based migration is enabled or not,
+ * which ensures that the user can turn reclaim-based migration
+ * at any time without needing to recalculate migration targets.
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+						 unsigned long action, void *_arg)
+{
+	struct memory_notify *arg = _arg;
+
+	/*
+	 * Only update the node migration order when a node is
+	 * changing status, like online->offline.
+	 */
+	if (arg->status_change_nid < 0)
+		return notifier_from_errno(0);
+
+	switch (action) {
+	case MEM_OFFLINE:
+		clear_node_memory_tier(arg->status_change_nid);
+		break;
+	case MEM_ONLINE:
+		init_node_memory_tier(arg->status_change_nid);
+		break;
+	}
+
+	return notifier_from_errno(0);
+}
+
+static void __init migrate_on_reclaim_init(void)
+{
+	hotplug_memory_notifier(migrate_on_reclaim_callback, MEMTIER_HOTPLUG_PRIO);
+}
+
 static int __init memory_tier_init(void)
 {
 	int node;
@@ -96,6 +169,8 @@  static int __init memory_tier_init(void)
 			node_property->perf_level = default_memtier_perf_level;
 	}
 	mutex_unlock(&memory_tier_lock);
+
+	migrate_on_reclaim_init();
 	return 0;
 }
 subsys_initcall(memory_tier_init);