diff mbox series

[RFC,1/4] drivers/base/node: Add demotion_nodes sys interface

Message ID 20231102025648.1285477-2-lizhijian@fujitsu.com (mailing list archive)
State New
Headers show
Series [RFC,1/4] drivers/base/node: Add demotion_nodes sys interface | expand

Commit Message

Li Zhijian Nov. 2, 2023, 2:56 a.m. UTC
Add a demotion_nodes attribute that shows the demotion target nodes of a
node, exporting this information directly to userspace.

Below is an example where node0 and node1 are DRAM nodes and node3 is a PMEM node.
- Before PMEM is online, no demotion_nodes for node0 and node1.
$ cat /sys/devices/system/node/node0/demotion_nodes
 <show nothing>
- After node3 is online as kmem
$ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
[
  {
    "chardev":"dax0.0",
    "size":1054867456,
    "target_node":3,
    "align":2097152,
    "mode":"system-ram",
    "online_memblocks":0,
    "total_memblocks":7
  }
]
$ cat /sys/devices/system/node/node0/demotion_nodes
3
$ cat /sys/devices/system/node/node1/demotion_nodes
3
$ cat /sys/devices/system/node/node3/demotion_nodes
 <show nothing>

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
 drivers/base/node.c          | 13 +++++++++++++
 include/linux/memory-tiers.h |  6 ++++++
 mm/memory-tiers.c            |  8 ++++++++
 3 files changed, 27 insertions(+)

Comments

Huang, Ying Nov. 2, 2023, 3:17 a.m. UTC | #1
Li Zhijian <lizhijian@fujitsu.com> writes:

> It shows the demotion target nodes of a node. Export this information to
> user directly.
>
> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
> - Before PMEM is online, no demotion_nodes for node0 and node1.
> $ cat /sys/devices/system/node/node0/demotion_nodes
>  <show nothing>
> - After node3 is online as kmem
> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
> [
>   {
>     "chardev":"dax0.0",
>     "size":1054867456,
>     "target_node":3,
>     "align":2097152,
>     "mode":"system-ram",
>     "online_memblocks":0,
>     "total_memblocks":7
>   }
> ]
> $ cat /sys/devices/system/node/node0/demotion_nodes
> 3
> $ cat /sys/devices/system/node/node1/demotion_nodes
> 3
> $ cat /sys/devices/system/node/node3/demotion_nodes
>  <show nothing>

We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
already.  A node in a higher tier can demote to any node in the lower
tiers.  What more needs to be displayed in nodeX/demotion_nodes?

--
Best Regards,
Huang, Ying

> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> ---
>  drivers/base/node.c          | 13 +++++++++++++
>  include/linux/memory-tiers.h |  6 ++++++
>  mm/memory-tiers.c            |  8 ++++++++
>  3 files changed, 27 insertions(+)
>
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 493d533f8375..27e8502548a7 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -7,6 +7,7 @@
>  #include <linux/init.h>
>  #include <linux/mm.h>
>  #include <linux/memory.h>
> +#include <linux/memory-tiers.h>
>  #include <linux/vmstat.h>
>  #include <linux/notifier.h>
>  #include <linux/node.h>
> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>  }
>  static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>  
> +static ssize_t demotion_nodes_show(struct device *dev,
> +			     struct device_attribute *attr, char *buf)
> +{
> +	int ret;
> +	nodemask_t nmask = next_demotion_nodes(dev->id);
> +
> +	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
> +	return ret;
> +}
> +static DEVICE_ATTR_RO(demotion_nodes);
> +
>  static struct attribute *node_dev_attrs[] = {
>  	&dev_attr_meminfo.attr,
>  	&dev_attr_numastat.attr,
>  	&dev_attr_distance.attr,
>  	&dev_attr_vmstat.attr,
> +	&dev_attr_demotion_nodes.attr,
>  	NULL
>  };
>  
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index 437441cdf78f..8eb04923f965 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
> +nodemask_t next_demotion_nodes(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>  bool node_is_toptier(int node);
>  #else
> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>  	return NUMA_NO_NODE;
>  }
>  
> +static inline next_demotion_nodes next_demotion_nodes(int node)
> +{
> +	return NODE_MASK_NONE;
> +}
> +
>  static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>  {
>  	*targets = NODE_MASK_NONE;
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 37a4f59d9585..90047f37d98a 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>  	rcu_read_unlock();
>  }
>  
> +nodemask_t next_demotion_nodes(int node)
> +{
> +	if (!node_demotion)
> +		return NODE_MASK_NONE;
> +
> +	return node_demotion[node].preferred;
> +}
> +
>  /**
>   * next_demotion_node() - Get the next node in the demotion path
>   * @node: The starting node to lookup the next node
Huang, Ying Nov. 2, 2023, 5:18 a.m. UTC | #2
"Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:

>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>> already.  A node in a higher tier can demote to any node in the lower
>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>
> IIRC, they are not the same. memory_tier[number], where the number is shared by
> the memory using the same memory driver(dax/kmem etc). Not reflect the actual distance
> across nodes(different distance will be grouped into the same memory_tier).
> But demotion will only select the nearest nodelist to demote.

In the following patchset, we will use the performance information from
HMAT to place nodes using the same memory driver into different memory
tiers.

https://lore.kernel.org/all/20230926060628.265989-1-ying.huang@intel.com/

The patch is in mm-stable tree.

> Below is an example, node0 node1 are DRAM, node2 node3 are PMEM, but distance to DRAM nodes
> are different.
>  
> # numactl -H
> available: 4 nodes (0-3)
> node 0 cpus: 0
> node 0 size: 964 MB
> node 0 free: 746 MB
> node 1 cpus: 1
> node 1 size: 685 MB
> node 1 free: 455 MB
> node 2 cpus:
> node 2 size: 896 MB
> node 2 free: 897 MB
> node 3 cpus:
> node 3 size: 896 MB
> node 3 free: 896 MB
> node distances:
> node   0   1   2   3
>   0:  10  20  20  25
>   1:  20  10  25  20
>   2:  20  25  10  20
>   3:  25  20  20  10
> # cat /sys/devices/system/node/node0/demotion_nodes
> 2

node 2 is only the preferred demotion target.  In fact, memory in node 0
can be demoted to node 2,3.  Please check demote_folio_list() for
details.

--
Best Regards,
Huang, Ying

> # cat /sys/devices/system/node/node1/demotion_nodes
> 3
> # cat /sys/devices/virtual/memory_tiering/memory_tier22/nodelist
> 2-3
>
> Thanks
> Zhijian
>
> (I hate the outlook native reply composition format.)
> ________________________________________
> From: Huang, Ying <ying.huang@intel.com>
> Sent: Thursday, November 2, 2023 11:17
> To: Li, Zhijian/李 智坚
> Cc: Andrew Morton; Greg Kroah-Hartman; rafael@kernel.org; linux-mm@kvack.org; Gotou, Yasunori/五島 康文; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH RFC 1/4] drivers/base/node: Add demotion_nodes sys infterface
>
> Li Zhijian <lizhijian@fujitsu.com> writes:
>
>> It shows the demotion target nodes of a node. Export this information to
>> user directly.
>>
>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>  <show nothing>
>> - After node3 is online as kmem
>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>> [
>>   {
>>     "chardev":"dax0.0",
>>     "size":1054867456,
>>     "target_node":3,
>>     "align":2097152,
>>     "mode":"system-ram",
>>     "online_memblocks":0,
>>     "total_memblocks":7
>>   }
>> ]
>> $ cat /sys/devices/system/node/node0/demotion_nodes
>> 3
>> $ cat /sys/devices/system/node/node1/demotion_nodes
>> 3
>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>  <show nothing>
>
> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
> already.  A node in a higher tier can demote to any node in the lower
> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>
> --
> Best Regards,
> Huang, Ying
>
>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> ---
>>  drivers/base/node.c          | 13 +++++++++++++
>>  include/linux/memory-tiers.h |  6 ++++++
>>  mm/memory-tiers.c            |  8 ++++++++
>>  3 files changed, 27 insertions(+)
>>
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index 493d533f8375..27e8502548a7 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -7,6 +7,7 @@
>>  #include <linux/init.h>
>>  #include <linux/mm.h>
>>  #include <linux/memory.h>
>> +#include <linux/memory-tiers.h>
>>  #include <linux/vmstat.h>
>>  #include <linux/notifier.h>
>>  #include <linux/node.h>
>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>  }
>>  static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>
>> +static ssize_t demotion_nodes_show(struct device *dev,
>> +                          struct device_attribute *attr, char *buf)
>> +{
>> +     int ret;
>> +     nodemask_t nmask = next_demotion_nodes(dev->id);
>> +
>> +     ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>> +     return ret;
>> +}
>> +static DEVICE_ATTR_RO(demotion_nodes);
>> +
>>  static struct attribute *node_dev_attrs[] = {
>>       &dev_attr_meminfo.attr,
>>       &dev_attr_numastat.attr,
>>       &dev_attr_distance.attr,
>>       &dev_attr_vmstat.attr,
>> +     &dev_attr_demotion_nodes.attr,
>>       NULL
>>  };
>>
>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>> index 437441cdf78f..8eb04923f965 100644
>> --- a/include/linux/memory-tiers.h
>> +++ b/include/linux/memory-tiers.h
>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>  #ifdef CONFIG_MIGRATION
>>  int next_demotion_node(int node);
>> +nodemask_t next_demotion_nodes(int node);
>>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>  bool node_is_toptier(int node);
>>  #else
>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>       return NUMA_NO_NODE;
>>  }
>>
>> +static inline next_demotion_nodes next_demotion_nodes(int node)
>> +{
>> +     return NODE_MASK_NONE;
>> +}
>> +
>>  static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>  {
>>       *targets = NODE_MASK_NONE;
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index 37a4f59d9585..90047f37d98a 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>       rcu_read_unlock();
>>  }
>>
>> +nodemask_t next_demotion_nodes(int node)
>> +{
>> +     if (!node_demotion)
>> +             return NODE_MASK_NONE;
>> +
>> +     return node_demotion[node].preferred;
>> +}
>> +
>>  /**
>>   * next_demotion_node() - Get the next node in the demotion path
>>   * @node: The starting node to lookup the next node
Li Zhijian Nov. 2, 2023, 5:54 a.m. UTC | #3
On 02/11/2023 13:18, Huang, Ying wrote:
> "Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:
> 
>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>> already.  A node in a higher tier can demote to any node in the lower
>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>
>> IIRC, they are not the same. memory_tier[number], where the number is shared by
>> the memory using the same memory driver(dax/kmem etc). Not reflect the actual distance
>> across nodes(different distance will be grouped into the same memory_tier).
>> But demotion will only select the nearest nodelist to demote.
> 
> In the following patchset, we will use the performance information from
> HMAT to place nodes using the same memory driver into different memory
> tiers.
> 
> https://lore.kernel.org/all/20230926060628.265989-1-ying.huang@intel.com/

Thanks for your reminder. It seems like I've fallen behind the world by months.
I will rebase on it later if this patch is still needed.

> 
> The patch is in mm-stable tree.
> 
>> Below is an example, node0 node1 are DRAM, node2 node3 are PMEM, but distance to DRAM nodes
>> are different.
>>   
>> # numactl -H
>> available: 4 nodes (0-3)
>> node 0 cpus: 0
>> node 0 size: 964 MB
>> node 0 free: 746 MB
>> node 1 cpus: 1
>> node 1 size: 685 MB
>> node 1 free: 455 MB
>> node 2 cpus:
>> node 2 size: 896 MB
>> node 2 free: 897 MB
>> node 3 cpus:
>> node 3 size: 896 MB
>> node 3 free: 896 MB
>> node distances:
>> node   0   1   2   3
>>    0:  10  20  20  25
>>    1:  20  10  25  20
>>    2:  20  25  10  20
>>    3:  25  20  20  10
>> # cat /sys/devices/system/node/node0/demotion_nodes
>> 2
> 
> node 2 is only the preferred demotion target.  In fact, memory in node 0
> can be demoted to node 2,3.  Please check demote_folio_list() for
> details.

Have I missed something? At least on the master tree, nd->preferred only includes the
nearest nodes (chosen by a specific algorithm), so in the above NUMA topology, nd->preferred of
node0 is node2 only. The distance from node0 to node3 is 25, which is greater than the distance to node2 (20).

> 1657         int target_nid = next_demotion_node(pgdat->node_id);

So target_nid cannot be node3 IIUC.

(I cooked up these patches weeks ago, so maybe something has changed; I will also take a deeper look later.)

1650 /*
1651  * Take folios on @demote_folios and attempt to demote them to another node.
1652  * Folios which are not demoted are left on @demote_folios.
1653  */
1654 static unsigned int demote_folio_list(struct list_head *demote_folios,
1655                                      struct pglist_data *pgdat)
1656 {
1657         int target_nid = next_demotion_node(pgdat->node_id);
1658         unsigned int nr_succeeded;
1659         nodemask_t allowed_mask;
1660
1661         struct migration_target_control mtc = {
1662                 /*
1663                  * Allocate from 'node', or fail quickly and quietly.
1664                  * When this happens, 'page' will likely just be discarded
1665                  * instead of migrated.
1666                  */
1667                 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
1668                         __GFP_NOMEMALLOC | GFP_NOWAIT,
1669                 .nid = target_nid,
1670                 .nmask = &allowed_mask
1671         };
1672
1673         if (list_empty(demote_folios))
1674                 return 0;
1675
1676         if (target_nid == NUMA_NO_NODE)
1677                 return 0;
1678
1679         node_get_allowed_targets(pgdat, &allowed_mask);
1680
1681         /* Demotion ignores all cpuset and mempolicy settings */
1682         migrate_pages(demote_folios, alloc_demote_folio, NULL,
1683                       (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
1684                       &nr_succeeded);


> 
> --
> Best Regards,
> Huang, Ying
> 
>> # cat /sys/devices/system/node/node1/demotion_nodes
>> 3
>> # cat /sys/devices/virtual/memory_tiering/memory_tier22/nodelist
>> 2-3
>>
>> Thanks
>> Zhijian
>>
>> (I hate the outlook native reply composition format.)
>> ________________________________________
>> From: Huang, Ying <ying.huang@intel.com>
>> Sent: Thursday, November 2, 2023 11:17
>> To: Li, Zhijian/李 智坚
>> Cc: Andrew Morton; Greg Kroah-Hartman; rafael@kernel.org; linux-mm@kvack.org; Gotou, Yasunori/五島 康文; linux-kernel@vger.kernel.org
>> Subject: Re: [PATCH RFC 1/4] drivers/base/node: Add demotion_nodes sys infterface
>>
>> Li Zhijian <lizhijian@fujitsu.com> writes:
>>
>>> It shows the demotion target nodes of a node. Export this information to
>>> user directly.
>>>
>>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>   <show nothing>
>>> - After node3 is online as kmem
>>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>>> [
>>>    {
>>>      "chardev":"dax0.0",
>>>      "size":1054867456,
>>>      "target_node":3,
>>>      "align":2097152,
>>>      "mode":"system-ram",
>>>      "online_memblocks":0,
>>>      "total_memblocks":7
>>>    }
>>> ]
>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>> 3
>>> $ cat /sys/devices/system/node/node1/demotion_nodes
>>> 3
>>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>>   <show nothing>
>>
>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>> already.  A node in a higher tier can demote to any node in the lower
>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>
>> --
>> Best Regards,
>> Huang, Ying
>>
>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>> ---
>>>   drivers/base/node.c          | 13 +++++++++++++
>>>   include/linux/memory-tiers.h |  6 ++++++
>>>   mm/memory-tiers.c            |  8 ++++++++
>>>   3 files changed, 27 insertions(+)
>>>
>>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>>> index 493d533f8375..27e8502548a7 100644
>>> --- a/drivers/base/node.c
>>> +++ b/drivers/base/node.c
>>> @@ -7,6 +7,7 @@
>>>   #include <linux/init.h>
>>>   #include <linux/mm.h>
>>>   #include <linux/memory.h>
>>> +#include <linux/memory-tiers.h>
>>>   #include <linux/vmstat.h>
>>>   #include <linux/notifier.h>
>>>   #include <linux/node.h>
>>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>>   }
>>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>>
>>> +static ssize_t demotion_nodes_show(struct device *dev,
>>> +                          struct device_attribute *attr, char *buf)
>>> +{
>>> +     int ret;
>>> +     nodemask_t nmask = next_demotion_nodes(dev->id);
>>> +
>>> +     ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>>> +     return ret;
>>> +}
>>> +static DEVICE_ATTR_RO(demotion_nodes);
>>> +
>>>   static struct attribute *node_dev_attrs[] = {
>>>        &dev_attr_meminfo.attr,
>>>        &dev_attr_numastat.attr,
>>>        &dev_attr_distance.attr,
>>>        &dev_attr_vmstat.attr,
>>> +     &dev_attr_demotion_nodes.attr,
>>>        NULL
>>>   };
>>>
>>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>>> index 437441cdf78f..8eb04923f965 100644
>>> --- a/include/linux/memory-tiers.h
>>> +++ b/include/linux/memory-tiers.h
>>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>>   void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>>   #ifdef CONFIG_MIGRATION
>>>   int next_demotion_node(int node);
>>> +nodemask_t next_demotion_nodes(int node);
>>>   void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>>   bool node_is_toptier(int node);
>>>   #else
>>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>>        return NUMA_NO_NODE;
>>>   }
>>>
>>> +static inline next_demotion_nodes next_demotion_nodes(int node)
>>> +{
>>> +     return NODE_MASK_NONE;
>>> +}
>>> +
>>>   static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>   {
>>>        *targets = NODE_MASK_NONE;
>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>> index 37a4f59d9585..90047f37d98a 100644
>>> --- a/mm/memory-tiers.c
>>> +++ b/mm/memory-tiers.c
>>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>        rcu_read_unlock();
>>>   }
>>>
>>> +nodemask_t next_demotion_nodes(int node)
>>> +{
>>> +     if (!node_demotion)
>>> +             return NODE_MASK_NONE;
>>> +
>>> +     return node_demotion[node].preferred;
>>> +}
>>> +
>>>   /**
>>>    * next_demotion_node() - Get the next node in the demotion path
>>>    * @node: The starting node to lookup the next node
Huang, Ying Nov. 2, 2023, 5:58 a.m. UTC | #4
"Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:

> On 02/11/2023 13:18, Huang, Ying wrote:
>> "Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:
>> 
>>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>>> already.  A node in a higher tier can demote to any node in the lower
>>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>>
>>> IIRC, they are not the same. memory_tier[number], where the number is shared by
>>> the memory using the same memory driver(dax/kmem etc). Not reflect the actual distance
>>> across nodes(different distance will be grouped into the same memory_tier).
>>> But demotion will only select the nearest nodelist to demote.
>> 
>> In the following patchset, we will use the performance information from
>> HMAT to place nodes using the same memory driver into different memory
>> tiers.
>> 
>> https://lore.kernel.org/all/20230926060628.265989-1-ying.huang@intel.com/
>
> Thanks for your reminder. It seems like I've fallen behind the world by months.
> I will rebase on it later if this patch is still needed.
>
>> 
>> The patch is in mm-stable tree.
>> 
>>> Below is an example, node0 node1 are DRAM, node2 node3 are PMEM, but distance to DRAM nodes
>>> are different.
>>>   
>>> # numactl -H
>>> available: 4 nodes (0-3)
>>> node 0 cpus: 0
>>> node 0 size: 964 MB
>>> node 0 free: 746 MB
>>> node 1 cpus: 1
>>> node 1 size: 685 MB
>>> node 1 free: 455 MB
>>> node 2 cpus:
>>> node 2 size: 896 MB
>>> node 2 free: 897 MB
>>> node 3 cpus:
>>> node 3 size: 896 MB
>>> node 3 free: 896 MB
>>> node distances:
>>> node   0   1   2   3
>>>    0:  10  20  20  25
>>>    1:  20  10  25  20
>>>    2:  20  25  10  20
>>>    3:  25  20  20  10
>>> # cat /sys/devices/system/node/node0/demotion_nodes
>>> 2
>> 
>> node 2 is only the preferred demotion target.  In fact, memory in node 0
>> can be demoted to node 2,3.  Please check demote_folio_list() for
>> details.
>
> Have I missed something, at least the on master tree, nd->preferred only include the
> nearest ones(by specific algorithms), so in above numa topology, nd->preferred of
> node0 is node2 only. node0 distance to node3 is 25 greater than to node2(20).
>
>> 1657         int target_nid = next_demotion_node(pgdat->node_id);
>
> So target_nid cannot be node3 IIUC.
>
> (I cooked this patches weeks ago, maybe something has changed, i will also take a deep look later.)
>
> 1650 /*
> 1651  * Take folios on @demote_folios and attempt to demote them to another node.
> 1652  * Folios which are not demoted are left on @demote_folios.
> 1653  */
> 1654 static unsigned int demote_folio_list(struct list_head *demote_folios,
> 1655                                      struct pglist_data *pgdat)
> 1656 {
> 1657         int target_nid = next_demotion_node(pgdat->node_id);
> 1658         unsigned int nr_succeeded;
> 1659         nodemask_t allowed_mask;
> 1660
> 1661         struct migration_target_control mtc = {
> 1662                 /*
> 1663                  * Allocate from 'node', or fail quickly and quietly.
> 1664                  * When this happens, 'page' will likely just be discarded
> 1665                  * instead of migrated.
> 1666                  */
> 1667                 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
> 1668                         __GFP_NOMEMALLOC | GFP_NOWAIT,
> 1669                 .nid = target_nid,
> 1670                 .nmask = &allowed_mask
> 1671         };
> 1672
> 1673         if (list_empty(demote_folios))
> 1674                 return 0;
> 1675
> 1676         if (target_nid == NUMA_NO_NODE)
> 1677                 return 0;
> 1678
> 1679         node_get_allowed_targets(pgdat, &allowed_mask);
> 1680
> 1681         /* Demotion ignores all cpuset and mempolicy settings */
> 1682         migrate_pages(demote_folios, alloc_demote_folio, NULL,
> 1683                       (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
> 1684                       &nr_succeeded);
>

In alloc_demote_folio(), target_nid is tried first. Then, if
allocation fails, any node in allowed_mask will be tried.

--
Best Regards,
Huang, Ying

>> 
>>> # cat /sys/devices/system/node/node1/demotion_nodes
>>> 3
>>> # cat /sys/devices/virtual/memory_tiering/memory_tier22/nodelist
>>> 2-3
>>>
>>> Thanks
>>> Zhijian
>>>
>>> (I hate the outlook native reply composition format.)
>>> ________________________________________
>>> From: Huang, Ying <ying.huang@intel.com>
>>> Sent: Thursday, November 2, 2023 11:17
>>> To: Li, Zhijian/李 智坚
>>> Cc: Andrew Morton; Greg Kroah-Hartman; rafael@kernel.org; linux-mm@kvack.org; Gotou, Yasunori/五島 康文; linux-kernel@vger.kernel.org
>>> Subject: Re: [PATCH RFC 1/4] drivers/base/node: Add demotion_nodes sys infterface
>>>
>>> Li Zhijian <lizhijian@fujitsu.com> writes:
>>>
>>>> It shows the demotion target nodes of a node. Export this information to
>>>> user directly.
>>>>
>>>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>>>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>>   <show nothing>
>>>> - After node3 is online as kmem
>>>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>>>> [
>>>>    {
>>>>      "chardev":"dax0.0",
>>>>      "size":1054867456,
>>>>      "target_node":3,
>>>>      "align":2097152,
>>>>      "mode":"system-ram",
>>>>      "online_memblocks":0,
>>>>      "total_memblocks":7
>>>>    }
>>>> ]
>>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>> 3
>>>> $ cat /sys/devices/system/node/node1/demotion_nodes
>>>> 3
>>>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>>>   <show nothing>
>>>
>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>> already.  A node in a higher tier can demote to any node in the lower
>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>>
>>> --
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>>> ---
>>>>   drivers/base/node.c          | 13 +++++++++++++
>>>>   include/linux/memory-tiers.h |  6 ++++++
>>>>   mm/memory-tiers.c            |  8 ++++++++
>>>>   3 files changed, 27 insertions(+)
>>>>
>>>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>>>> index 493d533f8375..27e8502548a7 100644
>>>> --- a/drivers/base/node.c
>>>> +++ b/drivers/base/node.c
>>>> @@ -7,6 +7,7 @@
>>>>   #include <linux/init.h>
>>>>   #include <linux/mm.h>
>>>>   #include <linux/memory.h>
>>>> +#include <linux/memory-tiers.h>
>>>>   #include <linux/vmstat.h>
>>>>   #include <linux/notifier.h>
>>>>   #include <linux/node.h>
>>>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>>>   }
>>>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>>>
>>>> +static ssize_t demotion_nodes_show(struct device *dev,
>>>> +                          struct device_attribute *attr, char *buf)
>>>> +{
>>>> +     int ret;
>>>> +     nodemask_t nmask = next_demotion_nodes(dev->id);
>>>> +
>>>> +     ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>>>> +     return ret;
>>>> +}
>>>> +static DEVICE_ATTR_RO(demotion_nodes);
>>>> +
>>>>   static struct attribute *node_dev_attrs[] = {
>>>>        &dev_attr_meminfo.attr,
>>>>        &dev_attr_numastat.attr,
>>>>        &dev_attr_distance.attr,
>>>>        &dev_attr_vmstat.attr,
>>>> +     &dev_attr_demotion_nodes.attr,
>>>>        NULL
>>>>   };
>>>>
>>>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>>>> index 437441cdf78f..8eb04923f965 100644
>>>> --- a/include/linux/memory-tiers.h
>>>> +++ b/include/linux/memory-tiers.h
>>>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>>>   void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>>>   #ifdef CONFIG_MIGRATION
>>>>   int next_demotion_node(int node);
>>>> +nodemask_t next_demotion_nodes(int node);
>>>>   void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>>>   bool node_is_toptier(int node);
>>>>   #else
>>>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>>>        return NUMA_NO_NODE;
>>>>   }
>>>>
>>>> +static inline next_demotion_nodes next_demotion_nodes(int node)
>>>> +{
>>>> +     return NODE_MASK_NONE;
>>>> +}
>>>> +
>>>>   static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>>   {
>>>>        *targets = NODE_MASK_NONE;
>>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>>> index 37a4f59d9585..90047f37d98a 100644
>>>> --- a/mm/memory-tiers.c
>>>> +++ b/mm/memory-tiers.c
>>>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>>        rcu_read_unlock();
>>>>   }
>>>>
>>>> +nodemask_t next_demotion_nodes(int node)
>>>> +{
>>>> +     if (!node_demotion)
>>>> +             return NODE_MASK_NONE;
>>>> +
>>>> +     return node_demotion[node].preferred;
>>>> +}
>>>> +
>>>>   /**
>>>>    * next_demotion_node() - Get the next node in the demotion path
>>>>    * @node: The starting node to lookup the next node
Li Zhijian Nov. 3, 2023, 3:05 a.m. UTC | #5
On 02/11/2023 13:58, Huang, Ying wrote:
> "Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:
> 
>> On 02/11/2023 13:18, Huang, Ying wrote:
>>> "Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:
>>>
>>>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>>>> already.  A node in a higher tier can demote to any node in the lower
>>>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>>>
>>>> IIRC, they are not the same. memory_tier[number], where the number is shared by
>>>> the memory using the same memory driver(dax/kmem etc). Not reflect the actual distance
>>>> across nodes(different distance will be grouped into the same memory_tier).
>>>> But demotion will only select the nearest nodelist to demote.
>>>
>>> In the following patchset, we will use the performance information from
>>> HMAT to place nodes using the same memory driver into different memory
>>> tiers.
>>>
>>> https://lore.kernel.org/all/20230926060628.265989-1-ying.huang@intel.com/
>>
>> Thanks for your reminder. It seems like I've fallen behind the world by months.
>> I will rebase on it later if this patch is still needed.
>>
>>>
>>> The patch is in mm-stable tree.
>>>
>>>> Below is an example, node0 node1 are DRAM, node2 node3 are PMEM, but distance to DRAM nodes
>>>> are different.
>>>>    
>>>> # numactl -H
>>>> available: 4 nodes (0-3)
>>>> node 0 cpus: 0
>>>> node 0 size: 964 MB
>>>> node 0 free: 746 MB
>>>> node 1 cpus: 1
>>>> node 1 size: 685 MB
>>>> node 1 free: 455 MB
>>>> node 2 cpus:
>>>> node 2 size: 896 MB
>>>> node 2 free: 897 MB
>>>> node 3 cpus:
>>>> node 3 size: 896 MB
>>>> node 3 free: 896 MB
>>>> node distances:
>>>> node   0   1   2   3
>>>>     0:  10  20  20  25
>>>>     1:  20  10  25  20
>>>>     2:  20  25  10  20
>>>>     3:  25  20  20  10
>>>> # cat /sys/devices/system/node/node0/demotion_nodes
>>>> 2
>>>
>>> node 2 is only the preferred demotion target.  In fact, memory in node 0
>>> can be demoted to node 2,3.  Please check demote_folio_list() for
>>> details.
>>
>> Have I missed something, at least the on master tree, nd->preferred only include the
>> nearest ones(by specific algorithms), so in above numa topology, nd->preferred of
>> node0 is node2 only. node0 distance to node3 is 25 greater than to node2(20).
>>
>>> 1657         int target_nid = next_demotion_node(pgdat->node_id);
>>
>> So target_nid cannot be node3 IIUC.
>>
>> (I cooked this patches weeks ago, maybe something has changed, i will also take a deep look later.)
>>
>> 1650 /*
>> 1651  * Take folios on @demote_folios and attempt to demote them to another node.
>> 1652  * Folios which are not demoted are left on @demote_folios.
>> 1653  */
>> 1654 static unsigned int demote_folio_list(struct list_head *demote_folios,
>> 1655                                      struct pglist_data *pgdat)
>> 1656 {
>> 1657         int target_nid = next_demotion_node(pgdat->node_id);
>> 1658         unsigned int nr_succeeded;
>> 1659         nodemask_t allowed_mask;
>> 1660
>> 1661         struct migration_target_control mtc = {
>> 1662                 /*
>> 1663                  * Allocate from 'node', or fail quickly and quietly.
>> 1664                  * When this happens, 'page' will likely just be discarded
>> 1665                  * instead of migrated.
>> 1666                  */
>> 1667                 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
>> 1668                         __GFP_NOMEMALLOC | GFP_NOWAIT,
>> 1669                 .nid = target_nid,
>> 1670                 .nmask = &allowed_mask
>> 1671         };
>> 1672
>> 1673         if (list_empty(demote_folios))
>> 1674                 return 0;
>> 1675
>> 1676         if (target_nid == NUMA_NO_NODE)
>> 1677                 return 0;
>> 1678
>> 1679         node_get_allowed_targets(pgdat, &allowed_mask);
>> 1680
>> 1681         /* Demotion ignores all cpuset and mempolicy settings */
>> 1682         migrate_pages(demote_folios, alloc_demote_folio, NULL,
>> 1683                       (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>> 1684                       &nr_succeeded);
>>
> 
> In alloc_demote_folio(), target_nid is tried firstly. Then, if
> allocation fails, any node in allowed_mask will be tried.


Thank you very much for your kind explanation. You are right.
Let me rethink whether it is still needed...


BTW, I will split PATCH2 out as a separate patch first.

Thanks
Zhijian


> 
> --
> Best Regards,
> Huang, Ying
> 
>>>
>>>> # cat /sys/devices/system/node/node1/demotion_nodes
>>>> 3
>>>> # cat /sys/devices/virtual/memory_tiering/memory_tier22/nodelist
>>>> 2-3
>>>>
>>>> Thanks
>>>> Zhijian
>>>>
>>>> (I hate the outlook native reply composition format.)
>>>> ________________________________________
>>>> From: Huang, Ying <ying.huang@intel.com>
>>>> Sent: Thursday, November 2, 2023 11:17
>>>> To: Li, Zhijian/李 智坚
>>>> Cc: Andrew Morton; Greg Kroah-Hartman; rafael@kernel.org; linux-mm@kvack.org; Gotou, Yasunori/五島 康文; linux-kernel@vger.kernel.org
>>>> Subject: Re: [PATCH RFC 1/4] drivers/base/node: Add demotion_nodes sys infterface
>>>>
>>>> Li Zhijian <lizhijian@fujitsu.com> writes:
>>>>
>>>>> It shows the demotion target nodes of a node. Export this information to
>>>>> user directly.
>>>>>
>>>>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>>>>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>>>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>>>    <show nothing>
>>>>> - After node3 is online as kmem
>>>>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>>>>> [
>>>>>     {
>>>>>       "chardev":"dax0.0",
>>>>>       "size":1054867456,
>>>>>       "target_node":3,
>>>>>       "align":2097152,
>>>>>       "mode":"system-ram",
>>>>>       "online_memblocks":0,
>>>>>       "total_memblocks":7
>>>>>     }
>>>>> ]
>>>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>>> 3
>>>>> $ cat /sys/devices/system/node/node1/demotion_nodes
>>>>> 3
>>>>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>>>>    <show nothing>
>>>>
>>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>>> already.  A node in a higher tier can demote to any node in the lower
>>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>>>
>>>> --
>>>> Best Regards,
>>>> Huang, Ying
>>>>
>>>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>>>> ---
>>>>>    drivers/base/node.c          | 13 +++++++++++++
>>>>>    include/linux/memory-tiers.h |  6 ++++++
>>>>>    mm/memory-tiers.c            |  8 ++++++++
>>>>>    3 files changed, 27 insertions(+)
>>>>>
>>>>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>>>>> index 493d533f8375..27e8502548a7 100644
>>>>> --- a/drivers/base/node.c
>>>>> +++ b/drivers/base/node.c
>>>>> @@ -7,6 +7,7 @@
>>>>>    #include <linux/init.h>
>>>>>    #include <linux/mm.h>
>>>>>    #include <linux/memory.h>
>>>>> +#include <linux/memory-tiers.h>
>>>>>    #include <linux/vmstat.h>
>>>>>    #include <linux/notifier.h>
>>>>>    #include <linux/node.h>
>>>>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>>>>    }
>>>>>    static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>>>>
>>>>> +static ssize_t demotion_nodes_show(struct device *dev,
>>>>> +                          struct device_attribute *attr, char *buf)
>>>>> +{
>>>>> +     int ret;
>>>>> +     nodemask_t nmask = next_demotion_nodes(dev->id);
>>>>> +
>>>>> +     ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>>>>> +     return ret;
>>>>> +}
>>>>> +static DEVICE_ATTR_RO(demotion_nodes);
>>>>> +
>>>>>    static struct attribute *node_dev_attrs[] = {
>>>>>         &dev_attr_meminfo.attr,
>>>>>         &dev_attr_numastat.attr,
>>>>>         &dev_attr_distance.attr,
>>>>>         &dev_attr_vmstat.attr,
>>>>> +     &dev_attr_demotion_nodes.attr,
>>>>>         NULL
>>>>>    };
>>>>>
>>>>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>>>>> index 437441cdf78f..8eb04923f965 100644
>>>>> --- a/include/linux/memory-tiers.h
>>>>> +++ b/include/linux/memory-tiers.h
>>>>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>>>>    void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>>>>    #ifdef CONFIG_MIGRATION
>>>>>    int next_demotion_node(int node);
>>>>> +nodemask_t next_demotion_nodes(int node);
>>>>>    void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>>>>    bool node_is_toptier(int node);
>>>>>    #else
>>>>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>>>>         return NUMA_NO_NODE;
>>>>>    }
>>>>>
>>>>> +static inline next_demotion_nodes next_demotion_nodes(int node)
>>>>> +{
>>>>> +     return NODE_MASK_NONE;
>>>>> +}
>>>>> +
>>>>>    static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>>>    {
>>>>>         *targets = NODE_MASK_NONE;
>>>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>>>> index 37a4f59d9585..90047f37d98a 100644
>>>>> --- a/mm/memory-tiers.c
>>>>> +++ b/mm/memory-tiers.c
>>>>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>>>         rcu_read_unlock();
>>>>>    }
>>>>>
>>>>> +nodemask_t next_demotion_nodes(int node)
>>>>> +{
>>>>> +     if (!node_demotion)
>>>>> +             return NODE_MASK_NONE;
>>>>> +
>>>>> +     return node_demotion[node].preferred;
>>>>> +}
>>>>> +
>>>>>    /**
>>>>>     * next_demotion_node() - Get the next node in the demotion path
>>>>>     * @node: The starting node to lookup the next node
Li Zhijian Jan. 30, 2024, 8:53 a.m. UTC | #6
Hi Ying


I need to pick up this thread/patch again.

> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
> already.  A node in a higher tier can demote to any node in the lower
> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
> 

Yes, /sys/devices/virtual/memory_tiering/memory_tierN/nodelist
is intended to show the nodes in memory_tierN. But IMHO, it's not enough, especially
for the preferred demotion node(s).

Currently, when a demotion occurs, it will prioritize selecting a node
from the preferred nodes as the destination node for the demotion. If
the preferred nodes do not meet the requirements, it will try all
the lower memory tier nodes until it finds a suitable demotion destination
node or ultimately fails.
                                                                                 
However, the existing interface only lists the nodes of each tier. If
administrators want to know all the possible demotion destinations for a
given node, they need to work them out themselves:
Step 1: find the memory tier where the given node is located.
Step 2: list all the nodes in all the tiers below it.
                                                                                    
It is even more difficult to determine the preferred nodes, which depend on
more factors, such as distance. In the following example, we have 6 nodes
split into three memory tiers.
                                                                                 
For example, with an emulated HMAT NUMA topology:
> $ numactl -H                                                                  
> available: 6 nodes (0-5)                                                      
> node 0 cpus: 0                                                                
> node 0 size: 1974 MB                                                          
> node 0 free: 1767 MB                                                             
> node 1 cpus: 1                                                                
> node 1 size: 1694 MB                                                          
> node 1 free: 1454 MB                                                          
> node 2 cpus:                                                                  
> node 2 size: 896 MB                                                           
> node 2 free: 896 MB                                                           
> node 3 cpus:                                                                  
> node 3 size: 896 MB                                                           
> node 3 free: 896 MB                                                           
> node 4 cpus:                                                                  
> node 4 size: 896 MB                                                           
> node 4 free: 896 MB                                                           
> node 5 cpus:                                                                  
> node 5 size: 896 MB                                                           
> node 5 free: 896 MB                                                           
> node distances:                                                               
> node   0   1   2   3   4   5                                                  
>   0:  10  31  21  41  21  41                                                  
>   1:  31  10  41  21  41  21                                                  
>   2:  21  41  10  51  21  51                                                  
>   3:  31  21  51  10  51  21                                                  
>   4:  21  41  21  51  10  51                                                  
>   5:  31  21  51  21  51  10                                                  
>                                                                               
> $ cat memory_tier4/nodelist                                                   
> 0-1                                                                           
> $ cat memory_tier12/nodelist                                                  
> 2,5
> $ cat memory_tier54/nodelist                                                  
> 3-4                                                                           
                                                                                 
For above topology, memory-tier will build the demotion path for each node
like this:
node[0].preferred = 2
node[0].demotion_targets = 2-5
node[1].preferred = 5
node[1].demotion_targets = 2-5
node[2].preferred = 4
node[2].demotion_targets = 3-4
node[3].preferred = <empty>
node[3].demotion_targets = <empty>
node[4].preferred = <empty>
node[4].demotion_targets = <empty>
node[5].preferred = 3
node[5].demotion_targets = 3-4
                                                                          
But this demotion path is not explicitly visible to the administrator. According to
feedback from our customers, they also think it would be helpful to know the demotion
path built by the kernel in order to understand the demotion behavior.

So I think we should have 2 new interfaces for each node:

/sys/devices/system/node/nodeN/demotion_allowed_nodes
/sys/devices/system/node/nodeN/demotion_preferred_nodes

I value your opinion, and I'd like to know what you think about this.


Thanks
Zhijian


On 02/11/2023 11:17, Huang, Ying wrote:
> Li Zhijian <lizhijian@fujitsu.com> writes:
> 
>> It shows the demotion target nodes of a node. Export this information to
>> user directly.
>>
>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>   <show nothing>
>> - After node3 is online as kmem
>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>> [
>>    {
>>      "chardev":"dax0.0",
>>      "size":1054867456,
>>      "target_node":3,
>>      "align":2097152,
>>      "mode":"system-ram",
>>      "online_memblocks":0,
>>      "total_memblocks":7
>>    }
>> ]
>> $ cat /sys/devices/system/node/node0/demotion_nodes
>> 3
>> $ cat /sys/devices/system/node/node1/demotion_nodes
>> 3
>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>   <show nothing>
> 
> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
> already.  A node in a higher tier can demote to any node in the lower
> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
> 
> --
> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> ---
>>   drivers/base/node.c          | 13 +++++++++++++
>>   include/linux/memory-tiers.h |  6 ++++++
>>   mm/memory-tiers.c            |  8 ++++++++
>>   3 files changed, 27 insertions(+)
>>
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index 493d533f8375..27e8502548a7 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -7,6 +7,7 @@
>>   #include <linux/init.h>
>>   #include <linux/mm.h>
>>   #include <linux/memory.h>
>> +#include <linux/memory-tiers.h>
>>   #include <linux/vmstat.h>
>>   #include <linux/notifier.h>
>>   #include <linux/node.h>
>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>   }
>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>   
>> +static ssize_t demotion_nodes_show(struct device *dev,
>> +			     struct device_attribute *attr, char *buf)
>> +{
>> +	int ret;
>> +	nodemask_t nmask = next_demotion_nodes(dev->id);
>> +
>> +	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>> +	return ret;
>> +}
>> +static DEVICE_ATTR_RO(demotion_nodes);
>> +
>>   static struct attribute *node_dev_attrs[] = {
>>   	&dev_attr_meminfo.attr,
>>   	&dev_attr_numastat.attr,
>>   	&dev_attr_distance.attr,
>>   	&dev_attr_vmstat.attr,
>> +	&dev_attr_demotion_nodes.attr,
>>   	NULL
>>   };
>>   
>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>> index 437441cdf78f..8eb04923f965 100644
>> --- a/include/linux/memory-tiers.h
>> +++ b/include/linux/memory-tiers.h
>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>   void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>   #ifdef CONFIG_MIGRATION
>>   int next_demotion_node(int node);
>> +nodemask_t next_demotion_nodes(int node);
>>   void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>   bool node_is_toptier(int node);
>>   #else
>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>   	return NUMA_NO_NODE;
>>   }
>>   
>> +static inline next_demotion_nodes next_demotion_nodes(int node)
>> +{
>> +	return NODE_MASK_NONE;
>> +}
>> +
>>   static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>   {
>>   	*targets = NODE_MASK_NONE;
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index 37a4f59d9585..90047f37d98a 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>   	rcu_read_unlock();
>>   }
>>   
>> +nodemask_t next_demotion_nodes(int node)
>> +{
>> +	if (!node_demotion)
>> +		return NODE_MASK_NONE;
>> +
>> +	return node_demotion[node].preferred;
>> +}
>> +
>>   /**
>>    * next_demotion_node() - Get the next node in the demotion path
>>    * @node: The starting node to lookup the next node
Huang, Ying Jan. 31, 2024, 1:13 a.m. UTC | #7
Li Zhijian <lizhijian@fujitsu.com> writes:

> Hi Ying
>
>
> I need to pick up this thread/patch again.
>
>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>> already.  A node in a higher tier can demote to any node in the lower
>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>> 
>
> Yes, it's believed that /sys/devices/virtual/memory_tiering/memory_tierN/nodelist
> are intended to show nodes in memory_tierN. But IMHO, it's not enough, especially
> for the preferred demotion node(s).
>
> Currently, when a demotion occurs, it will prioritize selecting a node
> from the preferred nodes as the destination node for the demotion. If
> the preferred nodes does not meet the requirements, it will try from all
> the lower memory tier nodes until it finds a suitable demotion destination
> node or ultimately fails.
>                                                                                 However,
> currently it only lists the nodes of each tier. If the
> administrators want to know all the possible demotion destinations for a
> given node, they need to calculate it themselves:
> Step 1, find the memory tier where the given node is located
> Step 2, list all nodes under all its lower tiers
>                                                                                    It
> will be even more difficult to know the preferred nodes which depend
> on
> more factors, distance etc. For the following example, we may have 6 nodes
> splitting into three memory tiers.
>                                                                                 For
> emulated hmat numa topology example:
>> $ numactl -H
>> available: 6 nodes (0-5)
>> node 0 cpus: 0
>> node 0 size: 1974 MB
>> node 0 free: 1767 MB
>> node 1 cpus: 1
>> node 1 size: 1694 MB
>> node 1 free: 1454 MB
>> node 2 cpus:
>> node 2 size: 896 MB
>> node 2 free: 896 MB
>> node 3 cpus:
>> node 3 size: 896 MB
>> node 3 free: 896 MB
>> node 4 cpus:
>> node 4 size: 896 MB
>> node 4 free: 896 MB
>> node 5 cpus:
>> node 5 size: 896 MB
>> node 5 free: 896 MB
>> node distances:
>> node   0   1   2   3   4   5
>> 0:  10  31  21  41  21  41
>> 1:  31  10  41  21  41  21
>> 2:  21  41  10  51  21  51
>> 3:  31  21  51  10  51  21
>> 4:  21  41  21  51  10  51
>> 5:  31  21  51  21  51  10
>> $ cat memory_tier4/nodelist
>> 0-1
>> $ cat memory_tier12/nodelist
>> 2,5
>> $ cat memory_tier54/nodelist
>> 3-4                                                                           
>                                                                                 For
> above topology, memory-tier will build the demotion path for each node
> like this:
> node[0].preferred = 2
> node[0].demotion_targets = 2-5
> node[1].preferred = 5
> node[1].demotion_targets = 2-5
> node[2].preferred = 4
> node[2].demotion_targets = 3-4
> node[3].preferred = <empty>
> node[3].demotion_targets = <empty>
> node[4].preferred = <empty>
> node[4].demotion_targets = <empty>
> node[5].preferred = 3
> node[5].demotion_targets = 3-4
>                                                                          But
> this demotion path is not explicitly known to administrator. And with
> the
> feedback from our customers, they also think it is helpful to know demotion
> path built by kernel to understand the demotion behaviors.
>
> So i think we should have 2 new interfaces for each node:
>
> /sys/devices/system/node/nodeN/demotion_allowed_nodes
> /sys/devices/system/node/nodeN/demotion_preferred_nodes
>
> I value your opinion, and I'd like to know what you think about...

Per my understanding, we will not expose everything inside kernel to
user space.  For page placement in a tiered memory system, demotion is
just a part of the story.  For example, if the DRAM of a system becomes
full, new page allocation will fall back to the CXL memory.  Have we
exposed the default page allocation fallback order to user space?

All in all, in my opinion, we should expose as little as possible to user
space because we need to maintain the ABI forever.

--
Best Regards,
Huang, Ying

>
> On 02/11/2023 11:17, Huang, Ying wrote:
>> Li Zhijian <lizhijian@fujitsu.com> writes:
>> 
>>> It shows the demotion target nodes of a node. Export this information to
>>> user directly.
>>>
>>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>   <show nothing>
>>> - After node3 is online as kmem
>>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>>> [
>>>    {
>>>      "chardev":"dax0.0",
>>>      "size":1054867456,
>>>      "target_node":3,
>>>      "align":2097152,
>>>      "mode":"system-ram",
>>>      "online_memblocks":0,
>>>      "total_memblocks":7
>>>    }
>>> ]
>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>> 3
>>> $ cat /sys/devices/system/node/node1/demotion_nodes
>>> 3
>>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>>   <show nothing>
>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>> already.  A node in a higher tier can demote to any node in the lower
>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>> --
>> Best Regards,
>> Huang, Ying
>> 
>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>> ---
>>>   drivers/base/node.c          | 13 +++++++++++++
>>>   include/linux/memory-tiers.h |  6 ++++++
>>>   mm/memory-tiers.c            |  8 ++++++++
>>>   3 files changed, 27 insertions(+)
>>>
>>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>>> index 493d533f8375..27e8502548a7 100644
>>> --- a/drivers/base/node.c
>>> +++ b/drivers/base/node.c
>>> @@ -7,6 +7,7 @@
>>>   #include <linux/init.h>
>>>   #include <linux/mm.h>
>>>   #include <linux/memory.h>
>>> +#include <linux/memory-tiers.h>
>>>   #include <linux/vmstat.h>
>>>   #include <linux/notifier.h>
>>>   #include <linux/node.h>
>>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>>   }
>>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>>   +static ssize_t demotion_nodes_show(struct device *dev,
>>> +			     struct device_attribute *attr, char *buf)
>>> +{
>>> +	int ret;
>>> +	nodemask_t nmask = next_demotion_nodes(dev->id);
>>> +
>>> +	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>>> +	return ret;
>>> +}
>>> +static DEVICE_ATTR_RO(demotion_nodes);
>>> +
>>>   static struct attribute *node_dev_attrs[] = {
>>>   	&dev_attr_meminfo.attr,
>>>   	&dev_attr_numastat.attr,
>>>   	&dev_attr_distance.attr,
>>>   	&dev_attr_vmstat.attr,
>>> +	&dev_attr_demotion_nodes.attr,
>>>   	NULL
>>>   };
>>>   diff --git a/include/linux/memory-tiers.h
>>> b/include/linux/memory-tiers.h
>>> index 437441cdf78f..8eb04923f965 100644
>>> --- a/include/linux/memory-tiers.h
>>> +++ b/include/linux/memory-tiers.h
>>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>>   void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>>   #ifdef CONFIG_MIGRATION
>>>   int next_demotion_node(int node);
>>> +nodemask_t next_demotion_nodes(int node);
>>>   void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>>   bool node_is_toptier(int node);
>>>   #else
>>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>>   	return NUMA_NO_NODE;
>>>   }
>>>   +static inline next_demotion_nodes next_demotion_nodes(int node)
>>> +{
>>> +	return NODE_MASK_NONE;
>>> +}
>>> +
>>>   static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>   {
>>>   	*targets = NODE_MASK_NONE;
>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>> index 37a4f59d9585..90047f37d98a 100644
>>> --- a/mm/memory-tiers.c
>>> +++ b/mm/memory-tiers.c
>>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>   	rcu_read_unlock();
>>>   }
>>>   +nodemask_t next_demotion_nodes(int node)
>>> +{
>>> +	if (!node_demotion)
>>> +		return NODE_MASK_NONE;
>>> +
>>> +	return node_demotion[node].preferred;
>>> +}
>>> +
>>>   /**
>>>    * next_demotion_node() - Get the next node in the demotion path
>>>    * @node: The starting node to lookup the next node
Li Zhijian Jan. 31, 2024, 3:18 a.m. UTC | #8
+CC Jagdish,

who may also still be interested in this interface.
You once tried to add such an interface[1], but memory-tier was introduced afterwards.

[1]: [PATCH v3 6/7] mm: demotion: expose per-node demotion targets via sysfs
https://lore.kernel.org/all/20220422195516.10769-7-jvgediya@linux.ibm.com/


On 31/01/2024 09:13, Huang, Ying wrote:
> Li Zhijian <lizhijian@fujitsu.com> writes:
> 
>> Hi Ying
>>
>>
>> I need to pick up this thread/patch again.
>>
>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>> already.  A node in a higher tier can demote to any node in the lower
>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>>
>>
>> Yes, it's believed that /sys/devices/virtual/memory_tiering/memory_tierN/nodelist
>> are intended to show nodes in memory_tierN. But IMHO, it's not enough, especially
>> for the preferred demotion node(s).
>>
>> Currently, when a demotion occurs, it will prioritize selecting a node
>> from the preferred nodes as the destination node for the demotion. If
>> the preferred nodes does not meet the requirements, it will try from all
>> the lower memory tier nodes until it finds a suitable demotion destination
>> node or ultimately fails.
>>                                                                                  However,
>> currently it only lists the nodes of each tier. If the
>> administrators want to know all the possible demotion destinations for a
>> given node, they need to calculate it themselves:
>> Step 1, find the memory tier where the given node is located
>> Step 2, list all nodes under all its lower tiers
>>                                                                                     It
>> will be even more difficult to know the preferred nodes which depend
>> on
>> more factors, distance etc. For the following example, we may have 6 nodes
>> splitting into three memory tiers.
>>                                                                                  For
>> emulated hmat numa topology example:
>>> $ numactl -H
>>> available: 6 nodes (0-5)
>>> node 0 cpus: 0
>>> node 0 size: 1974 MB
>>> node 0 free: 1767 MB
>>> node 1 cpus: 1
>>> node 1 size: 1694 MB
>>> node 1 free: 1454 MB
>>> node 2 cpus:
>>> node 2 size: 896 MB
>>> node 2 free: 896 MB
>>> node 3 cpus:
>>> node 3 size: 896 MB
>>> node 3 free: 896 MB
>>> node 4 cpus:
>>> node 4 size: 896 MB
>>> node 4 free: 896 MB
>>> node 5 cpus:
>>> node 5 size: 896 MB
>>> node 5 free: 896 MB
>>> node distances:
>>> node   0   1   2   3   4   5
>>> 0:  10  31  21  41  21  41
>>> 1:  31  10  41  21  41  21
>>> 2:  21  41  10  51  21  51
>>> 3:  31  21  51  10  51  21
>>> 4:  21  41  21  51  10  51
>>> 5:  31  21  51  21  51  10
>>> $ cat memory_tier4/nodelist
>>> 0-1
>>> $ cat memory_tier12/nodelist
>>> 2,5
>>> $ cat memory_tier54/nodelist
>>> 3-4
>>                                                                                  For
>> above topology, memory-tier will build the demotion path for each node
>> like this:
>> node[0].preferred = 2
>> node[0].demotion_targets = 2-5
>> node[1].preferred = 5
>> node[1].demotion_targets = 2-5
>> node[2].preferred = 4
>> node[2].demotion_targets = 3-4
>> node[3].preferred = <empty>
>> node[3].demotion_targets = <empty>
>> node[4].preferred = <empty>
>> node[4].demotion_targets = <empty>
>> node[5].preferred = 3
>> node[5].demotion_targets = 3-4
>>                                                                           But
>> this demotion path is not explicitly known to administrator. And with
>> the
>> feedback from our customers, they also think it is helpful to know demotion
>> path built by kernel to understand the demotion behaviors.
>>
>> So i think we should have 2 new interfaces for each node:
>>
>> /sys/devices/system/node/nodeN/demotion_allowed_nodes
>> /sys/devices/system/node/nodeN/demotion_preferred_nodes
>>
>> I value your opinion, and I'd like to know what you think about...
> 
> Per my understanding, we will not expose everything inside kernel to
> user space.  For page placement in a tiered memory system, demotion is
> just a part of the story.  For example, if the DRAM of a system becomes
> full, new page allocation will fall back to the CXL memory.  Have we
> exposed the default page allocation fallback order to user space?

Good question. I have no answer yet, but I think we can get the fallback order
from dmesg now.

As a next step, we will also try to improve user space tools, such as numactl,
so that they can show the demotion path with the help of this exposed information.


Thanks
Zhijian

> 
> All in all, in my opinion, we only expose as little as possible to user
> space because we need to maintain the ABI for ever.

> 
> --
> Best Regards,
> Huang, Ying
> 
>>
>> On 02/11/2023 11:17, Huang, Ying wrote:
>>> Li Zhijian <lizhijian@fujitsu.com> writes:
>>>
>>>> It shows the demotion target nodes of a node. Export this information to
>>>> user directly.
>>>>
>>>> Below is an example where node0 node1 are DRAM, node3 is a PMEM node.
>>>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>>    <show nothing>
>>>> - After node3 is online as kmem
>>>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 && daxctl online-memory dax0.0
>>>> [
>>>>     {
>>>>       "chardev":"dax0.0",
>>>>       "size":1054867456,
>>>>       "target_node":3,
>>>>       "align":2097152,
>>>>       "mode":"system-ram",
>>>>       "online_memblocks":0,
>>>>       "total_memblocks":7
>>>>     }
>>>> ]
>>>> $ cat /sys/devices/system/node/node0/demotion_nodes
>>>> 3
>>>> $ cat /sys/devices/system/node/node1/demotion_nodes
>>>> 3
>>>> $ cat /sys/devices/system/node/node3/demotion_nodes
>>>>    <show nothing>
>>> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>>> already.  A node in a higher tier can demote to any node in the lower
>>> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>>> --
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>>> ---
>>>>    drivers/base/node.c          | 13 +++++++++++++
>>>>    include/linux/memory-tiers.h |  6 ++++++
>>>>    mm/memory-tiers.c            |  8 ++++++++
>>>>    3 files changed, 27 insertions(+)
>>>>
>>>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>>>> index 493d533f8375..27e8502548a7 100644
>>>> --- a/drivers/base/node.c
>>>> +++ b/drivers/base/node.c
>>>> @@ -7,6 +7,7 @@
>>>>    #include <linux/init.h>
>>>>    #include <linux/mm.h>
>>>>    #include <linux/memory.h>
>>>> +#include <linux/memory-tiers.h>
>>>>    #include <linux/vmstat.h>
>>>>    #include <linux/notifier.h>
>>>>    #include <linux/node.h>
>>>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device *dev,
>>>>    }
>>>>    static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>>>>    +static ssize_t demotion_nodes_show(struct device *dev,
>>>> +			     struct device_attribute *attr, char *buf)
>>>> +{
>>>> +	int ret;
>>>> +	nodemask_t nmask = next_demotion_nodes(dev->id);
>>>> +
>>>> +	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>>>> +	return ret;
>>>> +}
>>>> +static DEVICE_ATTR_RO(demotion_nodes);
>>>> +
>>>>    static struct attribute *node_dev_attrs[] = {
>>>>    	&dev_attr_meminfo.attr,
>>>>    	&dev_attr_numastat.attr,
>>>>    	&dev_attr_distance.attr,
>>>>    	&dev_attr_vmstat.attr,
>>>> +	&dev_attr_demotion_nodes.attr,
>>>>    	NULL
>>>>    };
>>>>    diff --git a/include/linux/memory-tiers.h
>>>> b/include/linux/memory-tiers.h
>>>> index 437441cdf78f..8eb04923f965 100644
>>>> --- a/include/linux/memory-tiers.h
>>>> +++ b/include/linux/memory-tiers.h
>>>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>>>    void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>>>    #ifdef CONFIG_MIGRATION
>>>>    int next_demotion_node(int node);
>>>> +nodemask_t next_demotion_nodes(int node);
>>>>    void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
>>>>    bool node_is_toptier(int node);
>>>>    #else
>>>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>>>>    	return NUMA_NO_NODE;
>>>>    }
>>>>    +static inline next_demotion_nodes next_demotion_nodes(int node)
>>>> +{
>>>> +	return NODE_MASK_NONE;
>>>> +}
>>>> +
>>>>    static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>>    {
>>>>    	*targets = NODE_MASK_NONE;
>>>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>>>> index 37a4f59d9585..90047f37d98a 100644
>>>> --- a/mm/memory-tiers.c
>>>> +++ b/mm/memory-tiers.c
>>>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
>>>>    	rcu_read_unlock();
>>>>    }
>>>>    +nodemask_t next_demotion_nodes(int node)
>>>> +{
>>>> +	if (!node_demotion)
>>>> +		return NODE_MASK_NONE;
>>>> +
>>>> +	return node_demotion[node].preferred;
>>>> +}
>>>> +
>>>>    /**
>>>>     * next_demotion_node() - Get the next node in the demotion path
>>>>     * @node: The starting node to lookup the next node
Yasunori Gotou (Fujitsu) Jan. 31, 2024, 6:23 a.m. UTC | #9
Hello,

> Li Zhijian <lizhijian@fujitsu.com> writes:
> 
> > Hi Ying
> >
> > I need to pick up this thread/patch again.
> >
> >> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
> >> already.  A node in a higher tier can demote to any node in the lower
> >> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
> >>
> >
> > Yes, it's believed that
> > /sys/devices/virtual/memory_tiering/memory_tierN/nodelist
> > are intended to show nodes in memory_tierN. But IMHO, it's not enough,
> > especially for the preferred demotion node(s).
> >
> > Currently, when a demotion occurs, it will prioritize selecting a node
> > from the preferred nodes as the destination node for the demotion. If
> > the preferred nodes does not meet the requirements, it will try from
> > all the lower memory tier nodes until it finds a suitable demotion
> > destination node or ultimately fails.
> >
> > However, currently it only lists the nodes of each tier. If the
> > administrators want to know all the possible demotion destinations for
> > a given node, they need to calculate it themselves:
> > Step 1, find the memory tier where the given node is located Step 2,
> > list all nodes under all its lower tiers
> >
> > It will be even more difficult to know the preferred nodes which
> > depend on more factors, distance etc. For the following example, we
> > may have 6 nodes splitting into three memory tiers.
> >
> > For emulated hmat numa topology example:
> >> $ numactl -H
> >> available: 6 nodes (0-5)
> >> node 0 cpus: 0
> >> node 0 size: 1974 MB
> >> node 0 free: 1767 MB
> >> node 1 cpus: 1
> >> node 1 size: 1694 MB
> >> node 1 free: 1454 MB
> >> node 2 cpus:
> >> node 2 size: 896 MB
> >> node 2 free: 896 MB
> >> node 3 cpus:
> >> node 3 size: 896 MB
> >> node 3 free: 896 MB
> >> node 4 cpus:
> >> node 4 size: 896 MB
> >> node 4 free: 896 MB
> >> node 5 cpus:
> >> node 5 size: 896 MB
> >> node 5 free: 896 MB
> >> node distances:
> >> node   0   1   2   3   4   5
> >> 0:  10  31  21  41  21  41
> >> 1:  31  10  41  21  41  21
> >> 2:  21  41  10  51  21  51
> >> 3:  31  21  51  10  51  21
> >> 4:  21  41  21  51  10  51
> >> 5:  31  21  51  21  51  10
> >> $ cat memory_tier4/nodelist
> >> 0-1
> >> $ cat memory_tier12/nodelist
> >> 2,5
> >> $ cat memory_tier54/nodelist
> >> 3-4
> >
> > For above topology, memory-tier will build the demotion path for each
> > node like this:
> > node[0].preferred = 2
> > node[0].demotion_targets = 2-5
> > node[1].preferred = 5
> > node[1].demotion_targets = 2-5
> > node[2].preferred = 4
> > node[2].demotion_targets = 3-4
> > node[3].preferred = <empty>
> > node[3].demotion_targets = <empty>
> > node[4].preferred = <empty>
> > node[4].demotion_targets = <empty>
> > node[5].preferred = 3
> > node[5].demotion_targets = 3-4
> >
> > But this demotion path is not explicitly known to administrator. And
> > with the feedback from our customers, they also think it is helpful to
> > know demotion path built by kernel to understand the demotion
> > behaviors.
> >
> > So i think we should have 2 new interfaces for each node:
> >
> > /sys/devices/system/node/nodeN/demotion_allowed_nodes
> > /sys/devices/system/node/nodeN/demotion_preferred_nodes
> >
> > I value your opinion, and I'd like to know what you think about...
> 
> Per my understanding, we will not expose everything inside kernel to user
> space.  For page placement in a tiered memory system, demotion is just a part
> of the story.  For example, if the DRAM of a system becomes full, new page
> allocation will fall back to the CXL memory.  Have we exposed the default page
> allocation fallback order to user space?

In extreme terms, users want to analyze all the memory behaviors of memory management
while executing their workload, and want to trace ALL of them if possible.
Of course, that is impossible because of the heavy load it would cause, so users want other ways as
a compromise. Our request, the demotion target information, is just one of them.

In my impression, users worry about the impact of the CXL memory device on their workload, 
and want to have a way to understand the impact.
If they learn that there is no information available to relieve their anxiety, they may avoid buying CXL memory.

In addition, our support team also needs to have clues to solve users' performance problems. 
Even when new page allocations fall back to the CXL memory, we need to be able to explain why that
happens, as a matter of accountability.

> 
> All in all, in my opinion, we only expose as little as possible to user space
> because we need to maintain the ABI for ever.

I understand that our proposal raises a compatibility problem, and that the kernel may
change its logic in the future. This is a tug-of-war between kernel developers
and users or support engineers. I suppose it occurs often, in many places...

Hmm... I hope there is a new idea to solve this situation even if our proposal is rejected..
Anyone?

Thanks,
----
Yasunori Goto

> 
> --
> Best Regards,
> Huang, Ying
> 
> >
> > On 02/11/2023 11:17, Huang, Ying wrote:
> >> Li Zhijian <lizhijian@fujitsu.com> writes:
> >>
> >>> It shows the demotion target nodes of a node. Export this
> >>> information to user directly.
> >>>
> >>> Below is an example where node0 node1 are DRAM, node3 is a PMEM
> node.
> >>> - Before PMEM is online, no demotion_nodes for node0 and node1.
> >>> $ cat /sys/devices/system/node/node0/demotion_nodes
> >>>   <show nothing>
> >>> - After node3 is online as kmem
> >>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 &&
> >>> daxctl online-memory dax0.0 [
> >>>    {
> >>>      "chardev":"dax0.0",
> >>>      "size":1054867456,
> >>>      "target_node":3,
> >>>      "align":2097152,
> >>>      "mode":"system-ram",
> >>>      "online_memblocks":0,
> >>>      "total_memblocks":7
> >>>    }
> >>> ]
> >>> $ cat /sys/devices/system/node/node0/demotion_nodes
> >>> 3
> >>> $ cat /sys/devices/system/node/node1/demotion_nodes
> >>> 3
> >>> $ cat /sys/devices/system/node/node3/demotion_nodes
> >>>   <show nothing>
> >> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
> >> already.  A node in a higher tier can demote to any node in the lower
> >> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
> >> --
> >> Best Regards,
> >> Huang, Ying
> >>
> >>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> >>> ---
> >>>   drivers/base/node.c          | 13 +++++++++++++
> >>>   include/linux/memory-tiers.h |  6 ++++++
> >>>   mm/memory-tiers.c            |  8 ++++++++
> >>>   3 files changed, 27 insertions(+)
> >>>
> >>> diff --git a/drivers/base/node.c b/drivers/base/node.c index
> >>> 493d533f8375..27e8502548a7 100644
> >>> --- a/drivers/base/node.c
> >>> +++ b/drivers/base/node.c
> >>> @@ -7,6 +7,7 @@
> >>>   #include <linux/init.h>
> >>>   #include <linux/mm.h>
> >>>   #include <linux/memory.h>
> >>> +#include <linux/memory-tiers.h>
> >>>   #include <linux/vmstat.h>
> >>>   #include <linux/notifier.h>
> >>>   #include <linux/node.h>
> >>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device
> *dev,
> >>>   }
> >>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
> >>>   +static ssize_t demotion_nodes_show(struct device *dev,
> >>> +			     struct device_attribute *attr, char *buf) {
> >>> +	int ret;
> >>> +	nodemask_t nmask = next_demotion_nodes(dev->id);
> >>> +
> >>> +	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
> >>> +	return ret;
> >>> +}
> >>> +static DEVICE_ATTR_RO(demotion_nodes);
> >>> +
> >>>   static struct attribute *node_dev_attrs[] = {
> >>>   	&dev_attr_meminfo.attr,
> >>>   	&dev_attr_numastat.attr,
> >>>   	&dev_attr_distance.attr,
> >>>   	&dev_attr_vmstat.attr,
> >>> +	&dev_attr_demotion_nodes.attr,
> >>>   	NULL
> >>>   };
> >>>   diff --git a/include/linux/memory-tiers.h
> >>> b/include/linux/memory-tiers.h index 437441cdf78f..8eb04923f965
> >>> 100644
> >>> --- a/include/linux/memory-tiers.h
> >>> +++ b/include/linux/memory-tiers.h
> >>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct
> memory_dev_type *default_type);
> >>>   void clear_node_memory_type(int node, struct memory_dev_type
> *memtype);
> >>>   #ifdef CONFIG_MIGRATION
> >>>   int next_demotion_node(int node);
> >>> +nodemask_t next_demotion_nodes(int node);
> >>>   void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t
> *targets);
> >>>   bool node_is_toptier(int node);
> >>>   #else
> >>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
> >>>   	return NUMA_NO_NODE;
> >>>   }
> >>>   +static inline next_demotion_nodes next_demotion_nodes(int node)
> >>> +{
> >>> +	return NODE_MASK_NONE;
> >>> +}
> >>> +
> >>>   static inline void node_get_allowed_targets(pg_data_t *pgdat,
> nodemask_t *targets)
> >>>   {
> >>>   	*targets = NODE_MASK_NONE;
> >>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index
> >>> 37a4f59d9585..90047f37d98a 100644
> >>> --- a/mm/memory-tiers.c
> >>> +++ b/mm/memory-tiers.c
> >>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat,
> nodemask_t *targets)
> >>>   	rcu_read_unlock();
> >>>   }
> >>>   +nodemask_t next_demotion_nodes(int node)
> >>> +{
> >>> +	if (!node_demotion)
> >>> +		return NODE_MASK_NONE;
> >>> +
> >>> +	return node_demotion[node].preferred; }
> >>> +
> >>>   /**
> >>>    * next_demotion_node() - Get the next node in the demotion path
> >>>    * @node: The starting node to lookup the next node
Huang, Ying Jan. 31, 2024, 6:52 a.m. UTC | #10
"Yasunori Gotou (Fujitsu)" <y-goto@fujitsu.com> writes:

> Hello,
>
>> Li Zhijian <lizhijian@fujitsu.com> writes:
>> 
>> > Hi Ying
>> >
>> > I need to pick up this thread/patch again.
>> >
>> >> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>> >> already.  A node in a higher tier can demote to any node in the lower
>> >> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>> >>
>> >
>> > Yes, it's believed that
>> > /sys/devices/virtual/memory_tiering/memory_tierN/nodelist
>> > are intended to show nodes in memory_tierN. But IMHO, it's not enough,
>> > especially for the preferred demotion node(s).
>> >
>> > Currently, when a demotion occurs, it will prioritize selecting a node
>> > from the preferred nodes as the destination node for the demotion. If
>> > the preferred nodes does not meet the requirements, it will try from
>> > all the lower memory tier nodes until it finds a suitable demotion
>> > destination node or ultimately fails.
>> >
>> > However, currently it only lists the nodes of each tier. If the
>> > administrators want to know all the possible demotion destinations for
>> > a given node, they need to calculate it themselves:
>> > Step 1, find the memory tier where the given node is located Step 2,
>> > list all nodes under all its lower tiers
>> >
>> > It will be even more difficult to know the preferred nodes which
>> > depend on more factors, distance etc. For the following example, we
>> > may have 6 nodes splitting into three memory tiers.
>> >
>> > For emulated hmat numa topology example:
>> >> $ numactl -H
>> >> available: 6 nodes (0-5)
>> >> node 0 cpus: 0
>> >> node 0 size: 1974 MB
>> >> node 0 free: 1767 MB
>> >> node 1 cpus: 1
>> >> node 1 size: 1694 MB
>> >> node 1 free: 1454 MB
>> >> node 2 cpus:
>> >> node 2 size: 896 MB
>> >> node 2 free: 896 MB
>> >> node 3 cpus:
>> >> node 3 size: 896 MB
>> >> node 3 free: 896 MB
>> >> node 4 cpus:
>> >> node 4 size: 896 MB
>> >> node 4 free: 896 MB
>> >> node 5 cpus:
>> >> node 5 size: 896 MB
>> >> node 5 free: 896 MB
>> >> node distances:
>> >> node   0   1   2   3   4   5
>> >> 0:  10  31  21  41  21  41
>> >> 1:  31  10  41  21  41  21
>> >> 2:  21  41  10  51  21  51
>> >> 3:  31  21  51  10  51  21
>> >> 4:  21  41  21  51  10  51
>> >> 5:  31  21  51  21  51  10
>> >> $ cat memory_tier4/nodelist
>> >> 0-1
>> >> $ cat memory_tier12/nodelist
>> >> 2,5
>> >> $ cat memory_tier54/nodelist
>> >> 3-4
>> >
>> > For above topology, memory-tier will build the demotion path for each
>> > node like this:
>> > node[0].preferred = 2
>> > node[0].demotion_targets = 2-5
>> > node[1].preferred = 5
>> > node[1].demotion_targets = 2-5
>> > node[2].preferred = 4
>> > node[2].demotion_targets = 3-4
>> > node[3].preferred = <empty>
>> > node[3].demotion_targets = <empty>
>> > node[4].preferred = <empty>
>> > node[4].demotion_targets = <empty>
>> > node[5].preferred = 3
>> > node[5].demotion_targets = 3-4
>> >
>> > But this demotion path is not explicitly known to administrator. And
>> > with the feedback from our customers, they also think it is helpful to
>> > know demotion path built by kernel to understand the demotion
>> > behaviors.
>> >
>> > So i think we should have 2 new interfaces for each node:
>> >
>> > /sys/devices/system/node/nodeN/demotion_allowed_nodes
>> > /sys/devices/system/node/nodeN/demotion_preferred_nodes
>> >
>> > I value your opinion, and I'd like to know what you think about...
>> 
>> Per my understanding, we will not expose everything inside kernel to user
>> space.  For page placement in a tiered memory system, demotion is just a part
>> of the story.  For example, if the DRAM of a system becomes full, new page
>> allocation will fall back to the CXL memory.  Have we exposed the default page
>> allocation fallback order to user space?
>
> In extreme terms, users want to analyze all the memory behaviors of memory management
> while executing their workload, and want to trace ALL of them if possible.
> Of course, it is impossible due to the heavy load, then users want to have other ways as
> a compromise. Our request, the demotion target information, is just one of them.
>
> In my impression, users worry about the impact of the CXL memory device on their workload, 
> and want to have a way to understand the impact.
> If they know there is no information to remove their anxious, they may avoid to buy CXL memory.
>
> In addition, our support team also needs to have clues to solve users' performance problems. 
> Even if new page allocation will fall back to the CXL memory, we need to explain why it would
> happen as accountability.

I guess

/proc/<PID>/numa_maps
/sys/fs/cgroup/<CGNAME>/memory.numa_stat

may help to understand system behavior.

--
Best Regards,
Huang, Ying

>> 
>> All in all, in my opinion, we only expose as little as possible to user space
>> because we need to maintain the ABI for ever.
>
> I can understand there is a compatibility problem by our propose, and kernel may
> change its logic in future. This is a tug-of-war situation between kernel developers
> and users or support engineers. I suppose It often occurs in many place...
>
> Hmm... I hope there is a new idea to solve this situation even if our proposal is rejected..
> Anyone?
>
> Thanks,
> ----
> Yasunori Goto
>
>> 
>> --
>> Best Regards,
>> Huang, Ying
>> 
>> >
>> > On 02/11/2023 11:17, Huang, Ying wrote:
>> >> Li Zhijian <lizhijian@fujitsu.com> writes:
>> >>
>> >>> It shows the demotion target nodes of a node. Export this
>> >>> information to user directly.
>> >>>
>> >>> Below is an example where node0 node1 are DRAM, node3 is a PMEM
>> node.
>> >>> - Before PMEM is online, no demotion_nodes for node0 and node1.
>> >>> $ cat /sys/devices/system/node/node0/demotion_nodes
>> >>>   <show nothing>
>> >>> - After node3 is online as kmem
>> >>> $ daxctl reconfigure-device --mode=system-ram --no-online dax0.0 &&
>> >>> daxctl online-memory dax0.0 [
>> >>>    {
>> >>>      "chardev":"dax0.0",
>> >>>      "size":1054867456,
>> >>>      "target_node":3,
>> >>>      "align":2097152,
>> >>>      "mode":"system-ram",
>> >>>      "online_memblocks":0,
>> >>>      "total_memblocks":7
>> >>>    }
>> >>> ]
>> >>> $ cat /sys/devices/system/node/node0/demotion_nodes
>> >>> 3
>> >>> $ cat /sys/devices/system/node/node1/demotion_nodes
>> >>> 3
>> >>> $ cat /sys/devices/system/node/node3/demotion_nodes
>> >>>   <show nothing>
>> >> We have /sys/devices/virtual/memory_tiering/memory_tier*/nodelist
>> >> already.  A node in a higher tier can demote to any node in the lower
>> >> tiers.  What's more need to be displayed in nodeX/demotion_nodes?
>> >> --
>> >> Best Regards,
>> >> Huang, Ying
>> >>
>> >>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> >>> ---
>> >>>   drivers/base/node.c          | 13 +++++++++++++
>> >>>   include/linux/memory-tiers.h |  6 ++++++
>> >>>   mm/memory-tiers.c            |  8 ++++++++
>> >>>   3 files changed, 27 insertions(+)
>> >>>
>> >>> diff --git a/drivers/base/node.c b/drivers/base/node.c index
>> >>> 493d533f8375..27e8502548a7 100644
>> >>> --- a/drivers/base/node.c
>> >>> +++ b/drivers/base/node.c
>> >>> @@ -7,6 +7,7 @@
>> >>>   #include <linux/init.h>
>> >>>   #include <linux/mm.h>
>> >>>   #include <linux/memory.h>
>> >>> +#include <linux/memory-tiers.h>
>> >>>   #include <linux/vmstat.h>
>> >>>   #include <linux/notifier.h>
>> >>>   #include <linux/node.h>
>> >>> @@ -569,11 +570,23 @@ static ssize_t node_read_distance(struct device
>> *dev,
>> >>>   }
>> >>>   static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
>> >>>   +static ssize_t demotion_nodes_show(struct device *dev,
>> >>> +			     struct device_attribute *attr, char *buf) {
>> >>> +	int ret;
>> >>> +	nodemask_t nmask = next_demotion_nodes(dev->id);
>> >>> +
>> >>> +	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
>> >>> +	return ret;
>> >>> +}
>> >>> +static DEVICE_ATTR_RO(demotion_nodes);
>> >>> +
>> >>>   static struct attribute *node_dev_attrs[] = {
>> >>>   	&dev_attr_meminfo.attr,
>> >>>   	&dev_attr_numastat.attr,
>> >>>   	&dev_attr_distance.attr,
>> >>>   	&dev_attr_vmstat.attr,
>> >>> +	&dev_attr_demotion_nodes.attr,
>> >>>   	NULL
>> >>>   };
>> >>>   diff --git a/include/linux/memory-tiers.h
>> >>> b/include/linux/memory-tiers.h index 437441cdf78f..8eb04923f965
>> >>> 100644
>> >>> --- a/include/linux/memory-tiers.h
>> >>> +++ b/include/linux/memory-tiers.h
>> >>> @@ -38,6 +38,7 @@ void init_node_memory_type(int node, struct
>> memory_dev_type *default_type);
>> >>>   void clear_node_memory_type(int node, struct memory_dev_type
>> *memtype);
>> >>>   #ifdef CONFIG_MIGRATION
>> >>>   int next_demotion_node(int node);
>> >>> +nodemask_t next_demotion_nodes(int node);
>> >>>   void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t
>> *targets);
>> >>>   bool node_is_toptier(int node);
>> >>>   #else
>> >>> @@ -46,6 +47,11 @@ static inline int next_demotion_node(int node)
>> >>>   	return NUMA_NO_NODE;
>> >>>   }
>> >>>   +static inline next_demotion_nodes next_demotion_nodes(int node)
>> >>> +{
>> >>> +	return NODE_MASK_NONE;
>> >>> +}
>> >>> +
>> >>>   static inline void node_get_allowed_targets(pg_data_t *pgdat,
>> nodemask_t *targets)
>> >>>   {
>> >>>   	*targets = NODE_MASK_NONE;
>> >>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index
>> >>> 37a4f59d9585..90047f37d98a 100644
>> >>> --- a/mm/memory-tiers.c
>> >>> +++ b/mm/memory-tiers.c
>> >>> @@ -282,6 +282,14 @@ void node_get_allowed_targets(pg_data_t *pgdat,
>> nodemask_t *targets)
>> >>>   	rcu_read_unlock();
>> >>>   }
>> >>>   +nodemask_t next_demotion_nodes(int node)
>> >>> +{
>> >>> +	if (!node_demotion)
>> >>> +		return NODE_MASK_NONE;
>> >>> +
>> >>> +	return node_demotion[node].preferred; }
>> >>> +
>> >>>   /**
>> >>>    * next_demotion_node() - Get the next node in the demotion path
>> >>>    * @node: The starting node to lookup the next node
Li Zhijian Feb. 2, 2024, 7:43 a.m. UTC | #11
On 31/01/2024 11:17, Li Zhijian wrote:
>>> node[0].preferred = 2
>>> node[0].demotion_targets = 2-5
>>> node[1].preferred = 5
>>> node[1].demotion_targets = 2-5
>>> node[2].preferred = 4
>>> node[2].demotion_targets = 3-4
>>> node[3].preferred = <empty>
>>> node[3].demotion_targets = <empty>
>>> node[4].preferred = <empty>
>>> node[4].demotion_targets = <empty>
>>> node[5].preferred = 3
>>> node[5].demotion_targets = 3-4
>>>
>>> But this demotion path is not explicitly known to administrator. And with
>>> the feedback from our customers, they also think it is helpful to know demotion
>>> path built by kernel to understand the demotion behaviors.
>>>
>>> So i think we should have 2 new interfaces for each node:
>>>

>>> /sys/devices/system/node/nodeN/demotion_allowed_nodes
>>> /sys/devices/system/node/nodeN/demotion_preferred_nodes
>>>
>>> I value your opinion, and I'd like to know what you think about...
>>
>> Per my understanding, we will not expose everything inside kernel to
>> user space.  For page placement in a tiered memory system, demotion is
>> just a part of the story.  For example, if the DRAM of a system becomes
>> full, new page allocation will fall back to the CXL memory.  Have we
>> exposed the default page allocation fallback order to user space?


Back to our initial requirement:
"When demotion is enabled, what is the demotion path, especially the preferred node?
Is it consistent with the administrator's expectations?"

There seems to be no direct answer. But the kernel already knows
this information, so IMHO exposing it to users is not a bad choice.

This information can help them adjust/tune the machine before actually
deploying their workloads.

If the sysfs approach isn't good enough, is it possible to have another, more
user-friendly way to convey this information? For example, as is done for the
allocation fallback order, could we simply print it to dmesg?


Thanks
Zhijian


> 
> Good question, I have no answer yet, but I think we can get the fallback order
> from the dmesg now.
> 
> The further action for us is that we will also try improve the use space tool,
> such as numactl to show the demotion path with the help of this exposed information.
Huang, Ying Feb. 2, 2024, 8:19 a.m. UTC | #12
"Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:

> On 31/01/2024 11:17, Li Zhijian wrote:
>>>> node[0].preferred = 2
>>>> node[0].demotion_targets = 2-5
>>>> node[1].preferred = 5
>>>> node[1].demotion_targets = 2-5
>>>> node[2].preferred = 4
>>>> node[2].demotion_targets = 3-4
>>>> node[3].preferred = <empty>
>>>> node[3].demotion_targets = <empty>
>>>> node[4].preferred = <empty>
>>>> node[4].demotion_targets = <empty>
>>>> node[5].preferred = 3
>>>> node[5].demotion_targets = 3-4
>>>>
>>>> But this demotion path is not explicitly known to administrator. And with
>>>> the feedback from our customers, they also think it is helpful to know demotion
>>>> path built by kernel to understand the demotion behaviors.
>>>>
>>>> So i think we should have 2 new interfaces for each node:
>>>>
>
>>>> /sys/devices/system/node/nodeN/demotion_allowed_nodes
>>>> /sys/devices/system/node/nodeN/demotion_preferred_nodes
>>>>
>>>> I value your opinion, and I'd like to know what you think about...
>>>
>>> Per my understanding, we will not expose everything inside kernel to
>>> user space.  For page placement in a tiered memory system, demotion is
>>> just a part of the story.  For example, if the DRAM of a system becomes
>>> full, new page allocation will fall back to the CXL memory.  Have we
>>> exposed the default page allocation fallback order to user space?
>
>
> Back to our initial requirement:
> When demotion is enabled, what's the demotion path, especially the preferred node?
> are they consistent with administrator's expectations?"
>
> It seems there is no a direct answer. But actually, kernel have already known
> this information, IMHO, exposing them to users is not a bad choice.
>
> This information is able to help them adjust/tune the machine before really
> deploy their workloads.
>
> If the sysfs approach isn't better enough, is it possible to have another more
> user-friendly way to convey this information? like the allocation fallback order does,
> simply print them to dmesg?

I have no objection to printing some demotion information in dmesg.

--
Best Regards,
Huang, Ying

>
>> 
>> Good question, I have no answer yet, but I think we can get the fallback order
>> from the dmesg now.
>> 
>> The further action for us is that we will also try improve the use space tool,
>> such as numactl to show the demotion path with the help of this exposed information.
Li Zhijian Feb. 5, 2024, 7:31 a.m. UTC | #13
On 02/02/2024 16:19, Huang, Ying wrote:
>> Back to our initial requirement:
>> When demotion is enabled, what's the demotion path, especially the preferred node?
>> are they consistent with administrator's expectations?"
>>
>> It seems there is no a direct answer. But actually, kernel have already known
>> this information, IMHO, exposing them to users is not a bad choice.
>>
>> This information is able to help them adjust/tune the machine before really
>> deploy their workloads.
>>
>> If the sysfs approach isn't better enough, is it possible to have another more
>> user-friendly way to convey this information? like the allocation fallback order does,
>> simply print them to dmesg?
> I have no object to print some demotion information in dmesg.
> 

Thank you for sharing your thoughts and feedback on this.
I will attempt to do so.


Thanks
Zhijian
diff mbox series

Patch

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 493d533f8375..27e8502548a7 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -7,6 +7,7 @@ 
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/memory.h>
+#include <linux/memory-tiers.h>
 #include <linux/vmstat.h>
 #include <linux/notifier.h>
 #include <linux/node.h>
@@ -569,11 +570,23 @@  static ssize_t node_read_distance(struct device *dev,
 }
 static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);
 
+static ssize_t demotion_nodes_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	int ret;
+	nodemask_t nmask = next_demotion_nodes(dev->id);	/* dev->id is used as the node number */
+
+	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));	/* mask printed as a node list; empty mask prints only the newline */
+	return ret;	/* whatever sysfs_emit() returned */
+}
+static DEVICE_ATTR_RO(demotion_nodes);	/* read-only attribute backed by demotion_nodes_show() */
+
 static struct attribute *node_dev_attrs[] = {
 	&dev_attr_meminfo.attr,
 	&dev_attr_numastat.attr,
 	&dev_attr_distance.attr,
 	&dev_attr_vmstat.attr,
+	&dev_attr_demotion_nodes.attr,
 	NULL
 };
 
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..8eb04923f965 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -38,6 +38,7 @@  void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
+nodemask_t next_demotion_nodes(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
 bool node_is_toptier(int node);
 #else
@@ -46,6 +47,11 @@  static inline int next_demotion_node(int node)
 	return NUMA_NO_NODE;
 }
 
+static inline nodemask_t next_demotion_nodes(int node)
+{
+	return NODE_MASK_NONE;	/* !CONFIG_MIGRATION stub: no demotion targets */
+}
+
 static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
 {
 	*targets = NODE_MASK_NONE;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 37a4f59d9585..90047f37d98a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -282,6 +282,14 @@  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
 	rcu_read_unlock();
 }
 
+nodemask_t next_demotion_nodes(int node)
+{
+	if (!node_demotion)	/* no demotion table: report an empty mask */
+		return NODE_MASK_NONE;
+
+	return node_demotion[node].preferred;	/* preferred targets only. NOTE(review): read without rcu_read_lock() - confirm this is safe */
+}
+
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node