diff mbox series

[RFC,3/4] mm/vmstat: rename pgdemote_* to pgdemote_dst_* and add pgdemote_src_*

Message ID 20231102025648.1285477-4-lizhijian@fujitsu.com (mailing list archive)
State New
Headers show
Series [RFC,1/4] drivers/base/node: Add demotion_nodes sys infterface | expand

Commit Message

Zhijian Li (Fujitsu) Nov. 2, 2023, 2:56 a.m. UTC
pgdemote_src_*: pages demoted from this node.
pgdemote_dst_*: pages demoted to this node.

So that we are able to know their demotion per-node stats by checking this.

In the environment, node0 and node1 are DRAM, node3 is PMEM.

Global stats:
$ grep -E 'demote' /proc/vmstat
pgdemote_src_kswapd 130155
pgdemote_src_direct 113497
pgdemote_src_khugepaged 0
pgdemote_dst_kswapd 130155
pgdemote_dst_direct 113497
pgdemote_dst_khugepaged 0

Per-node stats:
$ grep demote /sys/devices/system/node/node0/vmstat
pgdemote_src_kswapd 68454
pgdemote_src_direct 83431
pgdemote_src_khugepaged 0
pgdemote_dst_kswapd 0
pgdemote_dst_direct 0
pgdemote_dst_khugepaged 0

$ grep demote /sys/devices/system/node/node1/vmstat
pgdemote_src_kswapd 185834
pgdemote_src_direct 30066
pgdemote_src_khugepaged 0
pgdemote_dst_kswapd 0
pgdemote_dst_direct 0
pgdemote_dst_khugepaged 0

$ grep demote /sys/devices/system/node/node3/vmstat
pgdemote_src_kswapd 0
pgdemote_src_direct 0
pgdemote_src_khugepaged 0
pgdemote_dst_kswapd 254288
pgdemote_dst_direct 113497
pgdemote_dst_khugepaged 0

From above stats, we know node3 is the demotion destination which one
the node0 and node1 will demote to.

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
RFC: their names are open to discussion, maybe pgdemote_from/to_*
Another defect of this patch is that, SUM(pgdemote_src_*) is always same
as SUM(pgdemote_dst_*) in the global stats, shall we hide one of them.
---
 include/linux/mmzone.h |  9 ++++++---
 mm/vmscan.c            | 13 ++++++++++---
 mm/vmstat.c            |  9 ++++++---
 3 files changed, 22 insertions(+), 9 deletions(-)

Comments

Huang, Ying Nov. 2, 2023, 5:45 a.m. UTC | #1
Li Zhijian <lizhijian@fujitsu.com> writes:

> pgdemote_src_*: pages demoted from this node.
> pgdemote_dst_*: pages demoted to this node.
>
> So that we are able to know their demotion per-node stats by checking this.
>
> In the environment, node0 and node1 are DRAM, node3 is PMEM.
>
> Global stats:
> $ grep -E 'demote' /proc/vmstat
> pgdemote_src_kswapd 130155
> pgdemote_src_direct 113497
> pgdemote_src_khugepaged 0
> pgdemote_dst_kswapd 130155
> pgdemote_dst_direct 113497
> pgdemote_dst_khugepaged 0
>
> Per-node stats:
> $ grep demote /sys/devices/system/node/node0/vmstat
> pgdemote_src_kswapd 68454
> pgdemote_src_direct 83431
> pgdemote_src_khugepaged 0
> pgdemote_dst_kswapd 0
> pgdemote_dst_direct 0
> pgdemote_dst_khugepaged 0
>
> $ grep demote /sys/devices/system/node/node1/vmstat
> pgdemote_src_kswapd 185834
> pgdemote_src_direct 30066
> pgdemote_src_khugepaged 0
> pgdemote_dst_kswapd 0
> pgdemote_dst_direct 0
> pgdemote_dst_khugepaged 0
>
> $ grep demote /sys/devices/system/node/node3/vmstat
> pgdemote_src_kswapd 0
> pgdemote_src_direct 0
> pgdemote_src_khugepaged 0
> pgdemote_dst_kswapd 254288
> pgdemote_dst_direct 113497
> pgdemote_dst_khugepaged 0
>
> From above stats, we know node3 is the demotion destination which one
> the node0 and node1 will demote to.

Why do we need these information?  Do you have some use case?

--
Best Regards,
Huang, Ying

> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> ---
> RFC: their names are open to discussion, maybe pgdemote_from/to_*
> Another defect of this patch is that, SUM(pgdemote_src_*) is always same
> as SUM(pgdemote_dst_*) in the global stats, shall we hide one of them.
> ---
>  include/linux/mmzone.h |  9 ++++++---
>  mm/vmscan.c            | 13 ++++++++++---
>  mm/vmstat.c            |  9 ++++++---
>  3 files changed, 22 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index ad0309eea850..a6140d894bec 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -207,9 +207,12 @@ enum node_stat_item {
>  	PGPROMOTE_SUCCESS,	/* promote successfully */
>  	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
>  	/* PGDEMOTE_*: pages demoted */
> -	PGDEMOTE_KSWAPD,
> -	PGDEMOTE_DIRECT,
> -	PGDEMOTE_KHUGEPAGED,
> +	PGDEMOTE_SRC_KSWAPD,
> +	PGDEMOTE_SRC_DIRECT,
> +	PGDEMOTE_SRC_KHUGEPAGED,
> +	PGDEMOTE_DST_KSWAPD,
> +	PGDEMOTE_DST_DIRECT,
> +	PGDEMOTE_DST_KHUGEPAGED,
>  #endif
>  	NR_VM_NODE_STAT_ITEMS
>  };
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2f1fb4ec3235..55d2287d7150 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1111,13 +1111,18 @@ void drop_slab(void)
>  static int reclaimer_offset(void)
>  {
>  	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
> -			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
> +			PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD);
>  	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>  			PGSCAN_DIRECT - PGSCAN_KSWAPD);
>  	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
> -			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
> +			PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD);
>  	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>  			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
> +	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD !=
> +			PGDEMOTE_DST_DIRECT - PGDEMOTE_DST_KSWAPD);
> +	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD !=
> +			PGDEMOTE_DST_KHUGEPAGED - PGDEMOTE_DST_KSWAPD);
> +
>  
>  	if (current_is_kswapd())
>  		return 0;
> @@ -1678,8 +1683,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>  		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>  		      &nr_succeeded);
>  
> +	mod_node_page_state(pgdat,
> +		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(), nr_succeeded);
>  	mod_node_page_state(NODE_DATA(target_nid),
> -		    PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(), nr_succeeded);
>  
>  	return nr_succeeded;
>  }
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index f141c48c39e4..63f106a5e008 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1244,9 +1244,12 @@ const char * const vmstat_text[] = {
>  #ifdef CONFIG_NUMA_BALANCING
>  	"pgpromote_success",
>  	"pgpromote_candidate",
> -	"pgdemote_kswapd",
> -	"pgdemote_direct",
> -	"pgdemote_khugepaged",
> +	"pgdemote_src_kswapd",
> +	"pgdemote_src_direct",
> +	"pgdemote_src_khugepaged",
> +	"pgdemote_dst_kswapd",
> +	"pgdemote_dst_direct",
> +	"pgdemote_dst_khugepaged",
>  #endif
>  
>  	/* enum writeback_stat_item counters */
Zhijian Li (Fujitsu) Nov. 2, 2023, 6:34 a.m. UTC | #2
On 02/11/2023 13:45, Huang, Ying wrote:
> Li Zhijian <lizhijian@fujitsu.com> writes:
> 
>> pgdemote_src_*: pages demoted from this node.
>> pgdemote_dst_*: pages demoted to this node.
>>
>> So that we are able to know their demotion per-node stats by checking this.
>>
>> In the environment, node0 and node1 are DRAM, node3 is PMEM.
>>
>> Global stats:
>> $ grep -E 'demote' /proc/vmstat
>> pgdemote_src_kswapd 130155
>> pgdemote_src_direct 113497
>> pgdemote_src_khugepaged 0
>> pgdemote_dst_kswapd 130155
>> pgdemote_dst_direct 113497
>> pgdemote_dst_khugepaged 0
>>
>> Per-node stats:
>> $ grep demote /sys/devices/system/node/node0/vmstat
>> pgdemote_src_kswapd 68454
>> pgdemote_src_direct 83431
>> pgdemote_src_khugepaged 0
>> pgdemote_dst_kswapd 0
>> pgdemote_dst_direct 0
>> pgdemote_dst_khugepaged 0
>>
>> $ grep demote /sys/devices/system/node/node1/vmstat
>> pgdemote_src_kswapd 185834
>> pgdemote_src_direct 30066
>> pgdemote_src_khugepaged 0
>> pgdemote_dst_kswapd 0
>> pgdemote_dst_direct 0
>> pgdemote_dst_khugepaged 0
>>
>> $ grep demote /sys/devices/system/node/node3/vmstat
>> pgdemote_src_kswapd 0
>> pgdemote_src_direct 0
>> pgdemote_src_khugepaged 0
>> pgdemote_dst_kswapd 254288
>> pgdemote_dst_direct 113497
>> pgdemote_dst_khugepaged 0
>>
>>  From above stats, we know node3 is the demotion destination which one
>> the node0 and node1 will demote to.
> 
> Why do we need these information?  Do you have some use case?

I recall our customers have mentioned that they want to know how much the memory is demoted
to the CXL memory device in a specific period.


>>>   	mod_node_page_state(NODE_DATA(target_nid),
>>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
>>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(), nr_succeeded);

But if the *target_nid* is only indicate the preferred node, this accounting maybe not accurate.


Thanks
Zhijian

> 
> --
> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> ---
>> RFC: their names are open to discussion, maybe pgdemote_from/to_*
>> Another defect of this patch is that, SUM(pgdemote_src_*) is always same
>> as SUM(pgdemote_dst_*) in the global stats, shall we hide one of them.
>> ---
>>   include/linux/mmzone.h |  9 ++++++---
>>   mm/vmscan.c            | 13 ++++++++++---
>>   mm/vmstat.c            |  9 ++++++---
>>   3 files changed, 22 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index ad0309eea850..a6140d894bec 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -207,9 +207,12 @@ enum node_stat_item {
>>   	PGPROMOTE_SUCCESS,	/* promote successfully */
>>   	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
>>   	/* PGDEMOTE_*: pages demoted */
>> -	PGDEMOTE_KSWAPD,
>> -	PGDEMOTE_DIRECT,
>> -	PGDEMOTE_KHUGEPAGED,
>> +	PGDEMOTE_SRC_KSWAPD,
>> +	PGDEMOTE_SRC_DIRECT,
>> +	PGDEMOTE_SRC_KHUGEPAGED,
>> +	PGDEMOTE_DST_KSWAPD,
>> +	PGDEMOTE_DST_DIRECT,
>> +	PGDEMOTE_DST_KHUGEPAGED,
>>   #endif
>>   	NR_VM_NODE_STAT_ITEMS
>>   };
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 2f1fb4ec3235..55d2287d7150 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1111,13 +1111,18 @@ void drop_slab(void)
>>   static int reclaimer_offset(void)
>>   {
>>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>> -			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
>> +			PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD);
>>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>>   			PGSCAN_DIRECT - PGSCAN_KSWAPD);
>>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>> -			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
>> +			PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD);
>>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>>   			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
>> +	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD !=
>> +			PGDEMOTE_DST_DIRECT - PGDEMOTE_DST_KSWAPD);
>> +	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD !=
>> +			PGDEMOTE_DST_KHUGEPAGED - PGDEMOTE_DST_KSWAPD);
>> +
>>   
>>   	if (current_is_kswapd())
>>   		return 0;
>> @@ -1678,8 +1683,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>>   		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>>   		      &nr_succeeded);
>>   
>> +	mod_node_page_state(pgdat,
>> +		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(), nr_succeeded);
>>   	mod_node_page_state(NODE_DATA(target_nid),
>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(), nr_succeeded);
>>   
>>   	return nr_succeeded;
>>   }
>> diff --git a/mm/vmstat.c b/mm/vmstat.c
>> index f141c48c39e4..63f106a5e008 100644
>> --- a/mm/vmstat.c
>> +++ b/mm/vmstat.c
>> @@ -1244,9 +1244,12 @@ const char * const vmstat_text[] = {
>>   #ifdef CONFIG_NUMA_BALANCING
>>   	"pgpromote_success",
>>   	"pgpromote_candidate",
>> -	"pgdemote_kswapd",
>> -	"pgdemote_direct",
>> -	"pgdemote_khugepaged",
>> +	"pgdemote_src_kswapd",
>> +	"pgdemote_src_direct",
>> +	"pgdemote_src_khugepaged",
>> +	"pgdemote_dst_kswapd",
>> +	"pgdemote_dst_direct",
>> +	"pgdemote_dst_khugepaged",
>>   #endif
>>   
>>   	/* enum writeback_stat_item counters */
Huang, Ying Nov. 2, 2023, 6:56 a.m. UTC | #3
"Zhijian Li (Fujitsu)" <lizhijian@fujitsu.com> writes:

> On 02/11/2023 13:45, Huang, Ying wrote:
>> Li Zhijian <lizhijian@fujitsu.com> writes:
>> 
>>> pgdemote_src_*: pages demoted from this node.
>>> pgdemote_dst_*: pages demoted to this node.
>>>
>>> So that we are able to know their demotion per-node stats by checking this.
>>>
>>> In the environment, node0 and node1 are DRAM, node3 is PMEM.
>>>
>>> Global stats:
>>> $ grep -E 'demote' /proc/vmstat
>>> pgdemote_src_kswapd 130155
>>> pgdemote_src_direct 113497
>>> pgdemote_src_khugepaged 0
>>> pgdemote_dst_kswapd 130155
>>> pgdemote_dst_direct 113497
>>> pgdemote_dst_khugepaged 0
>>>
>>> Per-node stats:
>>> $ grep demote /sys/devices/system/node/node0/vmstat
>>> pgdemote_src_kswapd 68454
>>> pgdemote_src_direct 83431
>>> pgdemote_src_khugepaged 0
>>> pgdemote_dst_kswapd 0
>>> pgdemote_dst_direct 0
>>> pgdemote_dst_khugepaged 0
>>>
>>> $ grep demote /sys/devices/system/node/node1/vmstat
>>> pgdemote_src_kswapd 185834
>>> pgdemote_src_direct 30066
>>> pgdemote_src_khugepaged 0
>>> pgdemote_dst_kswapd 0
>>> pgdemote_dst_direct 0
>>> pgdemote_dst_khugepaged 0
>>>
>>> $ grep demote /sys/devices/system/node/node3/vmstat
>>> pgdemote_src_kswapd 0
>>> pgdemote_src_direct 0
>>> pgdemote_src_khugepaged 0
>>> pgdemote_dst_kswapd 254288
>>> pgdemote_dst_direct 113497
>>> pgdemote_dst_khugepaged 0
>>>
>>>  From above stats, we know node3 is the demotion destination which one
>>> the node0 and node1 will demote to.
>> 
>> Why do we need these information?  Do you have some use case?
>
> I recall our customers have mentioned that they want to know how much the memory is demoted
> to the CXL memory device in a specific period.

This doesn't sound like a use case.  Can you elaborate it?  What can
only be tuned with the help of the added stats?

--
Best Regards,
Huang, Ying

>
>>>>   	mod_node_page_state(NODE_DATA(target_nid),
>>>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
>>>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(), nr_succeeded);
>
> But if the *target_nid* is only indicate the preferred node, this accounting maybe not accurate.
>
>
> Thanks
> Zhijian
>
>> 
>> --
>> Best Regards,
>> Huang, Ying
>> 
>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>> ---
>>> RFC: their names are open to discussion, maybe pgdemote_from/to_*
>>> Another defect of this patch is that, SUM(pgdemote_src_*) is always same
>>> as SUM(pgdemote_dst_*) in the global stats, shall we hide one of them.
>>> ---
>>>   include/linux/mmzone.h |  9 ++++++---
>>>   mm/vmscan.c            | 13 ++++++++++---
>>>   mm/vmstat.c            |  9 ++++++---
>>>   3 files changed, 22 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>> index ad0309eea850..a6140d894bec 100644
>>> --- a/include/linux/mmzone.h
>>> +++ b/include/linux/mmzone.h
>>> @@ -207,9 +207,12 @@ enum node_stat_item {
>>>   	PGPROMOTE_SUCCESS,	/* promote successfully */
>>>   	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
>>>   	/* PGDEMOTE_*: pages demoted */
>>> -	PGDEMOTE_KSWAPD,
>>> -	PGDEMOTE_DIRECT,
>>> -	PGDEMOTE_KHUGEPAGED,
>>> +	PGDEMOTE_SRC_KSWAPD,
>>> +	PGDEMOTE_SRC_DIRECT,
>>> +	PGDEMOTE_SRC_KHUGEPAGED,
>>> +	PGDEMOTE_DST_KSWAPD,
>>> +	PGDEMOTE_DST_DIRECT,
>>> +	PGDEMOTE_DST_KHUGEPAGED,
>>>   #endif
>>>   	NR_VM_NODE_STAT_ITEMS
>>>   };
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 2f1fb4ec3235..55d2287d7150 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -1111,13 +1111,18 @@ void drop_slab(void)
>>>   static int reclaimer_offset(void)
>>>   {
>>>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>>> -			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
>>> +			PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD);
>>>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>>>   			PGSCAN_DIRECT - PGSCAN_KSWAPD);
>>>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>>> -			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
>>> +			PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD);
>>>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>>>   			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
>>> +	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD !=
>>> +			PGDEMOTE_DST_DIRECT - PGDEMOTE_DST_KSWAPD);
>>> +	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD !=
>>> +			PGDEMOTE_DST_KHUGEPAGED - PGDEMOTE_DST_KSWAPD);
>>> +
>>>   
>>>   	if (current_is_kswapd())
>>>   		return 0;
>>> @@ -1678,8 +1683,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>>>   		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>>>   		      &nr_succeeded);
>>>   
>>> +	mod_node_page_state(pgdat,
>>> +		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(), nr_succeeded);
>>>   	mod_node_page_state(NODE_DATA(target_nid),
>>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
>>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(), nr_succeeded);
>>>   
>>>   	return nr_succeeded;
>>>   }
>>> diff --git a/mm/vmstat.c b/mm/vmstat.c
>>> index f141c48c39e4..63f106a5e008 100644
>>> --- a/mm/vmstat.c
>>> +++ b/mm/vmstat.c
>>> @@ -1244,9 +1244,12 @@ const char * const vmstat_text[] = {
>>>   #ifdef CONFIG_NUMA_BALANCING
>>>   	"pgpromote_success",
>>>   	"pgpromote_candidate",
>>> -	"pgdemote_kswapd",
>>> -	"pgdemote_direct",
>>> -	"pgdemote_khugepaged",
>>> +	"pgdemote_src_kswapd",
>>> +	"pgdemote_src_direct",
>>> +	"pgdemote_src_khugepaged",
>>> +	"pgdemote_dst_kswapd",
>>> +	"pgdemote_dst_direct",
>>> +	"pgdemote_dst_khugepaged",
>>>   #endif
>>>   
>>>   	/* enum writeback_stat_item counters */
Yasunori Gotou (Fujitsu) Nov. 2, 2023, 7:38 a.m. UTC | #4
Hello,

> On 02/11/2023 13:45, Huang, Ying wrote:
> > Li Zhijian <lizhijian@fujitsu.com> writes:
> >
> >> pgdemote_src_*: pages demoted from this node.
> >> pgdemote_dst_*: pages demoted to this node.
> >>
> >> So that we are able to know their demotion per-node stats by checking this.
> >>
> >> In the environment, node0 and node1 are DRAM, node3 is PMEM.
> >>
> >> Global stats:
> >> $ grep -E 'demote' /proc/vmstat
> >> pgdemote_src_kswapd 130155
> >> pgdemote_src_direct 113497
> >> pgdemote_src_khugepaged 0
> >> pgdemote_dst_kswapd 130155
> >> pgdemote_dst_direct 113497
> >> pgdemote_dst_khugepaged 0
> >>
> >> Per-node stats:
> >> $ grep demote /sys/devices/system/node/node0/vmstat
> >> pgdemote_src_kswapd 68454
> >> pgdemote_src_direct 83431
> >> pgdemote_src_khugepaged 0
> >> pgdemote_dst_kswapd 0
> >> pgdemote_dst_direct 0
> >> pgdemote_dst_khugepaged 0
> >>
> >> $ grep demote /sys/devices/system/node/node1/vmstat
> >> pgdemote_src_kswapd 185834
> >> pgdemote_src_direct 30066
> >> pgdemote_src_khugepaged 0
> >> pgdemote_dst_kswapd 0
> >> pgdemote_dst_direct 0
> >> pgdemote_dst_khugepaged 0
> >>
> >> $ grep demote /sys/devices/system/node/node3/vmstat
> >> pgdemote_src_kswapd 0
> >> pgdemote_src_direct 0
> >> pgdemote_src_khugepaged 0
> >> pgdemote_dst_kswapd 254288
> >> pgdemote_dst_direct 113497
> >> pgdemote_dst_khugepaged 0
> >>
> >>  From above stats, we know node3 is the demotion destination which one
> >> the node0 and node1 will demote to.
> >
> > Why do we need these information?  Do you have some use case?
> 
> I recall our customers have mentioned that they want to know how much the
> memory is demoted
> to the CXL memory device in a specific period.

I'll mention about it more.

I had a conversation with one of our customers. He expressed a desire for more detailed
profile information to analyze the behavior of demotion (and promotion) when
his workloads are executed. 
If the results are not satisfactory for his workloads, he wants to tune his servers for his workloads
with these profiles.
Additionally, depending on the results, he may want to change his server configuration. 
For example, he may want to buy more expensive DDR memories rather than cheaper CXL memory.

In my impression, our customers seems to think that CXL memory is NOT as reliable as DDR memory yet.
Therefore, they want to prepare for the new world that CXL will bring, and want to have a method 
for the preparation by profiling information as much as possible.

it this enough for your question?

Thanks,

> 
> 
> >>>   	mod_node_page_state(NODE_DATA(target_nid),
> >>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
> nr_succeeded);
> >>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
> nr_succeeded);
> 
> But if the *target_nid* is only indicate the preferred node, this accounting
> maybe not accurate.
> 
> 
> Thanks
> Zhijian
> 
> >
> > --
> > Best Regards,
> > Huang, Ying
> >
> >> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> >> ---
> >> RFC: their names are open to discussion, maybe pgdemote_from/to_*
> >> Another defect of this patch is that, SUM(pgdemote_src_*) is always same
> >> as SUM(pgdemote_dst_*) in the global stats, shall we hide one of them.
> >> ---
> >>   include/linux/mmzone.h |  9 ++++++---
> >>   mm/vmscan.c            | 13 ++++++++++---
> >>   mm/vmstat.c            |  9 ++++++---
> >>   3 files changed, 22 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >> index ad0309eea850..a6140d894bec 100644
> >> --- a/include/linux/mmzone.h
> >> +++ b/include/linux/mmzone.h
> >> @@ -207,9 +207,12 @@ enum node_stat_item {
> >>   	PGPROMOTE_SUCCESS,	/* promote successfully */
> >>   	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
> >>   	/* PGDEMOTE_*: pages demoted */
> >> -	PGDEMOTE_KSWAPD,
> >> -	PGDEMOTE_DIRECT,
> >> -	PGDEMOTE_KHUGEPAGED,
> >> +	PGDEMOTE_SRC_KSWAPD,
> >> +	PGDEMOTE_SRC_DIRECT,
> >> +	PGDEMOTE_SRC_KHUGEPAGED,
> >> +	PGDEMOTE_DST_KSWAPD,
> >> +	PGDEMOTE_DST_DIRECT,
> >> +	PGDEMOTE_DST_KHUGEPAGED,
> >>   #endif
> >>   	NR_VM_NODE_STAT_ITEMS
> >>   };
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 2f1fb4ec3235..55d2287d7150 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -1111,13 +1111,18 @@ void drop_slab(void)
> >>   static int reclaimer_offset(void)
> >>   {
> >>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
> >> -			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
> >> +			PGDEMOTE_SRC_DIRECT -
> PGDEMOTE_SRC_KSWAPD);
> >>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
> >>   			PGSCAN_DIRECT - PGSCAN_KSWAPD);
> >>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
> >> -			PGDEMOTE_KHUGEPAGED -
> PGDEMOTE_KSWAPD);
> >> +			PGDEMOTE_SRC_KHUGEPAGED -
> PGDEMOTE_SRC_KSWAPD);
> >>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
> >>   			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
> >> +	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT -
> PGDEMOTE_SRC_KSWAPD !=
> >> +			PGDEMOTE_DST_DIRECT -
> PGDEMOTE_DST_KSWAPD);
> >> +	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED -
> PGDEMOTE_SRC_KSWAPD !=
> >> +			PGDEMOTE_DST_KHUGEPAGED -
> PGDEMOTE_DST_KSWAPD);
> >> +
> >>
> >>   	if (current_is_kswapd())
> >>   		return 0;
> >> @@ -1678,8 +1683,10 @@ static unsigned int demote_folio_list(struct
> list_head *demote_folios,
> >>   		      (unsigned long)&mtc, MIGRATE_ASYNC,
> MR_DEMOTION,
> >>   		      &nr_succeeded);
> >>
> >> +	mod_node_page_state(pgdat,
> >> +		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(),
> nr_succeeded);
> >>   	mod_node_page_state(NODE_DATA(target_nid),
> >> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
> nr_succeeded);
> >> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
> nr_succeeded);
> >>
> >>   	return nr_succeeded;
> >>   }
> >> diff --git a/mm/vmstat.c b/mm/vmstat.c
> >> index f141c48c39e4..63f106a5e008 100644
> >> --- a/mm/vmstat.c
> >> +++ b/mm/vmstat.c
> >> @@ -1244,9 +1244,12 @@ const char * const vmstat_text[] = {
> >>   #ifdef CONFIG_NUMA_BALANCING
> >>   	"pgpromote_success",
> >>   	"pgpromote_candidate",
> >> -	"pgdemote_kswapd",
> >> -	"pgdemote_direct",
> >> -	"pgdemote_khugepaged",
> >> +	"pgdemote_src_kswapd",
> >> +	"pgdemote_src_direct",
> >> +	"pgdemote_src_khugepaged",
> >> +	"pgdemote_dst_kswapd",
> >> +	"pgdemote_dst_direct",
> >> +	"pgdemote_dst_khugepaged",
> >>   #endif
> >>
> >>   	/* enum writeback_stat_item counters */
Huang, Ying Nov. 2, 2023, 7:46 a.m. UTC | #5
"Yasunori Gotou (Fujitsu)" <y-goto@fujitsu.com> writes:

> Hello,
>
>> On 02/11/2023 13:45, Huang, Ying wrote:
>> > Li Zhijian <lizhijian@fujitsu.com> writes:
>> >
>> >> pgdemote_src_*: pages demoted from this node.
>> >> pgdemote_dst_*: pages demoted to this node.
>> >>
>> >> So that we are able to know their demotion per-node stats by checking this.
>> >>
>> >> In the environment, node0 and node1 are DRAM, node3 is PMEM.
>> >>
>> >> Global stats:
>> >> $ grep -E 'demote' /proc/vmstat
>> >> pgdemote_src_kswapd 130155
>> >> pgdemote_src_direct 113497
>> >> pgdemote_src_khugepaged 0
>> >> pgdemote_dst_kswapd 130155
>> >> pgdemote_dst_direct 113497
>> >> pgdemote_dst_khugepaged 0
>> >>
>> >> Per-node stats:
>> >> $ grep demote /sys/devices/system/node/node0/vmstat
>> >> pgdemote_src_kswapd 68454
>> >> pgdemote_src_direct 83431
>> >> pgdemote_src_khugepaged 0
>> >> pgdemote_dst_kswapd 0
>> >> pgdemote_dst_direct 0
>> >> pgdemote_dst_khugepaged 0
>> >>
>> >> $ grep demote /sys/devices/system/node/node1/vmstat
>> >> pgdemote_src_kswapd 185834
>> >> pgdemote_src_direct 30066
>> >> pgdemote_src_khugepaged 0
>> >> pgdemote_dst_kswapd 0
>> >> pgdemote_dst_direct 0
>> >> pgdemote_dst_khugepaged 0
>> >>
>> >> $ grep demote /sys/devices/system/node/node3/vmstat
>> >> pgdemote_src_kswapd 0
>> >> pgdemote_src_direct 0
>> >> pgdemote_src_khugepaged 0
>> >> pgdemote_dst_kswapd 254288
>> >> pgdemote_dst_direct 113497
>> >> pgdemote_dst_khugepaged 0
>> >>
>> >>  From above stats, we know node3 is the demotion destination which one
>> >> the node0 and node1 will demote to.
>> >
>> > Why do we need these information?  Do you have some use case?
>> 
>> I recall our customers have mentioned that they want to know how much the
>> memory is demoted
>> to the CXL memory device in a specific period.
>
> I'll mention about it more.
>
> I had a conversation with one of our customers. He expressed a desire for more detailed
> profile information to analyze the behavior of demotion (and promotion) when
> his workloads are executed. 
> If the results are not satisfactory for his workloads, he wants to tune his servers for his workloads
> with these profiles.
> Additionally, depending on the results, he may want to change his server configuration. 
> For example, he may want to buy more expensive DDR memories rather than cheaper CXL memory.
>
> In my impression, our customers seems to think that CXL memory is NOT as reliable as DDR memory yet.
> Therefore, they want to prepare for the new world that CXL will bring, and want to have a method 
> for the preparation by profiling information as much as possible.
>
> it this enough for your question?

I want some more detailed information about how these stats are used?
Why isn't per-node pgdemote_xxx counter enough?

--
Best Regards,
Huang, Ying

> Thanks,
>
>> 
>> 
>> >>>   	mod_node_page_state(NODE_DATA(target_nid),
>> >>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
>> nr_succeeded);
>> >>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
>> nr_succeeded);
>> 
>> But if the *target_nid* is only indicate the preferred node, this accounting
>> maybe not accurate.
>> 
>> 
>> Thanks
>> Zhijian
>> 
>> >
>> > --
>> > Best Regards,
>> > Huang, Ying
>> >
>> >> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> >> ---
>> >> RFC: their names are open to discussion, maybe pgdemote_from/to_*
>> >> Another defect of this patch is that, SUM(pgdemote_src_*) is always same
>> >> as SUM(pgdemote_dst_*) in the global stats, shall we hide one of them.
>> >> ---
>> >>   include/linux/mmzone.h |  9 ++++++---
>> >>   mm/vmscan.c            | 13 ++++++++++---
>> >>   mm/vmstat.c            |  9 ++++++---
>> >>   3 files changed, 22 insertions(+), 9 deletions(-)
>> >>
>> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> >> index ad0309eea850..a6140d894bec 100644
>> >> --- a/include/linux/mmzone.h
>> >> +++ b/include/linux/mmzone.h
>> >> @@ -207,9 +207,12 @@ enum node_stat_item {
>> >>   	PGPROMOTE_SUCCESS,	/* promote successfully */
>> >>   	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
>> >>   	/* PGDEMOTE_*: pages demoted */
>> >> -	PGDEMOTE_KSWAPD,
>> >> -	PGDEMOTE_DIRECT,
>> >> -	PGDEMOTE_KHUGEPAGED,
>> >> +	PGDEMOTE_SRC_KSWAPD,
>> >> +	PGDEMOTE_SRC_DIRECT,
>> >> +	PGDEMOTE_SRC_KHUGEPAGED,
>> >> +	PGDEMOTE_DST_KSWAPD,
>> >> +	PGDEMOTE_DST_DIRECT,
>> >> +	PGDEMOTE_DST_KHUGEPAGED,
>> >>   #endif
>> >>   	NR_VM_NODE_STAT_ITEMS
>> >>   };
>> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> >> index 2f1fb4ec3235..55d2287d7150 100644
>> >> --- a/mm/vmscan.c
>> >> +++ b/mm/vmscan.c
>> >> @@ -1111,13 +1111,18 @@ void drop_slab(void)
>> >>   static int reclaimer_offset(void)
>> >>   {
>> >>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>> >> -			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
>> >> +			PGDEMOTE_SRC_DIRECT -
>> PGDEMOTE_SRC_KSWAPD);
>> >>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
>> >>   			PGSCAN_DIRECT - PGSCAN_KSWAPD);
>> >>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>> >> -			PGDEMOTE_KHUGEPAGED -
>> PGDEMOTE_KSWAPD);
>> >> +			PGDEMOTE_SRC_KHUGEPAGED -
>> PGDEMOTE_SRC_KSWAPD);
>> >>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
>> >>   			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
>> >> +	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT -
>> PGDEMOTE_SRC_KSWAPD !=
>> >> +			PGDEMOTE_DST_DIRECT -
>> PGDEMOTE_DST_KSWAPD);
>> >> +	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED -
>> PGDEMOTE_SRC_KSWAPD !=
>> >> +			PGDEMOTE_DST_KHUGEPAGED -
>> PGDEMOTE_DST_KSWAPD);
>> >> +
>> >>
>> >>   	if (current_is_kswapd())
>> >>   		return 0;
>> >> @@ -1678,8 +1683,10 @@ static unsigned int demote_folio_list(struct
>> list_head *demote_folios,
>> >>   		      (unsigned long)&mtc, MIGRATE_ASYNC,
>> MR_DEMOTION,
>> >>   		      &nr_succeeded);
>> >>
>> >> +	mod_node_page_state(pgdat,
>> >> +		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(),
>> nr_succeeded);
>> >>   	mod_node_page_state(NODE_DATA(target_nid),
>> >> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
>> nr_succeeded);
>> >> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
>> nr_succeeded);
>> >>
>> >>   	return nr_succeeded;
>> >>   }
>> >> diff --git a/mm/vmstat.c b/mm/vmstat.c
>> >> index f141c48c39e4..63f106a5e008 100644
>> >> --- a/mm/vmstat.c
>> >> +++ b/mm/vmstat.c
>> >> @@ -1244,9 +1244,12 @@ const char * const vmstat_text[] = {
>> >>   #ifdef CONFIG_NUMA_BALANCING
>> >>   	"pgpromote_success",
>> >>   	"pgpromote_candidate",
>> >> -	"pgdemote_kswapd",
>> >> -	"pgdemote_direct",
>> >> -	"pgdemote_khugepaged",
>> >> +	"pgdemote_src_kswapd",
>> >> +	"pgdemote_src_direct",
>> >> +	"pgdemote_src_khugepaged",
>> >> +	"pgdemote_dst_kswapd",
>> >> +	"pgdemote_dst_direct",
>> >> +	"pgdemote_dst_khugepaged",
>> >>   #endif
>> >>
>> >>   	/* enum writeback_stat_item counters */
Yasunori Gotou (Fujitsu) Nov. 2, 2023, 9:45 a.m. UTC | #6
> > Hello,
> >
> >> On 02/11/2023 13:45, Huang, Ying wrote:
> >> > Li Zhijian <lizhijian@fujitsu.com> writes:
> >> >
> >> >> pgdemote_src_*: pages demoted from this node.
> >> >> pgdemote_dst_*: pages demoted to this node.
> >> >>
> >> >> So that we are able to know their demotion per-node stats by checking
> this.
> >> >>
> >> >> In the environment, node0 and node1 are DRAM, node3 is PMEM.
> >> >>
> >> >> Global stats:
> >> >> $ grep -E 'demote' /proc/vmstat
> >> >> pgdemote_src_kswapd 130155
> >> >> pgdemote_src_direct 113497
> >> >> pgdemote_src_khugepaged 0
> >> >> pgdemote_dst_kswapd 130155
> >> >> pgdemote_dst_direct 113497
> >> >> pgdemote_dst_khugepaged 0
> >> >>
> >> >> Per-node stats:
> >> >> $ grep demote /sys/devices/system/node/node0/vmstat
> >> >> pgdemote_src_kswapd 68454
> >> >> pgdemote_src_direct 83431
> >> >> pgdemote_src_khugepaged 0
> >> >> pgdemote_dst_kswapd 0
> >> >> pgdemote_dst_direct 0
> >> >> pgdemote_dst_khugepaged 0
> >> >>
> >> >> $ grep demote /sys/devices/system/node/node1/vmstat
> >> >> pgdemote_src_kswapd 185834
> >> >> pgdemote_src_direct 30066
> >> >> pgdemote_src_khugepaged 0
> >> >> pgdemote_dst_kswapd 0
> >> >> pgdemote_dst_direct 0
> >> >> pgdemote_dst_khugepaged 0
> >> >>
> >> >> $ grep demote /sys/devices/system/node/node3/vmstat
> >> >> pgdemote_src_kswapd 0
> >> >> pgdemote_src_direct 0
> >> >> pgdemote_src_khugepaged 0
> >> >> pgdemote_dst_kswapd 254288
> >> >> pgdemote_dst_direct 113497
> >> >> pgdemote_dst_khugepaged 0
> >> >>
> >> >>  From above stats, we know node3 is the demotion destination which
> >> >> one the node0 and node1 will demote to.
> >> >
> >> > Why do we need these information?  Do you have some use case?
> >>
> >> I recall our customers have mentioned that they want to know how much
> >> the memory is demoted to the CXL memory device in a specific period.
> >
> > I'll mention about it more.
> >
> > I had a conversation with one of our customers. He expressed a desire
> > for more detailed profile information to analyze the behavior of
> > demotion (and promotion) when his workloads are executed.
> > If the results are not satisfactory for his workloads, he wants to
> > tune his servers for his workloads with these profiles.
> > Additionally, depending on the results, he may want to change his server
> configuration.
> > For example, he may want to buy more expensive DDR memories rather than
> cheaper CXL memory.
> >
> > In my impression, our customers seems to think that CXL memory is NOT as
> reliable as DDR memory yet.
> > Therefore, they want to prepare for the new world that CXL will bring,
> > and want to have a method for the preparation by profiling information as
> much as possible.
> >
> > it this enough for your question?
> 
> I want some more detailed information about how these stats are used?
> Why isn't per-node pgdemote_xxx counter enough?

I rechecked the customer's original request.

- If a memory area is demoted to a CXL memory node, he wanted to analyze how it affects performance
 of their workload, such as latency. He wanted to use CXL Node memory usage as basic
 information for the analysis.
- If he notices that demotion occurs well on a server and CXL memories are used 85% constantly, he 
  may want to add DDR DRAM or select some other ways to avoid demotion.
  (His image is likely Swap free/used.)
  IIRC, demotion target is not spread to all of the CXL memory node, right? 
  Then, he needs to know how CXL memory is occupied by demoted memory.

If I misunderstand something, or you have any better idea,
please let us know. I'll talk with him again. (It will be next week...)

Thanks,

> 
> --
> Best Regards,
> Huang, Ying
> 
> > Thanks,
> >
> >>
> >>
> >> >>>   	mod_node_page_state(NODE_DATA(target_nid),
> >> >>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
> >> nr_succeeded);
> >> >>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
> >> nr_succeeded);
> >>
> >> But if the *target_nid* is only indicate the preferred node, this
> >> accounting maybe not accurate.
> >>
> >>
> >> Thanks
> >> Zhijian
> >>
> >> >
> >> > --
> >> > Best Regards,
> >> > Huang, Ying
> >> >
> >> >> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> >> >> ---
> >> >> RFC: their names are open to discussion, maybe pgdemote_from/to_*
> >> >> Another defect of this patch is that, SUM(pgdemote_src_*) is
> >> >> always same as SUM(pgdemote_dst_*) in the global stats, shall we
> hide one of them.
> >> >> ---
> >> >>   include/linux/mmzone.h |  9 ++++++---
> >> >>   mm/vmscan.c            | 13 ++++++++++---
> >> >>   mm/vmstat.c            |  9 ++++++---
> >> >>   3 files changed, 22 insertions(+), 9 deletions(-)
> >> >>
> >> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index
> >> >> ad0309eea850..a6140d894bec 100644
> >> >> --- a/include/linux/mmzone.h
> >> >> +++ b/include/linux/mmzone.h
> >> >> @@ -207,9 +207,12 @@ enum node_stat_item {
> >> >>   	PGPROMOTE_SUCCESS,	/* promote successfully */
> >> >>   	PGPROMOTE_CANDIDATE,	/* candidate pages to
> promote */
> >> >>   	/* PGDEMOTE_*: pages demoted */
> >> >> -	PGDEMOTE_KSWAPD,
> >> >> -	PGDEMOTE_DIRECT,
> >> >> -	PGDEMOTE_KHUGEPAGED,
> >> >> +	PGDEMOTE_SRC_KSWAPD,
> >> >> +	PGDEMOTE_SRC_DIRECT,
> >> >> +	PGDEMOTE_SRC_KHUGEPAGED,
> >> >> +	PGDEMOTE_DST_KSWAPD,
> >> >> +	PGDEMOTE_DST_DIRECT,
> >> >> +	PGDEMOTE_DST_KHUGEPAGED,
> >> >>   #endif
> >> >>   	NR_VM_NODE_STAT_ITEMS
> >> >>   };
> >> >> diff --git a/mm/vmscan.c b/mm/vmscan.c index
> >> >> 2f1fb4ec3235..55d2287d7150 100644
> >> >> --- a/mm/vmscan.c
> >> >> +++ b/mm/vmscan.c
> >> >> @@ -1111,13 +1111,18 @@ void drop_slab(void)
> >> >>   static int reclaimer_offset(void)
> >> >>   {
> >> >>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
> >> >> -			PGDEMOTE_DIRECT -
> PGDEMOTE_KSWAPD);
> >> >> +			PGDEMOTE_SRC_DIRECT -
> >> PGDEMOTE_SRC_KSWAPD);
> >> >>   	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
> >> >>   			PGSCAN_DIRECT - PGSCAN_KSWAPD);
> >> >>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED -
> PGSTEAL_KSWAPD !=
> >> >> -			PGDEMOTE_KHUGEPAGED -
> >> PGDEMOTE_KSWAPD);
> >> >> +			PGDEMOTE_SRC_KHUGEPAGED -
> >> PGDEMOTE_SRC_KSWAPD);
> >> >>   	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED -
> PGSTEAL_KSWAPD !=
> >> >>   			PGSCAN_KHUGEPAGED -
> PGSCAN_KSWAPD);
> >> >> +	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT -
> >> PGDEMOTE_SRC_KSWAPD !=
> >> >> +			PGDEMOTE_DST_DIRECT -
> >> PGDEMOTE_DST_KSWAPD);
> >> >> +	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED -
> >> PGDEMOTE_SRC_KSWAPD !=
> >> >> +			PGDEMOTE_DST_KHUGEPAGED -
> >> PGDEMOTE_DST_KSWAPD);
> >> >> +
> >> >>
> >> >>   	if (current_is_kswapd())
> >> >>   		return 0;
> >> >> @@ -1678,8 +1683,10 @@ static unsigned int
> >> >> demote_folio_list(struct
> >> list_head *demote_folios,
> >> >>   		      (unsigned long)&mtc, MIGRATE_ASYNC,
> >> MR_DEMOTION,
> >> >>   		      &nr_succeeded);
> >> >>
> >> >> +	mod_node_page_state(pgdat,
> >> >> +		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(),
> >> nr_succeeded);
> >> >>   	mod_node_page_state(NODE_DATA(target_nid),
> >> >> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
> >> nr_succeeded);
> >> >> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
> >> nr_succeeded);
> >> >>
> >> >>   	return nr_succeeded;
> >> >>   }
> >> >> diff --git a/mm/vmstat.c b/mm/vmstat.c index
> >> >> f141c48c39e4..63f106a5e008 100644
> >> >> --- a/mm/vmstat.c
> >> >> +++ b/mm/vmstat.c
> >> >> @@ -1244,9 +1244,12 @@ const char * const vmstat_text[] = {
> >> >>   #ifdef CONFIG_NUMA_BALANCING
> >> >>   	"pgpromote_success",
> >> >>   	"pgpromote_candidate",
> >> >> -	"pgdemote_kswapd",
> >> >> -	"pgdemote_direct",
> >> >> -	"pgdemote_khugepaged",
> >> >> +	"pgdemote_src_kswapd",
> >> >> +	"pgdemote_src_direct",
> >> >> +	"pgdemote_src_khugepaged",
> >> >> +	"pgdemote_dst_kswapd",
> >> >> +	"pgdemote_dst_direct",
> >> >> +	"pgdemote_dst_khugepaged",
> >> >>   #endif
> >> >>
> >> >>   	/* enum writeback_stat_item counters */
Huang, Ying Nov. 3, 2023, 6:14 a.m. UTC | #7
"Yasunori Gotou (Fujitsu)" <y-goto@fujitsu.com> writes:

>> > Hello,
>> >
>> >> On 02/11/2023 13:45, Huang, Ying wrote:
>> >> > Li Zhijian <lizhijian@fujitsu.com> writes:
>> >> >
>> >> >> pgdemote_src_*: pages demoted from this node.
>> >> >> pgdemote_dst_*: pages demoted to this node.
>> >> >>
>> >> >> So that we are able to know their demotion per-node stats by checking
>> this.
>> >> >>
>> >> >> In the environment, node0 and node1 are DRAM, node3 is PMEM.
>> >> >>
>> >> >> Global stats:
>> >> >> $ grep -E 'demote' /proc/vmstat
>> >> >> pgdemote_src_kswapd 130155
>> >> >> pgdemote_src_direct 113497
>> >> >> pgdemote_src_khugepaged 0
>> >> >> pgdemote_dst_kswapd 130155
>> >> >> pgdemote_dst_direct 113497
>> >> >> pgdemote_dst_khugepaged 0
>> >> >>
>> >> >> Per-node stats:
>> >> >> $ grep demote /sys/devices/system/node/node0/vmstat
>> >> >> pgdemote_src_kswapd 68454
>> >> >> pgdemote_src_direct 83431
>> >> >> pgdemote_src_khugepaged 0
>> >> >> pgdemote_dst_kswapd 0
>> >> >> pgdemote_dst_direct 0
>> >> >> pgdemote_dst_khugepaged 0
>> >> >>
>> >> >> $ grep demote /sys/devices/system/node/node1/vmstat
>> >> >> pgdemote_src_kswapd 185834
>> >> >> pgdemote_src_direct 30066
>> >> >> pgdemote_src_khugepaged 0
>> >> >> pgdemote_dst_kswapd 0
>> >> >> pgdemote_dst_direct 0
>> >> >> pgdemote_dst_khugepaged 0
>> >> >>
>> >> >> $ grep demote /sys/devices/system/node/node3/vmstat
>> >> >> pgdemote_src_kswapd 0
>> >> >> pgdemote_src_direct 0
>> >> >> pgdemote_src_khugepaged 0
>> >> >> pgdemote_dst_kswapd 254288
>> >> >> pgdemote_dst_direct 113497
>> >> >> pgdemote_dst_khugepaged 0
>> >> >>
>> >> >>  From above stats, we know node3 is the demotion destination which
>> >> >> one the node0 and node1 will demote to.
>> >> >
>> >> > Why do we need these information?  Do you have some use case?
>> >>
>> >> I recall our customers have mentioned that they want to know how much
>> >> the memory is demoted to the CXL memory device in a specific period.
>> >
>> > I'll mention about it more.
>> >
>> > I had a conversation with one of our customers. He expressed a desire
>> > for more detailed profile information to analyze the behavior of
>> > demotion (and promotion) when his workloads are executed.
>> > If the results are not satisfactory for his workloads, he wants to
>> > tune his servers for his workloads with these profiles.
>> > Additionally, depending on the results, he may want to change his server
>> configuration.
>> > For example, he may want to buy more expensive DDR memories rather than
>> cheaper CXL memory.
>> >
>> > In my impression, our customers seems to think that CXL memory is NOT as
>> reliable as DDR memory yet.
>> > Therefore, they want to prepare for the new world that CXL will bring,
>> > and want to have a method for the preparation by profiling information as
>> much as possible.
>> >
>> > it this enough for your question?
>> 
>> I want some more detailed information about how these stats are used?
>> Why isn't per-node pgdemote_xxx counter enough?
>
> I rechecked the customer's original request.
>
> - If a memory area is demoted to a CXL memory node, he wanted to analyze how it affects performance
>  of their workload, such as latency. He wanted to use CXL Node memory usage as basic
>  information for the analysis.
>
> - If he notices that demotion occurs well on a server and CXL memories are used 85% constantly, he 
>   may want to add DDR DRAM or select some other ways to avoid demotion.
>   (His image is likely Swap free/used.)
>   IIRC, demotion target is not spread to all of the CXL memory node, right? 
>   Then, he needs to know how CXL memory is occupied by demoted memory.
>
> If I misunderstand something, or you have any better idea,
> please let us know. I'll talk with him again. (It will be next week...)


To check CXL memory usage, /proc/PID/numa_maps,
/sys/fs/cgroup/CGROUP/memory.numa_stat, and
/sys/devices/system/node/nodeN/meminfo can be used for process, cgroup,
and NUMA node respectively.  Is this enough?

--
Best Regards,
Huang, Ying

>> >
>> >>
>> >>
>> >> >>>   	mod_node_page_state(NODE_DATA(target_nid),
>> >> >>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
>> >> nr_succeeded);
>> >> >>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
>> >> nr_succeeded);
>> >>
>> >> But if the *target_nid* is only indicate the preferred node, this
>> >> accounting maybe not accurate.
>> >>

[snip]
Yasunori Gotou (Fujitsu) Nov. 6, 2023, 5:02 a.m. UTC | #8
> >> > Hello,
> >> >
> >> >> On 02/11/2023 13:45, Huang, Ying wrote:
> >> >> > Li Zhijian <lizhijian@fujitsu.com> writes:
> >> >> >
> >> >> >> pgdemote_src_*: pages demoted from this node.
> >> >> >> pgdemote_dst_*: pages demoted to this node.
> >> >> >>
> >> >> >> So that we are able to know their demotion per-node stats by
> >> >> >> checking
> >> this.
> >> >> >>
> >> >> >> In the environment, node0 and node1 are DRAM, node3 is PMEM.
> >> >> >>
> >> >> >> Global stats:
> >> >> >> $ grep -E 'demote' /proc/vmstat pgdemote_src_kswapd 130155
> >> >> >> pgdemote_src_direct 113497 pgdemote_src_khugepaged 0
> >> >> >> pgdemote_dst_kswapd 130155 pgdemote_dst_direct 113497
> >> >> >> pgdemote_dst_khugepaged 0
> >> >> >>
> >> >> >> Per-node stats:
> >> >> >> $ grep demote /sys/devices/system/node/node0/vmstat
> >> >> >> pgdemote_src_kswapd 68454
> >> >> >> pgdemote_src_direct 83431
> >> >> >> pgdemote_src_khugepaged 0
> >> >> >> pgdemote_dst_kswapd 0
> >> >> >> pgdemote_dst_direct 0
> >> >> >> pgdemote_dst_khugepaged 0
> >> >> >>
> >> >> >> $ grep demote /sys/devices/system/node/node1/vmstat
> >> >> >> pgdemote_src_kswapd 185834
> >> >> >> pgdemote_src_direct 30066
> >> >> >> pgdemote_src_khugepaged 0
> >> >> >> pgdemote_dst_kswapd 0
> >> >> >> pgdemote_dst_direct 0
> >> >> >> pgdemote_dst_khugepaged 0
> >> >> >>
> >> >> >> $ grep demote /sys/devices/system/node/node3/vmstat
> >> >> >> pgdemote_src_kswapd 0
> >> >> >> pgdemote_src_direct 0
> >> >> >> pgdemote_src_khugepaged 0
> >> >> >> pgdemote_dst_kswapd 254288
> >> >> >> pgdemote_dst_direct 113497
> >> >> >> pgdemote_dst_khugepaged 0
> >> >> >>
> >> >> >>  From above stats, we know node3 is the demotion destination
> >> >> >> which one the node0 and node1 will demote to.
> >> >> >
> >> >> > Why do we need these information?  Do you have some use case?
> >> >>
> >> >> I recall our customers have mentioned that they want to know how
> >> >> much the memory is demoted to the CXL memory device in a specific
> period.
> >> >
> >> > I'll mention about it more.
> >> >
> >> > I had a conversation with one of our customers. He expressed a
> >> > desire for more detailed profile information to analyze the
> >> > behavior of demotion (and promotion) when his workloads are executed.
> >> > If the results are not satisfactory for his workloads, he wants to
> >> > tune his servers for his workloads with these profiles.
> >> > Additionally, depending on the results, he may want to change his
> >> > server
> >> configuration.
> >> > For example, he may want to buy more expensive DDR memories rather
> >> > than
> >> cheaper CXL memory.
> >> >
> >> > In my impression, our customers seems to think that CXL memory is
> >> > NOT as
> >> reliable as DDR memory yet.
> >> > Therefore, they want to prepare for the new world that CXL will
> >> > bring, and want to have a method for the preparation by profiling
> >> > information as
> >> much as possible.
> >> >
> >> > it this enough for your question?
> >>
> >> I want some more detailed information about how these stats are used?
> >> Why isn't per-node pgdemote_xxx counter enough?
> >
> > I rechecked the customer's original request.
> >
> > - If a memory area is demoted to a CXL memory node, he wanted to
> > analyze how it affects performance  of their workload, such as
> > latency. He wanted to use CXL Node memory usage as basic  information for
> the analysis.
> >
> > - If he notices that demotion occurs well on a server and CXL memories are
> used 85% constantly, he
> >   may want to add DDR DRAM or select some other ways to avoid demotion.
> >   (His image is likely Swap free/used.)
> >   IIRC, demotion target is not spread to all of the CXL memory node, right?
> >   Then, he needs to know how CXL memory is occupied by demoted
> memory.
> >
> > If I misunderstand something, or you have any better idea, please let
> > us know. I'll talk with him again. (It will be next week...)
> 
> 
> To check CXL memory usage, /proc/PID/numa_maps,
> /sys/fs/cgroup/CGROUP/memory.numa_stat, and
> /sys/devices/system/node/nodeN/meminfo can be used for process, cgroup,
> and NUMA node respectively.  Is this enough?

Thank you for your idea
We will investigate your idea and talk with our customer.
Please wait.

Thanks,
---
Yasunori Goto


> 
> --
> Best Regards,
> Huang, Ying
> 
> >> >
> >> >>
> >> >>
> >> >> >>>   	mod_node_page_state(NODE_DATA(target_nid),
> >> >> >>> -		    PGDEMOTE_KSWAPD + reclaimer_offset(),
> >> >> nr_succeeded);
> >> >> >>> +		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(),
> >> >> nr_succeeded);
> >> >>
> >> >> But if the *target_nid* is only indicate the preferred node, this
> >> >> accounting maybe not accurate.
> >> >>
> 
> [snip]
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ad0309eea850..a6140d894bec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -207,9 +207,12 @@  enum node_stat_item {
 	PGPROMOTE_SUCCESS,	/* promote successfully */
 	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
 	/* PGDEMOTE_*: pages demoted */
-	PGDEMOTE_KSWAPD,
-	PGDEMOTE_DIRECT,
-	PGDEMOTE_KHUGEPAGED,
+	PGDEMOTE_SRC_KSWAPD,
+	PGDEMOTE_SRC_DIRECT,
+	PGDEMOTE_SRC_KHUGEPAGED,
+	PGDEMOTE_DST_KSWAPD,
+	PGDEMOTE_DST_DIRECT,
+	PGDEMOTE_DST_KHUGEPAGED,
 #endif
 	NR_VM_NODE_STAT_ITEMS
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2f1fb4ec3235..55d2287d7150 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1111,13 +1111,18 @@  void drop_slab(void)
 static int reclaimer_offset(void)
 {
 	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
-			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+			PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
 			PGSCAN_DIRECT - PGSCAN_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
-			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+			PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
 			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+	BUILD_BUG_ON(PGDEMOTE_SRC_DIRECT - PGDEMOTE_SRC_KSWAPD !=
+			PGDEMOTE_DST_DIRECT - PGDEMOTE_DST_KSWAPD);
+	BUILD_BUG_ON(PGDEMOTE_SRC_KHUGEPAGED - PGDEMOTE_SRC_KSWAPD !=
+			PGDEMOTE_DST_KHUGEPAGED - PGDEMOTE_DST_KSWAPD);
+
 
 	if (current_is_kswapd())
 		return 0;
@@ -1678,8 +1683,10 @@  static unsigned int demote_folio_list(struct list_head *demote_folios,
 		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
 		      &nr_succeeded);
 
+	mod_node_page_state(pgdat,
+		    PGDEMOTE_SRC_KSWAPD + reclaimer_offset(), nr_succeeded);
 	mod_node_page_state(NODE_DATA(target_nid),
-		    PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
+		    PGDEMOTE_DST_KSWAPD + reclaimer_offset(), nr_succeeded);
 
 	return nr_succeeded;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f141c48c39e4..63f106a5e008 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1244,9 +1244,12 @@  const char * const vmstat_text[] = {
 #ifdef CONFIG_NUMA_BALANCING
 	"pgpromote_success",
 	"pgpromote_candidate",
-	"pgdemote_kswapd",
-	"pgdemote_direct",
-	"pgdemote_khugepaged",
+	"pgdemote_src_kswapd",
+	"pgdemote_src_direct",
+	"pgdemote_src_khugepaged",
+	"pgdemote_dst_kswapd",
+	"pgdemote_dst_direct",
+	"pgdemote_dst_khugepaged",
 #endif
 
 	/* enum writeback_stat_item counters */